diff --git a/.build/README.md b/.build/README.md index 4c15297908c2..c62424d89411 100644 --- a/.build/README.md +++ b/.build/README.md @@ -100,6 +100,7 @@ Running other types of tests with docker: .build/docker/run-tests.sh -a test .build/docker/run-tests.sh -a stress-test .build/docker/run-tests.sh -a fqltool-test + .build/docker/run-tests.sh -a sstableloader-test .build/docker/run-tests.sh -a microbench .build/docker/run-tests.sh -a test-cdc .build/docker/run-tests.sh -a test-compression diff --git a/.build/build-accord.xml b/.build/build-accord.xml new file mode 100644 index 000000000000..0d16197c6bea --- /dev/null +++ b/.build/build-accord.xml @@ -0,0 +1,51 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.build/build-checkstyle.xml b/.build/build-checkstyle.xml index af5867e4aa9a..0484e4098c66 100644 --- a/.build/build-checkstyle.xml +++ b/.build/build-checkstyle.xml @@ -19,7 +19,7 @@ - + @@ -45,7 +45,7 @@ - + diff --git a/.build/build-owasp.xml b/.build/build-owasp.xml index b71127982ab4..4c00a6a98977 100644 --- a/.build/build-owasp.xml +++ b/.build/build-owasp.xml @@ -19,7 +19,7 @@ - + @@ -34,7 +34,7 @@ unless="dependency-check-ant.archive.present"> - @@ -105,7 +105,7 @@ - + diff --git a/.build/build-rat.xml b/.build/build-rat.xml index 2f6f5c715666..32bf3d736909 100644 --- a/.build/build-rat.xml +++ b/.build/build-rat.xml @@ -30,6 +30,7 @@ + @@ -78,6 +79,7 @@ + diff --git a/.build/build-resolver.xml b/.build/build-resolver.xml index 42bcc82512d0..29031b33a115 100644 --- a/.build/build-resolver.xml +++ b/.build/build-resolver.xml @@ -178,7 +178,7 @@ - + @@ -206,7 +206,7 @@ - + @@ -251,8 +251,8 @@ - - + + diff --git a/.build/build-sonar.xml b/.build/build-sonar.xml index 250191064c43..31472d007103 100644 --- a/.build/build-sonar.xml +++ b/.build/build-sonar.xml @@ -226,9 +226,9 @@ - + - + diff --git a/.build/cassandra-build-deps-template.xml b/.build/cassandra-build-maven-pom.xml similarity index 96% rename from .build/cassandra-build-deps-template.xml rename to .build/cassandra-build-maven-pom.xml index 4ec59cdf2d4b..c6b56955e013 100644 --- a/.build/cassandra-build-deps-template.xml +++ b/.build/cassandra-build-maven-pom.xml @@ -155,5 +155,10 @@ org.bouncycastle bcutil-jdk18on + + org.apache.cassandra + cassandra-accord + tests + diff --git a/.build/cassandra-deps-template.xml b/.build/cassandra-deps-maven-pom.xml similarity index 98% rename from .build/cassandra-deps-template.xml rename to .build/cassandra-deps-maven-pom.xml index a7c27ee12666..be58faa2f478 100644 --- a/.build/cassandra-deps-template.xml +++ b/.build/cassandra-deps-maven-pom.xml @@ -116,6 +116,10 @@ org.mindrot jbcrypt + + org.apache.cassandra + cassandra-accord + io.airlift airline @@ -145,7 +149,7 @@ logback-classic - com.datastax.cassandra + org.apache.cassandra cassandra-driver-core shaded diff --git a/.build/checkstyle_suppressions.xml b/.build/checkstyle_suppressions.xml index ed4d1443f7fc..230c808c1435 100644 --- a/.build/checkstyle_suppressions.xml +++ b/.build/checkstyle_suppressions.xml @@ -21,5 +21,4 @@ "https://checkstyle.org/dtds/suppressions_1_1.dtd"> - diff --git a/.build/docker/bullseye-build.docker b/.build/docker/bullseye-build.docker index 6928ec9f2992..b31bf03b3a75 100644 --- a/.build/docker/bullseye-build.docker +++ b/.build/docker/bullseye-build.docker @@ -15,7 +15,7 @@ # limitations under the License. FROM debian:bullseye -MAINTAINER Apache Cassandra +LABEL org.opencontainers.image.authors="Apache Cassandra " # CONTEXT is expected to be cassandra/.build @@ -52,3 +52,22 @@ RUN pip install --upgrade pip # dependencies for .build/ci/ci_parser.py RUN pip install beautifulsoup4==4.12.3 jinja2==3.1.3 + +# install golang. GO_VERSION_SHA must be updated with VERSION +RUN sh -c '\ + GO_VERSION="1.24.3" ;\ + GO_VERSION_SHAS="3333f6ea53afa971e9078895eaa4ac7204a8c6b5c68c10e6bc9a33e8e391bdd8 a463cb59382bd7ae7d8f4c68846e73c4d589f223c589ac76871b66811ded7836 13e6fe3fcf65689d77d40e633de1e31c6febbdbcb846eb05fc2434ed2213e92b 64a3fa22142f627e78fac3018ce3d4aeace68b743eff0afda8aae0411df5e4fb" ;\ + GO_OS=linux ;\ + [ $(uname) = "Darwin" ] && GO_OS=darwin ;\ + GO_PLATFORM=amd64 ;\ + [ $(uname -m) = "aarch64" ] && GO_PLATFORM=arm64 ;\ + GO_TAR="go${GO_VERSION}.${GO_OS}-${GO_PLATFORM}.tar.gz" ;\ + curl -L --fail --silent --retry 2 --retry-delay 5 --max-time 30 https://go.dev/dl/$GO_TAR -o $GO_TAR ;\ + GO_SHA="$(sha256sum $GO_TAR | cut -d" " -f2)" ;\ + echo "$GO_VERSION_SHAS" | sed "s/ /\n/g" | grep -q "$GO_SHA" || { echo "SHA256 mismatch for $GO_TAR $GO_SHA"; exit 1; } ;\ + tar -C /usr/local -xzf $GO_TAR ;\ + rm $GO_TAR' + +ENV GOROOT="/usr/local/go" +ENV GOPATH="$BUILD_HOME/go" +ENV PATH="$PATH:/usr/local/go/bin" \ No newline at end of file diff --git a/.build/docker/run-tests.sh b/.build/docker/run-tests.sh index 7bb8fc9d1ec5..3bfbec718a02 100755 --- a/.build/docker/run-tests.sh +++ b/.build/docker/run-tests.sh @@ -40,7 +40,7 @@ error() { # legacy argument handling case ${1} in - "build_dtest_jars" | "stress-test" | "fqltool-test" | "microbench" | "test-burn" | "long-test" | "cqlsh-test" | "simulator-dtest" | "dtest" | "dtest-novnode" | "dtest-latest" | "dtest-large" | "dtest-large-novnode" | "dtest-upgrade" | "dtest-upgrade-novnode"| "dtest-upgrade-large" | "dtest-upgrade-novnode-large" | "test" | "test-cdc" | "test-compression" | "test-oa" | "test-system-keyspace-directory" | "test-latest" | "jvm-dtest" | "jvm-dtest-upgrade" | "jvm-dtest-novnode" | "jvm-dtest-upgrade-novnode") + "build_dtest_jars" | "stress-test" | "fqltool-test" | "sstableloader-test" | "microbench" | "test-burn" | "long-test" | "cqlsh-test" | "simulator-dtest" | "dtest" | "dtest-novnode" | "dtest-latest" | "dtest-large" | "dtest-large-novnode" | "dtest-upgrade" | "dtest-upgrade-novnode"| "dtest-upgrade-large" | "dtest-upgrade-novnode-large" | "test" | "test-cdc" | "test-compression" | "test-oa" | "test-system-keyspace-directory" | "test-latest" | "jvm-dtest" | "jvm-dtest-upgrade" | "jvm-dtest-novnode" | "jvm-dtest-upgrade-novnode") test_type="-a ${1}" if [[ -z ${2} ]]; then test_list="" @@ -182,7 +182,7 @@ docker_flags="-m 5g --memory-swap 5g" case ${test_target/-repeat/} in "build_dtest_jars") ;; - "stress-test" | "fqltool-test" ) + "stress-test" | "fqltool-test" | "sstableloader-test" ) [[ ${mem} -gt $((1 * 1024 * 1024 * 1024 * ${jenkins_executors})) ]] || { error 1 "${target} require minimum docker memory 1g (per jenkins executor (${jenkins_executors})), found ${mem}"; } ;; # test-burn doesn't have enough tests in it to split beyond 8, and burn and long we want a bit more resources anyway diff --git a/.build/docker/ubuntu2004_test.docker b/.build/docker/ubuntu2004_test.docker index 8ffd24d18a7b..9d19baef18b6 100644 --- a/.build/docker/ubuntu2004_test.docker +++ b/.build/docker/ubuntu2004_test.docker @@ -124,22 +124,23 @@ RUN /bin/bash -c "source ${BUILD_HOME}/env3.8/bin/activate && \ ccm create -n 1 -v git:cassandra-4.1 test && ccm remove test && \ ccm create -n 1 -v git:cassandra-4.0 test && ccm remove test" -# Initialize ccm versions. right side of each sequence needs to be updated with new releases. -# this can be checked with: -# `curl -s https://downloads.apache.org/cassandra/ | grep -oP '(?<=href=\")[0-9]+\.[0-9]+\.[0-9]+(?=)' | sort -rV | uniq -w 3` +# Initialize ccm versions. branch heads and all versions iterating through to the latest version found on downloads.apache.org/cassandra RUN bash -c 'source ${BUILD_HOME}/env3.8/bin/activate && \ - for i in {1..14} ; do echo $i ; ccm create --quiet -n 1 -v binary:4.0.$i test && ccm remove test ; done && \ - for i in {1..7} ; do echo $i ; ccm create --quiet -n 1 -v binary:4.1.$i test && ccm remove test ; done' + latest_4_0=$(curl -s https://downloads.apache.org/cassandra/ | grep -oP "(?<=href=\")4\.0\.[0-9]+(?=\")" | sort -V | tail -1 | cut -d"." -f3) && \ + for i in $(seq 1 $latest_4_0); do echo $i ; ccm create --quiet -n 1 -v binary:4.0.$i test && ccm remove test ; done && \ + latest_4_1=$(curl -s https://downloads.apache.org/cassandra/ | grep -oP "(?<=href=\")4\.1\.[0-9]+(?=\")" | sort -V | tail -1 | cut -d"." -f3) && \ + for i in $(seq 1 $latest_4_1); do echo $i ; ccm create --quiet -n 1 -v binary:4.1.$i test && ccm remove test ; done' # 5+ requires java11 RUN sudo update-java-alternatives --set java-1.11.0-openjdk-$(dpkg --print-architecture) -# Initialize the CCM git repo, after removing the git cache, as this also can fail to clone +# Initialize ccm versions. branch heads and all versions iterating through to the latest version found on downloads.apache.org/cassandra RUN rm -fr ${BUILD_HOME}/.ccm/repository/_git_cache_apache RUN /bin/bash -c 'source ${BUILD_HOME}/env3.8/bin/activate && \ ccm create --quiet -n 1 -v git:cassandra-5.0 test && ccm remove test && \ ccm create --quiet -n 1 -v git:trunk test && ccm remove test && \ - for i in {1..2} ; do echo $i ; ccm create --quiet -n 1 -v binary:5.0.$i test && ccm remove test ; done' + latest_5_0=$(curl -s https://downloads.apache.org/cassandra/ | grep -oP "(?<=href=\")5\.0\.[0-9]+(?=\")" | sort -V | tail -1 | cut -d"." -f3) && \ + for i in $(seq 1 $latest_5_0); do echo $i ; ccm create --quiet -n 1 -v binary:5.0.$i test && ccm remove test ; done' # the .git subdirectories to pip installed cassandra-driver breaks virtualenv-clone, so just remove them # and other directories we don't need in image diff --git a/.build/git/git-hooks/post-checkout/100-update-submodules.sh b/.build/git/git-hooks/post-checkout/100-update-submodules.sh new file mode 100755 index 000000000000..b495ed086054 --- /dev/null +++ b/.build/git/git-hooks/post-checkout/100-update-submodules.sh @@ -0,0 +1,41 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Redirect output to stderr. +exec 1>&2 + +#set -o xtrace +set -o errexit +set -o pipefail +set -o nounset + +bin="$(cd "$(dirname "$0")" > /dev/null; pwd)" + +_main() { + # In case the usage happens at a different layer, make sure to cd to the toplevel + local root_dir + root_dir="$(git rev-parse --show-toplevel)" + cd "$root_dir" + + if [[ ! -e .gitmodules ]]; then + # nothing to see here, look away! + return 0 + fi + git submodule update --init --recursive +} + +_main "$@" diff --git a/.build/git/git-hooks/post-switch b/.build/git/git-hooks/post-switch new file mode 120000 index 000000000000..5513d1deed30 --- /dev/null +++ b/.build/git/git-hooks/post-switch @@ -0,0 +1 @@ +post-checkout \ No newline at end of file diff --git a/.build/git/git-hooks/pre-commit/100-verify-submodules-pushed.sh b/.build/git/git-hooks/pre-commit/100-verify-submodules-pushed.sh new file mode 100755 index 000000000000..ec10bba04a5d --- /dev/null +++ b/.build/git/git-hooks/pre-commit/100-verify-submodules-pushed.sh @@ -0,0 +1,98 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +## +## When working with submodules the top level project (Apache Cassandra) needs to commit all submodule +## changes so the top level knows what SHA to use. When working in a development environment it is +## common that multiple commits will exist in both projects, if the submodule has its history +## rewritten, then historic top level commits are no longer valid unless the SHAs are pushed to a +## remote repo; this is what the script attempts to do, make sure all SHAs added to the +## Apache Cassandra are backed up to a remote repo to make the Cassandra SHA buildable. +## + +# Redirect output to stderr. +exec 1>&2 + + +#set -o xtrace +set -o errexit +set -o pipefail +set -o nounset + +bin="$(cd "$(dirname "$0")" > /dev/null; pwd)" + +_log() { + echo -e "[pre-commit]\t$*" +} + +error() { + _log "$@" 1>&2 + exit 1 +} + +# Status Table +# A Added +# C Copied +# D Deleted +# M Modified +# R Renamed +# T Type Changed (i.e. regular file, symlink, submodule, …<200b>) +# U Unmerged +# X Unknown +# B Broken +_main() { + # In case the usage happens at a different layer, make sure to cd to the toplevel + local root_dir + root_dir="$(git rev-parse --show-toplevel)" + cd "$root_dir" + + [[ ! -e .gitmodules ]] && return 0 + local enabled=$(git config --bool cassandra.pre-commit.verify-submodules.enabled || echo true) + [ "$enabled" == "false" ] && return 0 + local submodules=( $(git config --file .gitmodules --get-regexp path | awk '{ print $2 }') ) + + local is_submodule=false + local git_sub_dir + local git_sha + while read status file; do + is_submodule=false + for to_check in "${submodules[*]}"; do + if [[ "$to_check" == "$file" ]]; then + is_submodule=true + break + fi + done + if $is_submodule; then + local enabled=$(git config --bool cassandra.pre-commit.verify-submodule-${file}.enabled || echo true) + [ "$enabled" == "false" ] && continue + _log "Submodule detected: ${file} with status ${status}; attempting a push" + _log "\tTo disable pushes, run" + _log "\t\tgit config --local cassandra.pre-commit.verify-submodules.enabled false" + _log "\tOr" + _log "\t\tgit config --local cassandra.pre-commit.verify-submodule-${file}.enabled false" + git_sub_dir="${file}/.git" + branch="$(git config -f .gitmodules "submodule.${file}.branch")" + [[ -z "${branch:-}" ]] && error "Submodule ${file} does not define a branch" + git_sha="$(git --git-dir "${git_sub_dir}" rev-parse HEAD)" + local remote="$(git --git-dir "${git_sub_dir}" config --get "branch.${branch}.remote" || error "Git branch ${branch} is not set up to track any remote in submodule ${file}")" + git --git-dir "${git_sub_dir}" fetch "${remote}" + git --git-dir "${git_sub_dir}" branch "${remote}/${branch}" --contains "${git_sha}" || error "Git commit ${git_sha} not found in $(git remote get-url "${remote}") on branch ${branch}" + fi + done < <(git diff --cached --name-status) +} + +_main "$@" diff --git a/.build/git/install-git-defaults.sh b/.build/git/install-git-defaults.sh new file mode 100755 index 000000000000..7c26ed5eda7c --- /dev/null +++ b/.build/git/install-git-defaults.sh @@ -0,0 +1,116 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#set -o xtrace +set -o errexit +set -o pipefail +set -o nounset + +bin="$(cd "$(dirname "$0")" > /dev/null; pwd)" + +install_template_script() { + local -r name="$1" + local -r d_dir="$2" + + cat < "$name" +#!/usr/bin/env bash + +# This script is autogenerated by the Apache Cassandra build; DO NOT CHANGE! +# When this script is not found it will be installed automatically by the build +# If an existing script is found, that script will be reloated under ${d_dir} as 000-original.sh + +# Redirect output to stderr. +exec 1>&2 + +# Find all scripts to run +for path in \$(find "$d_dir" -name '*.sh' | perl -e "print sort{(split '/', \\\$a)[-1] <=> (split '/', \\\$b)[-1]}<>"); do + "\$path" "\$@" +done +EOF + chmod a+x "$name" +} + +install_hook() { + local -r git_dir="$1" + local -r hooks_dir="${git_dir}/hooks" + local -r name="$2" + local -r d_dir="${hooks_dir}/${name}.d" + local -r trigger_on_install=$3 + + mkdir "${d_dir}" &> /dev/null || true + local -r script_name="${hooks_dir}/${name}" + local installed=true + if [[ -e "$script_name" ]]; then + # was the script already installed? + if ! grep "This script is autogenerated by the Apache Cassandra build" "$script_name" &> /dev/null ; then + echo "$script_name found, but was not generated by the Apache Cassandra build; please remove or move to ${d_dir}/000-original.sh; creating and moving to ${d_dir} will cause it to run as expected, but won't conflict with hooks this build adds" 1>&2 + exit 1 + else + installed=false + fi + fi + # install all hooks + cp "$bin"/git-hooks/"${name}"/* "$d_dir"/ + + # install coordinator hook + install_template_script "$script_name" "$d_dir" + if $installed && $trigger_on_install ; then + echo "Running script $script_name" + "$script_name" + fi +} + +_install_hooks() { + local git_dir + # make sure to use --git-common-dir and not --git-dir to support worktrees + git_dir="$(git rev-parse --git-common-dir 2> /dev/null || true)" + if [[ -z "${git_dir:-}" ]]; then + # not in a git repo, noop + return 0 + fi + + # make sure hooks directory exists; does not exist by default for worktrees + mkdir -p "${git_dir}/hooks" &> /dev/null || true + + install_hook "$git_dir" "post-checkout" true + install_hook "$git_dir" "post-switch" false + install_hook "$git_dir" "pre-commit" false +} + +_git_config_set() { + local -r name="$1" + # only care about rc + git config --local --get "$name" &> /dev/null +} + +_install_configs() { + # when doing pull, this makes sure submodules are updated + _git_config_set submodule.recurse || git config --local submodule.recurse true +} + +_main() { + local git_dir + # make sure to use --git-common-dir and not --git-dir to support worktrees + git_dir="$(git rev-parse --git-common-dir 2> /dev/null || true)" + # not in a git repo, noop + [[ -z "${git_dir:-}" ]] && return 0 + + _install_configs + _install_hooks +} + +_main "$@" diff --git a/.build/owasp/dependency-check-suppressions.xml b/.build/owasp/dependency-check-suppressions.xml index 16e9a819155f..a2c92ebde30f 100644 --- a/.build/owasp/dependency-check-suppressions.xml +++ b/.build/owasp/dependency-check-suppressions.xml @@ -26,6 +26,12 @@ CVE-2023-44487 + + + ^pkg:maven/io\.netty/netty\-.*@.*$ + CVE-2025-25193 + + ^pkg:maven/com\.fasterxml\.jackson\.core/jackson\-databind@.*$ diff --git a/.build/parent-pom-template.xml b/.build/parent-maven-pom.xml similarity index 96% rename from .build/parent-pom-template.xml rename to .build/parent-maven-pom.xml index 954127b87c17..a42991bca455 100644 --- a/.build/parent-pom-template.xml +++ b/.build/parent-maven-pom.xml @@ -38,7 +38,7 @@ 1.12.13 4.0.20 - 4.1.113.Final + 4.1.119.Final 0.5.1 @@ -403,27 +403,27 @@ org.slf4j slf4j-api - 1.7.36 + 2.0.17 org.slf4j log4j-over-slf4j - 1.7.36 + 2.0.17 org.slf4j jcl-over-slf4j - 1.7.36 + 2.0.17 ch.qos.logback logback-core - 1.2.12 + 1.5.18 ch.qos.logback logback-classic - 1.2.12 + 1.5.18 com.fasterxml.jackson.core @@ -671,12 +671,12 @@ org.apache.cassandra cassandra-all - 4.1-alpha2-SNAPSHOT + @version@ io.dropwizard.metrics metrics-core - 4.2.19 + 4.2.28 org.slf4j @@ -715,6 +715,42 @@ jbcrypt 0.4 + + org.apache.cassandra + cassandra-accord + @version@ + + + org.apache.cassandra + cassandra-all + + + + + org.apache.cassandra + cassandra-accord + @version@ + tests + test + + + org.junit.jupiter + junit-jupiter-api + + + org.junit.jupiter + junit-jupiter-engine + + + ch.qos.logback + logback-classic + + + org.apache.cassandra + cassandra-all + + + io.airlift airline @@ -800,7 +836,7 @@ io.netty netty-tcnative-boringssl-static - 2.0.61.Final + 2.0.70.Final org.bouncycastle @@ -940,9 +976,9 @@ - com.datastax.cassandra + org.apache.cassandra cassandra-driver-core - 3.11.5 + 3.12.1 shaded diff --git a/.build/rat-include-accord.sh b/.build/rat-include-accord.sh new file mode 100755 index 000000000000..3c4945c5e429 --- /dev/null +++ b/.build/rat-include-accord.sh @@ -0,0 +1,25 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#set -o xtrace +set -o errexit +set -o pipefail +set -o nounset + +home="$(cd "$(dirname "$0")"/.. > /dev/null; pwd)" + +git --git-dir="$home"/modules/accord/.git ls-tree -r HEAD --name-only | sed 's;^;modules/accord/;' diff --git a/.build/run-tests.sh b/.build/run-tests.sh index ad2e04b40b12..0c5c5ce558ba 100755 --- a/.build/run-tests.sh +++ b/.build/run-tests.sh @@ -66,7 +66,7 @@ print_help() { # legacy argument handling case ${1} in - "build_dtest_jars" | "stress-test" | "fqltool-test" | "microbench" | "test-burn" | "long-test" | "cqlsh-test" | "simulator-dtest" | "test" | "test-cdc" | "test-compression" | "test-oa" | "test-system-keyspace-directory" | "test-latest" | "jvm-dtest" | "jvm-dtest-upgrade" | "jvm-dtest-novnode" | "jvm-dtest-upgrade-novnode") + "build_dtest_jars" | "stress-test" | "fqltool-test" | "sstableloader-test" | "microbench" | "test-burn" | "long-test" | "cqlsh-test" | "simulator-dtest" | "test" | "test-cdc" | "test-compression" | "test-oa" | "test-system-keyspace-directory" | "test-latest" | "jvm-dtest" | "jvm-dtest-upgrade" | "jvm-dtest-novnode" | "jvm-dtest-upgrade-novnode") test_type="-a ${1}" if [[ -z ${2} ]]; then test_list="" @@ -285,7 +285,7 @@ _main() { # check split_chunk is compatible with target (if not a regexp) if [[ "${_split_chunk}" =~ ^\d+/\d+$ ]] && [[ "1/1" != "${split_chunk}" ]] ; then case ${target} in - "stress-test" | "fqltool-test" | "microbench" | "cqlsh-test" | "simulator-dtest") + "stress-test" | "fqltool-test" | "sstableloader-test" | "microbench" | "cqlsh-test" | "simulator-dtest") error 1 "Target ${target} does not suport splits." ;; *) @@ -344,6 +344,11 @@ _main() { ant fqltool-build-test ${ANT_TEST_OPTS} ant $target ${ANT_TEST_OPTS} || echo "failed ${target} ${split_chunk}" ;; + "sstableloader-test") + # hard fail on test compilation, but dont fail the test run so unstable test reports are processed + ant sstableloader-build-test ${ANT_TEST_OPTS} + ant $target ${ANT_TEST_OPTS} || echo "failed ${target} ${split_chunk}" + ;; "microbench") ant $target ${ANT_TEST_OPTS} -Dmaven.test.failure.ignore=true ;; diff --git a/.build/sh/bump-accord.sh b/.build/sh/bump-accord.sh new file mode 100755 index 000000000000..43a476f3edfb --- /dev/null +++ b/.build/sh/bump-accord.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#set -o xtrace +set -o errexit +set -o pipefail +set -o nounset + +_main() { + local home + home="$(git rev-parse --show-toplevel)" + cd "$home" + + git submodule status modules/accord + echo "Is this the correct SHA? [y/n; default=y]" + read correct + if [[ "${correct:-y}" != "y" ]]; then + echo "Please update Accord's SHA and try again" + exit 1 + fi + git commit -m "Change Accord to $(cd modules/accord; git log -1 --format='%h: %B')" modules/accord +} + +_main "$@" diff --git a/.build/sh/change-submodule-accord.sh b/.build/sh/change-submodule-accord.sh new file mode 100755 index 000000000000..997db3dc2c29 --- /dev/null +++ b/.build/sh/change-submodule-accord.sh @@ -0,0 +1,25 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#set -o xtrace +set -o errexit +set -o pipefail +set -o nounset + +bin="$(cd "$(dirname "$0")" > /dev/null; pwd)" + +"$bin"/change-submodule.sh modules/accord 'https://github.com/apache/cassandra-accord.git' trunk diff --git a/.build/sh/change-submodule.sh b/.build/sh/change-submodule.sh new file mode 100755 index 000000000000..6ab2d3795afd --- /dev/null +++ b/.build/sh/change-submodule.sh @@ -0,0 +1,52 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#set -o xtrace +set -o errexit +set -o pipefail +set -o nounset + +_usage() { + cat <&2 + exit 1 +} + +_usage() { + cat < + + + 4.0.0 + + org.apache.cassandra + cassandra-parent + @version@ + @final.name@-parent.pom + + cassandra-sstableloader + @version@ + Apache Cassandra SSTableLoader + Standalone SSTableLoader for Apache Cassandra. + https://cassandra.apache.org + 2025 + + + The Apache Software License, Version 2.0 + https://www.apache.org/licenses/LICENSE-2.0.txt + + + + scm:https://gitbox.apache.org/repos/asf/cassandra.git + scm:https://gitbox.apache.org/repos/asf/cassandra.git + https://gitbox.apache.org/repos/asf?p=cassandra.git + + + + org.apache.cassandra + cassandra-all + + + diff --git a/.circleci/config.yml b/.circleci/config.yml index 864919b8f418..edcbb164c35e 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -53,7 +53,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_JVM_UPGRADE_DTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_JVM_UPGRADE_DTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_JVM_UPGRADE_DTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-jvm-dtest-some\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-jvm-dtest-some $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_JVM_UPGRADE_DTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_JVM_UPGRADE_DTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_JVM_UPGRADE_DTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-jvm-dtest-some\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-jvm-dtest-some $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -84,6 +84,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -191,6 +193,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -308,6 +312,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -373,6 +379,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -431,7 +439,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_FQLTOOL_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_FQLTOOL_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS_FQLTOOL} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=fqltool-test\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant fqltool-test $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_FQLTOOL_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_FQLTOOL_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS_FQLTOOL} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=fqltool-test\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant fqltool-test $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -462,6 +470,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -595,6 +605,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -660,6 +672,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -778,6 +792,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -886,6 +902,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -1040,6 +1058,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -1172,6 +1192,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -1281,6 +1303,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -1398,6 +1422,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -1457,7 +1483,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_STRESS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_STRESS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS_STRESS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=stress-test-some\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant stress-test-some $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_STRESS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_STRESS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS_STRESS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=stress-test-some\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant stress-test-some $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -1488,6 +1514,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -1606,6 +1634,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -1714,6 +1744,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -1773,7 +1805,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-system-keyspace-directory\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-system-keyspace-directory $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-system-keyspace-directory\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-system-keyspace-directory $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -1804,6 +1836,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -1912,6 +1946,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -2045,6 +2081,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -2199,6 +2237,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -2316,6 +2356,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -2375,7 +2417,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-latest\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-latest $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-latest\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-latest $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -2406,6 +2448,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -2514,6 +2558,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -2622,6 +2668,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -2681,7 +2729,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_STRESS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_STRESS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS_STRESS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=stress-test-some\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant stress-test-some $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_STRESS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_STRESS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS_STRESS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=stress-test-some\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant stress-test-some $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -2712,6 +2760,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -2735,6 +2785,80 @@ jobs: - REPEATED_ANT_TEST_COUNT: 500 - JAVA_HOME: /usr/lib/jvm/java-17-openjdk-amd64 - JDK_HOME: /usr/lib/jvm/java-17-openjdk-amd64 + j11_utests_sstableloader: + docker: + - image: apache/cassandra-testing-ubuntu2004-java11-w-dependencies:latest + resource_class: medium + working_directory: ~/ + shell: /bin/bash -eo pipefail -l + parallelism: 1 + steps: + - attach_workspace: + at: /home/cassandra + - run: + name: Run Unit Tests (sstableloader-test) + command: | + export PATH=$JAVA_HOME/bin:$PATH + time mv ~/cassandra /tmp + cd /tmp/cassandra + if [ -d ~/dtest_jars ]; then + cp ~/dtest_jars/dtest* /tmp/cassandra/build/ + fi + ant sstableloader-test -Dno-build-test=true + no_output_timeout: 15m + - store_test_results: + path: /tmp/cassandra/build/test/output/ + - store_artifacts: + path: /tmp/cassandra/build/test/output + destination: junitxml + - store_artifacts: + path: /tmp/cassandra/build/test/logs + destination: logs + environment: + - ANT_HOME: /usr/share/ant + - LANG: en_US.UTF-8 + - KEEP_TEST_DIR: true + - DEFAULT_DIR: /home/cassandra/cassandra-dtest + - PYTHONIOENCODING: utf-8 + - PYTHONUNBUFFERED: true + - CASS_DRIVER_NO_EXTENSIONS: true + - CASS_DRIVER_NO_CYTHON: true + - CASSANDRA_SKIP_SYNC: true + - DTEST_REPO: https://github.com/apache/cassandra-dtest.git + - DTEST_BRANCH: trunk + - CCM_MAX_HEAP_SIZE: 1024M + - CCM_HEAP_NEWSIZE: 256M + - REPEATED_TESTS_STOP_ON_FAILURE: false + - REPEATED_UTESTS: null + - REPEATED_UTESTS_COUNT: 500 + - REPEATED_UTESTS_FQLTOOL: null + - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 + - REPEATED_UTESTS_LONG: null + - REPEATED_UTESTS_LONG_COUNT: 100 + - REPEATED_UTESTS_STRESS: null + - REPEATED_UTESTS_STRESS_COUNT: 500 + - REPEATED_SIMULATOR_DTESTS: null + - REPEATED_SIMULATOR_DTESTS_COUNT: 500 + - REPEATED_JVM_DTESTS: null + - REPEATED_JVM_DTESTS_COUNT: 500 + - REPEATED_JVM_UPGRADE_DTESTS: null + - REPEATED_JVM_UPGRADE_DTESTS_COUNT: 500 + - REPEATED_DTESTS: null + - REPEATED_DTESTS_COUNT: 500 + - REPEATED_LARGE_DTESTS: null + - REPEATED_LARGE_DTESTS_COUNT: 100 + - REPEATED_UPGRADE_DTESTS: null + - REPEATED_UPGRADE_DTESTS_COUNT: 25 + - REPEATED_ANT_TEST_TARGET: testsome + - REPEATED_ANT_TEST_CLASS: null + - REPEATED_ANT_TEST_METHODS: null + - REPEATED_ANT_TEST_VNODES: false + - REPEATED_ANT_TEST_COUNT: 500 + - JAVA_HOME: /usr/lib/jvm/java-11-openjdk-amd64 + - JDK_HOME: /usr/lib/jvm/java-11-openjdk-amd64 + - CASSANDRA_USE_JDK11: true j11_utests_compression_repeat: docker: - image: apache/cassandra-testing-ubuntu2004-java11-w-dependencies:latest @@ -2770,7 +2894,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-compression\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-compression $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-compression\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-compression $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -2801,6 +2925,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -2919,6 +3045,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -3019,6 +3147,7 @@ jobs: $target == "test-oa" || \ $target == "test-system-keyspace-directory" || \ $target == "fqltool-test" || \ + $target == "sstableloader-test" || \ $target == "long-test" || \ $target == "stress-test" || \ $target == "test-simulator-dtest" ]]; then @@ -3114,6 +3243,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -3199,6 +3330,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -3223,6 +3356,97 @@ jobs: - JAVA_HOME: /usr/lib/jvm/java-11-openjdk-amd64 - JDK_HOME: /usr/lib/jvm/java-11-openjdk-amd64 - CASSANDRA_USE_JDK11: true + j17_utests_sstableloader_repeat: + docker: + - image: apache/cassandra-testing-ubuntu2004-java11:latest + resource_class: medium + working_directory: ~/ + shell: /bin/bash -eo pipefail -l + parallelism: 4 + steps: + - attach_workspace: + at: /home/cassandra + - run: + name: Log Environment Information + command: | + echo '*** id ***' + id + echo '*** cat /proc/cpuinfo ***' + cat /proc/cpuinfo + echo '*** free -m ***' + free -m + echo '*** df -m ***' + df -m + echo '*** ifconfig -a ***' + ifconfig -a + echo '*** uname -a ***' + uname -a + echo '*** mount ***' + mount + echo '*** env ***' + env + echo '*** java ***' + which java + java -version + - run: + name: Repeatedly run new or modifed JUnit tests + no_output_timeout: 15m + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_SSTABLELOADER_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_SSTABLELOADER_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS_SSTABLELOADER} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=sstableloader-test\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant sstableloader-test $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + - store_test_results: + path: /tmp/results/repeated_utests/output + - store_artifacts: + path: /tmp/results/repeated_utests/stdout + destination: stdout + - store_artifacts: + path: /tmp/results/repeated_utests/output + destination: junitxml + - store_artifacts: + path: /tmp/results/repeated_utests/logs + destination: logs + environment: + - ANT_HOME: /usr/share/ant + - LANG: en_US.UTF-8 + - KEEP_TEST_DIR: true + - DEFAULT_DIR: /home/cassandra/cassandra-dtest + - PYTHONIOENCODING: utf-8 + - PYTHONUNBUFFERED: true + - CASS_DRIVER_NO_EXTENSIONS: true + - CASS_DRIVER_NO_CYTHON: true + - CASSANDRA_SKIP_SYNC: true + - DTEST_REPO: https://github.com/apache/cassandra-dtest.git + - DTEST_BRANCH: trunk + - CCM_MAX_HEAP_SIZE: 1024M + - CCM_HEAP_NEWSIZE: 256M + - REPEATED_TESTS_STOP_ON_FAILURE: false + - REPEATED_UTESTS: null + - REPEATED_UTESTS_COUNT: 500 + - REPEATED_UTESTS_FQLTOOL: null + - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 + - REPEATED_UTESTS_LONG: null + - REPEATED_UTESTS_LONG_COUNT: 100 + - REPEATED_UTESTS_STRESS: null + - REPEATED_UTESTS_STRESS_COUNT: 500 + - REPEATED_SIMULATOR_DTESTS: null + - REPEATED_SIMULATOR_DTESTS_COUNT: 500 + - REPEATED_JVM_DTESTS: null + - REPEATED_JVM_DTESTS_COUNT: 500 + - REPEATED_JVM_UPGRADE_DTESTS: null + - REPEATED_JVM_UPGRADE_DTESTS_COUNT: 500 + - REPEATED_DTESTS: null + - REPEATED_DTESTS_COUNT: 500 + - REPEATED_LARGE_DTESTS: null + - REPEATED_LARGE_DTESTS_COUNT: 100 + - REPEATED_UPGRADE_DTESTS: null + - REPEATED_UPGRADE_DTESTS_COUNT: 25 + - REPEATED_ANT_TEST_TARGET: testsome + - REPEATED_ANT_TEST_CLASS: null + - REPEATED_ANT_TEST_METHODS: null + - REPEATED_ANT_TEST_VNODES: false + - REPEATED_ANT_TEST_COUNT: 500 + - JAVA_HOME: /usr/lib/jvm/java-17-openjdk-amd64 + - JDK_HOME: /usr/lib/jvm/java-17-openjdk-amd64 j11_dtests_large_vnode_repeat: docker: - image: apache/cassandra-testing-ubuntu2004-java11-w-dependencies:latest @@ -3332,6 +3556,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -3417,6 +3643,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -3476,7 +3704,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-system-keyspace-directory\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-system-keyspace-directory $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-system-keyspace-directory\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-system-keyspace-directory $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -3507,6 +3735,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -3625,6 +3855,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -3758,6 +3990,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -3876,6 +4110,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -3976,6 +4212,7 @@ jobs: $target == "test-oa" || \ $target == "test-system-keyspace-directory" || \ $target == "fqltool-test" || \ + $target == "sstableloader-test" || \ $target == "long-test" || \ $target == "stress-test" || \ $target == "test-simulator-dtest" ]]; then @@ -4071,6 +4308,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -4177,6 +4416,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -4294,6 +4535,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -4353,7 +4596,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-cdc\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-cdc $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-cdc\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-cdc $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -4384,6 +4627,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -4501,6 +4746,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -4572,6 +4819,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -4631,7 +4880,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_JVM_DTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_JVM_DTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_JVM_DTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=true\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-jvm-dtest-some\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-jvm-dtest-some $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_JVM_DTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_JVM_DTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_JVM_DTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=true\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-jvm-dtest-some\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-jvm-dtest-some $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -4662,6 +4911,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -4721,7 +4972,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_JVM_DTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_JVM_DTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_JVM_DTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=true\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-jvm-dtest-some\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-jvm-dtest-some $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_JVM_DTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_JVM_DTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_JVM_DTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=true\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-jvm-dtest-some\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-jvm-dtest-some $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -4752,6 +5003,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -4855,6 +5108,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -4927,6 +5182,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -4986,7 +5243,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=testsome\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant testsome $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=testsome\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant testsome $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -5017,6 +5274,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -5130,6 +5389,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -5248,6 +5509,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -5320,6 +5583,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -5379,7 +5644,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-cdc\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-cdc $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-cdc\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-cdc $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -5410,6 +5675,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -5517,6 +5784,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -5649,6 +5918,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -5767,6 +6038,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -5885,6 +6158,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -6018,6 +6293,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -6076,7 +6353,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-latest\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-latest $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-latest\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-latest $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -6107,6 +6384,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -6192,6 +6471,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -6325,6 +6606,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -6384,7 +6667,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-compression\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-compression $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-compression\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-compression $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -6415,6 +6698,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -6486,6 +6771,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -6570,6 +6857,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -6678,6 +6967,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -6737,7 +7028,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_SIMULATOR_DTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_SIMULATOR_DTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_SIMULATOR_DTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-simulator-dtest\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-simulator-dtest $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_SIMULATOR_DTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_SIMULATOR_DTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_SIMULATOR_DTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-simulator-dtest\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-simulator-dtest $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -6768,6 +7059,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -6827,7 +7120,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_JVM_DTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_JVM_DTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_JVM_DTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-jvm-dtest-some\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-jvm-dtest-some $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_JVM_DTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_JVM_DTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_JVM_DTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-jvm-dtest-some\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-jvm-dtest-some $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -6858,6 +7151,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -6967,6 +7262,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -7031,6 +7328,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -7139,6 +7438,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -7257,6 +7558,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -7365,6 +7668,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -7473,6 +7778,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -7590,6 +7897,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -7697,6 +8006,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -7814,6 +8125,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -7885,6 +8198,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -7991,6 +8306,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -8050,7 +8367,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-oa\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-oa $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-oa\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-oa $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -8081,6 +8398,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -8166,6 +8485,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -8230,6 +8551,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -8315,6 +8638,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -8424,6 +8749,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -8541,6 +8868,173 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 + - REPEATED_UTESTS_LONG: null + - REPEATED_UTESTS_LONG_COUNT: 100 + - REPEATED_UTESTS_STRESS: null + - REPEATED_UTESTS_STRESS_COUNT: 500 + - REPEATED_SIMULATOR_DTESTS: null + - REPEATED_SIMULATOR_DTESTS_COUNT: 500 + - REPEATED_JVM_DTESTS: null + - REPEATED_JVM_DTESTS_COUNT: 500 + - REPEATED_JVM_UPGRADE_DTESTS: null + - REPEATED_JVM_UPGRADE_DTESTS_COUNT: 500 + - REPEATED_DTESTS: null + - REPEATED_DTESTS_COUNT: 500 + - REPEATED_LARGE_DTESTS: null + - REPEATED_LARGE_DTESTS_COUNT: 100 + - REPEATED_UPGRADE_DTESTS: null + - REPEATED_UPGRADE_DTESTS_COUNT: 25 + - REPEATED_ANT_TEST_TARGET: testsome + - REPEATED_ANT_TEST_CLASS: null + - REPEATED_ANT_TEST_METHODS: null + - REPEATED_ANT_TEST_VNODES: false + - REPEATED_ANT_TEST_COUNT: 500 + - JAVA_HOME: /usr/lib/jvm/java-17-openjdk-amd64 + - JDK_HOME: /usr/lib/jvm/java-17-openjdk-amd64 + j11_utests_sstableloader_repeat: + docker: + - image: apache/cassandra-testing-ubuntu2004-java11-w-dependencies:latest + resource_class: medium + working_directory: ~/ + shell: /bin/bash -eo pipefail -l + parallelism: 4 + steps: + - attach_workspace: + at: /home/cassandra + - run: + name: Log Environment Information + command: | + echo '*** id ***' + id + echo '*** cat /proc/cpuinfo ***' + cat /proc/cpuinfo + echo '*** free -m ***' + free -m + echo '*** df -m ***' + df -m + echo '*** ifconfig -a ***' + ifconfig -a + echo '*** uname -a ***' + uname -a + echo '*** mount ***' + mount + echo '*** env ***' + env + echo '*** java ***' + which java + java -version + - run: + name: Repeatedly run new or modifed JUnit tests + no_output_timeout: 15m + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_SSTABLELOADER_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_SSTABLELOADER_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS_SSTABLELOADER} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=sstableloader-test\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant sstableloader-test $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + - store_test_results: + path: /tmp/results/repeated_utests/output + - store_artifacts: + path: /tmp/results/repeated_utests/stdout + destination: stdout + - store_artifacts: + path: /tmp/results/repeated_utests/output + destination: junitxml + - store_artifacts: + path: /tmp/results/repeated_utests/logs + destination: logs + environment: + - ANT_HOME: /usr/share/ant + - LANG: en_US.UTF-8 + - KEEP_TEST_DIR: true + - DEFAULT_DIR: /home/cassandra/cassandra-dtest + - PYTHONIOENCODING: utf-8 + - PYTHONUNBUFFERED: true + - CASS_DRIVER_NO_EXTENSIONS: true + - CASS_DRIVER_NO_CYTHON: true + - CASSANDRA_SKIP_SYNC: true + - DTEST_REPO: https://github.com/apache/cassandra-dtest.git + - DTEST_BRANCH: trunk + - CCM_MAX_HEAP_SIZE: 1024M + - CCM_HEAP_NEWSIZE: 256M + - REPEATED_TESTS_STOP_ON_FAILURE: false + - REPEATED_UTESTS: null + - REPEATED_UTESTS_COUNT: 500 + - REPEATED_UTESTS_FQLTOOL: null + - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 + - REPEATED_UTESTS_LONG: null + - REPEATED_UTESTS_LONG_COUNT: 100 + - REPEATED_UTESTS_STRESS: null + - REPEATED_UTESTS_STRESS_COUNT: 500 + - REPEATED_SIMULATOR_DTESTS: null + - REPEATED_SIMULATOR_DTESTS_COUNT: 500 + - REPEATED_JVM_DTESTS: null + - REPEATED_JVM_DTESTS_COUNT: 500 + - REPEATED_JVM_UPGRADE_DTESTS: null + - REPEATED_JVM_UPGRADE_DTESTS_COUNT: 500 + - REPEATED_DTESTS: null + - REPEATED_DTESTS_COUNT: 500 + - REPEATED_LARGE_DTESTS: null + - REPEATED_LARGE_DTESTS_COUNT: 100 + - REPEATED_UPGRADE_DTESTS: null + - REPEATED_UPGRADE_DTESTS_COUNT: 25 + - REPEATED_ANT_TEST_TARGET: testsome + - REPEATED_ANT_TEST_CLASS: null + - REPEATED_ANT_TEST_METHODS: null + - REPEATED_ANT_TEST_VNODES: false + - REPEATED_ANT_TEST_COUNT: 500 + - JAVA_HOME: /usr/lib/jvm/java-11-openjdk-amd64 + - JDK_HOME: /usr/lib/jvm/java-11-openjdk-amd64 + - CASSANDRA_USE_JDK11: true + j17_utests_sstableloader: + docker: + - image: apache/cassandra-testing-ubuntu2004-java11:latest + resource_class: medium + working_directory: ~/ + shell: /bin/bash -eo pipefail -l + parallelism: 1 + steps: + - attach_workspace: + at: /home/cassandra + - run: + name: Run Unit Tests (sstableloader-test) + command: | + export PATH=$JAVA_HOME/bin:$PATH + time mv ~/cassandra /tmp + cd /tmp/cassandra + if [ -d ~/dtest_jars ]; then + cp ~/dtest_jars/dtest* /tmp/cassandra/build/ + fi + ant sstableloader-test -Dno-build-test=true + no_output_timeout: 15m + - store_test_results: + path: /tmp/cassandra/build/test/output/ + - store_artifacts: + path: /tmp/cassandra/build/test/output + destination: junitxml + - store_artifacts: + path: /tmp/cassandra/build/test/logs + destination: logs + environment: + - ANT_HOME: /usr/share/ant + - LANG: en_US.UTF-8 + - KEEP_TEST_DIR: true + - DEFAULT_DIR: /home/cassandra/cassandra-dtest + - PYTHONIOENCODING: utf-8 + - PYTHONUNBUFFERED: true + - CASS_DRIVER_NO_EXTENSIONS: true + - CASS_DRIVER_NO_CYTHON: true + - CASSANDRA_SKIP_SYNC: true + - DTEST_REPO: https://github.com/apache/cassandra-dtest.git + - DTEST_BRANCH: trunk + - CCM_MAX_HEAP_SIZE: 1024M + - CCM_HEAP_NEWSIZE: 256M + - REPEATED_TESTS_STOP_ON_FAILURE: false + - REPEATED_UTESTS: null + - REPEATED_UTESTS_COUNT: 500 + - REPEATED_UTESTS_FQLTOOL: null + - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -8599,7 +9093,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_FQLTOOL_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_FQLTOOL_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS_FQLTOOL} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=fqltool-test\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant fqltool-test $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_FQLTOOL_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_FQLTOOL_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS_FQLTOOL} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=fqltool-test\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant fqltool-test $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -8630,6 +9124,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -8714,6 +9210,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -8772,7 +9270,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=testsome\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant testsome $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=testsome\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant testsome $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -8803,6 +9301,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -8875,6 +9375,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -8933,7 +9435,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-oa\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-oa $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-oa\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-oa $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -8964,6 +9466,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -9048,6 +9552,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -9107,7 +9613,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_JVM_DTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_JVM_DTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_JVM_DTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-jvm-dtest-some\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-jvm-dtest-some $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_JVM_DTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_JVM_DTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_JVM_DTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-jvm-dtest-some\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-jvm-dtest-some $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -9138,6 +9644,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -9270,6 +9778,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -9328,7 +9838,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_LONG_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_LONG_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS_LONG} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=long-testsome\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant long-testsome $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_LONG_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_LONG_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS_LONG} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=long-testsome\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant long-testsome $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -9359,6 +9869,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -9417,7 +9929,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_LONG_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_LONG_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS_LONG} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=long-testsome\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant long-testsome $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_LONG_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_LONG_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS_LONG} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=long-testsome\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant long-testsome $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -9448,6 +9960,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -9481,302 +9995,587 @@ workflows: - j11_build: requires: - start_j11_build + upstream: + start_j11_build: + - success - start_j11_unit_tests: type: approval - j11_unit_tests: requires: - start_j11_unit_tests - j11_build + upstream: + start_j11_unit_tests: + - success + j11_build: + - success - start_j11_jvm_dtests: type: approval - j11_jvm_dtests: requires: - start_j11_jvm_dtests - j11_build + upstream: + start_j11_jvm_dtests: + - success + j11_build: + - success - start_j11_jvm_dtests_latest_vnode: type: approval - j11_jvm_dtests_latest_vnode: requires: - start_j11_jvm_dtests_latest_vnode - j11_build + upstream: + start_j11_jvm_dtests_latest_vnode: + - success + j11_build: + - success - start_j17_jvm_dtests: type: approval - j17_jvm_dtests: requires: - start_j17_jvm_dtests - j11_build + upstream: + start_j17_jvm_dtests: + - success + j11_build: + - success - start_j17_jvm_dtests_latest_vnode: type: approval - j17_jvm_dtests_latest_vnode: requires: - start_j17_jvm_dtests_latest_vnode - j11_build + upstream: + start_j17_jvm_dtests_latest_vnode: + - success + j11_build: + - success - start_j11_simulator_dtests: type: approval - j11_simulator_dtests: requires: - start_j11_simulator_dtests - j11_build + upstream: + start_j11_simulator_dtests: + - success + j11_build: + - success - start_j11_cqlshlib_tests: type: approval - j11_cqlshlib_tests: requires: - start_j11_cqlshlib_tests - j11_build + upstream: + start_j11_cqlshlib_tests: + - success + j11_build: + - success - start_j11_cqlshlib_cython_tests: type: approval - j11_cqlshlib_cython_tests: requires: - start_j11_cqlshlib_cython_tests - j11_build + upstream: + start_j11_cqlshlib_cython_tests: + - success + j11_build: + - success - start_j17_cqlshlib_tests: type: approval - j17_cqlshlib_tests: requires: - start_j17_cqlshlib_tests - j11_build + upstream: + start_j17_cqlshlib_tests: + - success + j11_build: + - success - start_j17_cqlshlib_cython_tests: type: approval - j17_cqlshlib_cython_tests: requires: - start_j17_cqlshlib_cython_tests - j11_build + upstream: + start_j17_cqlshlib_cython_tests: + - success + j11_build: + - success - start_j17_unit_tests: type: approval - j17_unit_tests: requires: - start_j17_unit_tests - j11_build + upstream: + start_j17_unit_tests: + - success + j11_build: + - success - start_j11_utests_oa: type: approval - j11_utests_oa: requires: - start_j11_utests_oa - j11_build + upstream: + start_j11_utests_oa: + - success + j11_build: + - success - start_j17_utests_oa: type: approval - j17_utests_oa: requires: - start_j17_utests_oa - j11_build + upstream: + start_j17_utests_oa: + - success + j11_build: + - success - start_j11_utests_long: type: approval - j11_utests_long: requires: - start_j11_utests_long - j11_build + upstream: + start_j11_utests_long: + - success + j11_build: + - success - start_j17_utests_long: type: approval - j17_utests_long: requires: - start_j17_utests_long - j11_build + upstream: + start_j17_utests_long: + - success + j11_build: + - success - start_j11_utests_cdc: type: approval - j11_utests_cdc: requires: - start_j11_utests_cdc - j11_build + upstream: + start_j11_utests_cdc: + - success + j11_build: + - success - start_j17_utests_cdc: type: approval - j17_utests_cdc: requires: - start_j17_utests_cdc - j11_build + upstream: + start_j17_utests_cdc: + - success + j11_build: + - success - start_j11_utests_compression: type: approval - j11_utests_compression: requires: - start_j11_utests_compression - j11_build + upstream: + start_j11_utests_compression: + - success + j11_build: + - success - start_j17_utests_compression: type: approval - j17_utests_compression: requires: - start_j17_utests_compression - j11_build + upstream: + start_j17_utests_compression: + - success + j11_build: + - success - start_j11_utests_latest: type: approval - j11_utests_latest: requires: - start_j11_utests_latest - j11_build + upstream: + start_j11_utests_latest: + - success + j11_build: + - success - start_j17_utests_latest: type: approval - j17_utests_latest: requires: - start_j17_utests_latest - j11_build + upstream: + start_j17_utests_latest: + - success + j11_build: + - success - start_j11_utests_stress: type: approval - j11_utests_stress: requires: - start_j11_utests_stress - j11_build + upstream: + start_j11_utests_stress: + - success + j11_build: + - success - start_j17_utests_stress: type: approval - j17_utests_stress: requires: - start_j17_utests_stress - j11_build + upstream: + start_j17_utests_stress: + - success + j11_build: + - success - start_j11_utests_fqltool: type: approval - j11_utests_fqltool: requires: - start_j11_utests_fqltool - j11_build + upstream: + start_j11_utests_fqltool: + - success + j11_build: + - success - start_j17_utests_fqltool: type: approval - j17_utests_fqltool: requires: - start_j17_utests_fqltool - j11_build + upstream: + start_j17_utests_fqltool: + - success + j11_build: + - success + - start_j11_utests_sstableloader: + type: approval + - j11_utests_sstableloader: + requires: + - start_j11_utests_sstableloader + - j11_build + upstream: + start_j11_utests_sstableloader: + - success + j11_build: + - success + - start_j17_utests_sstableloader: + type: approval + - j17_utests_sstableloader: + requires: + - start_j17_utests_sstableloader + - j11_build + upstream: + start_j17_utests_sstableloader: + - success + j11_build: + - success - start_j11_utests_system_keyspace_directory: type: approval - j11_utests_system_keyspace_directory: requires: - start_j11_utests_system_keyspace_directory - j11_build + upstream: + start_j11_utests_system_keyspace_directory: + - success + j11_build: + - success - start_j17_utests_system_keyspace_directory: type: approval - j17_utests_system_keyspace_directory: requires: - start_j17_utests_system_keyspace_directory - j11_build + upstream: + start_j17_utests_system_keyspace_directory: + - success + j11_build: + - success - start_j11_dtest_jars_build: type: approval - j11_dtest_jars_build: requires: - j11_build - start_j11_dtest_jars_build + upstream: + j11_build: + - success + start_j11_dtest_jars_build: + - success - start_jvm_upgrade_dtests: type: approval - j11_jvm_upgrade_dtests: requires: - start_jvm_upgrade_dtests - j11_dtest_jars_build + upstream: + start_jvm_upgrade_dtests: + - success + j11_dtest_jars_build: + - success - start_j11_dtests: type: approval - j11_dtests: requires: - start_j11_dtests - j11_build + upstream: + start_j11_dtests: + - success + j11_build: + - success - start_j11_dtests_vnode: type: approval - j11_dtests_vnode: requires: - start_j11_dtests_vnode - j11_build + upstream: + start_j11_dtests_vnode: + - success + j11_build: + - success - start_j11_dtests_latest: type: approval - j11_dtests_latest: requires: - start_j11_dtests_latest - j11_build + upstream: + start_j11_dtests_latest: + - success + j11_build: + - success - start_j17_dtests: type: approval - j17_dtests: requires: - start_j17_dtests - j11_build + upstream: + start_j17_dtests: + - success + j11_build: + - success - start_j17_dtests_vnode: type: approval - j17_dtests_vnode: requires: - start_j17_dtests_vnode - j11_build + upstream: + start_j17_dtests_vnode: + - success + j11_build: + - success - start_j17_dtests_latest: type: approval - j17_dtests_latest: requires: - start_j17_dtests_latest - j11_build + upstream: + start_j17_dtests_latest: + - success + j11_build: + - success - start_j11_dtests_large: type: approval - j11_dtests_large: requires: - start_j11_dtests_large - j11_build + upstream: + start_j11_dtests_large: + - success + j11_build: + - success - start_j11_dtests_large_vnode: type: approval - j11_dtests_large_vnode: requires: - start_j11_dtests_large_vnode - j11_build + upstream: + start_j11_dtests_large_vnode: + - success + j11_build: + - success - start_j17_dtests_large: type: approval - j17_dtests_large: requires: - start_j17_dtests_large - j11_build + upstream: + start_j17_dtests_large: + - success + j11_build: + - success - start_j17_dtests_large_vnode: type: approval - j17_dtests_large_vnode: requires: - start_j17_dtests_large_vnode - j11_build + upstream: + start_j17_dtests_large_vnode: + - success + j11_build: + - success - start_j11_cqlsh_tests: type: approval - j11_cqlsh_dtests_py38: requires: - start_j11_cqlsh_tests - j11_build + upstream: + start_j11_cqlsh_tests: + - success + j11_build: + - success - j11_cqlsh_dtests_py311: requires: - start_j11_cqlsh_tests - j11_build + upstream: + start_j11_cqlsh_tests: + - success + j11_build: + - success - j11_cqlsh_dtests_py38_vnode: requires: - start_j11_cqlsh_tests - j11_build + upstream: + start_j11_cqlsh_tests: + - success + j11_build: + - success - j11_cqlsh_dtests_py311_vnode: requires: - start_j11_cqlsh_tests - j11_build + upstream: + start_j11_cqlsh_tests: + - success + j11_build: + - success - start_j11_cqlsh_tests_latest: type: approval - j11_cqlsh_dtests_py38_latest: requires: - start_j11_cqlsh_tests_latest - j11_build + upstream: + start_j11_cqlsh_tests_latest: + - success + j11_build: + - success - j11_cqlsh_dtests_py311_latest: requires: - start_j11_cqlsh_tests_latest - j11_build + upstream: + start_j11_cqlsh_tests_latest: + - success + j11_build: + - success - start_j17_cqlsh_tests: type: approval - j17_cqlsh_dtests_py38: requires: - start_j17_cqlsh_tests - j11_build + upstream: + start_j17_cqlsh_tests: + - success + j11_build: + - success - j17_cqlsh_dtests_py311: requires: - start_j17_cqlsh_tests - j11_build + upstream: + start_j17_cqlsh_tests: + - success + j11_build: + - success - j17_cqlsh_dtests_py38_vnode: requires: - start_j17_cqlsh_tests - j11_build + upstream: + start_j17_cqlsh_tests: + - success + j11_build: + - success - j17_cqlsh_dtests_py311_vnode: requires: - start_j17_cqlsh_tests - j11_build + upstream: + start_j17_cqlsh_tests: + - success + j11_build: + - success - start_j17_cqlsh_tests_latest: type: approval - j17_cqlsh_dtests_py38_latest: requires: - start_j17_cqlsh_tests_latest - j11_build + upstream: + start_j17_cqlsh_tests_latest: + - success + j11_build: + - success - j17_cqlsh_dtests_py311_latest: requires: - start_j17_cqlsh_tests_latest - j11_build + upstream: + start_j17_cqlsh_tests_latest: + - success + j11_build: + - success - start_j11_upgrade_dtests: type: approval - j11_upgrade_dtests: requires: - start_j11_upgrade_dtests - j11_build + upstream: + start_j11_upgrade_dtests: + - success + j11_build: + - success java11_pre-commit_tests: jobs: - start_pre-commit_tests: @@ -9784,207 +10583,428 @@ workflows: - j11_build: requires: - start_pre-commit_tests + upstream: + start_pre-commit_tests: + - success - j11_unit_tests: requires: - j11_build + upstream: + j11_build: + - success - j11_utests_oa: requires: - j11_build + upstream: + j11_build: + - success - j11_utests_latest: requires: - j11_build + upstream: + j11_build: + - success - j11_simulator_dtests: requires: - j11_build + upstream: + j11_build: + - success - j11_jvm_dtests: requires: - j11_build + upstream: + j11_build: + - success - j11_jvm_dtests_latest_vnode: requires: - j11_build + upstream: + j11_build: + - success - j17_jvm_dtests: requires: - j11_build + upstream: + j11_build: + - success - j17_jvm_dtests_latest_vnode: requires: - j11_build + upstream: + j11_build: + - success - j11_cqlshlib_tests: requires: - j11_build + upstream: + j11_build: + - success - j11_cqlshlib_cython_tests: requires: - j11_build + upstream: + j11_build: + - success - j17_cqlshlib_tests: requires: - j11_build + upstream: + j11_build: + - success - j17_cqlshlib_cython_tests: requires: - j11_build + upstream: + j11_build: + - success - j17_unit_tests: requires: - j11_build + upstream: + j11_build: + - success - j17_utests_oa: requires: - j11_build + upstream: + j11_build: + - success - j17_utests_latest: requires: - j11_build + upstream: + j11_build: + - success - start_utests_long: type: approval - j11_utests_long: requires: - start_utests_long - j11_build + upstream: + start_utests_long: + - success + j11_build: + - success - j17_utests_long: requires: - start_utests_long - j11_build + upstream: + start_utests_long: + - success + j11_build: + - success - start_utests_cdc: type: approval - j11_utests_cdc: requires: - start_utests_cdc - j11_build + upstream: + start_utests_cdc: + - success + j11_build: + - success - j17_utests_cdc: requires: - start_utests_cdc - j11_build + upstream: + start_utests_cdc: + - success + j11_build: + - success - start_utests_compression: type: approval - j11_utests_compression: requires: - start_utests_compression - j11_build + upstream: + start_utests_compression: + - success + j11_build: + - success - j17_utests_compression: requires: - start_utests_compression - j11_build + upstream: + start_utests_compression: + - success + j11_build: + - success - start_utests_stress: type: approval - j11_utests_stress: requires: - start_utests_stress - j11_build + upstream: + start_utests_stress: + - success + j11_build: + - success - j17_utests_stress: requires: - start_utests_stress - j11_build + upstream: + start_utests_stress: + - success + j11_build: + - success - start_utests_fqltool: type: approval - j11_utests_fqltool: requires: - start_utests_fqltool - j11_build + upstream: + start_utests_fqltool: + - success + j11_build: + - success - j17_utests_fqltool: requires: - start_utests_fqltool - j11_build + upstream: + start_utests_fqltool: + - success + j11_build: + - success + - start_utests_sstableloader: + type: approval + - j11_utests_sstableloader: + requires: + - start_utests_sstableloader + - j11_build + upstream: + start_utests_sstableloader: + - success + j11_build: + - success + - j17_utests_sstableloader: + requires: + - start_utests_sstableloader + - j11_build + upstream: + start_utests_sstableloader: + - success + j11_build: + - success - start_utests_system_keyspace_directory: type: approval - j11_utests_system_keyspace_directory: requires: - j11_build + upstream: + j11_build: + - success - j17_utests_system_keyspace_directory: requires: - start_utests_system_keyspace_directory - j11_build + upstream: + start_utests_system_keyspace_directory: + - success + j11_build: + - success - start_jvm_upgrade_dtests: type: approval - j11_dtest_jars_build: requires: - j11_build - start_jvm_upgrade_dtests + upstream: + j11_build: + - success + start_jvm_upgrade_dtests: + - success - j11_jvm_upgrade_dtests: requires: - j11_dtest_jars_build + upstream: + j11_dtest_jars_build: + - success - j11_dtests: requires: - j11_build + upstream: + j11_build: + - success - j11_dtests_vnode: requires: - j11_build + upstream: + j11_build: + - success - j11_dtests_latest: requires: - j11_build + upstream: + j11_build: + - success - j17_dtests: requires: - j11_build + upstream: + j11_build: + - success - j17_dtests_vnode: requires: - j11_build + upstream: + j11_build: + - success - j17_dtests_latest: requires: - j11_build + upstream: + j11_build: + - success - start_j11_dtests_large: type: approval - j11_dtests_large: requires: - start_j11_dtests_large - j11_build + upstream: + start_j11_dtests_large: + - success + j11_build: + - success - j11_dtests_large_vnode: requires: - start_j11_dtests_large - j11_build + upstream: + start_j11_dtests_large: + - success + j11_build: + - success - start_j17_dtests_large: type: approval - j17_dtests_large: requires: - start_j17_dtests_large - j11_build + upstream: + start_j17_dtests_large: + - success + j11_build: + - success - j17_dtests_large_vnode: requires: - start_j17_dtests_large - j11_build + upstream: + start_j17_dtests_large: + - success + j11_build: + - success - j11_cqlsh_dtests_py38: requires: - j11_build + upstream: + j11_build: + - success - j11_cqlsh_dtests_py311: requires: - j11_build + upstream: + j11_build: + - success - j11_cqlsh_dtests_py38_vnode: requires: - j11_build + upstream: + j11_build: + - success - j11_cqlsh_dtests_py311_vnode: requires: - j11_build + upstream: + j11_build: + - success - start_j11_cqlsh_dtests_latest: type: approval - j11_cqlsh_dtests_py38_latest: requires: - start_j11_cqlsh_dtests_latest - j11_build + upstream: + start_j11_cqlsh_dtests_latest: + - success + j11_build: + - success - j11_cqlsh_dtests_py311_latest: requires: - start_j11_cqlsh_dtests_latest - j11_build + upstream: + start_j11_cqlsh_dtests_latest: + - success + j11_build: + - success - j17_cqlsh_dtests_py38: requires: - j11_build + upstream: + j11_build: + - success - j17_cqlsh_dtests_py311: requires: - j11_build + upstream: + j11_build: + - success - j17_cqlsh_dtests_py38_vnode: requires: - j11_build + upstream: + j11_build: + - success - j17_cqlsh_dtests_py311_vnode: requires: - j11_build + upstream: + j11_build: + - success - start_j17_cqlsh-dtests-latest: type: approval - j17_cqlsh_dtests_py38_latest: requires: - start_j17_cqlsh-dtests-latest - j11_build + upstream: + start_j17_cqlsh-dtests-latest: + - success + j11_build: + - success - j17_cqlsh_dtests_py311_latest: requires: - start_j17_cqlsh-dtests-latest - j11_build + upstream: + start_j17_cqlsh-dtests-latest: + - success + j11_build: + - success - start_j11_upgrade_dtests: type: approval - j11_upgrade_dtests: requires: - j11_build - start_j11_upgrade_dtests + upstream: + j11_build: + - success + start_j11_upgrade_dtests: + - success java17_separate_tests: jobs: - start_j17_build: @@ -9992,142 +11012,276 @@ workflows: - j17_build: requires: - start_j17_build + upstream: + start_j17_build: + - success - start_j17_unit_tests: type: approval - j17_unit_tests: requires: - start_j17_unit_tests - j17_build + upstream: + start_j17_unit_tests: + - success + j17_build: + - success - start_j17_jvm_dtests: type: approval - j17_jvm_dtests: requires: - start_j17_jvm_dtests - j17_build + upstream: + start_j17_jvm_dtests: + - success + j17_build: + - success - start_j17_jvm_dtests_latest_vnode: type: approval - j17_jvm_dtests_latest_vnode: requires: - start_j17_jvm_dtests_latest_vnode - j17_build + upstream: + start_j17_jvm_dtests_latest_vnode: + - success + j17_build: + - success - start_j17_cqlshlib_tests: type: approval - j17_cqlshlib_tests: requires: - start_j17_cqlshlib_tests - j17_build + upstream: + start_j17_cqlshlib_tests: + - success + j17_build: + - success - start_j17_cqlshlib_cython_tests: type: approval - j17_cqlshlib_cython_tests: requires: - start_j17_cqlshlib_cython_tests - j17_build + upstream: + start_j17_cqlshlib_cython_tests: + - success + j17_build: + - success - start_j17_dtests: type: approval - j17_dtests: requires: - start_j17_dtests - j17_build + upstream: + start_j17_dtests: + - success + j17_build: + - success - start_j17_dtests_vnode: type: approval - j17_dtests_vnode: requires: - start_j17_dtests_vnode - j17_build + upstream: + start_j17_dtests_vnode: + - success + j17_build: + - success - start_j17_dtests_latest: type: approval - j17_dtests_latest: requires: - start_j17_dtests_latest - j17_build + upstream: + start_j17_dtests_latest: + - success + j17_build: + - success - start_j17_dtests_large: type: approval - j17_dtests_large: requires: - start_j17_dtests_large - j17_build + upstream: + start_j17_dtests_large: + - success + j17_build: + - success - start_j17_dtests_large_vnode: type: approval - j17_dtests_large_vnode: requires: - start_j17_dtests_large_vnode - j17_build + upstream: + start_j17_dtests_large_vnode: + - success + j17_build: + - success - start_j17_cqlsh_tests: type: approval - j17_cqlsh_dtests_py38: requires: - start_j17_cqlsh_tests - j17_build + upstream: + start_j17_cqlsh_tests: + - success + j17_build: + - success - j17_cqlsh_dtests_py311: requires: - start_j17_cqlsh_tests - j17_build + upstream: + start_j17_cqlsh_tests: + - success + j17_build: + - success - j17_cqlsh_dtests_py38_vnode: requires: - start_j17_cqlsh_tests - j17_build + upstream: + start_j17_cqlsh_tests: + - success + j17_build: + - success - j17_cqlsh_dtests_py311_vnode: requires: - start_j17_cqlsh_tests - j17_build + upstream: + start_j17_cqlsh_tests: + - success + j17_build: + - success - start_j17_cqlsh-dtests-latest: type: approval - j17_cqlsh_dtests_py38_latest: requires: - start_j17_cqlsh-dtests-latest - j17_build + upstream: + start_j17_cqlsh-dtests-latest: + - success + j17_build: + - success - j17_cqlsh_dtests_py311_latest: requires: - start_j17_cqlsh-dtests-latest - j17_build + upstream: + start_j17_cqlsh-dtests-latest: + - success + j17_build: + - success - start_j17_utests_oa: type: approval - j17_utests_oa: requires: - start_j17_utests_oa - j17_build + upstream: + start_j17_utests_oa: + - success + j17_build: + - success - start_j17_utests_long: type: approval - j17_utests_long: requires: - start_j17_utests_long - j17_build + upstream: + start_j17_utests_long: + - success + j17_build: + - success - start_j17_utests_cdc: type: approval - j17_utests_cdc: requires: - start_j17_utests_cdc - j17_build + upstream: + start_j17_utests_cdc: + - success + j17_build: + - success - start_j17_utests_compression: type: approval - j17_utests_compression: requires: - start_j17_utests_compression - j17_build + upstream: + start_j17_utests_compression: + - success + j17_build: + - success - start_j17_utests_latest: type: approval - j17_utests_latest: requires: - start_j17_utests_latest - j17_build + upstream: + start_j17_utests_latest: + - success + j17_build: + - success - start_j17_utests_stress: type: approval - j17_utests_stress: requires: - start_j17_utests_stress - j17_build + upstream: + start_j17_utests_stress: + - success + j17_build: + - success - start_j17_utests_fqltool: type: approval - j17_utests_fqltool: requires: - start_j17_utests_fqltool - j17_build + upstream: + start_j17_utests_fqltool: + - success + j17_build: + - success + - start_j17_utests_sstableloader: + type: approval + - j17_utests_sstableloader: + requires: + - start_j17_utests_sstableloader + - j17_build + upstream: + start_j17_utests_sstableloader: + - success + j17_build: + - success - start_j17_utests_system_keyspace_directory: type: approval - j17_utests_system_keyspace_directory: requires: - start_j17_utests_system_keyspace_directory - j17_build + upstream: + start_j17_utests_system_keyspace_directory: + - success + j17_build: + - success java17_pre-commit_tests: jobs: - start_pre-commit_tests: @@ -10135,101 +11289,207 @@ workflows: - j17_build: requires: - start_pre-commit_tests + upstream: + start_pre-commit_tests: + - success - j17_unit_tests: requires: - j17_build + upstream: + j17_build: + - success - j17_utests_oa: requires: - j17_build + upstream: + j17_build: + - success - j17_utests_latest: requires: - j17_build + upstream: + j17_build: + - success - j17_jvm_dtests: requires: - j17_build + upstream: + j17_build: + - success - j17_jvm_dtests_latest_vnode: requires: - j17_build + upstream: + j17_build: + - success - j17_cqlshlib_tests: requires: - j17_build + upstream: + j17_build: + - success - j17_cqlshlib_cython_tests: requires: - j17_build + upstream: + j17_build: + - success - j17_dtests: requires: - j17_build + upstream: + j17_build: + - success - j17_dtests_vnode: requires: - j17_build + upstream: + j17_build: + - success - j17_dtests_latest: requires: - j17_build + upstream: + j17_build: + - success - start_j17_dtests_large: type: approval - j17_dtests_large: requires: - start_j17_dtests_large - j17_build + upstream: + start_j17_dtests_large: + - success + j17_build: + - success - j17_dtests_large_vnode: requires: - start_j17_dtests_large - j17_build + upstream: + start_j17_dtests_large: + - success + j17_build: + - success - j17_cqlsh_dtests_py38: requires: - j17_build + upstream: + j17_build: + - success - j17_cqlsh_dtests_py311: requires: - j17_build + upstream: + j17_build: + - success - j17_cqlsh_dtests_py38_vnode: requires: - j17_build + upstream: + j17_build: + - success - j17_cqlsh_dtests_py311_vnode: requires: - j17_build + upstream: + j17_build: + - success - start_j17_cqlsh-dtests-latest: type: approval - j17_cqlsh_dtests_py38_latest: requires: - start_j17_cqlsh-dtests-latest - j17_build + upstream: + start_j17_cqlsh-dtests-latest: + - success + j17_build: + - success - j17_cqlsh_dtests_py311_latest: requires: - start_j17_cqlsh-dtests-latest - j17_build + upstream: + start_j17_cqlsh-dtests-latest: + - success + j17_build: + - success - start_utests_long: type: approval - j17_utests_long: requires: - start_utests_long - j17_build + upstream: + start_utests_long: + - success + j17_build: + - success - start_utests_cdc: type: approval - j17_utests_cdc: requires: - start_utests_cdc - j17_build + upstream: + start_utests_cdc: + - success + j17_build: + - success - start_utests_compression: type: approval - j17_utests_compression: requires: - start_utests_compression - j17_build + upstream: + start_utests_compression: + - success + j17_build: + - success - start_utests_stress: type: approval - j17_utests_stress: requires: - start_utests_stress - j17_build + upstream: + start_utests_stress: + - success + j17_build: + - success - start_utests_fqltool: type: approval - j17_utests_fqltool: requires: - start_utests_fqltool - j17_build + upstream: + start_utests_fqltool: + - success + j17_build: + - success + - start_utests_sstableloader: + type: approval + - j17_utests_sstableloader: + requires: + - start_utests_sstableloader + - j17_build + upstream: + start_utests_sstableloader: + - success + j17_build: + - success - start_utests_system_keyspace_directory: type: approval - j17_utests_system_keyspace_directory: requires: - start_utests_system_keyspace_directory - j17_build + upstream: + start_utests_system_keyspace_directory: + - success + j17_build: + - success diff --git a/.circleci/config.yml.FREE b/.circleci/config.yml.FREE index 864919b8f418..edcbb164c35e 100644 --- a/.circleci/config.yml.FREE +++ b/.circleci/config.yml.FREE @@ -53,7 +53,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_JVM_UPGRADE_DTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_JVM_UPGRADE_DTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_JVM_UPGRADE_DTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-jvm-dtest-some\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-jvm-dtest-some $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_JVM_UPGRADE_DTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_JVM_UPGRADE_DTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_JVM_UPGRADE_DTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-jvm-dtest-some\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-jvm-dtest-some $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -84,6 +84,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -191,6 +193,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -308,6 +312,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -373,6 +379,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -431,7 +439,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_FQLTOOL_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_FQLTOOL_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS_FQLTOOL} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=fqltool-test\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant fqltool-test $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_FQLTOOL_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_FQLTOOL_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS_FQLTOOL} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=fqltool-test\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant fqltool-test $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -462,6 +470,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -595,6 +605,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -660,6 +672,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -778,6 +792,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -886,6 +902,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -1040,6 +1058,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -1172,6 +1192,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -1281,6 +1303,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -1398,6 +1422,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -1457,7 +1483,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_STRESS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_STRESS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS_STRESS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=stress-test-some\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant stress-test-some $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_STRESS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_STRESS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS_STRESS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=stress-test-some\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant stress-test-some $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -1488,6 +1514,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -1606,6 +1634,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -1714,6 +1744,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -1773,7 +1805,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-system-keyspace-directory\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-system-keyspace-directory $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-system-keyspace-directory\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-system-keyspace-directory $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -1804,6 +1836,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -1912,6 +1946,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -2045,6 +2081,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -2199,6 +2237,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -2316,6 +2356,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -2375,7 +2417,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-latest\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-latest $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-latest\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-latest $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -2406,6 +2448,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -2514,6 +2558,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -2622,6 +2668,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -2681,7 +2729,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_STRESS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_STRESS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS_STRESS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=stress-test-some\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant stress-test-some $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_STRESS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_STRESS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS_STRESS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=stress-test-some\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant stress-test-some $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -2712,6 +2760,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -2735,6 +2785,80 @@ jobs: - REPEATED_ANT_TEST_COUNT: 500 - JAVA_HOME: /usr/lib/jvm/java-17-openjdk-amd64 - JDK_HOME: /usr/lib/jvm/java-17-openjdk-amd64 + j11_utests_sstableloader: + docker: + - image: apache/cassandra-testing-ubuntu2004-java11-w-dependencies:latest + resource_class: medium + working_directory: ~/ + shell: /bin/bash -eo pipefail -l + parallelism: 1 + steps: + - attach_workspace: + at: /home/cassandra + - run: + name: Run Unit Tests (sstableloader-test) + command: | + export PATH=$JAVA_HOME/bin:$PATH + time mv ~/cassandra /tmp + cd /tmp/cassandra + if [ -d ~/dtest_jars ]; then + cp ~/dtest_jars/dtest* /tmp/cassandra/build/ + fi + ant sstableloader-test -Dno-build-test=true + no_output_timeout: 15m + - store_test_results: + path: /tmp/cassandra/build/test/output/ + - store_artifacts: + path: /tmp/cassandra/build/test/output + destination: junitxml + - store_artifacts: + path: /tmp/cassandra/build/test/logs + destination: logs + environment: + - ANT_HOME: /usr/share/ant + - LANG: en_US.UTF-8 + - KEEP_TEST_DIR: true + - DEFAULT_DIR: /home/cassandra/cassandra-dtest + - PYTHONIOENCODING: utf-8 + - PYTHONUNBUFFERED: true + - CASS_DRIVER_NO_EXTENSIONS: true + - CASS_DRIVER_NO_CYTHON: true + - CASSANDRA_SKIP_SYNC: true + - DTEST_REPO: https://github.com/apache/cassandra-dtest.git + - DTEST_BRANCH: trunk + - CCM_MAX_HEAP_SIZE: 1024M + - CCM_HEAP_NEWSIZE: 256M + - REPEATED_TESTS_STOP_ON_FAILURE: false + - REPEATED_UTESTS: null + - REPEATED_UTESTS_COUNT: 500 + - REPEATED_UTESTS_FQLTOOL: null + - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 + - REPEATED_UTESTS_LONG: null + - REPEATED_UTESTS_LONG_COUNT: 100 + - REPEATED_UTESTS_STRESS: null + - REPEATED_UTESTS_STRESS_COUNT: 500 + - REPEATED_SIMULATOR_DTESTS: null + - REPEATED_SIMULATOR_DTESTS_COUNT: 500 + - REPEATED_JVM_DTESTS: null + - REPEATED_JVM_DTESTS_COUNT: 500 + - REPEATED_JVM_UPGRADE_DTESTS: null + - REPEATED_JVM_UPGRADE_DTESTS_COUNT: 500 + - REPEATED_DTESTS: null + - REPEATED_DTESTS_COUNT: 500 + - REPEATED_LARGE_DTESTS: null + - REPEATED_LARGE_DTESTS_COUNT: 100 + - REPEATED_UPGRADE_DTESTS: null + - REPEATED_UPGRADE_DTESTS_COUNT: 25 + - REPEATED_ANT_TEST_TARGET: testsome + - REPEATED_ANT_TEST_CLASS: null + - REPEATED_ANT_TEST_METHODS: null + - REPEATED_ANT_TEST_VNODES: false + - REPEATED_ANT_TEST_COUNT: 500 + - JAVA_HOME: /usr/lib/jvm/java-11-openjdk-amd64 + - JDK_HOME: /usr/lib/jvm/java-11-openjdk-amd64 + - CASSANDRA_USE_JDK11: true j11_utests_compression_repeat: docker: - image: apache/cassandra-testing-ubuntu2004-java11-w-dependencies:latest @@ -2770,7 +2894,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-compression\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-compression $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-compression\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-compression $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -2801,6 +2925,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -2919,6 +3045,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -3019,6 +3147,7 @@ jobs: $target == "test-oa" || \ $target == "test-system-keyspace-directory" || \ $target == "fqltool-test" || \ + $target == "sstableloader-test" || \ $target == "long-test" || \ $target == "stress-test" || \ $target == "test-simulator-dtest" ]]; then @@ -3114,6 +3243,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -3199,6 +3330,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -3223,6 +3356,97 @@ jobs: - JAVA_HOME: /usr/lib/jvm/java-11-openjdk-amd64 - JDK_HOME: /usr/lib/jvm/java-11-openjdk-amd64 - CASSANDRA_USE_JDK11: true + j17_utests_sstableloader_repeat: + docker: + - image: apache/cassandra-testing-ubuntu2004-java11:latest + resource_class: medium + working_directory: ~/ + shell: /bin/bash -eo pipefail -l + parallelism: 4 + steps: + - attach_workspace: + at: /home/cassandra + - run: + name: Log Environment Information + command: | + echo '*** id ***' + id + echo '*** cat /proc/cpuinfo ***' + cat /proc/cpuinfo + echo '*** free -m ***' + free -m + echo '*** df -m ***' + df -m + echo '*** ifconfig -a ***' + ifconfig -a + echo '*** uname -a ***' + uname -a + echo '*** mount ***' + mount + echo '*** env ***' + env + echo '*** java ***' + which java + java -version + - run: + name: Repeatedly run new or modifed JUnit tests + no_output_timeout: 15m + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_SSTABLELOADER_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_SSTABLELOADER_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS_SSTABLELOADER} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=sstableloader-test\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant sstableloader-test $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + - store_test_results: + path: /tmp/results/repeated_utests/output + - store_artifacts: + path: /tmp/results/repeated_utests/stdout + destination: stdout + - store_artifacts: + path: /tmp/results/repeated_utests/output + destination: junitxml + - store_artifacts: + path: /tmp/results/repeated_utests/logs + destination: logs + environment: + - ANT_HOME: /usr/share/ant + - LANG: en_US.UTF-8 + - KEEP_TEST_DIR: true + - DEFAULT_DIR: /home/cassandra/cassandra-dtest + - PYTHONIOENCODING: utf-8 + - PYTHONUNBUFFERED: true + - CASS_DRIVER_NO_EXTENSIONS: true + - CASS_DRIVER_NO_CYTHON: true + - CASSANDRA_SKIP_SYNC: true + - DTEST_REPO: https://github.com/apache/cassandra-dtest.git + - DTEST_BRANCH: trunk + - CCM_MAX_HEAP_SIZE: 1024M + - CCM_HEAP_NEWSIZE: 256M + - REPEATED_TESTS_STOP_ON_FAILURE: false + - REPEATED_UTESTS: null + - REPEATED_UTESTS_COUNT: 500 + - REPEATED_UTESTS_FQLTOOL: null + - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 + - REPEATED_UTESTS_LONG: null + - REPEATED_UTESTS_LONG_COUNT: 100 + - REPEATED_UTESTS_STRESS: null + - REPEATED_UTESTS_STRESS_COUNT: 500 + - REPEATED_SIMULATOR_DTESTS: null + - REPEATED_SIMULATOR_DTESTS_COUNT: 500 + - REPEATED_JVM_DTESTS: null + - REPEATED_JVM_DTESTS_COUNT: 500 + - REPEATED_JVM_UPGRADE_DTESTS: null + - REPEATED_JVM_UPGRADE_DTESTS_COUNT: 500 + - REPEATED_DTESTS: null + - REPEATED_DTESTS_COUNT: 500 + - REPEATED_LARGE_DTESTS: null + - REPEATED_LARGE_DTESTS_COUNT: 100 + - REPEATED_UPGRADE_DTESTS: null + - REPEATED_UPGRADE_DTESTS_COUNT: 25 + - REPEATED_ANT_TEST_TARGET: testsome + - REPEATED_ANT_TEST_CLASS: null + - REPEATED_ANT_TEST_METHODS: null + - REPEATED_ANT_TEST_VNODES: false + - REPEATED_ANT_TEST_COUNT: 500 + - JAVA_HOME: /usr/lib/jvm/java-17-openjdk-amd64 + - JDK_HOME: /usr/lib/jvm/java-17-openjdk-amd64 j11_dtests_large_vnode_repeat: docker: - image: apache/cassandra-testing-ubuntu2004-java11-w-dependencies:latest @@ -3332,6 +3556,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -3417,6 +3643,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -3476,7 +3704,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-system-keyspace-directory\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-system-keyspace-directory $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-system-keyspace-directory\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-system-keyspace-directory $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -3507,6 +3735,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -3625,6 +3855,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -3758,6 +3990,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -3876,6 +4110,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -3976,6 +4212,7 @@ jobs: $target == "test-oa" || \ $target == "test-system-keyspace-directory" || \ $target == "fqltool-test" || \ + $target == "sstableloader-test" || \ $target == "long-test" || \ $target == "stress-test" || \ $target == "test-simulator-dtest" ]]; then @@ -4071,6 +4308,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -4177,6 +4416,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -4294,6 +4535,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -4353,7 +4596,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-cdc\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-cdc $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-cdc\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-cdc $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -4384,6 +4627,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -4501,6 +4746,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -4572,6 +4819,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -4631,7 +4880,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_JVM_DTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_JVM_DTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_JVM_DTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=true\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-jvm-dtest-some\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-jvm-dtest-some $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_JVM_DTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_JVM_DTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_JVM_DTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=true\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-jvm-dtest-some\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-jvm-dtest-some $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -4662,6 +4911,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -4721,7 +4972,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_JVM_DTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_JVM_DTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_JVM_DTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=true\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-jvm-dtest-some\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-jvm-dtest-some $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_JVM_DTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_JVM_DTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_JVM_DTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=true\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-jvm-dtest-some\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-jvm-dtest-some $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -4752,6 +5003,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -4855,6 +5108,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -4927,6 +5182,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -4986,7 +5243,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=testsome\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant testsome $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=testsome\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant testsome $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -5017,6 +5274,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -5130,6 +5389,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -5248,6 +5509,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -5320,6 +5583,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -5379,7 +5644,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-cdc\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-cdc $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-cdc\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-cdc $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -5410,6 +5675,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -5517,6 +5784,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -5649,6 +5918,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -5767,6 +6038,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -5885,6 +6158,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -6018,6 +6293,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -6076,7 +6353,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-latest\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-latest $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-latest\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-latest $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -6107,6 +6384,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -6192,6 +6471,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -6325,6 +6606,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -6384,7 +6667,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-compression\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-compression $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-compression\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-compression $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -6415,6 +6698,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -6486,6 +6771,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -6570,6 +6857,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -6678,6 +6967,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -6737,7 +7028,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_SIMULATOR_DTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_SIMULATOR_DTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_SIMULATOR_DTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-simulator-dtest\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-simulator-dtest $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_SIMULATOR_DTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_SIMULATOR_DTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_SIMULATOR_DTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-simulator-dtest\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-simulator-dtest $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -6768,6 +7059,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -6827,7 +7120,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_JVM_DTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_JVM_DTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_JVM_DTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-jvm-dtest-some\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-jvm-dtest-some $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_JVM_DTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_JVM_DTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_JVM_DTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-jvm-dtest-some\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-jvm-dtest-some $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -6858,6 +7151,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -6967,6 +7262,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -7031,6 +7328,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -7139,6 +7438,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -7257,6 +7558,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -7365,6 +7668,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -7473,6 +7778,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -7590,6 +7897,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -7697,6 +8006,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -7814,6 +8125,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -7885,6 +8198,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -7991,6 +8306,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -8050,7 +8367,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-oa\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-oa $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-oa\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-oa $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -8081,6 +8398,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -8166,6 +8485,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -8230,6 +8551,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -8315,6 +8638,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -8424,6 +8749,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -8541,6 +8868,173 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 + - REPEATED_UTESTS_LONG: null + - REPEATED_UTESTS_LONG_COUNT: 100 + - REPEATED_UTESTS_STRESS: null + - REPEATED_UTESTS_STRESS_COUNT: 500 + - REPEATED_SIMULATOR_DTESTS: null + - REPEATED_SIMULATOR_DTESTS_COUNT: 500 + - REPEATED_JVM_DTESTS: null + - REPEATED_JVM_DTESTS_COUNT: 500 + - REPEATED_JVM_UPGRADE_DTESTS: null + - REPEATED_JVM_UPGRADE_DTESTS_COUNT: 500 + - REPEATED_DTESTS: null + - REPEATED_DTESTS_COUNT: 500 + - REPEATED_LARGE_DTESTS: null + - REPEATED_LARGE_DTESTS_COUNT: 100 + - REPEATED_UPGRADE_DTESTS: null + - REPEATED_UPGRADE_DTESTS_COUNT: 25 + - REPEATED_ANT_TEST_TARGET: testsome + - REPEATED_ANT_TEST_CLASS: null + - REPEATED_ANT_TEST_METHODS: null + - REPEATED_ANT_TEST_VNODES: false + - REPEATED_ANT_TEST_COUNT: 500 + - JAVA_HOME: /usr/lib/jvm/java-17-openjdk-amd64 + - JDK_HOME: /usr/lib/jvm/java-17-openjdk-amd64 + j11_utests_sstableloader_repeat: + docker: + - image: apache/cassandra-testing-ubuntu2004-java11-w-dependencies:latest + resource_class: medium + working_directory: ~/ + shell: /bin/bash -eo pipefail -l + parallelism: 4 + steps: + - attach_workspace: + at: /home/cassandra + - run: + name: Log Environment Information + command: | + echo '*** id ***' + id + echo '*** cat /proc/cpuinfo ***' + cat /proc/cpuinfo + echo '*** free -m ***' + free -m + echo '*** df -m ***' + df -m + echo '*** ifconfig -a ***' + ifconfig -a + echo '*** uname -a ***' + uname -a + echo '*** mount ***' + mount + echo '*** env ***' + env + echo '*** java ***' + which java + java -version + - run: + name: Repeatedly run new or modifed JUnit tests + no_output_timeout: 15m + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_SSTABLELOADER_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_SSTABLELOADER_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS_SSTABLELOADER} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=sstableloader-test\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant sstableloader-test $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + - store_test_results: + path: /tmp/results/repeated_utests/output + - store_artifacts: + path: /tmp/results/repeated_utests/stdout + destination: stdout + - store_artifacts: + path: /tmp/results/repeated_utests/output + destination: junitxml + - store_artifacts: + path: /tmp/results/repeated_utests/logs + destination: logs + environment: + - ANT_HOME: /usr/share/ant + - LANG: en_US.UTF-8 + - KEEP_TEST_DIR: true + - DEFAULT_DIR: /home/cassandra/cassandra-dtest + - PYTHONIOENCODING: utf-8 + - PYTHONUNBUFFERED: true + - CASS_DRIVER_NO_EXTENSIONS: true + - CASS_DRIVER_NO_CYTHON: true + - CASSANDRA_SKIP_SYNC: true + - DTEST_REPO: https://github.com/apache/cassandra-dtest.git + - DTEST_BRANCH: trunk + - CCM_MAX_HEAP_SIZE: 1024M + - CCM_HEAP_NEWSIZE: 256M + - REPEATED_TESTS_STOP_ON_FAILURE: false + - REPEATED_UTESTS: null + - REPEATED_UTESTS_COUNT: 500 + - REPEATED_UTESTS_FQLTOOL: null + - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 + - REPEATED_UTESTS_LONG: null + - REPEATED_UTESTS_LONG_COUNT: 100 + - REPEATED_UTESTS_STRESS: null + - REPEATED_UTESTS_STRESS_COUNT: 500 + - REPEATED_SIMULATOR_DTESTS: null + - REPEATED_SIMULATOR_DTESTS_COUNT: 500 + - REPEATED_JVM_DTESTS: null + - REPEATED_JVM_DTESTS_COUNT: 500 + - REPEATED_JVM_UPGRADE_DTESTS: null + - REPEATED_JVM_UPGRADE_DTESTS_COUNT: 500 + - REPEATED_DTESTS: null + - REPEATED_DTESTS_COUNT: 500 + - REPEATED_LARGE_DTESTS: null + - REPEATED_LARGE_DTESTS_COUNT: 100 + - REPEATED_UPGRADE_DTESTS: null + - REPEATED_UPGRADE_DTESTS_COUNT: 25 + - REPEATED_ANT_TEST_TARGET: testsome + - REPEATED_ANT_TEST_CLASS: null + - REPEATED_ANT_TEST_METHODS: null + - REPEATED_ANT_TEST_VNODES: false + - REPEATED_ANT_TEST_COUNT: 500 + - JAVA_HOME: /usr/lib/jvm/java-11-openjdk-amd64 + - JDK_HOME: /usr/lib/jvm/java-11-openjdk-amd64 + - CASSANDRA_USE_JDK11: true + j17_utests_sstableloader: + docker: + - image: apache/cassandra-testing-ubuntu2004-java11:latest + resource_class: medium + working_directory: ~/ + shell: /bin/bash -eo pipefail -l + parallelism: 1 + steps: + - attach_workspace: + at: /home/cassandra + - run: + name: Run Unit Tests (sstableloader-test) + command: | + export PATH=$JAVA_HOME/bin:$PATH + time mv ~/cassandra /tmp + cd /tmp/cassandra + if [ -d ~/dtest_jars ]; then + cp ~/dtest_jars/dtest* /tmp/cassandra/build/ + fi + ant sstableloader-test -Dno-build-test=true + no_output_timeout: 15m + - store_test_results: + path: /tmp/cassandra/build/test/output/ + - store_artifacts: + path: /tmp/cassandra/build/test/output + destination: junitxml + - store_artifacts: + path: /tmp/cassandra/build/test/logs + destination: logs + environment: + - ANT_HOME: /usr/share/ant + - LANG: en_US.UTF-8 + - KEEP_TEST_DIR: true + - DEFAULT_DIR: /home/cassandra/cassandra-dtest + - PYTHONIOENCODING: utf-8 + - PYTHONUNBUFFERED: true + - CASS_DRIVER_NO_EXTENSIONS: true + - CASS_DRIVER_NO_CYTHON: true + - CASSANDRA_SKIP_SYNC: true + - DTEST_REPO: https://github.com/apache/cassandra-dtest.git + - DTEST_BRANCH: trunk + - CCM_MAX_HEAP_SIZE: 1024M + - CCM_HEAP_NEWSIZE: 256M + - REPEATED_TESTS_STOP_ON_FAILURE: false + - REPEATED_UTESTS: null + - REPEATED_UTESTS_COUNT: 500 + - REPEATED_UTESTS_FQLTOOL: null + - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -8599,7 +9093,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_FQLTOOL_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_FQLTOOL_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS_FQLTOOL} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=fqltool-test\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant fqltool-test $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_FQLTOOL_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_FQLTOOL_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS_FQLTOOL} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=fqltool-test\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant fqltool-test $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -8630,6 +9124,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -8714,6 +9210,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -8772,7 +9270,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=testsome\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant testsome $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=testsome\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant testsome $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -8803,6 +9301,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -8875,6 +9375,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -8933,7 +9435,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-oa\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-oa $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-oa\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-oa $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -8964,6 +9466,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -9048,6 +9552,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -9107,7 +9613,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_JVM_DTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_JVM_DTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_JVM_DTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-jvm-dtest-some\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-jvm-dtest-some $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_JVM_DTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_JVM_DTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_JVM_DTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-jvm-dtest-some\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-jvm-dtest-some $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -9138,6 +9644,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -9270,6 +9778,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -9328,7 +9838,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_LONG_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_LONG_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS_LONG} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=long-testsome\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant long-testsome $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_LONG_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_LONG_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS_LONG} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=long-testsome\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant long-testsome $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -9359,6 +9869,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -9417,7 +9929,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_LONG_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_LONG_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS_LONG} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=long-testsome\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant long-testsome $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_LONG_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_LONG_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS_LONG} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=long-testsome\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant long-testsome $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -9448,6 +9960,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -9481,302 +9995,587 @@ workflows: - j11_build: requires: - start_j11_build + upstream: + start_j11_build: + - success - start_j11_unit_tests: type: approval - j11_unit_tests: requires: - start_j11_unit_tests - j11_build + upstream: + start_j11_unit_tests: + - success + j11_build: + - success - start_j11_jvm_dtests: type: approval - j11_jvm_dtests: requires: - start_j11_jvm_dtests - j11_build + upstream: + start_j11_jvm_dtests: + - success + j11_build: + - success - start_j11_jvm_dtests_latest_vnode: type: approval - j11_jvm_dtests_latest_vnode: requires: - start_j11_jvm_dtests_latest_vnode - j11_build + upstream: + start_j11_jvm_dtests_latest_vnode: + - success + j11_build: + - success - start_j17_jvm_dtests: type: approval - j17_jvm_dtests: requires: - start_j17_jvm_dtests - j11_build + upstream: + start_j17_jvm_dtests: + - success + j11_build: + - success - start_j17_jvm_dtests_latest_vnode: type: approval - j17_jvm_dtests_latest_vnode: requires: - start_j17_jvm_dtests_latest_vnode - j11_build + upstream: + start_j17_jvm_dtests_latest_vnode: + - success + j11_build: + - success - start_j11_simulator_dtests: type: approval - j11_simulator_dtests: requires: - start_j11_simulator_dtests - j11_build + upstream: + start_j11_simulator_dtests: + - success + j11_build: + - success - start_j11_cqlshlib_tests: type: approval - j11_cqlshlib_tests: requires: - start_j11_cqlshlib_tests - j11_build + upstream: + start_j11_cqlshlib_tests: + - success + j11_build: + - success - start_j11_cqlshlib_cython_tests: type: approval - j11_cqlshlib_cython_tests: requires: - start_j11_cqlshlib_cython_tests - j11_build + upstream: + start_j11_cqlshlib_cython_tests: + - success + j11_build: + - success - start_j17_cqlshlib_tests: type: approval - j17_cqlshlib_tests: requires: - start_j17_cqlshlib_tests - j11_build + upstream: + start_j17_cqlshlib_tests: + - success + j11_build: + - success - start_j17_cqlshlib_cython_tests: type: approval - j17_cqlshlib_cython_tests: requires: - start_j17_cqlshlib_cython_tests - j11_build + upstream: + start_j17_cqlshlib_cython_tests: + - success + j11_build: + - success - start_j17_unit_tests: type: approval - j17_unit_tests: requires: - start_j17_unit_tests - j11_build + upstream: + start_j17_unit_tests: + - success + j11_build: + - success - start_j11_utests_oa: type: approval - j11_utests_oa: requires: - start_j11_utests_oa - j11_build + upstream: + start_j11_utests_oa: + - success + j11_build: + - success - start_j17_utests_oa: type: approval - j17_utests_oa: requires: - start_j17_utests_oa - j11_build + upstream: + start_j17_utests_oa: + - success + j11_build: + - success - start_j11_utests_long: type: approval - j11_utests_long: requires: - start_j11_utests_long - j11_build + upstream: + start_j11_utests_long: + - success + j11_build: + - success - start_j17_utests_long: type: approval - j17_utests_long: requires: - start_j17_utests_long - j11_build + upstream: + start_j17_utests_long: + - success + j11_build: + - success - start_j11_utests_cdc: type: approval - j11_utests_cdc: requires: - start_j11_utests_cdc - j11_build + upstream: + start_j11_utests_cdc: + - success + j11_build: + - success - start_j17_utests_cdc: type: approval - j17_utests_cdc: requires: - start_j17_utests_cdc - j11_build + upstream: + start_j17_utests_cdc: + - success + j11_build: + - success - start_j11_utests_compression: type: approval - j11_utests_compression: requires: - start_j11_utests_compression - j11_build + upstream: + start_j11_utests_compression: + - success + j11_build: + - success - start_j17_utests_compression: type: approval - j17_utests_compression: requires: - start_j17_utests_compression - j11_build + upstream: + start_j17_utests_compression: + - success + j11_build: + - success - start_j11_utests_latest: type: approval - j11_utests_latest: requires: - start_j11_utests_latest - j11_build + upstream: + start_j11_utests_latest: + - success + j11_build: + - success - start_j17_utests_latest: type: approval - j17_utests_latest: requires: - start_j17_utests_latest - j11_build + upstream: + start_j17_utests_latest: + - success + j11_build: + - success - start_j11_utests_stress: type: approval - j11_utests_stress: requires: - start_j11_utests_stress - j11_build + upstream: + start_j11_utests_stress: + - success + j11_build: + - success - start_j17_utests_stress: type: approval - j17_utests_stress: requires: - start_j17_utests_stress - j11_build + upstream: + start_j17_utests_stress: + - success + j11_build: + - success - start_j11_utests_fqltool: type: approval - j11_utests_fqltool: requires: - start_j11_utests_fqltool - j11_build + upstream: + start_j11_utests_fqltool: + - success + j11_build: + - success - start_j17_utests_fqltool: type: approval - j17_utests_fqltool: requires: - start_j17_utests_fqltool - j11_build + upstream: + start_j17_utests_fqltool: + - success + j11_build: + - success + - start_j11_utests_sstableloader: + type: approval + - j11_utests_sstableloader: + requires: + - start_j11_utests_sstableloader + - j11_build + upstream: + start_j11_utests_sstableloader: + - success + j11_build: + - success + - start_j17_utests_sstableloader: + type: approval + - j17_utests_sstableloader: + requires: + - start_j17_utests_sstableloader + - j11_build + upstream: + start_j17_utests_sstableloader: + - success + j11_build: + - success - start_j11_utests_system_keyspace_directory: type: approval - j11_utests_system_keyspace_directory: requires: - start_j11_utests_system_keyspace_directory - j11_build + upstream: + start_j11_utests_system_keyspace_directory: + - success + j11_build: + - success - start_j17_utests_system_keyspace_directory: type: approval - j17_utests_system_keyspace_directory: requires: - start_j17_utests_system_keyspace_directory - j11_build + upstream: + start_j17_utests_system_keyspace_directory: + - success + j11_build: + - success - start_j11_dtest_jars_build: type: approval - j11_dtest_jars_build: requires: - j11_build - start_j11_dtest_jars_build + upstream: + j11_build: + - success + start_j11_dtest_jars_build: + - success - start_jvm_upgrade_dtests: type: approval - j11_jvm_upgrade_dtests: requires: - start_jvm_upgrade_dtests - j11_dtest_jars_build + upstream: + start_jvm_upgrade_dtests: + - success + j11_dtest_jars_build: + - success - start_j11_dtests: type: approval - j11_dtests: requires: - start_j11_dtests - j11_build + upstream: + start_j11_dtests: + - success + j11_build: + - success - start_j11_dtests_vnode: type: approval - j11_dtests_vnode: requires: - start_j11_dtests_vnode - j11_build + upstream: + start_j11_dtests_vnode: + - success + j11_build: + - success - start_j11_dtests_latest: type: approval - j11_dtests_latest: requires: - start_j11_dtests_latest - j11_build + upstream: + start_j11_dtests_latest: + - success + j11_build: + - success - start_j17_dtests: type: approval - j17_dtests: requires: - start_j17_dtests - j11_build + upstream: + start_j17_dtests: + - success + j11_build: + - success - start_j17_dtests_vnode: type: approval - j17_dtests_vnode: requires: - start_j17_dtests_vnode - j11_build + upstream: + start_j17_dtests_vnode: + - success + j11_build: + - success - start_j17_dtests_latest: type: approval - j17_dtests_latest: requires: - start_j17_dtests_latest - j11_build + upstream: + start_j17_dtests_latest: + - success + j11_build: + - success - start_j11_dtests_large: type: approval - j11_dtests_large: requires: - start_j11_dtests_large - j11_build + upstream: + start_j11_dtests_large: + - success + j11_build: + - success - start_j11_dtests_large_vnode: type: approval - j11_dtests_large_vnode: requires: - start_j11_dtests_large_vnode - j11_build + upstream: + start_j11_dtests_large_vnode: + - success + j11_build: + - success - start_j17_dtests_large: type: approval - j17_dtests_large: requires: - start_j17_dtests_large - j11_build + upstream: + start_j17_dtests_large: + - success + j11_build: + - success - start_j17_dtests_large_vnode: type: approval - j17_dtests_large_vnode: requires: - start_j17_dtests_large_vnode - j11_build + upstream: + start_j17_dtests_large_vnode: + - success + j11_build: + - success - start_j11_cqlsh_tests: type: approval - j11_cqlsh_dtests_py38: requires: - start_j11_cqlsh_tests - j11_build + upstream: + start_j11_cqlsh_tests: + - success + j11_build: + - success - j11_cqlsh_dtests_py311: requires: - start_j11_cqlsh_tests - j11_build + upstream: + start_j11_cqlsh_tests: + - success + j11_build: + - success - j11_cqlsh_dtests_py38_vnode: requires: - start_j11_cqlsh_tests - j11_build + upstream: + start_j11_cqlsh_tests: + - success + j11_build: + - success - j11_cqlsh_dtests_py311_vnode: requires: - start_j11_cqlsh_tests - j11_build + upstream: + start_j11_cqlsh_tests: + - success + j11_build: + - success - start_j11_cqlsh_tests_latest: type: approval - j11_cqlsh_dtests_py38_latest: requires: - start_j11_cqlsh_tests_latest - j11_build + upstream: + start_j11_cqlsh_tests_latest: + - success + j11_build: + - success - j11_cqlsh_dtests_py311_latest: requires: - start_j11_cqlsh_tests_latest - j11_build + upstream: + start_j11_cqlsh_tests_latest: + - success + j11_build: + - success - start_j17_cqlsh_tests: type: approval - j17_cqlsh_dtests_py38: requires: - start_j17_cqlsh_tests - j11_build + upstream: + start_j17_cqlsh_tests: + - success + j11_build: + - success - j17_cqlsh_dtests_py311: requires: - start_j17_cqlsh_tests - j11_build + upstream: + start_j17_cqlsh_tests: + - success + j11_build: + - success - j17_cqlsh_dtests_py38_vnode: requires: - start_j17_cqlsh_tests - j11_build + upstream: + start_j17_cqlsh_tests: + - success + j11_build: + - success - j17_cqlsh_dtests_py311_vnode: requires: - start_j17_cqlsh_tests - j11_build + upstream: + start_j17_cqlsh_tests: + - success + j11_build: + - success - start_j17_cqlsh_tests_latest: type: approval - j17_cqlsh_dtests_py38_latest: requires: - start_j17_cqlsh_tests_latest - j11_build + upstream: + start_j17_cqlsh_tests_latest: + - success + j11_build: + - success - j17_cqlsh_dtests_py311_latest: requires: - start_j17_cqlsh_tests_latest - j11_build + upstream: + start_j17_cqlsh_tests_latest: + - success + j11_build: + - success - start_j11_upgrade_dtests: type: approval - j11_upgrade_dtests: requires: - start_j11_upgrade_dtests - j11_build + upstream: + start_j11_upgrade_dtests: + - success + j11_build: + - success java11_pre-commit_tests: jobs: - start_pre-commit_tests: @@ -9784,207 +10583,428 @@ workflows: - j11_build: requires: - start_pre-commit_tests + upstream: + start_pre-commit_tests: + - success - j11_unit_tests: requires: - j11_build + upstream: + j11_build: + - success - j11_utests_oa: requires: - j11_build + upstream: + j11_build: + - success - j11_utests_latest: requires: - j11_build + upstream: + j11_build: + - success - j11_simulator_dtests: requires: - j11_build + upstream: + j11_build: + - success - j11_jvm_dtests: requires: - j11_build + upstream: + j11_build: + - success - j11_jvm_dtests_latest_vnode: requires: - j11_build + upstream: + j11_build: + - success - j17_jvm_dtests: requires: - j11_build + upstream: + j11_build: + - success - j17_jvm_dtests_latest_vnode: requires: - j11_build + upstream: + j11_build: + - success - j11_cqlshlib_tests: requires: - j11_build + upstream: + j11_build: + - success - j11_cqlshlib_cython_tests: requires: - j11_build + upstream: + j11_build: + - success - j17_cqlshlib_tests: requires: - j11_build + upstream: + j11_build: + - success - j17_cqlshlib_cython_tests: requires: - j11_build + upstream: + j11_build: + - success - j17_unit_tests: requires: - j11_build + upstream: + j11_build: + - success - j17_utests_oa: requires: - j11_build + upstream: + j11_build: + - success - j17_utests_latest: requires: - j11_build + upstream: + j11_build: + - success - start_utests_long: type: approval - j11_utests_long: requires: - start_utests_long - j11_build + upstream: + start_utests_long: + - success + j11_build: + - success - j17_utests_long: requires: - start_utests_long - j11_build + upstream: + start_utests_long: + - success + j11_build: + - success - start_utests_cdc: type: approval - j11_utests_cdc: requires: - start_utests_cdc - j11_build + upstream: + start_utests_cdc: + - success + j11_build: + - success - j17_utests_cdc: requires: - start_utests_cdc - j11_build + upstream: + start_utests_cdc: + - success + j11_build: + - success - start_utests_compression: type: approval - j11_utests_compression: requires: - start_utests_compression - j11_build + upstream: + start_utests_compression: + - success + j11_build: + - success - j17_utests_compression: requires: - start_utests_compression - j11_build + upstream: + start_utests_compression: + - success + j11_build: + - success - start_utests_stress: type: approval - j11_utests_stress: requires: - start_utests_stress - j11_build + upstream: + start_utests_stress: + - success + j11_build: + - success - j17_utests_stress: requires: - start_utests_stress - j11_build + upstream: + start_utests_stress: + - success + j11_build: + - success - start_utests_fqltool: type: approval - j11_utests_fqltool: requires: - start_utests_fqltool - j11_build + upstream: + start_utests_fqltool: + - success + j11_build: + - success - j17_utests_fqltool: requires: - start_utests_fqltool - j11_build + upstream: + start_utests_fqltool: + - success + j11_build: + - success + - start_utests_sstableloader: + type: approval + - j11_utests_sstableloader: + requires: + - start_utests_sstableloader + - j11_build + upstream: + start_utests_sstableloader: + - success + j11_build: + - success + - j17_utests_sstableloader: + requires: + - start_utests_sstableloader + - j11_build + upstream: + start_utests_sstableloader: + - success + j11_build: + - success - start_utests_system_keyspace_directory: type: approval - j11_utests_system_keyspace_directory: requires: - j11_build + upstream: + j11_build: + - success - j17_utests_system_keyspace_directory: requires: - start_utests_system_keyspace_directory - j11_build + upstream: + start_utests_system_keyspace_directory: + - success + j11_build: + - success - start_jvm_upgrade_dtests: type: approval - j11_dtest_jars_build: requires: - j11_build - start_jvm_upgrade_dtests + upstream: + j11_build: + - success + start_jvm_upgrade_dtests: + - success - j11_jvm_upgrade_dtests: requires: - j11_dtest_jars_build + upstream: + j11_dtest_jars_build: + - success - j11_dtests: requires: - j11_build + upstream: + j11_build: + - success - j11_dtests_vnode: requires: - j11_build + upstream: + j11_build: + - success - j11_dtests_latest: requires: - j11_build + upstream: + j11_build: + - success - j17_dtests: requires: - j11_build + upstream: + j11_build: + - success - j17_dtests_vnode: requires: - j11_build + upstream: + j11_build: + - success - j17_dtests_latest: requires: - j11_build + upstream: + j11_build: + - success - start_j11_dtests_large: type: approval - j11_dtests_large: requires: - start_j11_dtests_large - j11_build + upstream: + start_j11_dtests_large: + - success + j11_build: + - success - j11_dtests_large_vnode: requires: - start_j11_dtests_large - j11_build + upstream: + start_j11_dtests_large: + - success + j11_build: + - success - start_j17_dtests_large: type: approval - j17_dtests_large: requires: - start_j17_dtests_large - j11_build + upstream: + start_j17_dtests_large: + - success + j11_build: + - success - j17_dtests_large_vnode: requires: - start_j17_dtests_large - j11_build + upstream: + start_j17_dtests_large: + - success + j11_build: + - success - j11_cqlsh_dtests_py38: requires: - j11_build + upstream: + j11_build: + - success - j11_cqlsh_dtests_py311: requires: - j11_build + upstream: + j11_build: + - success - j11_cqlsh_dtests_py38_vnode: requires: - j11_build + upstream: + j11_build: + - success - j11_cqlsh_dtests_py311_vnode: requires: - j11_build + upstream: + j11_build: + - success - start_j11_cqlsh_dtests_latest: type: approval - j11_cqlsh_dtests_py38_latest: requires: - start_j11_cqlsh_dtests_latest - j11_build + upstream: + start_j11_cqlsh_dtests_latest: + - success + j11_build: + - success - j11_cqlsh_dtests_py311_latest: requires: - start_j11_cqlsh_dtests_latest - j11_build + upstream: + start_j11_cqlsh_dtests_latest: + - success + j11_build: + - success - j17_cqlsh_dtests_py38: requires: - j11_build + upstream: + j11_build: + - success - j17_cqlsh_dtests_py311: requires: - j11_build + upstream: + j11_build: + - success - j17_cqlsh_dtests_py38_vnode: requires: - j11_build + upstream: + j11_build: + - success - j17_cqlsh_dtests_py311_vnode: requires: - j11_build + upstream: + j11_build: + - success - start_j17_cqlsh-dtests-latest: type: approval - j17_cqlsh_dtests_py38_latest: requires: - start_j17_cqlsh-dtests-latest - j11_build + upstream: + start_j17_cqlsh-dtests-latest: + - success + j11_build: + - success - j17_cqlsh_dtests_py311_latest: requires: - start_j17_cqlsh-dtests-latest - j11_build + upstream: + start_j17_cqlsh-dtests-latest: + - success + j11_build: + - success - start_j11_upgrade_dtests: type: approval - j11_upgrade_dtests: requires: - j11_build - start_j11_upgrade_dtests + upstream: + j11_build: + - success + start_j11_upgrade_dtests: + - success java17_separate_tests: jobs: - start_j17_build: @@ -9992,142 +11012,276 @@ workflows: - j17_build: requires: - start_j17_build + upstream: + start_j17_build: + - success - start_j17_unit_tests: type: approval - j17_unit_tests: requires: - start_j17_unit_tests - j17_build + upstream: + start_j17_unit_tests: + - success + j17_build: + - success - start_j17_jvm_dtests: type: approval - j17_jvm_dtests: requires: - start_j17_jvm_dtests - j17_build + upstream: + start_j17_jvm_dtests: + - success + j17_build: + - success - start_j17_jvm_dtests_latest_vnode: type: approval - j17_jvm_dtests_latest_vnode: requires: - start_j17_jvm_dtests_latest_vnode - j17_build + upstream: + start_j17_jvm_dtests_latest_vnode: + - success + j17_build: + - success - start_j17_cqlshlib_tests: type: approval - j17_cqlshlib_tests: requires: - start_j17_cqlshlib_tests - j17_build + upstream: + start_j17_cqlshlib_tests: + - success + j17_build: + - success - start_j17_cqlshlib_cython_tests: type: approval - j17_cqlshlib_cython_tests: requires: - start_j17_cqlshlib_cython_tests - j17_build + upstream: + start_j17_cqlshlib_cython_tests: + - success + j17_build: + - success - start_j17_dtests: type: approval - j17_dtests: requires: - start_j17_dtests - j17_build + upstream: + start_j17_dtests: + - success + j17_build: + - success - start_j17_dtests_vnode: type: approval - j17_dtests_vnode: requires: - start_j17_dtests_vnode - j17_build + upstream: + start_j17_dtests_vnode: + - success + j17_build: + - success - start_j17_dtests_latest: type: approval - j17_dtests_latest: requires: - start_j17_dtests_latest - j17_build + upstream: + start_j17_dtests_latest: + - success + j17_build: + - success - start_j17_dtests_large: type: approval - j17_dtests_large: requires: - start_j17_dtests_large - j17_build + upstream: + start_j17_dtests_large: + - success + j17_build: + - success - start_j17_dtests_large_vnode: type: approval - j17_dtests_large_vnode: requires: - start_j17_dtests_large_vnode - j17_build + upstream: + start_j17_dtests_large_vnode: + - success + j17_build: + - success - start_j17_cqlsh_tests: type: approval - j17_cqlsh_dtests_py38: requires: - start_j17_cqlsh_tests - j17_build + upstream: + start_j17_cqlsh_tests: + - success + j17_build: + - success - j17_cqlsh_dtests_py311: requires: - start_j17_cqlsh_tests - j17_build + upstream: + start_j17_cqlsh_tests: + - success + j17_build: + - success - j17_cqlsh_dtests_py38_vnode: requires: - start_j17_cqlsh_tests - j17_build + upstream: + start_j17_cqlsh_tests: + - success + j17_build: + - success - j17_cqlsh_dtests_py311_vnode: requires: - start_j17_cqlsh_tests - j17_build + upstream: + start_j17_cqlsh_tests: + - success + j17_build: + - success - start_j17_cqlsh-dtests-latest: type: approval - j17_cqlsh_dtests_py38_latest: requires: - start_j17_cqlsh-dtests-latest - j17_build + upstream: + start_j17_cqlsh-dtests-latest: + - success + j17_build: + - success - j17_cqlsh_dtests_py311_latest: requires: - start_j17_cqlsh-dtests-latest - j17_build + upstream: + start_j17_cqlsh-dtests-latest: + - success + j17_build: + - success - start_j17_utests_oa: type: approval - j17_utests_oa: requires: - start_j17_utests_oa - j17_build + upstream: + start_j17_utests_oa: + - success + j17_build: + - success - start_j17_utests_long: type: approval - j17_utests_long: requires: - start_j17_utests_long - j17_build + upstream: + start_j17_utests_long: + - success + j17_build: + - success - start_j17_utests_cdc: type: approval - j17_utests_cdc: requires: - start_j17_utests_cdc - j17_build + upstream: + start_j17_utests_cdc: + - success + j17_build: + - success - start_j17_utests_compression: type: approval - j17_utests_compression: requires: - start_j17_utests_compression - j17_build + upstream: + start_j17_utests_compression: + - success + j17_build: + - success - start_j17_utests_latest: type: approval - j17_utests_latest: requires: - start_j17_utests_latest - j17_build + upstream: + start_j17_utests_latest: + - success + j17_build: + - success - start_j17_utests_stress: type: approval - j17_utests_stress: requires: - start_j17_utests_stress - j17_build + upstream: + start_j17_utests_stress: + - success + j17_build: + - success - start_j17_utests_fqltool: type: approval - j17_utests_fqltool: requires: - start_j17_utests_fqltool - j17_build + upstream: + start_j17_utests_fqltool: + - success + j17_build: + - success + - start_j17_utests_sstableloader: + type: approval + - j17_utests_sstableloader: + requires: + - start_j17_utests_sstableloader + - j17_build + upstream: + start_j17_utests_sstableloader: + - success + j17_build: + - success - start_j17_utests_system_keyspace_directory: type: approval - j17_utests_system_keyspace_directory: requires: - start_j17_utests_system_keyspace_directory - j17_build + upstream: + start_j17_utests_system_keyspace_directory: + - success + j17_build: + - success java17_pre-commit_tests: jobs: - start_pre-commit_tests: @@ -10135,101 +11289,207 @@ workflows: - j17_build: requires: - start_pre-commit_tests + upstream: + start_pre-commit_tests: + - success - j17_unit_tests: requires: - j17_build + upstream: + j17_build: + - success - j17_utests_oa: requires: - j17_build + upstream: + j17_build: + - success - j17_utests_latest: requires: - j17_build + upstream: + j17_build: + - success - j17_jvm_dtests: requires: - j17_build + upstream: + j17_build: + - success - j17_jvm_dtests_latest_vnode: requires: - j17_build + upstream: + j17_build: + - success - j17_cqlshlib_tests: requires: - j17_build + upstream: + j17_build: + - success - j17_cqlshlib_cython_tests: requires: - j17_build + upstream: + j17_build: + - success - j17_dtests: requires: - j17_build + upstream: + j17_build: + - success - j17_dtests_vnode: requires: - j17_build + upstream: + j17_build: + - success - j17_dtests_latest: requires: - j17_build + upstream: + j17_build: + - success - start_j17_dtests_large: type: approval - j17_dtests_large: requires: - start_j17_dtests_large - j17_build + upstream: + start_j17_dtests_large: + - success + j17_build: + - success - j17_dtests_large_vnode: requires: - start_j17_dtests_large - j17_build + upstream: + start_j17_dtests_large: + - success + j17_build: + - success - j17_cqlsh_dtests_py38: requires: - j17_build + upstream: + j17_build: + - success - j17_cqlsh_dtests_py311: requires: - j17_build + upstream: + j17_build: + - success - j17_cqlsh_dtests_py38_vnode: requires: - j17_build + upstream: + j17_build: + - success - j17_cqlsh_dtests_py311_vnode: requires: - j17_build + upstream: + j17_build: + - success - start_j17_cqlsh-dtests-latest: type: approval - j17_cqlsh_dtests_py38_latest: requires: - start_j17_cqlsh-dtests-latest - j17_build + upstream: + start_j17_cqlsh-dtests-latest: + - success + j17_build: + - success - j17_cqlsh_dtests_py311_latest: requires: - start_j17_cqlsh-dtests-latest - j17_build + upstream: + start_j17_cqlsh-dtests-latest: + - success + j17_build: + - success - start_utests_long: type: approval - j17_utests_long: requires: - start_utests_long - j17_build + upstream: + start_utests_long: + - success + j17_build: + - success - start_utests_cdc: type: approval - j17_utests_cdc: requires: - start_utests_cdc - j17_build + upstream: + start_utests_cdc: + - success + j17_build: + - success - start_utests_compression: type: approval - j17_utests_compression: requires: - start_utests_compression - j17_build + upstream: + start_utests_compression: + - success + j17_build: + - success - start_utests_stress: type: approval - j17_utests_stress: requires: - start_utests_stress - j17_build + upstream: + start_utests_stress: + - success + j17_build: + - success - start_utests_fqltool: type: approval - j17_utests_fqltool: requires: - start_utests_fqltool - j17_build + upstream: + start_utests_fqltool: + - success + j17_build: + - success + - start_utests_sstableloader: + type: approval + - j17_utests_sstableloader: + requires: + - start_utests_sstableloader + - j17_build + upstream: + start_utests_sstableloader: + - success + j17_build: + - success - start_utests_system_keyspace_directory: type: approval - j17_utests_system_keyspace_directory: requires: - start_utests_system_keyspace_directory - j17_build + upstream: + start_utests_system_keyspace_directory: + - success + j17_build: + - success diff --git a/.circleci/config.yml.PAID b/.circleci/config.yml.PAID index 02e3aed428a7..e10097a4263b 100644 --- a/.circleci/config.yml.PAID +++ b/.circleci/config.yml.PAID @@ -53,7 +53,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_JVM_UPGRADE_DTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_JVM_UPGRADE_DTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_JVM_UPGRADE_DTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-jvm-dtest-some\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-jvm-dtest-some $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_JVM_UPGRADE_DTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_JVM_UPGRADE_DTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_JVM_UPGRADE_DTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-jvm-dtest-some\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-jvm-dtest-some $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -84,6 +84,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -191,6 +193,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -308,6 +312,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -373,6 +379,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -431,7 +439,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_FQLTOOL_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_FQLTOOL_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS_FQLTOOL} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=fqltool-test\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant fqltool-test $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_FQLTOOL_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_FQLTOOL_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS_FQLTOOL} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=fqltool-test\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant fqltool-test $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -462,6 +470,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -595,6 +605,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -660,6 +672,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -778,6 +792,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -886,6 +902,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -1040,6 +1058,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -1172,6 +1192,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -1281,6 +1303,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -1398,6 +1422,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -1457,7 +1483,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_STRESS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_STRESS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS_STRESS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=stress-test-some\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant stress-test-some $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_STRESS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_STRESS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS_STRESS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=stress-test-some\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant stress-test-some $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -1488,6 +1514,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -1606,6 +1634,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -1714,6 +1744,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -1773,7 +1805,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-system-keyspace-directory\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-system-keyspace-directory $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-system-keyspace-directory\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-system-keyspace-directory $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -1804,6 +1836,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -1912,6 +1946,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -2045,6 +2081,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -2199,6 +2237,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -2316,6 +2356,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -2375,7 +2417,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-latest\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-latest $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-latest\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-latest $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -2406,6 +2448,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -2514,6 +2558,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -2622,6 +2668,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -2681,7 +2729,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_STRESS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_STRESS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS_STRESS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=stress-test-some\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant stress-test-some $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_STRESS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_STRESS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS_STRESS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=stress-test-some\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant stress-test-some $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -2712,6 +2760,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -2735,6 +2785,80 @@ jobs: - REPEATED_ANT_TEST_COUNT: 500 - JAVA_HOME: /usr/lib/jvm/java-17-openjdk-amd64 - JDK_HOME: /usr/lib/jvm/java-17-openjdk-amd64 + j11_utests_sstableloader: + docker: + - image: apache/cassandra-testing-ubuntu2004-java11-w-dependencies:latest + resource_class: medium + working_directory: ~/ + shell: /bin/bash -eo pipefail -l + parallelism: 1 + steps: + - attach_workspace: + at: /home/cassandra + - run: + name: Run Unit Tests (sstableloader-test) + command: | + export PATH=$JAVA_HOME/bin:$PATH + time mv ~/cassandra /tmp + cd /tmp/cassandra + if [ -d ~/dtest_jars ]; then + cp ~/dtest_jars/dtest* /tmp/cassandra/build/ + fi + ant sstableloader-test -Dno-build-test=true + no_output_timeout: 15m + - store_test_results: + path: /tmp/cassandra/build/test/output/ + - store_artifacts: + path: /tmp/cassandra/build/test/output + destination: junitxml + - store_artifacts: + path: /tmp/cassandra/build/test/logs + destination: logs + environment: + - ANT_HOME: /usr/share/ant + - LANG: en_US.UTF-8 + - KEEP_TEST_DIR: true + - DEFAULT_DIR: /home/cassandra/cassandra-dtest + - PYTHONIOENCODING: utf-8 + - PYTHONUNBUFFERED: true + - CASS_DRIVER_NO_EXTENSIONS: true + - CASS_DRIVER_NO_CYTHON: true + - CASSANDRA_SKIP_SYNC: true + - DTEST_REPO: https://github.com/apache/cassandra-dtest.git + - DTEST_BRANCH: trunk + - CCM_MAX_HEAP_SIZE: 1024M + - CCM_HEAP_NEWSIZE: 256M + - REPEATED_TESTS_STOP_ON_FAILURE: false + - REPEATED_UTESTS: null + - REPEATED_UTESTS_COUNT: 500 + - REPEATED_UTESTS_FQLTOOL: null + - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 + - REPEATED_UTESTS_LONG: null + - REPEATED_UTESTS_LONG_COUNT: 100 + - REPEATED_UTESTS_STRESS: null + - REPEATED_UTESTS_STRESS_COUNT: 500 + - REPEATED_SIMULATOR_DTESTS: null + - REPEATED_SIMULATOR_DTESTS_COUNT: 500 + - REPEATED_JVM_DTESTS: null + - REPEATED_JVM_DTESTS_COUNT: 500 + - REPEATED_JVM_UPGRADE_DTESTS: null + - REPEATED_JVM_UPGRADE_DTESTS_COUNT: 500 + - REPEATED_DTESTS: null + - REPEATED_DTESTS_COUNT: 500 + - REPEATED_LARGE_DTESTS: null + - REPEATED_LARGE_DTESTS_COUNT: 100 + - REPEATED_UPGRADE_DTESTS: null + - REPEATED_UPGRADE_DTESTS_COUNT: 25 + - REPEATED_ANT_TEST_TARGET: testsome + - REPEATED_ANT_TEST_CLASS: null + - REPEATED_ANT_TEST_METHODS: null + - REPEATED_ANT_TEST_VNODES: false + - REPEATED_ANT_TEST_COUNT: 500 + - JAVA_HOME: /usr/lib/jvm/java-11-openjdk-amd64 + - JDK_HOME: /usr/lib/jvm/java-11-openjdk-amd64 + - CASSANDRA_USE_JDK11: true j11_utests_compression_repeat: docker: - image: apache/cassandra-testing-ubuntu2004-java11-w-dependencies:latest @@ -2770,7 +2894,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-compression\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-compression $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-compression\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-compression $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -2801,6 +2925,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -2919,6 +3045,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -3019,6 +3147,7 @@ jobs: $target == "test-oa" || \ $target == "test-system-keyspace-directory" || \ $target == "fqltool-test" || \ + $target == "sstableloader-test" || \ $target == "long-test" || \ $target == "stress-test" || \ $target == "test-simulator-dtest" ]]; then @@ -3114,6 +3243,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -3199,6 +3330,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -3223,6 +3356,97 @@ jobs: - JAVA_HOME: /usr/lib/jvm/java-11-openjdk-amd64 - JDK_HOME: /usr/lib/jvm/java-11-openjdk-amd64 - CASSANDRA_USE_JDK11: true + j17_utests_sstableloader_repeat: + docker: + - image: apache/cassandra-testing-ubuntu2004-java11:latest + resource_class: medium + working_directory: ~/ + shell: /bin/bash -eo pipefail -l + parallelism: 25 + steps: + - attach_workspace: + at: /home/cassandra + - run: + name: Log Environment Information + command: | + echo '*** id ***' + id + echo '*** cat /proc/cpuinfo ***' + cat /proc/cpuinfo + echo '*** free -m ***' + free -m + echo '*** df -m ***' + df -m + echo '*** ifconfig -a ***' + ifconfig -a + echo '*** uname -a ***' + uname -a + echo '*** mount ***' + mount + echo '*** env ***' + env + echo '*** java ***' + which java + java -version + - run: + name: Repeatedly run new or modifed JUnit tests + no_output_timeout: 15m + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_SSTABLELOADER_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_SSTABLELOADER_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS_SSTABLELOADER} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=sstableloader-test\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant sstableloader-test $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + - store_test_results: + path: /tmp/results/repeated_utests/output + - store_artifacts: + path: /tmp/results/repeated_utests/stdout + destination: stdout + - store_artifacts: + path: /tmp/results/repeated_utests/output + destination: junitxml + - store_artifacts: + path: /tmp/results/repeated_utests/logs + destination: logs + environment: + - ANT_HOME: /usr/share/ant + - LANG: en_US.UTF-8 + - KEEP_TEST_DIR: true + - DEFAULT_DIR: /home/cassandra/cassandra-dtest + - PYTHONIOENCODING: utf-8 + - PYTHONUNBUFFERED: true + - CASS_DRIVER_NO_EXTENSIONS: true + - CASS_DRIVER_NO_CYTHON: true + - CASSANDRA_SKIP_SYNC: true + - DTEST_REPO: https://github.com/apache/cassandra-dtest.git + - DTEST_BRANCH: trunk + - CCM_MAX_HEAP_SIZE: 1024M + - CCM_HEAP_NEWSIZE: 256M + - REPEATED_TESTS_STOP_ON_FAILURE: false + - REPEATED_UTESTS: null + - REPEATED_UTESTS_COUNT: 500 + - REPEATED_UTESTS_FQLTOOL: null + - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 + - REPEATED_UTESTS_LONG: null + - REPEATED_UTESTS_LONG_COUNT: 100 + - REPEATED_UTESTS_STRESS: null + - REPEATED_UTESTS_STRESS_COUNT: 500 + - REPEATED_SIMULATOR_DTESTS: null + - REPEATED_SIMULATOR_DTESTS_COUNT: 500 + - REPEATED_JVM_DTESTS: null + - REPEATED_JVM_DTESTS_COUNT: 500 + - REPEATED_JVM_UPGRADE_DTESTS: null + - REPEATED_JVM_UPGRADE_DTESTS_COUNT: 500 + - REPEATED_DTESTS: null + - REPEATED_DTESTS_COUNT: 500 + - REPEATED_LARGE_DTESTS: null + - REPEATED_LARGE_DTESTS_COUNT: 100 + - REPEATED_UPGRADE_DTESTS: null + - REPEATED_UPGRADE_DTESTS_COUNT: 25 + - REPEATED_ANT_TEST_TARGET: testsome + - REPEATED_ANT_TEST_CLASS: null + - REPEATED_ANT_TEST_METHODS: null + - REPEATED_ANT_TEST_VNODES: false + - REPEATED_ANT_TEST_COUNT: 500 + - JAVA_HOME: /usr/lib/jvm/java-17-openjdk-amd64 + - JDK_HOME: /usr/lib/jvm/java-17-openjdk-amd64 j11_dtests_large_vnode_repeat: docker: - image: apache/cassandra-testing-ubuntu2004-java11-w-dependencies:latest @@ -3332,6 +3556,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -3417,6 +3643,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -3476,7 +3704,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-system-keyspace-directory\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-system-keyspace-directory $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-system-keyspace-directory\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-system-keyspace-directory $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -3507,6 +3735,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -3625,6 +3855,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -3758,6 +3990,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -3876,6 +4110,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -3976,6 +4212,7 @@ jobs: $target == "test-oa" || \ $target == "test-system-keyspace-directory" || \ $target == "fqltool-test" || \ + $target == "sstableloader-test" || \ $target == "long-test" || \ $target == "stress-test" || \ $target == "test-simulator-dtest" ]]; then @@ -4071,6 +4308,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -4177,6 +4416,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -4294,6 +4535,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -4353,7 +4596,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-cdc\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-cdc $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-cdc\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-cdc $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -4384,6 +4627,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -4501,6 +4746,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -4572,6 +4819,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -4631,7 +4880,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_JVM_DTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_JVM_DTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_JVM_DTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=true\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-jvm-dtest-some\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-jvm-dtest-some $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_JVM_DTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_JVM_DTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_JVM_DTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=true\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-jvm-dtest-some\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-jvm-dtest-some $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -4662,6 +4911,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -4721,7 +4972,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_JVM_DTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_JVM_DTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_JVM_DTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=true\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-jvm-dtest-some\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-jvm-dtest-some $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_JVM_DTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_JVM_DTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_JVM_DTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=true\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-jvm-dtest-some\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-jvm-dtest-some $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -4752,6 +5003,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -4855,6 +5108,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -4927,6 +5182,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -4986,7 +5243,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=testsome\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant testsome $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=testsome\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant testsome $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -5017,6 +5274,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -5130,6 +5389,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -5248,6 +5509,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -5320,6 +5583,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -5379,7 +5644,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-cdc\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-cdc $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-cdc\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-cdc $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -5410,6 +5675,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -5517,6 +5784,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -5649,6 +5918,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -5767,6 +6038,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -5885,6 +6158,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -6018,6 +6293,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -6076,7 +6353,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-latest\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-latest $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-latest\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-latest $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -6107,6 +6384,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -6192,6 +6471,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -6325,6 +6606,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -6384,7 +6667,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-compression\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-compression $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-compression\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-compression $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -6415,6 +6698,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -6486,6 +6771,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -6570,6 +6857,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -6678,6 +6967,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -6737,7 +7028,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_SIMULATOR_DTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_SIMULATOR_DTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_SIMULATOR_DTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-simulator-dtest\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-simulator-dtest $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_SIMULATOR_DTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_SIMULATOR_DTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_SIMULATOR_DTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-simulator-dtest\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-simulator-dtest $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -6768,6 +7059,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -6827,7 +7120,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_JVM_DTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_JVM_DTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_JVM_DTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-jvm-dtest-some\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-jvm-dtest-some $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_JVM_DTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_JVM_DTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_JVM_DTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-jvm-dtest-some\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-jvm-dtest-some $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -6858,6 +7151,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -6967,6 +7262,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -7031,6 +7328,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -7139,6 +7438,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -7257,6 +7558,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -7365,6 +7668,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -7473,6 +7778,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -7590,6 +7897,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -7697,6 +8006,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -7814,6 +8125,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -7885,6 +8198,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -7991,6 +8306,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -8050,7 +8367,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-oa\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-oa $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-oa\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-oa $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -8081,6 +8398,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -8166,6 +8485,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -8230,6 +8551,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -8315,6 +8638,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -8424,6 +8749,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -8541,6 +8868,173 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 + - REPEATED_UTESTS_LONG: null + - REPEATED_UTESTS_LONG_COUNT: 100 + - REPEATED_UTESTS_STRESS: null + - REPEATED_UTESTS_STRESS_COUNT: 500 + - REPEATED_SIMULATOR_DTESTS: null + - REPEATED_SIMULATOR_DTESTS_COUNT: 500 + - REPEATED_JVM_DTESTS: null + - REPEATED_JVM_DTESTS_COUNT: 500 + - REPEATED_JVM_UPGRADE_DTESTS: null + - REPEATED_JVM_UPGRADE_DTESTS_COUNT: 500 + - REPEATED_DTESTS: null + - REPEATED_DTESTS_COUNT: 500 + - REPEATED_LARGE_DTESTS: null + - REPEATED_LARGE_DTESTS_COUNT: 100 + - REPEATED_UPGRADE_DTESTS: null + - REPEATED_UPGRADE_DTESTS_COUNT: 25 + - REPEATED_ANT_TEST_TARGET: testsome + - REPEATED_ANT_TEST_CLASS: null + - REPEATED_ANT_TEST_METHODS: null + - REPEATED_ANT_TEST_VNODES: false + - REPEATED_ANT_TEST_COUNT: 500 + - JAVA_HOME: /usr/lib/jvm/java-17-openjdk-amd64 + - JDK_HOME: /usr/lib/jvm/java-17-openjdk-amd64 + j11_utests_sstableloader_repeat: + docker: + - image: apache/cassandra-testing-ubuntu2004-java11-w-dependencies:latest + resource_class: medium + working_directory: ~/ + shell: /bin/bash -eo pipefail -l + parallelism: 25 + steps: + - attach_workspace: + at: /home/cassandra + - run: + name: Log Environment Information + command: | + echo '*** id ***' + id + echo '*** cat /proc/cpuinfo ***' + cat /proc/cpuinfo + echo '*** free -m ***' + free -m + echo '*** df -m ***' + df -m + echo '*** ifconfig -a ***' + ifconfig -a + echo '*** uname -a ***' + uname -a + echo '*** mount ***' + mount + echo '*** env ***' + env + echo '*** java ***' + which java + java -version + - run: + name: Repeatedly run new or modifed JUnit tests + no_output_timeout: 15m + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_SSTABLELOADER_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_SSTABLELOADER_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS_SSTABLELOADER} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=sstableloader-test\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant sstableloader-test $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + - store_test_results: + path: /tmp/results/repeated_utests/output + - store_artifacts: + path: /tmp/results/repeated_utests/stdout + destination: stdout + - store_artifacts: + path: /tmp/results/repeated_utests/output + destination: junitxml + - store_artifacts: + path: /tmp/results/repeated_utests/logs + destination: logs + environment: + - ANT_HOME: /usr/share/ant + - LANG: en_US.UTF-8 + - KEEP_TEST_DIR: true + - DEFAULT_DIR: /home/cassandra/cassandra-dtest + - PYTHONIOENCODING: utf-8 + - PYTHONUNBUFFERED: true + - CASS_DRIVER_NO_EXTENSIONS: true + - CASS_DRIVER_NO_CYTHON: true + - CASSANDRA_SKIP_SYNC: true + - DTEST_REPO: https://github.com/apache/cassandra-dtest.git + - DTEST_BRANCH: trunk + - CCM_MAX_HEAP_SIZE: 1024M + - CCM_HEAP_NEWSIZE: 256M + - REPEATED_TESTS_STOP_ON_FAILURE: false + - REPEATED_UTESTS: null + - REPEATED_UTESTS_COUNT: 500 + - REPEATED_UTESTS_FQLTOOL: null + - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 + - REPEATED_UTESTS_LONG: null + - REPEATED_UTESTS_LONG_COUNT: 100 + - REPEATED_UTESTS_STRESS: null + - REPEATED_UTESTS_STRESS_COUNT: 500 + - REPEATED_SIMULATOR_DTESTS: null + - REPEATED_SIMULATOR_DTESTS_COUNT: 500 + - REPEATED_JVM_DTESTS: null + - REPEATED_JVM_DTESTS_COUNT: 500 + - REPEATED_JVM_UPGRADE_DTESTS: null + - REPEATED_JVM_UPGRADE_DTESTS_COUNT: 500 + - REPEATED_DTESTS: null + - REPEATED_DTESTS_COUNT: 500 + - REPEATED_LARGE_DTESTS: null + - REPEATED_LARGE_DTESTS_COUNT: 100 + - REPEATED_UPGRADE_DTESTS: null + - REPEATED_UPGRADE_DTESTS_COUNT: 25 + - REPEATED_ANT_TEST_TARGET: testsome + - REPEATED_ANT_TEST_CLASS: null + - REPEATED_ANT_TEST_METHODS: null + - REPEATED_ANT_TEST_VNODES: false + - REPEATED_ANT_TEST_COUNT: 500 + - JAVA_HOME: /usr/lib/jvm/java-11-openjdk-amd64 + - JDK_HOME: /usr/lib/jvm/java-11-openjdk-amd64 + - CASSANDRA_USE_JDK11: true + j17_utests_sstableloader: + docker: + - image: apache/cassandra-testing-ubuntu2004-java11:latest + resource_class: medium + working_directory: ~/ + shell: /bin/bash -eo pipefail -l + parallelism: 1 + steps: + - attach_workspace: + at: /home/cassandra + - run: + name: Run Unit Tests (sstableloader-test) + command: | + export PATH=$JAVA_HOME/bin:$PATH + time mv ~/cassandra /tmp + cd /tmp/cassandra + if [ -d ~/dtest_jars ]; then + cp ~/dtest_jars/dtest* /tmp/cassandra/build/ + fi + ant sstableloader-test -Dno-build-test=true + no_output_timeout: 15m + - store_test_results: + path: /tmp/cassandra/build/test/output/ + - store_artifacts: + path: /tmp/cassandra/build/test/output + destination: junitxml + - store_artifacts: + path: /tmp/cassandra/build/test/logs + destination: logs + environment: + - ANT_HOME: /usr/share/ant + - LANG: en_US.UTF-8 + - KEEP_TEST_DIR: true + - DEFAULT_DIR: /home/cassandra/cassandra-dtest + - PYTHONIOENCODING: utf-8 + - PYTHONUNBUFFERED: true + - CASS_DRIVER_NO_EXTENSIONS: true + - CASS_DRIVER_NO_CYTHON: true + - CASSANDRA_SKIP_SYNC: true + - DTEST_REPO: https://github.com/apache/cassandra-dtest.git + - DTEST_BRANCH: trunk + - CCM_MAX_HEAP_SIZE: 1024M + - CCM_HEAP_NEWSIZE: 256M + - REPEATED_TESTS_STOP_ON_FAILURE: false + - REPEATED_UTESTS: null + - REPEATED_UTESTS_COUNT: 500 + - REPEATED_UTESTS_FQLTOOL: null + - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -8599,7 +9093,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_FQLTOOL_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_FQLTOOL_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS_FQLTOOL} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=fqltool-test\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant fqltool-test $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_FQLTOOL_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_FQLTOOL_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS_FQLTOOL} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=fqltool-test\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant fqltool-test $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -8630,6 +9124,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -8714,6 +9210,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -8772,7 +9270,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=testsome\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant testsome $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=testsome\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant testsome $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -8803,6 +9301,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -8875,6 +9375,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -8933,7 +9435,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-oa\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-oa $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-oa\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-oa $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -8964,6 +9466,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -9048,6 +9552,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -9107,7 +9613,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_JVM_DTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_JVM_DTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_JVM_DTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-jvm-dtest-some\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-jvm-dtest-some $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_JVM_DTESTS_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_JVM_DTESTS_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_JVM_DTESTS} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=test-jvm-dtest-some\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant test-jvm-dtest-some $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -9138,6 +9644,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -9270,6 +9778,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -9328,7 +9838,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_LONG_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_LONG_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS_LONG} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=long-testsome\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant long-testsome $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_LONG_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_LONG_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS_LONG} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=long-testsome\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant long-testsome $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -9359,6 +9869,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -9417,7 +9929,7 @@ jobs: - run: name: Repeatedly run new or modifed JUnit tests no_output_timeout: 15m - command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_LONG_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_LONG_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS_LONG} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=long-testsome\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant long-testsome $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" + command: "set -x\nexport PATH=$JAVA_HOME/bin:$PATH\ntime mv ~/cassandra /tmp\ncd /tmp/cassandra\nif [ -d ~/dtest_jars ]; then\n cp ~/dtest_jars/dtest* /tmp/cassandra/build/\nfi\n\n# Calculate the number of test iterations to be run by the current parallel runner.\ncount=$((${REPEATED_UTESTS_LONG_COUNT} / CIRCLE_NODE_TOTAL))\nif (($CIRCLE_NODE_INDEX < (${REPEATED_UTESTS_LONG_COUNT} % CIRCLE_NODE_TOTAL))); then\n count=$((count+1))\nfi\n\n# Put manually specified tests and automatically detected tests together, removing duplicates\ntests=$(echo ${REPEATED_UTESTS_LONG} | sed -e \"s///\" | sed -e \"s/ //\" | tr \",\" \"\\n\" | tr \" \" \"\\n\" | sort -n | uniq -u)\necho \"Tests to be repeated: ${tests}\"\n\n# Prepare the JVM dtests vnodes argument, which is optional.\nvnodes=false\nvnodes_args=\"\"\nif [ \"$vnodes\" = true ] ; then\n vnodes_args=\"-Dtest.jvm.args='-Dcassandra.dtest.num_tokens=16'\"\nfi\n\n# Prepare the testtag for the target, used by the test macro in build.xml to group the output files\ntarget=long-testsome\ntesttag=\"\"\nif [[ $target == \"test-cdc\" ]]; then\n testtag=\"cdc\"\nelif [[ $target == \"test-compression\" ]]; then\n testtag=\"compression\"\nelif [[ $target == \"test-system-keyspace-directory\" ]]; then\n testtag=\"system_keyspace_directory\"\nelif [[ $target == \"test-latest\" ]]; then\n testtag=\"latest\"\nelif [[ $target == \"test-oa\" ]]; then\n testtag=\"oa\"\nfi\n\n# Run each test class as many times as requested.\nexit_code=\"$?\"\nfor test in $tests; do\n\n # Split class and method names from the test name\n if [[ $test =~ \"#\" ]]; then\n class=${test%\"#\"*}\n method=${test#*\"#\"}\n else\n class=$test\n method=\"\"\n fi\n\n # Prepare the -Dtest.name argument.\n # It can be the fully qualified class name or the short class name, depending on the target.\n if [[ $target == \"test\" || \\\n $target == \"test-cdc\" || \\\n $target == \"test-compression\" || \\\n $target == \"test-latest\" || \\\n $target == \"test-oa\" || \\\n $target == \"test-system-keyspace-directory\" || \\\n $target == \"fqltool-test\" || \\\n $target == \"sstableloader-test\" || \\\n $target == \"long-test\" || \\\n $target == \"stress-test\" || \\\n $target == \"test-simulator-dtest\" ]]; then\n name_arg=\"-Dtest.name=${class##*.}\"\n else\n name_arg=\"-Dtest.name=$class\"\n fi\n\n # Prepare the -Dtest.methods argument, which is optional\n if [[ $method == \"\" ]]; then\n methods_arg=\"\"\n else\n methods_arg=\"-Dtest.methods=$method\"\n fi\n\n for i in $(seq -w 1 $count); do\n echo \"Running test $test, iteration $i of $count\"\n\n # run the test\n status=\"passes\"\n if !( set -o pipefail && \\\n ant long-testsome $name_arg $methods_arg $vnodes_args -Dno-build-test=true | \\\n tee stdout.txt \\\n ); then\n status=\"fails\"\n exit_code=1\n fi\n\n # move the stdout output file\n dest=/tmp/results/repeated_utests/stdout/${status}/${i}\n mkdir -p $dest\n mv stdout.txt $dest/${test}.txt\n\n # move the XML output files\n source=build/test/output/${testtag}\n dest=/tmp/results/repeated_utests/output/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" ]]; then\n mv $source/* $dest/\n fi\n\n # move the log files\n source=\"build/test/logs\"\n dest=/tmp/results/repeated_utests/logs/${status}/${i}\n mkdir -p $dest\n if [[ -d $source && -n \"$(ls $source)\" && -n \"$(ls ${source}/${testtag}*)\" ]]; then\n mv $source/${testtag}*/* $dest/\n fi\n \n # maybe stop iterations on test failure\n if [[ ${REPEATED_TESTS_STOP_ON_FAILURE} = true ]] && (( $exit_code > 0 )); then\n break\n fi\n done\ndone\n(exit ${exit_code})\n" - store_test_results: path: /tmp/results/repeated_utests/output - store_artifacts: @@ -9448,6 +9960,8 @@ jobs: - REPEATED_UTESTS_COUNT: 500 - REPEATED_UTESTS_FQLTOOL: null - REPEATED_UTESTS_FQLTOOL_COUNT: 500 + - REPEATED_UTESTS_SSTABLELOADER: null + - REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 - REPEATED_UTESTS_LONG: null - REPEATED_UTESTS_LONG_COUNT: 100 - REPEATED_UTESTS_STRESS: null @@ -9481,302 +9995,587 @@ workflows: - j11_build: requires: - start_j11_build + upstream: + start_j11_build: + - success - start_j11_unit_tests: type: approval - j11_unit_tests: requires: - start_j11_unit_tests - j11_build + upstream: + start_j11_unit_tests: + - success + j11_build: + - success - start_j11_jvm_dtests: type: approval - j11_jvm_dtests: requires: - start_j11_jvm_dtests - j11_build + upstream: + start_j11_jvm_dtests: + - success + j11_build: + - success - start_j11_jvm_dtests_latest_vnode: type: approval - j11_jvm_dtests_latest_vnode: requires: - start_j11_jvm_dtests_latest_vnode - j11_build + upstream: + start_j11_jvm_dtests_latest_vnode: + - success + j11_build: + - success - start_j17_jvm_dtests: type: approval - j17_jvm_dtests: requires: - start_j17_jvm_dtests - j11_build + upstream: + start_j17_jvm_dtests: + - success + j11_build: + - success - start_j17_jvm_dtests_latest_vnode: type: approval - j17_jvm_dtests_latest_vnode: requires: - start_j17_jvm_dtests_latest_vnode - j11_build + upstream: + start_j17_jvm_dtests_latest_vnode: + - success + j11_build: + - success - start_j11_simulator_dtests: type: approval - j11_simulator_dtests: requires: - start_j11_simulator_dtests - j11_build + upstream: + start_j11_simulator_dtests: + - success + j11_build: + - success - start_j11_cqlshlib_tests: type: approval - j11_cqlshlib_tests: requires: - start_j11_cqlshlib_tests - j11_build + upstream: + start_j11_cqlshlib_tests: + - success + j11_build: + - success - start_j11_cqlshlib_cython_tests: type: approval - j11_cqlshlib_cython_tests: requires: - start_j11_cqlshlib_cython_tests - j11_build + upstream: + start_j11_cqlshlib_cython_tests: + - success + j11_build: + - success - start_j17_cqlshlib_tests: type: approval - j17_cqlshlib_tests: requires: - start_j17_cqlshlib_tests - j11_build + upstream: + start_j17_cqlshlib_tests: + - success + j11_build: + - success - start_j17_cqlshlib_cython_tests: type: approval - j17_cqlshlib_cython_tests: requires: - start_j17_cqlshlib_cython_tests - j11_build + upstream: + start_j17_cqlshlib_cython_tests: + - success + j11_build: + - success - start_j17_unit_tests: type: approval - j17_unit_tests: requires: - start_j17_unit_tests - j11_build + upstream: + start_j17_unit_tests: + - success + j11_build: + - success - start_j11_utests_oa: type: approval - j11_utests_oa: requires: - start_j11_utests_oa - j11_build + upstream: + start_j11_utests_oa: + - success + j11_build: + - success - start_j17_utests_oa: type: approval - j17_utests_oa: requires: - start_j17_utests_oa - j11_build + upstream: + start_j17_utests_oa: + - success + j11_build: + - success - start_j11_utests_long: type: approval - j11_utests_long: requires: - start_j11_utests_long - j11_build + upstream: + start_j11_utests_long: + - success + j11_build: + - success - start_j17_utests_long: type: approval - j17_utests_long: requires: - start_j17_utests_long - j11_build + upstream: + start_j17_utests_long: + - success + j11_build: + - success - start_j11_utests_cdc: type: approval - j11_utests_cdc: requires: - start_j11_utests_cdc - j11_build + upstream: + start_j11_utests_cdc: + - success + j11_build: + - success - start_j17_utests_cdc: type: approval - j17_utests_cdc: requires: - start_j17_utests_cdc - j11_build + upstream: + start_j17_utests_cdc: + - success + j11_build: + - success - start_j11_utests_compression: type: approval - j11_utests_compression: requires: - start_j11_utests_compression - j11_build + upstream: + start_j11_utests_compression: + - success + j11_build: + - success - start_j17_utests_compression: type: approval - j17_utests_compression: requires: - start_j17_utests_compression - j11_build + upstream: + start_j17_utests_compression: + - success + j11_build: + - success - start_j11_utests_latest: type: approval - j11_utests_latest: requires: - start_j11_utests_latest - j11_build + upstream: + start_j11_utests_latest: + - success + j11_build: + - success - start_j17_utests_latest: type: approval - j17_utests_latest: requires: - start_j17_utests_latest - j11_build + upstream: + start_j17_utests_latest: + - success + j11_build: + - success - start_j11_utests_stress: type: approval - j11_utests_stress: requires: - start_j11_utests_stress - j11_build + upstream: + start_j11_utests_stress: + - success + j11_build: + - success - start_j17_utests_stress: type: approval - j17_utests_stress: requires: - start_j17_utests_stress - j11_build + upstream: + start_j17_utests_stress: + - success + j11_build: + - success - start_j11_utests_fqltool: type: approval - j11_utests_fqltool: requires: - start_j11_utests_fqltool - j11_build + upstream: + start_j11_utests_fqltool: + - success + j11_build: + - success - start_j17_utests_fqltool: type: approval - j17_utests_fqltool: requires: - start_j17_utests_fqltool - j11_build + upstream: + start_j17_utests_fqltool: + - success + j11_build: + - success + - start_j11_utests_sstableloader: + type: approval + - j11_utests_sstableloader: + requires: + - start_j11_utests_sstableloader + - j11_build + upstream: + start_j11_utests_sstableloader: + - success + j11_build: + - success + - start_j17_utests_sstableloader: + type: approval + - j17_utests_sstableloader: + requires: + - start_j17_utests_sstableloader + - j11_build + upstream: + start_j17_utests_sstableloader: + - success + j11_build: + - success - start_j11_utests_system_keyspace_directory: type: approval - j11_utests_system_keyspace_directory: requires: - start_j11_utests_system_keyspace_directory - j11_build + upstream: + start_j11_utests_system_keyspace_directory: + - success + j11_build: + - success - start_j17_utests_system_keyspace_directory: type: approval - j17_utests_system_keyspace_directory: requires: - start_j17_utests_system_keyspace_directory - j11_build + upstream: + start_j17_utests_system_keyspace_directory: + - success + j11_build: + - success - start_j11_dtest_jars_build: type: approval - j11_dtest_jars_build: requires: - j11_build - start_j11_dtest_jars_build + upstream: + j11_build: + - success + start_j11_dtest_jars_build: + - success - start_jvm_upgrade_dtests: type: approval - j11_jvm_upgrade_dtests: requires: - start_jvm_upgrade_dtests - j11_dtest_jars_build + upstream: + start_jvm_upgrade_dtests: + - success + j11_dtest_jars_build: + - success - start_j11_dtests: type: approval - j11_dtests: requires: - start_j11_dtests - j11_build + upstream: + start_j11_dtests: + - success + j11_build: + - success - start_j11_dtests_vnode: type: approval - j11_dtests_vnode: requires: - start_j11_dtests_vnode - j11_build + upstream: + start_j11_dtests_vnode: + - success + j11_build: + - success - start_j11_dtests_latest: type: approval - j11_dtests_latest: requires: - start_j11_dtests_latest - j11_build + upstream: + start_j11_dtests_latest: + - success + j11_build: + - success - start_j17_dtests: type: approval - j17_dtests: requires: - start_j17_dtests - j11_build + upstream: + start_j17_dtests: + - success + j11_build: + - success - start_j17_dtests_vnode: type: approval - j17_dtests_vnode: requires: - start_j17_dtests_vnode - j11_build + upstream: + start_j17_dtests_vnode: + - success + j11_build: + - success - start_j17_dtests_latest: type: approval - j17_dtests_latest: requires: - start_j17_dtests_latest - j11_build + upstream: + start_j17_dtests_latest: + - success + j11_build: + - success - start_j11_dtests_large: type: approval - j11_dtests_large: requires: - start_j11_dtests_large - j11_build + upstream: + start_j11_dtests_large: + - success + j11_build: + - success - start_j11_dtests_large_vnode: type: approval - j11_dtests_large_vnode: requires: - start_j11_dtests_large_vnode - j11_build + upstream: + start_j11_dtests_large_vnode: + - success + j11_build: + - success - start_j17_dtests_large: type: approval - j17_dtests_large: requires: - start_j17_dtests_large - j11_build + upstream: + start_j17_dtests_large: + - success + j11_build: + - success - start_j17_dtests_large_vnode: type: approval - j17_dtests_large_vnode: requires: - start_j17_dtests_large_vnode - j11_build + upstream: + start_j17_dtests_large_vnode: + - success + j11_build: + - success - start_j11_cqlsh_tests: type: approval - j11_cqlsh_dtests_py38: requires: - start_j11_cqlsh_tests - j11_build + upstream: + start_j11_cqlsh_tests: + - success + j11_build: + - success - j11_cqlsh_dtests_py311: requires: - start_j11_cqlsh_tests - j11_build + upstream: + start_j11_cqlsh_tests: + - success + j11_build: + - success - j11_cqlsh_dtests_py38_vnode: requires: - start_j11_cqlsh_tests - j11_build + upstream: + start_j11_cqlsh_tests: + - success + j11_build: + - success - j11_cqlsh_dtests_py311_vnode: requires: - start_j11_cqlsh_tests - j11_build + upstream: + start_j11_cqlsh_tests: + - success + j11_build: + - success - start_j11_cqlsh_tests_latest: type: approval - j11_cqlsh_dtests_py38_latest: requires: - start_j11_cqlsh_tests_latest - j11_build + upstream: + start_j11_cqlsh_tests_latest: + - success + j11_build: + - success - j11_cqlsh_dtests_py311_latest: requires: - start_j11_cqlsh_tests_latest - j11_build + upstream: + start_j11_cqlsh_tests_latest: + - success + j11_build: + - success - start_j17_cqlsh_tests: type: approval - j17_cqlsh_dtests_py38: requires: - start_j17_cqlsh_tests - j11_build + upstream: + start_j17_cqlsh_tests: + - success + j11_build: + - success - j17_cqlsh_dtests_py311: requires: - start_j17_cqlsh_tests - j11_build + upstream: + start_j17_cqlsh_tests: + - success + j11_build: + - success - j17_cqlsh_dtests_py38_vnode: requires: - start_j17_cqlsh_tests - j11_build + upstream: + start_j17_cqlsh_tests: + - success + j11_build: + - success - j17_cqlsh_dtests_py311_vnode: requires: - start_j17_cqlsh_tests - j11_build + upstream: + start_j17_cqlsh_tests: + - success + j11_build: + - success - start_j17_cqlsh_tests_latest: type: approval - j17_cqlsh_dtests_py38_latest: requires: - start_j17_cqlsh_tests_latest - j11_build + upstream: + start_j17_cqlsh_tests_latest: + - success + j11_build: + - success - j17_cqlsh_dtests_py311_latest: requires: - start_j17_cqlsh_tests_latest - j11_build + upstream: + start_j17_cqlsh_tests_latest: + - success + j11_build: + - success - start_j11_upgrade_dtests: type: approval - j11_upgrade_dtests: requires: - start_j11_upgrade_dtests - j11_build + upstream: + start_j11_upgrade_dtests: + - success + j11_build: + - success java11_pre-commit_tests: jobs: - start_pre-commit_tests: @@ -9784,207 +10583,428 @@ workflows: - j11_build: requires: - start_pre-commit_tests + upstream: + start_pre-commit_tests: + - success - j11_unit_tests: requires: - j11_build + upstream: + j11_build: + - success - j11_utests_oa: requires: - j11_build + upstream: + j11_build: + - success - j11_utests_latest: requires: - j11_build + upstream: + j11_build: + - success - j11_simulator_dtests: requires: - j11_build + upstream: + j11_build: + - success - j11_jvm_dtests: requires: - j11_build + upstream: + j11_build: + - success - j11_jvm_dtests_latest_vnode: requires: - j11_build + upstream: + j11_build: + - success - j17_jvm_dtests: requires: - j11_build + upstream: + j11_build: + - success - j17_jvm_dtests_latest_vnode: requires: - j11_build + upstream: + j11_build: + - success - j11_cqlshlib_tests: requires: - j11_build + upstream: + j11_build: + - success - j11_cqlshlib_cython_tests: requires: - j11_build + upstream: + j11_build: + - success - j17_cqlshlib_tests: requires: - j11_build + upstream: + j11_build: + - success - j17_cqlshlib_cython_tests: requires: - j11_build + upstream: + j11_build: + - success - j17_unit_tests: requires: - j11_build + upstream: + j11_build: + - success - j17_utests_oa: requires: - j11_build + upstream: + j11_build: + - success - j17_utests_latest: requires: - j11_build + upstream: + j11_build: + - success - start_utests_long: type: approval - j11_utests_long: requires: - start_utests_long - j11_build + upstream: + start_utests_long: + - success + j11_build: + - success - j17_utests_long: requires: - start_utests_long - j11_build + upstream: + start_utests_long: + - success + j11_build: + - success - start_utests_cdc: type: approval - j11_utests_cdc: requires: - start_utests_cdc - j11_build + upstream: + start_utests_cdc: + - success + j11_build: + - success - j17_utests_cdc: requires: - start_utests_cdc - j11_build + upstream: + start_utests_cdc: + - success + j11_build: + - success - start_utests_compression: type: approval - j11_utests_compression: requires: - start_utests_compression - j11_build + upstream: + start_utests_compression: + - success + j11_build: + - success - j17_utests_compression: requires: - start_utests_compression - j11_build + upstream: + start_utests_compression: + - success + j11_build: + - success - start_utests_stress: type: approval - j11_utests_stress: requires: - start_utests_stress - j11_build + upstream: + start_utests_stress: + - success + j11_build: + - success - j17_utests_stress: requires: - start_utests_stress - j11_build + upstream: + start_utests_stress: + - success + j11_build: + - success - start_utests_fqltool: type: approval - j11_utests_fqltool: requires: - start_utests_fqltool - j11_build + upstream: + start_utests_fqltool: + - success + j11_build: + - success - j17_utests_fqltool: requires: - start_utests_fqltool - j11_build + upstream: + start_utests_fqltool: + - success + j11_build: + - success + - start_utests_sstableloader: + type: approval + - j11_utests_sstableloader: + requires: + - start_utests_sstableloader + - j11_build + upstream: + start_utests_sstableloader: + - success + j11_build: + - success + - j17_utests_sstableloader: + requires: + - start_utests_sstableloader + - j11_build + upstream: + start_utests_sstableloader: + - success + j11_build: + - success - start_utests_system_keyspace_directory: type: approval - j11_utests_system_keyspace_directory: requires: - j11_build + upstream: + j11_build: + - success - j17_utests_system_keyspace_directory: requires: - start_utests_system_keyspace_directory - j11_build + upstream: + start_utests_system_keyspace_directory: + - success + j11_build: + - success - start_jvm_upgrade_dtests: type: approval - j11_dtest_jars_build: requires: - j11_build - start_jvm_upgrade_dtests + upstream: + j11_build: + - success + start_jvm_upgrade_dtests: + - success - j11_jvm_upgrade_dtests: requires: - j11_dtest_jars_build + upstream: + j11_dtest_jars_build: + - success - j11_dtests: requires: - j11_build + upstream: + j11_build: + - success - j11_dtests_vnode: requires: - j11_build + upstream: + j11_build: + - success - j11_dtests_latest: requires: - j11_build + upstream: + j11_build: + - success - j17_dtests: requires: - j11_build + upstream: + j11_build: + - success - j17_dtests_vnode: requires: - j11_build + upstream: + j11_build: + - success - j17_dtests_latest: requires: - j11_build + upstream: + j11_build: + - success - start_j11_dtests_large: type: approval - j11_dtests_large: requires: - start_j11_dtests_large - j11_build + upstream: + start_j11_dtests_large: + - success + j11_build: + - success - j11_dtests_large_vnode: requires: - start_j11_dtests_large - j11_build + upstream: + start_j11_dtests_large: + - success + j11_build: + - success - start_j17_dtests_large: type: approval - j17_dtests_large: requires: - start_j17_dtests_large - j11_build + upstream: + start_j17_dtests_large: + - success + j11_build: + - success - j17_dtests_large_vnode: requires: - start_j17_dtests_large - j11_build + upstream: + start_j17_dtests_large: + - success + j11_build: + - success - j11_cqlsh_dtests_py38: requires: - j11_build + upstream: + j11_build: + - success - j11_cqlsh_dtests_py311: requires: - j11_build + upstream: + j11_build: + - success - j11_cqlsh_dtests_py38_vnode: requires: - j11_build + upstream: + j11_build: + - success - j11_cqlsh_dtests_py311_vnode: requires: - j11_build + upstream: + j11_build: + - success - start_j11_cqlsh_dtests_latest: type: approval - j11_cqlsh_dtests_py38_latest: requires: - start_j11_cqlsh_dtests_latest - j11_build + upstream: + start_j11_cqlsh_dtests_latest: + - success + j11_build: + - success - j11_cqlsh_dtests_py311_latest: requires: - start_j11_cqlsh_dtests_latest - j11_build + upstream: + start_j11_cqlsh_dtests_latest: + - success + j11_build: + - success - j17_cqlsh_dtests_py38: requires: - j11_build + upstream: + j11_build: + - success - j17_cqlsh_dtests_py311: requires: - j11_build + upstream: + j11_build: + - success - j17_cqlsh_dtests_py38_vnode: requires: - j11_build + upstream: + j11_build: + - success - j17_cqlsh_dtests_py311_vnode: requires: - j11_build + upstream: + j11_build: + - success - start_j17_cqlsh-dtests-latest: type: approval - j17_cqlsh_dtests_py38_latest: requires: - start_j17_cqlsh-dtests-latest - j11_build + upstream: + start_j17_cqlsh-dtests-latest: + - success + j11_build: + - success - j17_cqlsh_dtests_py311_latest: requires: - start_j17_cqlsh-dtests-latest - j11_build + upstream: + start_j17_cqlsh-dtests-latest: + - success + j11_build: + - success - start_j11_upgrade_dtests: type: approval - j11_upgrade_dtests: requires: - j11_build - start_j11_upgrade_dtests + upstream: + j11_build: + - success + start_j11_upgrade_dtests: + - success java17_separate_tests: jobs: - start_j17_build: @@ -9992,142 +11012,276 @@ workflows: - j17_build: requires: - start_j17_build + upstream: + start_j17_build: + - success - start_j17_unit_tests: type: approval - j17_unit_tests: requires: - start_j17_unit_tests - j17_build + upstream: + start_j17_unit_tests: + - success + j17_build: + - success - start_j17_jvm_dtests: type: approval - j17_jvm_dtests: requires: - start_j17_jvm_dtests - j17_build + upstream: + start_j17_jvm_dtests: + - success + j17_build: + - success - start_j17_jvm_dtests_latest_vnode: type: approval - j17_jvm_dtests_latest_vnode: requires: - start_j17_jvm_dtests_latest_vnode - j17_build + upstream: + start_j17_jvm_dtests_latest_vnode: + - success + j17_build: + - success - start_j17_cqlshlib_tests: type: approval - j17_cqlshlib_tests: requires: - start_j17_cqlshlib_tests - j17_build + upstream: + start_j17_cqlshlib_tests: + - success + j17_build: + - success - start_j17_cqlshlib_cython_tests: type: approval - j17_cqlshlib_cython_tests: requires: - start_j17_cqlshlib_cython_tests - j17_build + upstream: + start_j17_cqlshlib_cython_tests: + - success + j17_build: + - success - start_j17_dtests: type: approval - j17_dtests: requires: - start_j17_dtests - j17_build + upstream: + start_j17_dtests: + - success + j17_build: + - success - start_j17_dtests_vnode: type: approval - j17_dtests_vnode: requires: - start_j17_dtests_vnode - j17_build + upstream: + start_j17_dtests_vnode: + - success + j17_build: + - success - start_j17_dtests_latest: type: approval - j17_dtests_latest: requires: - start_j17_dtests_latest - j17_build + upstream: + start_j17_dtests_latest: + - success + j17_build: + - success - start_j17_dtests_large: type: approval - j17_dtests_large: requires: - start_j17_dtests_large - j17_build + upstream: + start_j17_dtests_large: + - success + j17_build: + - success - start_j17_dtests_large_vnode: type: approval - j17_dtests_large_vnode: requires: - start_j17_dtests_large_vnode - j17_build + upstream: + start_j17_dtests_large_vnode: + - success + j17_build: + - success - start_j17_cqlsh_tests: type: approval - j17_cqlsh_dtests_py38: requires: - start_j17_cqlsh_tests - j17_build + upstream: + start_j17_cqlsh_tests: + - success + j17_build: + - success - j17_cqlsh_dtests_py311: requires: - start_j17_cqlsh_tests - j17_build + upstream: + start_j17_cqlsh_tests: + - success + j17_build: + - success - j17_cqlsh_dtests_py38_vnode: requires: - start_j17_cqlsh_tests - j17_build + upstream: + start_j17_cqlsh_tests: + - success + j17_build: + - success - j17_cqlsh_dtests_py311_vnode: requires: - start_j17_cqlsh_tests - j17_build + upstream: + start_j17_cqlsh_tests: + - success + j17_build: + - success - start_j17_cqlsh-dtests-latest: type: approval - j17_cqlsh_dtests_py38_latest: requires: - start_j17_cqlsh-dtests-latest - j17_build + upstream: + start_j17_cqlsh-dtests-latest: + - success + j17_build: + - success - j17_cqlsh_dtests_py311_latest: requires: - start_j17_cqlsh-dtests-latest - j17_build + upstream: + start_j17_cqlsh-dtests-latest: + - success + j17_build: + - success - start_j17_utests_oa: type: approval - j17_utests_oa: requires: - start_j17_utests_oa - j17_build + upstream: + start_j17_utests_oa: + - success + j17_build: + - success - start_j17_utests_long: type: approval - j17_utests_long: requires: - start_j17_utests_long - j17_build + upstream: + start_j17_utests_long: + - success + j17_build: + - success - start_j17_utests_cdc: type: approval - j17_utests_cdc: requires: - start_j17_utests_cdc - j17_build + upstream: + start_j17_utests_cdc: + - success + j17_build: + - success - start_j17_utests_compression: type: approval - j17_utests_compression: requires: - start_j17_utests_compression - j17_build + upstream: + start_j17_utests_compression: + - success + j17_build: + - success - start_j17_utests_latest: type: approval - j17_utests_latest: requires: - start_j17_utests_latest - j17_build + upstream: + start_j17_utests_latest: + - success + j17_build: + - success - start_j17_utests_stress: type: approval - j17_utests_stress: requires: - start_j17_utests_stress - j17_build + upstream: + start_j17_utests_stress: + - success + j17_build: + - success - start_j17_utests_fqltool: type: approval - j17_utests_fqltool: requires: - start_j17_utests_fqltool - j17_build + upstream: + start_j17_utests_fqltool: + - success + j17_build: + - success + - start_j17_utests_sstableloader: + type: approval + - j17_utests_sstableloader: + requires: + - start_j17_utests_sstableloader + - j17_build + upstream: + start_j17_utests_sstableloader: + - success + j17_build: + - success - start_j17_utests_system_keyspace_directory: type: approval - j17_utests_system_keyspace_directory: requires: - start_j17_utests_system_keyspace_directory - j17_build + upstream: + start_j17_utests_system_keyspace_directory: + - success + j17_build: + - success java17_pre-commit_tests: jobs: - start_pre-commit_tests: @@ -10135,101 +11289,207 @@ workflows: - j17_build: requires: - start_pre-commit_tests + upstream: + start_pre-commit_tests: + - success - j17_unit_tests: requires: - j17_build + upstream: + j17_build: + - success - j17_utests_oa: requires: - j17_build + upstream: + j17_build: + - success - j17_utests_latest: requires: - j17_build + upstream: + j17_build: + - success - j17_jvm_dtests: requires: - j17_build + upstream: + j17_build: + - success - j17_jvm_dtests_latest_vnode: requires: - j17_build + upstream: + j17_build: + - success - j17_cqlshlib_tests: requires: - j17_build + upstream: + j17_build: + - success - j17_cqlshlib_cython_tests: requires: - j17_build + upstream: + j17_build: + - success - j17_dtests: requires: - j17_build + upstream: + j17_build: + - success - j17_dtests_vnode: requires: - j17_build + upstream: + j17_build: + - success - j17_dtests_latest: requires: - j17_build + upstream: + j17_build: + - success - start_j17_dtests_large: type: approval - j17_dtests_large: requires: - start_j17_dtests_large - j17_build + upstream: + start_j17_dtests_large: + - success + j17_build: + - success - j17_dtests_large_vnode: requires: - start_j17_dtests_large - j17_build + upstream: + start_j17_dtests_large: + - success + j17_build: + - success - j17_cqlsh_dtests_py38: requires: - j17_build + upstream: + j17_build: + - success - j17_cqlsh_dtests_py311: requires: - j17_build + upstream: + j17_build: + - success - j17_cqlsh_dtests_py38_vnode: requires: - j17_build + upstream: + j17_build: + - success - j17_cqlsh_dtests_py311_vnode: requires: - j17_build + upstream: + j17_build: + - success - start_j17_cqlsh-dtests-latest: type: approval - j17_cqlsh_dtests_py38_latest: requires: - start_j17_cqlsh-dtests-latest - j17_build + upstream: + start_j17_cqlsh-dtests-latest: + - success + j17_build: + - success - j17_cqlsh_dtests_py311_latest: requires: - start_j17_cqlsh-dtests-latest - j17_build + upstream: + start_j17_cqlsh-dtests-latest: + - success + j17_build: + - success - start_utests_long: type: approval - j17_utests_long: requires: - start_utests_long - j17_build + upstream: + start_utests_long: + - success + j17_build: + - success - start_utests_cdc: type: approval - j17_utests_cdc: requires: - start_utests_cdc - j17_build + upstream: + start_utests_cdc: + - success + j17_build: + - success - start_utests_compression: type: approval - j17_utests_compression: requires: - start_utests_compression - j17_build + upstream: + start_utests_compression: + - success + j17_build: + - success - start_utests_stress: type: approval - j17_utests_stress: requires: - start_utests_stress - j17_build + upstream: + start_utests_stress: + - success + j17_build: + - success - start_utests_fqltool: type: approval - j17_utests_fqltool: requires: - start_utests_fqltool - j17_build + upstream: + start_utests_fqltool: + - success + j17_build: + - success + - start_utests_sstableloader: + type: approval + - j17_utests_sstableloader: + requires: + - start_utests_sstableloader + - j17_build + upstream: + start_utests_sstableloader: + - success + j17_build: + - success - start_utests_system_keyspace_directory: type: approval - j17_utests_system_keyspace_directory: requires: - start_utests_system_keyspace_directory - j17_build + upstream: + start_utests_system_keyspace_directory: + - success + j17_build: + - success diff --git a/.circleci/config_template.yml b/.circleci/config_template.yml index d015b5e8728e..a3e09b1fa01f 100644 --- a/.circleci/config_template.yml +++ b/.circleci/config_template.yml @@ -63,6 +63,13 @@ default_env_vars: &default_env_vars # The number of times that new, modified or manually specified fqltool unit tests should be run. REPEATED_UTESTS_FQLTOOL_COUNT: 500 + # Comma-separated list of tests that should be included in the repeated run for sstableloader unit tests, + # in addition to automatically detected new and modified tests. For example: + # REPEATED_UTESTS_SSTABLELOADER: org.apache.cassandra.tools.LoaderOptionsTest + REPEATED_UTESTS_SSTABLELOADER: + # The number of times that new, modified or manually specified sstableloader unit tests should be run. + REPEATED_UTESTS_SSTABLELOADER_COUNT: 500 + # Comma-separated list of tests that should be included in the repeated run for long unit tests, # in addition to automatically detected new and modified tests. For example: # REPEATED_UTESTS_LONG: org.apache.cassandra.db.commitlog.CommitLogStressTest @@ -537,6 +544,30 @@ j11_separate_jobs: &j11_separate_jobs requires: - start_j17_utests_fqltool_repeat - j11_build + - start_j11_utests_sstableloader: + type: approval + - j11_utests_sstableloader: + requires: + - start_j11_utests_sstableloader + - j11_build + - start_j17_utests_sstableloader: + type: approval + - j17_utests_sstableloader: + requires: + - start_j17_utests_sstableloader + - j11_build + - start_j11_utests_sstableloader_repeat: + type: approval + - j11_utests_sstableloader_repeat: + requires: + - start_j11_utests_sstableloader_repeat + - j11_build + - start_j17_utests_sstableloader_repeat: + type: approval + - j17_utests_sstableloader_repeat: + requires: + - start_j17_utests_sstableloader_repeat + - j11_build - start_j11_utests_system_keyspace_directory: type: approval - j11_utests_system_keyspace_directory: @@ -974,6 +1005,24 @@ j11_pre-commit_jobs: &j11_pre-commit_jobs requires: - start_utests_fqltool - j11_build + - start_utests_sstableloader: + type: approval + - j11_utests_sstableloader: + requires: + - start_utests_sstableloader + - j11_build + - j17_utests_sstableloader: + requires: + - start_utests_sstableloader + - j11_build + - j11_utests_sstableloader_repeat: + requires: + - start_utests_sstableloader + - j11_build + - j17_utests_sstableloader_repeat: + requires: + - start_utests_sstableloader + - j11_build - start_utests_system_keyspace_directory: type: approval - j11_utests_system_keyspace_directory: @@ -1356,6 +1405,18 @@ j17_separate_jobs: &j17_separate_jobs requires: - start_j17_utests_fqltool_repeat - j17_build + - start_j17_utests_sstableloader: + type: approval + - j17_utests_sstableloader: + requires: + - start_j17_utests_sstableloader + - j17_build + - start_j17_utests_sstableloader_repeat: + type: approval + - j17_utests_sstableloader_repeat: + requires: + - start_j17_utests_sstableloader_repeat + - j17_build - start_j17_utests_system_keyspace_directory: type: approval - j17_utests_system_keyspace_directory: @@ -1541,6 +1602,16 @@ j17_pre-commit_jobs: &j17_pre-commit_jobs requires: - start_utests_fqltool - j17_build + - start_utests_sstableloader: + type: approval + - j17_utests_sstableloader: + requires: + - start_utests_sstableloader + - j17_build + - j17_utests_sstableloader_repeat: + requires: + - start_utests_sstableloader + - j17_build - start_utests_system_keyspace_directory: type: approval - j17_utests_system_keyspace_directory: @@ -1875,6 +1946,22 @@ jobs: - run_junit_tests: target: fqltool-test + j11_utests_sstableloader: + <<: *j11_seq_executor + steps: + - attach_workspace: + at: /home/cassandra + - run_junit_tests: + target: sstableloader-test + + j17_utests_sstableloader: + <<: *j17_seq_executor + steps: + - attach_workspace: + at: /home/cassandra + - run_junit_tests: + target: sstableloader-test + j11_utests_system_keyspace_directory: <<: *j11_par_executor steps: @@ -2380,6 +2467,22 @@ jobs: - log_environment - run_utests_fqltool_repeat + j11_utests_sstableloader_repeat: + <<: *j11_repeated_utest_executor + steps: + - attach_workspace: + at: /home/cassandra + - log_environment + - run_utests_sstableloader_repeat + + j17_utests_sstableloader_repeat: + <<: *j17_repeated_utest_executor + steps: + - attach_workspace: + at: /home/cassandra + - log_environment + - run_utests_sstableloader_repeat + j11_utests_long_repeat: <<: *j11_repeated_utest_executor steps: @@ -3114,6 +3217,14 @@ commands: count: ${REPEATED_UTESTS_FQLTOOL_COUNT} stop_on_failure: ${REPEATED_TESTS_STOP_ON_FAILURE} + run_utests_sstableloader_repeat: + steps: + - run_repeated_utests: + target: sstableloader-test + tests: ${REPEATED_UTESTS_SSTABLELOADER} + count: ${REPEATED_UTESTS_SSTABLELOADER_COUNT} + stop_on_failure: ${REPEATED_TESTS_STOP_ON_FAILURE} + run_utests_stress_repeat: steps: - run_repeated_utests: @@ -3237,6 +3348,7 @@ commands: $target == "test-oa" || \ $target == "test-system-keyspace-directory" || \ $target == "fqltool-test" || \ + $target == "sstableloader-test" || \ $target == "long-test" || \ $target == "stress-test" || \ $target == "test-simulator-dtest" ]]; then @@ -3365,6 +3477,7 @@ commands: $target == "test-oa" || \ $target == "test-system-keyspace-directory" || \ $target == "fqltool-test" || \ + $target == "sstableloader-test" || \ $target == "long-test" || \ $target == "stress-test" || \ $target == "test-simulator-dtest" ]]; then diff --git a/.circleci/generate.sh b/.circleci/generate.sh index 97a433e2d77c..26d68dc2a3b2 100755 --- a/.circleci/generate.sh +++ b/.circleci/generate.sh @@ -51,6 +51,8 @@ print_help() echo " -e REPEATED_UTESTS_COUNT=500" echo " -e REPEATED_UTESTS_FQLTOOL=org.apache.cassandra.fqltool.FQLCompareTest" echo " -e REPEATED_UTESTS_FQLTOOL_COUNT=500" + echo " -e REPEATED_UTESTS_SSTABLELOADER=org.apache.cassandra.tools.LoaderOptionsTest" + echo " -e REPEATED_UTESTS_SSTABLELOADER_COUNT=500" echo " -e REPEATED_UTESTS_LONG=org.apache.cassandra.db.commitlog.CommitLogStressTest" echo " -e REPEATED_UTESTS_LONG_COUNT=100" echo " -e REPEATED_UTESTS_STRESS=org.apache.cassandra.stress.generate.DistributionGaussianTest" @@ -131,6 +133,8 @@ if $has_env_vars && $check_env_vars; then [ "$key" != "REPEATED_UTESTS_COUNT" ] && [ "$key" != "REPEATED_UTESTS_FQLTOOL" ] && [ "$key" != "REPEATED_UTESTS_FQLTOOL_COUNT" ] && + [ "$key" != "REPEATED_UTESTS_SSTABLELOADER" ] && + [ "$key" != "REPEATED_UTESTS_SSTABLELOADER_COUNT" ] && [ "$key" != "REPEATED_UTESTS_LONG" ] && [ "$key" != "REPEATED_UTESTS_LONG_COUNT" ] && [ "$key" != "REPEATED_UTESTS_STRESS" ] && @@ -171,7 +175,7 @@ if $free; then elif $paid; then ($all || $free) && die "Cannot use option -p with options -a or -f" echo "Generating new config.yml file for paid tier from config_template.yml" - patch -o $BASEDIR/config_template.yml.PAID $BASEDIR/config_template.yml $BASEDIR/config_template.yml.PAID.patch + patch --silent -o $BASEDIR/config_template.yml.PAID $BASEDIR/config_template.yml $BASEDIR/config_template.yml.PAID.patch circleci config process $BASEDIR/config_template.yml.PAID > $BASEDIR/config.yml.PAID.tmp cat $BASEDIR/license.yml $BASEDIR/config.yml.PAID.tmp > $BASEDIR/config.yml rm $BASEDIR/config_template.yml.PAID $BASEDIR/config.yml.PAID.tmp @@ -188,7 +192,7 @@ elif $all; then rm $BASEDIR/config.yml.FREE.tmp # setup config for paid tier - patch -o $BASEDIR/config_template.yml.PAID $BASEDIR/config_template.yml $BASEDIR/config_template.yml.PAID.patch + patch --silent -o $BASEDIR/config_template.yml.PAID $BASEDIR/config_template.yml $BASEDIR/config_template.yml.PAID.patch circleci config process $BASEDIR/config_template.yml.PAID > $BASEDIR/config.yml.PAID.tmp cat $BASEDIR/license.yml $BASEDIR/config.yml.PAID.tmp > $BASEDIR/config.yml.PAID rm $BASEDIR/config_template.yml.PAID $BASEDIR/config.yml.PAID.tmp @@ -241,6 +245,7 @@ if $detect_changed_tests; then add_diff_tests "REPEATED_UTESTS_LONG" "test/long/" "org.apache.cassandra" add_diff_tests "REPEATED_UTESTS_STRESS" "tools/stress/test/unit/" "org.apache.cassandra.stress" add_diff_tests "REPEATED_UTESTS_FQLTOOL" "tools/fqltool/test/unit/" "org.apache.cassandra.fqltool" + add_diff_tests "REPEATED_UTESTS_SSTABLELOADER" "tools/sstableloader/test/unit/" "org.apache.cassandra.tools" add_diff_tests "REPEATED_SIMULATOR_DTESTS" "test/simulator/test/" "org.apache.cassandra.simulator.test" add_diff_tests "REPEATED_JVM_DTESTS" "test/distributed/" "org.apache.cassandra.distributed.test" add_diff_tests "REPEATED_JVM_UPGRADE_DTESTS" "test/distributed/" "org.apache.cassandra.distributed.upgrade" @@ -305,6 +310,10 @@ delete_repeated_jobs() delete_job "$1" "j11_utests_fqltool_repeat" delete_job "$1" "j17_utests_fqltool_repeat" fi + if (! (echo "$env_vars" | grep -q "REPEATED_UTESTS_SSTABLELOADER=")); then + delete_job "$1" "j11_utests_sstableloader_repeat" + delete_job "$1" "j17_utests_sstableloader_repeat" + fi if (! (echo "$env_vars" | grep -q "REPEATED_SIMULATOR_DTESTS=")); then delete_job "$1" "j11_simulator_dtests_repeat" fi @@ -386,6 +395,7 @@ build_dev_min_jobs() delete_job "$1" "j11_utests_cdc" delete_job "$1" "j11_utests_compression" delete_job "$1" "j11_utests_fqltool" + delete_job "$1" "j11_utests_sstableloader" delete_job "$1" "j11_utests_long" delete_job "$1" "j11_utests_stress" delete_job "$1" "j11_utests_system_keyspace_directory" @@ -394,6 +404,7 @@ build_dev_min_jobs() delete_job "$1" "j17_utests_cdc" delete_job "$1" "j17_utests_compression" delete_job "$1" "j17_utests_fqltool" + delete_job "$1" "j17_utests_sstableloader" delete_job "$1" "j17_utests_long" delete_job "$1" "j17_utests_stress" delete_job "$1" "j11_utests_latest" @@ -403,6 +414,7 @@ build_dev_min_jobs() delete_job "$1" "start_utests_stress" delete_job "$1" "start_utests_long" delete_job "$1" "start_utests_fqltool" + delete_job "$1" "start_utests_sstableloader" delete_job "$1" "start_utests_compression" delete_job "$1" "start_utests_cdc" delete_job "$1" "start_j17_cqlsh-dtests-latest" diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 000000000000..616dacf610a7 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,4 @@ +[submodule "modules/accord"] + path = modules/accord + url = https://github.com/apache/cassandra-accord.git + branch = trunk diff --git a/.jenkins/Jenkinsfile b/.jenkins/Jenkinsfile index 4d5cb6dfa189..60e1f093ccc2 100644 --- a/.jenkins/Jenkinsfile +++ b/.jenkins/Jenkinsfile @@ -127,9 +127,9 @@ def pipelineProfiles() { return [ 'packaging': ['artifacts', 'lint', 'debian', 'redhat'], 'skinny': ['lint', 'cqlsh-test', 'test', 'jvm-dtest', 'simulator-dtest', 'dtest'], - 'pre-commit': ['artifacts', 'lint', 'debian', 'redhat', 'fqltool-test', 'cqlsh-test', 'test', 'test-latest', 'stress-test', 'test-burn', 'jvm-dtest', 'simulator-dtest', 'dtest', 'dtest-latest'], - 'pre-commit w/ upgrades': ['artifacts', 'lint', 'debian', 'redhat', 'fqltool-test', 'cqlsh-test', 'test', 'test-latest', 'stress-test', 'test-burn', 'jvm-dtest', 'jvm-dtest-upgrade', 'simulator-dtest', 'dtest', 'dtest-novnode', 'dtest-latest', 'dtest-upgrade'], - 'post-commit': ['artifacts', 'lint', 'debian', 'redhat', 'fqltool-test', 'cqlsh-test', 'test-cdc', 'test', 'test-latest', 'test-compression', 'stress-test', 'test-burn', 'long-test', 'test-oa', 'test-system-keyspace-directory', 'jvm-dtest', 'jvm-dtest-upgrade', 'simulator-dtest', 'dtest', 'dtest-novnode', 'dtest-latest', 'dtest-large', 'dtest-large-novnode', 'dtest-large-latest', 'dtest-upgrade', 'dtest-upgrade-novnode', 'dtest-upgrade-large', 'dtest-upgrade-novnode-large'], + 'pre-commit': ['artifacts', 'lint', 'debian', 'redhat', 'fqltool-test', 'sstableloader-test', 'cqlsh-test', 'test', 'test-latest', 'stress-test', 'test-burn', 'jvm-dtest', 'simulator-dtest', 'dtest', 'dtest-latest'], + 'pre-commit w/ upgrades': ['artifacts', 'lint', 'debian', 'redhat', 'fqltool-test', 'sstableloader-test', 'cqlsh-test', 'test', 'test-latest', 'stress-test', 'test-burn', 'jvm-dtest', 'jvm-dtest-upgrade', 'simulator-dtest', 'dtest', 'dtest-novnode', 'dtest-latest', 'dtest-upgrade'], + 'post-commit': ['artifacts', 'lint', 'debian', 'redhat', 'fqltool-test', 'sstableloader-test', 'cqlsh-test', 'test-cdc', 'test', 'test-latest', 'test-compression', 'stress-test', 'test-burn', 'long-test', 'test-oa', 'test-system-keyspace-directory', 'jvm-dtest', 'jvm-dtest-upgrade', 'simulator-dtest', 'dtest', 'dtest-novnode', 'dtest-latest', 'dtest-large', 'dtest-large-novnode', 'dtest-large-latest', 'dtest-upgrade', 'dtest-upgrade-novnode', 'dtest-upgrade-large', 'dtest-upgrade-novnode-large'], 'custom': [] ] } @@ -164,16 +164,17 @@ def tasks() { // (some buffer on the heaviest split under the 1h max is required, ref `timeout(…)` in `test(…)`) 'cqlsh-test': [splits: 1], 'fqltool-test': [splits: 1, size: 'small'], - 'test-cdc': [splits: 8], - 'test': [splits: 16], - 'test-latest': [splits: 16], - 'test-compression': [splits: 16], + 'sstableloader-test': [splits: 1, size: 'small'], + 'test-cdc': [splits: 20], + 'test': [splits: 20], + 'test-latest': [splits: 20], + 'test-compression': [splits: 20], 'stress-test': [splits: 1, size: 'small'], 'test-burn': [splits: 2], 'long-test': [splits: 4], - 'test-oa': [splits: 16], - 'test-system-keyspace-directory': [splits: 16], - 'jvm-dtest': [splits: 12], + 'test-oa': [splits: 20], + 'test-system-keyspace-directory': [splits: 20], + 'jvm-dtest': [splits: 16], 'jvm-dtest-upgrade': [splits: 6], 'simulator-dtest': [splits: 1, size: 'large'], 'dtest': [splits: 64, size: 'large'], @@ -182,10 +183,10 @@ def tasks() { 'dtest-large': [splits: 6, size: 'large'], 'dtest-large-novnode': [splits: 6, size: 'large'], 'dtest-large-latest': [splits: 6, size: 'large'], - 'dtest-upgrade': [splits: 128, size: 'large'], - 'dtest-upgrade-novnode': [splits: 128, size: 'large'], - 'dtest-upgrade-large': [splits: 32, size: 'large'], - 'dtest-upgrade-novnode-large': [splits: 32, size: 'large'], + 'dtest-upgrade': [splits: 160, size: 'large'], + 'dtest-upgrade-novnode': [splits: 160, size: 'large'], + 'dtest-upgrade-large': [splits: 40, size: 'large'], + 'dtest-upgrade-novnode-large': [splits: 40, size: 'large'], ] testSteps.each() { it.value.put('type', 'test') @@ -337,7 +338,7 @@ def build(command, cell) { test -f .jenkins/Jenkinsfile || { echo "Invalid git fork/branch"; exit 1; } grep -q "Jenkins CI declaration" .jenkins/Jenkinsfile || { echo "Only Cassandra 5.0+ supported"; exit 1; } """ - fetchDockerImages(['almalinux-build', 'bullseye-build']) + fetchDockerImages("redhat" == cell.step ? ['almalinux-build'] : ['bullseye-build']) def cell_suffix = "_jdk${cell.jdk}_${cell.arch}" def logfile = "stage-logs/${JOB_NAME}_${BUILD_NUMBER}_${cell.step}${cell_suffix}_attempt${attempt}.log.xz" def script_vars = "#!/bin/bash \n set -o pipefail ; " // pipe to tee needs pipefail diff --git a/.snyk b/.snyk index e111ff3e10b1..aae9ae408474 100644 --- a/.snyk +++ b/.snyk @@ -16,3 +16,5 @@ ignore: - reason: Suppressed due to internal review, see project's .build/dependency-check-suppressions.xml CVE-2024-45772: - reason: https://issues.apache.org/jira/browse/CASSANDRA-20024 -- ^pkg:maven/org\.apache\.lucene/lucene\-.*@9.7.0$ + CVE-2025-25193: + - reason: https://issues.apache.org/jira/browse/CASSANDRA-20504 -- ^pkg:maven/io\.netty/netty\-.*@.*$ diff --git a/CHANGES.txt b/CHANGES.txt index 0e155821c0ab..c54c0d3ffc64 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,4 +1,37 @@ 5.1 + * Support for add and replace in IntervalTree (CASSANDRA-20513) + * Enable single_sstable_uplevel by default for LCS (CASSANDRA-18509) + * Introduce NativeAccessor to avoid new ByteBuffer allocation on flush for each NativeCell (CASSANDRA-20173) + * Migrate sstableloader code to its own tools directory and artifact (CASSANDRA-20328) + * Stop AutoRepair monitoring thread upon Cassandra shutdown (CASSANDRA-20623) + * Avoid duplicate hardlink error upon forceful taking of ephemeral snapshots during repair (CASSANDRA-20490) + * When a custom disk error handler fails to initiate, fail the startup of a node instead of using the no-op handler (CASSANDRA-20614) + * Rewrite constraint framework to remove column specification from constraint definition, introduce SQL-like NOT NULL (CASSANDRA-20563) + * Fix a bug in AutoRepair duration metric calculation if schedule finishes quickly (CASSANDRA-20622) + * Fix AutoRepair flaky InJvm dtest (CASSANDRA-20620) + * Increasing default for auto_repair.sstable_upper_threshold considering large Cassandra tables & revert three lines removed from CHANGES.txt due to a merge mistake (CASSANDRA-20586) + * Fix token restrictions with MIN_TOKEN (CASSANDRO-20557) + * Upgrade logback version to 1.5.18 and slf4j dependencies to 2.0.17 (CASSANDRA-20429) + * Switch memtable-related off-heap objects to Native Endian and Memory to Little Endian (CASSANDRA-20190) + * Change SSTableSimpleScanner to use SSTableReader#openDataReaderForScan (CASSANDRA-20538) + * Automated Repair Inside Cassandra [CEP-37] (CASSANDRA-19918) + * Implement appender of slow queries to system_views.slow_queries table (CASSANDRA-13001) + * Add autocompletion in CQLSH for built-in functions (CASSANDRA-19631) + * Grant permission on keyspaces system_views and system_virtual_schema not possible (CASSANDRA-20171) + * General Purpose Transactions (Accord) [CEP-15] (CASSANDRA-17092) + * Improve performance when getting writePlacementsAllSettled from ClusterMetadata (CASSANDRA-20526) + * Add nodetool command to dump the contents of the system_views.{cluster_metadata_log, cluster_metadata_directory} tables (CASSANDRA-20525) + * Fix TreeMap race in CollectionVirtualTableAdapter causing us to lose rows in the virtual table (CASSANDRA-20524) + * Improve metadata log catch up with inter-DC mutation forwarding (CASSANDRA-20523) + * Support topology-safe changes to Datacenter & Rack for live nodes (CASSANDRA-20528) + * Add SSTableIntervalTree latency metric (CASSANDRA-20502) + * Ignore repetitions of semicolon in CQLSH (CASSANDRA-19956) + * Avoid NPE during cms initialization abort (CASSANDRA-20527) + * Avoid failing queries when epoch changes and replica goes up/down (CASSANDRA-20489) + * Split out truncation record lock (CASSANDRA-20480) + * Throw new IndexBuildInProgressException when queries fail during index build, instead of IndexNotAvailableException (CASSANDRA-20402) + * Fix Paxos repair interrupts running transactions (CASSANDRA-20469) + * Various fixes in constraint framework (CASSANDRA-20481) * Add support in CAS for -= on numeric types, and fixed improper handling of empty bytes which lead to NPE (CASSANDRA-20477) * Do not fail to start a node with materialized views after they are turned off in config (CASSANDRA-20452) * Fix nodetool gcstats output, support human-readable units and more output formats (CASSANDRA-19022) @@ -163,6 +196,23 @@ * Add the ability to disable bulk loading of SSTables (CASSANDRA-18781) * Clean up obsolete functions and simplify cql_version handling in cqlsh (CASSANDRA-18787) Merged from 5.0: + * Full Java 17 support (CASSANDRA-20681) + * Ensure replica filtering protection does not trigger unnecessary short read protection reads (CASSANDRA-20639) + * Unified Compaction does not properly validate min and target sizes (CASSANDRA-20398) + * Avoid lambda usage in TrieMemoryIndex range queries and ensure queue size tracking is per column (CASSANDRA-20668) + * Avoid CQLSH throwing an exception loading .cqlshrc on non-supported platforms (CASSANDRA-20478) + * Relax validation of snapshot name as a part of SSTable files path validation (CASSANDRA-20649) + * Optimize initial skipping logic for SAI queries on large partitions (CASSANDRA-20191) + * zero copy streaming allocates direct memory that isn't used, but does help to fragment the memory space (CASSANDRA-20577) + * CQLSSTableWriter supports setting the format (BTI or Big) (CASSANDRA-20609) + * Don't allocate in ThreadLocalReadAheadBuffer#close() (CASSANDRA-20551) + * Ensure RowFilter#isMutableIntersection() properly evaluates numeric ranges on a single column (CASSANDRA-20566) + * SAI marks an index as non-empty when a partial partition/row modifications is flushed due to repair (CASSANDRA-20567) + * SAI fails queries when multiple columns exist and a non-indexed column is a composite with a map (CASSANDRA-19891) + * Avoid purging deletions in RowFilter when reconciliation is required (CASSANDRA-20541) + * Fixed multiple single-node SAI query bugs relating to static columns (CASSANDRA-20338) + * Upgrade com.datastax.cassandra:cassandra-driver-core:3.11.5 to org.apache.cassandra:cassandra-driver-core:3.12.1 (CASSANDRA-17231) + * Update netty to 4.1.119.Final and netty-tcnative to 2.0.70.Final (CASSANDRA-20314) * Serialization can lose complex deletions in a mutation with multiple collections in a row (CASSANDRA-20449) * Improve error messages when initializing auth classes (CASSANDRA-20368) * Prioritize legacy 2i over SAI for columns with multiple indexes (CASSANDRA-20334) @@ -206,6 +256,7 @@ Merged from 5.0: * Prioritize built indexes in IndexStatusManager (CASSANDRA-19400) * Add java.base/java.lang.reflect among opens for jvm11-client.options (CASSANDRA-19780) Merged from 4.1: + * Fix mixed mode paxos ttl commit hang (CASSANDRA-20514) * Fix paxos mixed mode infinite loop (CASSANDRA-20493) * Optionally skip exception logging on invalid legacy protocol magic exception (CASSANDRA-19483) * Fix SimpleClient ability to release acquired capacity (CASSANDRA-20202) @@ -214,6 +265,15 @@ Merged from 4.1: * Enforce CQL message size limit on multiframe messages (CASSANDRA-20052) * Fix race condition in DecayingEstimatedHistogramReservoir during rescale (CASSANDRA-19365) Merged from 4.0: + * Ensure prepared_statement INSERT timestamp precedes eviction DELETE (CASSANDRA-19703) + * Gossip doesn't converge due to race condition when updating EndpointStates multiple fields (CASSANDRA-20659) + * Handle sstable metadata stats file getting a new mtime after compaction has finished (CASSANDRA-18119) + * Honor MAX_PARALLEL_TRANSFERS correctly (CASSANDRA-20532) + * Updating a column with a new TTL but same expiration time is non-deterministic and causes repair mismatches. (CASSANDRA-20561) + * Avoid computing prepared statement size for unprepared batches (CASSANDRA-20556) + * Fix Dropwizard Meter causes timeouts when infrequently used (CASSANDRA-19332) + * Update OWASP dependency checker to version 12.1.0 (CASSANDRA-20501) + * Suppress CVE-2025-25193 (CASSANDRA-20504) * Include in source tree and build packages a Snyk policy file that lists known false positives (CASSANDRA-20319) * Update zstd-jni to 1.5.7-2 (CASSANDRA-20453) * Suppress CVE-2024-12801 (CASSANDRA-20412) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 9b33116d16a1..044615a57143 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -27,6 +27,37 @@ In fact, this repository is a GitHub mirror of [the official repo](https://gitbo Use [Cassandra JIRA](https://issues.apache.org/jira/browse/CASSANDRA/) to create an issue, then either attach a patch or post a link to a GitHub branch with your changes. +# Working with submodules + +Apache Cassandra uses git submodules for a set of dependencies, this is to make cross cutting changes easier for developers. When working on such changes, there are a set of scripts to help with the process. + +## Local Development + +When starting a development branch, the following will change all submodules to a new branch based off the JIRA + +``` +$ .build/sh/development-switch.sh --jira CASSANDRA- +``` + +When changes are made to a submodule (such as to accord), you need to commit and update the reference in Apache Cassandra + +``` +$ (cd modules/accord ; git commit -am 'Saving progress') +$ .build/sh/bump-accord.sh +``` + +## Commit and Merge Process + +Due to the nature of submodules, the changes to the submodules must be committed and pushed before the changes to Apache Cassandra; these are different repositories so git's `--atomic` does not prevent conflicts from concurrent merges; the basic process is as follows: + +* Follow the normal merge process for the submodule +* Update Apache Cassandra's submodule entry to point to the newly committed change; follow the Accord example below for an example + +``` +$ .build/sh/change-submodule-accord.sh +$ .build/sh/bump-accord.sh +``` + # Useful Links - How you can contribute to Apache Cassandra [presentation](http://www.slideshare.net/yukim/cassandrasummit2013) by Yuki Morishita diff --git a/NEWS.txt b/NEWS.txt index b35a3c02745b..2026fb09a762 100644 --- a/NEWS.txt +++ b/NEWS.txt @@ -72,6 +72,11 @@ using the provided 'sstableupgrade' tool. New features ------------ [The following is a placeholder, to be revised asap] + - CEP-37 Auto Repair is a fully automated scheduler that provides repair orchestration within Apache Cassandra. This + significantly reduces operational overhead by eliminating the need for operators to deploy external tools to submit + and manage repairs. See + https://cwiki.apache.org/confluence/display/CASSANDRA/CEP-37+Apache+Cassandra+Unified+Repair+Solution for more + details on the motivation and design. - CEP-21 Transactional Cluster Metadata introduces a distributed log for linearizing modifications to cluster metadata. In the first instance, this encompasses cluster membership, token ownership and schema metadata. See https://cwiki.apache.org/confluence/display/CASSANDRA/CEP-21%3A+Transactional+Cluster+Metadata for more detail on diff --git a/bin/sstableloader b/bin/sstableloader index 9045adfda392..74cc041538e0 100755 --- a/bin/sstableloader +++ b/bin/sstableloader @@ -32,18 +32,16 @@ elif [ -r "$CASSANDRA_INCLUDE" ]; then . "$CASSANDRA_INCLUDE" fi -if [ -z "$CLASSPATH" ]; then - echo "You must set the CLASSPATH var" >&2 - exit 1 -fi +# SSTableLoader has been moved to tools/bin, this script simply +# invokes the script in the new path. +SSTABLELOADER_PATH="$CASSANDRA_HOME/tools/bin/sstableloader" -if [ "x$MAX_HEAP_SIZE" = "x" ]; then - MAX_HEAP_SIZE="256M" +if [ ! -f "$SSTABLELOADER_PATH" ]; then + echo "Error: sstableloader has moved to the tools directory. \ +Detected that $SSTABLELOADER_PATH does not exist." >&2 + exit 2 fi -"$JAVA" $JAVA_AGENT -ea -cp "$CLASSPATH" $JVM_OPTS -Xmx$MAX_HEAP_SIZE \ - -Dcassandra.storagedir="$cassandra_storagedir" \ - -Dlogback.configurationFile=logback-tools.xml \ - org.apache.cassandra.tools.BulkLoader "$@" +"$SSTABLELOADER_PATH" "$@" # vi:ai sw=4 ts=4 tw=0 et diff --git a/build.xml b/build.xml index dd8b0e2d2940..9fbbc815d028 100644 --- a/build.xml +++ b/build.xml @@ -100,6 +100,8 @@ the user specifies the tmp.dir property --> + + @@ -109,8 +111,12 @@ + + + + @@ -220,6 +226,24 @@ + + + + + + + + + + + + + + + + @@ -322,6 +346,7 @@ -XX:-CMSClassUnloadingEnabled -Dio.netty.tryReflectionSetAccessible=true + -XX:MaxMetaspaceSize=2G @@ -360,26 +385,38 @@ - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -527,7 +565,8 @@ - + + @@ -547,7 +586,7 @@ - @@ -719,24 +758,13 @@ - - - - - - - - - - - - + - + - + @@ -837,7 +865,7 @@ @@ -922,7 +950,7 @@ - @@ -973,6 +1001,9 @@ + + + @@ -992,6 +1023,7 @@ + @@ -1009,6 +1041,32 @@ + + + + + + + + + + + + + + + + + + + + + + + + @@ -1072,7 +1131,7 @@ - + @@ -1084,9 +1143,11 @@ + + @@ -1143,6 +1204,7 @@ + @@ -1162,7 +1224,7 @@ - + @@ -1181,6 +1243,7 @@ + @@ -1211,10 +1274,12 @@ + + @@ -1254,16 +1319,14 @@ - - - - + + + - @@ -1341,7 +1404,7 @@ - @@ -1652,12 +1715,14 @@ + + testtag="@{testtag}" showoutput="@{showoutput}" + maxmemory="@{maxmemory}"> @@ -1792,7 +1857,7 @@ - + @@ -2020,8 +2085,10 @@ + + @@ -2061,6 +2128,7 @@ + + + + + + + + + + + @@ -2112,4 +2202,6 @@ + + diff --git a/conf/cassandra-env.sh b/conf/cassandra-env.sh index 36863bd96902..0b4c35fd0c2a 100644 --- a/conf/cassandra-env.sh +++ b/conf/cassandra-env.sh @@ -56,11 +56,14 @@ calculate_heap_sizes() heap_limit="31744" fi half_system_memory_in_mb=`expr $system_memory_in_mb / 2` + quarter_system_memory_in_mb=`expr $system_memory_in_mb / 4` if [ "$half_system_memory_in_mb" -gt "$heap_limit" ] ; then CALCULATED_MAX_HEAP_SIZE="${heap_limit}M" + CALCULATED_MAX_DIRECT_MEMORY_SIZE="`expr $heap_limit / 2`M" CALCULATED_CMS_HEAP_NEWSIZE="8G" else CALCULATED_MAX_HEAP_SIZE="${half_system_memory_in_mb}M" + CALCULATED_MAX_DIRECT_MEMORY_SIZE="${quarter_system_memory_in_mb}M" CALCULATED_CMS_HEAP_NEWSIZE="`expr $half_system_memory_in_mb / 4`M" fi } @@ -87,6 +90,8 @@ echo $JVM_OPTS | grep -q Xmx DEFINED_XMX=$? echo $JVM_OPTS | grep -q Xms DEFINED_XMS=$? +echo $JVM_OPTS | grep -q MaxDirectMemorySize +DEFINED_MAX_DIRECT_MEMORY_SIZE=$? echo $JVM_OPTS | grep -q ParallelGCThreads DEFINED_PARALLEL_GC_THREADS=$? echo $JVM_OPTS | grep -q ConcGCThreads @@ -112,6 +117,7 @@ calculate_heap_sizes #MAX_HEAP_SIZE="20G" #HEAP_NEWSIZE="10G" +#MAX_DIRECT_MEMORY_SIZE="10G" # Set this to control the amount of arenas per-thread in glibc #export MALLOC_ARENA_MAX=4 @@ -130,6 +136,10 @@ elif [ "x$MAX_HEAP_SIZE" = "x" ] || [ "x$HEAP_NEWSIZE" = "x" -a $USING_G1 -ne 0 exit 1 fi +if [ "x$MAX_DIRECT_MEMORY_SIZE" = "x" ]; then + MAX_DIRECT_MEMORY_SIZE="$CALCULATED_MAX_DIRECT_MEMORY_SIZE" +fi + if [ "x$MALLOC_ARENA_MAX" = "x" ] ; then export MALLOC_ARENA_MAX=4 fi @@ -144,6 +154,10 @@ elif [ $DEFINED_XMX -ne 0 ] || [ $DEFINED_XMS -ne 0 ]; then exit 1 fi +if [ $DEFINED_MAX_DIRECT_MEMORY_SIZE -ne 0 ]; then + JVM_OPTS="$JVM_OPTS -XX:MaxDirectMemorySize=${MAX_DIRECT_MEMORY_SIZE}" +fi + # We only set -Xmn flag if it was not defined in jvm-server.options file # and CMS is being used. If defined, both Xmn and Xmx must be defined together. if [ $DEFINED_XMN -eq 0 ] && [ $DEFINED_XMX -ne 0 ]; then diff --git a/conf/cassandra.yaml b/conf/cassandra.yaml index c813bf530ad2..a7efe5735a0d 100644 --- a/conf/cassandra.yaml +++ b/conf/cassandra.yaml @@ -1396,6 +1396,10 @@ request_timeout: 10000ms # How long before a node logs slow queries. Select queries that take longer than # this timeout to execute, will generate an aggregated log message, so that slow queries # can be identified. Set this value to zero to disable slow query logging. +# +# It is possible to log slow queries into system_views.slow_queries virtual table. +# Consult logback.xml to uncomment specific appender and logger to enable this functionality. +# # Min unit: ms slow_query_log_timeout: 500ms @@ -2184,6 +2188,13 @@ report_unconfirmed_repaired_data_mismatches: false # Materialized views are considered experimental and are not recommended for production use. materialized_views_enabled: false +# Specify whether Materialized View mutations are replayed through the write path on streaming, e.g. repair. +# When enabled, Materialized View data streamed to the destination node will be written into commit log first. When setting to false, +# the streamed Materialized View data is written into SSTables just the same as normal streaming. The default is true. +# If this is set to false, streaming will be considerably faster however it's possible that, in extreme situations +# (losing > quorum # nodes in a replica set), you may have data in your SSTables that never makes it to the Materialized View. +# materialized_views_on_repair_enabled: true + # Enables SASI index creation on this node. # SASI indexes are considered experimental and are not recommended for production use. sasi_indexes_enabled: false @@ -2648,3 +2659,181 @@ drop_compact_storage_enabled: false # compatibility mode would no longer toggle behaviors as when it was running in the UPGRADING mode. # storage_compatibility_mode: NONE + +#accord: +# # Enables the execution of Accord (multi-key) transactions on this node. +# enabled: false +# +# # Journal directory for Accord +# journal_directory: +# +# # The number of Accord shards on this node; -1 means use the number of cores +# queue_shard_count: -1 +# +# # The number of Accord shards on this node; -1 means use the number of cores +# command_store_shard_count: -1 +# +# # Recover delay: the time between a transaction being initiated and a remote replica being willing to interrupt it to complete it +# recover_delay: 1s +# +# # how quickly the fast path is reconfigured when nodes go up/down +# fast_path_update_delay: 5s + +# Prevents preparing a repair session or beginning a repair streaming session if pending compactions is over +# the given value. Defaults to disabled. +# reject_repair_compaction_threshold: 1024 + +# At least 20% of disk must be unused to run incremental repair. It is useful to avoid disks filling up during +# incremental repair as anti-compaction during incremental repair may contribute to additional space temporarily. +# if you want to disable this feature (the recommendation is not to, but if you want to disable it for whatever reason) +# then set the ratio to 0.0 +# incremental_repair_disk_headroom_reject_ratio: 0.2; + +# Configuration for Auto Repair Scheduler. +# +# This feature is disabled by default. +# +# See: https://cassandra.apache.org/doc/latest/cassandra/managing/operating/auto_repair.html for an overview of this +# feature. +# +# auto_repair: +# # Enable/Disable the auto-repair scheduler. +# # If set to false, the scheduler thread will not be started. +# # If set to true, the repair scheduler thread will be created. The thread will +# # check for secondary configuration available for each repair type (full, incremental, +# # and preview_repaired), and based on that, it will schedule repairs. +# enabled: true +# repair_type_overrides: +# full: +# # Enable/Disable full auto-repair +# enabled: true +# # Minimum duration between repairing the same node again. This is useful for tiny clusters, +# # such as clusters with 5 nodes that finish repairs quickly. This means that if the scheduler completes one +# # round on all nodes in less than this duration, it will not start a new repair round on a given node until +# # this much time has passed since the last repair completed. Consider increasing to a larger value to reduce +# # the impact of repairs, however note that one should attempt to run repairs at a smaller interval than +# # gc_grace_seconds to avoid potential data resurrection. +# min_repair_interval: 24h +# token_range_splitter: +# # Implementation of IAutoRepairTokenRangeSplitter; responsible for splitting token ranges +# # for repair assignments. +# # +# # Out of the box, Cassandra provides org.apache.cassandra.repair.autorepair.{RepairTokenRangeSplitter, +# # FixedTokenRangeSplitter}. +# # +# # - RepairTokenRangeSplitter (default) attempts to intelligently split ranges based on data size and partition +# # count. +# # - FixedTokenRangeSplitter splits into fixed ranges based on the 'number_of_subranges' option. +# # class_name: org.apache.cassandra.repair.autorepair.RepairTokenRangeSplitter +# +# # Optional parameters can be specified in the form of: +# # parameters: +# # param_key1: param_value1 +# parameters: +# # The target and maximum amount of compressed bytes that should be included in a repair assignment. +# # This scopes the amount of work involved in a repair and includes the data covering the range being +# # repaired. +# bytes_per_assignment: 50GiB +# # The maximum number of bytes to cover in an individual schedule. This serves as +# # a mechanism to throttle the work done in each repair cycle. You may reduce this +# # value if the impact of repairs is causing too much load on the cluster or increase it +# # if writes outpace the amount of data being repaired. Alternatively, adjust the +# # min_repair_interval. +# # This is set to a large value for full repair to attempt to repair all data per repair schedule. +# max_bytes_per_schedule: 100000GiB +# incremental: +# enabled: false +# # Incremental repairs operate over unrepaired data and should finish quickly. Running incremental repair +# # frequently keeps the unrepaired set smaller and thus causes repairs to operate over a smaller set of data, +# # so a more frequent schedule such as 1h is recommended. +# # NOTE: Please consult +# # https://cassandra.apache.org/doc/latest/cassandra/managing/operating/auto_repair.html#enabling-ir +# # for guidance on enabling incremental repair on ane exiting cluster. +# min_repair_interval: 24h +# token_range_splitter: +# parameters: +# # Configured to attempt repairing 50GiB of compressed data per repair. +# # This throttles the amount of incremental repair and anticompaction done per schedule after incremental +# # repairs are turned on. +# bytes_per_assignment: 50GiB +# # Restricts the maximum number of bytes to cover in an individual schedule to the configured +# # max_bytes_per_schedule value (defaults to 100GiB for incremental). +# # Consider increasing this value if more data is written than this limit within the min_repair_interval. +# max_bytes_per_schedule: 100GiB +# preview_repaired: +# # Performs preview repair over repaired SSTables, useful to detect possible inconsistencies in the repaired +# # data set. +# enabled: false +# min_repair_interval: 24h +# token_range_splitter: +# parameters: +# bytes_per_assignment: 50GiB +# max_bytes_per_schedule: 100000GiB +# # Time interval between successive checks to see if ongoing repairs are complete or if it is time to schedule +# # repairs. +# repair_check_interval: 5m +# # Minimum duration for the execution of a single repair task. This prevents the scheduler from overwhelming +# # the node by scheduling too many repair tasks in a short period of time. +# repair_task_min_duration: 5s +# # The scheduler needs to adjust its order when nodes leave the ring. Deleted hosts are tracked in metadata +# # for a specified duration to ensure they are indeed removed before adjustments are made to the schedule. +# history_clear_delete_hosts_buffer_interval: 2h +# # NOTE: Each of the below settings can be overridden per repair type under repair_type_overrides +# global_settings: +# # If true, attempts to group tables in the same keyspace into one repair; otherwise, each table is repaired +# # individually. +# repair_by_keyspace: true +# # Number of threads to use for each repair job scheduled by the scheduler. Similar to the -j option in nodetool +# # repair. +# number_of_repair_threads: 1 +# # Number of nodes running repair in parallel. If parallel_repair_percentage is set, the larger value is used. +# parallel_repair_count: 3 +# # Percentage of nodes in the cluster running repair in parallel. If parallel_repair_count is set, the larger value +# # is used. +# parallel_repair_percentage: 3 +# # Whether to allow a node to take its turn running repair while one or more of its replicas are running repair. +# # Defaults to false, as running repairs concurrently on replicas can increase load and also cause anticompaction +# # conflicts while running incremental repair. +# allow_parallel_replica_repair: false +# # An addition to allow_parallel_replica_repair that also blocks repairs when replicas (including this node itself) +# # are repairing in any schedule. For example, if a replica is executing full repairs, a value of false will +# # prevent starting incremental repairs for this node. Defaults to true and is only evaluated when +# # allow_parallel_replica_repair is false. +# allow_parallel_replica_repair_across_schedules: true +# # Repairs materialized views if true. +# materialized_view_repair_enabled: false +# # Delay before starting repairs after a node restarts to avoid repairs starting immediately after a restart. +# initial_scheduler_delay: 5m +# # Timeout for retrying stuck repair sessions. +# repair_session_timeout: 3h +# # Force immediate repair on new nodes after they join the ring. +# force_repair_new_node: false +# # Threshold to skip repairing tables with too many SSTables. Defaults to 10,000 SSTables to avoid penalizing good +# # tables. +# sstable_upper_threshold: 50000 +# # Maximum time allowed for repairing one table on a given node. If exceeded, the repair proceeds to the +# # next table. +# table_max_repair_time: 6h +# # Avoid running repairs in specific data centers. By default, repairs run in all data centers. Specify data +# # centers to exclude in this list. Note that repair sessions will still consider all replicas from excluded +# # data centers. Useful if you have keyspaces that are not replicated in certain data centers, and you want to +# # not run repair schedule in certain data centers. +# ignore_dcs: [] +# # Repair only the primary ranges owned by a node. Equivalent to the -pr option in nodetool repair. Defaults +# # to true. General advice is to keep this true. +# repair_primary_token_range_only: true +# # Maximum number of retries for a repair session. +# repair_max_retries: 3 +# # Backoff time before retrying a repair session. +# repair_retry_backoff: 30s +# token_range_splitter: +# # Splitter implementation to generate repair assignments. Defaults to RepairTokenRangeSplitter. +# class_name: org.apache.cassandra.repair.autorepair.RepairTokenRangeSplitter +# parameters: +# # Maximum number of partitions to include in a repair assignment. Used to reduce number of partitions +# # present in merkle tree leaf nodes to avoid overstreaming. +# partitions_per_assignment: 1048576 +# # Maximum number of tables to include in a repair assignment. This reduces the number of repairs, +# # especially in keyspaces with many tables. The splitter avoids batching tables together if they +# # exceed other configuration parameters like bytes_per_assignment or partitions_per_assignment. +# max_tables_per_assignment: 64 diff --git a/conf/cassandra_latest.yaml b/conf/cassandra_latest.yaml index 9c86beeea829..69b4c647f731 100644 --- a/conf/cassandra_latest.yaml +++ b/conf/cassandra_latest.yaml @@ -2047,6 +2047,13 @@ report_unconfirmed_repaired_data_mismatches: false # Materialized views are considered experimental and are not recommended for production use. materialized_views_enabled: false +# Specify whether Materialized View mutations are replayed through the write path on streaming, e.g. repair. +# When enabled, Materialized View data streamed to the destination node will be written into commit log first. When setting to false, +# the streamed Materialized View data is written into SSTables just the same as normal streaming. The default is true. +# If this is set to false, streaming will be considerably faster however it's possible that, in extreme situations +# (losing > quorum # nodes in a replica set), you may have data in your SSTables that never makes it to the Materialized View. +# materialized_views_on_repair_enabled: true + # Enables SASI index creation on this node. # SASI indexes are considered experimental and are not recommended for production use. sasi_indexes_enabled: false @@ -2362,3 +2369,160 @@ default_secondary_index_enabled: true # compatibility mode would no longer toggle behaviors as when it was running in the UPGRADING mode. # storage_compatibility_mode: NONE + +# Prevents preparing a repair session or beginning a repair streaming session if pending compactions is over +# the given value. Defaults to disabled. +# reject_repair_compaction_threshold: 1024 + +# At least 20% of disk must be unused to run incremental repair. It is useful to avoid disks filling up during +# incremental repair as anti-compaction during incremental repair may contribute to additional space temporarily. +# if you want to disable this feature (the recommendation is not to, but if you want to disable it for whatever reason) +# then set the ratio to 0.0 +# incremental_repair_disk_headroom_reject_ratio: 0.2; + +# Configuration for Auto Repair Scheduler. +# +# This feature is disabled by default. +# +# See: https://cassandra.apache.org/doc/latest/cassandra/managing/operating/auto_repair.html for an overview of this +# feature. +# +# auto_repair: +# # Enable/Disable the auto-repair scheduler. +# # If set to false, the scheduler thread will not be started. +# # If set to true, the repair scheduler thread will be created. The thread will +# # check for secondary configuration available for each repair type (full, incremental, +# # and preview_repaired), and based on that, it will schedule repairs. +# enabled: true +# repair_type_overrides: +# full: +# # Enable/Disable full auto-repair +# enabled: true +# # Minimum duration between repairing the same node again. This is useful for tiny clusters, +# # such as clusters with 5 nodes that finish repairs quickly. This means that if the scheduler completes one +# # round on all nodes in less than this duration, it will not start a new repair round on a given node until +# # this much time has passed since the last repair completed. Consider increasing to a larger value to reduce +# # the impact of repairs, however note that one should attempt to run repairs at a smaller interval than +# # gc_grace_seconds to avoid potential data resurrection. +# min_repair_interval: 24h +# token_range_splitter: +# # Implementation of IAutoRepairTokenRangeSplitter; responsible for splitting token ranges +# # for repair assignments. +# # +# # Out of the box, Cassandra provides org.apache.cassandra.repair.autorepair.{RepairTokenRangeSplitter, +# # FixedTokenRangeSplitter}. +# # +# # - RepairTokenRangeSplitter (default) attempts to intelligently split ranges based on data size and partition +# # count. +# # - FixedTokenRangeSplitter splits into fixed ranges based on the 'number_of_subranges' option. +# # class_name: org.apache.cassandra.repair.autorepair.RepairTokenRangeSplitter +# +# # Optional parameters can be specified in the form of: +# # parameters: +# # param_key1: param_value1 +# parameters: +# # The target and maximum amount of compressed bytes that should be included in a repair assignment. +# # This scopes the amount of work involved in a repair and includes the data covering the range being +# # repaired. +# bytes_per_assignment: 50GiB +# # The maximum number of bytes to cover in an individual schedule. This serves as +# # a mechanism to throttle the work done in each repair cycle. You may reduce this +# # value if the impact of repairs is causing too much load on the cluster or increase it +# # if writes outpace the amount of data being repaired. Alternatively, adjust the +# # min_repair_interval. +# # This is set to a large value for full repair to attempt to repair all data per repair schedule. +# max_bytes_per_schedule: 100000GiB +# incremental: +# # Enable incremental repair by default for new clusters. +# enabled: true +# # Incremental repairs operate over unrepaired data and should finish quickly. Running incremental repair +# # frequently keeps the unrepaired set smaller and thus causes repairs to operate over a smaller set of data, +# # so a more frequent schedule such as 1h is recommended. +# min_repair_interval: 1h +# token_range_splitter: +# parameters: +# # Configured to attempt repairing 50GiB of compressed data per repair. +# # This throttles the amount of incremental repair and anticompaction done per schedule after incremental +# # repairs are turned on. +# bytes_per_assignment: 50GiB +# # Restricts the maximum number of bytes to cover in an individual schedule to the configured +# # max_bytes_per_schedule value (defaults to 100GiB for incremental). +# # Consider increasing this value if more data is written than this limit within the min_repair_interval. +# max_bytes_per_schedule: 100GiB +# preview_repaired: +# # Performs preview repair over repaired SSTables, useful to detect possible inconsistencies in the repaired +# # data set. +# enabled: false +# min_repair_interval: 24h +# token_range_splitter: +# parameters: +# bytes_per_assignment: 50GiB +# max_bytes_per_schedule: 100000GiB +# # Time interval between successive checks to see if ongoing repairs are complete or if it is time to schedule +# # repairs. +# repair_check_interval: 5m +# # Minimum duration for the execution of a single repair task. This prevents the scheduler from overwhelming +# # the node by scheduling too many repair tasks in a short period of time. +# repair_task_min_duration: 5s +# # The scheduler needs to adjust its order when nodes leave the ring. Deleted hosts are tracked in metadata +# # for a specified duration to ensure they are indeed removed before adjustments are made to the schedule. +# history_clear_delete_hosts_buffer_interval: 2h +# # NOTE: Each of the below settings can be overridden per repair type under repair_type_overrides +# global_settings: +# # If true, attempts to group tables in the same keyspace into one repair; otherwise, each table is repaired +# # individually. +# repair_by_keyspace: true +# # Number of threads to use for each repair job scheduled by the scheduler. Similar to the -j option in nodetool +# # repair. +# number_of_repair_threads: 1 +# # Number of nodes running repair in parallel. If parallel_repair_percentage is set, the larger value is used. +# parallel_repair_count: 3 +# # Percentage of nodes in the cluster running repair in parallel. If parallel_repair_count is set, the larger value +# # is used. +# parallel_repair_percentage: 3 +# # Whether to allow a node to take its turn running repair while one or more of its replicas are running repair. +# # Defaults to false, as running repairs concurrently on replicas can increase load and also cause anticompaction +# # conflicts while running incremental repair. +# allow_parallel_replica_repair: false +# # An addition to allow_parallel_replica_repair that also blocks repairs when replicas (including this node itself) +# # are repairing in any schedule. For example, if a replica is executing full repairs, a value of false will +# # prevent starting incremental repairs for this node. Defaults to true and is only evaluated when +# # allow_parallel_replica_repair is false. +# allow_parallel_replica_repair_across_schedules: true +# # Repairs materialized views if true. +# materialized_view_repair_enabled: false +# # Delay before starting repairs after a node restarts to avoid repairs starting immediately after a restart. +# initial_scheduler_delay: 5m +# # Timeout for retrying stuck repair sessions. +# repair_session_timeout: 3h +# # Force immediate repair on new nodes after they join the ring. +# force_repair_new_node: false +# # Threshold to skip repairing tables with too many SSTables. Defaults to 10,000 SSTables to avoid penalizing good +# # tables. +# sstable_upper_threshold: 50000 +# # Maximum time allowed for repairing one table on a given node. If exceeded, the repair proceeds to the +# # next table. +# table_max_repair_time: 6h +# # Avoid running repairs in specific data centers. By default, repairs run in all data centers. Specify data +# # centers to exclude in this list. Note that repair sessions will still consider all replicas from excluded +# # data centers. Useful if you have keyspaces that are not replicated in certain data centers, and you want to +# # not run repair schedule in certain data centers. +# ignore_dcs: [] +# # Repair only the primary ranges owned by a node. Equivalent to the -pr option in nodetool repair. Defaults +# # to true. General advice is to keep this true. +# repair_primary_token_range_only: true +# # Maximum number of retries for a repair session. +# repair_max_retries: 3 +# # Backoff time before retrying a repair session. +# repair_retry_backoff: 30s +# token_range_splitter: +# # Splitter implementation to generate repair assignments. Defaults to RepairTokenRangeSplitter. +# class_name: org.apache.cassandra.repair.autorepair.RepairTokenRangeSplitter +# parameters: +# # Maximum number of partitions to include in a repair assignment. Used to reduce number of partitions +# # present in merkle tree leaf nodes to avoid overstreaming. +# partitions_per_assignment: 1048576 +# # Maximum number of tables to include in a repair assignment. This reduces the number of repairs, +# # especially in keyspaces with many tables. The splitter avoids batching tables together if they +# # exceed other configuration parameters like bytes_per_assignment or partitions_per_assignment. +# max_tables_per_assignment: 64 diff --git a/conf/jvm-server.options b/conf/jvm-server.options index 6be55030a383..c63863aa3b11 100644 --- a/conf/jvm-server.options +++ b/conf/jvm-server.options @@ -162,6 +162,12 @@ # For production use you may wish to adjust this for your environment. # If that's the case, see MAX_HEAP_SIZE (and HEAP_NEWSIZE for CMS) in cassandra-env.sh +##################### +# OFF-HEAP SETTINGS # +##################### + +# By default, this setting is half of max heap size +#-XX:MaxDirectMemorySize= ################################### # EXPIRATION DATE OVERFLOW POLICY # diff --git a/conf/logback.xml b/conf/logback.xml index 102cf06352a4..4855433b99d4 100644 --- a/conf/logback.xml +++ b/conf/logback.xml @@ -23,8 +23,6 @@ appender reference in the root level section below. --> - - @@ -43,7 +41,7 @@ appender reference in the root level section below. 5GB - %-5level [%thread] %date{ISO8601} %F:%L - %msg%n + %-5level [%thread] %date{"yyyy-MM-dd'T'HH:mm:ss,SSS", UTC} %F:%L - %msg%n @@ -60,7 +58,7 @@ appender reference in the root level section below. 5GB - %-5level [%thread] %date{ISO8601} %F:%L - %msg%n + %-5level [%thread] %date{"yyyy-MM-dd'T'HH:mm:ss,SSS", UTC} %F:%L - %msg%n @@ -80,7 +78,7 @@ appender reference in the root level section below. INFO - %-5level [%thread] %date{ISO8601} %F:%L - %msg%n + %-5level [%thread] %date{"yyyy-MM-dd'T'HH:mm:ss,SSS", UTC} %F:%L - %msg%n @@ -98,7 +96,7 @@ appender reference in the root level section below. 5GB - %-5level [%thread] %date{ISO8601} %F:%L - %msg%n + %-5level [%thread] %date{"yyyy-MM-dd'T'HH:mm:ss,SSS", UTC} %F:%L - %msg%n --> @@ -119,6 +117,18 @@ appender reference in the root level section below. --> + + + + + + + + + + diff --git a/debian/cassandra.install b/debian/cassandra.install index 7ee058bb593e..0573128ddd46 100644 --- a/debian/cassandra.install +++ b/debian/cassandra.install @@ -16,7 +16,6 @@ bin/cassandra.in.sh usr/share/cassandra bin/cassandra usr/sbin bin/nodetool usr/bin bin/sstableutil usr/bin -bin/sstableloader usr/bin bin/cqlsh usr/bin bin/cqlsh.py usr/bin bin/sstablescrub usr/bin @@ -28,6 +27,7 @@ tools/bin/auditlogviewer usr/bin tools/bin/jmxtool usr/bin tools/bin/hash_password usr/bin tools/bin/sstablepartitions usr/bin +tools/bin/sstableloader usr/bin lib/*.jar usr/share/cassandra/lib lib/*.zip usr/share/cassandra/lib lib/x86_64/* usr/share/cassandra/lib/x86_64 diff --git a/debian/changelog b/debian/changelog index 4a2fc47012c2..1287b1cb5529 100644 --- a/debian/changelog +++ b/debian/changelog @@ -4,6 +4,61 @@ cassandra (5.1) UNRELEASED; urgency=medium -- Mick Semb Wever Wed, 21 Apr 2021 19:24:28 +0200 +cassandra (5.0.4) unstable; urgency=medium + + * New release + + -- Brandon Williams Mon, 07 Apr 2025 07:04:52 -0500 + +cassandra (5.0.3) unstable; urgency=medium + + * New release + + -- Stefan Miklosovic Mon, 27 Jan 2025 14:49:28 +0100 + +cassandra (5.0.2) unstable; urgency=medium + + * New release + + -- Mick Semb Wever Sat, 12 Oct 2024 14:37:59 +0200 + +cassandra (5.0.1) unstable; urgency=medium + + * New release + + -- Mick Semb Wever Wed, 18 Sep 2024 14:31:50 +0200 + +cassandra (5.0.0) unstable; urgency=medium + + * New release + + -- Mick Semb Wever Thu, 29 Aug 2024 11:08:10 +0200 + +cassandra (5.0~rc2) unstable; urgency=medium + + * New release + + -- Mick Semb Wever Wed, 21 Aug 2024 20:29:17 +0200 + +cassandra (5.0~rc1) unstable; urgency=medium + + * New release + + -- Mick Semb Wever Wed, 10 Jul 2024 17:49:44 +0200 + +cassandra (5.0~beta1) unstable; urgency=medium + + * New release + + -- Mick Semb Wever Fri, 01 Dec 2023 14:09:07 +0100 + +cassandra (5.0~alpha1) unstable; urgency=medium + + * New release + + -- Mick Semb Wever Fri, 01 Sep 2023 11:22:35 +0200 +>>>>>>> cassandra-5.0 + cassandra (4.0~rc1) unstable; urgency=medium * New release diff --git a/debian/rules b/debian/rules index b3de486117c1..70305b41e981 100755 --- a/debian/rules +++ b/debian/rules @@ -67,6 +67,10 @@ install: build dh_install $(BUILD_DIR)/tools/lib/fqltool.jar \ usr/share/cassandra + # Copy sstableloader jars + dh_install $(BUILD_DIR)/tools/lib/sstableloader.jar \ + usr/share/cassandra + dh_link usr/share/cassandra/apache-cassandra-$(VERSION).jar \ usr/share/cassandra/apache-cassandra.jar diff --git a/doc/SASI.md b/doc/SASI.md index fc38845ce2cd..c7bf17391846 100644 --- a/doc/SASI.md +++ b/doc/SASI.md @@ -199,7 +199,7 @@ cqlsh:demo> SELECT first_name, last_name, age, height, created_at FROM sasi SASI supports queries with multiple predicates, however, due to the nature of the default indexing implementation, CQL requires the user -to specify `ALLOW FILTERING` to opt-in to the potential performance +to specify `ALLOW FILTERING` to opt in to the potential performance pitfalls of such a query. With SASI, while the requirement to include `ALLOW FILTERING` remains, to reduce modifications to the grammar, the performance pitfalls do not exist because filtering is not @@ -383,7 +383,7 @@ of the memtable to disk -- this is the origin of the name "SSTable Attached Secondary Index". The SASI index data structures are built in memory as the SSTable is -being written and they are flushed to disk before the writing of the +being written, and they are flushed to disk before the writing of the SSTable completes. The writing of each index file only requires sequential writes to disk. In some cases, partial flushes are performed, and later stitched back together, to reduce memory @@ -467,7 +467,7 @@ collision. To optimize for its write-once environment the [`TokenTreeBuilder`](https://github.com/apache/cassandra/blob/trunk/src/java/org/apache/cassandra/index/sasi/disk/TokenTreeBuilder.java) -completely loads its interior nodes as the tree is built and it uses +completely loads its interior nodes as the tree is built, and it uses the well-known algorithm optimized for bulk-loading the data structure. @@ -562,7 +562,7 @@ been found, or there is no more matching data, the result set is returned to the coordinator through the existing internal components. The number of queries (total/failed/timed-out), and their latencies, -are maintined per-table/column family. +are maintained per-table/column family. SASI also supports concurrently iterating terms for the same index across SSTables. The concurrency factor is controlled by the @@ -713,7 +713,7 @@ the documentation The abstract `RangeIterator` class provides a unified interface over the two main operations performed by SASI at various layers in the execution path: set intersection and union. These operations are -performed in a iterated, or "streaming", fashion to prevent unneeded +performed in an iterated, or "streaming", fashion to prevent unneeded reads of elements from either set. In both the intersection and union cases the algorithms take advantage of the data being pre-sorted using the same sort order, e.g. term or token order. @@ -725,7 +725,7 @@ performs the "Merge-Join" portion of the algorithm, with the properties of an outer-join, or union. It is implemented with several optimizations to improve its performance over a large number of iterators -- sets to union. Specifically, the -iterator exploits the likely case of the data having many sub-groups +iterator exploits the likely case of the data having many subgroups of overlapping ranges and the unlikely case that all ranges will overlap each other. For more details see the [javadoc](https://github.com/apache/cassandra/blob/trunk/src/java/org/apache/cassandra/index/sasi/utils/RangeUnionIterator.java#L9-L21). @@ -742,7 +742,7 @@ between them based on some properties of the data. the [`RangeUnionIterator`](https://github.com/apache/cassandra/blob/trunk/src/java/org/apache/cassandra/index/sasi/utils/RangeUnionIterator.java) in that it performs a "Merge-Join", however, its nature is similar to -a inner-join, where like values are merged by a data-specific merge +an inner-join, where like values are merged by a data-specific merge function (e.g. merging two tokens in a list to lookup in a SSTable later). See the [javadoc](https://github.com/apache/cassandra/blob/trunk/src/java/org/apache/cassandra/index/sasi/utils/RangeIntersectionIterator.java#L88-L101) diff --git a/doc/modules/cassandra/examples/BASH/docker-run-cqlsh-quickstart.sh b/doc/modules/cassandra/examples/BASH/docker-run-cqlsh-quickstart.sh index c75954924aaa..cbfc0bddbde1 100644 --- a/doc/modules/cassandra/examples/BASH/docker-run-cqlsh-quickstart.sh +++ b/doc/modules/cassandra/examples/BASH/docker-run-cqlsh-quickstart.sh @@ -1,3 +1,3 @@ docker run --rm -it --network \ cassandra nuvo/docker-cqlsh cqlsh cassandra \ -9042 --cqlversion='3.4.5' \ No newline at end of file +9042 --cqlversion='3.4.7' \ No newline at end of file diff --git a/doc/modules/cassandra/examples/CQL/create_ks_trans_repl.cql b/doc/modules/cassandra/examples/CQL/create_ks_trans_repl.cql index afff433eec8f..4fe1c3a98e77 100644 --- a/doc/modules/cassandra/examples/CQL/create_ks_trans_repl.cql +++ b/doc/modules/cassandra/examples/CQL/create_ks_trans_repl.cql @@ -1,2 +1,2 @@ CREATE KEYSPACE some_keyspace - WITH replication = {'class': 'NetworkTopologyStrategy', 'DC1' : '3/1'', 'DC2' : '5/2'}; + WITH replication = {'class': 'NetworkTopologyStrategy', 'DC1' : '3/1', 'DC2' : '5/2'}; diff --git a/doc/modules/cassandra/examples/CQL/no_revoke.cql b/doc/modules/cassandra/examples/CQL/no_revoke.cql index b6a044cf2038..c12b210b76e5 100644 --- a/doc/modules/cassandra/examples/CQL/no_revoke.cql +++ b/doc/modules/cassandra/examples/CQL/no_revoke.cql @@ -1,5 +1,5 @@ -* `system_schema.keyspaces` -* `system_schema.columns` -* `system_schema.tables` -* `system.local` -* `system.peers` +* system_schema.keyspaces +* system_schema.columns +* system_schema.tables +* system.local +* system.peers diff --git a/doc/modules/cassandra/examples/CQL/to_date.cql b/doc/modules/cassandra/examples/CQL/to_date.cql new file mode 100644 index 000000000000..160dcaab6726 --- /dev/null +++ b/doc/modules/cassandra/examples/CQL/to_date.cql @@ -0,0 +1 @@ +SELECT id, to_date(create_ts) FROM myTable diff --git a/doc/modules/cassandra/examples/CQL/vector-search/vector-search-cycling.cql b/doc/modules/cassandra/examples/CQL/vector-search/vector-search-cycling.cql index cc8dad67741f..9b3984b19503 100644 --- a/doc/modules/cassandra/examples/CQL/vector-search/vector-search-cycling.cql +++ b/doc/modules/cassandra/examples/CQL/vector-search/vector-search-cycling.cql @@ -22,7 +22,7 @@ WITH CLUSTERING ORDER BY (created_at DESC); // tag::alter-vs-table[] ALTER TABLE cycling.comments_vs - ADD comment_vector VECTOR ; <1> + ADD comment_vector VECTOR ; // end::alter-vs-table[] // tag::create-vs-index[] @@ -116,4 +116,4 @@ SELECT comment, similarity_cosine(comment_vector, [0.2, 0.15, 0.3, 0.2, 0.05]) FROM cycling.comments_vs ORDER BY comment_vector ANN OF [0.1, 0.15, 0.3, 0.12, 0.05] LIMIT 1; -// end::select-vector-data-similarity-cycling[] \ No newline at end of file +// end::select-vector-data-similarity-cycling[] diff --git a/doc/modules/cassandra/examples/RESULTS/2i/2i-check.result b/doc/modules/cassandra/examples/RESULTS/2i/2i-check.result index 2fd9f9570028..2d7ce864b95e 100644 --- a/doc/modules/cassandra/examples/RESULTS/2i/2i-check.result +++ b/doc/modules/cassandra/examples/RESULTS/2i/2i-check.result @@ -1 +1,7 @@ -TBD \ No newline at end of file +CREATE TABLE cycling.birthday_list ( + cyclist_name text PRIMARY KEY, +. +. +. + +CREATE INDEX blist_values_idx ON cycling.birthday_list (values(blist)); diff --git a/doc/modules/cassandra/nav.adoc b/doc/modules/cassandra/nav.adoc index dd9dd1054df9..813aea24f110 100644 --- a/doc/modules/cassandra/nav.adoc +++ b/doc/modules/cassandra/nav.adoc @@ -23,6 +23,9 @@ *** xref:cassandra:architecture/guarantees.adoc[] *** xref:cassandra:architecture/messaging.adoc[] *** xref:cassandra:architecture/streaming.adoc[] +*** xref:cassandra:architecture/accord.adoc[] +**** xref:cassandra:architecture/accord-architecture.adoc[] +**** xref:cassandra:architecture/cql-on-accord.adoc[] ** xref:cassandra:developing/data-modeling/index.adoc[] *** xref:cassandra:developing/data-modeling/intro.adoc[] @@ -99,18 +102,22 @@ ***** xref:cassandra:managing/operating/fqllogging.adoc[Full query logging] **** xref:cassandra:managing/operating/metrics.adoc[Monitoring metrics] **** xref:cassandra:managing/operating/repair.adoc[Repair] +**** xref:cassandra:managing/operating/auto_repair.adoc[Auto Repair] **** xref:cassandra:managing/operating/read_repair.adoc[Read repair] **** xref:cassandra:managing/operating/security.adoc[Security] **** xref:cassandra:managing/operating/snitch.adoc[Snitches] **** xref:cassandra:managing/operating/topo_changes.adoc[Topology changes] **** xref:cassandra:managing/operating/transientreplication.adoc[Transient replication] **** xref:cassandra:managing/operating/virtualtables.adoc[Virtual tables] +**** xref:cassandra:managing/operating/password_validation.adoc[Password validation] +**** xref:cassandra:managing/operating/onboarding-to-accord.adoc[] *** xref:cassandra:managing/tools/index.adoc[Tools] **** xref:cassandra:managing/tools/cqlsh.adoc[cqlsh: the CQL shell] **** xref:cassandra:managing/tools/nodetool/nodetool.adoc[nodetool] **** xref:cassandra:managing/tools/sstable/index.adoc[SSTable tools] **** xref:cassandra:managing/tools/cassandra_stress.adoc[cassandra-stress] + ** xref:cassandra:troubleshooting/index.adoc[Troubleshooting] *** xref:cassandra:troubleshooting/finding_nodes.adoc[Finding misbehaving nodes] *** xref:cassandra:troubleshooting/reading_logs.adoc[Reading Cassandra logs] @@ -126,4 +133,4 @@ *** xref:reference/static.adoc[Static columns] *** xref:reference/vector-data-type.adoc[Vector data type] -** xref:integrating/plugins/index.adoc[] \ No newline at end of file +** xref:integrating/plugins/index.adoc[] diff --git a/doc/modules/cassandra/pages/architecture/accord-architecture.adoc b/doc/modules/cassandra/pages/architecture/accord-architecture.adoc new file mode 100644 index 000000000000..201abd861ec6 --- /dev/null +++ b/doc/modules/cassandra/pages/architecture/accord-architecture.adoc @@ -0,0 +1,360 @@ += Accord Architecture + +This document is intended to facilitate quick dive into Accord and +Cassandra Integration code for anyone interested in the project. Readers +should be closely familiar at very least with Single-Decree Paxos and +fluent in Consensus terminology. Familiarity with Accord protocol +itself, or similar protocols such as EPaxos, TAPIR, Janus, or Tempo, can +be useful. + +Accord code is logically split into local and coordinator part. +Coordination code contains code intended for coordination/invocation of +the client query, driving it through the Accord state machine, and all +commands and utilities for tracking/retrying their state. Node-local +code contains utility for keeping record of replica state and facilitate +local execution (i.e. responding to coordinator queries). + +There are _many_ enums in Accord. They’re extremely useful for +understanding the state machine of each of the components. + +Cassandra Integration implements interfaces provided by Accord, and +plugs in messaging, serialization, CQL, concurrency/execution, on-disk +state management, and stable storage (i.e. Cassandra tables). + +When the request comes from the client, broadly speaking, it gets parsed +and turns into `TransactionStatement`. `TransactionStatement` contains +updates, selects, assignments, and conditions intended for +atomic/transactional execution. These statements are translated into +Accord commands (i.e. `Read`, `Write`, or `Update`), and form Accord +Transaction (`Txn`). Transaction is executed yielding `TxnResult` that +can be returned to the client. + +== Coordinator Side + +=== Accord Protocol Basics + +Coordinator allocates a globally unique transaction ID `TxnId` for the +transaction, and begins coordination (see `CoordinateTransaction`). +Here, coordinator perform initial rounds of `PreAccept` and `Accept` +until the agreement about when transaction should execute is reached. +Coordinated query execution starts with a `PreAccept` message, which +contains transaction definition and routing information. + +On the replica locally, each Accord message first lands in +`AccordVerbHandler`, which handles _all_ Accord messages. Replica +determines whether it is aware of the _epoch_ specified by the +transaction coordinator. Messages for the future epochs are parked until +epoch becomes active on the node; messages for known epochs are +submitted to their corresponding command stores (think: local shards). +Replica applies the message locally, changing its local state, and +producing coordinator response. Coordinator collects replica responses +and continues driving transaction through the execution state machine. + +Every transaction has a home key - a global value that defines the home +shard, the one tasked with ensuring the transaction is finished. Home +key is chosen arbitrarily: it is either a first key the coordinator +owns, or it is picked completely at random. + +== Replica Side + +=== CommandStore + +`Command` is a unit of Accord _metadata_ that relates to a specific +operation, as opposed to `Message`, which is an _instruction_ sent by +coordinator to the replica for execution that _changes_ this command +state. `Command` does _not_ hold the state of an entire transaction, but +rather a _part_ of transaction executed on a particular shard. +_Coordinator_ is responsible for executing the entirety of the +transaction, `Command`s are just local execution states. + +Commands are held by a Command _Store_, a single threaded internal shard +of accord transaction metadata. It holds state required for command +execution, and executes commands sequentially. For command execution, +`CommandStore` creates a `SafeCommandStore`, a version of `CommandStore` +created for command execution, during which it has exclusive access to +it. + +Roughly speaking, you can think of relation between CommandStore and +SafeCommandStore as: + +.... +SafeCommandStore safeStore = commandStore.beginOperation(context) +try { + message.apply(safeStore); +} +finally { + commandStore.completeOperation(safeStore); +} +.... + +In other words, `CommandStore` collects the `PreLoadContext`, state +required to be in memory for command execution (possible dependencies, +such as `TxnId`s, and `Key`s of commands, but also `CommandsForKeys` +that will be needed during execution). Once the context is collected and +command’s turn to execute on command store comes, _safe_ command store +is created and passed to the command. + +Any executing operation may require changes to command store state. For +this, `SafeCommandStore` creates a special version of command state, +`SafeCommand` and `SafeCommandsForKey` that can be updated during +execution. Naturally, either _all_ of the states changed during +operation execution will become visible, or none of them will. In order +to ensure transactional integrity, changes to commands are tracked and +are recorded into `Journal` for crash-recovery. `ProgressLog` and +`CommandsForKey` are up + +On Cassandra side, concurrent execution is controlled by `AccordTask`, +which contains cache loading logic and persistence callbacks. Since +Accord may potentially hold a large number of command states in memory, +their states may be _shrunk_ to their binary representation to save some +memory, or they can get fully evicted. This also means that `AccordTask` +will have to reload relevant dependencies from preload context before +command execution can begin. + +=== AsyncChain, AccordTask, AccordExecutor + +Accord is designed for high concurrency, and most things are constructed +as asynchronous chains. `AsyncChain` API is very similar to the one of +Java futures, but has several convenient methods that make execution on +multiple executors (think: command stores, loaders) simpler. + +Each `CommandStore` has its own `AccordExecutor`. For the purpose of +this document you may consider it as a single-threaded executor. +`AccordExecutor` keeps track of tasks in different states, primarily: + +* `WAITING_TO_LOAD` - executor has a maximum number of concurrent load +tasks. If the number of in-progress loads exceeds this number, all +subsequently added loads will go into the waiting to load queue. +* `LOADING` - tasks for which dependencies are being loaded. +`CommandsForKeys` are paged in from the auxiliary table, while `Command` +states are loaded directly from the `Journal`. +* `WAITING_TO_RUN` / `RUNNING` / `FINISHED` - these three are +self-explanatory; once dependencies are loaded, task is ready to run; +when its turn comes, it transitions to running state, and once its done, +it’s finished. + +There are several other states, which you can find in +`AccordTask$State`. It might be worth to mention that Accord tasks are +_cancellable_. Tasks that were timed out before execution, have been +preempted, or should not run due to other reasons, can and will be +cancelled. Tasks transition between different AccordExecutor queues +depending on their execution states. + +In Accord, all tasks have to be executed in strict order, and a task +can’t execute before its dependencies have executed, else there’s no +guarantee of strict order. Tasks are notified about dependency readiness +using `NotificationSink`, which updates the tasks’s `WaitingOn` +collection. `WaitingOn` is responsible for registering listeners with +`CommandStore` if dependencies need to be executed before the current +task can. + +`WaitingOn`, `NotificationSink` and `LocalListeners` registered with +CommandStore can be thought of as a ``happy path'' execution: when +coordinator makes timely progress changing command states. If +coordinator _fails_ to make progress, `ProgressLog` kicks in after the +registered deadline. + +=== ProgressLog + +The progress log is responsible for ensuring progress in transactions +that aren’t making any. It does two things: + +* Fetches data from peers via `WaitingState`. Depending on the state of +transaction, it may trigger fetch of a subset of required dependencies +from peers via `FetchData`. For example, we haven’t received Apply, but +we’re ReadyToExecute. +* Triggers recovery via `HomeState`. The progress log may also +autonomously decide that a transaction which hasn’t been +decided/executed (and otherwise should be able to do so) should have the +recovery protocol invoked. In other words, if _coordination_ of the +transaction is stuck (i.e. further progress is not happening not due to +lack of dependencies required locally, but because of the transaction +coordinator), may trigger recovery via `MaybeRecover`. + +=== Command + +Command is a core block of the Accord local state. `Message`s, such as +`PreAccept`, `Propose`, `Accept`, and many others, change `Command` +state for a given store during execution. + +* `SaveStatus` - node-local command status +* `Participants` - core routing information required for transaction. +Keys or Ranges participating in the transaction. +* Timestamps: +** `ExecuteAt` - a timestamp at which this transaction is decided to be +executed. May differ from its `TxnId` if a higher ballot was witnessed +during `PreAccept` phase, in case there any conflicts are discovered. +** `ExecutesAtLeast` - only relevant for `WaitingOnWithExecutesAtLeast` +** Ballots for coordinating within a specific `TxnId`: +*** `Promised` - a non-zero ballot can be set as a result of recovery; a +recovery coordinator (see Recovery Protocol in Accord paper for details) +is picking its own globally unique ballot for re-proposal. +*** `AcceptedOrCommitted` - same as `Promised` (i.e. a non-zero ballot +is set as a result of recovery), except for later protocol stages. +* `PartialTxn` - shard-relevant definition of the transaction. +* Dependencies: +** `PartialDeps` - a collection of transaction dependencies, keyed by +the key or range on which they were adopted. +** `WaitingOn` - a subset of the above dependencies this command needs +to wait on. +** A collection of transaction dependencies, keyed by the key or range +on which they were adopted. +* `Writes` - a collection of data to write to one or more stores +* `Result` - a result to be returned to a client, or be stored in a +node’s command state. Effectively unused in Cassandra implementation. + +=== CommandsForKey (CFK) + +`CommandsForKey` is a specialised collection for efficiently +representing and querying everything Accord needs for making +coordination and recovery decisions about a key’s command conflicts, and +for managing execution order. + +CommandsForKey is updated via `SafeCommandsForKey` after command +execution in `SafeCommandStore#updateCommandsForKey`. CommandsForKey +defferentiates between managed and unmanaged transactions: + +* Managed transactions are transactions witnessed by `CommandsForKey` +for dependency management (essentially all globally visible key +transactions): simple key transactions, like reads and writes. +* Unmanaged transactions are those that depend on the simple key +transactions but are not themselves such, e.g. sync points, range +transactions, etc. These transactions need only adopt a dependency on +the Key to represent _all of these transactions_. CFK will then notify +when they have executed. + +=== CommandStore’s auxiliary collections + +==== RedundantBefore + +RedundantBefore is (incrementally) persisted in Journal and used by +CommandStore to track transactions that have been fully applied, or +invalidated across all shards. Once the transaction is redundant +(i.e. it has been either _applied_ or _invalidated_ durably on the +majority of participants), its metadata can be removed and only +transactional bounds can be maintained for dependency tracking purposes. +`RedundantBefore` plays an important role during journal compaction (by +providing information about which transactions can be purged). + +=== DurabilityService and (Exclusive)SyncPoint + +For intent of this document, we will only be covering _Exclusive_ +SyncPoints, even though other kinds might still exist as of time of +writing this. `SyncPoints` serve as a logical barrier in transaction +history, and are used for invalidating older `TxnId`s, so that a newly +bootstrapped node may have a complete log as of a point in time `TxnId`, +and replicas could purge/GC earlier transaction metadata. + +SyncPoints are not expected to be processed by the the whole cluster, +and we do not want transaction processing to be held up, so while these +are processed much like a transaction, they are invisible to real +transactions which may proceed before SyncPoint is witnessed by the node +processing it. + +ExclusiveSyncPoint is created by `DurabilityScheduler`, as the first +step for coordinating shard durability, which is scheduled for periodic +execution. During this step, we perform initial rounds of `PreAccept` +and `Accept` until we have reached agreement about when `SyncPoint` +should execute. + +After shard is marked durable, `RedundantBefore` collection is updated, +which serves an important role in bootstrap, log replay, log compaction, +and replica-side command purging/invalidation. + +=== ConfigurationService and TopologyManager + +Time in Accord is sliced into epochs. Each epoch constitutes a unique +cluster configuration (`Topology`). Topology represents mapping between +key ranges and nodes, here every range has to be replicated to a certain +number of nodes. Coordinator assigns epoch to each transaction; replicas +may decline transactions that arrive to epochs that were previously +closed. + +`TopologyManager` is responsible for listening to notifications about +cluster configuration changes, and creation of epochs. Once epoch is +created, it needs to be bootstrapped before it is ready. Epoch readiness +consists of 4 _independent_ states: + +* Metadata: The new epoch has been setup locally and the node is ready +to process commands for it. +* Coordinate: The node has retrieved enough remote information to answer +coordination decisions for the epoch (including fast path decisions). +Once a quorum of the new epoch has achieved this, earlier epochs do not +need to be contacted by coordinators of transactions started in the new +epoch (or later). +* Data: The node has successfully replicated the underlying `DataStore` +information for the new epoch, but may need to perform some additional +coordination before it can execute the read portion of a transaction. +* Reads: The node has retrieved enough remote information to safely +process reads, including replicating all necessary DataStore +information, and any additional transactions necessary for consistency. + +=== Data Store + +One of the most important integration points, DataStore, is responsible +for application of transactional information into database’s stable +storage. + +=== Accord Journal + +==== Garbage Collection / Cleanup + +* `ERASE`: we can erase data once we are certain no other replicas +require our information. Erased should ONLY be adopted on a replica that +knows EVERY shard has successfully applied the transaction at all +healthy replicas (or else that it is durably invalidated). +* `EXPUNGE`: we can expunge data once we can reliably and safely expunge +any partial record. To achieve the latter, we use only global summary +information and the TxnId and if present any applyAt. +* `INVALIDATE`: command has been was decidedly (and durably) superseded +by a different command (e.g., a higher higher ballot was witnessed +during recovery), and will *never* be executed. +* `VESTIGIAL`: command cannot be completed and is either pre-bootstrap, +did not commit, or did not participate in this shard’s epoch. +* `TRUNCATE`: means the subset of command metadata (i.e., deps, outcome, +or appliedAt) can be partially discarded. + +== Contributing Changes to Accord + +Accord is covered by a large number of tests, but probably most +prominent among them is a `BurnTest`. BurnTest is a deterministic +simulation of the protocol with strict serializability checker. BurnTest +simulates time, message passing, concurrency, faults, and many other +things. If you are intending to make a chance to Accord, it is +recommended you run `BurnTest` at very least several dozen times in the +loop to ensure correctness of your change. BurnTest can also be useful +for reasoning about and exploring protocol states. Put a breakpoint at a +spot you consider important, run the burn test and see what’s going on. + +Accord also comes with many built-in assertions. Protocol has many +checks for internal consistency that can be helpful during development. +Most of the time, rather than triggering a strict serializability +checker error, you will see some form of internal assertion detecting an +inconsistency. These invariants are there for a reason, and in an +overwhelming majority of cases disabling or ignoring them is not a good +idea. + +== Cheat Sheet + +* Medium Path - is a coordinator optimization. This is the case where t0 +can be agreed (i.e. executeAt=txnId), and where we would like not to +take 3 round-trips, as this situation is likely to occur when we lose +the fast path quorum. The medium path permits only 2 round-trips because +it can be used as a complete set of dependencies (due to their having +been calculated against the correct bound, t0, and that bound having +been applied at a quorum so that conflicting transactions will propose a +higher executeAt). +* `SaveStatus` vs `Status` - `SaveStatus` is a replica-local status that +contains additional information helpful for tracking state machine state, +and heavily used for validating internal consistency in Accord, while +`Status` is a part of a distributed state machine that tracks distributed +transaction state. +* `Routable` - something that can be found in the cluster, and MAYBE +found on disk (if Seekable. +** `Unseekable` - _routing_ key; in Cassandra terms, you can think of a +`Token` +** `Seekable` - Something that can be found within the cluster AND found +on disk, queried and returned; i.e., key or key range. +* Route vs RoutingKey vs FullRoute vs PartialRoute - +** `Partial` vs `Full` route are understood in the context of a single +transaction. diff --git a/doc/modules/cassandra/pages/architecture/accord.adoc b/doc/modules/cassandra/pages/architecture/accord.adoc new file mode 100644 index 000000000000..51f9f953ef5c --- /dev/null +++ b/doc/modules/cassandra/pages/architecture/accord.adoc @@ -0,0 +1,7 @@ += Accord + +Accord is one of the transaction protocols supported by Apache Cassandra. Accord is a separate sub-project that +is implemented as a library that is Cassandra agnostic. + +* xref:architecture/accord-architecture.adoc[] +* xref:architecture/cql-on-accord.adoc[] diff --git a/doc/modules/cassandra/pages/architecture/cql-on-accord.adoc b/doc/modules/cassandra/pages/architecture/cql-on-accord.adoc new file mode 100644 index 000000000000..91a8fc285440 --- /dev/null +++ b/doc/modules/cassandra/pages/architecture/cql-on-accord.adoc @@ -0,0 +1,612 @@ += Developers guide to CQL on Accord + +== Intro + +Accord is implemented as a library that is agnostic to the underlying +database it integrates with. It has little to no awareness of schema, +query language, messaging, threading etc. Instead it presents interfaces +for the database to implement that describe the configuration and +topology of the database, what reads and writes need to execute and what +their dependencies are, and how to actually execute reads and writes at +the configured locations. + +This guide describes how Cassandra goes about leveraging those +interfaces to implement reading and writing CQL as well as live +migrating from CQL running on Cassandra to CQL running on Accord. + +This guide doesn't cover how Accord works and doesn't cover all parts of +Accord that are implemented in Cassandra like threading, caching, +persistence, and messaging. It also isn't intended to be a user guide +and doesn't fully overlap with the xref:cassandra:managing/operating/onboarding-to-accord.adoc[user guide]. You should start with the +xref:cassandra:managing/operating/onboarding-to-accord.adoc[user guide] to get any context that may be missing here. + +== Anatomy of a transaction + +The primary way of interacting with Accord is to define a transaction +using +https://github.com/apache/cassandra-accord/blob/134df57677bbd5092994923a4dc2f15cd1d033d1/accord-core/src/main/java/accord/primitives/Txn.java#L42[Txn/Txn.InMemory] +and then asking Accord to execute the transaction. Transactions express +what they touch by declaring a set of keys or ranges that will be +read/written to. This set needs to be declared up front and can't change +during transaction execution and the transaction can be either a key +transaction or range transaction but not both. + +Range transactions are more expensive for Accord to execute as the +dependency tracking work Accord has to do is more CPU and memory +intensive and the transactions are more likely to conflict and block +execution of other transactions. + +Accord is not aware of tables only ranges and keys. Keys and ranges can +span any tables managed by Accord and the keys and ranges encode the +tables they apply to. So a range transaction covering multiple tables +would have a range per table and from Accord's perspective these are +completely different ranges. + +Transactions also declare a `Kind` which can be `Read`, `Write` +(Read/Write), `EphemeralRead`, and `ExclusiveSyncPoint`. `Read`, and +`Write` are what you would expect. `EphemeralRead` is a read that only +provides per key linearizability, but offers better performance compared +to `Read` . + +`ExclusiveSyncPoint` is transaction that can be used to establish a +happens before relationship with its dependencies without interfering +with their execution. `ExclusiveSyncPoint` is used for live migration +and repair to ensure the visibility at `ALL` of all committed Accord +transactions to non-transactional reads. + +=== Keys and Ranges + +`Keys` and `Ranges` are prefixed with `TableId` in the most significant +position to allow Accord to interact with multiple tables without +knowing anything about schema. From Accord's perspective there is just a +set of ranges that it is responsible for replicating and transacting +over, and they can be compared, sorted, and split, but beyond that they +are completely opaque. A follow on effect from this is that token ranges +(or token ring) are per table. + +`Key` is conceptually similar to `DecoratedKey` and is implemented by +`https://github.com/apache/cassandra/blob/63d3538ba7352635b7b61a205b40e035e62b8d5d/src/java/org/apache/cassandra/service/accord/api/PartitionKey.java#L43[PartitionKey]` +. `RoutingKey` is conceptually similar to `Token` and is implemented by +`https://github.com/apache/cassandra/blob/63d3538ba7352635b7b61a205b40e035e62b8d5d/src/java/org/apache/cassandra/service/accord/api/TokenKey.java#L51[TokenKey]` +. + +Accord `Range` is conceptually equivalent to Cassandra's +`Range++<++RingPosition++>++` and is implemented by +`https://github.com/apache/cassandra/blob/122f5300855d56131948575f80ce0594547c9040/src/java/org/apache/cassandra/service/accord/TokenRange.java[TokenRange]`. +Accord `Range` is start exclusive and end inclusive just like +Cassandra's `Range` and we use it exclusively in that mode. There are no +other forms of inclusive/exclusive bound or range used directly by +`Accord`. Accord `Range`'s implementation suggests support for other +forms of bounds but it's not currently supported. It's theoretically +possible to use something similar to `Range++<++PartitionPosition++>++` +as the implementation of Accord's `Range` but we don't do that because +Cassandra doesn't support splitting partitions. + +To integrate Cassandra with Accord it's necessary to have a few +different versions of `TokenKey` that make it possible to describe +cluster topology and perform query routing to Accord across a range of +partitioners. A `TokenKey` can be a sentinel for a given table which +maps to `-inf` or `{plus}inf` for that table and it's possible to create +a minimum sentinel that is ++<++ `-inf` or ++>++ `{plus}inf` . +Additionally it's possible to declare a `TokenKey` that is between +`token` and either `token - 1` or `token {plus} 1` . + +Accord expects to be able to convert a `RoutingKey` to a `Range` which +is facilitated by being able to create these in between tokens without +requiring the partitioner to support increment or decrement on token. +Partition range reads also leverage these in between tokens to convert +`Range` bounds from inclusive to exclusive and vice versa to match the +inclusivity/exclusivity of the query that is being executed. + +=== Seekable, Unseekable, Routable + +The implementations of these interfaces are always prefixed with +`TableId` most of which were just discussed. + +A `Seekable` has enough information that it can be used to both route a +query and then execute it because it identifies what exactly to read and +write. An `Unseekable` is more compact (just a token) for Accord to work +with and can be used to route and schedule transaction execution. A +`Routable` could be either `Seekable` or `Unseekable` and is generally +used when you need to handle both. + +`Seekable` can be either a `Key` or an Accord `Range`. `Key` has both +routing (token) and partition key/clustering information. `Range` is +`Seekable` but its bounds are only `Routable`. `Range` is in an odd +place in terms of being `Seekable` . It's helpful because APIs can +accept `Seekable` and then handle both `Key` and `Range` domains. + +`Seekables` is the collection version of `Seekable` and can be either +`Keys` or `Ranges`. + +`Unseekable` can either a `RoutingKey` (`TokenKey`) or `Range` +(`TokenRange`) and `Unseekables` is either `RoutingKeys` or `Ranges`. +`Route` and various kinds of `Routables` exists, but are primarily used +inside Accord. + +=== Data + +`https://github.com/apache/cassandra-accord/blob/134df57677bbd5092994923a4dc2f15cd1d033d1/accord-core/src/main/java/accord/api/Data.java#L28[Data]` +is an opaque container for data that has been read during execution of a +transaction. Accord doesn't know anything about the contents and the +only required interface for `Data` is that they can be merged since +Accord will execute multiple reads at different command stores and will +need to merge the result. + +`Data` is implemented by +`https://github.com/apache/cassandra/blob/122f5300855d56131948575f80ce0594547c9040/src/java/org/apache/cassandra/service/accord/txn/TxnData.java#L47[TxnData]` +which is a glorified map from a unique integer identifying each piece of +data read to `TxnDataValue` which can be either +`https://github.com/apache/cassandra/blob/122f5300855d56131948575f80ce0594547c9040/src/java/org/apache/cassandra/service/accord/txn/TxnDataKeyValue.java[TxnDataKeyValue]` +or +`https://github.com/apache/cassandra/blob/122f5300855d56131948575f80ce0594547c9040/src/java/org/apache/cassandra/service/accord/txn/TxnDataRangeValue.java[TxnDataRangeValue]` +. `TxnDataKeyValue` doesn't support merging because Accord only reads +from a single replica, but `TxnDataRangeValue` does because the integer +key for `TxnData` identifies the logical read in the transaction, but +the actual execution of the range read could touch an arbitrary number +of command stores covered by the range and each will produce their own +`TxnDataRangeValue` for their portion of the read. + +=== Result + +`https://github.com/apache/cassandra-accord/blob/134df57677bbd5092994923a4dc2f15cd1d033d1/accord-core/src/main/java/accord/api/Result.java[Result]` +is the interface for what is returned by `Query` and ends up being +returned as the non-error result by Accord to the coordinator of a +transaction. This is also implemented by `TxnData` for key read results +and by `TxnRangeReadResult` for range reads. + +There is also `RetryNewProtocolResult` which can be returned by +Cassandra's integration with Accord during live migration. This retry +error indicates that Accord determined the transaction's execute time is +in an epoch where Accord does not manage some or all of that data for +read or write so the transaction should be retried on whatever system +currently manages that data. + +=== Read + +`https://github.com/apache/cassandra-accord/blob/trunk/accord-core/src/main/java/accord/api/Read.java#L32[Read]` +is where a transaction defines how data should be read during execution +in order to return a result, and it will have its `read` method invoked +along with specific keys to be read at command stores. + +A `Read` has to define all the keys it will access up front and needs to +support `slice/intersecting/merge` so Accord can send only the relevant +parts of a transactions reads to the command stores that are responsible +for persisting metadata about the transaction and executing the read. + +`https://github.com/apache/cassandra/blob/122f5300855d56131948575f80ce0594547c9040/src/java/org/apache/cassandra/service/accord/txn/TxnRead.java[TxnRead]` +implements `Read` and is a sorted collection of +`https://github.com/apache/cassandra/blob/122f5300855d56131948575f80ce0594547c9040/src/java/org/apache/cassandra/service/accord/txn/TxnNamedRead.java#L77[TxnNamedRead]`. +The name in `TxnNamedRead` refers to what is now the integer identifier +for each logical read in the transaction. `TxnNamedRead` supports both +key and range reads although not both in the same transaction. + +The name for a read is an incrementing integer encoded at planning time with the higher order bits storing +the kind of read and the lower order bits storing the index of the read. Kinds of reads include: + +* USER - let statements +* RETURNING - Returning select in `TransactionStatement` +* AUTO++_++READ - Automatically generated reads like list index set +* CAS++_++READ - Read for CAS statements + +Every read in a transaction is executed concurrently in the read stage +threadpool and the resulting `Data` (`TxnData`) is merged into a single +value. + +`TxnRead` contains a read consistency level that is not visible to +Accord that is used to declare the read consistency level that a +transaction requires. This will be discussed more later when we cover +interoperability, but if this is set then the transaction will actually +read from multiple replicas complete with short read protection and +blocking read repair. + +=== Query + +`https://github.com/apache/cassandra-accord/blob/trunk/accord-core/src/main/java/accord/api/Query.java#L31[Query]` +is the portion of the transaction definition responsible for computing +the `Result` of the transaction that will be returned at the +coordinator. It's implemented by +`https://github.com/apache/cassandra/blob/122f5300855d56131948575f80ce0594547c9040/src/java/org/apache/cassandra/service/accord/txn/TxnQuery.java[TxnQuery]` +which has several different modes it can operate in. + +`Query` only has one method `compute` to compute the result and is run +on the coordinator of a transaction. There are few things `TxnQuery` is +responsible for such as validating the query is accessing data managed +by Accord generating a retry error if needed. For CAS statements it's +also responsible for checking the CAS condition and returning the +appropriate result. For range reads it's also responsible for merging +the range read results and reapplying the limit. + +`TxnQuery` also has an implementation, `UNSAFE++_++EMPTY`, used for +Accord system transactions that does no validation that Accord owns the +ranges in question. This is because from Accord's perspective it +immediately adopts all the ranges in a table when that table begins +migration to Accord, but from live migration's perspective (which Accord +can't see) there is a +`https://github.com/apache/cassandra/blob/122f5300855d56131948575f80ce0594547c9040/src/java/org/apache/cassandra/service/consensus/migration/TableMigrationState.java[TableMigrationState]` +that specifies which ranges within a table are managed by Accord. + +Accord system transactions only impact Accord metadata so “they don't +exist” from the perspective of live migration and concurrent reading and +writing to data. + +=== Update + +`https://github.com/apache/cassandra-accord/blob/trunk/accord-core/src/main/java/accord/api/Update.java[Update]` +is invoked via the `apply` method on the Accord coordinator and is +responsible for taking in the `Data` from `Read` and producing the +`Write` that contains all the writes that we applied as part of +committing the transaction. + +`Update` requires support for `slice`/`intersecting`/`merge` so that +Accord only needs to distribute and persist the potentially sizable +partial or complete updates to the shards that actually need them. + +`https://github.com/apache/cassandra/blob/122f5300855d56131948575f80ce0594547c9040/src/java/org/apache/cassandra/service/accord/txn/TxnUpdate.java[TxnUpdate]` +implements `Update` and can contain completed or partial updates which +are completed when `apply` is called with the `TxnData` from `TxnRead`. +Updates that are not data dependent (blind writes) are handled +differently from non-data dependent updates. Data dependent updates are +computed at the coordinator and returned in the `TxnWrite` but non-data +dependent updates are omitted and instead are retrieved from `TxnUpdate` +at each replica when `TxnWrite.apply` is called. + +`TxnUpdate` is also responsible for populating the update with the +monotonic transactional hybrid logical clock for the execution time of +the transaction. This is used instead of the coordinator generated +timestamp for `SERIAL` and `TransactionStatement` writes. Non-SERIAL +writes use the coordinator or user supplied timestamp although this may +change in between the time of this writing and final release. + +`TxnUpdate` has a write consistency level that is not visible to Accord +and is it similar to the commit consistency level for CAS writes. If the +write consistency level is set then Accord will do synchronous commit at +the specified consistency level. Otherwise Accord defaults to +asynchronous commit. How consistency levels are handled will be covered +in interoperability and live migration. + +=== Write + +`https://github.com/apache/cassandra-accord/blob/trunk/accord-core/src/main/java/accord/api/Write.java[Write]` +is produced by invoking `Update.apply` and is not required to be +splittable/mergeable because all writes are sent to all shards. `Write` +is implemented by +`https://github.com/apache/cassandra/blob/122f5300855d56131948575f80ce0594547c9040/src/java/org/apache/cassandra/service/accord/txn/TxnWrite.java#L74[TxnWrite]` +which each command store will invoke via `apply` for each intersecting +key. This will cause all writes in a transaction to run concurrently on +the mutation stage. + +=== Putting it all together + +With all the components of a transaction available they can be assembled +and provided to Accord to coordinate to implement all the existing CQL +interfaces as well as the new `TransactionStatement` interface. + +See +`https://github.com/apache/cassandra/blob/122f5300855d56131948575f80ce0594547c9040/src/java/org/apache/cassandra/cql3/statements/TransactionStatement.java#L435[TransactionStatement.createTxn]` +, +`https://github.com/apache/cassandra/blob/122f5300855d56131948575f80ce0594547c9040/src/java/org/apache/cassandra/cql3/statements/CQL3CasRequest.java#L484[CQL3CasRequest.toAccordTxn]`, +`https://github.com/apache/cassandra/blob/122f5300855d56131948575f80ce0594547c9040/src/java/org/apache/cassandra/service/consensus/migration/ConsensusMigrationMutationHelper.java#L236[ConsensusMigrationHelper.mutateWithAccordAsync]`, +`https://github.com/apache/cassandra/blob/122f5300855d56131948575f80ce0594547c9040/src/java/org/apache/cassandra/service/StorageProxy.java#L2206[StorageProxy.readWithAccord]`, +and +`https://github.com/apache/cassandra/blob/122f5300855d56131948575f80ce0594547c9040/src/java/org/apache/cassandra/service/reads/repair/BlockingReadRepair.java#L219[BlockingReadRepair.repairViaAccordTransaction]` +. + +There isn't as much magic as you would think in how Accord executes +transactions when operating with exclusive access to a table. Accord is +able to mostly execute `ReadCommands` unmodified with some +accommodations for the fact that reads are strongly consistent from a +single replica so filtering can be pushed down. The majority of the work +is just making the description of things like CAS serializable so it can +be persisted by Accord for transaction recovery. + +Where things get complicated is live migrating to Accord and supporting +interoperability with non-Accord reads and writes. + +== Live migration + +=== Core challenges + +Accord and Paxos operate fundamentally different in terms of what they +perform consensus on and how the transactions are recovered. Paxos +performs consensus on the exact set of writes to apply and recovering a +transaction only requires the writes to be applied. Accord consensus is +on the transaction definition, a superset of the dependencies, and the +execution timestamp of the transaction. + +Accord needs to recompute the writes during transaction recovery which +means it may need to repeat any reads necessary to compute those writes +which means Accord needs reads to be repeatable during transaction +execution and recovery. Non-Accord writes cause non-determinism for +Accord reads. Accord also reads at `ONE` so it would miss `QUORUM` +writes. + +The big hammer we use to deal with this is to avoid ever requiring +Accord to read data that is not replicated at `ALL`. If we did it would +lead to non-deterministic transaction recovery. This isn't something +that can be addressed by having Accord read at `QUORUM` and then +performing blocking read repair because different Accord coordinators +can still witness different sets of non-Accord writes. + +Accord also defaults to asynchronous commit so when migrating away from +Accord it's not safe for Paxos and non-SERIAL reads to read committed +Accord writes + +=== Bridging the gap + +Cassandra needs to be highly available while transitioning, but +operations that propagate data at `ALL` like Cassandra's Data Repair +{plus} Paxos Repair, or Accord's repair syncs are not highly available. +Going forward these will be referred to as range barriers. + +At every point during migration there has to be some system safely +capable of executing every operation type. Highly available key barriers +solve this problem by allowing the migration of a single key at `QUORUM` +to meet the requirements for execution on the migration target system. + +A key barrier on Paxos uses the existing Paxos repair mechanism to apply +any partially committed transactions at `QUORUM` which can then be +safely read by Accord if Accord read's at `QUORUM`. A key barrier on +Accord uses Accord's sync mechanism to wait until all transactions in an +epoch that could have modified the key are applied at `QUORUM`. + +There is a system table and small in memory cache for key barriers to +avoid repeatedly performing key migrations, but the key migration is +only recorded if the coordinator is a replica to avoid the cache growing +too large. + +=== No non-SERIAL key migration + +One wrinkle is that it is not possible to do key migration for +non-SERIAL Cassandra writes because there is no metadata to check for +uncommitted operations like there is with Paxos and Accord. Non-SERIAL +writes include _all_ sources of non-SERIAL writes such as read repair, +logged batches, and hints. Accord doesn't have this issue as any data +managed by Accord always has metadata available since all operations are +routed through Accord. + +Splitting migration to Accord into two phases solves this issue +because while Accord is unable to safely read non-SERIAL writes it can +safely apply non-SERIAL writes as recovery of blind write transactions +is still deterministic in Accord. In the first phase of migration to +Accord all non-SERIAL writes are executed on Accord and synchronously +applied at the requested consistency level while a data repair (full or +incremental) runs and makes it safe for Accord to read non-SERIAL +writes. Paxos continues to execute all SERIAL writes because Accord is +unable to execute SERIAL writes since it can't read yet. + +After a data repair completes the second phase of migration to Accord +begins and all operations are executed on Accord after Paxos key +migration is run to ensure that the key being read by Accord has no +unapplied Paxos transactions. After a Paxos repair {plus} data repair +(full only) the remaining Paxos writes will be visible at `ALL` and +Accord can begin executing reads at `ONE` instead of the requested +consistency level and performing asynchronous commit and ignore the +requested commit/write consistency level. + +A quirk of incremental repair is that it flushes memtables before Paxos +repair runs and as a result it doesn't replicate at `ALL` the data that +Paxos repair propagated at `QUORUM`. Thus a full repair is required for +the second phase of migration to Accord so that the Paxos data ends up +repaired at `ALL`. It's possible, but difficult, to make the +migration three phases and track the Paxos repair independently so that +you could do Paxos repair and then use IR, but this is not currently +implemented. + +=== Supported consistency levels + +Live migration to/from Accord requires Accord to honor requested +consistency levels for read and write. Cassandra's Accord integration +only adds support for a subset of consistency levels listed in +`https://github.com/apache/cassandra/blob/122f5300855d56131948575f80ce0594547c9040/src/java/org/apache/cassandra/service/accord/IAccordService.java#L75[IAccordService]` +. DC aware consistency levels are not supported along with `TWO` and +`THREE`. + +Accord will always reject unsupported consistency levels even if it will +not actually be honoring them during execution to ensure that your +application remains ready to migrate away from Accord in the future. + +In the case of `ONE` as a write/commit consistency level the commit will +silently be performed at `QUORUM` + +=== Interoperability support + +Interoperability aims to extend Accord to support reading and writing at +configurable consistency levels as well as to add support for +synchronous commit. This is facilitated by extension points in Accord +that allow injecting custom implementations for various protocol steps +via +`https://github.com/apache/cassandra-accord/blob/134df57677bbd5092994923a4dc2f15cd1d033d1/accord-core/src/main/java/accord/coordinate/CoordinationAdapter.java#L64[CoordinationAdapter]` +and +`https://github.com/apache/cassandra/blob/122f5300855d56131948575f80ce0594547c9040/src/java/org/apache/cassandra/service/accord/interop/AccordInteropAdapter.java[AccordInteropAdapter]`. + +`AccordInteropAdapter` can inject custom versions of the `execute` and +`persist` phases and does conditionally at transaction execution time +based on the read and write consistency levels provided by `TxnRead` and +`TxnUpdate` . These consistency levels can differ from the ones +requested by the application because live migration may choose to ignore +the consistency levels when they aren't needed. + +`https://github.com/apache/cassandra/blob/122f5300855d56131948575f80ce0594547c9040/src/java/org/apache/cassandra/service/accord/interop/AccordInteropExecution.java[AccordInteropExecution]` +allows reading at a requested consistency level. It largely inverts +control of reading in Accord and uses Cassandra's existing Read Executor +functionality to determine what nodes to contact and what commands to +send them while providing short read protection and blocking read +repair. Read executors interface with Accord via the +`https://github.com/apache/cassandra/blob/122f5300855d56131948575f80ce0594547c9040/src/java/org/apache/cassandra/service/reads/ReadCoordinator.java#L37[ReadCoordinator]` +interface which can either send a regular read message or go through +Accord to send an Accord specific read message which causes the read to +execute at the appropriate command store in the appropriate +transactional context after all dependencies have been applied. + +`ReadCoordinator` also intercepts blocking read repair during execution +of an Accord transaction and executes it through the appropriate command +store. The only legitimate way for this to occur is after Paxos key +migration the data is only propagated at `QUORUM` so it is possible that +Accord reading at `QUORUM` will find replicas to read repair. It's not +strictly necessary as we already know the data is propagated at +`QUORUM`, but the support is there. + +`ReadCoordinator` also helps apply read repair mutations via Accord in +`TransactionalMode.MIXED` and during migration by applying the read +repair mutations in Accord's execute phase instead of waiting for apply. +This is safe because read repair only proposes already committed Accord +writes or already unsafe non-SERIAL writes which aren't allowed anyways. + +`https://github.com/apache/cassandra/blob/122f5300855d56131948575f80ce0594547c9040/src/java/org/apache/cassandra/service/accord/interop/AccordInteropPersist.java#L48[AccordInteropPersist]` +adds support for synchronous commit and commit at a requested +consistency level. It sends `AccordInteropApply` which is a synchronous +apply message that only responds once application is complete. + +https://github.com/apache/cassandra/blob/122f5300855d56131948575f80ce0594547c9040/src/java/org/apache/cassandra/service/consensus/TransactionalMode.java#L34[`TransactionalMode`] +defines the supported modes and +`https://github.com/apache/cassandra/blob/122f5300855d56131948575f80ce0594547c9040/src/java/org/apache/cassandra/service/consensus/TransactionalMode.java#L140[commitCLForMode]` +determines the commit consistency level and +`https://github.com/apache/cassandra/blob/122f5300855d56131948575f80ce0594547c9040/src/java/org/apache/cassandra/service/consensus/TransactionalMode.java#L170[readCLForMode]` +determines the read consistency level. These two methods take into +account both the requested consistency level, the table specific +migration state, the current transactional mode, and the target +transactional mode in order to decide whether to honor the requested +consistency level. + +=== Routing requests during migration + +During migration, requests race with changes to +https://github.com/apache/cassandra/blob/122f5300855d56131948575f80ce0594547c9040/src/java/org/apache/cassandra/service/consensus/migration/TableMigrationState.java[`TableMigrationState`] +to execute and may complete or partially complete on the system they +were originally routed to. This race is resolved by allowing requests to +return a new retry on different system error response that has to be +handled by the coordinator. It's possible that a request may still +complete after receiving a retry different system error because the +target consistency level was still met. + +Migration is per table and per token range so it's possible for part of +a table to be running on Accord and part of it to be running on Paxos. +Requests can end up executing partially on Cassandra and partially on +Accord. + +==== Detecting misrouted requests + +For Paxos this is resolved in the prepare phase where a failure to meet +the required consistency level at the prepare phase means the operation +does not run on Paxos. If the prepare phase is being performed to +recover an existing transaction then it is allowed to proceed because +recovery will deterministically create the same state every time it runs +so it's safe to repeat even after key or range migration has occurred +since those would have already recovered the transaction. + +Accord determines an `executeAt` timestamp, that is deterministic even +during transaction recovery, for each transaction that includes an epoch +that corresponds to the epoch used by `TableMigrationState` and this is +used to check all the tables and keys being touched in a transaction. +`TxnQuery` then returns a retry on different system error if the any +part of the transaction is not eligible to run on Accord. + +`ColumnFamilyStore` checks every `Mutation` to see if it is marked as +allowing potential transaction conflicts. Paxos and Accord always mark +their `Mutation`s as allowing potential transaction conflicts because +they do the work to check for them directly, but non-SERIAL sources of +`Mutation`s will be subject to that check and a +`RetryOnDifferentSystemException` is thrown if the mutation is detected +to be misrouted according to the latest cluster metadata available at +the node attempting to apply that mutation. + +`ReadCommand` has a similar arrangement where each read command is +marked with whether it allows potential transaction conflicts and when +`executeLocally` is run the check is done against cluster metadata to +determine whether or not to throw `RetryOnDifferentSystemException`. +Accord always allows potential transaction conflicts on its read +commands, but Paxos does not because Paxos does not need to read data in +order to recover transactions. + +==== Splitting write requests + +For non-SERIAL writes the `Mutation` is split into the portion that will +execute on Accord and the portion that will execute on Cassandra and the +Accord portion is executed asynchronously while the Cassandra portion is +executed synchronously. If either attempt fails due to misrouting the +write is re-split with updated cluster metadata and retried without +raising an error. + +Logged batches are currently always written to the system table and then +split for execution, and if part of the batch fails then batchlog replay +will replay the entire batch and re-split it in the process. Batchlog +replay only makes a single attempt to replay before converting the batch +contents to hints. If part of the batch was routed to Accord then there +is no node to hint so there is a fake node that a hint is written to and +when that hint is dispatched it will be split and then executed +appropriately. In https://issues.apache.org/jira/browse/CASSANDRA-20588[CASSANDRA-20588] this needs to be simplified to writing the +entire batch through Accord if any part of it should be written through +Accord because it also addresses an atomicity issue with single token +batches which can be torn when part is applied through Accord and part +is applied through Cassandra. + +Hints can be for multiple tables some of which may be Accord and some +non-Accord so splitting occurs. It's also possible a hint will be for an +operation that was sent to Accord (not a real node) via the batchlog and +it's possible that splitting discovers the hint now needs to be executed +without Accord. In that scenario the hint is converted to a hint for +every replica. This conversion can only occur once so the write +amplification is bounded. + +Splitting of mutations is done in +`https://github.com/apache/cassandra/blob/122f5300855d56131948575f80ce0594547c9040/src/java/org/apache/cassandra/service/consensus/migration/ConsensusMigrationMutationHelper.java#L219[ConsensusMigrationMutationHelper]` +with the retry loop being implemented at each caller (batch mutation, +mutation, batch log, hints). + +Paxos has a retry loop but does not do any splitting because Paxos only +supports a single key. + +==== Partition range reads + +Partition range reads are managed by +`https://github.com/apache/cassandra/blob/122f5300855d56131948575f80ce0594547c9040/src/java/org/apache/cassandra/service/reads/range/RangeCommandIterator.java#L75[RangeCommandIterator]` +which continues to split range reads using the existing algorithm that +is agnostic as to how the range command will be executed. Each generated +range read +https://github.com/apache/cassandra/blob/122f5300855d56131948575f80ce0594547c9040/src/java/org/apache/cassandra/service/reads/range/RangeCommandIterator.java#L247[is +then split on the boundaries of which system is responsible for reading +that range] and that is wrapped in a +https://github.com/apache/cassandra/blob/122f5300855d56131948575f80ce0594547c9040/src/java/org/apache/cassandra/service/reads/range/RangeCommandIterator.java#L378[retrying +iterator] which repeats the splitting if any part of the range read ends +up routed to the wrong system. + +Range reads do not execute any key barriers and when migrating away from +Accord you will see weaker consistency compared to Paxos because Accord +does not necessarily honor commit consistency levels and does +asynchronous commit. As things currently stand it's uncertain the key +barriers would run fast enough to avoid timing out range read requests +so they are not done. + +Range reads also consume more memory when executed on Accord when a +limit is used. A single range read command is split into intersecting +command store number of range read commands that execute concurrently +and each one can return up to the limit number of results before they +are merged at the coordinator and the limit is re-applied. This could be +improved by applying the limit again before serializing or by executing +the reads serially at command stores until the limit is met. + +=== Transactional modes + +Transactional modes are set per table and define how Accord, Paxos, and +non-SERIAL operations will execute. The three supported modes are +`FULL`, `MIXED++_++READS`, and `OFF`. + +`FULL` routes all reads and writes through Accord once migration is +complete and allows Accord to ignore read and write consistency levels. +This allows Accord to perform asynchronous commit reducing the number of +WAN roundtrips from 2 to 1. + +`MIXED++_++READS` routes all writes through Accord once migration is +complete, but allows non-SERIAL reads to safely execute outside of +Accord and still read Accord writes because Accord will honor the +provided commit consistency level. This means Accord will need to +perform synchronous commit requiring an 1 extra WAN roundtrips for 2 +total. + +`OFF` is the default where everything runs either on Paxos if it is +`SERIAL` or on the usual eventually consistent paths for everything +else. + +Other modes exist for testing purposes and are disabled by default +unless unlocked via system property. diff --git a/doc/modules/cassandra/pages/architecture/index.adoc b/doc/modules/cassandra/pages/architecture/index.adoc index 9e674d95a2bb..893c2f78076d 100644 --- a/doc/modules/cassandra/pages/architecture/index.adoc +++ b/doc/modules/cassandra/pages/architecture/index.adoc @@ -7,3 +7,4 @@ This section describes the general architecture of Apache Cassandra. * xref:architecture/storage-engine.adoc[Storage Engine] * xref:architecture/guarantees.adoc[Guarantees] * xref:architecture/snitch.adoc[Snitches] +* xref:architecture/accord.adoc[Accord] diff --git a/doc/modules/cassandra/pages/developing/cql/SASI.adoc b/doc/modules/cassandra/pages/developing/cql/SASI.adoc index 705cf1d3372c..93d87f8ff385 100644 --- a/doc/modules/cassandra/pages/developing/cql/SASI.adoc +++ b/doc/modules/cassandra/pages/developing/cql/SASI.adoc @@ -1,4 +1,4 @@ -== SASI Index += SASI Index https://github.com/apache/cassandra/blob/trunk/src/java/org/apache/cassandra/index/sasi/SASIIndex.java[`SASIIndex`], or ``SASI`` for short, is an implementation of Cassandra's `Index` @@ -9,7 +9,7 @@ has superior performance in cases where queries would previously require filtering. In achieving this performance, SASI aims to be significantly less resource intensive than existing implementations, in memory, disk, and CPU usage. In addition, SASI supports prefix and contains queries on -strings (similar to SQL's `LIKE = "foo*"` or `LIKE = "*foo*"'`). +strings (similar to SQL's ``LIKE = "foo\*"`` or ``LIKE = "*foo*"`` ). The following goes on describe how to get up and running with SASI, demonstrates usage with examples, and provides some details on its @@ -357,7 +357,7 @@ parts: Indexing and Querying. Further, Cassandra makes it possible to divide those responsibilities into the memory and disk components. SASI takes advantage of Cassandra's write-once, immutable, ordered data model to build indexes along with the flushing of the memtable to disk – this -is the origin of the name ``SSTable Attached Secondary Index''. +is the origin of the name `SSTable Attached Secondary Index`. The SASI index data structures are built in memory as the SSTable is being written and they are flushed to disk before the writing of the @@ -405,15 +405,15 @@ or more page-sized blocks. The https://github.com/apache/cassandra/blob/trunk/src/java/org/apache/cassandra/index/sasi/disk/OnDiskIndex.java[`OnDiskIndex`] is structured as a tree of arrays, where each level describes the terms in the level below, the final level being the terms themselves. The -`PointerLevel`s and their `PointerBlock`s contain terms and pointers to +``PointerLevel``s and their ``PointerBlock``s contain terms and pointers to other blocks that _end_ with those terms. The `DataLevel`, the final -level, and its `DataBlock`s contain terms and point to the data itself, +level, and its ``DataBlock``s contain terms and point to the data itself, contained in https://github.com/apache/cassandra/blob/trunk/src/java/org/apache/cassandra/index/sasi/disk/TokenTree.java[`TokenTree`]s. The terms written to the https://github.com/apache/cassandra/blob/trunk/src/java/org/apache/cassandra/index/sasi/disk/OnDiskIndex.java[`OnDiskIndex`] -vary depending on its ``mode'': either `PREFIX`, `CONTAINS`, or +vary depending on its `mode` : either `PREFIX`, `CONTAINS`, or `SPARSE`. In the `PREFIX` and `SPARSE` cases, terms' exact values are written exactly once per `OnDiskIndex`. For example, when using a `PREFIX` index with terms `Jason`, `Jordan`, `Pavel`, all three will be @@ -428,14 +428,14 @@ is built merging all the https://github.com/apache/cassandra/blob/trunk/src/java/org/apache/cassandra/index/sasi/disk/TokenTree.java[`TokenTree`]s for each term into a single one. This copy of the data is used for efficient iteration of large ranges of e.g. timestamps. The index -``mode'' is configurable per column at index creation time. +`mode` is configurable per column at index creation time. ===== TokenTree(Builder) The https://github.com/apache/cassandra/blob/trunk/src/java/org/apache/cassandra/index/sasi/disk/TokenTree.java[`TokenTree`] is an implementation of the well-known -https://en.wikipedia.org/wiki/B%2B_tree[B+-tree] that has been modified +https://en.wikipedia.org/wiki/B%2B_tree[B+ tree] that has been modified to optimize for its use-case. In particular, it has been optimized to associate tokens, longs, with a set of positions in an SSTable, also longs. Allowing the set of long values accommodates the possibility of a @@ -519,7 +519,7 @@ execution. During the analysis phase, https://github.com/apache/cassandra/blob/trunk/src/java/org/apache/cassandra/index/sasi/plan/QueryPlan.java[`QueryPlan`] -converts from Cassandra's internal representation of `IndexExpression`s, +converts from Cassandra's internal representation of ``IndexExpression``s, which has also been modified to support encoding queries that contain ORs and groupings of expressions using parentheses (see the link:#cassandra-internal-changes[Cassandra Internal Changes] section @@ -653,8 +653,8 @@ like this: The last type of optimization applied, for this query, is to merge range expressions across branches of the tree – without modifying the meaning of the query, of course. In this case, because the query contains all -`AND`s the `age` expressions can be collapsed. Along with this -optimization, the initial collapsing of unneeded `AND`s can also be +``AND``s the `age` expressions can be collapsed. Along with this +optimization, the initial collapsing of unneeded ``AND``s can also be applied once more to result in this final tree using to execute the query: @@ -683,7 +683,7 @@ class, more specifically, can have zero, one, or two https://github.com/apache/cassandra/blob/trunk/src/java/org/apache/cassandra/index/sasi/plan/Operation.java[`Operation`]s as children and an unlimited number of expressions. The iterators used to perform the queries, discussed below in the -``Range(Union|Intersection)Iterator'' section, implement the necessary +`Range(Union|Intersection)Iterator` section, implement the necessary logic to merge results transparently regardless of the https://github.com/apache/cassandra/blob/trunk/src/java/org/apache/cassandra/index/sasi/plan/Operation.java[`Operation`]s children. @@ -706,14 +706,14 @@ the code]. The abstract `RangeIterator` class provides a unified interface over the two main operations performed by SASI at various layers in the execution path: set intersection and union. These operations are performed in a -iterated, or ``streaming'', fashion to prevent unneeded reads of +iterated, or `streaming`, fashion to prevent unneeded reads of elements from either set. In both the intersection and union cases the algorithms take advantage of the data being pre-sorted using the same sort order, e.g. term or token order. The https://github.com/apache/cassandra/blob/trunk/src/java/org/apache/cassandra/index/sasi/utils/RangeUnionIterator.java[`RangeUnionIterator`] -performs the ``Merge-Join'' portion of the +performs the `Merge-Join` portion of the https://en.wikipedia.org/wiki/Sort-merge_join[Sort-Merge-Join] algorithm, with the properties of an outer-join, or union. It is implemented with several optimizations to improve its performance over a @@ -733,7 +733,7 @@ between them based on some properties of the data. `BounceIntersectionIterator`, and the `BOUNCE` strategy, works like the https://github.com/apache/cassandra/blob/trunk/src/java/org/apache/cassandra/index/sasi/utils/RangeUnionIterator.java[`RangeUnionIterator`] -in that it performs a ``Merge-Join'', however, its nature is similar to +in that it performs a `Merge-Join`, however, its nature is similar to a inner-join, where like values are merged by a data-specific merge function (e.g. merging two tokens in a list to lookup in a SSTable later). See the @@ -742,7 +742,7 @@ for more details on its implementation. `LookupIntersectionIterator`, and the `LOOKUP` strategy, performs a different operation, more similar to a lookup in an associative data -structure, or ``hash lookup'' in database terminology. Once again, +structure, or `hash lookup` in database terminology. Once again, details on the implementation can be found in the https://github.com/apache/cassandra/blob/trunk/src/java/org/apache/cassandra/index/sasi/utils/RangeIntersectionIterator.java#L199-L208[javadoc]. @@ -794,7 +794,7 @@ The following are items that can be addressed in future updates but are not available in this repository or are not currently implemented. * The cluster must be configured to use a partitioner that produces -`LongToken`s, e.g. `Murmur3Partitioner`. Other existing partitioners +``LongToken``s, e.g. `Murmur3Partitioner`. Other existing partitioners which don't produce LongToken e.g. `ByteOrderedPartitioner` and `RandomPartitioner` will not work with SASI. * Not Equals and OR support have been removed in this release while diff --git a/doc/modules/cassandra/pages/developing/cql/changes.adoc b/doc/modules/cassandra/pages/developing/cql/changes.adoc index 2dbfae7ecfc0..12b77eef6c73 100644 --- a/doc/modules/cassandra/pages/developing/cql/changes.adoc +++ b/doc/modules/cassandra/pages/developing/cql/changes.adoc @@ -1,4 +1,4 @@ -= Changes += CQL Changes The following describes the changes in each version of CQL. @@ -63,15 +63,15 @@ explicitly set. * `ALTER TABLE` `ADD` and `DROP` now allow multiple columns to be added/removed. * New `PER PARTITION LIMIT` option for `SELECT` statements (see -https://issues.apache.org/jira/browse/CASSANDRA-7017)[CASSANDRA-7017]. +https://issues.apache.org/jira/browse/CASSANDRA-7017[CASSANDRA-7017]). * `User-defined functions ` can now instantiate `UDTValue` and `TupleValue` instances via the new `UDFContext` interface (see -https://issues.apache.org/jira/browse/CASSANDRA-10818)[CASSANDRA-10818]. +https://issues.apache.org/jira/browse/CASSANDRA-10818[CASSANDRA-10818]). * `User-defined types ` may now be stored in a non-frozen form, allowing individual fields to be updated and deleted in `UPDATE` statements and `DELETE` statements, respectively. -(https://issues.apache.org/jira/browse/CASSANDRA-7423)[CASSANDRA-7423]). +(https://issues.apache.org/jira/browse/CASSANDRA-7423[CASSANDRA-7423]). == 3.4.1 @@ -175,7 +175,7 @@ and `UPDATE` supports `IF` conditions. * `SELECT`, `UPDATE`, and `DELETE` statements now allow empty `IN` relations (see -https://issues.apache.org/jira/browse/CASSANDRA-5626)[CASSANDRA-5626]. +https://issues.apache.org/jira/browse/CASSANDRA-5626[CASSANDRA-5626]). == 3.0.4 diff --git a/doc/modules/cassandra/pages/developing/cql/constraints.adoc b/doc/modules/cassandra/pages/developing/cql/constraints.adoc index 390d6c27a979..2e729db01d8b 100644 --- a/doc/modules/cassandra/pages/developing/cql/constraints.adoc +++ b/doc/modules/cassandra/pages/developing/cql/constraints.adoc @@ -5,10 +5,9 @@ column level in a table schema definition and enforcing them at write time. == CREATE CONSTRAINT -Constraints can be created within the column definition, or as part -of the table properties. +Constraints can be created within the column definition. -The main syntax to define a constraint is as follows: +The syntax to define a constraint is as follows: [source,bnf] ---- @@ -20,7 +19,7 @@ CREATE TABLE ks.tb ( ); ---- -As shown in this syntax, more than one constraint can be defined for a given column using the AND keyword. +As shown in this syntax, more than one constraint can be defined for a given column using the `AND` keyword. == ALTER CONSTRAINT @@ -30,21 +29,32 @@ Altering a constraint is done by following the alter column CQL syntax: ALTER TABLE [IF EXISTS] ALTER [IF EXISTS] CHECK ; ---- +There is no way how to alter individual check when multiple checks are specified on a column. Altering constraints +on a column will set constraints to these specified checks. A user can, of course, chain them: + +[source,bnf] +---- +ALTER TABLE [IF EXISTS]
ALTER [IF EXISTS] CHECK AND +---- + == DROP CONSTRAINT -And DROP can be used to drop constraints for a column as well. +`DROP CHECK` can be used to drop constraints for a column as well. [source,bnf] ---- ALTER TABLE [IF EXISTS]
ALTER [IF EXISTS] DROP CHECK; ---- +There is no way how to drop individual check when multiple checks are specified on a column. After dropping checks, you +are required to re-define all necessary checks again. + == AVAILABLE CONSTRAINTS === SCALAR CONSTRAINT -Defines a comparator against a numeric type. It support all numeric types supported in Cassandra, with all the regular +Defines a comparator against a numeric type. It supports all numeric types supported in Cassandra, with all regular comparators. -For example, we can define constraints that ensure that i is bigger or equal than 100 but smaller than 1000. +For example, we can define constraints that ensure that `i` is bigger or equal than `100` but smaller than `1000`. [source,bnf] ---- @@ -97,6 +107,39 @@ CREATE TABLE ks.tb ) ---- +There is a basic satistfiability check conducted on checks' definitions so we ensure that unsatisfiable constraint +definitions are invalid as it would be impossible to insert a value for a specific colum which would satisty all constraints. + +For example, imagine a user tries to create the following table (e.g. by mistake): + +---- +CREATE TABLE ks.tb ( + name text, + i int CHECK i < 100 AND i > 1000 + ..., +); +---- + +If we insert `50` for `i`, it will not satisfy `i > 1000`. If we insert `1001` as `i`, it will not satisfy `i < 100`. + +There is a satisfiability check in place which would prevent such constaint definitions: + +---- +[Invalid query] message="Constraints of scalar are not satisfiable: i < 100, i > 1000" +---- + +It is also illegal to specify constraints which are repeating, or they repeat on their operators: + +---- +CREATE TABLE ks.tb7 (id int primary key, i int check i < 100 and i > 1000 and i < 10) ; +[Invalid query] message="There are duplicate constraint definitions on column 'i': [i <]" +---- + +---- +CREATE TABLE ks.tb7 (id int primary key, i int check i > 100 and i > 1000) ; +[Invalid query] message="There are duplicate constraint definitions on column 'i': [i >]" +---- + === LENGTH CONSTRAINT Defines a condition that checks the length of text or binary type. @@ -105,7 +148,7 @@ For example, we can create a constraint that checks that name can't be longer th ---- CREATE TABLE ks.tb ( - name text CHECK LENGTH(name) < 256 + name text CHECK LENGTH() < 256 ..., ); ---- @@ -113,7 +156,7 @@ CREATE TABLE ks.tb ( Altering that constraint can be done with: ---- -ALTER TABLE ks.tb ALTER name LENGTH(name) < 512; +ALTER TABLE ks.tb ALTER name LENGTH() < 512; ---- Finally, the constraint can be removed: @@ -130,25 +173,25 @@ For example, we can create a constraint that checks that name can't be bigger th ---- CREATE TABLE ks.tb ( - name text CHECK OCTET_LENGTH(name) < 2 + name text CHECK OCTET_LENGTH() < 2 ..., ); ---- Inserting a valid row: ---- -INSERT INTO ks.tb (name) VALUES ("f") +INSERT INTO ks.tb (name) VALUES ('f') ---- Inserting an invalid row: ---- -INSERT INTO ks.tb (name) VALUES ("fooooooo") +INSERT INTO ks.tb (name) VALUES ('fooooooo') ERROR: Column value does not satisfy value constraint for column 'name'. It has a length of 8 and and it should be should be < 2 ---- -=== NOT_NULL constraint +=== NOT NULL constraint Defines a constraint that checks if a column is not null in every modification statement. @@ -158,13 +201,33 @@ For example, let's have this table: CREATE TABLE ks.tb ( id int, cl int, - col1 int CHECK NOT_NULL(col1), - col2 int CHECK NOT_NULL(col2), + col1 int CHECK NOT NULL, + col2 int CHECK NOT NULL, PRIMARY KEY (id, cl) ); ---- -then this statement would fail: +It is possible to specify `NOT NULL` before `CHECK` / omit it to be more aligned with SQL syntax. + +---- +CREATE TABLE ks.tb ( + id int, + cl int, + col1 int NOT NULL, + col2 int NOT NULL, + PRIMARY KEY (id, cl) +); +---- + +Of course, mixing these two styles is forbidden: + +---- +-- this is illegal +col1 int NOT NULL CHECK NOT NULL, +[Invalid query] message="Duplicate definition of NOT NULL constraint" +---- + +When `NOT NULL` is specified as above, then this statement would fail: ---- INSERT INTO ks.tb (id, cl, col1) VALUES (1, 2, 3); @@ -177,7 +240,7 @@ as well as this statement: INSERT INTO ks.tb (id, cl, col1, col2) VALUES (1, 2, 3, null); ---- -A column which has `NOT_NULL` constraint has to be specified in every modification statement. +A column which has `NOT NULL` constraint has to be specified in every modification statement. The constraint can be removed: @@ -186,16 +249,55 @@ ALTER TABLE ks.tb ALTER col1 DROP CHECK; ALTER TABLE ks.tb ALTER col2 DROP CHECK; ---- -We can not remove the value of a column where `NOT_NULL` constraint is present: +We can not remove the value of a column where `NOT NULL` constraint is present: ---- DELETE col2 FROM ks.tb WHERE id = 1 AND cl = 2; ... [Invalid query] message="Column 'col2' can not be set to null." ---- -Additionally, `NOT_NULL` can not be specified on any column of a primary key, +Additionally, `NOT NULL` can not be specified on any column of a primary key, being it a partition key or a clustering column. +It is possible to chain `NOT NULL` with other checks, for example, if we require a column to not be null and its +size to be bigger than `0` every time, we do: + +---- +CREATE TABLE ks.tb ( + id int, + cl int, + col1 int CHECK NOT NULL AND col1 > 0, + PRIMARY KEY (id, cl) +); +---- + +As we said that `NOT NULL` can be put in front of `CHECK`, if we want to specify other constraints as well, +this syntax is indeed possible: + +---- +CREATE TABLE ks.tb ( + id int, + cl int, + col1 int NOT NULL CHECK col1 > 0, + PRIMARY KEY (id, cl) +); +---- + +Internally, `NOT NULL` specified before `CHECK` will be stored as any other check - that is after `CHECK`. +(`DESCRIBE` statement on a table will show this fact). This mean of constraint definition is just a syntax suggar. + +It is not possible to use `NOT NULL` before `CHECK` when altering. The following syntax is invalid: + +---- +ALTER TABLE ks.tb ALTER col2 NOT NULL CHECK col2 > 0; +---- + +However, this syntax is valid: + +---- +ALTER TABLE ks.tb ALTER col2 CHECK NOT NULL AND col2 > 0; +---- + === JSON constraint Defines a constraint which checks if a column contains a string which is a valid JSON. @@ -205,7 +307,7 @@ Defines a constraint which checks if a column contains a string which is a valid ---- CREATE TABLE ks.tb ( id int primary key, - val text CHECK JSON(val) + val text CHECK JSON() ); -- valid JSON string @@ -230,26 +332,28 @@ Defines a constraint which checks text-like values againt a regular expression. ---- CREATE TABLE ks.tb ( id int primary key, - value CHECK REGEXP(value) = 'a.*b' + value CHECK REGEXP() = 'a.*b' ) ---- ---- -cassandra@cqlsh> INSERT INTO ks.tb (id , value ) VALUES ( 1, 'asdadasdabb'); -cassandra@cqlsh> INSERT INTO ks.tb (id , value ) VALUES ( 1, 'aaaaa'); +INSERT INTO ks.tb (id , value ) VALUES ( 1, 'asdadasdabb'); +INSERT INTO ks.tb (id , value ) VALUES ( 1, 'aaaaa'); ... [Invalid query] message="Value does not match regular expression 'a.*b'" ---- Negation can be also used: ---- -ALTER TABLE ks.tb ALTER value CHECK REGEXP(value) != 'a.*b'; +ALTER TABLE ks.tb ALTER value CHECK REGEXP() != 'a.*b'; ---- which would logically invert the condition: ---- -cassandra@cqlsh> INSERT INTO ks.tb (id , value ) VALUES ( 1, 'asdadasdabb'); +INSERT INTO ks.tb (id , value ) VALUES ( 1, 'asdadasdabb'); ... [Invalid query] message="Value does match regular expression 'a.*b'" -cassandra@cqlsh> INSERT INTO ks.tb (id , value ) VALUES ( 1, 'aaaaa'); ----- \ No newline at end of file +INSERT INTO ks.tb (id , value ) VALUES ( 1, 'aaaaa'); +---- + +`REGEXP` constraint supports only `!=` and `=` operators as other operators are meaningless. \ No newline at end of file diff --git a/doc/modules/cassandra/pages/developing/cql/cql_singlefile.adoc b/doc/modules/cassandra/pages/developing/cql/cql_singlefile.adoc index f2bf21590682..8bd37834b871 100644 --- a/doc/modules/cassandra/pages/developing/cql/cql_singlefile.adoc +++ b/doc/modules/cassandra/pages/developing/cql/cql_singlefile.adoc @@ -273,6 +273,28 @@ provide values for `LIMIT`, `TIMESTAMP`, and `TTL` clauses. If anonymous bind markers are used, the names for the query parameters will be `[limit]`, `[timestamp]`, and `[ttl]`, respectively. +===== Prepared Statement Caching + +Prepared Statements are cached by cassandra in-memory using a +https://github.com/ben-manes/caffeine[Caffeine]-managed cache which +can be configured using +xref:managing/configuration/cass_yaml_file.adoc#_prepared_statements_cache_size[`prepared_statements_cache_size`]. +The cache is also persisted to the `system.prepared_statements` table +so it can be preloaded into memory on startup. + +To ensure optimal performance, it's important to use a bind `` +for *all non-constant values* in your CQL statements. If you include +literal values directly in the query instead, each variation will be +treated as a unique statement that must be prepared and cached +separately. This will soon overflow the prepared statement cache, +which is small by design. + +When the cache reaches its maximum size, older or less frequently +used statements are +https://github.com/ben-manes/caffeine/wiki/Eviction[evicted], +leading to additional overhead as previously prepared statements must +be re-prepared. + [[dataDefinition]] == Data Definition diff --git a/doc/modules/cassandra/pages/developing/cql/ddl.adoc b/doc/modules/cassandra/pages/developing/cql/ddl.adoc index a546e12b92b9..c18b26b83f33 100644 --- a/doc/modules/cassandra/pages/developing/cql/ddl.adoc +++ b/doc/modules/cassandra/pages/developing/cql/ddl.adoc @@ -283,7 +283,7 @@ following modifiers: Some columns can be declared as `STATIC` in a table definition. A column that is static will be “shared” by all the rows belonging to the same -partition (having the same xref:cassandra:developing/cql/ddl.adoc#partition-key[partition key]. +partition (having the same xref:cassandra:developing/cql/ddl.adoc#partition-key[partition key]). For example: diff --git a/doc/modules/cassandra/pages/developing/cql/functions.adoc b/doc/modules/cassandra/pages/developing/cql/functions.adoc index 43b95257eede..cb3e28b220e4 100644 --- a/doc/modules/cassandra/pages/developing/cql/functions.adoc +++ b/doc/modules/cassandra/pages/developing/cql/functions.adoc @@ -179,12 +179,12 @@ Retrieving the current date and time: |=== |Function name |Output type -| `current_timestamp` | `timestamp` - | `current_date` | `date` | `current_time` | `time` +| `current_timestamp` | `timestamp` + | `current_timeuuid` | `timeUUID` |=== @@ -218,6 +218,13 @@ A number of functions are provided to convert a `timeuuid`, a `timestamp` or a ` | `to_unix_timestamp` | `date` | Converts the `date` argument into a `bigInt` raw value |=== +For example, a timestamp can be converted to a date with the following: + +[source,cql] +---- +include::cassandra:example$CQL/to_date.cql[] +---- + ==== Blob conversion functions A number of functions are provided to convert the native types into @@ -527,7 +534,7 @@ UDFs can be _overloaded_, so that multiple UDFs with different argument types ca [NOTE] ==== _JavaScript_ user-defined functions have been deprecated in Cassandra 4.1. In preparation for Cassandra 5.0, their removal is -already in progress. For more information - CASSANDRA-17281, CASSANDRA-18252. +already in progress. For more information - https://issues.apache.org/jira/browse/CASSANDRA-17281[CASSANDRA-17281], https://issues.apache.org/jira/browse/CASSANDRA-18252[CASSANDRA-18252]. ==== For example: diff --git a/doc/modules/cassandra/pages/developing/cql/mvs.adoc b/doc/modules/cassandra/pages/developing/cql/mvs.adoc index e2949fd73685..00f023206f3d 100644 --- a/doc/modules/cassandra/pages/developing/cql/mvs.adoc +++ b/doc/modules/cassandra/pages/developing/cql/mvs.adoc @@ -73,7 +73,7 @@ The `WHERE` clause has the following restrictions: ** no other restriction is allowed ** cannot have columns that are part of the _view_ primary key be null, they must always be at least restricted by a `IS NOT NULL` restriction (or any other restriction, but they must have one). -* cannot have an xref:cassandra:developing/cql/dml.adoc#ordering-clause[ordering clause], a xref:cassandra:developing/cql/dml.adoc#limit-clause[limit], or xref:cassandra:developing/cql/dml.adoc#allow-filtering[ALLOW FILTERING +* cannot have an xref:cassandra:developing/cql/dml.adoc#ordering-clause[ordering clause], a xref:cassandra:developing/cql/dml.adoc#limit-clause[limit], or xref:cassandra:developing/cql/dml.adoc#allow-filtering[ALLOW FILTERING] === MV primary key @@ -152,5 +152,5 @@ Removal of columns not selected in the Materialized View (via `DELETE unselected_column FROM base`) may shadow missed updates to other columns received by hints or repair. For this reason, we advise against doing deletions on base columns not selected in views until this is -fixed on CASSANDRA-13826. +fixed on https://issues.apache.org/jira/browse/CASSANDRA-13826[CASSANDRA-13826]. ==== diff --git a/doc/modules/cassandra/pages/developing/cql/security.adoc b/doc/modules/cassandra/pages/developing/cql/security.adoc index 2d438b9815d1..99b6d444345f 100644 --- a/doc/modules/cassandra/pages/developing/cql/security.adoc +++ b/doc/modules/cassandra/pages/developing/cql/security.adoc @@ -171,7 +171,7 @@ xref:cassandra:developing/cql/security.adoc#authorization[authorization]. However, if authorization is enabled, xref:cassandra:developing/cql/security.adoc#cql-permissions[permissions] of the dropped role are also revoked, subject to the xref:cassandra:developing/cql/security.adoc#auth-caching[caching options] configured in xref:cassandra:developing/cql/configuring.adoc#cassandra.yaml[cassandra-yaml] file. Should a dropped role be subsequently recreated and have new xref:security.adoc#grant-permission-statement[permissions] or -xref:security.adoc#grant-role-statement[roles]` granted to it, any client sessions still +xref:security.adoc#grant-role-statement[roles] granted to it, any client sessions still connected will acquire the newly granted permissions and roles. ==== @@ -344,7 +344,7 @@ Existing users can be listed using the `LIST USERS` statement: include::cassandra:example$BNF/list_users_statement.bnf[] ---- -Note that this statement is equivalent to xref:security.adoc#list-roles-statement[`LIST ROLES], but only roles with the `LOGIN` privilege are included in the output. +Note that this statement is equivalent to xref:security.adoc#list-roles-statement[LIST ROLES], but only roles with the `LOGIN` privilege are included in the output. == Data Control @@ -660,5 +660,5 @@ which were directly granted to `bob` or one of `bob`'s roles: include::cassandra:example$CQL/list_select_perm.cql[] ---- -Show any permissions granted to `carlos` or any of `carlos`'s roles, +Show any permissions granted to `carlos` or any roles assigned to `carlos`, limited to `SELECT` permissions on any resource. diff --git a/doc/modules/cassandra/pages/developing/data-modeling/data-modeling_conceptual.adoc b/doc/modules/cassandra/pages/developing/data-modeling/data-modeling_conceptual.adoc index ca59a38800d1..3e28c34b9819 100644 --- a/doc/modules/cassandra/pages/developing/data-modeling/data-modeling_conceptual.adoc +++ b/doc/modules/cassandra/pages/developing/data-modeling/data-modeling_conceptual.adoc @@ -30,7 +30,7 @@ underlined. Relationships between entities are represented as diamonds, and the connectors between the relationship and each entity show the multiplicity of the connection. -image::data-modeling_hotel_erd.png[image] +image::data_modeling_hotel_erd.png[image] Obviously, in the real world, there would be many more considerations and much more complexity. For example, hotel rates are notoriously diff --git a/doc/modules/cassandra/pages/developing/data-modeling/data-modeling_logical.adoc b/doc/modules/cassandra/pages/developing/data-modeling/data-modeling_logical.adoc index 82aeb5d11446..80ddf3b6f0d6 100644 --- a/doc/modules/cassandra/pages/developing/data-modeling/data-modeling_logical.adoc +++ b/doc/modules/cassandra/pages/developing/data-modeling/data-modeling_logical.adoc @@ -34,7 +34,7 @@ informative way to visualize the relationships between queries and tables in your designs. This figure shows the Chebotko notation for a logical data model. -image::cassandra:developing/data-modeling/data-modeling_chebotko_logical.png[image] +image::cassandra:developing/data-modeling/data_modeling_chebotko_logical.png[image] Each table is shown with its title and a list of columns. Primary key columns are identified via symbols such as *K* for partition key columns @@ -51,7 +51,7 @@ dedicated tables for rooms or amenities, as you had in the relational design. This is because the workflow didn't identify any queries requiring this direct access. -image::cassandra:developing/data-modeling/data-modeling_hotel_logical.png[image] +image::cassandra:developing/data-modeling/data_modeling_hotel_logical.png[image] Let's explore the details of each of these tables. @@ -127,7 +127,7 @@ shows a logical data model for reservations. You'll notice that these tables represent a denormalized design; the same data appears in multiple tables, with differing keys. -image::cassandra:developing/data-modeling/data-modeling_reservation_logical.png[image] +image::cassandra:developing/data-modeling/data_modeling_reservation_logical.png[image] In order to satisfy Q6, the `reservations_by_guest` table can be used to look up the reservation by guest name. You could envision query Q7 being diff --git a/doc/modules/cassandra/pages/developing/data-modeling/data-modeling_physical.adoc b/doc/modules/cassandra/pages/developing/data-modeling/data-modeling_physical.adoc index 1328e459be17..ca9839b75057 100644 --- a/doc/modules/cassandra/pages/developing/data-modeling/data-modeling_physical.adoc +++ b/doc/modules/cassandra/pages/developing/data-modeling/data-modeling_physical.adoc @@ -19,7 +19,7 @@ notation for physical data models. To draw physical models, you need to be able to add the typing information for each column. This figure shows the addition of a type for each column in a sample table. -image::cassandra:developing/data-modeling/data-modeling_chebotko_physical.png[image] +image::cassandra:developing/data-modeling/data_modeling_chebotko_physical.png[image] The figure includes a designation of the keyspace containing each table and visual cues for columns represented using collections and @@ -61,7 +61,7 @@ As you work to create physical representations of various tables in the logical hotel data model, you use the same approach. The resulting design is shown in this figure: -image::cassandra:developing/data-modeling/data-modeling_hotel_physical.png[image] +image::cassandra:developing/data-modeling/data_modeling_hotel_physical.png[image] Note that the `address` type is also included in the design. It is designated with an asterisk to denote that it is a user-defined type, @@ -86,7 +86,7 @@ first iteration of your physical data model design, assume you're going to manage this denormalization manually. Note that this design could be revised to use Cassandra's (experimental) materialized view feature. -image::cassandra:developing/data-modeling/data-modeling_reservation_physical.png[image] +image::cassandra:developing/data-modeling/data_modeling_reservation_physical.png[image] Note that the `address` type is reproduced in this keyspace and `guest_id` is modeled as a `uuid` type in all of the tables. diff --git a/doc/modules/cassandra/pages/developing/data-modeling/data-modeling_queries.adoc b/doc/modules/cassandra/pages/developing/data-modeling/data-modeling_queries.adoc index 3a4fb8d54a2c..b33e91e05e4f 100644 --- a/doc/modules/cassandra/pages/developing/data-modeling/data-modeling_queries.adoc +++ b/doc/modules/cassandra/pages/developing/data-modeling/data-modeling_queries.adoc @@ -53,7 +53,7 @@ to obtain detailed description of the hotel. The act of booking a room creates a reservation record that may be accessed by the guest and hotel staff at a later time through various additional queries. -image::cassandra:developing/data-modeling/data-modeling_hotel_queries.png[image] +image::cassandra:developing/data-modeling/data_modeling_hotel_queries.png[image] _Material adapted from Cassandra, The Definitive Guide. Published by O'Reilly Media, Inc. Copyright © 2020 Jeff Carpenter, Eben Hewitt. All diff --git a/doc/modules/cassandra/pages/developing/data-modeling/data-modeling_rdbms.adoc b/doc/modules/cassandra/pages/developing/data-modeling/data-modeling_rdbms.adoc index 3de1210a5543..c045d7321463 100644 --- a/doc/modules/cassandra/pages/developing/data-modeling/data-modeling_rdbms.adoc +++ b/doc/modules/cassandra/pages/developing/data-modeling/data-modeling_rdbms.adoc @@ -12,7 +12,7 @@ relationships from the conceptual model of hotels-to-points of interest, rooms-to-amenities, rooms-to-availability, and guests-to-rooms (via a reservation). -image::data-modeling_hotel_relational.png[image] +image::data_modeling_hotel_relational.png[image] == Design Differences Between RDBMS and Cassandra diff --git a/doc/modules/cassandra/pages/developing/data-modeling/data-modeling_refining.adoc b/doc/modules/cassandra/pages/developing/data-modeling/data-modeling_refining.adoc index d613c2cea816..d7ea619c88f1 100644 --- a/doc/modules/cassandra/pages/developing/data-modeling/data-modeling_refining.adoc +++ b/doc/modules/cassandra/pages/developing/data-modeling/data-modeling_refining.adoc @@ -188,7 +188,7 @@ the original design is shown in the figure below. While the `month` column is partially duplicative of the `date`, it provides a nice way of grouping related data in a partition that will not get too large. -image::data-modeling_hotel_bucketing.png[image] +image::data_modeling_hotel_bucketing.png[image] If you really felt strongly about preserving a wide partition design, you could instead add the `room_id` to the partition key, so that each diff --git a/doc/modules/cassandra/pages/developing/index.adoc b/doc/modules/cassandra/pages/developing/index.adoc index 8c9f735e2c3d..409a423a6fc8 100644 --- a/doc/modules/cassandra/pages/developing/index.adoc +++ b/doc/modules/cassandra/pages/developing/index.adoc @@ -2,3 +2,4 @@ * xref:cassandra:developing/data-modeling/index.adoc[Data Modeling] * xref:cassandra:developing/cql/index.adoc[CQL] +* xref:cassandra:developing/accord/index.adoc[Accord] diff --git a/doc/modules/cassandra/pages/getting-started/configuring.adoc b/doc/modules/cassandra/pages/getting-started/configuring.adoc index 00712d6132d4..9c393a4bd12a 100644 --- a/doc/modules/cassandra/pages/getting-started/configuring.adoc +++ b/doc/modules/cassandra/pages/getting-started/configuring.adoc @@ -14,7 +14,7 @@ to various Cassandra configuration files. Some examples that require non-default configuration are deploying a multi-node cluster or using clients that are not running on a cluster node. -* `cassandra.yaml`: the main configuration file for Cassandra +* `cassandra.yaml`: the main configuration file for Cassandra, it contains sensitive settings and therefore should not be accessed or modified by untrusted users * `cassandra-env.sh`: environment variables can be set * `cassandra-rackdc.properties` OR `cassandra-topology.properties`: set rack and datacenter information for a cluster diff --git a/doc/modules/cassandra/pages/managing/configuration/cass_jvm_options_file.adoc b/doc/modules/cassandra/pages/managing/configuration/cass_jvm_options_file.adoc index 79f67b6abb06..b9057a490424 100644 --- a/doc/modules/cassandra/pages/managing/configuration/cass_jvm_options_file.adoc +++ b/doc/modules/cassandra/pages/managing/configuration/cass_jvm_options_file.adoc @@ -14,7 +14,7 @@ See each file for examples of settings. [NOTE] ==== -The `jvm-*` files replace the `cassandra-envsh` file used in Cassandra +The `jvm-\*` files replace the `cassandra-env.sh` file used in Cassandra versions prior to Cassandra 3.0. The `cassandra-env.sh` bash script file is still useful if JVM settings must be dynamically calculated based on system settings. The `jvm-*` files only store static JVM settings. diff --git a/doc/modules/cassandra/pages/managing/configuration/cass_logback_xml_file.adoc b/doc/modules/cassandra/pages/managing/configuration/cass_logback_xml_file.adoc index a62dfe91a7be..73cae5b54590 100644 --- a/doc/modules/cassandra/pages/managing/configuration/cass_logback_xml_file.adoc +++ b/doc/modules/cassandra/pages/managing/configuration/cass_logback_xml_file.adoc @@ -76,11 +76,11 @@ the rolling policy. Specify the format of the message. Part of the rolling policy. -*Example:* 7 *Example:* -%-5level [%thread] %date\{ISO8601} %F:%L - %msg%n +*Example:* +%-5level [%thread] %date\{"yyyy-MM-dd'T'HH:mm:ss,SSS", UTC} %F:%L - %msg%n -=== Logging to Cassandra virtual table +=== Logging system logs to Cassandra virtual table It is possible to configure logback.xml in such a way that logs would appear in `system_views.system_log` table. This is achieved by appender implemented in class `VirtualTableAppender` which is called `CQLLOG` in the @@ -101,12 +101,77 @@ each message will occupy memory. The appender to virtual table is commented out by default so logging to virtual table is not active. +=== Logging slow queries to Cassandra virual table + +It is possible to log slow queries into `system_views.slow_queries` table. A query is evaluated to be slow +if it takes more than `slow_query_log_timeout` in `cassandra.yaml`. + +To log messages to `system_views.slow_queries` you need to: + +1. uncomment `SLOW_QUERIES_APPENDER` log appender +2. uncomment `appender-ref` pointing to `SLOW_QUERIES_APPENDER` in `slow_queries` logger: + +The respective configuration in `logback.xml` looks like this: + +[source,XML] +---- + + + + + + + + +---- + +By default, slow queries will be logged to `debug.log`. By uncommenting virtual table appender, it will be +logged to `debug.log` as well as to `system_views.slow_queries`. If you want to log it to `system_views.slow_queries` only, you need to comment out `DEBUGLOG` `appender-ref` in `slow_queries` logger declaration. + +If you want to log slow queries to a dedicated log file (which is e.g. rotated), that is also possible +by pointing `slow_queries` logger to a respective file appender of a given reference, similar to `DEBUGLOG` where all logs go by default. + +The structure of a table looks like this: + +[source,cql] +---- +cassandra@cqlsh> DESCRIBE system_views.slow_queries ; + +/* +Warning: Table system_views.slow_queries is a virtual table and cannot be recreated with CQL. +Structure, for reference: +VIRTUAL TABLE system_views.slow_queries ( + keyspace_name text, + table_name text, + timestamp timestamp, + query text, + avg_ms bigint, + cross_node boolean, + max_ms bigint, + min_ms bigint, + times_reported int, + PRIMARY KEY (keyspace_name, table_name, timestamp, query) +) WITH CLUSTERING ORDER BY (table_name ASC, timestamp ASC, query ASC) + AND comment = 'Slow queries'; +---- + +By having slow queries in a virtual table, an operator can check if there are slow queries for some table, see if +some queries violate some time threshold etc. The rows in this table are same data as one would get in `debug.log`, they +are just way more convenient to parse and query. + +`system_views.slow_queries` table is limited on number of rows it can hold, by default 10 000, configurable by `cassandra.virtual.slow_queries.max.rows` system property. If this table is full, the oldest entry is removed and the newest is inserted. This virtual table can be truncated by CQL and deletion on partition key (`keyspace_name` column) is allowed. + +A reader noticed that by placing custom appender implementation of `SLOW_QUERIES_APPENDER` appender on a class path and referencing it in `logback.xml`, it is possible to log slow queries wherever we have an appender for it. + === Contents of default `logback.xml` [source,XML] ---- - @@ -126,7 +191,7 @@ The appender to virtual table is commented out by default so logging to virtual 5GB - %-5level [%thread] %date{ISO8601} %F:%L - %msg%n + %-5level [%thread] %date{"yyyy-MM-dd'T'HH:mm:ss,SSS", UTC} %F:%L - %msg%n @@ -143,7 +208,7 @@ The appender to virtual table is commented out by default so logging to virtual 5GB - %-5level [%thread] %date{ISO8601} %F:%L - %msg%n + %-5level [%thread] %date{"yyyy-MM-dd'T'HH:mm:ss,SSS", UTC} %F:%L - %msg%n @@ -163,7 +228,7 @@ The appender to virtual table is commented out by default so logging to virtual INFO - %-5level [%thread] %date{ISO8601} %F:%L - %msg%n + %-5level [%thread] %date{"yyyy-MM-dd'T'HH:mm:ss,SSS", UTC} %F:%L - %msg%n diff --git a/doc/modules/cassandra/pages/managing/operating/audit_logging.adoc b/doc/modules/cassandra/pages/managing/operating/audit_logging.adoc index 63f4ba1a1130..c50c9785dc5a 100644 --- a/doc/modules/cassandra/pages/managing/operating/audit_logging.adoc +++ b/doc/modules/cassandra/pages/managing/operating/audit_logging.adoc @@ -213,7 +213,7 @@ the audit log events to flow through separate log file instead of system.log. 5GB - %-5level [%thread] %date{ISO8601} %F:%L - %msg%n + %-5level [%thread] %date{"yyyy-MM-dd'T'HH:mm:ss,SSS", UTC} %F:%L - %msg%n diff --git a/doc/modules/cassandra/pages/managing/operating/auto_repair.adoc b/doc/modules/cassandra/pages/managing/operating/auto_repair.adoc new file mode 100644 index 000000000000..d5701895d74f --- /dev/null +++ b/doc/modules/cassandra/pages/managing/operating/auto_repair.adoc @@ -0,0 +1,456 @@ += Auto Repair +:navtitle: Auto Repair +:description: Auto Repair concepts - How it works, how to configure it, and more. +:keywords: CEP-37, Repair, Incremental, Preview + +Auto Repair is a fully automated scheduler that provides repair orchestration within Apache Cassandra. This +significantly reduces operational overhead by eliminating the need for operators to deploy external tools to submit and +manage repairs. + +At a high level, a dedicated thread pool is assigned to the repair scheduler. The repair scheduler in Cassandra +maintains a new replicated table, `system_distributed.auto_repair_history`, which stores the repair history for all +nodes, including details such as the last repair time. The scheduler selects the node(s) to begin repairs and +orchestrates the process to ensure that every table and its token ranges are repaired. + +The algorithm can run repairs simultaneously on multiple nodes and splits token ranges into subranges, with necessary +retries to handle transient failures. Automatic repair starts as soon as a Cassandra cluster is launched, similar to +compaction, and if configured appropriately, does not require human intervention. + +The scheduler currently supports Full, Incremental, and Preview repair types with the following features. New repair +types, such as Paxos repair or other future repair mechanisms, can be integrated with minimal development effort! + + +== Features +- Capability to run repairs on multiple nodes simultaneously. +- A default implementation and an interface to override the dataset being repaired per session. +- Extendable token split algorithms with two implementations readily available: +. Splits token ranges by placing a cap on the size of data repaired in one session and a maximum cap at the schedule +level using xref:#repair-token-range-splitter[RepairTokenRangeSplitter] (default). +. Splits tokens evenly based on the specified number of splits using +xref:#fixed-split-token-range-splitter[FixedSplitTokenRangeSplitter]. +- A new xref:#table-configuration[CQL table property] (`auto_repair`) offering: +. The ability to disable specific repair types at the table level, allowing the scheduler to skip one or more tables. +. Configuring repair priorities for certain tables to prioritize them over others. +- Dynamic enablement or disablement of the scheduler for each repair type. +- Configurable settings tailored to each repair job. +- Rich configuration options for each repair type (e.g., Full, Incremental, or Preview repairs). +- Comprehensive observability features that allow operators to configure alarms as needed. + +== Considerations + +Before enabling Auto Repair, please consult the xref:managing/operating/repair.adoc[Repair] guide to establish a base +understanding of repairs. + +=== Full Repair + +Full Repairs operate over all data in the token range being repaired. It is therefore important to run full repair +with a longer schedule and with smaller assignments. + +=== Incremental Repair + +When enabled from the inception of a cluster, incremental repairs operate over unrepaired data and should finish +quickly when run more frequently. + +Once incremental repair has been run, SSTables will be separated between data that have been incrementally repaired +and data that have not. Therefore, it is important to continually run incremental repair once it has been enabled so +newly written data can be compacted together with previously repaired data, allowing overwritten and expired data to +be eventually purged. + +Running incremental repair more frequently keeps the unrepaired set smaller and thus causes repairs to operate over +a smaller set of data, so a shorter `min_repair_interval` such as `1h` is recommended for new clusters. + +==== Enabling Incremental Repair on existing clusters with a large amount of data +[#enabling-ir] +One should be careful when enabling incremental repair on a cluster for the first time. While +xref:#repair-token-range-splitter[RepairTokenRangeSplitter] includes a default configuration to attempt to gracefully +migrate to incremental repair over time, failure to take proper precaution could overwhelm the cluster with +xref:managing/operating/compaction/overview.adoc#types-of-compaction[anticompactions]. + +No matter how one goes about enabling and running incremental repair, it is recommended to run a cycle of full repairs +for the entire cluster as pre-flight step to running incremental repair. This will put the cluster into a more +consistent state which will reduce the amount of streaming between replicas when incremental repair initially runs. + +If you do not have strong data consistency requirements, one may consider using +xref:managing/tools/sstable/sstablerepairedset.adoc[nodetool sstablerepairedset] to mark all SSTables as repaired +before enabling incremental repair scheduling using Auto Repair. This will reduce the burden of initially running +incremental repair because all existing data will be considered as repaired, so subsequent incremental repairs will +only run against new data. + +If you do have strong data consistency requirements, then one must treat all data as initially unrepaired and run +incremental repair against it. Consult +xref:#incremental-repair-defaults[RepairTokenRangeSplitter's Incremental repair defaults]. + +In particular one should be mindful of the xref:managing/operating/compaction/overview.adoc[compaction strategy] +you use for your tables and how it might impact incremental repair before running incremental repair for the first +time: + +- *Large SSTables*: When using xref:managing/operating/compaction/stcs.adoc[SizeTieredCompactionStrategy] or any + compaction strategy which can create large SSTables including many partitions the amount of + xref:managing/operating/compaction/overview.adoc#types-of-compaction[anticompaction] that might be required could be + excessive. Using a small `bytes_per_assignment` might contribute to repeated anticompactions over the same + unrepaired data. +- *Partitions overlapping many SSTables*: If partitions overlap between many SSTables, the amount of SSTables included + in a repair might be large. Therefore it is important to consider that many SSTables may be included in a repair + session and must all be anticompacted. xref:managing/operating/compaction/lcs.adoc[LeveledCompactionStrategy] is less + susceptible to this issue as it prevents overlapping of partitions within levels outside of L0, but if SSTables + start accumulating in L0 between incremental repairs, the cost of anticompaction will increase. + xref:managing/operating/compaction/ucs#sharding[UnifiedCompactionStrategy's sharding] can also be used to avoid + partitions overlapping SSTables. + +The xref:#repair-token-range-splitter[token_range_splitter] configuration for incremental repair includes a default +configuration that attempts to conservatively migrate 100GiB of compressed data every day per node. Depending on +requirements, data set and capability of a cluster's hardware, one may consider tuning these values to be more +aggressive or conservative. + +=== Previewing Repaired Data + +The `preview_repaired` repair type executes repairs over the repaired data set to detect possible data inconsistencies. + +Inconsistencies in the repaired data set should not happen in practice and could indicate a possible bug in incremental +repair. + +Running preview repairs is useful when considering using the +xref:cassandra:managing/operating/compaction/tombstones.adoc#deletion[only_purge_repaired_tombstones] table compaction +option to prevent data from possibly being resurrected when inconsistent replicas are missing tombstones from deletes. + +When enabled, the `BytesPreviewedDesynchronized` and `TokenRangesPreviewedDesynchronized` +xref:cassandra:managing/operating/metrics.adoc#table-metrics[table metrics] can be used to detect inconsistencies in the +repaired data set. + +== Configuring Auto Repair in cassandra.yaml + +Configuration for Auto Repair is managed in the `cassandra.yaml` file by the `auto_repair` property. + +A rich set of configuration exists for configuring Auto Repair with sensible defaults. However, the expectation +is that some tuning might be needed particulary when it comes to tuning how often repair should run +(`min_repair_interval`) and how repair assignments as created (`token_range_splitter`). + +The following is a practical example of an auto_repair configuration that one might use. + +[source, yaml] +---- +auto_repair: + enabled: true + repair_type_overrides: + full: + enabled: true + min_repair_interval: 5d + incremental: + enabled: true + min_repair_interval: 1h + token_range_splitter: + parameters: + bytes_per_assignment: 50GiB + max_bytes_per_schedule: 100GiB + preview_repaired: + enabled: true + min_repair_interval: 1d + global_settings: + repair_by_keyspace: true + parallel_repair_count: 1 +---- + + +=== Top level settings +The following settings are defined at the top level of the configuration file and apply universally across all +repair types. + +[cols=",,",options="header",] +|=== +| Name | Default | Description +| enabled | false | Enable/Disable the auto-repair scheduler. If set to false, the scheduler thread will not be started. +If set to true, the repair scheduler thread will be created. The thread will check for secondary configuration available +for each repair type (full, incremental, and preview_repaired), and based on that, it will schedule repairs. +| repair_check_interval | 5m | Time interval between successive checks to see if ongoing repairs are complete or if it +is time to schedule repairs. +| repair_max_retries | 3 | Maximum number of retries for a repair session. +| history_clear_delete_hosts_buffer_interval | 2h | The scheduler needs to adjust its order when nodes leave the ring. +Deleted hosts are tracked in metadata for a specified duration to ensure they are indeed removed before adjustments +are made to the schedule. +|=== + + +=== Repair level settings +The following settings can be configured globally using `global_settings` or tailored individually for each repair +type by using `repair_type_overrides`. + +[cols=",,",options="header",] +|=== +| Name | Default | Description +| enabled | false | Whether the given repair types should be enabled +| min_repair_interval | 24h | Minimum duration between repairing the same node again. This is useful for tiny clusters, +such as clusters with 5 nodes that finish repairs quickly. This means that if the scheduler completes one round on all +nodes in less than this duration, it will not start a new repair round on a given node until this much time has +passed since the last repair completed. Consider increasing to a larger value to reduce the impact of repairs, +however note that one should attempt to run repairs at a smaller interval than gc_grace_seconds to +avoid xref:cassandra:managing/operating/compaction/tombstones.adoc#zombies[data resurrection]. +| token_range_splitter.class_name | org.apache.cassandra.repair.autorepair.RepairTokenRangeSplitter | Implementation of +IAutoRepairTokenRangeSplitter to use; responsible for splitting token ranges for repair assignments. Out of the box, +Cassandra provides org.apache.cassandra.repair.autorepair.{RepairTokenRangeSplitter,FixedTokenRangeSplitter}. +| repair_by_keyspace | true | If true, attempts to group tables in the same keyspace into one repair; otherwise, +each table is repaired individually. +| number_of_repair_threads | 1 | Number of threads to use for each repair job scheduled by the scheduler. Similar to +the -j option in nodetool repair. +| parallel_repair_count | 3 | Number of nodes running repair in parallel. If `parallel_repair_percentage` is set, the +larger value is used. +| parallel_repair_percentage | 3 | Percentage of nodes in the cluster running repair in parallel. If +`parallel_repair_count is set`, the larger value is used. +| allow_parallel_replica_repair | false | Whether to allow a node to take its turn running repair while one or more of +its replicas are running repair. Defaults to false, as running repairs concurrently on replicas can increase load and +also cause anticompaction conflicts while running incremental repair. +| allow_parallel_replica_repair_across_schedules | true | An addition to allow_parallel_repair that also blocks repairs +when replicas (including this node itself) are repairing in any schedule. +For example, if a replica is executing full repairs, a value of false will prevent starting incremental repairs for this +node. Defaults to true and is only evaluated when allow_parallel_replica_repair is false. +| materialized_view_repair_enabled | false | Repairs materialized views if true. +| initial_scheduler_delay | 5m | Delay before starting repairs after a node restarts to avoid repairs starting +immediately after a restart. +| repair_session_timeout | 3h | Timeout for retrying stuck repair sessions. +| force_repair_new_node | false | Force immediate repair on new nodes after they join the ring. +| sstable_upper_threshold | 50000 | Threshold to skip repairing tables with too many SSTables. +| table_max_repair_time | 6h | Maximum time allowed for repairing one table on a given node. If exceeded, the repair +proceeds to the next table. +| ignore_dcs | [] | Avoid running repairs in specific data centers. By default, repairs run in all data centers. Specify +data centers to exclude in this list. Note that repair sessions will still consider all replicas from excluded data +centers. Useful if you have keyspaces that are not replicated in certain data centers, and you want to not run repair +schedule in certain data centers. +| repair_primary_token_range_only | true | Repair only the primary ranges owned by a node. Equivalent to the -pr option +in nodetool repair. General advice is to keep this true. +| repair_retry_backoff | 30s | Backoff time before retrying a repair session. +| repair_task_min_duration | 5s | Minimum duration for the execution of a single repair task. This prevents the +scheduler from overwhelming the node by scheduling too many repair tasks in a short period of time. +|=== + +=== `RepairTokenRangeSplitter` configuration +[#repair-token-range-splitter] + +`RepairTokenRangeSplitter` is the default implementation of `IAutoRepairTokenRangeSplitter` that attempts to create +token range assignments meeting the following goals: + +- *Create smaller, consistent repair times*: Long repairs, such as those lasting 15 hours, can be problematic. If a +node fails 14 hours into the repair, the entire process must be restarted. The goal is to reduce the impact of +disturbances or failures. However, making the repairs too short can lead to overhead from repair orchestration becoming +the main bottleneck. + +- *Minimize the impact on hosts*: Repairs should not heavily affect the host systems. For incremental repairs, this +might involve anti-compaction work. In full repairs, streaming large amounts of data—especially with wide partitions +can lead to issues with disk usage and higher compaction costs. + +- *Reduce overstreaming*: The Merkle tree, which represents data within each partition and range, has a maximum size. +If a repair covers too many partitions, the tree’s leaves represent larger data ranges. Even a small change in a leaf +can trigger excessive data streaming, making the process inefficient. + +- *Reduce number of repairs*: If there are many small tables, it's beneficial to batch these tables together under a +single parent repair. This prevents the repair overhead from becoming a bottleneck, especially when dealing with +hundreds of tables. Running individual repairs for each table can significantly impact performance and efficiency. + +To achieve these goals, this implementation inspects SSTable metadata to estimate the bytes and number of partitions +within a range and splits it accordingly to bound the size of the token ranges used for repair assignments. + +==== Parameter defaults + +The following `parameters` include the same defaults for all repair types. + +[cols=",,",options="header",] +|=== +| Name | Default | Description +| partitions_per_assignment | 1048576 | Maximum number of partitions to include in a repair +assignment. Used to reduce number of partitions present in merkle tree leaf nodes to avoid overstreaming. +| max_tables_per_assignment | 64 | Maximum number of tables to include in a repair assignment. +This reduces the number of repairs, especially in keyspaces with many tables. The splitter avoids batching tables +together if they exceed other configuration parameters like `bytes_per_assignment` or `partitions_per_assignment`. +|=== + +==== Full & Preview Repaired repair defaults + +The following `parameters` defaults are established for both `full` and `preview_repaired` repair scheduling: + +[cols=",,",options="header",] +|=== +| Name | Default | Description +| bytes_per_assignment | 50GiB | The target and maximum amount of *compressed* bytes that should be included in a +repair assignment. *Note*: For full and preview_repaired, only the portion of an SSTable that covers the ranges +being repaired are accounted for in this calculation. +| max_bytes_per_schedule | 100000GiB | The maximum number of bytes to cover in an individual +schedule. This serves as a mechanism to throttle the work done in each repair cycle. You may reduce this value if the +impact of repairs is causing too much load on the cluster or increase it if writes outpace the amount of data being +repaired. Alternatively, adjust the `min_repair_interval`. This is set to a large value for full repair to attempt to +repair all data per repair schedule. +|=== + +==== Incremental repair defaults + +The following `parameters` defaults are established for `incremental` repair scheduling: + +[cols=",,",options="header",] +|=== +| Name | Default | Description +| bytes_per_assignment | 50GiB | The target and maximum amount of *compressed* bytes that should be +included in a repair assignment. *Note*: For incremental repair, the *entire size* of *unrepaired* SSTables +including ranges being repaired are accounted for in this calculation. This is to account for the anticompaction +work required to split the candidate data to repair from the data that won't be repaired. +| max_bytes_per_schedule | 100GiB | The maximum number of bytes to cover in an individual schedule. +Consider increasing if more data is written than this limit within the `min_repair_interval`. +|=== + +=== `FixedSplitTokenRangeSplitter` configuration +[#fixed-split-token-range-splitter] + +`FixedSplitTokenRangeSplitter` is a more simple implementation of `IAutoRepairTokenRangeSplitter` that creates repair +assignments by splitting a node's token ranges into an even number of splits. + +The following `parameters` apply for `FixedSplitTokenRangeSplitter` configuration: + +[cols=",,",options="header",] +|=== +| Name | Default | Description +| number_of_subranges | 32 | Number of evenly split subranges to create for each node that repair runs for. +If vnodes are configured using `num_tokens`, attempts to evenly subdivide subranges by each range. For example, for +`num_tokens: 16` and `number_of_subranges: 32`, 2 (32/16) repair assignments will be created for each token range. At +least one repair assignment will be created for each token range. +|=== + +=== Other cassandra.yaml Considerations + +==== Enable `reject_repair_compaction_threshold` + +When enabling auto_repair, it is advisable to configure the top level `reject_repair_compaction_threshold` +configuration in cassandra.yaml as a backpressure mechanism to reject new repairs on instances that have many +pending compactions. + +==== Tune `incremental_repair_disk_headroom_reject_ratio` + +By default, incremental repairs will be rejected if less than 20% of disk is available. If one wishes to be +conservative this top level configuration could be increased to a larger value to prevent filling your data directories. + +== Table configuration + +If Auto Repair is enabled in cassandra.yaml, the `auto_repair` property may be optionally configured at the table +level, e.g.: + +[source,cql] +---- +ALTER TABLE cycling.cyclist_races +WITH auto_repair = {'incremental_enabled': 'false', 'priority': '0'}; +---- + +[cols=",,",options="header",] +|=== +| Name | Default | Description +| priority | 0 | Indicates the priority at which this table should be given when issuing repairs. The higher the number +the more priority will be given to repair the table (e.g. 3 will be repaired before 2). When `repair_by_keyspace` is +set to `true` tables sharing the same priority may be grouped in the same repair assignment. +| full_enabled | true | Whether full repair is enabled for this table. If full.enabled is not true in cassandra.yaml +this will not be evaluated. +| incremental_enabled | true | Whether incremental repair is enabled for this table. If incremental.enabled is not +true in cassandra.yaml this will not be evaluated. +| preview_repaired_enabled | true | Whether preview repair is enabled for this table. If preview_repaired.enabled is +not true in cassandra.yaml this will not be evaluated. +|=== + +== Nodetool Configuration +=== nodetool getautorepairconfig + +Retrieves the runtime configuration of Auto Repair for the targeted node. + +[source,none] +---- +$> nodetool getautorepairconfig +repair scheduler configuration: + repair_check_interval: 5m + repair_max_retries: 3 + history_clear_delete_hosts_buffer_interval: 2h +configuration for repair_type: full + enabled: true + min_repair_interval: 24h + repair_by_keyspace: true + number_of_repair_threads: 1 + sstable_upper_threshold: 50000 + table_max_repair_time: 6h + ignore_dcs: [] + repair_primary_token_range_only: true + parallel_repair_count: 3 + parallel_repair_percentage: 3 + materialized_view_repair_enabled: false + initial_scheduler_delay: 5m + repair_session_timeout: 3h + force_repair_new_node: false + repair_retry_backoff: 30s + repair_task_min_duration: 5s + token_range_splitter: org.apache.cassandra.repair.autorepair.RepairTokenRangeSplitter + token_range_splitter.bytes_per_assignment: 50GiB + token_range_splitter.partitions_per_assignment: 1048576 + token_range_splitter.max_tables_per_assignment: 64 + token_range_splitter.max_bytes_per_schedule: 100000GiB +configuration for repair_type: incremental + enabled: true + min_repair_interval: 1h + repair_by_keyspace: true + number_of_repair_threads: 1 + sstable_upper_threshold: 50000 + table_max_repair_time: 6h + ignore_dcs: [] + repair_primary_token_range_only: true + parallel_repair_count: 3 + parallel_repair_percentage: 3 + materialized_view_repair_enabled: false + initial_scheduler_delay: 5m + repair_session_timeout: 3h + force_repair_new_node: false + repair_retry_backoff: 30s + repair_task_min_duration: 5s + token_range_splitter: org.apache.cassandra.repair.autorepair.RepairTokenRangeSplitter + token_range_splitter.bytes_per_assignment: 50GiB + token_range_splitter.partitions_per_assignment: 1048576 + token_range_splitter.max_tables_per_assignment: 64 + token_range_splitter.max_bytes_per_schedule: 100GiB +configuration for repair_type: preview_repaired + enabled: false +---- + +=== nodetool autorepairstatus + +Provides currently running Auto Repair status. + +[source,none] +---- +$> nodetool autorepairstatus -t incremental +Active Repairs +425cea55-09aa-46e0-8911-9f37a4424574 + + +$> nodetool autorepairstatus -t full +Active Repairs +NONE + +---- + +=== nodetool setautorepairconfig + +Dynamic configuration changes can be made by using `setautorepairconfig`. Note that this only applies on the node being +targeted and these changes are not retained when a node is bounced. + +The following disables the `incremental` repair schedule: + +[source,none] +---- +$> nodetool setautorepairconfig -t incremental enabled false +---- + +The following adjusts the `min_repair_interval` option to `5d` specifically for the `full` repair schedule: + +[source,none] +---- +$> nodetool setautorepairconfig -t full min_repair_interval 5d +---- + +The following configures the `bytes_per_assignment` parameter for `incremental` repair's `token_range_splitter` to +`10GiB`: + +[source,none] +---- +$> nodetool setautorepairconfig -t incremental token_range_splitter.bytes_per_assignment 10GiB +---- + +==== More details +https://cwiki.apache.org/confluence/display/CASSANDRA/CEP-37+Apache+Cassandra+Unified+Repair+Solution[CEP-37] diff --git a/doc/modules/cassandra/pages/managing/operating/cdc.adoc b/doc/modules/cassandra/pages/managing/operating/cdc.adoc index b368633c250b..98956caebecc 100644 --- a/doc/modules/cassandra/pages/managing/operating/cdc.adoc +++ b/doc/modules/cassandra/pages/managing/operating/cdc.adoc @@ -87,5 +87,5 @@ tables will be rejected unless some consumption process is in place. == Further Reading -* https://issues.apache.org/jira/browse/CASSANDRA-8844[JIRA ticket] -* https://issues.apache.org/jira/browse/CASSANDRA-12148[JIRA ticket] +* Change Data Capture ( https://issues.apache.org/jira/browse/CASSANDRA-8844[CASSANDRA-8844 JIRA ticket] ) +* Improve determinism of CDC data availability ( https://issues.apache.org/jira/browse/CASSANDRA-12148[CASSANDRA-12148 JIRA ticket] ) diff --git a/doc/modules/cassandra/pages/managing/operating/compaction/tombstones.adoc b/doc/modules/cassandra/pages/managing/operating/compaction/tombstones.adoc index 39592865247b..9e0dcb6f7879 100644 --- a/doc/modules/cassandra/pages/managing/operating/compaction/tombstones.adoc +++ b/doc/modules/cassandra/pages/managing/operating/compaction/tombstones.adoc @@ -35,7 +35,7 @@ This kind of deleted but persistent object is called a https://cassandra.apache. == Grace period To prevent the reappearance of zombies, {cassandra} gives each tombstone a grace period. -The grace period for a tombstone is set with the table property ` WITH gc_grace_seconds`. +The grace period for a tombstone is set with the table property `WITH gc_grace_seconds`. Its default value is 864000 seconds (ten days), after which a tombstone expires and can be deleted during compaction. Prior to the grace period expiring, {cassandra} will retain a tombstone through compaction events. Each table can have its own value for this property. diff --git a/doc/modules/cassandra/pages/managing/operating/compaction/ucs.adoc b/doc/modules/cassandra/pages/managing/operating/compaction/ucs.adoc index 4798f0b6fd6e..a41c979ef9da 100644 --- a/doc/modules/cassandra/pages/managing/operating/compaction/ucs.adoc +++ b/doc/modules/cassandra/pages/managing/operating/compaction/ucs.adoc @@ -18,7 +18,86 @@ Thus, a compaction is triggered when more than a given number of SSTables are pr * *size* can be replaced by *density*, allowing SSTables to be split at arbitrary points when the output of a compaction is written, while still producing a leveled hierarchy. Density is defined as the size of an SSTable divided by the width of the token range it covers. -Let's look at the first concept in more detail. +== Migration from Other Strategies + +The Unified Compaction Strategy (UCS) can be configured to behave like other compaction strategies, making migration straightforward. It also provides advanced options for optimizing specific workload patterns. + +=== Examples + +Below are examples for migrating from commonly used strategies. UCS can maintain similar behavior while providing additional benefits such as parallel compaction and the ability to change parameters without requiring full recompaction. + +[cols="1,3a", options="header"] +|=== +| Source Strategy | Migration Command +| Migrating From LCS | +[source,plaintext] +---- +ALTER TABLE mykeyspace.foo +WITH COMPACTION = { + 'class': 'UnifiedCompactionStrategy', + 'scaling_parameters': 'L10'}; +---- +| Migration from SizeTieredCompactionStrategy | +[source,plaintext] +---- +ALTER TABLE mykeyspace.foo WITH +COMPACTION = { + 'class': 'UnifiedCompactionStrategy', + 'scaling_parameters': 'T4'}; +---- +|=== + +== Use Case Specific Configurations + +The following configurations are optimized for common workload patterns. The parameters can be adjusted based on your specific requirements. + +These provide a good starting point for common workloads, but you may find you want to tune additional parameters based on your workload characteristics. +Additional details to understand this are in the following section. + +[cols="1,3a,3", options="header"] +|=== +| Use Case | Configuration Example | Explanation +| Read Heavy Key Value | +[source,plaintext] +---- +ALTER TABLE mykeyspace.foo +WITH COMPACTION = { + 'class': 'UnifiedCompactionStrategy', + 'scaling_parameters': 'L10', + 'target_sstable_size': + '256MiB', + 'base_shard_count': '8' +}; +---- +| Optimizes for read-intensive workloads with a leveled approach similar to LCS. The smaller target SSTable size and higher shard count improve read performance by minimizing the number of SSTables that must be consulted for a query. +| Write Heavy | +[source,plaintext] +---- +ALTER TABLE mykeyspace.foo +WITH COMPACTION = { + 'class': 'UnifiedCompactionStrategy', + 'scaling_parameters': 'T4', + 'target_sstable_size': '1GiB', + 'base_shard_count': '4' +}; +---- +| Optimizes for write-intensive workloads using a tiered approach similar to STCS. The larger target SSTable size reduces write amplification by requiring fewer compactions, while the lower shard count reduces the overhead of managing too many SSTables. +| Time Series | +[source,plaintext] +---- +ALTER TABLE mykeyspace.foo WITH COMPACTION = { + 'class': 'UnifiedCompactionStrategy', + 'scaling_parameters': 'T8', + 'target_sstable_size': '512MiB', + 'base_shard_count': '8', + 'expired_sstable_check_frequency_seconds': '300' +}; +---- +| Suitable for time-series data with TTLs. The higher tiered scaling parameter (T8) improves write throughput, while the frequent expired SSTable check helps reclaim space from expired data more quickly. The higher shard count allows for greater parallelism in compaction operations. + +Using `scaling_parameters:T8` will result in more SSTables per read. Consider using T4 for time series use cases where lower read latency is desired, and you can afford to perform additional compaction. +|=== + == Read and write amplification diff --git a/doc/modules/cassandra/pages/managing/operating/index.adoc b/doc/modules/cassandra/pages/managing/operating/index.adoc index 39dd508c4593..8068bd3dc4d9 100644 --- a/doc/modules/cassandra/pages/managing/operating/index.adoc +++ b/doc/modules/cassandra/pages/managing/operating/index.adoc @@ -14,7 +14,10 @@ * xref:cassandra:managing/operating/metrics.adoc[Monitoring metrics] * xref:cassandra:managing/operating/repair.adoc[Repair] * xref:cassandra:managing/operating/read_repair.adoc[Read repair] +* xref:cassandra:managing/operating/auto_repair.adoc[Auto Repair] * xref:cassandra:managing/operating/security.adoc[Security] * xref:cassandra:managing/operating/topo_changes.adoc[Topology changes] * xref:cassandra:managing/operating/transientreplication.adoc[Transient replication] -* xref:cassandra:managing/operating/virtualtables.adoc[Virtual tables] \ No newline at end of file +* xref:cassandra:managing/operating/virtualtables.adoc[Virtual tables] +* xref:cassandra:managing/operating/password_validation.adoc[Password validation] +* xref:cassandra:managing/operating/onboarding-to-accord.adoc[] diff --git a/doc/modules/cassandra/pages/managing/operating/metrics.adoc b/doc/modules/cassandra/pages/managing/operating/metrics.adoc index 4f3d66652c24..6fba0f811711 100644 --- a/doc/modules/cassandra/pages/managing/operating/metrics.adoc +++ b/doc/modules/cassandra/pages/managing/operating/metrics.adoc @@ -249,12 +249,19 @@ during validation. |PartitionsValidated |Histogram |Histogram over the number of partitions read during validation. -|BytesAnticompacted |Counter |How many bytes we anticompacted. +|BytesAnticompacted |Meter |How many bytes we anticompacted. -|BytesMutatedAnticompaction |Counter |How many bytes we avoided +|BytesMutatedAnticompaction |Meter |How many bytes we avoided anticompacting because the sstable was fully contained in the repaired range. +|BytesPreviewed |Meter |Estimated number of bytes that were scanned for local replica during preview repair + +|BytesPreviewedDesynchronized |Meter |Number of desynchronized bytes that were detected among all replicas during preview repair + +|TokenRangesPreviewedDesynchronized |Meter |Number of token ranges among all replicas where desynchronization was found +during preview repair. These ranges would need to be streamed during subsequent repair. + |MutatedAnticompactionGauge |Gauge |Ratio of bytes mutated vs total bytes repaired. |=== @@ -1081,6 +1088,67 @@ partitions processed per logged batch partitions processed per unlogged batch |=== +== Automated Repair Metrics + +Metrics specifc to automated repair. + +Reported name format: + +*Metric Name*:: +`org.apache.cassandra.metrics.AutoRepair.` +*JMX MBean*:: +`org.apache.cassandra.metrics:type=AutoRepair name= repairType=` + +[cols=",,",options="header",] +|=== +|Name |Type |Description +|RepairsInProgress |Gauge |Repair is in progress +on the node + +|NodeRepairTimeInSec |Gauge |Time taken to repair +the node in seconds + +|ClusterRepairTimeInSec |Gauge |Time taken to repair +the entire Cassandra cluster in seconds + +|LongestUnrepairedSec |Gauge |Time since the last repair +ran on the node in seconds + +|RepairStartLagSec|Gauge |If a repair has not run within min_repair_interval, how long past this value since +repairs last completed. Useful for determining if repairs are behind schedule. + +|SucceededTokenRangesCount |Gauge |Number of token ranges successfully repaired on the node + +|FailedTokenRangesCount |Gauge |Number of token ranges failed to repair on the node + +|SkippedTokenRangesCount |Gauge |Number of token ranges skipped +on the node + +|SkippedTablesCount |Gauge |Number of tables skipped +on the node + +|TotalMVTablesConsideredForRepair |Gauge |Number of materialized +views considered on the node + +|TotalDisabledRepairTables |Gauge |Number of tables on which +the automated repair has been disabled on the node + +|RepairTurnMyTurn |Counter |Represents the node's turn to repair + +|RepairTurnMyTurnDueToPriority |Counter |Represents the node's turn to repair +due to priority set in the automated repair + +|RepairDelayedByReplica |Counter |Represents occurrences of a node's turn being +delayed because a replica was currently taking its turn. Only relevant if +`allow_parallel_replica_repair` is false. + +|RepairDelayedBySchedule |Counter |Represents occurrences of a node's turn being +delayed because it was already being repaired in another schedule. Only relevant +if `allow_parallel_replica_repair_across_schedules` is false. + +|=== + + == JVM Metrics JVM metrics such as memory and garbage collection statistics can either diff --git a/doc/modules/cassandra/pages/managing/operating/onboarding-to-accord.adoc b/doc/modules/cassandra/pages/managing/operating/onboarding-to-accord.adoc new file mode 100644 index 000000000000..17d451500052 --- /dev/null +++ b/doc/modules/cassandra/pages/managing/operating/onboarding-to-accord.adoc @@ -0,0 +1,354 @@ += Onboarding to Accord + +== Intro + +Accord supports all existing CQL and can be enabled on a per table and +per token range within that table basis. Enabling Accord on existing tables requires a +migration process that can be done on this same per table and per range +basis that safely transitions data from being managed by Cassandra +{plus} Paxos to Cassandra {plus} Accord without downtime. + +A migration is required because Accord can't safely read data written by +non-SERIAL writes. Accord requires deterministic reads in order to have +deterministic transaction recovery and non-SERIAL writes can't be read +deterministically while still being highly available. + +This guide describes how to enable Accord and what differences to expect +when migrating your existing CQL workload to Accord. + +This guide does not cover the new transaction syntax. + +== Configuration + +=== YAML + +You need to set `accord.enabled` to true for Accord to be initialized at +startup. + +`accord.default++_++transactional++_++mode` allows you to set a default +transactional mode for newly created tables which will be used in create +table statements when no `transactional++_++mode` is specified. This +prevents accidentally creating non-Accord tables that will need +migration to Accord. + +`accord.range++_++migration` configures the behavior of altering the +`transactional++_++mode` of a table. When set to `auto` the entire ring +will be marked as migrating when the `transactional++_++mode` of a table +is altered. When set to `explicit` no ranges will be marked as migrating +when the `transactional++_++mode` of a table is altered. + +=== Table parameters + +`transactional++_++mode` can be set when a table is created +`CREATE TABLE foo WITH transactional++_++mode = ‘full'` or it can be set +by altering an existing table +`ALTER TABLE foo WITH transactional++_++mode = ‘full'`. +`transactional++_++mode` designates the target or intended transaction +system for the table and for a newly created table this will be the +transaction system that is used, but for existing tables that are being +altered the table will still need to be migrated to the target system. + +`transactional++_++mode` can be set to `full`, `mixed++_++reads`, and +`off`. `off` means that Paxos will be used and transaction statements +will be rejected. `full` means that all reads and writes will execute on +Accord. `mixed++_++reads` means that all writes will execute on Accord +along with `SERIAL` reads/writes, but non-SERIAL reads/writes will +execute on the existing eventually consistent path. Applying the +mutations for blocking read repair will always be done through Accord in +`full` in and `mixed++_++reads`. + +`transactional++_++migration++_++from` indicates whether a migration is +currently in progress although it does not indicate which ranges are +actively being migrated. This is set automatically when you create a +table or alter `transactional++_++mode` and should not be set manually. +It's possible to manually set `transactional++_++migration++_++from` to +force the completion of migration without actually running the necessary +migration steps. + +`transactional++_++migration++_++from` can be set to `none`, `off`, +`full`, and `mixed++_++reads`. `off`, `full`, and `mixed++_++reads` +correspond to the `transactional++_++mode` being migrated away from and +`none` indicates that no migration is in progress either because the +migration has completed or because the table was created with its +current `transactional++_++mode`. + +=== mixed++_++reads vs full + +When Accord is running with `transactional++_++mode` `full` it will be +able to perform asynchronous commit saving a WAN roundtrip. +`mixed++_++reads` allows non-SERIAL reads to continue to execute using +the original eventually consistent read path. `mixed++_++reads`, unlikes +`full`, always requires Accord to always synchronously commit at the +requested consistency level in order to make acknowledged Accord writes +visible to non-SERIAL reads. + +There is no `transactional++_++mode` that allows non-SERIAL writes +because they break Accord's transaction recovery resulting in +transactions appearing to have different outcomes at different nodes. + +== Accord repair + +Repair can now include an optional Accord repair that `nodetool repair` +will enable by default like Paxos repair. This repair doesn't actually +synchronize any data it just runs a transaction that checks that Accord +has resolved the state of all transactions in the repaired range up to +the point the transaction was created and that the transactions are +applied at `ALL`. + +Accord is normally doing this in the background anyways this just +ensures that it has occurred at `ALL` and hasn't experienced any delays. + +== Migration to Accord + +Migrating an existing table to run on Accord starts by altering the +table: + +.... +ALTER TABLE foo WITH transactional_mode = 'full' +.... + +After the table is altered it is required to run +`nodetool consensus++_++admin begin-migration` on ranges in the table +unless `accord.range++_++migration=auto`. + +When a range is initially marked migrating to Accord all non-SERIAL +writes will execute on Accord while `SERIAL` writes will continue to +execute on Paxos. non-SERIAL writes include regular writes, logged and +unlogged batches, hints, and read repair. Accord will perform +synchronous commit the specified consistency level requiring 2x WAN RTT. + +Tables that are migrating or are partially migrated to Accord (or back to Paxos) can be listed using +`nodetool consensus_admin list` or the sytem table `system_accord_debug.migration_state`. + +Migration to Accord consists of two phases with the first phase starting +when a range is marked migrating, and the second phase starting after a +full or incremental data repair, and then the migration completing after +a second repair which must be a full data repair {plus} Paxos repair. +While marking the range as migrating can be done automatically with +`accord.range++_++migration=auto`, there is not automation for +triggering the repairs. If you regularly run compatible repairs then the +migration will eventually complete, but if you don't run them or want +the migration to complete sooner then you will need to either trigger +them manually or invoke `nodetool consensus++_++admin finish-migration` +to trigger them. + +Any repair that is compatible will drive migration forward whether it +only covers part of the migrating range or whether is started via +`nodetool consensus++_++admin finish-migration` or some other external +process that initiates repair. Force repair with down nodes will not be +eligible to drive any type or phase of migration forward. Force repair +with all nodes up will still work. + +=== First phase + +In the first phase of migration Accord is unable to safely read +non-SERIAL writes so Paxos continues to be used for `SERIAL` operations +and Accord executes all writes and synchronously commits at the +requested consistency level in order to allow Paxos to safely read +Accord writes. Accord's read and write metrics are all counted towards the existing `Read` and `Write` scope +along with the eventually consistent operations, but you should also start to see writes also being counted in the `AccordWrite` scope. + +A data repair either incremental or full replicates all non-SERIAL +writes at `ALL` making it safe for Accord to read non-SERIAL writes that +occurred before the migration started. non-SERIAL writes that occurred +after the migration started were executed through Accord so Accord can +safely read them. + +=== Second phase + +In the second phase all reads and writes execute through Accord +(assuming `transactional++_++mode="full"`). Before an operation can execute on +Accord it is necessary to run a Paxos key repair in order to ensure that +any uncommitted Paxos transactions are committed and this check will +take at least one extra WAN RTT. Additionally Accord has to read at `QUORUM` +(where it would normally only read from a single replica in `transactional++_++mode="full"` and migration completed) because +Paxos writes are only visible at `QUORUM`. + +All reads and CAS operations in the range should start showing up in the +Accord metrics and not the existing metrics. + +Once a key has been repaired, the repaired state of the key is stored in +a small in-memory cache and system table so that it doesn't need to be +repaired again. This information is only stored at replicas of the key +so if the coordinator is not a replica it will not know that it can skip +repairing the key. Use token aware routing to avoid redundant key +repairs. + +A full repair {plus} Paxos repair is necessary to complete the second +phase of migration to Accord. An incremental repair can't currently be +used because incremental repair doesn't include the transactions that +are repaired by Paxos repair because it selects the data to include in +the repair before running the Paxos repair. + +== Migration from Accord + +Migration from Accord to Paxos occurs in a single phase and begins by +altering the table's `transactional++_++mode` to `off` and then +optionally marking ranges as migrating as discussed above. + +Once a range is marked migrating all operations in the migrating range +will stop executing on Accord. Before each operation occurs they will +have to run an Accord key repair similar to the Paxos key repair to +ensure Accord transactions for that key have committed at `QUORUM`. + +An Accord repair needs to be run on the migrating range, triggered +manually or via `nodetool finish-migration`, and once that completes +non-SERIAL operations will run using the usual eventually consistent +path and `SERIAL` operations will execute on Paxos. + +== Migration commands + +All the `nodetool` migration commands are based on new +`StorageServiceMBean` JMX methods. These methods are +`migrateConsensusProtocol`, `finishConsensusMigration`, +`listConsensusMigrations`, `getAccordManagedKeyspaces`, and +`getAccordManagedTables` and can be used by external management tools to +manage consensus migration. The existing methods for starting repairs +can also be used to start the repairs that are needed to complete +migration. + +=== nodetool consensus++_++admin list + +Invoking `nodetool` with +`consensus++_++admin list ++[<++keyspace++>++ ++<++tables++>++...++]++` +will connect to the specified node and retrieve that nodes view of what +tables are currently being migrated from transactional cluster metadata. +Tables that are not being migrated are not listed. + +The results can be printed out in several different formats using the +`format` parameter which supports `json`, `minified-json`, `yaml`, and +`minified-yaml`. + +=== nodetool consensus++_++admin begin-migration + +Invoking `nodetool` with +`consensus++_++admin begin-migration ++[<++keyspace++>++ ++<++tables++>++...++]++` +can be used to mark ranges on a table as migrating. This can only be +done after the migration has been started by altering the tables. +Marking ranges as migrating is a lightweight operation and does not +trigger the repairs that will finish the migration. + +The range to mark migrating needs to be explicitly +provided otherwise the entire ring will be marked migrating for the +specified keyspace and tables. If the entire range is marked migrating it is +only necessary to invoke `begin-migration` on one node. + +This is only needed if +`accord.default++_++transactional++_++mode=explicit` is set in +`cassandra.yaml` otherwise all the ranges will already have been marked +migrating when the alter occurred. + +Ranges that are migrating will require at least an extra WAN roundtrip +for each request that touches a migrating range because both transaction +systems may need to be used to execute the request. + +=== nodetool consensus++_++admin finish-migration + +Invoking `nodetool` with +`consensus++_++admin finish-migration ++[<++keyspace++>++ ++<++tables++>++...` +will run the repairs needed to complete the migration for the specified +ranges. If no range is specified it will default to the primary range of +the node that `nodetool` is connecting to so you can call it once on +every node to complete migration. + +When migrating from Paxos to Accord it will run an incremental data +repair and then a full data repair {plus} Paxos repair. When migrating +from Accord to Paxos it will run an Accord repair. + +== Supported consistency levels + +Migration requires support for read and write consistency levels because +Accord ends up being required to read Paxos writes at `QUORUM` and +Accord needs to execute non-SERIAL writes while Paxos is still being +used for `SERIAL` writes and thus needs to perform synchronous commit at +the requested consistency level. + +Once migration is complete the read and write consistency levels will be +ignored with transactional mode `full` . With transactional mode +`mixed++_++reads` Accord will continue to do synchronous commit and +honor the requested commit/write consistency level. + +Accord will always reject any requests to execute at unsupported +consistency levels to ensure that migration to/from Accord is always +possible. + +Supported read consistency levels are `ONE`, `QUORUM`, `SERIAL`, and +`ALL`. Supported write consistency levels are `ANY`, `ONE`, `QUORUM`, +`SERIAL`, and `ALL`. `LOCAL`, `TWO`, and `THREE` are not supported. +`ANY` is executed as an asynchronous commit similar to Paxos. + +== non-SERIAL consistency + +non-SERIAL operations are not linearizable even when executed on Accord +because Accord will continue to write data using the coordinator +generated timestamp not the transaction's timestamp. + +`USING TIMESTAMP` is allowed and the application of the operations will +occur in a linearizable order, but from the perspective of a reader the +merged result may not appear linearizable. + +Paging runs a separate transaction per page and does not produce a +linearizable result. + +Partition range reads are split into multiple transactions during +execution and will not produce a strict serializable result. +Additionally during migration there are no barriers/repairs executed +before partition range reads. When migrating from Accord to Paxos the +effective commit CL for Accord writes as viewed from partition range +reads will be `ANY`. Adding barriers/repairs before partition range +reads would cause them to time out so they are not done. + +== Batchlog and hints + +Pre-existing batchlog entries and hints will be processed during and +after migration until they are completed. If they need to be executed +through Accord they will be routed through Accord automatically. + +Logged batches that only touch Accord data will not be written to the +batch log because that functionality is redundant with Accord. Batches +that touch Accord and non-Accord data continue to use the batch log. +Before release this is likely to change so that a batch that touches +Accord data will be written entirely via Accord including both the +Accord and non-Accord data. + +Hints are not written for Accord writes although the batch log may +result in new hints because batch log entries are converted to hints +after the first retry. + +== Operations spanning Accord/non-Accord data + +Various operations can access both Accord and non-Accord managed data. +These are transparently split into parts that execute on Accord and +parts that execute outside of Accord and the results are merged. If the +splitting process races with migration then the operations is re-split +and retried without surfacing an error to the client. + +== Partition range read with LIMIT performance + +Partition range reads with a limit use more memory and CPU at the nodes +being read from and at the coordinator. Accord splits the ranges owned +by each node into smaller subranges and each subrange is owned by a +command store. The partition range read will execute at every +intersecting command store on a node and each will return `LIMIT N` +results which are sent back to the coordinator. The coordinator then +merges them and re-applies the limit. + +The additional memory and CPU will be amplified proportional to the +number of command stores which defaults to +`DatabaseDescriptor.getAvailableProcessors()`. + +== Metrics + +Accord's read and write metrics are counted under the existing `Read` and `Write` scope along with eventually consistent +operations. To see Accord specific metrics you can look at the `AccordRead` and `AccordWrite` scope. `CASRead` and `CASWrite` will not track +CAS or `SERIAL` read operations that end up running on Accord and they will instead show up in `AccordRead`/`AccordWrite` and `Read`/`Write`. + +If a single request ends up running on both systems due to misrouting it +will show up as multiple requests. Misrouted requests are counted under the `RetryDifferentSystem` meter and will show +up in `AccordRead` and `AccordWrite` if Accord was the system the request was misrouted to as well as `Read` and `Write`. +If the request was misrouted to non-Accord code then it will show up under `Read` and `Write` metrics or `CASRead` and `CASWrite` metrics. + +Hints can be misrouted and this is tracked in `HintsServiceMetrics` under the `HintsRetryDifferentSystem` meter. + +Partition range reads can also potentially generate additional Accord +transactions depending on how the reads end up having to be split due to intersection with migrating ranges. diff --git a/doc/modules/cassandra/pages/managing/operating/password_validation.adoc b/doc/modules/cassandra/pages/managing/operating/password_validation.adoc new file mode 100644 index 000000000000..f6ad0fa812a3 --- /dev/null +++ b/doc/modules/cassandra/pages/managing/operating/password_validation.adoc @@ -0,0 +1,325 @@ += Password validation and generation +:navtitle: Password validation and generation +:description: Password validation and generation - How it works, how to configure it, and more. +:keywords: CEP-24, Password, Generation, Validation, Security + +Here’s the problem: while users have always had the ability to create whatever password they wanted in Cassandra - +from straightforward to incredibly complex and everything in between–this ultimately created a noticeable security vulnerability. + +While organizations might have internal processes for generating secure passwords that adhere to their own security policies, +Cassandra itself did not have the means to enforce these standards. To make the security vulnerability worse, +if a password initially met internal security guidelines, users could later downgrade their password to +a less secure option simply by using `ALTER ROLE` statements. + +When internal password requirements are enforced for an individual, users face the additional +burden of creating compliant passwords. This inevitably involved lots of trial-and-error in attempting +to create a compliant password that satisfied complex security roles. + +But what if there was a way to have Cassandra automatically create passwords that meet all +bespoke security requirements–but without requiring manual effort from users or system operators? + +That’s why we developed https://cwiki.apache.org/confluence/pages/viewpage.action?pageId=228494146[CEP-24: Password validation/generation]. +We recognized that the complexity of secure password management could be significantly reduced (or eliminated entirely) +with the right approach –and improving both security and user experience at the same time. + +== The Goals of CEP-24 + +A Cassandra Enhancement Proposal (or CEP) is a structured process for proposing, creating, and ultimately implementing +new features for the Cassandra project. All CEPs are thoroughly vetted among the Cassandra community before +they are officially integrated into the project. + +These were the key goals we established for CEP-24: + +* Introduce a way to enforce password strength upon role creation or role alteration. +* Implement a reference implementation of a password validator which adheres to a recommended password strength policy, +to be used for Cassandra users out of the box. +* Emit a warning (and proceed) or just reject `CREATE ROLE` and `ALTER ROLE` statements when the provided +password does not meet a certain security level, based on user configuration of Cassandra. +* To be able to implement a custom password validator with its own policy, whatever it might be, +and provide a modular/pluggable mechanism to do so. +* Provide a way for Cassandra to generate a password which would pass the subsequent validation for use by the user. + +The Cassandra Password Validator and Generator builds upon an established framework in Cassandra called Guardrails, +which was originally implemented under CEP-3 (more details https://cwiki.apache.org/confluence/display/CASSANDRA/CEP-3%3A+Guardrails[here]). + +== Implementation and configuration + +The password validator implements a custom guardrail introduced as part of CEP-24. A custom guardrail can validate and +generate values of arbitrary types when properly implemented. In the CEP-24 context, +the password guardrail provides `CassandraPasswordValidator` by extending `ValueValidator`, +while passwords are generated by `CassandraPasswordGenerator` by extending `ValueGenerator`. +Both components work with passwords as `String` type values. + +Password validation and generation are configured in `cassandra.yaml` file under the `password_validator` section. +Let’s explore the key configuration properties available. + +First, the `class_name` and `generator_class_name` parameters +specify which validator and generator classes will be used to validate and generate passwords respectively. + +Cassandra ships `CassandraPasswordValidator` and `CassandraPasswordGenerator` out of the box. +However, if a particular enterprise decides that they need something very custom, they are free to implement their own validators, +put it on Cassandra’s class path and reference it in the configuration behind `class_name` parameter. Same for the validator. + +CEP-24 provides implementations of the validator and generator that the Cassandra team believes will satisfy +the requirements of most users. These default implementations address common password security needs. +However, the framework is designed with flexibility in mind, allowing organizations to implement custom validation +and generation rules that align with their specific security policies and business requirements. + +---- +password_validator: +# Implementation class of a validator. When not in form of FQCN, the +# package name org.apache.cassandra.db.guardrails.validators is prepended. +# By default, there is no validator. + class_name: CassandraPasswordValidator +# Implementation class of related generator which generates values +# which are valid when tested against this validator. +# When not in form of FQCN, the package name +# org.apache.cassandra.db.guardrails.generators is prepended. +# By default, there is no generator. + generator_class_name: CassandraPasswordGenerator +---- + +Password quality might be looked at as the number of _characteristics_ a password satisfies. +There are two levels for any password to be evaluated – warning level and failure level. +Warning and failure levels nicely fit into how Guardrails act. Every guardrail has warning and failure thresholds. +Based on what value a specific guardrail evaluates, it will either emit a warning to a user that its usage +is discouraged (but ultimately allowed), or it will fail to be set altogether. + +This same principle applies to password evaluation – each password is assessed against both warning and failure thresholds. +These thresholds are determined by counting the characteristics present in the password. + +The system evaluates five key characteristics: + +* the password’s overall length +* the number of uppercase characters +* the number of lowercase characters +* the number of special characters +* and the number of digits. + +A comprehensive password security policy can be enforced by configuring minimum requirements for each of these characteristics. + +---- + # There are four characteristics (excluding password's length): + # upper-case, lower-case, special character and digit. + # If this value is set e.g. to 3, a password has to + # consist of 3 out of 4 characteristics. + # For example, it has to contain at least 2 upper-case characters, + # 2 lower-case, and 2 digits to pass, + # but it does not have to contain any special characters. + # If the number of characteristics found in the password is + # less than or equal to this number, it will emit a warning. + characteristic_warn: 3 + # If the number of characteristics found in the password is + #less than or equal to this number, it will emit a failure. + characteristic_fail: 2 +---- + +Next, there are configuration parameters for each characteristic which count towards warning or failure: +---- + +# If the password is shorter than this value, +# the validator will emit a warning. +length_warn: 12 +# If a password is shorter than this value, +# the validator will emit a failure. +length_fail: 8 +# If a password does not contain at least n +# upper-case characters, the validator will emit a warning. +upper_case_warn: 2 +# If a password does not contain at least +# n upper-case characters, the validator will emit a failure. +upper_case_fail: 1 +# If a password does not contain at least +# n lower-case characters, the validator will emit a warning. +lower_case_warn: 2 +# If a password does not contain at least +# n lower-case characters, the validator will emit a failure. +lower_case_fail: 1 +# If a password does not contain at least +# n digits, the validator will emit a warning. +digit_warn: 2 +# If a password does not contain at least +# n digits, the validator will emit a failure. +digit_fail: 1 +# If a password does not contain at least +# n special characters, the validator will emit a warning. +special_warn: 2 +# If a password does not contain at least +# n special characters, the validator will emit a failure. +special_fail: 1 +---- + +It is also possible to say that illegal sequences of certain length found in a password will be forbidden: + +---- +# If a password contains illegal sequences that are at least this long, it is invalid. +# Illegal sequences might be either alphabetical (form 'abcde'), +# numerical (form '34567'), or US qwerty (form 'asdfg') as well +# as sequences from supported character sets. +# The minimum value for this property is 3, +# by default it is set to 5. +illegal_sequence_length: 5 +---- + +Lastly, it is also possible to configure a dictionary of passwords to check against. +That way, we will be checking against password dictionary attacks. +It is up to the operator of a cluster to configure the password dictionary: + +---- +# Dictionary to check the passwords against. Defaults to no dictionary. +# Whole dictionary is cached into memory. Use with caution with relatively big dictionaries. +# Entries in a dictionary, one per line, have to be sorted per String's compareTo contract. +dictionary: /path/to/dictionary/file +---- + +Now that we have gone over all the configuration parameters, let’s take a look at an example of how password +validation and generation look in practice. + +=== Validation and generation of a password + +Consider a scenario where a Cassandra super-user (such as the default ‘cassandra’ role) attempts +to create a new role named ‘alice’. + +---- +cassandra@cqlsh> CREATE ROLE alice WITH PASSWORD = 'cassandraisadatabase' AND LOGIN = true; +InvalidRequest: Error from server: code=2200 [Invalid query] +message="Password was not set as it violated configured password +strength policy. To fix this error, the following has to be resolved: +Password contains the dictionary word 'cassandraisadatabase'. You may also use +'GENERATED PASSWORD' upon role creation or alteration." +---- + +The password is in the dictionary. When an operator sees this, +they will try to fix it by creating some random password not in dictionary: + +---- +cassandra@cqlsh> CREATE ROLE alice WITH PASSWORD = 'T8aum3?' AND LOGIN = true; +InvalidRequest: Error from server: code=2200 [Invalid query] +message="Password was not set as it violated configured password strength +policy. To fix this error, the following has to be resolved: Password +must be 8 or more characters in length. You may also use +'GENERATED PASSWORD' upon role creation or alteration." +---- + +Password is not in the dictionary, but it is not long enough. In the following example, +the password is finally set, but it is not considered to be secure enough. +It satisfies the minimum requirements but our validator identified that not all characteristics were met. + +---- +cassandra@cqlsh> CREATE ROLE alice WITH PASSWORD = 'mYAtt3mp' AND LOGIN = true; + +Warnings: + +Guardrail password violated: Password was set, however it might not be +strong enough according to the configured password strength policy. +To fix this warning, the following has to be resolved: Password must be 12 or more +characters in length. Passwords must contain 2 or more digit characters. Password +must contain 2 or more special characters. Password matches 2 of 4 character rules, +but 4 are required. You may also use 'GENERATED PASSWORD' upon role creation or alteration. +---- + +When an operator saw this, they noticed the note about the `GENERATED PASSWORD` clause which will +generate a password automatically without an operator needing to invent it on their own. +This is a lot of times, as shown, a cumbersome process better to be left on a machine. + +---- +cassandra@cqlsh> ALTER ROLE alice WITH GENERATED PASSWORD; + +generated_password +------------------ + R7tb33?.mcAX +---- + +The generated password shown above will satisfy all the rules we have configured in `cassandra.yaml` automatically. +Every generated password will satisfy all the rules. This is clearly an advantage over manual password generation. + +When the CQL statement is executed, it will be visible in the CQLSH history (`HISTORY` command or in `cqlsh_history` file) +but the password will not be logged, hence it cannot leak. It will also not appear in any auditing logs. +Previously, Cassandra had to obfuscate such statements. This is not necessary anymore. + +We can create a role with generated password like this: + +---- +cassandra@cqlsh> CREATE ROLE alice WITH GENERATED PASSWORD AND LOGIN = true; +---- + +or by `CREATE USER`: + +---- +cassandra@cqlsh> CREATE USER alice WITH GENERATED PASSWORD; +---- + +When a password is generated for `alice` she can log in: + +---- +$ cqlsh -u alice -p R7tb33?.mcAX +... +alice@cqlsh> +---- + +NOTE: It is recommended to save password to ~/.cassandra/credentials, for example: + +---- +[PlainTextAuthProvider] +username = cassandra +password = R7tb33?.mcAX +---- + +and by setting auth_provider in `~/.cassandra/cqlshrc` + +---- +[auth_provider] +module = cassandra.auth +classname = PlainTextAuthProvider +---- + +It is also possible to configure password validators in such a way that a user does not see why a password failed. +This is driven by configuration property for `password_validator` called `detailed_messages`. When set to `false`, +the violations will be very brief: + +---- +alice@cqlsh> ALTER ROLE alice WITH PASSWORD = 'myattempt'; + +InvalidRequest: Error from server: code=2200 [Invalid query] +message="Password was not set as it violated configured password strength policy. +You may also use 'GENERATED PASSWORD' upon role creation or alteration." +---- + +Several potential enhancements to password generation and validation could be implemented in future releases. +One promising extension would be validating new passwords against previous values. +This would prevent users from reusing passwords until after they’ve created a specified number of different passwords. +A related enhancement could include restricting how frequently users can change their passwords, +preventing rapid cycling through passwords to circumvent history-based restrictions. + +These features, while valuable for comprehensive password security, were considered beyond the scope of the initial +implementation and may be addressed in future updates. + +=== Runtime configuration + +Since this solution is based on guardrails which are configurable via JMX in runtime, same hold for +password validator, also configured via `GuardrailsMBean` as any other guardrails. There are two methods exposed: + +* `Map getPasswordValidatorConfig()` - gets password validator configuration +* `void reconfigurePasswordValidator(Map config)` - reconfigures the password validator by reading +and parsing the configuration from the provided map. Reconfiguration of password validator in runtime is considered +to be very sensitive operation. If an operator evaluates the reconfiguration in runtime is not allowed, they +might set `password_validator_reconfiguration_enabled` to `false` in `cassandra.yaml` to disable it. + +=== Diagnostic events + +If diagnostic event's framework is enabled and consumers are subscribed to them, diagnostic events about +warning and failures to generate a password will be published. + +=== Final thoughts and next steps + +The Cassandra Password Validator and Generator implemented under CEP-24 +represents a significant improvement in Cassandra’s security posture. + +By providing robust, configurable password policies with built-in enforcement mechanisms and +convenient password generation capabilities, organizations can now ensure compliance with their +security standards directly at the database level. This not only strengthens overall system security +but also improves the user experience by eliminating guesswork around password requirements. + +As Cassandra continues to evolve as an enterprise-ready database solution, +these security enhancements demonstrate a commitment to meeting the demanding +security requirements of modern applications while maintaining the flexibility that makes Cassandra so powerful. \ No newline at end of file diff --git a/doc/modules/cassandra/pages/managing/operating/repair.adoc b/doc/modules/cassandra/pages/managing/operating/repair.adoc index 1823a6d4ef95..d7eaba171125 100644 --- a/doc/modules/cassandra/pages/managing/operating/repair.adoc +++ b/doc/modules/cassandra/pages/managing/operating/repair.adoc @@ -29,10 +29,21 @@ for syncing up missed writes, but it doesn't protect against things like disk corruption, data loss by operator error, or bugs in Cassandra. For this reason, full repairs should still be run occasionally. -== Usage and Best Practices +== Automated Repair Scheduling -Since repair can result in a lot of disk and network io, it's not run -automatically by Cassandra. It is run by the operator via nodetool. +Since repair can result in a lot of disk and network io, it has +traditionally not been run automatically by Cassandra. + +In the latest version of Cassandra, a new feature called +xref:managing/operating/auto_repair.adoc[auto repair] was introduced to +allow Cassandra to submit and manage repairs automatically on a schedule. + +The introduction of this feature does not interfere with existing repair +functionality enabled via nodetool. + +== Submitting Repairs Using Nodetool + +Repairs can also be run by the operator via nodetool. Incremental repair is the default and is run with the following command: @@ -63,7 +74,7 @@ nodetool repair [options] ---- -The repair command repairs token ranges only on the node being repaired; it does not repair the whole cluster. +The repair command repairs token ranges only on the node being repaired; it does not repair the whole cluster. By default, repair operates on all token ranges replicated by the node on which repair is run, causing duplicate work when running it on every node. Avoid duplicate work by using the `-pr` flag to repair only the "primary" ranges on a node. Do a full cluster repair by running the `nodetool repair -pr` command on each node in each datacenter in the cluster, until all of the nodes and datacenters are repaired. diff --git a/doc/modules/cassandra/pages/managing/operating/security.adoc b/doc/modules/cassandra/pages/managing/operating/security.adoc index cdc76a625dbb..ee846cb13d57 100644 --- a/doc/modules/cassandra/pages/managing/operating/security.adoc +++ b/doc/modules/cassandra/pages/managing/operating/security.adoc @@ -393,6 +393,12 @@ See also: xref:cassandra:developing/cql/security.adoc#grant-permission[`GRANT PE xref:cassandra:developing/cql/security.adoc#grant-all[`GRANT ALL`] and xref:cassandra:developing/cql/security.adoc#revoke-permission[`REVOKE PERMISSION`]. +== Password validation + +If you are interested into the application of a certain security policy for password strength for +user passwords, you are welcome to read about it more in xref:cassandra:managing/operating/password_validation.adoc[here] +which implements https://cwiki.apache.org/confluence/pages/viewpage.action?pageId=228494146[CEP-24]. + [[auth-caching]] == Caching diff --git a/doc/modules/cassandra/pages/managing/operating/topo_changes.adoc b/doc/modules/cassandra/pages/managing/operating/topo_changes.adoc index 9c1e9519c63d..2bd85519b54c 100644 --- a/doc/modules/cassandra/pages/managing/operating/topo_changes.adoc +++ b/doc/modules/cassandra/pages/managing/operating/topo_changes.adoc @@ -97,7 +97,7 @@ in `nodetool netstats`. The replacing node will now start to bootstrap the data from the rest of the nodes in the cluster. A replacing node will only receive writes during the bootstrapping phase if it has a different ip address to the -node that is being replaced. (See CASSANDRA-8523 and CASSANDRA-12344) +node that is being replaced. ( See https://issues.apache.org/jira/browse/CASSANDRA-8523[CASSANDRA-8523] and https://issues.apache.org/jira/browse/CASSANDRA-12344[CASSANDRA-12344] ) Once the bootstrapping is complete the node will be marked "UP". diff --git a/doc/modules/cassandra/pages/managing/operating/virtualtables.adoc b/doc/modules/cassandra/pages/managing/operating/virtualtables.adoc index 362308372ce4..c87699ed77c8 100644 --- a/doc/modules/cassandra/pages/managing/operating/virtualtables.adoc +++ b/doc/modules/cassandra/pages/managing/operating/virtualtables.adoc @@ -72,6 +72,8 @@ cqlsh> select * from system_metrics.all_groups ; group_name | virtual_table -------------------+--------------------------- + AccordCoordinator | accord_coordinator_group + AccordReplica | accord_replica_group Batch | batch_group BufferPool | buffer_pool_group CIDRAuthorizer | cidr_authorizer_group @@ -98,6 +100,7 @@ cqlsh> select * from system_metrics.all_groups ; Paxos | paxos_group ReadRepair | read_repair_group Repair | repair_group + RouteIndex | route_index_group Storage | storage_group StorageProxy | storage_proxy_group Streaming | streaming_group diff --git a/doc/modules/cassandra/pages/managing/tools/sstable/sstableverify.adoc b/doc/modules/cassandra/pages/managing/tools/sstable/sstableverify.adoc index 061edf4978de..a807078a2c40 100644 --- a/doc/modules/cassandra/pages/managing/tools/sstable/sstableverify.adoc +++ b/doc/modules/cassandra/pages/managing/tools/sstable/sstableverify.adoc @@ -9,7 +9,7 @@ results will occur. Note: the script does not verify that Cassandra is stopped. == WARNING -See CASSANDRA-9947 and CASSANDRA-17017 for discussion around risks with this tool. Specifically: "We mark sstables that fail verification as unrepaired, but that's not going to do what you think. What it means is that the local node will use that sstable in the next repair, but other nodes will not. So all we'll end up doing is streaming whatever data we can read from it, to the other replicas. If we could magically mark whatever sstables correspond on the remote nodes, to the data in the local sstable, that would work, but we can't." +See https://issues.apache.org/jira/browse/CASSANDRA-9947[CASSANDRA-9947] and https://issues.apache.org/jira/browse/CASSANDRA-17017[CASSANDRA-17017] for discussion around risks with this tool. Specifically: "We mark sstables that fail verification as unrepaired, but that's not going to do what you think. What it means is that the local node will use that sstable in the next repair, but other nodes will not. So all we'll end up doing is streaming whatever data we can read from it, to the other replicas. If we could magically mark whatever sstables correspond on the remote nodes, to the data in the local sstable, that would work, but we can't." This tool requires the use of a -f or --force flag to indicate that the user understands the risks and would like to attempt its usage anyway. @@ -23,7 +23,7 @@ sstableverify
|-e, --extended |extended verification |-h, --help |display this help message |-v, --verbose |verbose output -|-f, --force |allow use of tool (see CASSANDRA-17017 for risks) +|-f, --force |allow use of tool (see https://issues.apache.org/jira/browse/CASSANDRA-17017[CASSANDRA-17017] for risks) |=== == Basic Verification diff --git a/doc/modules/cassandra/pages/reference/cql-commands/create-index.adoc b/doc/modules/cassandra/pages/reference/cql-commands/create-index.adoc index 71994d86e194..ae95c33a1d50 100644 --- a/doc/modules/cassandra/pages/reference/cql-commands/create-index.adoc +++ b/doc/modules/cassandra/pages/reference/cql-commands/create-index.adoc @@ -240,7 +240,7 @@ include::cassandra:example$RESULTS/sai/select_all_from_cyclist_career_teams-team You can create an index on xref:cassandra:developing/cql/indexing/2i/_2i-create-on-collection.adoc[map collection keys]. If an index of the map values of the collection exists, drop that index before creating an index on the map collection keys. -Assume a cyclist table contains this map data where `nation is the map key and `Canada` is the map value`: +Assume a cyclist table contains this map data where `nation` is the map key and `Canada` is the map value: [source,no-highlight] ---- @@ -471,4 +471,4 @@ SELECT result:: include::cassandra:example$RESULTS/sai/race_starts-queries.result[] ---- -- -==== \ No newline at end of file +==== diff --git a/doc/modules/cassandra/pages/reference/cql-commands/create-table-examples.adoc b/doc/modules/cassandra/pages/reference/cql-commands/create-table-examples.adoc index 47ab8497344b..d3487202b56e 100644 --- a/doc/modules/cassandra/pages/reference/cql-commands/create-table-examples.adoc +++ b/doc/modules/cassandra/pages/reference/cql-commands/create-table-examples.adoc @@ -65,7 +65,7 @@ CDC logging must be enabled in cassandra.yaml. ==== Before enabling CDC logging, have a plan for moving and consuming the log information. After the disk space limit is reached, writes to CDC-enabled tables are rejected until more space is freed. -See https://docs.datastax.com/en/dse/6.8/dse-admin/datastax_enterprise/config/configCassandra_yaml.html#configCassandra_yaml__cdcSpaceSection[Change-data-capture (CDC) space settings] for information about available CDC settings. +See https://docs.datastax.com/en/dse/6.8/dse-admin/datastax_enterprise/config/configCassandra_yaml.html#cdcSpaceSection[Change-data-capture (CDC) space settings] for information about available CDC settings. ==== == Storing data in descending order diff --git a/doc/modules/cassandra/pages/reference/java17.adoc b/doc/modules/cassandra/pages/reference/java17.adoc index 1ec3aab36e1e..645711a66881 100644 --- a/doc/modules/cassandra/pages/reference/java17.adoc +++ b/doc/modules/cassandra/pages/reference/java17.adoc @@ -8,7 +8,7 @@ the vertical axis and the run version is along the horizontal axis. [width="68%",cols="34%,30%,36%",] |=== | | Java 11 (Run) | Java 17 (Run) -| Java 11 (Build) | Supported | Experimental Support +| Java 11 (Build) | Supported | Supported | Java 17(Build) | Not Supported | Experimental in CI |=== diff --git a/doc/modules/cassandra/pages/reference/static.adoc b/doc/modules/cassandra/pages/reference/static.adoc index afa193cdbb33..d27adc76ea5f 100644 --- a/doc/modules/cassandra/pages/reference/static.adoc +++ b/doc/modules/cassandra/pages/reference/static.adoc @@ -2,7 +2,7 @@ :description: In a table that uses clustering columns, non-clustering columns can be declared static in the table definition. Static column values are shared among the rows in the partition. -In a table that uses https://cassandra.apache.org/_/glossary.html#clustering-column[clustering columns], non-clustering columns can be declared static in the table definition. +In a table that uses https://cassandra.apache.org/\_/glossary.html#clustering-column[clustering columns], non-clustering columns can be declared static in the table definition. https://cassandra.apache.org/_/glossary.html#static-column[Static columns] are only static within a given partition. In the following example, the `flag` column is static: diff --git a/doc/modules/cassandra/pages/troubleshooting/use_tools.adoc b/doc/modules/cassandra/pages/troubleshooting/use_tools.adoc index e458b55919aa..ed72f5433d72 100644 --- a/doc/modules/cassandra/pages/troubleshooting/use_tools.adoc +++ b/doc/modules/cassandra/pages/troubleshooting/use_tools.adoc @@ -18,10 +18,10 @@ stacks. [arabic] . By default Cassandra ships with `-XX:+PerfDisableSharedMem` set to -prevent long pauses (see `CASSANDRA-9242` and `CASSANDRA-9483` for +prevent long pauses (see https://issues.apache.org/jira/browse/CASSANDRA-9242[CASSANDRA-9242] and https://issues.apache.org/jira/browse/CASSANDRA-9483[CASSANDRA-9483] for details). If you want to use JVM tooling you can instead have `/tmp` mounted on an in memory `tmpfs` which also effectively works around -`CASSANDRA-9242`. +https://issues.apache.org/jira/browse/CASSANDRA-9242[CASSANDRA-9242] . . Make sure you run the tools as the same user as Cassandra is running as, e.g. if the database is running as `cassandra` the tool also has to be run as `cassandra`, e.g. via `sudo -u cassandra `. diff --git a/doc/modules/cassandra/pages/vector-search/data-modeling.adoc b/doc/modules/cassandra/pages/vector-search/data-modeling.adoc index 232873ea196c..d9a862377de9 100644 --- a/doc/modules/cassandra/pages/vector-search/data-modeling.adoc +++ b/doc/modules/cassandra/pages/vector-search/data-modeling.adoc @@ -1,4 +1,4 @@ -= Data Modeling += Vector Search : Data Modeling As you develop AI and Machine Learning (ML) applications using Vector Search, here are some data modeling considerations. These factors help effectively leverage vector search to produce accurate and efficient search responses within your application. @@ -162,4 +162,4 @@ While the vector embeddings can replace or augment some functions of a tradition * Vector embeddings are not human-readable. Embeddings are not recommended when seeking to directly retrieve data from a table. -* The model might not be able to capture all relevant information from the data, leading to incorrect or incomplete results. \ No newline at end of file +* The model might not be able to capture all relevant information from the data, leading to incorrect or incomplete results. diff --git a/doc/modules/cassandra/partials/cql-syntax-legend.adoc b/doc/modules/cassandra/partials/cql-syntax-legend.adoc index 0064826f66ce..500b5b71688f 100644 --- a/doc/modules/cassandra/partials/cql-syntax-legend.adoc +++ b/doc/modules/cassandra/partials/cql-syntax-legend.adoc @@ -41,7 +41,7 @@ Use single quotation marks to preserve upper case. Braces (`{ }`) enclose map collections or key value pairs. A colon separates the key and the value. -| `<,>` +| `< , >` | Set, list, map, or tuple. Angle brackets ( `< >` ) enclose data types in a set, list, map, or tuple. Separate the data types with a comma. @@ -60,4 +60,4 @@ This syntax is useful when arguments might be mistaken for command line options. | `@=''` | Search CQL only: Identify the entity and literal value to overwrite the XML element in the schema and solrConfig files. -|=== \ No newline at end of file +|=== diff --git a/doc/modules/cassandra/partials/table-column-definitions.adoc b/doc/modules/cassandra/partials/table-column-definitions.adoc index 9abe065f6418..2a5e3e40df5a 100644 --- a/doc/modules/cassandra/partials/table-column-definitions.adoc +++ b/doc/modules/cassandra/partials/table-column-definitions.adoc @@ -8,7 +8,7 @@ Each column is defined using the following syntax: `+column_name cql_type_defini *Restriction:* * A table must have at least one `PRIMARY KEY`. -* When `PRIMARY KEY` is at the end of a column definition, that column is the only primary key for the table, and is defined as the https://cassandra.apache.org/_/glossary.html#[partition-key][partition key]. +* When `PRIMARY KEY` is at the end of a column definition, that column is the only primary key for the table, and is defined as the https://cassandra.apache.org/_/glossary.html#partition-key[partition key]. * A static column cannot be a primary key. * Primary keys can include frozen collections. diff --git a/doc/modules/cassandra/partials/table-properties.adoc b/doc/modules/cassandra/partials/table-properties.adoc index d15ad30eb9aa..9aa6f16d3da5 100644 --- a/doc/modules/cassandra/partials/table-properties.adoc +++ b/doc/modules/cassandra/partials/table-properties.adoc @@ -90,7 +90,7 @@ Tombstoned records within the grace period are excluded from xref:managing/opera ==== + In a single-node cluster, this property can safely be set to zero. -You can also reduce this value for tables whose data is not explicitly deleted -- for example, tables containing only data with https://cassandra.apache.org/_/glossary.html#gloss_ttl[TTL] set, or tables with `default_time_to_live` set. +You can also reduce this value for tables whose data is not explicitly deleted -- for example, tables containing only data with https://cassandra.apache.org/_/glossary.html#ttl[TTL] set, or tables with `default_time_to_live` set. However, if you lower the `gc_grace_seconds` value, consider its interaction with these operations: + @@ -127,7 +127,7 @@ The max_index_interval is the sparsest possible sampling in relation to memory p *speculative_retry* :: Configures https://www.datastax.com/dev/blog/rapid-read-protection-in-cassandra-2-0-2[rapid read protection]. -Normal read requests are sent to just enough replica nodes to satisfy the https://cassandra.apache.org/_/glossary.html#gloss_consistency_level[consistency level]. +Normal read requests are sent to just enough replica nodes to satisfy the https://cassandra.apache.org/_/glossary.html#consistency-level[consistency level]. In rapid read protection, extra read requests are sent to other replicas, even after the consistency level has been met. The speculative retry property specifies the trigger for these extra read requests. + diff --git a/doc/native_protocol_v3.spec b/doc/native_protocol_v3.spec index 30881c949790..a104993367f4 100644 --- a/doc/native_protocol_v3.spec +++ b/doc/native_protocol_v3.spec @@ -228,7 +228,7 @@ Table of Contents representing the port. [consistency] A consistency level specification. This is a [short] representing a consistency level with the following - correspondance: + correspondence: 0x0000 ANY 0x0001 ONE 0x0002 TWO @@ -267,7 +267,7 @@ Table of Contents The body is a [string map] of options. Possible options are: - "CQL_VERSION": the version of CQL to use. This option is mandatory and - currenty, the only version supported is "3.0.0". Note that this is + currently, the only version supported is "3.0.0". Note that this is different from the protocol version. - "COMPRESSION": the compression algorithm to use for frames (See section 5). This is optional, if not specified no compression will be used. @@ -316,8 +316,8 @@ Table of Contents values are provided. Those value are used for bound variables in the query. Optionally, if the 0x40 flag is present, each value will be preceded by a [string] name, representing the name of - the marker the value must be binded to. This is optional, and - if not present, values will be binded by position. + the marker the value must be bound to. This is optional, and + if not present, values will be bound by position. 0x02: Skip_metadata. If present, the Result Set returned as a response to that query (if any) will have the NO_METADATA flag (see Section 4.2.5.2). @@ -332,8 +332,8 @@ Table of Contents started (See Section 8 for more details). 0x10: With serial consistency. If present, should be present. is the [consistency] level for the - serial phase of conditional updates. That consitency can only be - either SERIAL or LOCAL_SERIAL and if not present, it defaults to + serial phase of conditional updates. Consistency can be + either SERIAL or LOCAL_SERIAL, if not present, it defaults to SERIAL. This option will be ignored for anything else that a conditional update/insert. 0x20: With default timestamp. If present, should be present. @@ -400,8 +400,8 @@ Table of Contents flags are, given there mask: 0x10: With serial consistency. If present, should be present. is the [consistency] level for the - serial phase of conditional updates. That consitency can only be - either SERIAL or LOCAL_SERIAL and if not present, it defaults to + serial phase of conditional updates. Consistency can be + either SERIAL or LOCAL_SERIAL, if not present, it defaults to SERIAL. This option will be ignored for anything else that a conditional update/insert. 0x20: With default timestamp. If present, should be present. @@ -435,8 +435,8 @@ Table of Contents - is the [consistency] level for the operation. - is only present if the 0x10 flag is set. In that case, is the [consistency] level for the serial phase of - conditional updates. That consitency can only be either SERIAL or - LOCAL_SERIAL and if not present will defaults to SERIAL. This option will + conditional updates. Consistency can be either SERIAL or + LOCAL_SERIAL, if not present, it defaults to SERIAL. This option will be ignored for anything else that a conditional update/insert. The server will respond with a RESULT message. @@ -461,7 +461,7 @@ Table of Contents This section describes the content of the frame body for the different responses. Please note that to make room for future evolution, clients should - support extra informations (that they should simply discard) to the one + support extra information (that they should simply discard) to the one described in this document at the end of the frame body. 4.2.1. ERROR @@ -488,7 +488,7 @@ Table of Contents The authentication is SASL based and thus consists on a number of server challenges (AUTH_CHALLENGE, Section 4.2.7) followed by client responses - (AUTH_RESPONSE, Section 4.1.2). The Initial exchange is however boostrapped + (AUTH_RESPONSE, Section 4.1.2). The Initial exchange is however bootstrapped by an initial client response. The details of that exchange (including how much challenge-response pair are required) are specific to the authenticator in use. The exchange ends when the server sends an AUTH_SUCCESS message or @@ -541,7 +541,7 @@ Table of Contents [][?...] where: - is an [int]. The bits of provides information on the - formatting of the remaining informations. A flag is set if the bit + formatting of the remaining information. A flag is set if the bit corresponding to its `mask` is set. Supported flags are, given there mask: 0x0001 Global_tables_spec: if set, only one table spec (keyspace @@ -555,7 +555,7 @@ Table of Contents this query (See Section 8 for more details). 0x0004 No_metadata: if set, the is only composed of these , the and optionally the - (depending on the Has_more_pages flage) but + (depending on the Has_more_pages flag) but no other information (so no nor ). This will only ever be the case if this was requested during the query (see QUERY and RESULT messages). @@ -567,8 +567,8 @@ Table of Contents (unique) keyspace name and table name the columns return are of. - specifies the columns returned in the query. There is such column specifications that are composed of: - ()? - The initial and are two [string] are only present + ()? + The initial and are two [string] are only present if the Global_tables_spec flag is not set. The is a [string] and is an [option] that correspond to the description (what this description is depends a bit on the context: in results to @@ -608,7 +608,7 @@ Table of Contents - is a [string] representing the keyspace name this UDT is part of. - is a [string] representing the UDT name. - - is a [short] reprensenting the number of fields of + - is a [short] representing the number of fields of the UDT, and thus the number of pair following - is a [string] representing the name of the @@ -657,7 +657,7 @@ Table of Contents Note that prepared query ID return is global to the node on which the query has been prepared. It can be used on any connection to that node and this - until the node is restarted (after which the query must be reprepared). + until the node is restarted (after which the query must be re-prepared). 4.2.5.5. Schema_change @@ -759,7 +759,7 @@ Table of Contents bytes). - snappy (https://code.google.com/p/snappy/). This compression might not be available as it depends on a native lib (server-side) that might not be - avaivable on some installation. + available on some installation. 6. Data Type Serialization Formats @@ -981,7 +981,7 @@ Table of Contents is an [int] representing the number of replica whose acknowledgement is required to achieve . is a [string] that describe the type of the write - that timeouted. The value of that string can be one + that timed out. The value of that string can be one of: - "SIMPLE": the write was a non-batched non-counter write. @@ -993,10 +993,10 @@ Table of Contents batch. Not batch log write has been attempted. - "COUNTER": the write was a counter write (batched or not). - - "BATCH_LOG": the timeout occured during the + - "BATCH_LOG": the timeout occurred during the write to the batch log when a (logged) batch write was requested. - - "CAS": the timeout occured during the Compare And Set write/update. + - "CAS": the timeout occurred during the Compare And Set write/update. 0x1200 Read_timeout: Timeout exception during a read request. The rest of the ERROR message body will be diff --git a/doc/native_protocol_v4.spec b/doc/native_protocol_v4.spec index 6def73721d98..cd55137a8f57 100644 --- a/doc/native_protocol_v4.spec +++ b/doc/native_protocol_v4.spec @@ -245,7 +245,7 @@ Table of Contents representing the port. [consistency] A consistency level specification. This is a [short] representing a consistency level with the following - correspondance: + correspondence: 0x0000 ANY 0x0001 ONE 0x0002 TWO @@ -366,8 +366,8 @@ Table of Contents started (See Section 8 for more details). 0x10: With serial consistency. If set, should be present. is the [consistency] level for the - serial phase of conditional updates. That consitency can only be - either SERIAL or LOCAL_SERIAL and if not present, it defaults to + serial phase of conditional updates. Consistency can be + SERIAL or LOCAL_SERIAL, if not present, it defaults to SERIAL. This option will be ignored for anything else other than a conditional update/insert. 0x20: With default timestamp. If set, should be present. @@ -432,8 +432,8 @@ Table of Contents flags are, given their mask: 0x10: With serial consistency. If set, should be present. is the [consistency] level for the - serial phase of conditional updates. That consistency can only be - either SERIAL or LOCAL_SERIAL and if not present, it defaults to + serial phase of conditional updates. Consistency can be + either SERIAL or LOCAL_SERIAL, and if not present, it defaults to SERIAL. This option will be ignored for anything else other than a conditional update/insert. 0x20: With default timestamp. If set, should be present. @@ -467,8 +467,8 @@ Table of Contents - is the [consistency] level for the operation. - is only present if the 0x10 flag is set. In that case, is the [consistency] level for the serial phase of - conditional updates. That consitency can only be either SERIAL or - LOCAL_SERIAL and if not present will defaults to SERIAL. This option will + conditional updates. Consistency can be SERIAL or + LOCAL_SERIAL, if not present, it defaults to SERIAL. This option will be ignored for anything else other than a conditional update/insert. The server will respond with a RESULT message. @@ -493,7 +493,7 @@ Table of Contents This section describes the content of the frame body for the different responses. Please note that to make room for future evolution, clients should - support extra informations (that they should simply discard) to the one + support extra information (that they should simply discard) to the one described in this document at the end of the frame body. 4.2.1. ERROR @@ -521,7 +521,7 @@ Table of Contents The authentication is SASL based and thus consists of a number of server challenges (AUTH_CHALLENGE, Section 4.2.7) followed by client responses - (AUTH_RESPONSE, Section 4.1.2). The initial exchange is however boostrapped + (AUTH_RESPONSE, Section 4.1.2). The initial exchange is however bootstrapped by an initial client response. The details of that exchange (including how many challenge-response pairs are required) are specific to the authenticator in use. The exchange ends when the server sends an AUTH_SUCCESS message or @@ -600,8 +600,8 @@ Table of Contents (unique) keyspace name and table name the columns belong to. - specifies the columns returned in the query. There are such column specifications that are composed of: - ()? - The initial and are two [string] and are only present + ()? + The initial and are two [string] and are only present if the Global_tables_spec flag is not set. The is a [string] and is an [option] that corresponds to the description (what this description is depends a bit on the context: in results to @@ -713,8 +713,8 @@ Table of Contents - specifies the bind markers in the prepared statement. There are such column specifications, each with the following format: - ()? - The initial and are two [string] that are only + ()? + The initial and are two [string] that are only present if the Global_tables_spec flag is not set. The field is a [string] that holds the name of the bind marker (if named), or the name of the column, field, or expression that the bind marker @@ -737,7 +737,7 @@ Table of Contents Note that the prepared query ID returned is global to the node on which the query has been prepared. It can be used on any connection to that node - until the node is restarted (after which the query must be reprepared). + until the node is restarted (after which the query must be re-prepared). 4.2.5.5. Schema_change @@ -754,7 +754,7 @@ Table of Contents 4.2.6. EVENT An event pushed by the server. A client will only receive events for the - types it has REGISTERed to. The body of an EVENT message will start with a + types it has REGISTER-ed to. The body of an EVENT message will start with a [string] representing the event type. The rest of the message depends on the event type. The valid event types are: - "TOPOLOGY_CHANGE": events related to change in the cluster topology. @@ -842,7 +842,7 @@ Table of Contents bytes). - snappy (https://code.google.com/p/snappy/). This compression might not be available as it depends on a native lib (server-side) that might not be - avaivable on some installations. + available on some installations. 6. Data Type Serialization Formats @@ -1099,11 +1099,11 @@ Table of Contents - "BATCH_LOG": the timeout occurred during the write to the batch log when a (logged) batch write was requested. - - "CAS": the timeout occured during the Compare And Set write/update. - - "VIEW": the timeout occured when a write involves - VIEW update and failure to acqiure local view(MV) + - "CAS": the timeout occurred during the Compare And Set write/update. + - "VIEW": the timeout occurred when a write involves + VIEW update and failure to acquire local view(MV) lock for key within timeout - - "CDC": the timeout occured when cdc_total_space is + - "CDC": the timeout occurred when cdc_total_space is exceeded when doing a write to data tracked by cdc. 0x1200 Read_timeout: Timeout exception during a read request. The rest of the ERROR message body will be @@ -1124,7 +1124,7 @@ Table of Contents responded. Otherwise, the value is != 0. 0x1300 Read_failure: A non-timeout exception during a read request. The rest of the ERROR message body will be - + where: is the [consistency] level of the query having triggered the exception. @@ -1132,7 +1132,7 @@ Table of Contents answered the request. is an [int] representing the number of replicas whose acknowledgement is required to achieve . - is an [int] representing the number of nodes that + is an [int] representing the number of nodes that experience a failure while executing the request. is a single byte. If its value is 0, it means the replica that was asked for data had not @@ -1146,7 +1146,7 @@ Table of Contents [string list] one string for each argument type (as CQL type) of the failed function 0x1500 Write_failure: A non-timeout exception during a write request. The rest of the ERROR message body will be - + where: is the [consistency] level of the query having triggered the exception. @@ -1154,7 +1154,7 @@ Table of Contents answered the request. is an [int] representing the number of replicas whose acknowledgement is required to achieve . - is an [int] representing the number of nodes that + is an [int] representing the number of nodes that experience a failure while executing the request. is a [string] that describes the type of the write that failed. The value of that string can be one @@ -1169,14 +1169,14 @@ Table of Contents batch. No batch log write has been attempted. - "COUNTER": the write was a counter write (batched or not). - - "BATCH_LOG": the failure occured during the + - "BATCH_LOG": the failure occurred during the write to the batch log when a (logged) batch write was requested. - - "CAS": the failure occured during the Compare And Set write/update. - - "VIEW": the failure occured when a write involves - VIEW update and failure to acqiure local view(MV) + - "CAS": the failure occurred during the Compare And Set write/update. + - "VIEW": the failure occurred when a write involves + VIEW update and failure to acquire local view(MV) lock for key within timeout - - "CDC": the failure occured when cdc_total_space is + - "CDC": the failure occurred when cdc_total_space is exceeded when doing a write to data tracked by cdc. 0x2000 Syntax_error: The submitted query has a syntax error. diff --git a/doc/native_protocol_v5.spec b/doc/native_protocol_v5.spec index e080801978c5..88d6a948a709 100644 --- a/doc/native_protocol_v5.spec +++ b/doc/native_protocol_v5.spec @@ -404,7 +404,7 @@ Table of Contents The purpose is to send small negative values as small unsigned values, so that we save bytes on the wire. To encode a value n use "(n >> 31) ^ (n << 1)" for 32 bit values, and "(n >> 63) ^ (n << 1)" for 64 bit values where "^" is the xor operation, "<<" is the left shift operation and ">>" is - the arithemtic right shift operation (highest-order bit is replicated). + the arithmetic right shift operation (highest-order bit is replicated). Decode with "(n >> 1) ^ -(n & 1)". [option] A pair of where is a [short] representing @@ -422,7 +422,7 @@ Table of Contents [byte] representing the IP address. [consistency] A consistency level specification. This is a [short] representing a consistency level with the following - correspondance: + correspondence: 0x0000 ANY 0x0001 ONE 0x0002 TWO @@ -478,7 +478,7 @@ Table of Contents This is optional; if not specified no compression will be used. - "DRIVER_NAME": allows clients to supply a free-form label representing the driver implementation. This is displayed in the output of `nodetool clientstats` - - "DRIVER_VERSION": allows clients to supply a free-form label represting the driver + - "DRIVER_VERSION": allows clients to supply a free-form label representing the driver version. This is displayed in the output of `nodetool clientstats` - "THROW_ON_OVERLOAD": flag to specify server behaviour where the incoming message rate is too high. An [string] value of "1" instructs the server to respond with @@ -548,8 +548,8 @@ Table of Contents started (See Section 7 for more details). 0x0010: With serial consistency. If set, should be present. is the [consistency] level for the - serial phase of conditional updates. That consitency can only be - either SERIAL or LOCAL_SERIAL and if not present, it defaults to + serial phase of conditional updates. Consistency can be + either SERIAL or LOCAL_SERIAL, if not present, it defaults to SERIAL. This option will be ignored for anything else other than a conditional update/insert. 0x0020: With default timestamp. If set, must be present. @@ -567,7 +567,7 @@ Table of Contents and using this flag, while supported, is almost surely inefficient. 0x0080: With keyspace. If set, must be present. is a [string] indicating the keyspace that the query should be executed in. - It supercedes the keyspace that the connection is bound to, if any. + It supersedes the keyspace that the connection is bound to, if any. 0x0100: With now in seconds. If set, must be present. is an [int] representing the current time (now) for the query. Affects TTL cell liveness in read queries and local deletion @@ -593,7 +593,7 @@ Table of Contents flags are, given their mask: 0x01: With keyspace. If set, must be present. is a [string] indicating the keyspace that the query should be executed in. - It supercedes the keyspace that the connection is bound to, if any. + It supersedes the keyspace that the connection is bound to, if any. The server will respond with a RESULT message with a `prepared` kind (0x0004, see Section 4.2.5). @@ -606,10 +606,10 @@ Table of Contents where - is the prepared query ID. It's the [short bytes] returned as a response to a PREPARE message. - - is the ID of the resultset metadata that was sent + - is the ID of the result set metadata that was sent along with response to PREPARE message. If a RESULT/Rows message reports - changed resultset metadata with the Metadata_changed flag, the reported new - resultset metadata must be used in subsequent executions. + changed result set metadata with the Metadata_changed flag, the reported new + result set metadata must be used in subsequent executions. - has the exact same definition as in QUERY (see Section 4.1.4). @@ -634,8 +634,8 @@ Table of Contents flags are, given their mask: 0x0010: With serial consistency. If set, should be present. is the [consistency] level for the - serial phase of conditional updates. That consistency can only be - either SERIAL or LOCAL_SERIAL and if not present, it defaults to + serial phase of conditional updates. Consistency can be + either SERIAL or LOCAL_SERIAL, if not present, it defaults to SERIAL. This option will be ignored for anything else other than a conditional update/insert. 0x0020: With default timestamp. If set, should be present. @@ -652,7 +652,7 @@ Table of Contents more details]. 0x0080: With keyspace. If set, must be present. is a [string] indicating the keyspace that the query should be executed in. - It supercedes the keyspace that the connection is bound to, if any. + It supersedes the keyspace that the connection is bound to, if any. 0x0100: With now in seconds. If set, must be present. is an [int] representing the current time (now) for the query. Affects TTL cell liveness in read queries and local deletion @@ -677,8 +677,8 @@ Table of Contents - is the [consistency] level for the operation. - is only present if the 0x10 flag is set. In that case, is the [consistency] level for the serial phase of - conditional updates. That consitency can only be either SERIAL or - LOCAL_SERIAL and if not present will defaults to SERIAL. This option will + conditional updates. Consistency can be either SERIAL or + LOCAL_SERIAL, if not present, it defaults to SERIAL. This option will be ignored for anything else other than a conditional update/insert. The server will respond with a RESULT message. @@ -703,7 +703,7 @@ Table of Contents This section describes the content of the frame body for the different responses. Please note that to make room for future evolution, clients should - support extra informations (that they should simply discard) to the one + support extra information (that they should simply discard) to the one described in this document at the end of the frame body. 4.2.1. ERROR @@ -731,7 +731,7 @@ Table of Contents The authentication is SASL based and thus consists of a number of server challenges (AUTH_CHALLENGE, Section 4.2.7) followed by client responses - (AUTH_RESPONSE, Section 4.1.2). The initial exchange is however boostrapped + (AUTH_RESPONSE, Section 4.1.2). The initial exchange is however bootstrapped by an initial client response. The details of that exchange (including how many challenge-response pairs are required) are specific to the authenticator in use. The exchange ends when the server sends an AUTH_SUCCESS message or @@ -809,12 +809,12 @@ Table of Contents during the query (see QUERY and RESULT messages). 0x0008 Metadata_changed: if set, the No_metadata flag has to be unset and has to be supplied. This flag is to be - used to avoid a roundtrip in case of metadata changes for queries + used to avoid a round trip in case of metadata changes for queries that requested metadata to be skipped. - is an [int] representing the number of columns selected by the query that produced this result. It defines the number of elements in and the number of elements for each row in . - - is [short bytes] representing the new, changed resultset + - is [short bytes] representing the new, changed result set metadata. The new metadata ID must also be used in subsequent executions of the corresponding prepared statement, if any. - is present if the Global_tables_spec is set in @@ -822,8 +822,8 @@ Table of Contents (unique) keyspace name and table name the columns belong to. - specifies the columns returned in the query. There are such column specifications that are composed of: - ()? - The initial and are two [string] and are only present + ()? + The initial and are two [string] and are only present if the Global_tables_spec flag is not set. The is a [string] and is an [option] that corresponds to the description (what this description is depends a bit on the context: in results to @@ -901,7 +901,7 @@ Table of Contents where: - is [short bytes] representing the prepared query ID. - - is [short bytes] representing the resultset metadata ID. + - is [short bytes] representing the result set metadata ID. - is composed of: [...][?...] where: @@ -937,8 +937,8 @@ Table of Contents - specifies the bind markers in the prepared statement. There are such column specifications, each with the following format: - ()? - The initial and are two [string] that are only + ()? + The initial and are two [string] that are only present if the Global_tables_spec flag is not set. The field is a [string] that holds the name of the bind marker (if named), or the name of the column, field, or expression that the bind marker @@ -961,7 +961,7 @@ Table of Contents Note that the prepared query ID returned is global to the node on which the query has been prepared. It can be used on any connection to that node - until the node is restarted (after which the query must be reprepared). + until the node is restarted (after which the query must be re-prepared). 4.2.5.5. Schema_change @@ -978,7 +978,7 @@ Table of Contents 4.2.6. EVENT An event pushed by the server. A client will only receive events for the - types it has REGISTERed to. The body of an EVENT message will start with a + types it has REGISTER-ed to. The body of an EVENT message will start with a [string] representing the event type. The rest of the message depends on the event type. The valid event types are: - "TOPOLOGY_CHANGE": events related to change in the cluster topology. @@ -1209,7 +1209,7 @@ Table of Contents 5.25 vector For a vector of n dimensions of a fixed-length type, a sequence of those n elements. - For a vector with variable-length elements, the size of the elements will preced + For a vector with variable-length elements, the size of the elements will precede each element. Each element is the [bytes] representing the serialized value. The number of dimensions is not encoded, since it's part of the type definition. @@ -1318,13 +1318,13 @@ Table of Contents - "BATCH_LOG": the timeout occurred during the write to the batch log when a (logged) batch write was requested. - - "CAS": the timeout occured during the Compare And Set write/update. - - "VIEW": the timeout occured when a write involves - VIEW update and failure to acqiure local view(MV) + - "CAS": the timeout occurred during the Compare And Set write/update. + - "VIEW": the timeout occurred when a write involves + VIEW update and failure to acquire local view(MV) lock for key within timeout - - "CDC": the timeout occured when cdc_total_space is + - "CDC": the timeout occurred when cdc_total_space is exceeded when doing a write to data tracked by cdc. - is a [short] that describes the number of contentions occured during the CAS operation. + is a [short] that describes the number of contentions occurred during the CAS operation. The field only presents when the is "CAS". 0x1200 Read_timeout: Timeout exception during a read request. The rest of the ERROR message body will be @@ -1345,7 +1345,7 @@ Table of Contents responded. Otherwise, the value is != 0. 0x1300 Read_failure: A non-timeout exception during a read request. The rest of the ERROR message body will be - + where: is the [consistency] level of the query having triggered the exception. @@ -1353,12 +1353,12 @@ Table of Contents answered the request. is an [int] representing the number of replicas whose acknowledgement is required to achieve . - is a map of endpoint to failure reason codes. This maps + is a map of endpoint to failure reason codes. This maps the endpoints of the replica nodes that failed when executing the request to a code representing the reason for the failure. The map is encoded starting with an [int] n - followed by n pairs of where - is an [inetaddr] and is a [short]. + followed by n pairs of where + is an [inetaddr] and is a [short]. is a single byte. If its value is 0, it means the replica that was asked for data had not responded. Otherwise, the value is != 0. @@ -1371,7 +1371,7 @@ Table of Contents [string list] one string for each argument type (as CQL type) of the failed function 0x1500 Write_failure: A non-timeout exception during a write request. The rest of the ERROR message body will be - + where: is the [consistency] level of the query having triggered the exception. @@ -1379,12 +1379,12 @@ Table of Contents answered the request. is an [int] representing the number of replicas whose acknowledgement is required to achieve . - is a map of endpoint to failure reason codes. This maps + is a map of endpoint to failure reason codes. This maps the endpoints of the replica nodes that failed when executing the request to a code representing the reason for the failure. The map is encoded starting with an [int] n - followed by n pairs of where - is an [inetaddr] and is a [short]. + followed by n pairs of where + is an [inetaddr] and is a [short]. is a [string] that describes the type of the write that failed. The value of that string can be one of: @@ -1398,17 +1398,17 @@ Table of Contents batch. No batch log write has been attempted. - "COUNTER": the write was a counter write (batched or not). - - "BATCH_LOG": the failure occured during the + - "BATCH_LOG": the failure occurred during the write to the batch log when a (logged) batch write was requested. - - "CAS": the failure occured during the Compare And Set write/update. - - "VIEW": the failure occured when a write involves - VIEW update and failure to acqiure local view(MV) + - "CAS": the failure occurred during the Compare And Set write/update. + - "VIEW": the failure occurred when a write involves + VIEW update and failure to acquire local view(MV) lock for key within timeout - - "CDC": the failure occured when cdc_total_space is + - "CDC": the failure occurred when cdc_total_space is exceeded when doing a write to data tracked by cdc. 0x1600 CDC_WRITE_FAILURE: // todo - 0x1700 CAS_WRITE_UNKNOWN: An exception occured due to contended Compare And Set write/update. + 0x1700 CAS_WRITE_UNKNOWN: An exception occurred due to contended Compare And Set write/update. The CAS operation was only partially completed and the operation may or may not get completed by the contending CAS write or SERIAL/LOCAL_SERIAL read. The rest of the ERROR message body will be @@ -1444,8 +1444,8 @@ Table of Contents * Added result set metadata id to Prepared responses (Section 4.2.5.4) * Beta protocol flag for v5 native protocol is added (Section 2.2) - * in Read_failure and Write_failure error message bodies (Section 9) - has been replaced with . The maps node IP addresses to + * in Read_failure and Write_failure error message bodies (Section 9) + has been replaced with . The maps node IP addresses to a failure reason code which indicates why the request failed on that node. * Enlarged flag's bitmaps for QUERY, EXECUTE and BATCH messages from [byte] to [int] (Sections 4.1.4, 4.1.6 and 4.1.7). diff --git a/doc/scripts/process-native-protocol-specs-in-docker.sh b/doc/scripts/process-native-protocol-specs-in-docker.sh index 05565c02b93d..332310ab661e 100755 --- a/doc/scripts/process-native-protocol-specs-in-docker.sh +++ b/doc/scripts/process-native-protocol-specs-in-docker.sh @@ -20,73 +20,27 @@ # Variables GO_VERSION="1.23.1" - -GO_OS=linux - -if [ $(uname) = "Darwin" ]; then - GO_OS=darwin -fi - -GO_PLATFORM=amd64 - -if [ $(uname -m) = "aarch64" ]; then - GO_PLATFORM=arm64 -fi - -GO_TAR="go${GO_VERSION}.${GO_OS}-${GO_PLATFORM}.tar.gz" TMPDIR="${TMPDIR:-/tmp}" check_go_version() { if command -v go &>/dev/null; then local installed_version=$(go version | awk '{print $3}' | sed 's/go//') - if [ "$(printf '%s\n' "$GO_VERSION" "$installed_version" | sort -V | head -n1)" = "$GO_VERSION" ]; then - echo "Detected Go $installed_version (>= $GO_VERSION), skipping installation." + echo "Detected Go $installed_version (>= $GO_VERSION)" return 0 else - if [ -z $installed_version ]; then - echo "No Go installation detected, proceeding with installation." - else - echo "Detected Go $installed_version (< $GO_VERSION), proceeding with installation." - fi - return 1 + echo "Detected unsupported Go $installed_version (< $GO_VERSION), please update to supported version." fi else - echo "Go env not found in your system, proceeding with installation." - return 1 + echo "No Go installation detected, please install Go (>= $GO_VERSION)" fi + return 1 } if ! check_go_version; then - - if ls $TMPDIR/go$GO_VERSION > /dev/null 2>&1; then - echo "Reusing cached installation in $TMPDIR/go$GO_VERSION" - export PATH="$PATH:$TMPDIR/go$GO_VERSION/go/bin" - export GOPATH="$TMPDIR/go$GO_VERSION/go/bin" - export GOROOT="$TMPDIR/go$GO_VERSION/go" - else - if ! ls $TMPDIR/$GO_TAR > /dev/null 2>&1; then - echo "Downloading Go $GO_VERSION..." - - curl -L --fail --silent --retry 2 --retry-delay 5 --max-time 30 https://golang.org/dl/$GO_TAR -o $TMPDIR/$GO_TAR - - if [ $? != "0" ]; then - echo "Network error. Specify '-Dant.gen-doc.skip=true' to skip if offline." - exit 1 - fi - fi - - echo "Installing Go $GO_VERSION..." - mkdir -p $TMPDIR/go$GO_VERSION - tar -C "$TMPDIR/go$GO_VERSION" -xzf "$TMPDIR/$GO_TAR" - - # Set Go environment variables - export PATH="$PATH:$TMPDIR/go$GO_VERSION/go/bin" - export GOPATH="$TMPDIR/go$GO_VERSION/go/bin" - export GOROOT="$TMPDIR/go$GO_VERSION/go" - fi -else - echo "Using system-installed Go." + echo " Please install/upgrade Golang for 'ant gen-doc', or specify '-Dant.gen-doc.skip=true' to skip this step." + echo " For download and installation instructions see https://go.dev/doc/install" + exit 1 fi # Step 1: Building the parser @@ -107,7 +61,7 @@ git sparse-checkout set --no-cone /cqlprotodoc git checkout cd "${TMPDIR}/cassandra-website/cqlprotodoc" rm -rf "${TMPDIR}/cqlprotodoc" -$TMPDIR/go$GO_VERSION/go/bin/go build -o "$TMPDIR"/cqlprotodoc +go build -o "$TMPDIR"/cqlprotodoc # Step 2: Process the spec files using the parser echo "Processing the .spec files..." @@ -116,6 +70,11 @@ output_dir="modules/cassandra/attachments" mkdir -p "${output_dir}" "$TMPDIR"/cqlprotodoc . "${output_dir}" +if ! ls ${output_dir}/native_protocol_v*.html > /dev/null 2>&1; then + echo "failed: No native_protocol_v*.html files generated in ${output_dir}" + exit 1 +fi + # Step 4: Generate summary file summary_file="modules/cassandra/pages/reference/native-protocol.adoc" diff --git a/ide/idea-iml-file.xml b/ide/idea-iml-file.xml index 13e66fa61308..4daf6af09613 100644 --- a/ide/idea-iml-file.xml +++ b/ide/idea-iml-file.xml @@ -30,6 +30,8 @@ + + @@ -49,6 +51,16 @@ + + + + + + + + + + @@ -56,6 +68,8 @@ + + @@ -63,12 +77,17 @@ + + + + + @@ -76,6 +95,9 @@ + + + diff --git a/ide/idea/vcs.xml b/ide/idea/vcs.xml index 81872fd3f150..a5367a526e4d 100644 --- a/ide/idea/vcs.xml +++ b/ide/idea/vcs.xml @@ -2,6 +2,7 @@ + - \ No newline at end of file + diff --git a/ide/idea/workspace.xml b/ide/idea/workspace.xml index c5c0e28b963b..5eb1a70b78b3 100644 --- a/ide/idea/workspace.xml +++ b/ide/idea/workspace.xml @@ -183,24 +183,39 @@
WHERE k=1 AND c=2) + * ex. LET y = (SELECT * FROM
WHERE k=1 LIMIT 1) + */ +letStatement returns [SelectStatement.RawStatement expr] + @init { + Term.Raw limit = null; + } + : K_LET txnVar=IDENT '=' + '(' { stmtBegins(); } K_SELECT assignments=letSelectors K_FROM cf=columnFamilyName K_WHERE wclause=whereClause ( K_LIMIT rows=intValue { limit = rows; } )? ')' + { + SelectStatement.Parameters params = new SelectStatement.Parameters(Collections.emptyList(), Collections.emptyList(), false, false, false, $txnVar.text); + WhereClause where = wclause == null ? WhereClause.empty() : wclause.build(); + + $expr = new SelectStatement.RawStatement(cf, params, assignments, where, limit, null, stmtSrc()); + } + ; + +letSelectors returns [List expr] + : t1=letSelector { $expr = new ArrayList(); $expr.add(t1); } (',' tN=letSelector { $expr.add(tN); })* + | '\*' { $expr = Collections.emptyList();} + ; + +letSelector returns [RawSelector s] + @init{ ColumnIdentifier alias = null; } + : us=unaliasedSelector { $s = new RawSelector(us, alias); } + ; selectClause returns [boolean isDistinct, List selectors] @init{ $isDistinct = false; } @@ -475,6 +552,9 @@ groupByClause[List groups] * */ insertStatement returns [ModificationStatement.Parsed expr] + @init { + stmtBegins(); + } : K_INSERT K_INTO cf=columnFamilyName ( st1=normalInsertStatement[cf] { $expr = st1; } | K_JSON st2=jsonInsertStatement[cf] { $expr = st2; }) @@ -489,14 +569,19 @@ normalInsertStatement [QualifiedName qn] returns [UpdateStatement.ParsedInsert e } : '(' c1=cident { columnNames.add(c1); } ( ',' cn=cident { columnNames.add(cn); } )* ')' K_VALUES - '(' v1=term { values.add(v1); } ( ',' vn=term { values.add(vn); } )* ')' + '(' insertValue[values] ( ',' insertValue[values] )* ')' ( K_IF K_NOT K_EXISTS { ifNotExists = true; } )? ( usingClause[attrs] )? { - $expr = new UpdateStatement.ParsedInsert(qn, attrs, columnNames, values, ifNotExists); + $expr = new UpdateStatement.ParsedInsert(qn, attrs, columnNames, values, ifNotExists, stmtSrc(), isParsingTxn); } ; +insertValue[List values] + : t=term { values.add(t); } + | {isParsingTxn}? dr=rowDataReference { values.add(new ReferenceValue.Substitution.Raw(dr)); } + ; + jsonInsertStatement [QualifiedName qn] returns [UpdateStatement.ParsedInsertJson expr] @init { Attributes.Raw attrs = new Attributes.Raw(); @@ -508,7 +593,7 @@ jsonInsertStatement [QualifiedName qn] returns [UpdateStatement.ParsedInsertJson ( K_IF K_NOT K_EXISTS { ifNotExists = true; } )? ( usingClause[attrs] )? { - $expr = new UpdateStatement.ParsedInsertJson(qn, attrs, val, defaultUnset, ifNotExists); + $expr = new UpdateStatement.ParsedInsertJson(qn, attrs, val, defaultUnset, ifNotExists, stmtSrc(), isParsingTxn); } ; @@ -537,8 +622,9 @@ usingClauseObjective[Attributes.Raw attrs] updateStatement returns [UpdateStatement.ParsedUpdate expr] @init { Attributes.Raw attrs = new Attributes.Raw(); - List> operations = new ArrayList<>(); + UpdateStatement.OperationCollector operations = new UpdateStatement.OperationCollector(); boolean ifExists = false; + stmtBegins(); } : K_UPDATE cf=columnFamilyName ( usingClause[attrs] )? @@ -551,7 +637,9 @@ updateStatement returns [UpdateStatement.ParsedUpdate expr] operations, wclause.build(), conditions == null ? Collections.emptyList() : conditions, - ifExists); + ifExists, + isParsingTxn, + stmtSrc()); } ; @@ -572,6 +660,7 @@ deleteStatement returns [DeleteStatement.Parsed expr] Attributes.Raw attrs = new Attributes.Raw(); List columnDeletions = Collections.emptyList(); boolean ifExists = false; + stmtBegins(); } : K_DELETE ( dels=deleteSelection { columnDeletions = dels; } )? K_FROM cf=columnFamilyName @@ -584,7 +673,9 @@ deleteStatement returns [DeleteStatement.Parsed expr] columnDeletions, wclause.build(), conditions == null ? Collections.emptyList() : conditions, - ifExists); + ifExists, + stmtSrc(), + isParsingTxn); } ; @@ -650,6 +741,102 @@ batchStatementObjective returns [ModificationStatement.Parsed statement] | d=deleteStatement { $statement = d; } ; +/** + * ex. conditional update returning pre-update values + * + * BEGIN TRANSACTION + * LET row1 = (SELECT * FROM
WHERE k=1 AND c=2); + * LET row2 = (SELECT * FROM
WHERE k=2 AND c=2); + * SELECT row1.v, row2.v; + * IF row1.v = 3 AND row2.v = 4 THEN + * UPDATE
SET v = row1.v + 1 WHERE k = 1 AND c = 2; + * END IF + * COMMIT TRANSACTION + * + * ex. read-only transaction + * + * BEGIN TRANSACTION + * SELECT * FROM
WHERE k=1 AND c=2; + * COMMIT TRANSACTION + * + * ex. write-only transaction + * + * BEGIN TRANSACTION + * INSERT INTO
(k, c, v) VALUES (0, 0, 1); + * COMMIT TRANSACTION + */ +batchTxnStatement returns [TransactionStatement.Parsed expr] + @init { + isParsingTxn = true; + List assignments = new ArrayList<>(); + SelectStatement.RawStatement select = null; + List returning = null; + List updates = new ArrayList<>(); + } + : K_BEGIN K_TRANSACTION + ( let=letStatement ';' { assignments.add(let); })* + ( ( (selectStatement) => s=selectStatement ';' { select = s; }) | ( K_SELECT drs=rowDataReferences ';' { returning = drs; }) )? + ( K_IF conditions=txnConditions K_THEN { isTxnConditional = true; } )? + ( upd=batchStatementObjective ';' { updates.add(upd); } )* + ( {!isTxnConditional}? (K_COMMIT K_TRANSACTION) | {isTxnConditional}? (K_END K_IF K_COMMIT K_TRANSACTION)) + { + $expr = new TransactionStatement.Parsed(assignments, select, returning, updates, conditions, references); + } + ; + finally { isParsingTxn = false; } + +rowDataReferences returns [List refs] + : r1=rowDataReference { refs = new ArrayList(); refs.add(r1); } (',' rN=rowDataReference { refs.add(rN); })* + ; + +rowDataReference returns [RowDataReference.Raw rawRef] + @init { Selectable.RawIdentifier tuple = null; Selectable.Raw selectable = null; } + @after { $rawRef = newRowDataReference(tuple, selectable); } + : t=sident ('.' s=referenceSelection)? { tuple = t; selectable = s; } + ; + +referenceSelection returns [Selectable.Raw s] + : g=referenceSelectionWithoutField m=selectorModifier[g] {$s = m;} + ; + +referenceSelectionWithoutField returns [Selectable.Raw s] + @init { Selectable.Raw tmp = null; } + @after { $s = tmp; } + : sn=sident { tmp=sn; } + | (selectionTypeHint)=> h=selectionTypeHint { tmp=h; } + | t=selectionTupleOrNestedSelector { tmp=t; } + | l=selectionList { tmp=l; } + | m=selectionMapOrSet { tmp=m; } + // UDTs are equivalent to maps from the syntax point of view, so the final decision will be done in Selectable.WithMapOrUdt + ; + +txnConditions returns [List conditions] + @init { conditions = new ArrayList(); } + : txnColumnCondition[conditions] ( K_AND txnColumnCondition[conditions] )* + ; + +txnConditionKind returns [ConditionStatement.Kind op] + : '=' { $op = ConditionStatement.Kind.EQ; } + | '<' { $op = ConditionStatement.Kind.LT; } + | '<=' { $op = ConditionStatement.Kind.LTE; } + | '>' { $op = ConditionStatement.Kind.GT; } + | '>=' { $op = ConditionStatement.Kind.GTE; } + | '!=' { $op = ConditionStatement.Kind.NEQ; } + ; + +txnColumnCondition[List conditions] + : lhs=rowDataReference + ( + K_IS + ( + K_NOT K_NULL { conditions.add(new ConditionStatement.Raw(lhs, ConditionStatement.Kind.IS_NOT_NULL, null)); } + | K_NULL { conditions.add(new ConditionStatement.Raw(lhs, ConditionStatement.Kind.IS_NULL, null)); } + ) + | (txnConditionKind term)=> op=txnConditionKind t=term { conditions.add(new ConditionStatement.Raw(lhs, op, t)); } + ) + | lhs=term op=txnConditionKind rhs=rowDataReference { conditions.add(new ConditionStatement.Raw(lhs, op, rhs)); } + ; + createAggregateStatement returns [CreateAggregateStatement.Raw stmt] @init { boolean orReplace = false; @@ -780,8 +967,8 @@ tableDefinition[CreateTableStatement.Raw stmt] ; tableColumns[CreateTableStatement.Raw stmt] - @init { boolean isStatic = false; } - : k=ident v=comparatorType (K_STATIC { isStatic = true; })? (mask=columnMask)? (constraints=columnConstraints)? { $stmt.addColumn(k, v, isStatic, mask, constraints); } + @init { boolean isStatic = false; boolean isNotNull = false; } + : k=ident v=comparatorType (K_STATIC { isStatic = true; })? (K_NOT K_NULL { isNotNull = true; })? (mask=columnMask)? (constraints=columnConstraints)? { $stmt.addColumn(k, v, isStatic, isNotNull, mask, constraints); } (K_PRIMARY K_KEY { $stmt.setPartitionKeyColumn(k); })? | K_PRIMARY K_KEY '(' tablePartitionKey[stmt] (',' c=ident { $stmt.markClusteringColumn(c); } )* ')' ; @@ -795,9 +982,30 @@ columnConstraints returns [ColumnConstraints.Raw constraints] ; columnConstraint returns [ColumnConstraint columnConstraint] - : funcName=ident '(' k=ident ')' op=relationType t=value { $columnConstraint = new FunctionColumnConstraint.Raw(funcName, k, op, t.getText()).prepare(); } - | funcName=ident '(' k=ident ')' { $columnConstraint = new UnaryFunctionColumnConstraint.Raw(funcName, k).prepare(); } - | k=ident op=relationType t=value { $columnConstraint = new ScalarColumnConstraint.Raw(k, op, t.getText()).prepare(); } + @init { List arguments = new ArrayList<>(); } + : K_NOT K_NULL + { + $columnConstraint = new UnaryFunctionColumnConstraint.Raw("NOT_NULL").prepare(); + } + | funcName=ident columnConstraintsArguments[arguments] (op=relationType t=value)? + { + if (op != null && t != null) + { + $columnConstraint = new FunctionColumnConstraint.Raw(funcName, arguments, op, t.getText()).prepare(); + } + else + { + $columnConstraint = new UnaryFunctionColumnConstraint.Raw(funcName, arguments).prepare(); + } + } + | k=ident op=relationType t=value + { + $columnConstraint = new ScalarColumnConstraint.Raw(k, op, t.getText()).prepare(); + } + | funcName=ident + { + $columnConstraint = new UnaryFunctionColumnConstraint.Raw(funcName).prepare(); + } ; columnMask returns [ColumnMask.Raw mask] @@ -810,6 +1018,12 @@ columnMaskArguments[List arguments] : '(' ')' | '(' c=term { arguments.add(c); } (',' cn=term { arguments.add(cn); })* ')' ; +columnConstraintsArguments[List arguments] + : '(' ')' + | '(' c=term { try { arguments.add(c.toString()); } catch (Throwable t) { throw new SyntaxException("Constraint function parameters need to be strings."); }; } (',' cn=term { try { arguments.add(cn.toString()); } catch (Throwable t) { throw new SyntaxException("Constraint function parameters need to be strings."); }; })* ')' + | '(' ci=ident { throw new SyntaxException("Constraint function parameters need to be strings."); } (',' cni=ident)* ')' + ; + tablePartitionKey[CreateTableStatement.Raw stmt] @init {List l = new ArrayList();} @after{ $stmt.setPartitionKeyColumns(l); } @@ -988,14 +1202,14 @@ alterTableStatement returns [AlterTableStatement.Raw stmt] | K_ALTER ( K_IF K_EXISTS { $stmt.ifColumnExists(true); } )? id=cident ( mask=columnMask { $stmt.mask(id, mask); } + | constraints=columnConstraints { $stmt.constraint(id, constraints); } | K_DROP K_MASKED { $stmt.mask(id, null); } - | K_DROP K_CHECK { $stmt.constraint(id, null); } - | (constraints=columnConstraints) { $stmt.constraint(id, constraints); }) + | K_DROP K_CHECK { $stmt.constraint(id, null); }) | K_ADD ( K_IF K_NOT K_EXISTS { $stmt.ifColumnNotExists(true); } )? - ( id=ident v=comparatorType b=isStaticColumn (m=columnMask)? { $stmt.add(id, v, b, m); } - | ('(' id1=ident v1=comparatorType b1=isStaticColumn (m1=columnMask)? { $stmt.add(id1, v1, b1, m1); } - ( ',' idn=ident vn=comparatorType bn=isStaticColumn (mn=columnMask)? { $stmt.add(idn, vn, bn, mn); mn=null; } )* ')') ) + ( id=ident v=comparatorType b=isStaticColumn (m=columnMask)? (c=columnConstraints)? { $stmt.add(id, v, b, m, c); } + | ('(' id1=ident v1=comparatorType b1=isStaticColumn (m1=columnMask)? (c=columnConstraints)? { $stmt.add(id1, v1, b1, m1, c); } + ( ',' idn=ident vn=comparatorType bn=isStaticColumn (mn=columnMask)? (c=columnConstraints)? { $stmt.add(idn, vn, bn, mn, c); mn=null; c=null;} )* ')') ) | K_DROP ( K_IF K_EXISTS { $stmt.ifColumnExists(true); } )? ( id=ident { $stmt.drop(id); } @@ -1727,18 +1941,18 @@ simpleTerm returns [Term.Raw term] | K_CAST '(' t=simpleTerm K_AS n=native_type ')' { $term = FunctionCall.Raw.newCast(t, n); } ; -columnOperation[List> operations] +columnOperation[UpdateStatement.OperationCollector operations] : key=cident columnOperationDifferentiator[operations, key] ; -columnOperationDifferentiator[List> operations, ColumnIdentifier key] +columnOperationDifferentiator[UpdateStatement.OperationCollector operations, ColumnIdentifier key] : '=' normalColumnOperation[operations, key] | shorthandColumnOperation[operations, key] | '[' k=term ']' collectionColumnOperation[operations, key, k] | '.' field=fident udtColumnOperation[operations, key, field] ; -normalColumnOperation[List> operations, ColumnIdentifier key] +normalColumnOperation[UpdateStatement.OperationCollector operations, ColumnIdentifier key] : t=term ('+' c=cident )? { if (c == null) @@ -1766,27 +1980,56 @@ normalColumnOperation[List> operatio addRecognitionError("Only expressions of the form X = X " + ($i.text.charAt(0) == '-' ? '-' : '+') + " are supported."); addRawUpdate(operations, key, new Operation.Addition(Constants.Literal.integer($i.text))); } + | {isParsingTxn}? r=rowDataReference + { + addRawReferenceOperation(operations, key, new ReferenceOperation.Raw(new Operation.SetValue(r), key, new ReferenceValue.Substitution.Raw(r))); + } ; -shorthandColumnOperation[List> operations, ColumnIdentifier key] - : sig=('+=' | '-=') t=term - { - addRawUpdate(operations, key, $sig.text.equals("+=") ? new Operation.Addition(t) : new Operation.Substraction(t)); - } +shorthandColumnOperation[UpdateStatement.OperationCollector operations, ColumnIdentifier key] + : sig=('+=' | '-=') + ( + t=term + { + addRawUpdate(operations, key, $sig.text.equals("+=") ? new Operation.Addition(t) : new Operation.Substraction(t)); + } + | {isParsingTxn}? dr=rowDataReference + { + ReferenceValue.Raw right = new ReferenceValue.Substitution.Raw(dr); + Operation.RawUpdate operation = $sig.text.equals("+=") ? new Operation.Addition(dr) : new Operation.Substraction(dr); + addRawReferenceOperation(operations, key, new ReferenceOperation.Raw(operation, key, right)); + } + ) ; -collectionColumnOperation[List> operations, ColumnIdentifier key, Term.Raw k] - : '=' t=term - { - addRawUpdate(operations, key, new Operation.SetElement(k, t)); - } +collectionColumnOperation[UpdateStatement.OperationCollector operations, ColumnIdentifier key, Term.Raw k] + : '=' + ( + t=term + { + addRawUpdate(operations, key, new Operation.SetElement(k, t)); + } + | {isParsingTxn}? dr=rowDataReference + { + ReferenceValue.Raw right = new ReferenceValue.Substitution.Raw(dr); + addRawReferenceOperation(operations, key, new ReferenceOperation.Raw(new Operation.SetElement(k, dr), key, right)); + } + ) ; -udtColumnOperation[List> operations, ColumnIdentifier key, FieldIdentifier field] - : '=' t=term - { - addRawUpdate(operations, key, new Operation.SetField(field, t)); - } +udtColumnOperation[UpdateStatement.OperationCollector operations, ColumnIdentifier key, FieldIdentifier field] + : '=' + ( + t=term + { + addRawUpdate(operations, key, new Operation.SetField(field, t)); + } + | {isParsingTxn}? dr=rowDataReference + { + ReferenceValue.Raw right = new ReferenceValue.Substitution.Raw(dr); + addRawReferenceOperation(operations, key, new ReferenceOperation.Raw(new Operation.SetField(field, dr), key, right)); + } + ) ; columnCondition returns [ColumnCondition.Raw condition] @@ -2094,5 +2337,10 @@ basic_unreserved_keyword returns [String str] | K_BETWEEN | K_CHECK | K_INDEXES + | K_COMMIT + | K_END + | K_LET + | K_THEN + | K_TRANSACTION ) { $str = $k.text; } ; diff --git a/src/java/org/apache/cassandra/audit/AuditLogEntryCategory.java b/src/java/org/apache/cassandra/audit/AuditLogEntryCategory.java index b848440607a5..9f9af16e8abb 100644 --- a/src/java/org/apache/cassandra/audit/AuditLogEntryCategory.java +++ b/src/java/org/apache/cassandra/audit/AuditLogEntryCategory.java @@ -23,5 +23,5 @@ */ public enum AuditLogEntryCategory { - QUERY, DML, DDL, DCL, OTHER, AUTH, ERROR, PREPARE, JMX + QUERY, DML, DDL, DCL, OTHER, AUTH, ERROR, PREPARE, JMX, TRANSACTION } diff --git a/src/java/org/apache/cassandra/audit/AuditLogEntryType.java b/src/java/org/apache/cassandra/audit/AuditLogEntryType.java index 2bbff08429e1..4ee3348055fb 100644 --- a/src/java/org/apache/cassandra/audit/AuditLogEntryType.java +++ b/src/java/org/apache/cassandra/audit/AuditLogEntryType.java @@ -63,6 +63,7 @@ public enum AuditLogEntryType DROP_IDENTITY(AuditLogEntryCategory.DCL), USE_KEYSPACE(AuditLogEntryCategory.OTHER), DESCRIBE(AuditLogEntryCategory.OTHER), + TRANSACTION(AuditLogEntryCategory.TRANSACTION), /* * Common Audit Log Entry Types diff --git a/src/java/org/apache/cassandra/audit/AuditLogFilter.java b/src/java/org/apache/cassandra/audit/AuditLogFilter.java index b775ac7785cf..ec53212bce97 100644 --- a/src/java/org/apache/cassandra/audit/AuditLogFilter.java +++ b/src/java/org/apache/cassandra/audit/AuditLogFilter.java @@ -28,7 +28,7 @@ final class AuditLogFilter { private static final Logger logger = LoggerFactory.getLogger(AuditLogFilter.class); - private static ImmutableSet EMPTY_FILTERS = ImmutableSet.of(); + private static final ImmutableSet EMPTY_FILTERS = ImmutableSet.of(); final ImmutableSet excludedKeyspaces; final ImmutableSet includedKeyspaces; diff --git a/src/java/org/apache/cassandra/auth/CassandraRoleManager.java b/src/java/org/apache/cassandra/auth/CassandraRoleManager.java index 1e1a9ec310e9..45e9b0dd4d53 100644 --- a/src/java/org/apache/cassandra/auth/CassandraRoleManager.java +++ b/src/java/org/apache/cassandra/auth/CassandraRoleManager.java @@ -75,7 +75,6 @@ import org.apache.cassandra.utils.NoSpamLogger; import org.mindrot.jbcrypt.BCrypt; -import static org.apache.cassandra.config.CassandraRelevantProperties.AUTH_BCRYPT_GENSALT_LOG2_ROUNDS; import static org.apache.cassandra.service.QueryState.forInternalCalls; /** @@ -147,17 +146,6 @@ public class CassandraRoleManager implements IRoleManager, CassandraRoleManagerM } }; - private static final int GENSALT_LOG2_ROUNDS = getGensaltLogRounds(); - - static int getGensaltLogRounds() - { - int rounds = AUTH_BCRYPT_GENSALT_LOG2_ROUNDS.getInt(10); - if (rounds < 4 || rounds > 30) - throw new ConfigurationException(String.format("Bad value for system property %s." + - "Please use a value between 4 and 30 inclusively", AUTH_BCRYPT_GENSALT_LOG2_ROUNDS.getKey())); - return rounds; - } - private SelectStatement loadRoleStatement; private SelectStatement loadIdentityStatement; @@ -689,9 +677,11 @@ private String optionsToAssignments(Map options) .collect(Collectors.joining(",")); } + + private static String hashpw(String password) { - return BCrypt.hashpw(password, BCrypt.gensalt(GENSALT_LOG2_ROUNDS)); + return BCrypt.hashpw(password, PasswordSaltSupplier.get()); } private static String escape(String name) diff --git a/src/java/org/apache/cassandra/auth/DataResource.java b/src/java/org/apache/cassandra/auth/DataResource.java index 4923a0b2d611..6776fc144921 100644 --- a/src/java/org/apache/cassandra/auth/DataResource.java +++ b/src/java/org/apache/cassandra/auth/DataResource.java @@ -24,6 +24,7 @@ import org.apache.commons.lang3.StringUtils; import org.apache.cassandra.schema.Schema; +import org.apache.cassandra.schema.SchemaConstants; /** * The primary type of resource in Cassandra. @@ -249,7 +250,8 @@ public boolean exists() return true; case KEYSPACE: case ALL_TABLES: - return Schema.instance.getKeyspaces().contains(keyspace); + return SchemaConstants.isVirtualSystemKeyspace(keyspace) || + Schema.instance.getKeyspaces().contains(keyspace); case TABLE: return Schema.instance.getTableMetadata(keyspace, table) != null; } diff --git a/src/java/org/apache/cassandra/auth/MutualTlsAuthenticator.java b/src/java/org/apache/cassandra/auth/MutualTlsAuthenticator.java index 0337291a9dc2..7c5a6654f72f 100644 --- a/src/java/org/apache/cassandra/auth/MutualTlsAuthenticator.java +++ b/src/java/org/apache/cassandra/auth/MutualTlsAuthenticator.java @@ -44,7 +44,7 @@ import org.apache.cassandra.utils.NoSpamLogger; import static org.apache.cassandra.auth.IAuthenticator.AuthenticationMode.MTLS; -import static org.apache.cassandra.config.EncryptionOptions.ClientAuth.REQUIRED; +import static org.apache.cassandra.config.EncryptionOptions.ClientEncryptionOptions.ClientAuth.REQUIRED; /** * Performs mTLS authentication for client connections by extracting identities from client certificate diff --git a/src/java/org/apache/cassandra/auth/MutualTlsInternodeAuthenticator.java b/src/java/org/apache/cassandra/auth/MutualTlsInternodeAuthenticator.java index c1fcbd6eabf0..91500cbeb2f2 100644 --- a/src/java/org/apache/cassandra/auth/MutualTlsInternodeAuthenticator.java +++ b/src/java/org/apache/cassandra/auth/MutualTlsInternodeAuthenticator.java @@ -49,7 +49,7 @@ import org.apache.cassandra.metrics.MutualTlsMetrics; import org.apache.cassandra.utils.NoSpamLogger; -import static org.apache.cassandra.config.EncryptionOptions.ClientAuth.REQUIRED; +import static org.apache.cassandra.config.EncryptionOptions.ClientEncryptionOptions.ClientAuth.REQUIRED; /** * Performs mTLS authentication for internode connections by extracting identities from the certificates of incoming diff --git a/src/java/org/apache/cassandra/auth/MutualTlsWithPasswordFallbackAuthenticator.java b/src/java/org/apache/cassandra/auth/MutualTlsWithPasswordFallbackAuthenticator.java index 219353d76a1f..f994399f2161 100644 --- a/src/java/org/apache/cassandra/auth/MutualTlsWithPasswordFallbackAuthenticator.java +++ b/src/java/org/apache/cassandra/auth/MutualTlsWithPasswordFallbackAuthenticator.java @@ -81,7 +81,7 @@ public SaslNegotiator newSaslNegotiator(InetAddress clientAddress, Certificate[] public void validateConfiguration() throws ConfigurationException { Config config = DatabaseDescriptor.getRawConfig(); - if (config.client_encryption_options.getClientAuth() == EncryptionOptions.ClientAuth.NOT_REQUIRED) + if (config.client_encryption_options.getClientAuth() == EncryptionOptions.ClientEncryptionOptions.ClientAuth.NOT_REQUIRED) { String msg = "MutualTlsWithPasswordFallbackAuthenticator requires client_encryption_options.require_client_auth to be optional/true"; throw new ConfigurationException(msg); diff --git a/src/java/org/apache/cassandra/auth/PasswordSaltSupplier.java b/src/java/org/apache/cassandra/auth/PasswordSaltSupplier.java new file mode 100644 index 000000000000..9c9bd1d0f813 --- /dev/null +++ b/src/java/org/apache/cassandra/auth/PasswordSaltSupplier.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.auth; + +import java.util.function.Supplier; + +import com.google.common.annotations.VisibleForTesting; + +import org.apache.cassandra.exceptions.ConfigurationException; +import org.mindrot.jbcrypt.BCrypt; + +import static org.apache.cassandra.config.CassandraRelevantProperties.AUTH_BCRYPT_GENSALT_LOG2_ROUNDS; + +public class PasswordSaltSupplier +{ + // 2 ** GENSALT_LOG2_ROUNDS rounds of hashing will be performed. + private static final int GENSALT_LOG2_ROUNDS = getGensaltLogRounds(); + + @VisibleForTesting + static int getGensaltLogRounds() + { + int rounds = AUTH_BCRYPT_GENSALT_LOG2_ROUNDS.getInt(); + if (rounds < 4 || rounds > 30) + throw new ConfigurationException(String.format("Bad value for system property -D%s." + + "Please use a value between 4 and 30 inclusively", + AUTH_BCRYPT_GENSALT_LOG2_ROUNDS.getKey())); + return rounds; + } + private static Supplier DEFAULT_SALT_SUPPLIER = () -> BCrypt.gensalt(GENSALT_LOG2_ROUNDS); + private static Supplier saltSupplier = DEFAULT_SALT_SUPPLIER; + + public static void unsafeSet(Supplier newSaltSupplier) + { + assert newSaltSupplier != null; + saltSupplier = newSaltSupplier; + } + public static void unsafeReset() + { + saltSupplier = DEFAULT_SALT_SUPPLIER; + } + + public static String get() + { + return saltSupplier.get(); + } +} diff --git a/src/java/org/apache/cassandra/batchlog/BatchlogManager.java b/src/java/org/apache/cassandra/batchlog/BatchlogManager.java index b5b33d5e0b34..4c19d1c0105c 100644 --- a/src/java/org/apache/cassandra/batchlog/BatchlogManager.java +++ b/src/java/org/apache/cassandra/batchlog/BatchlogManager.java @@ -27,25 +27,24 @@ import java.util.Set; import java.util.UUID; import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.ExecutionException; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; +import java.util.concurrent.atomic.AtomicBoolean; import java.util.function.Supplier; +import javax.annotation.Nullable; import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.ImmutableList; import com.google.common.collect.Iterables; import com.google.common.util.concurrent.RateLimiter; -import org.apache.cassandra.concurrent.ScheduledExecutorPlus; -import org.apache.cassandra.schema.KeyspaceMetadata; -import org.apache.cassandra.tcm.ClusterMetadata; -import org.apache.cassandra.transport.Dispatcher; -import org.apache.cassandra.utils.TimeUUID; -import org.apache.cassandra.utils.concurrent.Future; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.cassandra.concurrent.ScheduledExecutorPlus; +import org.apache.cassandra.config.CassandraRelevantProperties; import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.cql3.UntypedResultSet; +import org.apache.cassandra.cql3.UntypedResultSet.Row; import org.apache.cassandra.db.ColumnFamilyStore; import org.apache.cassandra.db.ConsistencyLevel; import org.apache.cassandra.db.Keyspace; @@ -55,6 +54,7 @@ import org.apache.cassandra.db.marshal.BytesType; import org.apache.cassandra.db.partitions.PartitionUpdate; import org.apache.cassandra.dht.Token; +import org.apache.cassandra.exceptions.RetryOnDifferentSystemException; import org.apache.cassandra.exceptions.WriteFailureException; import org.apache.cassandra.exceptions.WriteTimeoutException; import org.apache.cassandra.gms.FailureDetector; @@ -70,26 +70,39 @@ import org.apache.cassandra.net.Message; import org.apache.cassandra.net.MessageFlag; import org.apache.cassandra.net.MessagingService; +import org.apache.cassandra.schema.KeyspaceMetadata; import org.apache.cassandra.schema.SchemaConstants; import org.apache.cassandra.schema.TableId; import org.apache.cassandra.service.StorageService; import org.apache.cassandra.service.WriteResponseHandler; +import org.apache.cassandra.service.accord.IAccordService.IAccordResult; +import org.apache.cassandra.service.accord.txn.TxnResult; +import org.apache.cassandra.service.consensus.migration.ConsensusMigrationMutationHelper; +import org.apache.cassandra.service.consensus.migration.ConsensusMigrationMutationHelper.SplitMutations; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.transport.Dispatcher; +import org.apache.cassandra.utils.Clock; import org.apache.cassandra.utils.ExecutorUtils; import org.apache.cassandra.utils.FBUtilities; import org.apache.cassandra.utils.MBeanWrapper; +import org.apache.cassandra.utils.Throwables; +import org.apache.cassandra.utils.TimeUUID; +import org.apache.cassandra.utils.concurrent.Future; import static java.util.concurrent.TimeUnit.MILLISECONDS; import static org.apache.cassandra.concurrent.ExecutorFactory.Global.executorFactory; import static org.apache.cassandra.config.CassandraRelevantProperties.BATCHLOG_REPLAY_TIMEOUT_IN_MS; import static org.apache.cassandra.cql3.QueryProcessor.executeInternal; import static org.apache.cassandra.cql3.QueryProcessor.executeInternalWithPaging; +import static org.apache.cassandra.hints.HintsService.RETRY_ON_DIFFERENT_SYSTEM_UUID; import static org.apache.cassandra.net.Verb.MUTATION_REQ; +import static org.apache.cassandra.service.accord.txn.TxnResult.Kind.retry_new_protocol; +import static org.apache.cassandra.service.consensus.migration.ConsensusMigrationMutationHelper.mutateWithAccordAsync; import static org.apache.cassandra.utils.Clock.Global.currentTimeMillis; public class BatchlogManager implements BatchlogManagerMBean { public static final String MBEAN_NAME = "org.apache.cassandra.db:type=BatchlogManager"; - private static final long REPLAY_INTERVAL = 10 * 1000; // milliseconds static final int DEFAULT_PAGE_SIZE = 128; private static final Logger logger = LoggerFactory.getLogger(BatchlogManager.class); @@ -104,6 +117,8 @@ public class BatchlogManager implements BatchlogManagerMBean private final RateLimiter rateLimiter = RateLimiter.create(Double.MAX_VALUE); + private final AtomicBoolean isBatchlogReplayPaused = new AtomicBoolean(false); + public BatchlogManager() { batchlogTasks = executorFactory().scheduled(false, "BatchlogTasks"); @@ -115,7 +130,7 @@ public void start() batchlogTasks.scheduleWithFixedDelay(this::replayFailedBatches, StorageService.RING_DELAY_MILLIS, - REPLAY_INTERVAL, + CassandraRelevantProperties.BATCHLOG_REPLAY_INTERVAL_MS.getLong(), MILLISECONDS); } @@ -184,7 +199,9 @@ public long getTotalBatchesReplayed() public void forceBatchlogReplay() throws Exception { + logger.debug("Forcing batchlog replay"); startBatchlogReplay().get(); + logger.debug("Finished forcing batchlog replay"); } public Future startBatchlogReplay() @@ -193,14 +210,25 @@ public Future startBatchlogReplay() return batchlogTasks.submit(this::replayFailedBatches); } - void performInitialReplay() throws InterruptedException, ExecutionException + public void pauseReplay() + { + logger.debug("Paused batchlog replay"); + isBatchlogReplayPaused.set(true); + } + + public void resumeReplay() { - // Invokes initial replay. Used for testing only. - batchlogTasks.submit(this::replayFailedBatches).get(); + logger.debug("Resumed batchlog replay"); + isBatchlogReplayPaused.set(false); } private void replayFailedBatches() { + if (isBatchlogReplayPaused.get()) + { + logger.debug("Batch log replay is paused, skipping replay"); + return; + } logger.trace("Started replayFailedBatches"); // rate limit is in bytes per second. Uses Double.MAX_VALUE if disabled (set to 0 in cassandra.yaml). @@ -223,6 +251,7 @@ private void replayFailedBatches() SchemaConstants.SYSTEM_KEYSPACE_NAME, SystemKeyspace.BATCHES); UntypedResultSet batches = executeInternalWithPaging(query, pageSize, lastReplayedUuid, limitUuid); + processBatchlogEntries(batches, pageSize, rateLimiter); lastReplayedUuid = limitUuid; logger.trace("Finished replayFailedBatches"); @@ -276,16 +305,7 @@ private void processBatchlogEntries(UntypedResultSet batches, int pageSize, Rate int version = row.getInt("version"); try { - ReplayingBatch batch = new ReplayingBatch(id, version, row.getList("mutations", BytesType.instance)); - if (batch.replay(rateLimiter, hintedNodes) > 0) - { - unfinishedBatches.add(batch); - } - else - { - remove(id); // no write mutations were sent (either expired or all CFs involved truncated). - ++totalBatchesReplayed; - } + dispatchBatch(rateLimiter, row, id, version, hintedNodes, unfinishedBatches); } catch (IOException e) { @@ -307,6 +327,8 @@ private void processBatchlogEntries(UntypedResultSet batches, int pageSize, Rate // finalize the incomplete last page of batches if (positionInPage > 0) finishAndClearBatches(unfinishedBatches, hintedNodes, replayedBatches); + else + logger.trace("Had no batches to replay"); if (caughtException != null) logger.warn(String.format("Encountered %d unexpected exceptions while sending out batches", skipped), caughtException); @@ -318,6 +340,35 @@ private void processBatchlogEntries(UntypedResultSet batches, int pageSize, Rate replayedBatches.forEach(BatchlogManager::remove); } + private void dispatchBatch(RateLimiter rateLimiter, Row row, TimeUUID id, int version, Set hintedNodes, ArrayList unfinishedBatches) throws IOException + { + while (true) + { + ClusterMetadata cm = ClusterMetadata.current(); + try + { + ReplayingBatch batch = new ReplayingBatch(id, version, row.getList("mutations", BytesType.instance), cm); + if (batch.replay(rateLimiter, hintedNodes)) + { + unfinishedBatches.add(batch); + } + else + { + remove(id); // no write mutations were sent (either expired or all CFs involved truncated). + ++totalBatchesReplayed; + } + } + catch (RetryOnDifferentSystemException e) + { + // Self apply can throw retry on different system + // Barring bugs we should already have the latest cluster metadata needed to correctly + // split the batch and retry since that is what was used to generate the exception + continue; + } + break; + } + } + private void finishAndClearBatches(ArrayList batches, Set hintedNodes, Set replayedBatches) { // schedule hints for timed out deliveries @@ -340,61 +391,111 @@ private static class ReplayingBatch { private final TimeUUID id; private final long writtenAt; - private final List mutations; + private final int unsplitGcGs; + private final List normalMutations; + private final List accordMutations; private final int replayedBytes; + private final ClusterMetadata cm; - private List> replayHandlers; + private List> replayHandlers = ImmutableList.of(); + private IAccordResult accordResult; + @Nullable + private Dispatcher.RequestTime accordTxnStart; - ReplayingBatch(TimeUUID id, int version, List serializedMutations) throws IOException + ReplayingBatch(TimeUUID id, int version, List serializedMutations, ClusterMetadata cm) throws IOException { this.id = id; this.writtenAt = id.unix(MILLISECONDS); - this.mutations = new ArrayList<>(serializedMutations.size()); - this.replayedBytes = addMutations(version, serializedMutations); + List unsplitMutations = new ArrayList<>(serializedMutations.size()); + this.replayedBytes = addMutations(unsplitMutations, writtenAt, version, serializedMutations); + unsplitGcGs = gcgs(unsplitMutations); + SplitMutations splitMutations = ConsensusMigrationMutationHelper.splitMutationsIntoAccordAndNormal(cm, unsplitMutations); + logger.trace("Replaying batch with Accord {} and normal {}", splitMutations.accordMutations(), splitMutations.normalMutations()); + normalMutations = splitMutations.normalMutations(); + accordMutations = splitMutations.accordMutations(); + if (accordMutations != null) + accordTxnStart = new Dispatcher.RequestTime(Clock.Global.nanoTime()); + this.cm = cm; } - public int replay(RateLimiter rateLimiter, Set hintedNodes) throws IOException + public boolean replay(RateLimiter rateLimiter, Set hintedNodes) throws IOException { logger.trace("Replaying batch {}", id); - if (mutations.isEmpty()) - return 0; + if ((normalMutations == null || normalMutations.isEmpty()) && (accordMutations == null || accordMutations.isEmpty())) + return false; - int gcgs = gcgs(mutations); - if (MILLISECONDS.toSeconds(writtenAt) + gcgs <= FBUtilities.nowInSeconds()) - return 0; + if (MILLISECONDS.toSeconds(writtenAt) + unsplitGcGs <= FBUtilities.nowInSeconds()) + return false; - replayHandlers = sendReplays(mutations, writtenAt, hintedNodes); + if (accordMutations != null) + { + accordTxnStart = accordTxnStart.withStartedAt(Clock.Global.nanoTime()); + accordResult = accordMutations != null ? mutateWithAccordAsync(cm, accordMutations, null, accordTxnStart) : null; + } + + if (normalMutations != null) + replayHandlers = sendReplays(normalMutations, writtenAt, hintedNodes); rateLimiter.acquire(replayedBytes); // acquire afterwards, to not mess up ttl calculation. - return replayHandlers.size(); + return replayHandlers.size() > 0 || accordMutations != null; } public void finish(Set hintedNodes) { - for (int i = 0; i < replayHandlers.size(); i++) + Throwable failure = null; + // Check if the Accord mutations succeeded asynchronously + try { - ReplayWriteResponseHandler handler = replayHandlers.get(i); - try + if (accordResult != null) { - handler.get(); + TxnResult.Kind kind = accordResult.awaitAndGet().kind(); + if (kind == retry_new_protocol) + throw new RetryOnDifferentSystemException(); } - catch (WriteTimeoutException|WriteFailureException e) + } + catch (WriteTimeoutException | WriteFailureException | RetryOnDifferentSystemException e ) + { + logger.trace("Failed replaying a batched mutation on Accord, will write a hint"); + logger.trace("Failure was : {}", e.getMessage()); + writeHintsForUndeliveredAccordTxns(hintedNodes); + } + catch (Exception e) + { + failure = Throwables.merge(failure, e); + } + + try + { + for (int i = 0; i < replayHandlers.size(); i++) { - if (logger.isTraceEnabled()) + ReplayWriteResponseHandler handler = replayHandlers.get(i); + try + { + handler.get(); + } + catch (WriteTimeoutException|WriteFailureException|RetryOnDifferentSystemException e) { logger.trace("Failed replaying a batched mutation to a node, will write a hint"); logger.trace("Failure was : {}", e.getMessage()); + // writing hints for the rest to hints, starting from i + writeHintsForUndeliveredEndpoints(i, hintedNodes); + break; } - // writing hints for the rest to hints, starting from i - writeHintsForUndeliveredEndpoints(i, hintedNodes); - return; } } + catch (Exception e) + { + logger.debug("Unexpected batchlog replay exception", e); + failure = Throwables.merge(failure, e); + } + + if (failure != null) + throw Throwables.unchecked(failure); } - private int addMutations(int version, List serializedMutations) throws IOException + private static int addMutations(List unsplitMutations, long writtenAt, int version, List serializedMutations) throws IOException { int ret = 0; for (ByteBuffer serializedMutation : serializedMutations) @@ -402,7 +503,7 @@ private int addMutations(int version, List serializedMutations) thro ret += serializedMutation.remaining(); try (DataInputBuffer in = new DataInputBuffer(serializedMutation, true)) { - addMutation(Mutation.serializer.deserialize(in, version)); + addMutation(unsplitMutations, writtenAt, Mutation.serializer.deserialize(in, version)); } } @@ -412,19 +513,41 @@ private int addMutations(int version, List serializedMutations) thro // Remove CFs that have been truncated since. writtenAt and SystemTable#getTruncatedAt() both return millis. // We don't abort the replay entirely b/c this can be considered a success (truncated is same as delivered then // truncated. - private void addMutation(Mutation mutation) + private static void addMutation(List unsplitMutations, long writtenAt, Mutation mutation) { for (TableId tableId : mutation.getTableIds()) if (writtenAt <= SystemKeyspace.getTruncatedAt(tableId)) mutation = mutation.without(tableId); - if (!mutation.isEmpty()) - mutations.add(mutation); + if (mutation != null) + unsplitMutations.add(mutation); + } + + // Write the hint assuming that when it is replayed it will probably be replayed + // as an Accord transaction so no reason to record per endpoint hints for all the endpoints + // Hints will still have to split and re-route on replay + private void writeHintsForUndeliveredAccordTxns(Set hintedNodes) + { + if (accordMutations == null) + return; + + int gcgs = gcgs(accordMutations); + + // expired + if (MILLISECONDS.toSeconds(writtenAt) + gcgs <= FBUtilities.nowInSeconds()) + return; + + for (Mutation m : accordMutations) + HintsService.instance.write(ImmutableList.of(RETRY_ON_DIFFERENT_SYSTEM_UUID), Hint.create(m, writtenAt)); + hintedNodes.add(RETRY_ON_DIFFERENT_SYSTEM_UUID); } private void writeHintsForUndeliveredEndpoints(int startFrom, Set hintedNodes) { - int gcgs = gcgs(mutations); + if (normalMutations == null) + return; + + int gcgs = gcgs(normalMutations); // expired if (MILLISECONDS.toSeconds(writtenAt) + gcgs <= FBUtilities.nowInSeconds()) @@ -434,7 +557,7 @@ private void writeHintsForUndeliveredEndpoints(int startFrom, Set hintedNo for (int i = startFrom; i < replayHandlers.size(); i++) { ReplayWriteResponseHandler handler = replayHandlers.get(i); - Mutation undeliveredMutation = mutations.get(i); + Mutation undeliveredMutation = normalMutations.get(i); if (handler != null) { diff --git a/src/java/org/apache/cassandra/concurrent/ExecutionFailure.java b/src/java/org/apache/cassandra/concurrent/ExecutionFailure.java index 27ab885e234e..dc1262b3c1b2 100644 --- a/src/java/org/apache/cassandra/concurrent/ExecutionFailure.java +++ b/src/java/org/apache/cassandra/concurrent/ExecutionFailure.java @@ -19,6 +19,7 @@ package org.apache.cassandra.concurrent; import java.util.concurrent.Callable; +import java.util.concurrent.CancellationException; import java.util.concurrent.Future; import org.apache.cassandra.concurrent.DebuggableTask.RunnableDebuggableTask; @@ -26,6 +27,7 @@ import org.slf4j.LoggerFactory; import org.apache.cassandra.db.compaction.CompactionInterruptedException; +import org.apache.cassandra.exceptions.RequestTimeoutException; import org.apache.cassandra.utils.Closeable; import org.apache.cassandra.utils.JVMStabilityInspector; import org.apache.cassandra.utils.WithResources; @@ -49,6 +51,9 @@ public static void handle(Throwable t) { try { + if (t instanceof RequestTimeoutException || t instanceof CancellationException) + return; + if (t instanceof CompactionInterruptedException) { // TODO: should we check to see there aren't nested CompactionInterruptedException? diff --git a/src/java/org/apache/cassandra/concurrent/ExecutorFactory.java b/src/java/org/apache/cassandra/concurrent/ExecutorFactory.java index ec3b9c370be1..0b72961e13cc 100644 --- a/src/java/org/apache/cassandra/concurrent/ExecutorFactory.java +++ b/src/java/org/apache/cassandra/concurrent/ExecutorFactory.java @@ -18,15 +18,15 @@ package org.apache.cassandra.concurrent; -import org.apache.cassandra.concurrent.InfiniteLoopExecutor.Daemon; import org.apache.cassandra.concurrent.InfiniteLoopExecutor.Interrupts; import org.apache.cassandra.concurrent.InfiniteLoopExecutor.SimulatorSafe; import org.apache.cassandra.utils.JVMStabilityInspector; import org.apache.cassandra.utils.Shared; -import static java.lang.Thread.*; +import static java.lang.Thread.NORM_PRIORITY; +import static java.lang.Thread.UncaughtExceptionHandler; import static org.apache.cassandra.concurrent.ExecutorFactory.SimulatorSemantics.NORMAL; -import static org.apache.cassandra.concurrent.InfiniteLoopExecutor.Daemon.DAEMON; +import static org.apache.cassandra.concurrent.ExecutorFactory.SystemThreadTag.DAEMON; import static org.apache.cassandra.concurrent.InfiniteLoopExecutor.Interrupts.UNSYNCHRONIZED; import static org.apache.cassandra.concurrent.NamedThreadFactory.createThread; import static org.apache.cassandra.concurrent.NamedThreadFactory.setupThread; @@ -78,6 +78,17 @@ public enum SimulatorSemantics NORMAL, DISCARD } + /// Simulator Tag specifies the nature of the created thread: + /// - JOB threads are short-lived and simulation treats them as sub tasks of the task that creates them, + /// so that the strictly ordered property of the simulator ensures the thread terminates before the next + /// task of its parent is scheduled. + /// - DAEMON threads are treated as background tasks, and are neither linked to their parent task or the Work phase that creates them. + /// - INFINITE_LOOP threads detach from their parent task as they are expected to run forever, but unlike DAEMON threads must have + /// no active work for a given Work phase to complete. + public enum SimulatorThreadTag { JOB, DAEMON, INFINITE_LOOP } + + public enum SystemThreadTag { DAEMON, NON_DAEMON } + /** * @return a factory that configures executors that propagate {@link ExecutorLocals} to the executing thread */ @@ -124,10 +135,11 @@ public enum SimulatorSemantics * Create and start a new thread to execute {@code runnable} * @param name the name of the thread * @param runnable the task to execute - * @param daemon flag to indicate whether the thread should be a daemon or not + * @param systemTag flag to indicate whether the loop thread should be a daemon thread or not + * @param simulatorTag flag to indicate the nature of the specific thread to help simulate it * @return the new thread */ - Thread startThread(String name, Runnable runnable, Daemon daemon); + Thread startThread(String name, Runnable runnable, SystemThreadTag systemTag, SimulatorThreadTag simulatorTag); /** * Create and start a new thread to execute {@code runnable}; this thread will be a daemon thread. @@ -137,7 +149,7 @@ public enum SimulatorSemantics */ default Thread startThread(String name, Runnable runnable) { - return startThread(name, runnable, DAEMON); + return startThread(name, runnable, DAEMON, SimulatorThreadTag.JOB); } /** @@ -148,14 +160,14 @@ default Thread startThread(String name, Runnable runnable) * @param name the name of the thread used to invoke the task repeatedly * @param task the task to execute repeatedly * @param simulatorSafe flag indicating if the loop thread can be intercepted / rescheduled during cluster simulation - * @param daemon flag to indicate whether the loop thread should be a daemon thread or not + * @param systemTag flag to indicate whether the loop thread should be a daemon thread or not * @param interrupts flag to indicate whether to synchronize interrupts of the task execution thread * using the task's monitor this can be used to prevent interruption while performing * IO operations which forbid interrupted threads. * See: {@link org.apache.cassandra.db.commitlog.AbstractCommitLogSegmentManager#start} * @return the new thread */ - Interruptible infiniteLoop(String name, Interruptible.Task task, SimulatorSafe simulatorSafe, Daemon daemon, Interrupts interrupts); + Interruptible infiniteLoop(String name, Interruptible.Task task, SimulatorSafe simulatorSafe, SystemThreadTag systemTag, Interrupts interrupts); /** * Create and start a new InfiniteLoopExecutor to repeatedly invoke {@code runnable}. @@ -291,9 +303,9 @@ public ScheduledExecutorPlus scheduled(boolean executeOnShutdown, String name, i } @Override - public Thread startThread(String name, Runnable runnable, Daemon daemon) + public Thread startThread(String name, Runnable runnable, SystemThreadTag systemTag, SimulatorThreadTag simulatorTag) { - Thread thread = setupThread(createThread(threadGroup, runnable, name, daemon == DAEMON), + Thread thread = setupThread(createThread(threadGroup, runnable, name, systemTag == DAEMON), Thread.NORM_PRIORITY, contextClassLoader, uncaughtExceptionHandler); @@ -302,9 +314,9 @@ public Thread startThread(String name, Runnable runnable, Daemon daemon) } @Override - public Interruptible infiniteLoop(String name, Interruptible.Task task, SimulatorSafe simulatorSafe, Daemon daemon, Interrupts interrupts) + public Interruptible infiniteLoop(String name, Interruptible.Task task, SimulatorSafe simulatorSafe, SystemThreadTag systemTag, Interrupts interrupts) { - return new InfiniteLoopExecutor(this, name, task, daemon, interrupts); + return new InfiniteLoopExecutor(this, name, task, systemTag, interrupts); } @Override diff --git a/src/java/org/apache/cassandra/concurrent/InfiniteLoopExecutor.java b/src/java/org/apache/cassandra/concurrent/InfiniteLoopExecutor.java index ac10a70c3066..9d8701cb88e4 100644 --- a/src/java/org/apache/cassandra/concurrent/InfiniteLoopExecutor.java +++ b/src/java/org/apache/cassandra/concurrent/InfiniteLoopExecutor.java @@ -28,6 +28,8 @@ import java.util.function.BiFunction; import java.util.function.Consumer; +import org.apache.cassandra.concurrent.ExecutorFactory.SimulatorThreadTag; +import org.apache.cassandra.concurrent.ExecutorFactory.SystemThreadTag; import org.apache.cassandra.utils.Shared; import org.apache.cassandra.utils.concurrent.Condition; import org.apache.cassandra.utils.concurrent.UncheckedInterruptedException; @@ -52,9 +54,6 @@ public enum InternalState { SHUTTING_DOWN_NOW, TERMINATED } @Shared(scope = Shared.Scope.SIMULATION) public enum SimulatorSafe { SAFE, UNSAFE } - @Shared(scope = Shared.Scope.SIMULATION) - public enum Daemon { DAEMON, NON_DAEMON } - @Shared(scope = Shared.Scope.SIMULATION) public enum Interrupts { SYNCHRONIZED, UNSYNCHRONIZED } @@ -65,20 +64,20 @@ public enum Interrupts { SYNCHRONIZED, UNSYNCHRONIZED } private final Consumer interruptHandler; private final Condition isTerminated = newOneTimeCondition(); - public InfiniteLoopExecutor(String name, Task task, Daemon daemon) + public InfiniteLoopExecutor(String name, Task task, SystemThreadTag systemTag) { - this(ExecutorFactory.Global.executorFactory(), name, task, daemon, UNSYNCHRONIZED); + this(ExecutorFactory.Global.executorFactory(), name, task, systemTag, UNSYNCHRONIZED); } - public InfiniteLoopExecutor(ExecutorFactory factory, String name, Task task, Daemon daemon) + public InfiniteLoopExecutor(ExecutorFactory factory, String name, Task task, SystemThreadTag systemTag) { - this(factory, name, task, daemon, UNSYNCHRONIZED); + this(factory, name, task, systemTag, UNSYNCHRONIZED); } - public InfiniteLoopExecutor(ExecutorFactory factory, String name, Task task, Daemon daemon, Interrupts interrupts) + public InfiniteLoopExecutor(ExecutorFactory factory, String name, Task task, SystemThreadTag systemTag, Interrupts interrupts) { this.task = task; - this.thread = factory.startThread(name, this::loop, daemon); + this.thread = factory.startThread(name, this::loop, systemTag, SimulatorThreadTag.INFINITE_LOOP); this.interruptHandler = interrupts == SYNCHRONIZED ? interruptHandler(task) : Thread::interrupt; @@ -103,7 +102,6 @@ private static Consumer interruptHandler(final Object monitor) }; } - private void loop() { boolean interrupted = false; @@ -189,6 +187,11 @@ public boolean awaitTermination(long time, TimeUnit unit) throws InterruptedExce return isTerminated(); } + public long threadId() + { + return thread.getId(); + } + @VisibleForTesting public boolean isAlive() { diff --git a/src/java/org/apache/cassandra/net/ManyToOneConcurrentLinkedQueue.java b/src/java/org/apache/cassandra/concurrent/ManyToOneConcurrentLinkedQueue.java similarity index 97% rename from src/java/org/apache/cassandra/net/ManyToOneConcurrentLinkedQueue.java rename to src/java/org/apache/cassandra/concurrent/ManyToOneConcurrentLinkedQueue.java index 4c73bdc9cd2e..8615e99c22f8 100644 --- a/src/java/org/apache/cassandra/net/ManyToOneConcurrentLinkedQueue.java +++ b/src/java/org/apache/cassandra/concurrent/ManyToOneConcurrentLinkedQueue.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.cassandra.net; +package org.apache.cassandra.concurrent; import java.util.Collection; import java.util.Iterator; @@ -37,12 +37,12 @@ * In addition to that, provides a {@link #relaxedPeekLastAndOffer(Object)} method that we use to avoid a CAS when * putting message handlers onto the wait queue. */ -class ManyToOneConcurrentLinkedQueue extends ManyToOneConcurrentLinkedQueueHead implements Queue +public class ManyToOneConcurrentLinkedQueue extends ManyToOneConcurrentLinkedQueueHead implements Queue { @SuppressWarnings("unused") // pad two cache lines after the head to prevent false sharing protected long p31, p32, p33, p34, p35, p36, p37, p38, p39, p40, p41, p42, p43, p44, p45; - ManyToOneConcurrentLinkedQueue() + public ManyToOneConcurrentLinkedQueue() { head = tail = new Node<>(null); } @@ -63,7 +63,7 @@ public boolean isEmpty() * - {@code false} result indicates that the queue MIGHT BE non-empty - the value of {@code head} might * not yet have been made externally visible by the consumer thread. */ - boolean relaxedIsEmpty() + public boolean relaxedIsEmpty() { return null == head.next; } @@ -156,7 +156,7 @@ public boolean remove(Object o) * Yields no performance benefit over invoking {@link #poll()} manually - there just isn't * anything to meaningfully amortise on the consumer side of this queue. */ - void drain(Consumer consumer) + public void drain(Consumer consumer) { E item; while ((item = poll()) != null) @@ -181,7 +181,7 @@ public boolean offer(E e) * * @return previously last tail item in the queue, potentially stale */ - E relaxedPeekLastAndOffer(E e) + public E relaxedPeekLastAndOffer(E e) { return internalOffer(e); } diff --git a/src/java/org/apache/cassandra/concurrent/Shutdownable.java b/src/java/org/apache/cassandra/concurrent/Shutdownable.java index 185875b791d2..a72253fc87e9 100644 --- a/src/java/org/apache/cassandra/concurrent/Shutdownable.java +++ b/src/java/org/apache/cassandra/concurrent/Shutdownable.java @@ -19,7 +19,9 @@ package org.apache.cassandra.concurrent; import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; +import org.apache.cassandra.utils.ExecutorUtils; import org.apache.cassandra.utils.Shared; import static org.apache.cassandra.utils.Shared.Scope.SIMULATION; @@ -29,6 +31,11 @@ public interface Shutdownable { boolean isTerminated(); + default boolean isShutdown() + { + return isTerminated(); + } + /** * Shutdown once any remaining work has completed (however this is defined for the implementation). */ @@ -42,5 +49,10 @@ public interface Shutdownable /** * Await termination of this object, i.e. the cessation of all current and future work. */ - public boolean awaitTermination(long timeout, TimeUnit units) throws InterruptedException; + boolean awaitTermination(long timeout, TimeUnit units) throws InterruptedException; + + default void shutdownAndWait(long timeout, TimeUnit unit) throws InterruptedException, TimeoutException + { + ExecutorUtils.shutdownAndWait(timeout, unit, this); + } } diff --git a/src/java/org/apache/cassandra/concurrent/SingleThreadExecutorPlus.java b/src/java/org/apache/cassandra/concurrent/SingleThreadExecutorPlus.java index eb2827774a59..553855ad7bc5 100644 --- a/src/java/org/apache/cassandra/concurrent/SingleThreadExecutorPlus.java +++ b/src/java/org/apache/cassandra/concurrent/SingleThreadExecutorPlus.java @@ -25,6 +25,7 @@ public class SingleThreadExecutorPlus extends ThreadPoolExecutorPlus implements { public static class AtLeastOnce extends AtomicBoolean implements AtLeastOnceTrigger, Runnable { + private static final long serialVersionUID = 0; // for simulator support protected final SequentialExecutorPlus executor; protected final Runnable run; diff --git a/src/java/org/apache/cassandra/concurrent/Stage.java b/src/java/org/apache/cassandra/concurrent/Stage.java index 23f80b5a575f..557c1ca0a48b 100644 --- a/src/java/org/apache/cassandra/concurrent/Stage.java +++ b/src/java/org/apache/cassandra/concurrent/Stage.java @@ -43,23 +43,23 @@ public enum Stage { - READ (false, "ReadStage", "request", DatabaseDescriptor::getConcurrentReaders, DatabaseDescriptor::setConcurrentReaders, Stage::multiThreadedLowSignalStage), - MUTATION (true, "MutationStage", "request", DatabaseDescriptor::getConcurrentWriters, DatabaseDescriptor::setConcurrentWriters, Stage::multiThreadedLowSignalStage), - COUNTER_MUTATION (true, "CounterMutationStage", "request", DatabaseDescriptor::getConcurrentCounterWriters, DatabaseDescriptor::setConcurrentCounterWriters, Stage::multiThreadedLowSignalStage), - VIEW_MUTATION (true, "ViewMutationStage", "request", DatabaseDescriptor::getConcurrentViewWriters, DatabaseDescriptor::setConcurrentViewWriters, Stage::multiThreadedLowSignalStage), - GOSSIP (true, "GossipStage", "internal", () -> 1, null, Stage::singleThreadedStage), - REQUEST_RESPONSE (false, "RequestResponseStage", "request", FBUtilities::getAvailableProcessors, null, Stage::multiThreadedLowSignalStage), - ANTI_ENTROPY (false, "AntiEntropyStage", "internal", () -> 1, null, Stage::singleThreadedStage), - MIGRATION (false, "MigrationStage", "internal", () -> 1, null, Stage::migrationStage), - MISC (false, "MiscStage", "internal", () -> 1, null, Stage::singleThreadedStage), - TRACING (false, "TracingStage", "internal", () -> 1, null, Stage::tracingStage), - INTERNAL_RESPONSE (false, "InternalResponseStage", "internal", FBUtilities::getAvailableProcessors, null, Stage::multiThreadedStage), - IMMEDIATE (false, "ImmediateStage", "internal", () -> 0, null, Stage::immediateExecutor), - PAXOS_REPAIR (false, "PaxosRepairStage", "internal", FBUtilities::getAvailableProcessors, null, Stage::multiThreadedStage), - INTERNAL_METADATA (false, "InternalMetadataStage", "internal", FBUtilities::getAvailableProcessors, null, Stage::multiThreadedStage), - FETCH_LOG (false, "MetadataFetchLogStage", "internal", () -> 1, null, Stage::singleThreadedStage) + READ (false, "ReadStage", "request", DatabaseDescriptor::getConcurrentReaders, DatabaseDescriptor::setConcurrentReaders, Stage::multiThreadedLowSignalStage), + MUTATION (true, "MutationStage", "request", DatabaseDescriptor::getConcurrentWriters, DatabaseDescriptor::setConcurrentWriters, Stage::multiThreadedLowSignalStage), + COUNTER_MUTATION (true, "CounterMutationStage", "request", DatabaseDescriptor::getConcurrentCounterWriters, DatabaseDescriptor::setConcurrentCounterWriters, Stage::multiThreadedLowSignalStage), + VIEW_MUTATION (true, "ViewMutationStage", "request", DatabaseDescriptor::getConcurrentViewWriters, DatabaseDescriptor::setConcurrentViewWriters, Stage::multiThreadedLowSignalStage), + ACCORD_MIGRATION (false, "AccordMigrationStage", "request", DatabaseDescriptor::getAccordConcurrentOps, DatabaseDescriptor::setConcurrentAccordOps, Stage::multiThreadedLowSignalStage), + GOSSIP (true, "GossipStage", "internal", () -> 1, null, Stage::singleThreadedStage), + REQUEST_RESPONSE (false, "RequestResponseStage", "request", FBUtilities::getAvailableProcessors, null, Stage::multiThreadedLowSignalStage), + ANTI_ENTROPY (false, "AntiEntropyStage", "internal", () -> 1, null, Stage::singleThreadedStage), + MIGRATION (false, "MigrationStage", "internal", () -> 1, null, Stage::migrationStage), + MISC (false, "MiscStage", "internal", () -> 1, null, Stage::singleThreadedStage), + TRACING (false, "TracingStage", "internal", () -> 1, null, Stage::tracingStage), + INTERNAL_RESPONSE (false, "InternalResponseStage", "internal", FBUtilities::getAvailableProcessors, null, Stage::multiThreadedStage), + IMMEDIATE (false, "ImmediateStage", "internal", () -> 0, null, Stage::immediateExecutor), + PAXOS_REPAIR (false, "PaxosRepairStage", "internal", FBUtilities::getAvailableProcessors, null, Stage::multiThreadedStage), + INTERNAL_METADATA (false, "InternalMetadataStage", "internal", FBUtilities::getAvailableProcessors, null, Stage::multiThreadedStage), + FETCH_METADATA (false, "MetadataFetchLogStage", "internal", () -> 1, null, Stage::singleThreadedStage), ; - public final String jmxName; private final Supplier executorSupplier; private volatile ExecutorPlus executor; diff --git a/src/java/org/apache/cassandra/concurrent/SyncFutureTask.java b/src/java/org/apache/cassandra/concurrent/SyncFutureTask.java index 422da99fb806..8176913de7e1 100644 --- a/src/java/org/apache/cassandra/concurrent/SyncFutureTask.java +++ b/src/java/org/apache/cassandra/concurrent/SyncFutureTask.java @@ -71,7 +71,11 @@ public void run() catch (Throwable t) { tryFailure(t); - ExecutionFailure.handle(t); + // A lot of exceptions are expected and will be handled by Cassandra + // by consuming the result of the future task so only treat Error + // as uncaught + if (t instanceof Error) + ExecutionFailure.handle(t); } } diff --git a/src/java/org/apache/cassandra/config/AccordSpec.java b/src/java/org/apache/cassandra/config/AccordSpec.java new file mode 100644 index 000000000000..a85437624fd6 --- /dev/null +++ b/src/java/org/apache/cassandra/config/AccordSpec.java @@ -0,0 +1,273 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.config; + +import java.util.concurrent.TimeUnit; + +import accord.utils.Invariants; +import com.fasterxml.jackson.annotation.JsonIgnore; +import org.apache.cassandra.journal.Params; +import org.apache.cassandra.service.accord.serializers.Version; +import org.apache.cassandra.service.consensus.TransactionalMode; + +import static org.apache.cassandra.config.AccordSpec.QueueShardModel.THREAD_POOL_PER_SHARD; +import static org.apache.cassandra.config.AccordSpec.QueueSubmissionModel.SYNC; + +public class AccordSpec +{ + public volatile boolean enabled = false; + + public volatile String journal_directory; + + public volatile boolean enable_journal_compaction = true; + + /** + * Enables the virtual Accord debug-only keyspace with tables + * that expose internal state to aid the developers working + * on Accord implementation. + *

+ * These tables can and will change and/or go away at any point, + * including in a minor release, are not to be considered part of the API, + * and are NOT to be relied on for anything. + *

+ * Only enable this keyspace if you are working on Accord and + * need to debug an issue with Accord implementation, or if an Accord + * developer asked you to. + */ + public boolean enable_virtual_debug_only_keyspace = false; + + public enum QueueShardModel + { + /** + * Same number of threads as queue shards, but the shard lock is held only while managing the queue, + * so that submitting threads may queue load/save work. + * + * The global READ and WRITE stages are used for IO. + */ + THREAD_PER_SHARD, + + /** + * Same number of threads as shards, and the shard lock is held for the duration of serving requests. + * The global READ and WRITE stages are used for IO. + */ + THREAD_PER_SHARD_SYNC_QUEUE, + + /** + * More threads than shards. Threads update transaction state as well as performing IO, minimising context switching. + * Fewer shards is generally better, until queue-contention is encountered. + */ + THREAD_POOL_PER_SHARD, + + /** + * More threads than shards. Threads update transaction state only, relying on READ and WRITE stages for IO. + * Fewer shards is generally better, until queue-contention is encountered. + */ + THREAD_POOL_PER_SHARD_EXCLUDES_IO, + } + + public enum QueueSubmissionModel + { + /** + * The queue workers and all submissions require ownership of the lock. + */ + SYNC, + + /** + * The queue workers and some submissions require ownership of the lock. + * That is, if the lock is available on submission we take it; if it is not we try to guarantee that + * another thread will witness the work submission promptly, but if we cannot we wait for the lock + * to ensure work is scheduled. + */ + SEMI_SYNC, + + /** + * The queue workers only require ownership of the lock, submissions happens fully asynchronously. + */ + ASYNC, + + /** + * The queue is backed by submission to a single-threaded plain executor. + * This implementation does not honur the sharding model option. + * + * Note: this isn't intended to be used by real clusters. + */ + EXEC_ST + } + + public QueueShardModel queue_shard_model = THREAD_POOL_PER_SHARD; + public QueueSubmissionModel queue_submission_model = SYNC; + + /** + * The number of queue (and cache) shards. + */ + public volatile OptionaldPositiveInt queue_shard_count = OptionaldPositiveInt.UNDEFINED; + + /** + * The target number of command stores to create per topology shard. + * This determines the amount of execution parallelism possible for a given table/shard on the host. + * More shards means more parallelism, but more state. + * + * TODO (expected): make this a table property + * TODO (expected): adjust this by proportion of ring + */ + public volatile OptionaldPositiveInt command_store_shard_count = OptionaldPositiveInt.UNDEFINED; + + public volatile OptionaldPositiveInt max_queued_loads = OptionaldPositiveInt.UNDEFINED; + public volatile OptionaldPositiveInt max_queued_range_loads = OptionaldPositiveInt.UNDEFINED; + + public DataStorageSpec.LongMebibytesBound cache_size = null; + public DataStorageSpec.LongMebibytesBound working_set_size = null; + public boolean shrink_cache_entries_before_eviction = true; + + public DurationSpec.IntMillisecondsBound range_syncpoint_timeout = new DurationSpec.IntMillisecondsBound("3m"); + public DurationSpec.IntMillisecondsBound repair_timeout = new DurationSpec.IntMillisecondsBound("10m"); + public String recover_txn = "5s*attempts <= 60s"; + public StringRetryStrategy recover_syncpoint = new StringRetryStrategy("60s <= 30s*attempts...60s*attempts <= 600s"); + public String fetch_txn = "1s*attempts"; + public String fetch_syncpoint = "5s*attempts"; + public String expire_txn = "5s*attempts"; + public String expire_syncpoint = "60s*attempts<=300s"; + public String expire_epoch_wait = "10s"; + // we don't want to wait ages for durability as it blocks other durability progress; even this might be too long, as we can always retry + public String expire_durability = "10s*attempts <= 30s"; + public String slow_syncpoint_preaccept = "10s"; + public String slow_txn_preaccept = "30ms <= p50*2 <= 100ms"; + public String slow_read = "30ms <= p50*2 <= 100ms"; + public StringRetryStrategy retry_syncpoint = new StringRetryStrategy("10s*attempts <= 600s"); + public StringRetryStrategy retry_durability = new StringRetryStrategy("10s*attempts <= 600s"); + public StringRetryStrategy retry_bootstrap = new StringRetryStrategy("10s*attempts <= 600s"); + public StringRetryStrategy retry_fetch_min_epoch = new StringRetryStrategy("200ms...1s*attempts <= 1s,retries=3"); + public StringRetryStrategy retry_fetch_topology = new StringRetryStrategy("200ms...1s*attempts <= 1s,retries=100"); + + public volatile DurationSpec.IntSecondsBound fast_path_update_delay = null; + + public volatile DurationSpec.IntSecondsBound gc_delay = new DurationSpec.IntSecondsBound("5m"); + public volatile int shard_durability_target_splits = 16; + public volatile DurationSpec.IntSecondsBound durability_txnid_lag = new DurationSpec.IntSecondsBound(5); + public volatile DurationSpec.IntSecondsBound shard_durability_cycle = new DurationSpec.IntSecondsBound(5, TimeUnit.MINUTES); + public volatile DurationSpec.IntSecondsBound global_durability_cycle = new DurationSpec.IntSecondsBound(5, TimeUnit.MINUTES); + + public enum TransactionalRangeMigration + { + auto, explicit + } + + /** + * Defines the behavior of range migration opt-in when changing transactional settings on a table. In auto, + * all ranges are marked as migrating and no additional user action is needed aside from running repairs. In + * explicit, no ranges are marked as migrating, and the user needs to explicitly mark ranges as migrating to + * the target transactional mode via nodetool. + */ + public volatile TransactionalRangeMigration range_migration = TransactionalRangeMigration.auto; + + /** + * default transactional mode for tables created by this node when no transactional mode has been specified in the DDL + */ + public TransactionalMode default_transactional_mode = TransactionalMode.off; + public boolean ephemeralReadEnabled = true; + public boolean state_cache_listener_jfr_enabled = true; + public final JournalSpec journal = new JournalSpec(); + + public static class JournalSpec implements Params + { + public int segmentSize = 32 << 20; + public FailurePolicy failurePolicy = FailurePolicy.STOP; + public FlushMode flushMode = FlushMode.PERIODIC; + public volatile DurationSpec flushPeriod; // pulls default from 'commitlog_sync_period' + public DurationSpec periodicFlushLagBlock = new DurationSpec.IntMillisecondsBound("1500ms"); + public DurationSpec.IntMillisecondsBound compactionPeriod = new DurationSpec.IntMillisecondsBound("60000ms"); + private volatile long flushCombinedBlockPeriod = Long.MIN_VALUE; + public Version version = Version.DOWNGRADE_SAFE_VERSION; + + public void setFlushPeriod(DurationSpec newFlushPeriod) + { + flushPeriod = newFlushPeriod; + flushCombinedBlockPeriod = Long.MIN_VALUE; + } + + public void setPeriodicFlushLagBlock(DurationSpec newPeriodicFlushLagBlock) + { + periodicFlushLagBlock = newPeriodicFlushLagBlock; + flushCombinedBlockPeriod = Long.MIN_VALUE; + } + + @Override + public int segmentSize() + { + return segmentSize; + } + + @Override + public FailurePolicy failurePolicy() + { + return failurePolicy; + } + + @Override + public FlushMode flushMode() + { + return flushMode; + } + + @Override + public boolean enableCompaction() + { + return DatabaseDescriptor.getAccord().enable_journal_compaction; + } + + @Override + public long compactionPeriod(TimeUnit unit) + { + return compactionPeriod.to(unit); + } + + @JsonIgnore + @Override + public long flushPeriod(TimeUnit units) + { + return flushPeriod.to(units); + } + + @JsonIgnore + @Override + public long periodicBlockPeriod(TimeUnit units) + { + long nanos = flushCombinedBlockPeriod; + if (nanos >= 0) + return units.convert(nanos, TimeUnit.NANOSECONDS); + + long flushPeriodNanos = flushPeriod(TimeUnit.NANOSECONDS); + Invariants.require(flushPeriodNanos > 0); + nanos = periodicFlushLagBlock.to(TimeUnit.NANOSECONDS) + flushPeriodNanos; + // it is possible for this to race and cache the wrong value after an update + flushCombinedBlockPeriod = nanos; + return nanos; + } + + /** + * This is required by the journal, but we don't have multiple versions, so block it from showing up, so we don't need to worry about maintaining it + */ + @JsonIgnore + @Override + public int userVersion() + { + return version.version; + } + } +} diff --git a/src/java/org/apache/cassandra/config/CassandraRelevantProperties.java b/src/java/org/apache/cassandra/config/CassandraRelevantProperties.java index e83dfaf53aba..e2957ce95f4f 100644 --- a/src/java/org/apache/cassandra/config/CassandraRelevantProperties.java +++ b/src/java/org/apache/cassandra/config/CassandraRelevantProperties.java @@ -25,9 +25,10 @@ import com.google.common.primitives.Ints; +import accord.utils.Invariants; import org.apache.cassandra.db.virtual.LogMessagesTable; +import org.apache.cassandra.db.virtual.SlowQueriesTable; import org.apache.cassandra.exceptions.ConfigurationException; -import org.apache.cassandra.service.FileSystemOwnershipCheck; import org.apache.cassandra.utils.FBUtilities; import org.apache.cassandra.utils.StorageCompatibilityMode; @@ -38,6 +39,12 @@ /** A class that extracts system properties for the cassandra node it runs within. */ public enum CassandraRelevantProperties { + ACCORD_AGENT_CLASS("cassandra.test.accord.agent"), + ACCORD_ALLOW_TEST_MODES("cassandra.test.accord.allow_test_modes", "false"), + ACCORD_KEY_PARANOIA_COSTFACTOR(Invariants.KEY_PARANOIA_COSTFACTOR), + ACCORD_KEY_PARANOIA_CPU(Invariants.KEY_PARANOIA_CPU), + ACCORD_KEY_PARANOIA_MEMORY(Invariants.KEY_PARANOIA_MEMORY), + ACCORD_REPAIR_RANGE_STEP_UPDATE_INTERVAL("cassandra.accord.repair.range_step_update_interval", "100"), ACQUIRE_RETRY_SECONDS("cassandra.acquire_retry_seconds", "60"), ACQUIRE_SLEEP_MS("cassandra.acquire_sleep_ms", "1000"), ALLOCATE_TOKENS_FOR_KEYSPACE("cassandra.allocate_tokens_for_keyspace"), @@ -53,13 +60,14 @@ public enum CassandraRelevantProperties ALLOW_UNSAFE_TRANSIENT_CHANGES("cassandra.allow_unsafe_transient_changes"), APPROXIMATE_TIME_PRECISION_MS("cassandra.approximate_time_precision_ms", "2"), /** 2 ** GENSALT_LOG2_ROUNDS rounds of hashing will be performed. */ - AUTH_BCRYPT_GENSALT_LOG2_ROUNDS("cassandra.auth_bcrypt_gensalt_log2_rounds"), + AUTH_BCRYPT_GENSALT_LOG2_ROUNDS("cassandra.auth_bcrypt_gensalt_log2_rounds", "4"), /** We expect default values on cache retries and interval to be sufficient for everyone but have this escape hatch just in case. */ AUTH_CACHE_WARMING_MAX_RETRIES("cassandra.auth_cache.warming.max_retries"), AUTH_CACHE_WARMING_RETRY_INTERVAL_MS("cassandra.auth_cache.warming.retry_interval_ms"), AUTOCOMPACTION_ON_STARTUP_ENABLED("cassandra.autocompaction_on_startup_enabled", "true"), AUTO_BOOTSTRAP("cassandra.auto_bootstrap"), AUTO_REPAIR_FREQUENCY_SECONDS("cassandra.auto_repair_frequency_seconds", convertToString(TimeUnit.MINUTES.toSeconds(5))), + BATCHLOG_REPLAY_INTERVAL_MS("cassandra.batchlog.replay_interval_ms", "10000"), BATCHLOG_REPLAY_TIMEOUT_IN_MS("cassandra.batchlog.replay_timeout_in_ms"), BATCH_COMMIT_LOG_SYNC_INTERVAL("cassandra.batch_commitlog_sync_interval_millis", "1000"), /** @@ -212,6 +220,8 @@ public enum CassandraRelevantProperties */ DRAIN_EXECUTOR_TIMEOUT_MS("cassandra.drain_executor_timeout_ms", convertToString(TimeUnit.MINUTES.toMillis(5))), DROP_OVERSIZED_READ_REPAIR_MUTATIONS("cassandra.drop_oversized_readrepair_mutations"), + DTEST_ACCORD_ENABLED("jvm_dtest.accord.enabled", "true"), + DTEST_ACCORD_JOURNAL_SANITY_CHECK_ENABLED("jvm_dtest.accord.journal_sanity_check_enabled", "false"), DTEST_API_LOG_TOPOLOGY("cassandra.dtest.api.log.topology"), /** This property indicates if the code is running under the in-jvm dtest framework */ DTEST_IS_IN_JVM_DTEST("org.apache.cassandra.dtest.is_in_jvm_dtest"), @@ -236,11 +246,11 @@ public enum CassandraRelevantProperties /** @deprecated should be removed in favor of flags in relevant startup check (FileSystemOwnershipCheck) */ /** @deprecated See CASSANDRA-17797 */ @Deprecated(since = "4.1") - FILE_SYSTEM_CHECK_OWNERSHIP_FILENAME("cassandra.fs_ownership_filename", FileSystemOwnershipCheck.DEFAULT_FS_OWNERSHIP_FILENAME), + FILE_SYSTEM_CHECK_OWNERSHIP_FILENAME("cassandra.fs_ownership_filename", ".cassandra_fs_ownership"), /** @deprecated should be removed in favor of flags in relevant startup check (FileSystemOwnershipCheck) */ /** @deprecated See CASSANDRA-17797 */ @Deprecated(since = "4.1") - FILE_SYSTEM_CHECK_OWNERSHIP_TOKEN(FileSystemOwnershipCheck.FILE_SYSTEM_CHECK_OWNERSHIP_TOKEN), + FILE_SYSTEM_CHECK_OWNERSHIP_TOKEN("CassandraOwnershipToken"), FORCE_DEFAULT_INDEXING_PAGE_SIZE("cassandra.force_default_indexing_page_size"), /** Used when running in Client mode and the system and schema keyspaces need to be initialized outside of their normal initialization path **/ FORCE_LOAD_LOCAL_KEYSPACES("cassandra.schema.force_load_local_keyspaces"), @@ -268,6 +278,8 @@ public enum CassandraRelevantProperties */ GOSSIP_SETTLE_POLL_SUCCESSES_REQUIRED("cassandra.gossip_settle_poll_success_required", "3"), + HINT_DISPATCH_INTERVAL_MS("cassandra.hint_dispatch_interval_ms", "10000"), + IGNORED_SCHEMA_CHECK_ENDPOINTS("cassandra.skip_schema_check_for_endpoints"), IGNORED_SCHEMA_CHECK_VERSIONS("cassandra.skip_schema_check_for_versions"), IGNORE_CORRUPTED_SCHEMA_TABLES("cassandra.ignore_corrupted_schema_tables"), @@ -332,6 +344,16 @@ public enum CassandraRelevantProperties /** Java Virtual Machine implementation name */ JAVA_VM_NAME("java.vm.name"), JOIN_RING("cassandra.join_ring", "true"), + + /** + * {@link StorageCompatibilityMode} mode sets how the node will behave, sstable or messaging versions to use etc according to a yaml setting. + * But many tests don't load the config hence we need to force it otherwise they would run always under the default. Config is null for junits + * that don't load the config. Get from env var that CI/build.xml sets. + * + * This is a dev/CI only property. Do not use otherwise. + */ + JUNIT_STORAGE_COMPATIBILITY_MODE("cassandra.junit_storage_compatibility_mode", StorageCompatibilityMode.CASSANDRA_4.toString()), + /** startup checks properties */ LIBJEMALLOC("cassandra.libjemalloc"), /** Line separator ("\n" on UNIX). */ @@ -342,6 +364,8 @@ public enum CassandraRelevantProperties LOG4J2_DISABLE_JMX_LEGACY("log4j2.disable.jmx"), LOG4J_SHUTDOWN_HOOK_ENABLED("log4j.shutdownHookEnabled"), LOGBACK_CONFIGURATION_FILE("logback.configurationFile"), + /** Maximum number of rows in system_views.slow_queries */ + LOGS_SLOW_QUERIES_VIRTUAL_TABLE_MAX_ROWS("cassandra.virtual.slow_queries.max.rows", convertToString(SlowQueriesTable.LOGS_VIRTUAL_TABLE_DEFAULT_ROWS)), /** Maximum number of rows in system_views.logs table */ LOGS_VIRTUAL_TABLE_MAX_ROWS("cassandra.virtual.logs.max.rows", convertToString(LogMessagesTable.LOGS_VIRTUAL_TABLE_DEFAULT_ROWS)), /** @@ -491,9 +515,18 @@ public enum CassandraRelevantProperties SERIALIZATION_EMPTY_TYPE_NONEMPTY_BEHAVIOR("cassandra.serialization.emptytype.nonempty_behavior"), SET_SEP_THREAD_NAME("cassandra.set_sep_thread_name", "true"), SHUTDOWN_ANNOUNCE_DELAY_IN_MS("cassandra.shutdown_announce_in_ms", "2000"), + SIMULATOR_SEED("cassandra.simulator.seed"), + SIMULATOR_STARTED("cassandra.simulator.started"), SIZE_RECORDER_INTERVAL("cassandra.size_recorder_interval", "300"), SKIP_AUTH_SETUP("cassandra.skip_auth_setup", "false"), SKIP_GC_INSPECTOR("cassandra.skip_gc_inspector", "false"), + + /** + * Do not try to calculate optimal streaming candidates. This can take a lot of time in some configs specially + * with vnodes. + */ + SKIP_OPTIMAL_STREAMING_CANDIDATES_CALCULATION("cassandra.skip_optimal_streaming_candidates_calculation", "false"), + SKIP_PAXOS_REPAIR_ON_TOPOLOGY_CHANGE("cassandra.skip_paxos_repair_on_topology_change"), /** If necessary for operational purposes, permit certain keyspaces to be ignored for paxos topology repairs. */ SKIP_PAXOS_REPAIR_ON_TOPOLOGY_CHANGE_KEYSPACES("cassandra.skip_paxos_repair_on_topology_change_keyspaces"), @@ -553,7 +586,7 @@ public enum CassandraRelevantProperties TCM_UNSAFE_BOOT_WITH_CLUSTERMETADATA("cassandra.unsafe_boot_with_clustermetadata", null), TCM_USE_ATOMIC_LONG_PROCESSOR("cassandra.test.use_atomic_long_processor", "false"), TCM_USE_NO_OP_REPLICATOR("cassandra.test.use_no_op_replicator", "false"), - + TEST_ACCORD_STORE_THREAD_CHECKS_ENABLED("cassandra.test.accord.store.thread_checks_enabled", "true"), TEST_BBFAILHELPER_ENABLED("test.bbfailhelper.enabled"), TEST_BLOB_SHARED_SEED("cassandra.test.blob.shared.seed", "42"), TEST_BYTEMAN_TRANSFORMATIONS_DEBUG("cassandra.test.byteman.transformations.debug"), @@ -566,6 +599,7 @@ public enum CassandraRelevantProperties TEST_COMPRESSION("cassandra.test.compression"), TEST_COMPRESSION_ALGO("cassandra.test.compression.algo", "lz4"), TEST_DEBUG_REF_COUNT("cassandra.debugrefcount"), + TEST_DEBUG_REF_EVENTS("cassandra.debug.refevents"), TEST_DRIVER_CONNECTION_TIMEOUT_MS("cassandra.test.driver.connection_timeout_ms", "5000"), TEST_DRIVER_READ_TIMEOUT_MS("cassandra.test.driver.read_timeout_ms", "12000"), TEST_ENCRYPTION("cassandra.test.encryption", "false"), @@ -577,6 +611,8 @@ public enum CassandraRelevantProperties * can be also done manually for that particular case: {@code flush(SchemaConstants.SCHEMA_KEYSPACE_NAME);}. */ TEST_FLUSH_LOCAL_SCHEMA_CHANGES("cassandra.test.flush_local_schema_changes", "true"), TEST_HARRY_SWITCH_AFTER("cassandra.test.harry.progression.switch-after", "1"), + TEST_HISTORY_VALIDATOR_LOGGING_ENABLED("cassandra.test.history_validator.logging.enabled", "false"), + TEST_IGNORE_SIGAR("cassandra.test.ignore_sigar"), TEST_INTERVAL_TREE_EXPENSIVE_CHECKS("cassandra.test.interval_tree_expensive_checks"), TEST_INVALID_LEGACY_SSTABLE_ROOT("invalid-legacy-sstable-root"), TEST_JVM_DTEST_DISABLE_SSL("cassandra.test.disable_ssl"), @@ -585,9 +621,11 @@ public enum CassandraRelevantProperties TEST_ORG_CAFFINITAS_OHC_SEGMENTCOUNT("org.caffinitas.ohc.segmentCount"), TEST_PRESERVE_THREAD_CREATION_STACKTRACE("cassandra.test.preserve_thread_creation_stacktrace", "false"), TEST_RANDOM_SEED("cassandra.test.random.seed"), + TEST_RANGE_EXPENSIVE_CHECKS("cassandra.test.range_expensive_checks"), TEST_READ_ITERATION_DELAY_MS("cassandra.test.read_iteration_delay_ms", "0"), TEST_REUSE_PREPARED("cassandra.test.reuse_prepared", "true"), TEST_ROW_CACHE_SIZE("cassandra.test.row_cache_size"), + TEST_SEED("cassandra.test.seed"), TEST_SERIALIZATION_WRITES("cassandra.test-serialization-writes"), TEST_SIMULATOR_DEBUG("cassandra.test.simulator.debug"), TEST_SIMULATOR_DETERMINISM_CHECK("cassandra.test.simulator.determinismcheck", "none"), diff --git a/src/java/org/apache/cassandra/config/Config.java b/src/java/org/apache/cassandra/config/Config.java index 2ec1d78e301f..a22b40fa27ee 100644 --- a/src/java/org/apache/cassandra/config/Config.java +++ b/src/java/org/apache/cassandra/config/Config.java @@ -32,6 +32,7 @@ import com.google.common.base.Joiner; import com.google.common.collect.ImmutableSet; import com.google.common.collect.Sets; + import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -42,10 +43,12 @@ import org.apache.cassandra.index.internal.CassandraIndex; import org.apache.cassandra.io.compress.BufferType; import org.apache.cassandra.io.sstable.format.big.BigFormat; +import org.apache.cassandra.repair.autorepair.AutoRepairConfig; import org.apache.cassandra.service.StartupChecks.StartupCheckType; import org.apache.cassandra.utils.StorageCompatibilityMode; import static org.apache.cassandra.config.CassandraRelevantProperties.AUTOCOMPACTION_ON_STARTUP_ENABLED; +import static org.apache.cassandra.config.CassandraRelevantProperties.CASSANDRA_AVAILABLE_PROCESSORS; import static org.apache.cassandra.config.CassandraRelevantProperties.FILE_CACHE_ENABLED; import static org.apache.cassandra.config.CassandraRelevantProperties.SKIP_PAXOS_REPAIR_ON_TOPOLOGY_CHANGE; import static org.apache.cassandra.config.CassandraRelevantProperties.SKIP_PAXOS_REPAIR_ON_TOPOLOGY_CHANGE_KEYSPACES; @@ -150,12 +153,16 @@ public static Set splitCommaDelimited(String src) @Replaces(oldName = "write_request_timeout_in_ms", converter = Converters.MILLIS_DURATION_LONG, deprecated = true) public volatile DurationSpec.LongMillisecondsBound write_request_timeout = new DurationSpec.LongMillisecondsBound("2000ms"); + public volatile DurationSpec.LongMillisecondsBound short_rpc_timeout = new DurationSpec.LongMillisecondsBound("1000ms"); + @Replaces(oldName = "counter_write_request_timeout_in_ms", converter = Converters.MILLIS_DURATION_LONG, deprecated = true) public volatile DurationSpec.LongMillisecondsBound counter_write_request_timeout = new DurationSpec.LongMillisecondsBound("5000ms"); @Replaces(oldName = "cas_contention_timeout_in_ms", converter = Converters.MILLIS_DURATION_LONG, deprecated = true) public volatile DurationSpec.LongMillisecondsBound cas_contention_timeout = new DurationSpec.LongMillisecondsBound("1800ms"); + public volatile DurationSpec.LongMillisecondsBound accord_preaccept_timeout = new DurationSpec.LongMillisecondsBound("1s"); + @Replaces(oldName = "truncate_request_timeout_in_ms", converter = Converters.MILLIS_DURATION_LONG, deprecated = true) public volatile DurationSpec.LongMillisecondsBound truncate_request_timeout = new DurationSpec.LongMillisecondsBound("60000ms"); @@ -176,8 +183,14 @@ public static Set splitCommaDelimited(String src) public volatile DurationSpec.LongMillisecondsBound cms_await_timeout = new DurationSpec.LongMillisecondsBound("120000ms"); public volatile int cms_default_max_retries = 10; - public volatile DurationSpec.IntMillisecondsBound cms_default_retry_backoff = new DurationSpec.IntMillisecondsBound("50ms"); + @Deprecated(since="5.1") + public volatile DurationSpec.IntMillisecondsBound cms_default_retry_backoff = null; + @Deprecated(since="5.1") + public volatile DurationSpec.IntMillisecondsBound cms_default_max_retry_backoff = null; + public String cms_retry_delay = "50ms*attempts <= 500ms ... 100ms*attempts <= 1s,retries=10"; + public volatile int epoch_aware_debounce_inflight_tracker_max_size = 100; + /** * How often we should snapshot the cluster metadata. */ @@ -188,9 +201,10 @@ public static Set splitCommaDelimited(String src) public int concurrent_reads = 32; public int concurrent_writes = 32; + public int concurrent_accord_operations = 32; public int concurrent_counter_writes = 32; public int concurrent_materialized_view_writes = 32; - public int available_processors = -1; + public OptionaldPositiveInt available_processors = new OptionaldPositiveInt(CASSANDRA_AVAILABLE_PROCESSORS.getInt(OptionaldPositiveInt.UNDEFINED_VALUE)); public int memtable_flush_writers = 0; @Replaces(oldName = "memtable_heap_space_in_mb", converter = Converters.MEBIBYTES_DATA_STORAGE_INT, deprecated = true) @@ -356,6 +370,10 @@ public MemtableOptions() // The number of executors to use for building secondary indexes public volatile int concurrent_index_builders = 2; + // at least 20% of disk must be unused to run incremental repair + // if you want to disable this feature (the recommendation is not to, but if you want to disable it for whatever reason) then set the ratio to 0.0 + public volatile double incremental_repair_disk_headroom_reject_ratio = 0.2; + /** * @deprecated retry support removed on CASSANDRA-10992 */ @@ -394,6 +412,7 @@ public static class SSTableConfig @Replaces(oldName = "commitlog_total_space_in_mb", converter = Converters.MEBIBYTES_DATA_STORAGE_INT, deprecated = true) public DataStorageSpec.IntMebibytesBound commitlog_total_space; public CommitLogSync commitlog_sync; + @Replaces(oldName = "commitlog_sync_group_window_in_ms", converter = Converters.MILLIS_DURATION_DOUBLE, deprecated = true) public DurationSpec.IntMillisecondsBound commitlog_sync_group_window = new DurationSpec.IntMillisecondsBound("0ms"); @Replaces(oldName = "commitlog_sync_period_in_ms", converter = Converters.MILLIS_DURATION_INT, deprecated = true) @@ -441,7 +460,7 @@ public static class SSTableConfig public String failure_detector = "FailureDetector"; public EncryptionOptions.ServerEncryptionOptions server_encryption_options = new EncryptionOptions.ServerEncryptionOptions(); - public EncryptionOptions client_encryption_options = new EncryptionOptions(); + public EncryptionOptions.ClientEncryptionOptions client_encryption_options = new EncryptionOptions.ClientEncryptionOptions(); public JMXServerOptions jmx_server_options; @@ -496,6 +515,8 @@ public static class SSTableConfig public DataStorageSpec.LongMebibytesBound paxos_cache_size = null; + public DataStorageSpec.LongMebibytesBound consensus_migration_cache_size = null; + @Replaces(oldName = "cache_load_timeout_seconds", converter = Converters.NEGATIVE_SECONDS_DURATION, deprecated = true) public DurationSpec.IntSecondsBound cache_load_timeout = new DurationSpec.IntSecondsBound("30s"); @@ -609,6 +630,10 @@ public static class SSTableConfig @Replaces(oldName = "enable_materialized_views", converter = Converters.IDENTITY, deprecated = true) public boolean materialized_views_enabled = false; + // When true, materialized views data in SSTable go through commit logs during internodes streaming, e.g. repair + // When false, it behaves the same as normal streaming. + public volatile boolean materialized_views_on_repair_enabled = true; + @Replaces(oldName = "enable_transient_replication", converter = Converters.IDENTITY, deprecated = true) public boolean transient_replication_enabled = false; @@ -976,6 +1001,7 @@ public static void setClientMode(boolean clientMode) public volatile boolean password_validator_reconfiguration_enabled = true; public volatile CustomGuardrailConfig password_validator = new CustomGuardrailConfig(); + public volatile AutoRepairConfig auto_repair = new AutoRepairConfig(); /** * The variants of paxos implementation and semantics supported by Cassandra. @@ -1159,6 +1185,7 @@ public enum PaxosOnLinearizabilityViolation public volatile boolean client_request_size_metrics_enabled = true; + public volatile int max_top_size_partition_count = 10; public volatile int max_top_tombstone_partition_count = 10; public volatile DataStorageSpec.LongBytesBound min_tracked_partition_size = new DataStorageSpec.LongBytesBound("1MiB"); @@ -1172,6 +1199,8 @@ public enum PaxosOnLinearizabilityViolation */ public ParameterizedClass default_compaction = null; + public final AccordSpec accord = new AccordSpec(); + public static Supplier getOverrideLoadConfig() { return overrideLoadConfig; @@ -1400,10 +1429,10 @@ public static void log(Config config) String value; try { - // Field.get() can throw NPE if the value of the field is null - value = field.get(config).toString(); + Object obj = field.get(config); + value = obj != null ? obj.toString() : "null"; } - catch (NullPointerException | IllegalAccessException npe) + catch (IllegalAccessException npe) { value = "null"; } @@ -1467,4 +1496,6 @@ public enum CQLStartTime // 3.x Cassandra Driver has its "read" timeout set to 12 seconds, default matches this. public DurationSpec.LongMillisecondsBound native_transport_timeout = new DurationSpec.LongMillisecondsBound("12s"); public boolean enforce_native_deadline_for_hints = false; + + public boolean paxos_repair_race_wait = true; } diff --git a/src/java/org/apache/cassandra/config/DatabaseDescriptor.java b/src/java/org/apache/cassandra/config/DatabaseDescriptor.java index f6fd1b52ff44..625fe2be100f 100644 --- a/src/java/org/apache/cassandra/config/DatabaseDescriptor.java +++ b/src/java/org/apache/cassandra/config/DatabaseDescriptor.java @@ -81,6 +81,7 @@ import org.apache.cassandra.config.Config.DiskAccessMode; import org.apache.cassandra.config.Config.PaxosOnLinearizabilityViolation; import org.apache.cassandra.config.Config.PaxosStatePurging; +import org.apache.cassandra.config.DurationSpec.IntMillisecondsBound; import org.apache.cassandra.db.ConsistencyLevel; import org.apache.cassandra.db.commitlog.AbstractCommitLogSegmentManager; import org.apache.cassandra.db.commitlog.CommitLog; @@ -105,20 +106,23 @@ import org.apache.cassandra.locator.EndpointSnitchInfo; import org.apache.cassandra.locator.IEndpointSnitch; import org.apache.cassandra.locator.InetAddressAndPort; -import org.apache.cassandra.locator.Locator; -import org.apache.cassandra.locator.LocationInfo; import org.apache.cassandra.locator.InitialLocationProvider; +import org.apache.cassandra.locator.LocationInfo; +import org.apache.cassandra.locator.Locator; import org.apache.cassandra.locator.NodeAddressConfig; +import org.apache.cassandra.locator.NodeProximity; import org.apache.cassandra.locator.ReconnectableSnitchHelper; import org.apache.cassandra.locator.SeedProvider; -import org.apache.cassandra.locator.NodeProximity; import org.apache.cassandra.locator.SnitchAdapter; +import org.apache.cassandra.repair.autorepair.AutoRepairConfig; import org.apache.cassandra.security.AbstractCryptoProvider; import org.apache.cassandra.security.EncryptionContext; import org.apache.cassandra.security.JREProvider; import org.apache.cassandra.security.SSLFactory; import org.apache.cassandra.service.CacheService.CacheType; import org.apache.cassandra.service.StorageService; +import org.apache.cassandra.service.accord.api.AccordWaitStrategies; +import org.apache.cassandra.service.consensus.TransactionalMode; import org.apache.cassandra.service.paxos.Paxos; import org.apache.cassandra.tcm.RegistrationStatus; import org.apache.cassandra.utils.FBUtilities; @@ -129,8 +133,8 @@ import static org.apache.cassandra.config.CassandraRelevantProperties.ALLOCATE_TOKENS_FOR_KEYSPACE; import static org.apache.cassandra.config.CassandraRelevantProperties.ALLOW_UNLIMITED_CONCURRENT_VALIDATIONS; import static org.apache.cassandra.config.CassandraRelevantProperties.AUTO_BOOTSTRAP; -import static org.apache.cassandra.config.CassandraRelevantProperties.CONFIG_LOADER; import static org.apache.cassandra.config.CassandraRelevantProperties.CHRONICLE_ANALYTICS_DISABLE; +import static org.apache.cassandra.config.CassandraRelevantProperties.CONFIG_LOADER; import static org.apache.cassandra.config.CassandraRelevantProperties.DISABLE_STCS_IN_L0; import static org.apache.cassandra.config.CassandraRelevantProperties.INITIAL_TOKEN; import static org.apache.cassandra.config.CassandraRelevantProperties.IO_NETTY_TRANSPORT_ESTIMATE_SIZE_ON_SUBMIT; @@ -154,7 +158,7 @@ import static org.apache.cassandra.config.DataRateSpec.DataRateUnit.BYTES_PER_SECOND; import static org.apache.cassandra.config.DataRateSpec.DataRateUnit.MEBIBYTES_PER_SECOND; import static org.apache.cassandra.config.DataStorageSpec.DataStorageUnit.MEBIBYTES; -import static org.apache.cassandra.config.EncryptionOptions.ClientAuth.REQUIRED; +import static org.apache.cassandra.config.EncryptionOptions.ClientEncryptionOptions.ClientAuth.REQUIRED; import static org.apache.cassandra.db.ConsistencyLevel.ALL; import static org.apache.cassandra.db.ConsistencyLevel.EACH_QUORUM; import static org.apache.cassandra.db.ConsistencyLevel.LOCAL_QUORUM; @@ -163,6 +167,7 @@ import static org.apache.cassandra.db.ConsistencyLevel.QUORUM; import static org.apache.cassandra.io.util.FileUtils.ONE_GIB; import static org.apache.cassandra.io.util.FileUtils.ONE_MIB; +import static org.apache.cassandra.journal.Params.FlushMode.PERIODIC; import static org.apache.cassandra.utils.Clock.Global.logInitializationOutcome; public class DatabaseDescriptor @@ -224,6 +229,9 @@ public class DatabaseDescriptor private static long keyCacheSizeInMiB; private static long paxosCacheSizeInMiB; + private static long accordCacheSizeInMiB; + private static long accordWorkingSetSizeInMiB; + private static long consensusMigrationCacheSizeInMiB; private static long counterCacheSizeInMiB; private static long indexSummaryCapacityInMiB; @@ -384,6 +392,14 @@ public static void clientInitialization(boolean failIfDaemonOrTool) clientInitialization(failIfDaemonOrTool, Config::new); } + // For simulator tests + public static void clientWithDaemonConfig() + { + clientInitialization(true, DatabaseDescriptor::loadConfig); + applyAll(); + AuthConfig.applyAuth(); + } + /** * Initializes this class as a client, which means that just an empty configuration will * be used. @@ -600,6 +616,16 @@ else if (conf.commitlog_sync_period.toMilliseconds() != 0) logger.debug("Syncing log with a period of {}", conf.commitlog_sync_period.toString()); } + if (conf.accord.journal.flushPeriod == null) + { + conf.accord.journal.flushPeriod = conf.commitlog_sync_period; + if (conf.accord.journal.flushMode == PERIODIC && conf.commitlog_sync_period.toMilliseconds() == 0) + { + logger.warn("Accord journal is configured in periodic mode, while Cassandra commit log is configured in {} mode", conf.commitlog_sync); + conf.accord.journal.flushPeriod = conf.accord.journal.periodicFlushLagBlock; + } + } + /* evaluate the DiskAccessMode Config directive, which also affects indexAccessMode selection */ if (conf.disk_access_mode == DiskAccessMode.auto || conf.disk_access_mode == DiskAccessMode.mmap_index_only) { @@ -641,6 +667,9 @@ else if (conf.disk_access_mode == DiskAccessMode.direct) if (conf.concurrent_counter_writes < 2) throw new ConfigurationException("concurrent_counter_writes must be at least 2, but was " + conf.concurrent_counter_writes, false); + if (conf.concurrent_accord_operations < 1) + throw new ConfigurationException("concurrent_accord_operations must be at least 1, but was " + conf.concurrent_accord_operations, false); + if (conf.networking_cache_size == null) conf.networking_cache_size = new DataStorageSpec.IntMebibytesBound(Math.min(128, (int) (Runtime.getRuntime().maxMemory() / (16 * 1048576)))); @@ -712,6 +741,11 @@ else if (conf.repair_session_space.toMebibytes() > (int) (Runtime.getRuntime().m if (commitLogWriteDiskAccessMode != conf.commitlog_disk_access_mode) logger.info("commitlog_disk_access_mode resolved to: {}", commitLogWriteDiskAccessMode); + if (conf.accord.journal_directory == null) + { + conf.accord.journal_directory = storagedirFor("accord_journal"); + } + if (conf.hints_directory == null) { conf.hints_directory = storagedirFor("hints"); @@ -787,6 +821,8 @@ else if (conf.repair_session_space.toMebibytes() > (int) (Runtime.getRuntime().m throw new ConfigurationException("local_system_data_file_directory must not be the same as any data_file_directories", false); if (datadir.equals(conf.commitlog_directory)) throw new ConfigurationException("commitlog_directory must not be the same as any data_file_directories", false); + if (datadir.equals(conf.accord.journal_directory)) + throw new ConfigurationException("accord.journal_directory must not be the same as any data_file_directories", false); if (datadir.equals(conf.hints_directory)) throw new ConfigurationException("hints_directory must not be the same as any data_file_directories", false); if (datadir.equals(conf.saved_caches_directory)) @@ -802,6 +838,8 @@ else if (conf.repair_session_space.toMebibytes() > (int) (Runtime.getRuntime().m { if (conf.local_system_data_file_directory.equals(conf.commitlog_directory)) throw new ConfigurationException("local_system_data_file_directory must not be the same as the commitlog_directory", false); + if (conf.local_system_data_file_directory.equals(conf.accord.journal_directory)) + throw new ConfigurationException("local_system_data_file_directory must not be the same as the accord.journal_directory", false); if (conf.local_system_data_file_directory.equals(conf.saved_caches_directory)) throw new ConfigurationException("local_system_data_file_directory must not be the same as the saved_caches_directory", false); if (conf.local_system_data_file_directory.equals(conf.hints_directory)) @@ -814,10 +852,18 @@ else if (conf.repair_session_space.toMebibytes() > (int) (Runtime.getRuntime().m FBUtilities.prettyPrintMemory(freeBytes)); } - if (conf.commitlog_directory.equals(conf.saved_caches_directory)) - throw new ConfigurationException("saved_caches_directory must not be the same as the commitlog_directory", false); + if (conf.commitlog_directory.equals(conf.accord.journal_directory)) + throw new ConfigurationException("accord.journal_directory must not be the same as the commitlog_directory", false); if (conf.commitlog_directory.equals(conf.hints_directory)) throw new ConfigurationException("hints_directory must not be the same as the commitlog_directory", false); + if (conf.commitlog_directory.equals(conf.saved_caches_directory)) + throw new ConfigurationException("saved_caches_directory must not be the same as the commitlog_directory", false); + + if (conf.accord.journal_directory.equals(conf.hints_directory)) + throw new ConfigurationException("hints_directory must not be the same as the accord.journal_directory", false); + if (conf.accord.journal_directory.equals(conf.saved_caches_directory)) + throw new ConfigurationException("saved_caches_directory must not be the same as the accord.journal_directory", false); + if (conf.hints_directory.equals(conf.saved_caches_directory)) throw new ConfigurationException("saved_caches_directory must not be the same as the hints_directory", false); @@ -931,6 +977,52 @@ else if (conf.repair_session_space.toMebibytes() > (int) (Runtime.getRuntime().m + conf.paxos_cache_size + "', supported values are >= 0.", false); } + try + { + // if accordCacheSizeInMiB option was set to "auto" then size of the cache should be "max(10% of Heap (in MB), 1MB) + accordCacheSizeInMiB = (conf.accord.cache_size == null) + ? Math.max(1, (int) ((Runtime.getRuntime().totalMemory() * 0.10) / 1024 / 1024)) + : conf.accord.cache_size.toMebibytes(); + + if (accordCacheSizeInMiB < 0) + throw new NumberFormatException(); // to escape duplicating error message + } + catch (NumberFormatException e) + { + throw new ConfigurationException("accord.cache_size option was set incorrectly to '" + + conf.accord.cache_size + "', supported values are >= 0.", false); + } + + try + { + // if accordWorkingSetSizeInMiB option was set to "auto" then size of the working set should be "max(5% of Heap (in MB), 1MB) + // if negative, there is no limit + accordWorkingSetSizeInMiB = (conf.accord.working_set_size == null) + ? Math.max(1, (int) ((Runtime.getRuntime().totalMemory() * 0.05) / 1024 / 1024)) + : conf.accord.working_set_size.toMebibytes(); + } + catch (NumberFormatException e) + { + throw new ConfigurationException("accord.working_set_size option was set incorrectly to '" + + conf.accord.working_set_size + "', supported values are >= 0.", false); + } + + try + { + // if consensusMigrationCacheSizeInMiB option was set to "auto" then size of the cache should be "min(1% of Heap (in MB), 50MB) + consensusMigrationCacheSizeInMiB = (conf.consensus_migration_cache_size == null) + ? Math.min(Math.max(1, (int) (Runtime.getRuntime().totalMemory() * 0.01 / 1024 / 1024)), 50) + : conf.consensus_migration_cache_size.toMebibytes(); + + if (consensusMigrationCacheSizeInMiB < 0) + throw new NumberFormatException(); // to escape duplicating error message + } + catch (NumberFormatException e) + { + throw new ConfigurationException("consensus_migration_cache_size option was set incorrectly to '" + + conf.consensus_migration_cache_size + "', supported values are >= 0.", false); + } + // we need this assignment for the Settings virtual table - CASSANDRA-17735 conf.counter_cache_size = new DataStorageSpec.LongMebibytesBound(counterCacheSizeInMiB); @@ -998,10 +1090,10 @@ else if (conf.commitlog_segment_size.toKibibytes() < 2 * conf.max_mutation_size. } else if (JMXServerOptions.isEnabledBySystemProperties()) { - throw new ConfigurationException("Configure either jmx_server_options in cassandra.yaml and comment out " + - "configure_jmx function call in cassandra-env.sh or keep cassandra-env.sh " + - "to call configure_jmx function but you have to keep jmx_server_options " + - "in cassandra.yaml commented out."); + throw new ConfigurationException("Configure either jmx_server_options in cassandra.yaml and comment out " + + "configure_jmx function call in cassandra-env.sh or keep cassandra-env.sh " + + "to call configure_jmx function but you have to keep jmx_server_options " + + "in cassandra.yaml commented out."); } conf.jmx_server_options.jmx_encryption_options.applyConfig(); @@ -1474,6 +1566,12 @@ static void checkForLowestAcceptedTimeouts(Config conf) logInfo("truncate_request_timeout", conf.truncate_request_timeout, LOWEST_ACCEPTED_TIMEOUT); conf.truncate_request_timeout = LOWEST_ACCEPTED_TIMEOUT; } + + if (conf.accord_preaccept_timeout.toMilliseconds() < LOWEST_ACCEPTED_TIMEOUT.toMilliseconds()) + { + logInfo("accord_preaccept_timeout", conf.accord_preaccept_timeout, LOWEST_ACCEPTED_TIMEOUT); + conf.accord_preaccept_timeout = LOWEST_ACCEPTED_TIMEOUT; + } } private static void logInfo(String property, DurationSpec.LongMillisecondsBound actualValue, DurationSpec.LongMillisecondsBound lowestAcceptedValue) @@ -2104,6 +2202,10 @@ public static void createAllDirectories() throw new ConfigurationException("commitlog_directory must be specified", false); FileUtils.createDirectory(conf.commitlog_directory); + if (conf.accord.journal_directory == null) + throw new ConfigurationException("accord.journal_directory must be specified", false); + FileUtils.createDirectory(conf.accord.journal_directory); + if (conf.hints_directory == null) throw new ConfigurationException("hints_directory must be specified", false); FileUtils.createDirectory(conf.hints_directory); @@ -2408,6 +2510,11 @@ public static long getWriteRpcTimeout(TimeUnit unit) return conf.write_request_timeout.to(unit); } + public static long getShortRpcTimeout(TimeUnit unit) + { + return conf.short_rpc_timeout.to(unit); + } + public static void setWriteRpcTimeout(long timeOutInMillis) { conf.write_request_timeout = new DurationSpec.LongMillisecondsBound(timeOutInMillis); @@ -2646,6 +2753,20 @@ public static void setConcurrentViewWriters(int concurrent_materialized_view_wri conf.concurrent_materialized_view_writes = concurrent_materialized_view_writes; } + public static int getAccordConcurrentOps() + { + return conf.concurrent_accord_operations; + } + + public static void setConcurrentAccordOps(int concurrent_operations) + { + if (concurrent_operations < 0) + { + throw new IllegalArgumentException("Concurrent accord operations must be non-negative"); + } + conf.concurrent_accord_operations = concurrent_operations; + } + public static int getFlushWriters() { return conf.memtable_flush_writers; @@ -2653,7 +2774,13 @@ public static int getFlushWriters() public static int getAvailableProcessors() { - return conf == null ? -1 : conf.available_processors; + OptionaldPositiveInt ap = conf == null ? OptionaldPositiveInt.UNDEFINED : conf.available_processors; + return ap.or(Runtime.getRuntime()::availableProcessors); + } + + public static void setAvailableProcessors(int value) + { + conf.available_processors = new OptionaldPositiveInt(value); } public static int getConcurrentCompactors() @@ -2991,6 +3118,16 @@ public static void setCommitLogCompression(ParameterizedClass compressor) conf.commitlog_compression = compressor; } + public static String getAccordJournalDirectory() + { + return conf.accord.journal_directory; + } + + public static void setAccordJournalDirectory(String path) + { + conf.accord.journal_directory = path; + } + public static Config.FlushCompression getFlushCompression() { return conf.flush_compression; @@ -3546,6 +3683,11 @@ public static boolean paxoTopologyRepairStrictEachQuorum() return conf.paxos_topology_repair_strict_each_quorum; } + public static TransactionalMode defaultTransactionalMode() + { + return conf.accord.default_transactional_mode; + } + public static void setNativeTransportMaxRequestDataInFlightPerIpInBytes(long maxRequestDataInFlightInBytes) { if (maxRequestDataInFlightInBytes == -1) @@ -3833,7 +3975,7 @@ public static void setInternodeMessagingEncyptionOptions(EncryptionOptions.Serve conf.server_encryption_options = encryptionOptions; } - public static EncryptionOptions getNativeProtocolEncryptionOptions() + public static EncryptionOptions.ClientEncryptionOptions getNativeProtocolEncryptionOptions() { return conf.client_encryption_options; } @@ -3844,7 +3986,7 @@ public static JMXServerOptions getJmxServerOptions() } @VisibleForTesting - public static void updateNativeProtocolEncryptionOptions(Function update) + public static void updateNativeProtocolEncryptionOptions(Function update) { conf.client_encryption_options = update.apply(conf.client_encryption_options); } @@ -3895,6 +4037,11 @@ public static int getHintsFlushPeriodInMS() return conf.hints_flush_period.toMilliseconds(); } + public static void setHintsFlushPeriodInMS(int milliseconds) + { + conf.hints_flush_period = new IntMillisecondsBound(milliseconds); + } + public static long getMaxHintsFileSize() { return conf.max_hints_file_size.toBytesInLong(); @@ -4133,6 +4280,21 @@ public static long getPaxosCacheSizeInMiB() return paxosCacheSizeInMiB; } + public static long getAccordCacheSizeInMiB() + { + return accordCacheSizeInMiB; + } + + public static long getAccordWorkingSetSizeInMiB() + { + return accordWorkingSetSizeInMiB; + } + + public static long getConsensusMigrationCacheSizeInMiB() + { + return consensusMigrationCacheSizeInMiB; + } + public static long getCounterCacheSizeInMiB() { return counterCacheSizeInMiB; @@ -4405,6 +4567,16 @@ public static void setMaterializedViewsEnabled(boolean enableMaterializedViews) conf.materialized_views_enabled = enableMaterializedViews; } + public static boolean isMaterializedViewsOnRepairEnabled() + { + return conf.materialized_views_on_repair_enabled; + } + + public static void setMaterializedViewsOnRepairEnabled(boolean val) + { + conf.materialized_views_on_repair_enabled = val; + } + public static boolean getSASIIndexesEnabled() { return conf.sasi_indexes_enabled; @@ -5145,6 +5317,145 @@ public static void setUseStatementsEnabled(boolean enabled) } } + + public static AccordSpec getAccord() + { + return conf.accord; + } + + public static AccordSpec.TransactionalRangeMigration getTransactionalRangeMigration() + { + return conf.accord.range_migration; + } + + public static void setTransactionalRangeMigration(AccordSpec.TransactionalRangeMigration val) + { + conf.accord.range_migration = Preconditions.checkNotNull(val); + } + + public static long getAccordRangeSyncPointTimeoutNanos() + { + return conf.accord.range_syncpoint_timeout.to(TimeUnit.NANOSECONDS); + } + + public static long getAccordRepairTimeoutNanos() + { + return conf.accord.repair_timeout.to(TimeUnit.NANOSECONDS); + } + + public static boolean getAccordTransactionsEnabled() + { + return conf == null ? false : conf.accord.enabled; + } + + public static void setAccordTransactionsEnabled(boolean b) + { + conf.accord.enabled = b; + } + + public static AccordSpec.QueueShardModel getAccordQueueShardModel() + { + return conf.accord.queue_shard_model; + } + + public static AccordSpec.QueueSubmissionModel getAccordQueueSubmissionModel() + { + return conf.accord.queue_submission_model; + } + + public static int getAccordQueueShardCount() + { + switch (getAccordQueueShardModel()) + { + default: throw new AssertionError("Unhandled queue_shard_model: " + conf.accord.queue_shard_model); + case THREAD_PER_SHARD: + case THREAD_PER_SHARD_SYNC_QUEUE: + return conf.accord.queue_shard_count.or(DatabaseDescriptor::getAvailableProcessors); + case THREAD_POOL_PER_SHARD: + case THREAD_POOL_PER_SHARD_EXCLUDES_IO: + int defaultMax = getAccordQueueSubmissionModel() == AccordSpec.QueueSubmissionModel.SYNC ? 8 : 4; + return conf.accord.queue_shard_count.or(Math.min(defaultMax, DatabaseDescriptor.getAvailableProcessors())); + } + } + + public static int getAccordCommandStoreShardCount() + { + return conf.accord.command_store_shard_count.or(DatabaseDescriptor::getAvailableProcessors); + } + + public static int getAccordMaxQueuedLoadCount() + { + return conf.accord.max_queued_loads.or(getAccordConcurrentOps()); + } + + public static int getAccordMaxQueuedRangeLoadCount() + { + return conf.accord.max_queued_range_loads.or(Math.max(4, getAccordConcurrentOps() / 4)); + } + + public static boolean getAccordCacheShrinkingOn() + { + return conf.accord.shrink_cache_entries_before_eviction; + } + + public static String getAccordRecoverTxnDelay() + { + return conf.accord.recover_txn; + } + + public static void setAccordRecoverTxnDelay(String recoverTxnDelay) + { + AccordWaitStrategies.setRecoverTxn(recoverTxnDelay); + conf.accord.recover_txn = recoverTxnDelay; + } + + public static String getAccordExpireTxnDelay() + { + return conf.accord.expire_txn; + } + + public static void setAccordExpireTxnDelay(String expireTxnDelay) + { + AccordWaitStrategies.setExpireTxn(expireTxnDelay); + conf.accord.expire_txn = expireTxnDelay; + } + + public static long getAccordFastPathUpdateDelayMillis() + { + DurationSpec.IntSecondsBound bound = conf.accord.fast_path_update_delay; + return bound == null ? -1 : bound.to(TimeUnit.MILLISECONDS); + } + + public static long getAccordGCDelay(TimeUnit unit) + { + return conf.accord.gc_delay.to(unit); + } + + public static int getAccordShardDurabilityTargetSplits() + { + return conf.accord.shard_durability_target_splits; + } + + public static long getAccordScheduleDurabilityTxnIdLag(TimeUnit unit) + { + return conf.accord.durability_txnid_lag.to(unit); + } + + public static long getAccordGlobalDurabilityCycle(TimeUnit unit) + { + return conf.accord.global_durability_cycle.to(unit); + } + + public static long getAccordShardDurabilityCycle(TimeUnit unit) + { + return conf.accord.shard_durability_cycle.to(unit); + } + + public static boolean getAccordStateCacheListenerJFREnabled() + { + return conf.accord.state_cache_listener_jfr_enabled; + } + public static boolean getForceNewPreparedStatementBehaviour() { return conf.force_new_prepared_statement_behaviour; @@ -5400,6 +5711,15 @@ public static void resetSSTableFormats(Iterable factories return Objects.requireNonNull(selectedSSTableFormat, "Forgot to initialize DatabaseDescriptor?"); } + @VisibleForTesting + public static void setSelectedSSTableFormat(String name) + { + SSTableFormat format = getSSTableFormats().get(name); + if (format == null) + throw new IllegalArgumentException("Unknown sstable format: " + name); + setSelectedSSTableFormat(format); + } + @VisibleForTesting public static void setSelectedSSTableFormat(SSTableFormat format) { @@ -5467,11 +5787,21 @@ public static void setCmsDefaultRetryMaxTries(int value) conf.cms_default_max_retries = value; } - public static DurationSpec getDefaultRetryBackoff() + public static DurationSpec.IntMillisecondsBound getDefaultRetryBackoff() { return conf.cms_default_retry_backoff; } + public static DurationSpec.IntMillisecondsBound getDefaultMaxRetryBackoff() + { + return conf.cms_default_max_retry_backoff; + } + + public static String getCMSRetryDelay() + { + return conf.cms_retry_delay; + } + public static DurationSpec getCmsAwaitTimeout() { return conf.cms_await_timeout; @@ -5574,4 +5904,45 @@ public static void setPurgeableTobmstonesMetricGranularity(Config.TombstonesMetr { conf.tombstone_read_purgeable_metric_granularity = granularity; } + + public static boolean getPaxosRepairRaceWait() + { + return conf.paxos_repair_race_wait; + } + + @VisibleForTesting + public static void setPaxosRepairRaceWait(boolean paxosRepairRaceWait) + { + conf.paxos_repair_race_wait = paxosRepairRaceWait; + } + + public static boolean getAccordEphemeralReadEnabledEnabled() + { + return conf.accord.ephemeralReadEnabled; + } + + public static AutoRepairConfig getAutoRepairConfig() + { + return conf.auto_repair; + } + + public static double getIncrementalRepairDiskHeadroomRejectRatio() + { + return conf.incremental_repair_disk_headroom_reject_ratio; + } + + public static void setIncrementalRepairDiskHeadroomRejectRatio(double value) + { + if (value < 0.0 || value > 1.0) + { + throw new IllegalArgumentException("Value must be >= 0 and <= 1 for incremental_repair_disk_headroom_reject_ratio"); + } + conf.incremental_repair_disk_headroom_reject_ratio = value; + } + + @VisibleForTesting + public static void setPartitioner(String name) + { + partitioner = FBUtilities.newPartitioner(name); + } } diff --git a/src/java/org/apache/cassandra/config/DurationSpec.java b/src/java/org/apache/cassandra/config/DurationSpec.java index bf0fc21334ad..01a7d70b5f46 100644 --- a/src/java/org/apache/cassandra/config/DurationSpec.java +++ b/src/java/org/apache/cassandra/config/DurationSpec.java @@ -17,6 +17,7 @@ */ package org.apache.cassandra.config; +import java.time.Duration; import java.util.Arrays; import java.util.Objects; import java.util.concurrent.TimeUnit; @@ -137,6 +138,11 @@ public TimeUnit unit() return unit; } + public Duration toDuration() + { + return Duration.of(quantity(), unit().toChronoUnit()); + } + /** * @param symbol the time unit symbol * @return the time unit associated to the specified symbol diff --git a/src/java/org/apache/cassandra/config/EncryptionOptions.java b/src/java/org/apache/cassandra/config/EncryptionOptions.java index 07f78b9a5eea..5f94c14bcbb7 100644 --- a/src/java/org/apache/cassandra/config/EncryptionOptions.java +++ b/src/java/org/apache/cassandra/config/EncryptionOptions.java @@ -17,7 +17,6 @@ */ package org.apache.cassandra.config; -import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; import java.util.List; @@ -46,10 +45,8 @@ * Examples of such options are: supported cipher-suites, ssl protocol with version, accepted protocols, end-point * verification, require client-auth/cert etc. */ -public class EncryptionOptions +public abstract class EncryptionOptions> { - Logger logger = LoggerFactory.getLogger(EncryptionOptions.class); - public enum TlsEncryptionPolicy { UNENCRYPTED("unencrypted"), @@ -69,39 +66,50 @@ public String description() } } - public enum ClientAuth + public enum ConfigKey { - REQUIRED("true"), - NOT_REQUIRED("false"), - OPTIONAL("optional"); - private final String value; - private static final Map VALUES = new HashMap<>(); - static + KEYSTORE("keystore"), + KEYSTORE_PASSWORD("keystore_password"), + KEYSTORE_PASSWORD_FILE("keystore_password_file"), + OUTBOUND_KEYSTORE("outbound_keystore"), + OUTBOUND_KEYSTORE_PASSWORD("outbound_keystore_password"), + OUTBOUND_KEYSTORE_PASSWORD_FILE("outbound_keystore_password_file"), + TRUSTSTORE("truststore"), + TRUSTSTORE_PASSWORD("truststore_password"), + TRUSTSTORE_PASSWORD_FILE("truststore_password_file"), + CIPHER_SUITES("cipher_suites"), + PROTOCOL("protocol"), + ACCEPTED_PROTOCOLS("accepted_protocols"), + ALGORITHM("algorithm"), + STORE_TYPE("store_type"), + REQUIRE_CLIENT_AUTH("require_client_auth"), + REQUIRE_ENDPOINT_VERIFICATION("require_endpoint_verification"), + ENABLED("enabled"), + OPTIONAL("optional"), + MAX_CERTIFICATE_VALIDITY_PERIOD("max_certificate_validity_period"), + CERTIFICATE_VALIDITY_WARN_THRESHOLD("certificate_validity_warn_threshold"); + + final String keyName; + + ConfigKey(String keyName) { - for (ClientAuth clientAuth : ClientAuth.values()) - { - VALUES.put(clientAuth.value, clientAuth); - VALUES.put(toLowerCaseLocalized(clientAuth.name()), clientAuth); - } + this.keyName = keyName; } - ClientAuth(String value) + public String toString() { - this.value = value; + return keyName; } - public static ClientAuth from(String value) + static Set asSet() { - if (VALUES.containsKey(toLowerCaseLocalized(value))) + Set valueSet = new HashSet<>(); + ConfigKey[] values = values(); + for (ConfigKey key : values) { - return VALUES.get(toLowerCaseLocalized(value)); + valueSet.add(toLowerCaseLocalized(key.toString())); } - throw new ConfigurationException(value + " is not a valid ClientAuth option"); - } - - public String value() - { - return value; + return valueSet; } } @@ -153,53 +161,6 @@ public String value() */ public transient ISslContextFactory sslContextFactoryInstance; - public enum ConfigKey - { - KEYSTORE("keystore"), - KEYSTORE_PASSWORD("keystore_password"), - KEYSTORE_PASSWORD_FILE("keystore_password_file"), - OUTBOUND_KEYSTORE("outbound_keystore"), - OUTBOUND_KEYSTORE_PASSWORD("outbound_keystore_password"), - OUTBOUND_KEYSTORE_PASSWORD_FILE("outbound_keystore_password_file"), - TRUSTSTORE("truststore"), - TRUSTSTORE_PASSWORD("truststore_password"), - TRUSTSTORE_PASSWORD_FILE("truststore_password_file"), - CIPHER_SUITES("cipher_suites"), - PROTOCOL("protocol"), - ACCEPTED_PROTOCOLS("accepted_protocols"), - ALGORITHM("algorithm"), - STORE_TYPE("store_type"), - REQUIRE_CLIENT_AUTH("require_client_auth"), - REQUIRE_ENDPOINT_VERIFICATION("require_endpoint_verification"), - ENABLED("enabled"), - OPTIONAL("optional"), - MAX_CERTIFICATE_VALIDITY_PERIOD("max_certificate_validity_period"), - CERTIFICATE_VALIDITY_WARN_THRESHOLD("certificate_validity_warn_threshold"); - - final String keyName; - - ConfigKey(String keyName) - { - this.keyName = keyName; - } - - public String toString() - { - return keyName; - } - - static Set asSet() - { - Set valueSet = new HashSet<>(); - ConfigKey[] values = values(); - for (ConfigKey key : values) - { - valueSet.add(toLowerCaseLocalized(key.toString())); - } - return valueSet; - } - } - public EncryptionOptions() { ssl_context_factory = new ParameterizedClass("org.apache.cassandra.security.DefaultSslContextFactory", @@ -252,7 +213,7 @@ public EncryptionOptions(ParameterizedClass ssl_context_factory, this.certificate_validity_warn_threshold = certificate_validity_warn_threshold; } - public EncryptionOptions(EncryptionOptions options) + public EncryptionOptions(EncryptionOptions options) { ssl_context_factory = options.ssl_context_factory; keystore = options.keystore; @@ -280,7 +241,7 @@ public EncryptionOptions(EncryptionOptions options) * * It also initializes the ISslContextFactory's instance */ - public EncryptionOptions applyConfig() + public T applyConfig() { ensureConfigNotApplied(); @@ -304,7 +265,7 @@ else if (sslContextFactoryInstance.hasKeystore()) // Otherwise if there's no keystore, not possible to establish an optional secure connection isOptional = false; } - return this; + return (T) this; } /** @@ -312,22 +273,22 @@ else if (sslContextFactoryInstance.hasKeystore()) * as the constructor for its implementation. * * @throws IllegalArgumentException in case any pre-defined key, as per {@link ConfigKey}, for the encryption - * options is duplicated in the parameterized keys. + * options is duplicated in the parameterized keys. */ - private void prepareSslContextFactoryParameterizedKeys(Map sslContextFactoryParameters) + private void prepareSslContextFactoryParameterizedKeys(Map sslContextFactoryParameters) { if (ssl_context_factory.parameters != null) { Set configKeys = ConfigKey.asSet(); for (Map.Entry entry : ssl_context_factory.parameters.entrySet()) { - if(configKeys.contains(toLowerCaseLocalized(entry.getKey()))) + if (configKeys.contains(toLowerCaseLocalized(entry.getKey()))) { - throw new IllegalArgumentException("SslContextFactory "+ssl_context_factory.class_name+" should " + - "configure '"+entry.getKey()+"' as encryption_options instead of" + + throw new IllegalArgumentException("SslContextFactory " + ssl_context_factory.class_name + " should " + + "configure '" + entry.getKey() + "' as encryption_options instead of" + " parameterized keys"); } - sslContextFactoryParameters.put(entry.getKey(),entry.getValue()); + sslContextFactoryParameters.put(entry.getKey(), entry.getValue()); } } } @@ -375,7 +336,8 @@ private void initializeSslContextFactory() protected static void putSslContextFactoryParameter(Map existingParameters, ConfigKey configKey, Object value) { - if (value != null) { + if (value != null) + { existingParameters.put(configKey.toString(), value); } } @@ -480,9 +442,9 @@ public List getAcceptedProtocols() return sslContextFactoryInstance == null ? null : sslContextFactoryInstance.getAcceptedProtocols(); } - public ClientAuth getClientAuth() + public ClientEncryptionOptions.ClientAuth getClientAuth() { - return this.require_client_auth == null ? ClientAuth.NOT_REQUIRED : ClientAuth.from(this.require_client_auth); + return this.require_client_auth == null ? ClientEncryptionOptions.ClientAuth.NOT_REQUIRED : ClientEncryptionOptions.ClientAuth.from(this.require_client_auth); } public String[] acceptedProtocolsArray() @@ -517,162 +479,6 @@ else if (getEnabled()) } } - public EncryptionOptions withSslContextFactory(ParameterizedClass sslContextFactoryClass) - { - return new EncryptionOptions(sslContextFactoryClass, keystore, keystore_password, keystore_password_file, truststore, - truststore_password, truststore_password_file, cipher_suites, protocol, accepted_protocols, algorithm, - store_type, require_client_auth, require_endpoint_verification, enabled, - optional, max_certificate_validity_period, max_certificate_validity_period).applyConfig(); - } - - public EncryptionOptions withKeyStore(String keystore) - { - return new EncryptionOptions(ssl_context_factory, keystore, keystore_password, keystore_password_file, truststore, - truststore_password, truststore_password_file, cipher_suites, protocol, accepted_protocols, algorithm, - store_type, require_client_auth, require_endpoint_verification, enabled, - optional, max_certificate_validity_period, max_certificate_validity_period).applyConfig(); - } - - public EncryptionOptions withKeyStorePassword(String keystore_password) - { - return new EncryptionOptions(ssl_context_factory, keystore, keystore_password, keystore_password_file, truststore, - truststore_password, truststore_password_file, cipher_suites, protocol, accepted_protocols, algorithm, - store_type, require_client_auth, require_endpoint_verification, enabled, - optional, max_certificate_validity_period, max_certificate_validity_period).applyConfig(); - } - - public EncryptionOptions withKeyStorePasswordFile(String keystore_password_file) - { - return new EncryptionOptions(ssl_context_factory, keystore, keystore_password, keystore_password_file, truststore, - truststore_password, truststore_password_file, cipher_suites, protocol, accepted_protocols, algorithm, - store_type, require_client_auth, require_endpoint_verification, enabled, - optional, max_certificate_validity_period, max_certificate_validity_period).applyConfig(); - } - - public EncryptionOptions withTrustStore(String truststore) - { - return new EncryptionOptions(ssl_context_factory, keystore, keystore_password, keystore_password_file, truststore, - truststore_password, truststore_password_file, cipher_suites, protocol, accepted_protocols, algorithm, - store_type, require_client_auth, require_endpoint_verification, enabled, - optional, max_certificate_validity_period, max_certificate_validity_period).applyConfig(); - } - - public EncryptionOptions withTrustStorePassword(String truststore_password) - { - return new EncryptionOptions(ssl_context_factory, keystore, keystore_password, keystore_password_file, truststore, - truststore_password, truststore_password_file, cipher_suites, protocol, accepted_protocols, algorithm, - store_type, require_client_auth, require_endpoint_verification, enabled, - optional, max_certificate_validity_period, max_certificate_validity_period).applyConfig(); - } - - public EncryptionOptions withTrustStorePasswordFile(String truststore_password_file) - { - return new EncryptionOptions(ssl_context_factory, keystore, keystore_password, keystore_password_file, truststore, - truststore_password, truststore_password_file, cipher_suites, protocol, accepted_protocols, algorithm, - store_type, require_client_auth, require_endpoint_verification, enabled, - optional, max_certificate_validity_period, max_certificate_validity_period).applyConfig(); - } - - public EncryptionOptions withCipherSuites(List cipher_suites) - { - return new EncryptionOptions(ssl_context_factory, keystore, keystore_password, keystore_password_file, truststore, - truststore_password, truststore_password_file, cipher_suites, protocol, accepted_protocols, algorithm, - store_type, require_client_auth, require_endpoint_verification, enabled, - optional, max_certificate_validity_period, max_certificate_validity_period).applyConfig(); - } - - public EncryptionOptions withCipherSuites(String... cipher_suites) - { - return new EncryptionOptions(ssl_context_factory, keystore, keystore_password, keystore_password_file, truststore, - truststore_password, truststore_password_file, ImmutableList.copyOf(cipher_suites), protocol, - accepted_protocols, algorithm, store_type, require_client_auth, - require_endpoint_verification, enabled, optional, max_certificate_validity_period, - max_certificate_validity_period).applyConfig(); - } - - public EncryptionOptions withProtocol(String protocol) - { - return new EncryptionOptions(ssl_context_factory, keystore, keystore_password, keystore_password_file, truststore, - truststore_password, truststore_password_file, cipher_suites, protocol, accepted_protocols, algorithm, - store_type, require_client_auth, require_endpoint_verification, enabled, - optional, max_certificate_validity_period, max_certificate_validity_period).applyConfig(); - } - - - public EncryptionOptions withAcceptedProtocols(List accepted_protocols) - { - return new EncryptionOptions(ssl_context_factory, keystore, keystore_password, keystore_password_file, truststore, - truststore_password, truststore_password_file, cipher_suites, protocol, accepted_protocols == null ? null : - ImmutableList.copyOf(accepted_protocols), - algorithm, store_type, require_client_auth, require_endpoint_verification, - enabled, optional, max_certificate_validity_period, max_certificate_validity_period).applyConfig(); - } - - - public EncryptionOptions withAlgorithm(String algorithm) - { - return new EncryptionOptions(ssl_context_factory, keystore, keystore_password, keystore_password_file, truststore, - truststore_password, truststore_password_file, cipher_suites, protocol, accepted_protocols, algorithm, - store_type, require_client_auth, require_endpoint_verification, enabled, - optional, max_certificate_validity_period, max_certificate_validity_period).applyConfig(); - } - - public EncryptionOptions withStoreType(String store_type) - { - return new EncryptionOptions(ssl_context_factory, keystore, keystore_password, keystore_password_file, truststore, - truststore_password, truststore_password_file, cipher_suites, protocol, accepted_protocols, algorithm, - store_type, require_client_auth, require_endpoint_verification, enabled, - optional, max_certificate_validity_period, max_certificate_validity_period).applyConfig(); - } - - public EncryptionOptions withRequireClientAuth(ClientAuth require_client_auth) - { - return new EncryptionOptions(ssl_context_factory, keystore, keystore_password, keystore_password_file, truststore, - truststore_password, truststore_password_file, cipher_suites, protocol, accepted_protocols, algorithm, - store_type, require_client_auth.value, require_endpoint_verification, enabled, - optional, max_certificate_validity_period, max_certificate_validity_period).applyConfig(); - } - - public EncryptionOptions withRequireEndpointVerification(boolean require_endpoint_verification) - { - return new EncryptionOptions(ssl_context_factory, keystore, keystore_password, keystore_password_file, truststore, - truststore_password, truststore_password_file, cipher_suites, protocol, accepted_protocols, algorithm, - store_type, require_client_auth, require_endpoint_verification, enabled, - optional, max_certificate_validity_period, max_certificate_validity_period).applyConfig(); - } - - public EncryptionOptions withEnabled(boolean enabled) - { - return new EncryptionOptions(ssl_context_factory, keystore, keystore_password, keystore_password_file, truststore, - truststore_password, truststore_password_file, cipher_suites, protocol, accepted_protocols, algorithm, - store_type, require_client_auth, require_endpoint_verification, enabled, - optional, max_certificate_validity_period, max_certificate_validity_period).applyConfig(); - } - - public EncryptionOptions withOptional(Boolean optional) - { - return new EncryptionOptions(ssl_context_factory, keystore, keystore_password, keystore_password_file, truststore, - truststore_password, truststore_password_file, cipher_suites, protocol, accepted_protocols, algorithm, - store_type, require_client_auth, require_endpoint_verification, enabled, - optional, max_certificate_validity_period, max_certificate_validity_period).applyConfig(); - } - - public EncryptionOptions withMaxCertificateValidityPeriod(DurationSpec.IntMinutesBound maxCertificateValidityPeriod) - { - return new EncryptionOptions(ssl_context_factory, keystore, keystore_password, keystore_password_file, truststore, - truststore_password, truststore_password_file, cipher_suites, protocol, accepted_protocols, algorithm, - store_type, require_client_auth, require_endpoint_verification, enabled, - optional, maxCertificateValidityPeriod, certificate_validity_warn_threshold).applyConfig(); - } - - public EncryptionOptions withCertificateValidityWarnThreshold(DurationSpec.IntMinutesBound certificateValidityWarnThreshold) - { - return new EncryptionOptions(ssl_context_factory, keystore, keystore_password, keystore_password_file, truststore, - truststore_password, truststore_password_file, cipher_suites, protocol, accepted_protocols, algorithm, - store_type, require_client_auth, require_endpoint_verification, enabled, - optional, max_certificate_validity_period, certificateValidityWarnThreshold).applyConfig(); - } - /** * The method is being mainly used to cache SslContexts therefore, we only consider * fields that would make a difference when the TrustStore or KeyStore files are updated @@ -685,7 +491,7 @@ public boolean equals(Object o) if (o == null || getClass() != o.getClass()) return false; - EncryptionOptions opt = (EncryptionOptions)o; + EncryptionOptions opt = (EncryptionOptions) o; return enabled == opt.enabled && optional == opt.optional && require_client_auth.equals(opt.require_client_auth) && @@ -731,8 +537,258 @@ public int hashCode() return result; } - public static class ServerEncryptionOptions extends EncryptionOptions + public static abstract class Builder> { + ParameterizedClass ssl_context_factory; + String keystore; + String keystore_password; + String keystore_password_file; + String truststore; + String truststore_password; + String truststore_password_file; + List cipher_suites; + String protocol; + List accepted_protocols; + String algorithm; + String store_type; + String require_client_auth; + boolean require_endpoint_verification; + DurationSpec.IntMinutesBound max_certificate_validity_period; + DurationSpec.IntMinutesBound certificate_validity_warn_threshold; + Boolean enabled; + Boolean optional; + Boolean isEnabled; + Boolean isOptional; + + public Builder(EncryptionOptions options) + { + ssl_context_factory = options.ssl_context_factory; + keystore = options.keystore; + keystore_password = options.keystore_password; + keystore_password_file = options.keystore_password_file; + truststore = options.truststore; + truststore_password = options.truststore_password; + truststore_password_file = options.truststore_password_file; + cipher_suites = options.cipher_suites; + protocol = options.protocol; + accepted_protocols = options.accepted_protocols; + algorithm = options.algorithm; + store_type = options.store_type; + require_client_auth = options.require_client_auth; + require_endpoint_verification = options.require_endpoint_verification; + enabled = options.enabled; + optional = options.optional; + max_certificate_validity_period = options.max_certificate_validity_period; + certificate_validity_warn_threshold = options.certificate_validity_warn_threshold; + } + + public Builder withSslContextFactory(ParameterizedClass sslContextFactoryClass) + { + this.ssl_context_factory = sslContextFactoryClass; + return this; + } + + public Builder withKeyStore(String keystore) + { + this.keystore = keystore; + return this; + } + + public Builder withKeyStorePassword(String keystore_password) + { + this.keystore_password = keystore_password; + return this; + } + + public Builder withKeyStorePasswordFile(String keystore_password_file) + { + this.keystore_password_file = keystore_password_file; + return this; + } + + public Builder withTrustStore(String truststore) + { + this.truststore = truststore; + return this; + } + + public Builder withTrustStorePassword(String truststore_password) + { + this.truststore_password = truststore_password; + return this; + } + + public Builder withTrustStorePasswordFile(String truststore_password_file) + { + this.truststore_password_file = truststore_password_file; + return this; + } + + public Builder withCipherSuites(List cipher_suites) + { + this.cipher_suites = cipher_suites; + return this; + } + + public Builder withCipherSuites(String... cipher_suites) + { + this.cipher_suites = ImmutableList.copyOf(cipher_suites); + return this; + } + + public Builder withProtocol(String protocol) + { + this.protocol = protocol; + return this; + } + + public Builder withAcceptedProtocols(List accepted_protocols) + { + this.accepted_protocols = accepted_protocols == null ? null : + ImmutableList.copyOf(accepted_protocols); + return this; + } + + public Builder withAlgorithm(String algorithm) + { + this.algorithm = algorithm; + return this; + } + + public Builder withStoreType(String store_type) + { + this.store_type = store_type; + return this; + } + + public Builder withRequireClientAuth(ClientEncryptionOptions.ClientAuth require_client_auth) + { + this.require_client_auth = require_client_auth.value; + return this; + } + + public Builder withRequireEndpointVerification(boolean require_endpoint_verification) + { + this.require_endpoint_verification = require_endpoint_verification; + return this; + } + + public Builder withEnabled(boolean enabled) + { + this.enabled = enabled; + return this; + } + + public Builder withOptional(Boolean optional) + { + this.optional = optional; + return this; + } + + public Builder withMaxCertificateValidityPeriod(DurationSpec.IntMinutesBound maxCertificateValidityPeriod) + { + this.max_certificate_validity_period = maxCertificateValidityPeriod; + return this; + } + + public Builder withCertificateValidityWarnThreshold(DurationSpec.IntMinutesBound certificateValidityWarnThreshold) + { + this.certificate_validity_warn_threshold = certificateValidityWarnThreshold; + return this; + } + + public abstract T build(); + } + + public static class ClientEncryptionOptions extends EncryptionOptions + { + public ClientEncryptionOptions() + { + } + + public ClientEncryptionOptions(ParameterizedClass ssl_context_factory, + String keystore, String keystore_password, String keystore_password_file, + String truststore, String truststore_password, String truststore_password_file, + List cipher_suites, String protocol, List accepted_protocols, + String algorithm, String store_type, String require_client_auth, + boolean require_endpoint_verification, Boolean enabled, Boolean optional, + DurationSpec.IntMinutesBound max_certificate_validity_period, + DurationSpec.IntMinutesBound certificate_validity_warn_threshold) + { + super(ssl_context_factory, keystore, keystore_password, keystore_password_file, truststore, truststore_password, + truststore_password_file, cipher_suites, protocol, accepted_protocols, algorithm, store_type, require_client_auth, + require_endpoint_verification, enabled, optional, max_certificate_validity_period, certificate_validity_warn_threshold); + } + + public ClientEncryptionOptions(ClientEncryptionOptions options) + { + super(options); + } + + public enum ClientAuth + { + REQUIRED("true"), + NOT_REQUIRED("false"), + OPTIONAL("optional"); + private final String value; + private static final Map VALUES = new HashMap<>(); + + static + { + for (ClientAuth clientAuth : ClientAuth.values()) + { + VALUES.put(clientAuth.value, clientAuth); + VALUES.put(toLowerCaseLocalized(clientAuth.name()), clientAuth); + } + } + + ClientAuth(String value) + { + this.value = value; + } + + public static ClientAuth from(String value) + { + if (VALUES.containsKey(toLowerCaseLocalized(value))) + { + return VALUES.get(toLowerCaseLocalized(value)); + } + throw new ConfigurationException(value + " is not a valid ClientAuth option"); + } + + public String value() + { + return value; + } + } + + public static class Builder extends EncryptionOptions.Builder + { + public Builder() + { + this(new ClientEncryptionOptions()); + } + + public Builder(ClientEncryptionOptions options) + { + super(options); + } + + @Override + public ClientEncryptionOptions build() + { + return new ClientEncryptionOptions(ssl_context_factory, keystore, keystore_password, keystore_password_file, truststore, + truststore_password, truststore_password_file, cipher_suites, protocol, accepted_protocols, algorithm, + store_type, require_client_auth, require_endpoint_verification, enabled, + optional, max_certificate_validity_period, certificate_validity_warn_threshold).applyConfig(); + } + } + } + + public static class ServerEncryptionOptions extends EncryptionOptions + { + private static final Logger logger = LoggerFactory.getLogger(ServerEncryptionOptions.class); + public enum InternodeEncryption { all, none, dc, rack @@ -788,6 +844,73 @@ public ServerEncryptionOptions(ServerEncryptionOptions options) this.outbound_keystore_password_file = options.outbound_keystore_password_file; } + public static class Builder extends EncryptionOptions.Builder + { + private InternodeEncryption internode_encryption; + private boolean legacy_ssl_storage_port_enabled; + private String outbound_keystore; + private String outbound_keystore_password; + private String outbound_keystore_password_file; + + public Builder() + { + this(new ServerEncryptionOptions()); + } + + public Builder(ServerEncryptionOptions options) + { + super(options); + this.internode_encryption = options.internode_encryption; + this.legacy_ssl_storage_port_enabled = options.legacy_ssl_storage_port_enabled; + this.outbound_keystore = options.outbound_keystore; + this.outbound_keystore_password = options.outbound_keystore_password; + this.outbound_keystore_password_file = options.outbound_keystore_password_file; + } + + public Builder withInternodeEncryption(InternodeEncryption internode_encryption) + { + this.internode_encryption = internode_encryption; + return this; + } + + public Builder withLegacySslStoragePort(boolean enable_legacy_ssl_storage_port) + { + this.legacy_ssl_storage_port_enabled = enable_legacy_ssl_storage_port; + return this; + } + + public Builder withOutboundKeystore(String outboundKeystore) + { + this.outbound_keystore = outboundKeystore; + return this; + } + + public Builder withOutboundKeystorePassword(String outboundKeystorePassword) + { + this.outbound_keystore_password = outboundKeystorePassword; + return this; + } + + public Builder withOutboundKeystorePasswordFile(String outboundKeystorePasswordFile) + { + this.outbound_keystore_password_file = outboundKeystorePasswordFile; + return this; + } + + @Override + public ServerEncryptionOptions build() + { + return new ServerEncryptionOptions(ssl_context_factory, keystore, keystore_password, keystore_password_file, + outbound_keystore, outbound_keystore_password, outbound_keystore_password_file, + truststore, truststore_password, truststore_password_file, + cipher_suites, protocol, accepted_protocols, + algorithm, store_type, require_client_auth, + require_endpoint_verification, optional, internode_encryption, + legacy_ssl_storage_port_enabled, max_certificate_validity_period, + certificate_validity_warn_threshold).applyConfig(); + } + } + @Override protected void fillSslContextParams(Map sslContextFactoryParameters) { @@ -798,7 +921,7 @@ protected void fillSslContextParams(Map sslContextFactoryParamet } @Override - public EncryptionOptions applyConfig() + public ServerEncryptionOptions applyConfig() { return applyConfigInternal(); } @@ -814,13 +937,13 @@ private ServerEncryptionOptions applyConfigInternal() logger.warn("Setting server_encryption_options.enabled has no effect, use internode_encryption"); } - if (getClientAuth() != ClientAuth.NOT_REQUIRED && (internode_encryption == InternodeEncryption.rack || internode_encryption == InternodeEncryption.dc)) + if (getClientAuth() != ClientEncryptionOptions.ClientAuth.NOT_REQUIRED && (internode_encryption == InternodeEncryption.rack || internode_encryption == InternodeEncryption.dc)) { logger.warn("Setting require_client_auth is incompatible with 'rack' and 'dc' internode_encryption values." - + " It is possible for an internode connection to pretend to be in the same rack/dc by spoofing" - + " its broadcast address in the handshake and bypass authentication. To ensure that mutual TLS" - + " authentication is not bypassed, please set internode_encryption to 'all'. Continuing with" - + " insecure configuration."); + + " It is possible for an internode connection to pretend to be in the same rack/dc by spoofing" + + " its broadcast address in the handshake and bypass authentication. To ensure that mutual TLS" + + " authentication is not bypassed, please set internode_encryption to 'all'. Continuing with" + + " insecure configuration."); } // regardless of the optional flag, if the internode encryption is set to rack or dc @@ -910,284 +1033,5 @@ public int hashCode() result += 31 * (outbound_keystore_password_file == null ? 0 : outbound_keystore_password_file.hashCode()); return result; } - - @Override - public ServerEncryptionOptions withSslContextFactory(ParameterizedClass sslContextFactoryClass) - { - return new ServerEncryptionOptions(sslContextFactoryClass, keystore, keystore_password, keystore_password_file, - outbound_keystore, outbound_keystore_password, outbound_keystore_password_file, - truststore, truststore_password, truststore_password_file, - cipher_suites, protocol, accepted_protocols, - algorithm, store_type, require_client_auth, - require_endpoint_verification, optional, internode_encryption, - legacy_ssl_storage_port_enabled, max_certificate_validity_period, - max_certificate_validity_period).applyConfigInternal(); - } - - @Override - public ServerEncryptionOptions withKeyStore(String keystore) - { - return new ServerEncryptionOptions(ssl_context_factory, keystore, keystore_password, keystore_password_file, - outbound_keystore, outbound_keystore_password, outbound_keystore_password_file, - truststore, truststore_password, truststore_password_file, - cipher_suites, protocol, accepted_protocols, - algorithm, store_type, require_client_auth, - require_endpoint_verification, optional, internode_encryption, - legacy_ssl_storage_port_enabled, max_certificate_validity_period, - max_certificate_validity_period).applyConfigInternal(); - } - - @Override - public ServerEncryptionOptions withKeyStorePassword(String keystore_password) - { - return new ServerEncryptionOptions(ssl_context_factory, keystore, keystore_password, keystore_password_file, - outbound_keystore, outbound_keystore_password, outbound_keystore_password_file, - truststore, truststore_password, truststore_password_file, - cipher_suites, protocol, accepted_protocols, - algorithm, store_type, require_client_auth, - require_endpoint_verification, optional, internode_encryption, - legacy_ssl_storage_port_enabled, max_certificate_validity_period, - max_certificate_validity_period).applyConfigInternal(); - } - - @Override - public ServerEncryptionOptions withKeyStorePasswordFile(String keystore_password_file) - { - return new ServerEncryptionOptions(ssl_context_factory, keystore, keystore_password, keystore_password_file, - outbound_keystore, outbound_keystore_password, outbound_keystore_password_file, - truststore, truststore_password, truststore_password_file, - cipher_suites, protocol, accepted_protocols, - algorithm, store_type, require_client_auth, - require_endpoint_verification, optional, internode_encryption, - legacy_ssl_storage_port_enabled, max_certificate_validity_period, - max_certificate_validity_period).applyConfigInternal(); - } - - @Override - public ServerEncryptionOptions withTrustStore(String truststore) - { - return new ServerEncryptionOptions(ssl_context_factory, keystore, keystore_password, keystore_password_file, - outbound_keystore, outbound_keystore_password, outbound_keystore_password_file, - truststore, truststore_password, truststore_password_file, - cipher_suites, protocol, accepted_protocols, - algorithm, store_type, require_client_auth, - require_endpoint_verification, optional, internode_encryption, - legacy_ssl_storage_port_enabled, max_certificate_validity_period, - max_certificate_validity_period).applyConfigInternal(); - } - - @Override - public ServerEncryptionOptions withTrustStorePassword(String truststore_password) - { - return new ServerEncryptionOptions(ssl_context_factory, keystore, keystore_password, keystore_password_file, - outbound_keystore, outbound_keystore_password, outbound_keystore_password_file, - truststore, truststore_password, truststore_password_file, - cipher_suites, protocol, accepted_protocols, - algorithm, store_type, require_client_auth, - require_endpoint_verification, optional, internode_encryption, - legacy_ssl_storage_port_enabled, max_certificate_validity_period, - max_certificate_validity_period).applyConfigInternal(); - } - - @Override - public ServerEncryptionOptions withTrustStorePasswordFile(String truststore_password_file) - { - return new ServerEncryptionOptions(ssl_context_factory, keystore, keystore_password, keystore_password_file, - outbound_keystore, outbound_keystore_password, outbound_keystore_password_file, - truststore, truststore_password, truststore_password_file, - cipher_suites, protocol, accepted_protocols, - algorithm, store_type, require_client_auth, - require_endpoint_verification, optional, internode_encryption, - legacy_ssl_storage_port_enabled, max_certificate_validity_period, - max_certificate_validity_period).applyConfigInternal(); - } - - @Override - public ServerEncryptionOptions withCipherSuites(List cipher_suites) - { - return new ServerEncryptionOptions(ssl_context_factory, keystore, keystore_password, keystore_password_file, - outbound_keystore, outbound_keystore_password, outbound_keystore_password_file, - truststore, truststore_password, truststore_password_file, - cipher_suites, protocol, accepted_protocols, - algorithm, store_type, require_client_auth, - require_endpoint_verification, optional, internode_encryption, - legacy_ssl_storage_port_enabled, max_certificate_validity_period, - max_certificate_validity_period).applyConfigInternal(); - } - - @Override - public ServerEncryptionOptions withCipherSuites(String... cipher_suites) - { - return new ServerEncryptionOptions(ssl_context_factory, keystore, keystore_password, keystore_password_file, - outbound_keystore, outbound_keystore_password, outbound_keystore_password_file, - truststore, truststore_password, truststore_password_file, - Arrays.asList(cipher_suites), protocol, accepted_protocols, - algorithm, store_type, require_client_auth, - require_endpoint_verification, optional, internode_encryption, - legacy_ssl_storage_port_enabled, max_certificate_validity_period, - max_certificate_validity_period).applyConfigInternal(); - } - - @Override - public ServerEncryptionOptions withProtocol(String protocol) - { - return new ServerEncryptionOptions(ssl_context_factory, keystore, keystore_password, keystore_password_file, - outbound_keystore, outbound_keystore_password, outbound_keystore_password_file, - truststore, truststore_password, truststore_password_file, - cipher_suites, protocol, accepted_protocols, - algorithm, store_type, require_client_auth, - require_endpoint_verification, optional, internode_encryption, - legacy_ssl_storage_port_enabled, max_certificate_validity_period, - max_certificate_validity_period).applyConfigInternal(); - } - - @Override - public ServerEncryptionOptions withAcceptedProtocols(List accepted_protocols) - { - return new ServerEncryptionOptions(ssl_context_factory, keystore, keystore_password, keystore_password_file, - outbound_keystore, outbound_keystore_password, outbound_keystore_password_file, - truststore, truststore_password, truststore_password_file, - cipher_suites, protocol, accepted_protocols, - algorithm, store_type, require_client_auth, - require_endpoint_verification, optional, internode_encryption, - legacy_ssl_storage_port_enabled, max_certificate_validity_period, - max_certificate_validity_period).applyConfigInternal(); - } - - @Override - public ServerEncryptionOptions withAlgorithm(String algorithm) - { - return new ServerEncryptionOptions(ssl_context_factory, keystore, keystore_password, keystore_password_file, - outbound_keystore, outbound_keystore_password, outbound_keystore_password_file, - truststore, truststore_password, truststore_password_file, - cipher_suites, protocol, accepted_protocols, - algorithm, store_type, require_client_auth, - require_endpoint_verification, optional, internode_encryption, - legacy_ssl_storage_port_enabled, max_certificate_validity_period, - max_certificate_validity_period).applyConfigInternal(); - } - - @Override - public ServerEncryptionOptions withStoreType(String store_type) - { - return new ServerEncryptionOptions(ssl_context_factory, keystore, keystore_password, keystore_password_file, - outbound_keystore, outbound_keystore_password, outbound_keystore_password_file, - truststore, truststore_password, truststore_password_file, - cipher_suites, protocol, accepted_protocols, - algorithm, store_type, require_client_auth, - require_endpoint_verification, optional, internode_encryption, - legacy_ssl_storage_port_enabled, max_certificate_validity_period, - max_certificate_validity_period).applyConfigInternal(); - } - - @Override - public ServerEncryptionOptions withRequireClientAuth(ClientAuth require_client_auth) - { - return new ServerEncryptionOptions(ssl_context_factory, keystore, keystore_password, keystore_password_file, - outbound_keystore, outbound_keystore_password, outbound_keystore_password_file, - truststore, truststore_password, truststore_password_file, - cipher_suites, protocol, accepted_protocols, - algorithm, store_type, require_client_auth.value, - require_endpoint_verification, optional, internode_encryption, - legacy_ssl_storage_port_enabled, max_certificate_validity_period, - max_certificate_validity_period).applyConfigInternal(); - } - - @Override - public ServerEncryptionOptions withRequireEndpointVerification(boolean require_endpoint_verification) - { - return new ServerEncryptionOptions(ssl_context_factory, keystore, keystore_password, keystore_password_file, - outbound_keystore, outbound_keystore_password, outbound_keystore_password_file, - truststore, truststore_password, truststore_password_file, - cipher_suites, protocol, accepted_protocols, - algorithm, store_type, require_client_auth, - require_endpoint_verification, optional, internode_encryption, - legacy_ssl_storage_port_enabled, max_certificate_validity_period, - max_certificate_validity_period).applyConfigInternal(); - } - - public ServerEncryptionOptions withOptional(boolean optional) - { - return new ServerEncryptionOptions(ssl_context_factory, keystore, keystore_password, keystore_password_file, - outbound_keystore, outbound_keystore_password, outbound_keystore_password_file, - truststore, truststore_password, truststore_password_file, - cipher_suites, protocol, accepted_protocols, - algorithm, store_type, require_client_auth, - require_endpoint_verification, optional, internode_encryption, - legacy_ssl_storage_port_enabled, max_certificate_validity_period, - max_certificate_validity_period).applyConfigInternal(); - } - - public ServerEncryptionOptions withInternodeEncryption(InternodeEncryption internode_encryption) - { - return new ServerEncryptionOptions(ssl_context_factory, keystore, keystore_password, keystore_password_file, - outbound_keystore, outbound_keystore_password, outbound_keystore_password_file, - truststore, truststore_password, truststore_password_file, - cipher_suites, protocol, accepted_protocols, - algorithm, store_type, require_client_auth, - require_endpoint_verification, optional, internode_encryption, - legacy_ssl_storage_port_enabled, max_certificate_validity_period, - max_certificate_validity_period).applyConfigInternal(); - } - - public ServerEncryptionOptions withLegacySslStoragePort(boolean enable_legacy_ssl_storage_port) - { - return new ServerEncryptionOptions(ssl_context_factory, keystore, keystore_password, keystore_password_file, - outbound_keystore, outbound_keystore_password, outbound_keystore_password_file, - truststore, truststore_password, truststore_password_file, - cipher_suites, protocol, accepted_protocols, - algorithm, store_type, require_client_auth, - require_endpoint_verification, optional, internode_encryption, - enable_legacy_ssl_storage_port, max_certificate_validity_period, - max_certificate_validity_period).applyConfigInternal(); - } - - public ServerEncryptionOptions withOutboundKeystore(String outboundKeystore) - { - return new ServerEncryptionOptions(ssl_context_factory, keystore, keystore_password, keystore_password_file, - outboundKeystore, outbound_keystore_password, outbound_keystore_password_file, - truststore, truststore_password, truststore_password_file, - cipher_suites, protocol, accepted_protocols, - algorithm, store_type, require_client_auth, - require_endpoint_verification, optional, internode_encryption, - legacy_ssl_storage_port_enabled, max_certificate_validity_period, - max_certificate_validity_period).applyConfigInternal(); - } - - public ServerEncryptionOptions withOutboundKeystorePassword(String outboundKeystorePassword) - { - return new ServerEncryptionOptions(ssl_context_factory, keystore, keystore_password, keystore_password_file, - outbound_keystore, outboundKeystorePassword, outbound_keystore_password_file, - truststore, truststore_password, truststore_password_file, - cipher_suites, protocol, accepted_protocols, - algorithm, store_type, require_client_auth, - require_endpoint_verification, optional, internode_encryption, - legacy_ssl_storage_port_enabled, max_certificate_validity_period, - max_certificate_validity_period).applyConfigInternal(); - } - - public ServerEncryptionOptions withOutboundKeystorePasswordFile(String outboundKeystorePasswordFile) - { - return new ServerEncryptionOptions(ssl_context_factory, keystore, keystore_password, keystore_password_file, - outbound_keystore, outbound_keystore_password, outboundKeystorePasswordFile, - truststore, truststore_password, truststore_password_file, - cipher_suites, protocol, accepted_protocols, - algorithm, store_type, require_client_auth, - require_endpoint_verification, optional, internode_encryption, - legacy_ssl_storage_port_enabled, max_certificate_validity_period, - max_certificate_validity_period).applyConfigInternal(); - } - @Override - public ServerEncryptionOptions withMaxCertificateValidityPeriod(DurationSpec.IntMinutesBound maxCertificateValidityPeriod) - { - return new ServerEncryptionOptions(ssl_context_factory, keystore, keystore_password, keystore_password_file, - outbound_keystore, outbound_keystore_password, outbound_keystore_password_file, - truststore, truststore_password, truststore_password_file, - cipher_suites, protocol, accepted_protocols, - algorithm, store_type, require_client_auth, - require_endpoint_verification, optional, internode_encryption, - legacy_ssl_storage_port_enabled, maxCertificateValidityPeriod, - certificate_validity_warn_threshold).applyConfigInternal(); - } } } diff --git a/src/java/org/apache/cassandra/config/JMXServerOptions.java b/src/java/org/apache/cassandra/config/JMXServerOptions.java index 705ab02bb898..ab80307e5460 100644 --- a/src/java/org/apache/cassandra/config/JMXServerOptions.java +++ b/src/java/org/apache/cassandra/config/JMXServerOptions.java @@ -52,7 +52,7 @@ public class JMXServerOptions public final Boolean authenticate; // ssl options - public final EncryptionOptions jmx_encryption_options; + public final EncryptionOptions.ClientEncryptionOptions jmx_encryption_options; // options for using Cassandra's own authentication mechanisms public final String login_config_name; @@ -71,11 +71,11 @@ public class JMXServerOptions public JMXServerOptions() { this(true, false, 7199, 0, false, - new EncryptionOptions(), null, null, null, + new EncryptionOptions.ClientEncryptionOptions(), null, null, null, null, null); } - public static JMXServerOptions create(boolean enabled, boolean local, int jmxPort, EncryptionOptions options) + public static JMXServerOptions create(boolean enabled, boolean local, int jmxPort, EncryptionOptions.ClientEncryptionOptions options) { return new JMXServerOptions(enabled, !local, jmxPort, 0, false, options, null, null, null, @@ -95,7 +95,7 @@ public JMXServerOptions(Boolean enabled, int jmxPort, int rmiPort, Boolean authenticate, - EncryptionOptions jmx_encryption_options, + EncryptionOptions.ClientEncryptionOptions jmx_encryption_options, String loginConfigName, String loginConfigFile, String passwordFile, @@ -198,7 +198,8 @@ public static JMXServerOptions createParsingSystemProperties() // in the `cassandra.yaml`. Since the JMX SSL Config can also leverage it as per CASSANDRA-18508, password file // support is not added to the JMX SSL configuration via the system properties. Hence, `null` is used as // the password file arguments for the keystore and the truststore while constructing the encryption options here. - EncryptionOptions encryptionOptions = new EncryptionOptions(new ParameterizedClass("org.apache.cassandra.security.DefaultSslContextFactory", new HashMap<>()), + + EncryptionOptions.ClientEncryptionOptions encryptionOptions = new EncryptionOptions.ClientEncryptionOptions(new ParameterizedClass("org.apache.cassandra.security.DefaultSslContextFactory", new HashMap<>()), keystore, keystorePassword, null, diff --git a/src/java/org/apache/cassandra/config/OptionaldPositiveInt.java b/src/java/org/apache/cassandra/config/OptionaldPositiveInt.java new file mode 100644 index 000000000000..ea33b7af98f6 --- /dev/null +++ b/src/java/org/apache/cassandra/config/OptionaldPositiveInt.java @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.config; + +import java.util.Objects; +import java.util.function.IntSupplier; + +public class OptionaldPositiveInt +{ + public static final int UNDEFINED_VALUE = -1; + public static final OptionaldPositiveInt UNDEFINED = new OptionaldPositiveInt(UNDEFINED_VALUE); + + private final int value; + + public OptionaldPositiveInt(int value) + { + if (!(value == -1 || value >= 1)) + throw new IllegalArgumentException(String.format("Only -1 (undefined) and positive values are allowed; given %d", value)); + this.value = value; + } + + public boolean isDefined() + { + return value != UNDEFINED_VALUE; + } + + public int or(int defaultValue) + { + return value == UNDEFINED_VALUE ? defaultValue : value; + } + + public int or(IntSupplier defaultValue) + { + return value == UNDEFINED_VALUE ? defaultValue.getAsInt() : value; + } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + OptionaldPositiveInt that = (OptionaldPositiveInt) o; + return value == that.value; + } + + @Override + public int hashCode() + { + return Objects.hash(value); + } + + @Override + public String toString() + { + return !isDefined() ? "null" : Integer.toString(value); + } +} diff --git a/src/java/org/apache/cassandra/config/RetrySpec.java b/src/java/org/apache/cassandra/config/RetrySpec.java index 4f113af962b5..ff9b58f827ae 100644 --- a/src/java/org/apache/cassandra/config/RetrySpec.java +++ b/src/java/org/apache/cassandra/config/RetrySpec.java @@ -23,6 +23,10 @@ import javax.annotation.Nullable; import org.apache.cassandra.config.DurationSpec.LongMillisecondsBound; +import org.apache.cassandra.repair.SharedContext; +import org.apache.cassandra.service.RetryStrategy; +import org.apache.cassandra.service.TimeoutStrategy.LatencySourceFactory; +import org.apache.cassandra.service.WaitStrategy; public class RetrySpec { @@ -153,6 +157,13 @@ public LongMillisecondsBound getMaxSleepTime() return !isEnabled() ? null : maxSleepTime; } + public static WaitStrategy toStrategy(SharedContext ctx, RetrySpec spec) + { + if (!spec.isEnabled()) + return WaitStrategy.None.INSTANCE; + return RetryStrategy.parse(spec.baseSleepTime.toMilliseconds() + "ms * 2^attempts <= " + spec.maxSleepTime.toMilliseconds() + "ms,retries=" + (spec.maxAttempts.value - 1), LatencySourceFactory.none()); + } + @Override public String toString() { diff --git a/src/java/org/apache/cassandra/config/StringRetryStrategy.java b/src/java/org/apache/cassandra/config/StringRetryStrategy.java new file mode 100644 index 000000000000..6003c6591ac7 --- /dev/null +++ b/src/java/org/apache/cassandra/config/StringRetryStrategy.java @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.config; + +import org.apache.cassandra.service.RetryStrategy; + +import static org.apache.cassandra.service.TimeoutStrategy.LatencySourceFactory.none; + +public class StringRetryStrategy +{ + private final String spec; + private final RetryStrategy retry; + + public StringRetryStrategy(String spec) + { + this.spec = spec; + this.retry = RetryStrategy.parse(spec, none()); + } + + public RetryStrategy retry() + { + return retry; + } + + @Override + public String toString() + { + return spec; + } +} diff --git a/src/java/org/apache/cassandra/config/YamlConfigurationLoader.java b/src/java/org/apache/cassandra/config/YamlConfigurationLoader.java index 9bf4e415592c..f37a42e8fa54 100644 --- a/src/java/org/apache/cassandra/config/YamlConfigurationLoader.java +++ b/src/java/org/apache/cassandra/config/YamlConfigurationLoader.java @@ -135,16 +135,7 @@ public Config loadConfig(URL url) throws ConfigurationException throw new AssertionError(e); } - SafeConstructor constructor = new CustomConstructor(Config.class, Yaml.class.getClassLoader()); - Map, Map> replacements = getNameReplacements(Config.class); - verifyReplacements(replacements, configBytes); - PropertiesChecker propertiesChecker = new PropertiesChecker(replacements); - constructor.setPropertyUtils(propertiesChecker); - Yaml yaml = new Yaml(constructor); - Config result = loadConfig(yaml, configBytes); - propertiesChecker.check(); - maybeAddSystemProperties(result); - return result; + return loadConfig(configBytes); } catch (YAMLException e) { @@ -152,6 +143,21 @@ public Config loadConfig(URL url) throws ConfigurationException } } + @VisibleForTesting + static Config loadConfig(byte[] configBytes) + { + SafeConstructor constructor = new CustomConstructor(Config.class, Yaml.class.getClassLoader()); + Map, Map> replacements = getNameReplacements(Config.class); + verifyReplacements(replacements, configBytes); + PropertiesChecker propertiesChecker = new PropertiesChecker(replacements); + constructor.setPropertyUtils(propertiesChecker); + Yaml yaml = new Yaml(constructor); + Config result = loadConfig(yaml, configBytes); + propertiesChecker.check(); + maybeAddSystemProperties(result); + return result; + } + private static void maybeAddSystemProperties(Object obj) { if (CassandraRelevantProperties.CONFIG_ALLOW_SYSTEM_PROPERTIES.getBoolean()) diff --git a/src/java/org/apache/cassandra/cql3/CQLStatement.java b/src/java/org/apache/cassandra/cql3/CQLStatement.java index 349e79b30ff4..db9896361ecf 100644 --- a/src/java/org/apache/cassandra/cql3/CQLStatement.java +++ b/src/java/org/apache/cassandra/cql3/CQLStatement.java @@ -133,4 +133,14 @@ interface SingleKeyspaceCqlStatement extends CQLStatement { String keyspace(); } + + interface CompositeCQLStatement extends CQLStatement + { + Iterable getStatements(); + } + + interface ReturningCQLStatement extends CQLStatement + { + ResultSet.ResultMetadata getResultMetadata(); + } } diff --git a/src/java/org/apache/cassandra/cql3/ColumnsExpression.java b/src/java/org/apache/cassandra/cql3/ColumnsExpression.java index 78b0b8e9cd5b..b078fe5196ae 100644 --- a/src/java/org/apache/cassandra/cql3/ColumnsExpression.java +++ b/src/java/org/apache/cassandra/cql3/ColumnsExpression.java @@ -252,18 +252,21 @@ String toCQLString(List identifiers, ElementExpression.Raw raw */ private final List columns; + private final TableMetadata table; + /** * The element if this is an ELEMENT expression, {@code null} otherwise. * Like UDT field or collection element. */ private final ElementExpression element; //Only relevant for ELEMENT kind - ColumnsExpression(Kind kind, AbstractType type, List columns, ElementExpression element) + ColumnsExpression(Kind kind, AbstractType type, List columns, TableMetadata table, ElementExpression element) { assert kind != Kind.ELEMENT || element != null: "Element expression must have an element"; this.kind = kind; this.type = type; this.columns = columns; + this.table = table; this.element = element; // This could be null for kinds that don't use it } @@ -281,9 +284,9 @@ public AbstractType type() * @param column the column * @return an expression for a single column. */ - public static ColumnsExpression singleColumn(ColumnMetadata column) + public static ColumnsExpression singleColumn(ColumnMetadata column, TableMetadata table) { - return new ColumnsExpression(Kind.SINGLE_COLUMN, column.type, ImmutableList.of(column), null); + return new ColumnsExpression(Kind.SINGLE_COLUMN, column.type, ImmutableList.of(column), table, null); } /** @@ -292,10 +295,10 @@ public static ColumnsExpression singleColumn(ColumnMetadata column) * @return an expression for multi-columns. */ @VisibleForTesting - public static ColumnsExpression multiColumns(List columns) + public static ColumnsExpression multiColumns(List columns, TableMetadata table) { AbstractType type = new TupleType(ColumnMetadata.types(columns)); - return new ColumnsExpression(Kind.MULTI_COLUMN, type, ImmutableList.copyOf(columns),null); + return new ColumnsExpression(Kind.MULTI_COLUMN, type, ImmutableList.copyOf(columns), table,null); } /** @@ -307,6 +310,11 @@ public ColumnMetadata firstColumn() return columns().get(0); } + public TableMetadata table() + { + return table; + } + /** * Returns the last column metadata. * @return the last column metadata. @@ -565,7 +573,7 @@ public ColumnsExpression prepare(TableMetadata table) AbstractType type = kind.type(table, columns, elementExpression); - return new ColumnsExpression(kind, type, columns, elementExpression); + return new ColumnsExpression(kind, type, columns, table, elementExpression); } /** diff --git a/src/java/org/apache/cassandra/cql3/Operation.java b/src/java/org/apache/cassandra/cql3/Operation.java index 7c5e02eb63e3..646c07e57472 100644 --- a/src/java/org/apache/cassandra/cql3/Operation.java +++ b/src/java/org/apache/cassandra/cql3/Operation.java @@ -27,7 +27,15 @@ import org.apache.cassandra.cql3.terms.Term; import org.apache.cassandra.cql3.terms.UserTypes; import org.apache.cassandra.db.DecoratedKey; -import org.apache.cassandra.db.marshal.*; +import org.apache.cassandra.db.marshal.CollectionType; +import org.apache.cassandra.db.marshal.CounterColumnType; +import org.apache.cassandra.db.marshal.ListType; +import org.apache.cassandra.db.marshal.MapType; +import org.apache.cassandra.db.marshal.NumberType; +import org.apache.cassandra.db.marshal.SetType; +import org.apache.cassandra.db.marshal.StringType; +import org.apache.cassandra.db.marshal.TupleType; +import org.apache.cassandra.db.marshal.UserType; import org.apache.cassandra.exceptions.InvalidRequestException; import org.apache.cassandra.schema.ColumnMetadata; import org.apache.cassandra.schema.TableMetadata; @@ -62,6 +70,11 @@ protected Operation(ColumnMetadata column, Term t) this.t = t; } + public Term term() + { + return t; + } + public void addFunctionsTo(List functions) { if (t != null) @@ -69,14 +82,23 @@ public void addFunctionsTo(List functions) } /** - * @return whether the operation requires a read of the previous value to be executed - * (only lists setterByIdx, discard and discardByIdx requires that). + * @return whether the operation requires a read of the existing value to be executed */ public boolean requiresRead() { return false; } + + /** + * @return whether the operation requires its timestamp to be known to be executed safely + */ + public boolean requiresTimestamp() + { + return false; + } + + /** * Collects the column specification for the bind variables of this operation. * @@ -178,7 +200,7 @@ public Operation prepare(TableMetadata metadata, ColumnMetadata receiver, boolea if (receiver.type.isCollection()) { - switch (((CollectionType) receiver.type).kind) + switch (((CollectionType) receiver.type).kind) { case LIST: return new Lists.Setter(receiver, v); @@ -228,7 +250,7 @@ public Operation prepare(TableMetadata metadata, ColumnMetadata receiver, boolea else if (!(receiver.type.isMultiCell())) throw new InvalidRequestException(String.format("Invalid operation (%s) for frozen collection column %s", toString(receiver), receiver.name)); - switch (((CollectionType)receiver.type).kind) + switch (((CollectionType)receiver.type).kind) { case LIST: Term idx = selector.prepare(metadata.keyspace, Lists.indexSpecOf(receiver)); @@ -328,7 +350,7 @@ public Operation prepare(TableMetadata metadata, ColumnMetadata receiver, boolea else if (!(receiver.type.isMultiCell())) throw new InvalidRequestException(String.format("Invalid operation (%s) for frozen collection column %s", toString(receiver), receiver.name)); - switch (((CollectionType)receiver.type).kind) + switch (((CollectionType)receiver.type).kind) { case LIST: return new Lists.Appender(receiver, value.prepare(metadata.keyspace, receiver)); @@ -371,7 +393,7 @@ public Substraction(Term.Raw value) } public Operation prepare(TableMetadata metadata, ColumnMetadata receiver, boolean canReadExistingState) throws InvalidRequestException - { + { if (!(receiver.type instanceof CollectionType)) { if (canReadExistingState) @@ -389,7 +411,7 @@ public Operation prepare(TableMetadata metadata, ColumnMetadata receiver, boolea else if (!(receiver.type.isMultiCell())) throw new InvalidRequestException(String.format("Invalid operation (%s) for frozen collection column %s", toString(receiver), receiver.name)); - switch (((CollectionType)receiver.type).kind) + switch (((CollectionType)receiver.type).kind) { case LIST: return new Lists.Discarder(receiver, value.prepare(metadata.keyspace, receiver)); @@ -400,7 +422,7 @@ else if (!(receiver.type.isMultiCell())) ColumnSpecification vr = new ColumnSpecification(receiver.ksName, receiver.cfName, receiver.name, - SetType.getInstance(((MapType)receiver.type).getKeysType(), false)); + SetType.getInstance(((MapType) receiver.type).getKeysType(), true)); Term term; try { @@ -502,7 +524,7 @@ public Operation prepare(String keyspace, ColumnMetadata receiver, TableMetadata else if (!(receiver.type.isMultiCell())) throw new InvalidRequestException(String.format("Invalid deletion operation for frozen collection column %s", receiver.name)); - switch (((CollectionType)receiver.type).kind) + switch (((CollectionType)receiver.type).kind) { case LIST: Term idx = element.prepare(keyspace, Lists.indexSpecOf(receiver)); diff --git a/src/java/org/apache/cassandra/cql3/Operations.java b/src/java/org/apache/cassandra/cql3/Operations.java index a9451d7fc544..f0dbd91f22e1 100644 --- a/src/java/org/apache/cassandra/cql3/Operations.java +++ b/src/java/org/apache/cassandra/cql3/Operations.java @@ -21,10 +21,14 @@ import java.util.Iterator; import java.util.List; +import com.google.common.base.Preconditions; +import com.google.common.collect.Iterators; + import org.apache.cassandra.cql3.functions.Function; import org.apache.cassandra.cql3.statements.StatementType; - -import com.google.common.collect.Iterators; +import org.apache.cassandra.cql3.transactions.ReferenceOperation; +import org.apache.cassandra.schema.ColumnMetadata; +import org.apache.cassandra.schema.TableMetadata; /** * A set of Operations. @@ -36,6 +40,10 @@ public final class Operations implements Iterable * The type of statement. */ private final StatementType type; + /** + * If this operation is for a Transaction; this causes Operations to "migrate" when they require-read + */ + private final boolean isForTxn; /** * The operations on regular columns. @@ -47,9 +55,29 @@ public final class Operations implements Iterable */ private final List staticOperations = new ArrayList<>(); - public Operations(StatementType type) + private final List regularSubstitutions = new ArrayList<>(); + private final List staticSubstitutions = new ArrayList<>(); + + public Operations(StatementType type, boolean isForTxn) { this.type = type; + this.isForTxn = isForTxn; + } + + private Operations(Operations other, TableMetadata tableMetadata) + { + Preconditions.checkState(!other.isForTxn, "Unable to migrate from txn to txn"); + Preconditions.checkState(other.regularSubstitutions.isEmpty() && other.staticSubstitutions.isEmpty(), "Transaction substitutions are defined for a non-transaction operations! regular=%s, static=%s", other.regularSubstitutions, other.staticSubstitutions); + + type = other.type; + isForTxn = true; + for (Operation opt : other) + add(opt, tableMetadata); + } + + public Operations forTxn(TableMetadata tableMetadata) + { + return new Operations(this, tableMetadata); } /** @@ -59,7 +87,7 @@ public Operations(StatementType type) */ public boolean appliesToStaticColumns() { - return !staticOperations.isEmpty(); + return !staticIsEmpty(); } /** @@ -69,10 +97,10 @@ public boolean appliesToStaticColumns() */ public boolean appliesToRegularColumns() { - // If we have regular operations, this applies to regular columns. + // If we have regular operations, this applies to regular columns. // Otherwise, if the statement is a DELETE and staticOperations is also empty, this means we have no operations, // which for a DELETE means a full row deletion. Which means the operation applies to all columns and regular ones in particular. - return !regularOperations.isEmpty() || (type.isDelete() && staticOperations.isEmpty()); + return !regularIsEmpty() || (type.isDelete() && staticIsEmpty()); } /** @@ -95,16 +123,32 @@ public List staticOperations() /** * Adds the specified Operation to this set of operations. - * @param operation the operation to add + * + * @param operation the operation to add + * @param tableMetadata */ - public void add(Operation operation) + public void add(Operation operation, TableMetadata tableMetadata) { + if (isForTxn && (operation.requiresRead() || operation.requiresTimestamp())) + { + add(operation.column, ReferenceOperation.create(operation, tableMetadata)); + return; + } if (operation.column.isStatic()) staticOperations.add(operation); else regularOperations.add(operation); } + public void add(ColumnMetadata column, ReferenceOperation operation) + { + Preconditions.checkState(isForTxn, "Unable to add a transaction reference to a non-transaction operation"); + if (column.isStatic()) + staticSubstitutions.add(operation); + else + regularSubstitutions.add(operation); + } + /** * Checks if one of the operations requires a read. * @@ -126,7 +170,7 @@ public boolean requiresRead() */ public boolean isEmpty() { - return staticOperations.isEmpty() && regularOperations.isEmpty(); + return staticIsEmpty() && regularIsEmpty(); } /** @@ -142,5 +186,41 @@ public void addFunctionsTo(List functions) { regularOperations.forEach(p -> p.addFunctionsTo(functions)); staticOperations.forEach(p -> p.addFunctionsTo(functions)); + //TODO substitutions as well? + } + + public List allSubstitutions() + { + if (staticSubstitutions.isEmpty()) + return regularSubstitutions; + + if (regularSubstitutions.isEmpty()) + return staticSubstitutions; + + // Only create a new list if we actually have something to combine + List list = new ArrayList<>(staticSubstitutions.size() + regularSubstitutions.size()); + list.addAll(staticSubstitutions); + list.addAll(regularSubstitutions); + return list; + } + + public List regularSubstitutions() + { + return regularSubstitutions; + } + + public List staticSubstitutions() + { + return staticSubstitutions; + } + + private boolean regularIsEmpty() + { + return regularOperations.isEmpty() && regularSubstitutions.isEmpty(); + } + + private boolean staticIsEmpty() + { + return staticOperations.isEmpty() && staticSubstitutions.isEmpty(); } } diff --git a/src/java/org/apache/cassandra/cql3/Operator.java b/src/java/org/apache/cassandra/cql3/Operator.java index 64658b43226f..93a81fa2f4bb 100644 --- a/src/java/org/apache/cassandra/cql3/Operator.java +++ b/src/java/org/apache/cassandra/cql3/Operator.java @@ -26,6 +26,7 @@ import java.util.Objects; import java.util.function.Function; import java.util.stream.Collectors; +import java.util.stream.Stream; import com.google.common.collect.RangeSet; @@ -39,13 +40,19 @@ import org.apache.cassandra.db.marshal.SetType; import org.apache.cassandra.db.rows.CellPath; import org.apache.cassandra.db.rows.ComplexColumnData; +import org.apache.cassandra.dht.IPartitioner; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; import org.apache.cassandra.schema.ColumnMetadata; import org.apache.cassandra.serializers.ListSerializer; import org.apache.cassandra.utils.ByteBufferUtil; +import static com.google.common.base.Preconditions.checkArgument; + import static org.apache.cassandra.cql3.statements.RequestValidations.checkFalse; import static org.apache.cassandra.cql3.statements.RequestValidations.checkTrue; import static org.apache.cassandra.cql3.statements.RequestValidations.invalidRequest; +import static org.apache.cassandra.db.TypeSizes.sizeofUnsignedVInt; public enum Operator { @@ -89,12 +96,12 @@ public boolean requiresFilteringOrIndexingFor(ColumnMetadata.Kind columnKind) } @Override - public void restrict(RangeSet rangeSet, List args) + public void restrict(RangeSet rangeSet, List args, IPartitioner partitioner) { assert args.size() == 1 : this + " accept only one single value"; ClusteringElements arg = args.get(0); - rangeSet.removeAll(ClusteringElements.lessThan(arg)); - rangeSet.removeAll(ClusteringElements.greaterThan(arg)); + rangeSet.removeAll(ClusteringElements.lessThan(arg, partitioner)); + rangeSet.removeAll(ClusteringElements.greaterThan(arg, partitioner)); } @Override @@ -138,10 +145,10 @@ public boolean requiresFilteringOrIndexingFor(ColumnMetadata.Kind columnKind) } @Override - public void restrict(RangeSet rangeSet, List args) + public void restrict(RangeSet rangeSet, List args, IPartitioner partitioner) { assert args.size() == 1 : this + " accept only one single value"; - rangeSet.removeAll(ClusteringElements.atLeast(args.get(0))); + rangeSet.removeAll(ClusteringElements.atLeast(args.get(0), partitioner)); } @Override @@ -192,10 +199,10 @@ public boolean requiresFilteringOrIndexingFor(ColumnMetadata.Kind columnKind) } @Override - public void restrict(RangeSet rangeSet, List args) + public void restrict(RangeSet rangeSet, List args, IPartitioner partitioner) { assert args.size() == 1 : this + " accept only one single value"; - rangeSet.removeAll(ClusteringElements.greaterThan(args.get(0))); + rangeSet.removeAll(ClusteringElements.greaterThan(args.get(0), partitioner)); } @Override @@ -246,10 +253,10 @@ public boolean requiresFilteringOrIndexingFor(ColumnMetadata.Kind columnKind) } @Override - public void restrict(RangeSet rangeSet, List args) + public void restrict(RangeSet rangeSet, List args, IPartitioner partitioner) { assert args.size() == 1 : this + " accept only one single value"; - rangeSet.removeAll(ClusteringElements.lessThan(args.get(0))); + rangeSet.removeAll(ClusteringElements.lessThan(args.get(0), partitioner)); } @Override @@ -299,10 +306,10 @@ public boolean requiresFilteringOrIndexingFor(ColumnMetadata.Kind columnKind) } @Override - public void restrict(RangeSet rangeSet, List args) + public void restrict(RangeSet rangeSet, List args, IPartitioner partitioner) { assert args.size() == 1 : this + " accept only one single value"; - rangeSet.removeAll(ClusteringElements.atMost(args.get(0))); + rangeSet.removeAll(ClusteringElements.atMost(args.get(0), partitioner)); } @Override @@ -493,7 +500,7 @@ public boolean isSatisfiedBy(MultiElementType type, ComplexColumnData leftOpe } @Override - public void restrict(RangeSet rangeSet, List args) + public void restrict(RangeSet rangeSet, List args, IPartitioner partitioner) { assert args.size() == 1; rangeSet.remove(ClusteringElements.notEqualTo(args.get(0))); @@ -670,7 +677,7 @@ public boolean requiresFilteringOrIndexingFor(ColumnMetadata.Kind columnKind) } @Override - public void restrict(RangeSet rangeSet, List args) + public void restrict(RangeSet rangeSet, List args, IPartitioner partitioner) { for (ClusteringElements clustering : args) rangeSet.remove(ClusteringElements.notEqualTo(clustering)); @@ -799,12 +806,16 @@ public boolean requiresFilteringOrIndexingFor(ColumnMetadata.Kind columnKind) } @Override - public void restrict(RangeSet rangeSet, List args) + public void restrict(RangeSet rangeSet, List args, IPartitioner partitioner) { assert args.size() == 2 : this + " accepts exactly two values"; - args.sort(ClusteringElements.CQL_COMPARATOR); - rangeSet.removeAll(ClusteringElements.lessThan(args.get(0))); - rangeSet.removeAll(ClusteringElements.greaterThan(args.get(1))); + // avoid sorting when working with token restrictions, otherwise we can't know the difference between these queries: + // select * from x.y where token(id) between 0 and MIN_TOKEN + // select * from x.y where token(id) between MIN_TOKEN and 0 + if (!args.get(0).token) + args.sort(ClusteringElements.CQL_COMPARATOR); + rangeSet.removeAll(ClusteringElements.lessThan(args.get(0), partitioner)); + rangeSet.removeAll(ClusteringElements.greaterThan(args.get(1), partitioner)); } @Override @@ -828,6 +839,25 @@ public enum Kind BINARY, TERNARY, MULTI_VALUE; }; + private static final Operator[] idToOperatorMapping; + + static + { + Operator[] operators = values(); + int maxId = Stream.of(operators) + .map(Operator::getValue) + .max(Integer::compareTo) + .get(); + + idToOperatorMapping = new Operator[maxId + 1]; + for (Operator operator : operators) + { + if (null != idToOperatorMapping[operator.b]) + throw new IllegalStateException("Duplicate Operator id " + operator.b); + idToOperatorMapping[operator.b] = operator; + } + } + /** * The binary representation of this Enum value. */ @@ -853,6 +883,17 @@ public void writeTo(DataOutput output) throws IOException output.writeInt(getValue()); } + /** + * Write the serialized version of this Operator to the specified output. + * + * @param output the output to write to + * @throws IOException if an I/O problem occurs while writing to the specified output + */ + public void writeToUnsignedVInt(DataOutputPlus output) throws IOException + { + output.writeUnsignedVInt32(b); + } + public int getValue() { return b; @@ -885,12 +926,27 @@ public boolean isTernary() */ public static Operator readFrom(DataInput input) throws IOException { - int b = input.readInt(); - for (Operator operator : values()) - if (operator.b == b) - return operator; + return fromBinary(input.readInt()); + } + + /** + * Deserializes a Operator instance from the specified input. + * + * @param input the input to read from + * @return the Operator instance deserialized + * @throws IOException if a problem occurs while deserializing the Type instance. + */ + public static Operator readFromUnsignedVInt(DataInputPlus input) throws IOException + { + return fromBinary(input.readUnsignedVInt32()); + } - throw new IOException(String.format("Cannot resolve Relation.Type from binary representation: %s", b)); + private static Operator fromBinary(int b) throws IOException + { + checkArgument(b > -1, "b must be > -1 to be a valid Operator id"); + if (b > idToOperatorMapping.length) + throw new IOException(String.format("Cannot resolve Operator from binary representation: %s", b)); + return idToOperatorMapping[b]; } @@ -1023,7 +1079,7 @@ public boolean appliesToMapKeys() * @param rangeSet the range set to restrict * @param args the operator arguments */ - public void restrict(RangeSet rangeSet, List args) + public void restrict(RangeSet rangeSet, List args, IPartitioner partitioner) { throw new UnsupportedOperationException(this + " is not a range operator"); } @@ -1149,4 +1205,9 @@ private String buildCQLString(String leftOperand, T rightOperand, Function { - MD5Digest md5Digest = (MD5Digest) key; - if (cause.wasEvicted()) - { - metrics.preparedStatementsEvicted.inc(); - lastMinuteEvictionsCount.incrementAndGet(); - SystemKeyspace.removePreparedStatement(md5Digest); - } - }).build(); + .removalListener((key, prepared, cause) -> evictPreparedStatement(key, cause)) + .build(); ScheduledExecutors.scheduledTasks.scheduleAtFixedRate(() -> { long count = lastMinuteEvictionsCount.getAndSet(0); @@ -157,6 +158,16 @@ public class QueryProcessor implements QueryHandler DatabaseDescriptor.getPreparedStatementsCacheSizeMiB()); } + private static void evictPreparedStatement(MD5Digest key, RemovalCause cause) + { + if (cause.wasEvicted()) + { + metrics.preparedStatementsEvicted.inc(); + lastMinuteEvictionsCount.incrementAndGet(); + SystemKeyspace.removePreparedStatement(key); + } + } + private static long capacityToBytes(long cacheSizeMB) { return cacheSizeMB * 1024 * 1024; @@ -181,6 +192,12 @@ private enum InternalStateInstance } public void preloadPreparedStatements() + { + preloadPreparedStatements(PRELOAD_PREPARED_STATEMENTS_FETCH_SIZE); + } + + @VisibleForTesting + public int preloadPreparedStatements(int pageSize) { long startTime = nanoTime(); int count = SystemKeyspace.loadPreparedStatements((id, query, keyspace) -> { @@ -196,18 +213,19 @@ public void preloadPreparedStatements() // Preload `null` statement for non-fully qualified statements, since it can't be parsed if loaded from cache and will be dropped if (!prepared.fullyQualified) preparedStatements.get(computeId(query, null), (ignored_) -> prepared); - return true; + return prepared; } catch (RequestValidationException e) { JVMStabilityInspector.inspectThrowable(e); logger.warn("Prepared statement recreation error, removing statement: {} {} {}, error details: {}", id, query, keyspace, e.getMessage()); SystemKeyspace.removePreparedStatement(id); - return false; + return null; } - }); + }, pageSize); long endTime = nanoTime(); logger.info("Preloaded {} prepared statements in {} ms", count, TimeUnit.NANOSECONDS.toMillis(endTime - startTime)); + return count; } @@ -297,6 +315,8 @@ private ResultMessage processNodeLocalStatement(CQLStatement statement, QuerySta return processNodeLocalWrite(statement, queryState, options); else if (statement instanceof SelectStatement) return processNodeLocalSelect((SelectStatement) statement, queryState, options); + else if (statement instanceof TransactionStatement) + return statement.executeLocally(queryState, options); else throw new InvalidRequestException("NODE_LOCAL consistency level can only be used with BATCH, UPDATE, INSERT, DELETE, and SELECT statements"); } @@ -444,6 +464,11 @@ public static Prepared prepareInternal(String query) throws RequestValidationExc } public static Prepared parseAndPrepare(String query, ClientState clientState, boolean isInternal) throws RequestValidationException + { + return parseAndPrepare(query, clientState, isInternal, true); + } + + public static Prepared parseAndPrepare(String query, ClientState clientState, boolean isInternal, boolean measure) throws RequestValidationException { CQLStatement.Raw raw = parseStatement(query); @@ -472,7 +497,10 @@ public static Prepared parseAndPrepare(String query, ClientState clientState, bo res = new Prepared(statement, "", fullyQualified, keyspace); else res = new Prepared(statement, query, fullyQualified, keyspace); - res.pstmntSize = measurePstmnt(res); + + // Some prepared statements will not be cached and therefore do not require a pre-computed size. + if (measure) + res.pstmntSize = measurePstmnt(res); return res; } @@ -579,12 +607,33 @@ public static UntypedResultSet execute(String query, ConsistencyLevel cl, QueryS public static UntypedResultSet executeInternalWithPaging(String query, int pageSize, Object... values) { Prepared prepared = prepareInternal(query); - if (!(prepared.statement instanceof SelectStatement)) + + return executeInternalWithPaging(prepared.statement, pageSize, values); + } + + /** + * Executes with a non-prepared statement using paging. Generally {@link #executeInternalWithPaging(String, int, Object...)} + * should be used instead of this, but this may be used in niche cases like + * {@link SystemKeyspace#loadPreparedStatement(MD5Digest, SystemKeyspace.TriFunction)} where prepared statements are + * being loaded into {@link #preparedStatements} so it doesn't make sense to prepare a statement in this context. + */ + public static UntypedResultSet executeOnceInternalWithPaging(String query, int pageSize, Object... values) + { + QueryState queryState = internalQueryState(); + CQLStatement statement = parseStatement(query, queryState.getClientState()); + statement.validate(queryState.getClientState()); + + return executeInternalWithPaging(statement, pageSize, values); + } + + private static UntypedResultSet executeInternalWithPaging(CQLStatement statement, int pageSize, Object... values) + { + if (!(statement instanceof SelectStatement)) throw new IllegalArgumentException("Only SELECTs can be paged"); - SelectStatement select = (SelectStatement)prepared.statement; + SelectStatement select = (SelectStatement) statement; long nowInSec = FBUtilities.nowInSeconds(); - QueryPager pager = select.getQuery(makeInternalOptionsWithNowInSec(prepared.statement, nowInSec, values), nowInSec).getPager(null, ProtocolVersion.CURRENT); + QueryPager pager = select.getQuery(makeInternalOptionsWithNowInSec(select, nowInSec, values), nowInSec).getPager(null, ProtocolVersion.CURRENT); return UntypedResultSet.create(select, pager, pageSize); } @@ -826,7 +875,7 @@ public static ResultMessage.Prepared storePreparedStatement(String queryString, Prepared previous = preparedStatements.get(statementId, (ignored_) -> prepared); if (previous == prepared) - SystemKeyspace.writePreparedStatement(keyspace, statementId, queryString); + SystemKeyspace.writePreparedStatement(keyspace, statementId, queryString, prepared.timestamp); ResultSet.PreparedMetadata preparedMetadata = ResultSet.PreparedMetadata.fromPrepared(prepared.statement); ResultSet.ResultMetadata resultMetadata = ResultSet.ResultMetadata.fromPrepared(prepared.statement); @@ -1047,10 +1096,9 @@ else if (statement instanceof SelectStatement) statementKsName = selectStatement.keyspace(); statementCfName = selectStatement.table(); } - else if (statement instanceof BatchStatement) + else if (statement instanceof CQLStatement.CompositeCQLStatement) { - BatchStatement batchStatement = ((BatchStatement) statement); - for (ModificationStatement stmt : batchStatement.getStatements()) + for (CQLStatement stmt : ((CQLStatement.CompositeCQLStatement) statement).getStatements()) { if (shouldInvalidate(ksName, cfName, stmt)) return true; diff --git a/src/java/org/apache/cassandra/cql3/Relation.java b/src/java/org/apache/cassandra/cql3/Relation.java index 9fbd3adf68e9..31d6f771ce6d 100644 --- a/src/java/org/apache/cassandra/cql3/Relation.java +++ b/src/java/org/apache/cassandra/cql3/Relation.java @@ -35,7 +35,6 @@ import org.apache.cassandra.exceptions.InvalidRequestException; import static org.apache.cassandra.cql3.statements.RequestValidations.*; -import static org.apache.cassandra.cql3.statements.RequestValidations.checkTrue; /** * The parsed version of a {@code SimpleRestriction} as outputed by the CQL parser. diff --git a/src/java/org/apache/cassandra/cql3/ResultSet.java b/src/java/org/apache/cassandra/cql3/ResultSet.java index de393f73105c..ddb17634c13a 100644 --- a/src/java/org/apache/cassandra/cql3/ResultSet.java +++ b/src/java/org/apache/cassandra/cql3/ResultSet.java @@ -31,7 +31,6 @@ import com.google.common.annotations.VisibleForTesting; import io.netty.buffer.ByteBuf; -import org.apache.cassandra.cql3.statements.SelectStatement; import org.apache.cassandra.db.marshal.AbstractType; import org.apache.cassandra.service.pager.PagingState; import org.apache.cassandra.transport.CBCodec; @@ -314,8 +313,8 @@ public MD5Digest getResultMetadataId() public static ResultMetadata fromPrepared(CQLStatement statement) { - if (statement instanceof SelectStatement) - return ((SelectStatement)statement).getResultMetadata(); + if (statement instanceof CQLStatement.ReturningCQLStatement) + return ((CQLStatement.ReturningCQLStatement) statement).getResultMetadata(); return ResultSet.ResultMetadata.EMPTY; } diff --git a/src/java/org/apache/cassandra/cql3/StatementSource.java b/src/java/org/apache/cassandra/cql3/StatementSource.java new file mode 100644 index 000000000000..2f07ec4f53d9 --- /dev/null +++ b/src/java/org/apache/cassandra/cql3/StatementSource.java @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.cql3; + +import java.util.Objects; + +import org.antlr.runtime.Token; + +import static java.lang.Math.max; +import static java.lang.Math.min; + +public class StatementSource +{ + public static final StatementSource INTERNAL = new StatementSource(0, 0); + + public final int line; + public final int charPositionInLine; + + public StatementSource(int line, int charPositionInLine) + { + this.line = line; + this.charPositionInLine = charPositionInLine; + } + + @Override + public String toString() + { + if (this == INTERNAL) + { + return "<<>>"; + } + else + { + if (!isEmpty()) + return String.format("at [%d:%d]", line + 1, charPositionInLine + 1); + else + return ""; + } + } + + public boolean isEmpty() + { + return line > Character.MAX_VALUE || line == Character.MAX_VALUE && charPositionInLine > Character.MAX_VALUE; + } + + // note - this can also reproduce the original statement raw text by getting TokenStream and calling toString(startToken, endToken) + public static StatementSource create(Token startToken) + { + Objects.requireNonNull(startToken); + + if (startToken.getType() == Token.EOF) + return new StatementSource(Character.MAX_VALUE + 1, 0); + + int startLine = min(max(startToken.getLine(), 1) - 1, Character.MAX_VALUE); + int startChar = min(max(startToken.getCharPositionInLine(), 0), Character.MAX_VALUE); + + return new StatementSource(startLine, startChar); + } + +} diff --git a/src/java/org/apache/cassandra/cql3/UntypedResultSet.java b/src/java/org/apache/cassandra/cql3/UntypedResultSet.java index f82ff3eb835e..f70c1211e969 100644 --- a/src/java/org/apache/cassandra/cql3/UntypedResultSet.java +++ b/src/java/org/apache/cassandra/cql3/UntypedResultSet.java @@ -20,7 +20,6 @@ import java.net.InetAddress; import java.nio.ByteBuffer; -import java.util.ArrayList; import java.util.Date; import java.util.HashMap; import java.util.Iterator; @@ -28,23 +27,33 @@ import java.util.Map; import java.util.Set; import java.util.UUID; +import javax.annotation.Nonnull; import java.util.stream.Stream; import java.util.stream.StreamSupport; import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.ImmutableList; import org.apache.cassandra.cql3.functions.types.LocalDate; import org.apache.cassandra.cql3.statements.SelectStatement; -import org.apache.cassandra.db.Clustering; import org.apache.cassandra.db.ConsistencyLevel; -import org.apache.cassandra.db.DecoratedKey; import org.apache.cassandra.db.ReadExecutionController; -import org.apache.cassandra.db.marshal.*; +import org.apache.cassandra.db.marshal.AbstractType; +import org.apache.cassandra.db.marshal.BooleanType; +import org.apache.cassandra.db.marshal.ByteType; +import org.apache.cassandra.db.marshal.DoubleType; +import org.apache.cassandra.db.marshal.InetAddressType; +import org.apache.cassandra.db.marshal.Int32Type; +import org.apache.cassandra.db.marshal.ListType; +import org.apache.cassandra.db.marshal.LongType; +import org.apache.cassandra.db.marshal.MapType; +import org.apache.cassandra.db.marshal.SetType; +import org.apache.cassandra.db.marshal.ShortType; +import org.apache.cassandra.db.marshal.TimestampType; +import org.apache.cassandra.db.marshal.UTF8Type; +import org.apache.cassandra.db.marshal.UUIDType; +import org.apache.cassandra.db.marshal.VectorType; import org.apache.cassandra.db.partitions.PartitionIterator; -import org.apache.cassandra.db.rows.Cell; -import org.apache.cassandra.db.rows.ComplexColumnData; -import org.apache.cassandra.schema.ColumnMetadata; -import org.apache.cassandra.schema.TableMetadata; import org.apache.cassandra.service.ClientState; import org.apache.cassandra.service.pager.QueryPager; import org.apache.cassandra.transport.Dispatcher; @@ -60,11 +69,6 @@ public static UntypedResultSet create(ResultSet rs) return new FromResultSet(rs); } - public static UntypedResultSet create(List> results) - { - return new FromResultList(results); - } - public static UntypedResultSet create(SelectStatement select, QueryPager pager, int pageSize) { return new FromPager(select, pager, pageSize); @@ -142,48 +146,6 @@ public List metadata() } } - private static class FromResultList extends UntypedResultSet - { - private final List> cqlRows; - - private FromResultList(List> cqlRows) - { - this.cqlRows = cqlRows; - } - - public int size() - { - return cqlRows.size(); - } - - public Row one() - { - if (cqlRows.size() != 1) - throw new IllegalStateException("One row required, " + cqlRows.size() + " found"); - return new Row(cqlRows.get(0)); - } - - public Iterator iterator() - { - return new AbstractIterator() - { - final Iterator> iter = cqlRows.iterator(); - - protected Row computeNext() - { - if (!iter.hasNext()) - return endOfData(); - return new Row(iter.next()); - } - }; - } - - public List metadata() - { - throw new UnsupportedOperationException(); - } - } - private static class FromPager extends UntypedResultSet { private final SelectStatement select; @@ -307,52 +269,18 @@ public List metadata() public static class Row { + @Nonnull private final Map data = new HashMap<>(); - private final List columns = new ArrayList<>(); + @Nonnull + private final List columns; - public Row(Map data) + public Row(@Nonnull List names, @Nonnull List columns) { - this.data.putAll(data); - } - - public Row(List names, List columns) - { - this.columns.addAll(names); + this.columns = ImmutableList.copyOf(names); for (int i = 0; i < names.size(); i++) data.put(names.get(i).name.toString(), columns.get(i)); } - public static Row fromInternalRow(TableMetadata metadata, DecoratedKey key, org.apache.cassandra.db.rows.Row row) - { - Map data = new HashMap<>(); - - ByteBuffer[] keyComponents = SelectStatement.getComponents(metadata, key); - for (ColumnMetadata def : metadata.partitionKeyColumns()) - data.put(def.name.toString(), keyComponents[def.position()]); - - Clustering clustering = row.clustering(); - for (ColumnMetadata def : metadata.clusteringColumns()) - data.put(def.name.toString(), clustering.bufferAt(def.position())); - - for (ColumnMetadata def : metadata.regularAndStaticColumns()) - { - if (def.isSimple()) - { - Cell cell = row.getCell(def); - if (cell != null) - data.put(def.name.toString(), cell.buffer()); - } - else - { - ComplexColumnData complexData = row.getComplexColumnData(def); - if (complexData != null) - data.put(def.name.toString(), ((CollectionType) def.type).serializeForNativeProtocol(complexData.iterator())); - } - } - - return new Row(data); - } - public boolean has(String column) { // Note that containsKey won't work because we may have null values @@ -454,6 +382,14 @@ public long getLong(String column) return LongType.instance.compose(data.get(column)); } + // this function will return the default value if the row doesn't have that column or the column data is null + // This function is used to avoid the nullpointerexception + public long getLong(String column, long ifNull) + { + ByteBuffer bytes = data.get(column); + return bytes == null ? ifNull : LongType.instance.compose(bytes); + } + public Set getSet(String column, AbstractType type) { ByteBuffer raw = data.get(column); @@ -509,7 +445,47 @@ public List getColumns() @Override public String toString() { - return data.toString(); + StringBuilder sb = new StringBuilder(); + toString(sb); + return sb.toString(); + } + + public void toString(StringBuilder sb) + { + for (int i = 0; i < columns.size(); i++) + { + ColumnSpecification cspec = columns.get(i); + ByteBuffer v = data.get(cspec.name.toString()); + if (i != 0) + sb.append(" | "); + if (v == null) + { + sb.append("null"); + } + else + { + sb.append(cspec.type.getString(v)); + } + } + } + } + + /** + * When UntypedResultSet is from a pager calling toString will consume the pager. + * toString shouldn't mutate the object and this of course breaks things waiting to consume + * the results so if you want to get a pretty printed string you need to call this method explicitly. + */ + @SuppressWarnings("unused") + public String toStringUnsafe() + { + StringBuilder sb = new StringBuilder(); + sb.append(metadata()).append('\n'); + for (Row row : this) + { + row.toString(sb); + sb.append('\n'); } + sb.append("---"); + return sb.toString(); } } diff --git a/src/java/org/apache/cassandra/cql3/UpdateParameters.java b/src/java/org/apache/cassandra/cql3/UpdateParameters.java index d13d0b49a76f..6ef6c7bb6ca0 100644 --- a/src/java/org/apache/cassandra/cql3/UpdateParameters.java +++ b/src/java/org/apache/cassandra/cql3/UpdateParameters.java @@ -20,14 +20,26 @@ import java.nio.ByteBuffer; import java.util.Map; -import org.apache.cassandra.db.guardrails.Guardrails; -import org.apache.cassandra.schema.ColumnMetadata; -import org.apache.cassandra.schema.TableMetadata; -import org.apache.cassandra.db.*; +import org.apache.cassandra.db.Clustering; +import org.apache.cassandra.db.ClusteringComparator; +import org.apache.cassandra.db.DecoratedKey; +import org.apache.cassandra.db.DeletionPurger; +import org.apache.cassandra.db.DeletionTime; +import org.apache.cassandra.db.LivenessInfo; +import org.apache.cassandra.db.RangeTombstone; +import org.apache.cassandra.db.Slice; import org.apache.cassandra.db.context.CounterContext; +import org.apache.cassandra.db.guardrails.Guardrails; import org.apache.cassandra.db.partitions.Partition; -import org.apache.cassandra.db.rows.*; +import org.apache.cassandra.db.rows.BTreeRow; +import org.apache.cassandra.db.rows.BufferCell; +import org.apache.cassandra.db.rows.Cell; +import org.apache.cassandra.db.rows.CellPath; +import org.apache.cassandra.db.rows.Row; +import org.apache.cassandra.db.rows.Rows; import org.apache.cassandra.exceptions.InvalidRequestException; +import org.apache.cassandra.schema.ColumnMetadata; +import org.apache.cassandra.schema.TableMetadata; import org.apache.cassandra.service.ClientState; import org.apache.cassandra.utils.TimeUUID; @@ -37,17 +49,16 @@ public class UpdateParameters { public final TableMetadata metadata; - public final RegularAndStaticColumns updatedColumns; public final ClientState clientState; public final QueryOptions options; private final long nowInSec; - private final long timestamp; + protected final long timestamp; private final int ttl; private final DeletionTime deletionTime; - // For lists operation that require a read-before-write. Will be null otherwise. + // Holds data for operations that require a read-before-write. Will be null otherwise. private final Map prefetchedRows; private Row.Builder staticBuilder; @@ -57,17 +68,14 @@ public class UpdateParameters private Row.Builder builder; public UpdateParameters(TableMetadata metadata, - RegularAndStaticColumns updatedColumns, ClientState clientState, QueryOptions options, long timestamp, long nowInSec, int ttl, - Map prefetchedRows) - throws InvalidRequestException + Map prefetchedRows) throws InvalidRequestException { this.metadata = metadata; - this.updatedColumns = updatedColumns; this.clientState = clientState; this.options = options; @@ -123,10 +131,20 @@ public Clustering currentClustering() public void addPrimaryKeyLivenessInfo() { - builder.addPrimaryKeyLivenessInfo(LivenessInfo.create(timestamp, ttl, nowInSec)); + addPrimaryKeyLivenessInfo(LivenessInfo.create(timestamp, ttl, nowInSec)); + } + + private void addPrimaryKeyLivenessInfo(LivenessInfo info) + { + builder.addPrimaryKeyLivenessInfo(info); } public void addRowDeletion() + { + addRowDeletion(Row.Deletion.regular(deletionTime)); + } + + private void addRowDeletion(Row.Deletion deletion) { // For compact tables, at the exclusion of the static row (of static compact tables), each row ever has a single column, // the "compact" one. As such, deleting the row or deleting that single cell is equivalent. We favor the later @@ -134,7 +152,7 @@ public void addRowDeletion() if (metadata.isCompactTable() && builder.clustering() != Clustering.STATIC_CLUSTERING) addTombstone(((TableMetadata.CompactTableMetadata) metadata).compactValueColumn); else - builder.addRowDeletion(Row.Deletion.regular(deletionTime)); + builder.addRowDeletion(deletion); } public void addTombstone(ColumnMetadata column) throws InvalidRequestException @@ -179,6 +197,14 @@ public Cell addCell(ColumnMetadata column, CellPath path, ByteBuffer value) t return cell; } + public void addRow(Row row) + { + newRow(row.clustering()); + addRowDeletion(row.deletion()); + addPrimaryKeyLivenessInfo(row.primaryKeyLivenessInfo()); + row.cells().forEach(builder::addCell); + } + private void validateColumnSize(ColumnMetadata column, ByteBuffer value) { CQL3Type cql3Type = column.type.asCQL3Type(); diff --git a/src/java/org/apache/cassandra/cql3/conditions/ColumnCondition.java b/src/java/org/apache/cassandra/cql3/conditions/ColumnCondition.java index 1d0afcddc2e0..1f9ba51862a4 100644 --- a/src/java/org/apache/cassandra/cql3/conditions/ColumnCondition.java +++ b/src/java/org/apache/cassandra/cql3/conditions/ColumnCondition.java @@ -17,6 +17,7 @@ */ package org.apache.cassandra.cql3.conditions; +import java.io.IOException; import java.nio.ByteBuffer; import java.util.*; @@ -42,11 +43,18 @@ import org.apache.cassandra.db.rows.ColumnData; import org.apache.cassandra.db.rows.ComplexColumnData; import org.apache.cassandra.db.rows.Row; +import org.apache.cassandra.io.ParameterisedUnversionedSerializer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; import org.apache.cassandra.schema.ColumnMetadata; import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.service.accord.serializers.TableMetadatas; import org.apache.cassandra.utils.ByteBufferUtil; import static org.apache.cassandra.cql3.statements.RequestValidations.*; +import static org.apache.cassandra.db.TypeSizes.sizeofUnsignedVInt; +import static org.apache.cassandra.service.accord.AccordSerializers.columnMetadataSerializer; +import static org.apache.cassandra.utils.ByteBufferUtil.nullableByteBufferSerializer; /** * A CQL3 condition on the value of a column or collection element. For example, "UPDATE .. IF a = 0". @@ -114,21 +122,23 @@ public ColumnCondition.Bound bind(QueryOptions options) private Bound bindSingleColumn(QueryOptions options) { ColumnMetadata column = columnsExpression.firstColumn(); + TableMetadata table = columnsExpression.table(); if (column.type.isMultiCell()) - return new MultiCellBound(column, operator, toValue(column.type, bindAndGetTerms(options))); + return new MultiCellBound(column, table, operator, toValue(column.type, bindAndGetTerms(options))); - return new SimpleBound(column, operator, toValue(column.type, bindAndGetTerms(options))); + return new SimpleBound(column, table, operator, toValue(column.type, bindAndGetTerms(options))); } private ColumnCondition.Bound bindElement(QueryOptions options) { ColumnMetadata column = columnsExpression.firstColumn(); + TableMetadata table = columnsExpression.table(); ByteBuffer keyOrIndex = columnsExpression.element(options); if (column.type.isCollection()) { checkNotNull(keyOrIndex, "Invalid null value for %s element access", column.type instanceof MapType ? "map" : "list"); } - return new ElementOrFieldAccessBound(column, keyOrIndex, operator, toValue(columnsExpression.type(), bindAndGetTerms(options))); + return new ElementOrFieldAccessBound(column, table, keyOrIndex, operator, toValue(columnsExpression.type(), bindAndGetTerms(options))); } private ByteBuffer toValue(AbstractType type, List values) @@ -171,15 +181,52 @@ public String toCQLString() return operator.buildCQLString(columnsExpression, values); } + public interface BoundSerializer + { + default void serialize(T bound, DataOutputPlus out) throws IOException {} + Bound deserialize(DataInputPlus in, ColumnMetadata column, TableMetadata table, Operator operator, ByteBuffer value) throws IOException; + default long serializedSize(T condition) { return 0; } + } + + public enum BoundKind + { + Simple(0, SimpleBound.serializer), + ElementOrFieldAccess(1, ElementOrFieldAccessBound.serializer), + MultiCell(2, MultiCellBound.serializer); + + private final int id; + @SuppressWarnings("rawtypes") + public final BoundSerializer serializer; + + BoundKind(int id, BoundSerializer serializer) + { + this.id = id; + this.serializer = serializer; + } + + public static BoundKind valueOf(int id) + { + switch (id) + { + case 0: return BoundKind.Simple; + case 1: return BoundKind.ElementOrFieldAccess; + case 2: return BoundKind.MultiCell; + default: throw new IllegalArgumentException("Unknown id: " + id); + } + } + } + public static abstract class Bound { - protected final ColumnMetadata column; - protected final Operator operator; - protected final ByteBuffer value; + public final ColumnMetadata column; + public final TableMetadata table; + public final Operator operator; + public final ByteBuffer value; - protected Bound(ColumnMetadata column, Operator operator, ByteBuffer value) + protected Bound(ColumnMetadata column, TableMetadata table, Operator operator, ByteBuffer value) { this.column = column; + this.table = table; this.operator = operator; this.value = value; } @@ -188,16 +235,57 @@ protected Bound(ColumnMetadata column, Operator operator, ByteBuffer value) * Validates whether this condition applies to {@code current}. */ public abstract boolean appliesTo(Row row); + + public abstract BoundKind kind(); + + public static final ParameterisedUnversionedSerializer serializer = new ParameterisedUnversionedSerializer<>() { + @Override + public void serialize(Bound bound, TableMetadatas tables, DataOutputPlus out) throws IOException + { + tables.serialize(bound.table, out); + columnMetadataSerializer.serialize(bound.column, bound.table, out); + bound.operator.writeToUnsignedVInt(out); + nullableByteBufferSerializer.serialize(bound.value, out); + ColumnCondition.BoundKind kind = bound.kind(); + out.writeUnsignedVInt32(kind.ordinal()); + kind.serializer.serialize(bound, out); + } + + @Override + public Bound deserialize(TableMetadatas tables, DataInputPlus in) throws IOException + { + TableMetadata table = tables.deserialize(in); + ColumnMetadata column = columnMetadataSerializer.deserialize(table, in); + Operator operator = Operator.readFromUnsignedVInt(in); + ByteBuffer value = nullableByteBufferSerializer.deserialize(in); + ColumnCondition.BoundKind boundKind = ColumnCondition.BoundKind.valueOf(in.readUnsignedVInt32()); + return boundKind.serializer.deserialize(in, column, table, operator, value); + } + + @Override + public long serializedSize(Bound bound, TableMetadatas tables) + { + ColumnCondition.BoundKind kind = bound.kind(); + return tables.serializedSize(bound.table) + + columnMetadataSerializer.serializedSize(bound.column, bound.table) + + bound.operator.sizeAsUnsignedVInt() + + nullableByteBufferSerializer.serializedSize(bound.value) + + sizeofUnsignedVInt(kind.ordinal()) + + kind.serializer.serializedSize(bound); + } + }; } /** * A condition on a single non-collection column. */ - private static final class SimpleBound extends Bound + public static class SimpleBound extends Bound { - private SimpleBound(ColumnMetadata column, Operator operator, ByteBuffer value) + private static final BoundSerializer serializer = (in, column, table, operator, value) -> new SimpleBound(column, table, operator, value); + + public SimpleBound(ColumnMetadata column, TableMetadata table, Operator operator, ByteBuffer value) { - super(column, operator, value); + super(column, table, operator, value); } @Override @@ -206,7 +294,7 @@ public boolean appliesTo(Row row) return operator.isSatisfiedBy(column.type, rowValue(row), value); } - private ByteBuffer rowValue(Row row) + protected ByteBuffer rowValue(Row row) { // If we're asking for a given cell, and we didn't get any row from our read, it's // the same as not having said cell. @@ -216,13 +304,70 @@ private ByteBuffer rowValue(Row row) Cell c = row.getCell(column); return c == null ? null : c.buffer(); } + + @Override + public BoundKind kind() + { + return BoundKind.Simple; + } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + SimpleBound bound = (SimpleBound) o; + return column.equals(bound.column) && operator == bound.operator && Objects.equals(value, bound.value); + } + + @Override + public int hashCode() + { + return Objects.hash(column, operator, value); + } + } + + public static class SimpleClusteringBound extends SimpleBound + { + public SimpleClusteringBound(ColumnMetadata column, TableMetadata table, Operator operator, ByteBuffer value) + { + super(column, table, operator, value); + assert column.isClusteringColumn() : String.format("Column must be a clustering column, but given %s", column); + } + + @Override + protected ByteBuffer rowValue(Row row) + { + return row == null ? null : row.clustering().bufferAt(column.position()); + } } /** * A condition on a collection element or a UDT field. */ - private static final class ElementOrFieldAccessBound extends Bound + public static final class ElementOrFieldAccessBound extends Bound { + private static final BoundSerializer serializer = new BoundSerializer<>() + { + @Override + public void serialize(ElementOrFieldAccessBound bound, DataOutputPlus out) throws IOException + { + nullableByteBufferSerializer.serialize(bound.keyOrIndex, out); + } + + @Override + public Bound deserialize(DataInputPlus in, ColumnMetadata column, TableMetadata table, Operator operator, ByteBuffer value) throws IOException + { + ByteBuffer keyOrIndex = nullableByteBufferSerializer.deserialize(in); + return new ElementOrFieldAccessBound(column, table, keyOrIndex, operator, value); + } + + @Override + public long serializedSize(ElementOrFieldAccessBound condition) + { + return nullableByteBufferSerializer.serializedSize(condition.keyOrIndex); + } + }; /** * The collection element or UDT field type. */ @@ -234,16 +379,23 @@ private static final class ElementOrFieldAccessBound extends Bound private final ByteBuffer keyOrIndex; - private ElementOrFieldAccessBound(ColumnMetadata column, - ByteBuffer keyOrIndex, - Operator operator, - ByteBuffer value) + public ElementOrFieldAccessBound(ColumnMetadata column, + TableMetadata table, + ByteBuffer keyOrIndex, + Operator operator, + ByteBuffer value) { - super(column, operator, value); + super(column, table, operator, value); this.elementType = ((MultiElementType) column.type).elementType(keyOrIndex); this.keyOrIndex = keyOrIndex; } + @Override + public BoundKind kind() + { + return BoundKind.ElementOrFieldAccess; + } + @Override public boolean appliesTo(Row row) { @@ -260,17 +412,40 @@ private ColumnData columnData(Row row) { return row == null ? null : row.getColumnData(column); } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + ElementOrFieldAccessBound bound = (ElementOrFieldAccessBound) o; + return column.equals(bound.column) && operator == bound.operator && Objects.equals(value, bound.value) && Objects.equals(keyOrIndex, bound.keyOrIndex); + } + + @Override + public int hashCode() + { + return Objects.hash(column, operator, value); + } } /** * A condition on a multicell column. */ - private static final class MultiCellBound extends Bound + public static final class MultiCellBound extends Bound { - public MultiCellBound(ColumnMetadata column, Operator operator, ByteBuffer value) + private static final BoundSerializer serializer = (in, column, table, operator, value) -> new MultiCellBound(column, table, operator, value); + + public MultiCellBound(ColumnMetadata column, TableMetadata table, Operator operator, ByteBuffer value) + { + super(column, table, operator, value); + assert column.type.isMultiCell() : String.format("Unexpected type: %s", column.type); + } + + @Override + public BoundKind kind() { - super(column, operator, value); - assert column.type.isMultiCell(); + return BoundKind.MultiCell; } public boolean appliesTo(Row row) @@ -278,6 +453,21 @@ public boolean appliesTo(Row row) ComplexColumnData columnData = row == null ? null : row.getComplexColumnData(column); return operator.isSatisfiedBy((MultiElementType) column.type, columnData, value); } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + MultiCellBound bound = (MultiCellBound) o; + return column.equals(bound.column) && operator == bound.operator && Objects.equals(value, bound.value); + } + + @Override + public int hashCode() + { + return Objects.hash(column, operator, value); + } } public static class Raw diff --git a/src/java/org/apache/cassandra/cql3/constraints/AbstractFunctionConstraint.java b/src/java/org/apache/cassandra/cql3/constraints/AbstractFunctionConstraint.java index 6204b96a212c..2f6805bdef10 100644 --- a/src/java/org/apache/cassandra/cql3/constraints/AbstractFunctionConstraint.java +++ b/src/java/org/apache/cassandra/cql3/constraints/AbstractFunctionConstraint.java @@ -20,7 +20,6 @@ import java.util.List; -import org.apache.cassandra.cql3.ColumnIdentifier; import org.apache.cassandra.cql3.CqlBuilder; import org.apache.cassandra.cql3.Operator; import org.apache.cassandra.utils.LocalizeString; @@ -30,9 +29,8 @@ public abstract class AbstractFunctionConstraint extends ColumnConstraint protected final Operator relationType; protected final String term; - public AbstractFunctionConstraint(ColumnIdentifier columnName, Operator relationType, String term) + public AbstractFunctionConstraint(Operator relationType, String term) { - super(columnName); this.relationType = relationType; this.term = term; } diff --git a/src/java/org/apache/cassandra/cql3/constraints/AbstractFunctionSatisfiabilityChecker.java b/src/java/org/apache/cassandra/cql3/constraints/AbstractFunctionSatisfiabilityChecker.java index 91d448da012a..cebc5c36d3dd 100644 --- a/src/java/org/apache/cassandra/cql3/constraints/AbstractFunctionSatisfiabilityChecker.java +++ b/src/java/org/apache/cassandra/cql3/constraints/AbstractFunctionSatisfiabilityChecker.java @@ -26,6 +26,8 @@ import org.apache.cassandra.cql3.Operator; import org.apache.cassandra.cql3.functions.types.ParseUtils; +import org.apache.cassandra.db.marshal.AbstractType; +import org.apache.cassandra.db.marshal.Int32Type; import org.apache.cassandra.schema.ColumnMetadata; import org.apache.cassandra.utils.Pair; @@ -70,6 +72,8 @@ public void check(String functionName, List> constraints, Co */ abstract Pair, List> categorizeConstraints(List> constraints, String functionName); + abstract AbstractType returnType(ColumnMetadata columnMetadata); + private void checkSupportedOperators(List allConstraints, String functionName) { for (CONSTRAINT_TYPE constraint : allConstraints) @@ -147,11 +151,12 @@ else if (firstRelation == NEQ && secondRelation == NEQ) } else { - ByteBuffer firstTermBuffer = columnMetadata.type.fromString(ParseUtils.unquote(firstTerm)); - ByteBuffer secondTermBuffer = columnMetadata.type.fromString(ParseUtils.unquote(secondTerm)); + AbstractType returnType = returnType(columnMetadata); + ByteBuffer firstTermBuffer = returnType.fromString(ParseUtils.unquote(firstTerm)); + ByteBuffer secondTermBuffer = returnType.fromString(ParseUtils.unquote(secondTerm)); - boolean firstSatisfaction = firstRelation.isSatisfiedBy(columnMetadata.type, secondTermBuffer, firstTermBuffer); - boolean secondSatisfaction = secondRelation.isSatisfiedBy(columnMetadata.type, firstTermBuffer, secondTermBuffer); + boolean firstSatisfaction = firstRelation.isSatisfiedBy(returnType, secondTermBuffer, firstTermBuffer); + boolean secondSatisfaction = secondRelation.isSatisfiedBy(returnType, firstTermBuffer, secondTermBuffer); if (!firstSatisfaction || !secondSatisfaction) throw new InvalidConstraintDefinitionException(format("Constraints of %s are not satisfiable: %s %s %s, %s %s %s", @@ -186,6 +191,14 @@ public Pair, List> categori return Pair.create(scalars, notEqualScalars); } + + @Override + AbstractType returnType(ColumnMetadata metadata) + { + // function constraints will always have terms of int32 type + // unlike scalar constraints where it will be a type of column + return metadata.type; + } }; public static final AbstractFunctionSatisfiabilityChecker FUNCTION_SATISFIABILITY_CHECKER = new AbstractFunctionSatisfiabilityChecker<>() @@ -215,5 +228,11 @@ public Pair, List> cate return Pair.create(funnctionColumnConstraints, notEqualConstraints); } + + @Override + AbstractType returnType(ColumnMetadata columnMetadata) + { + return Int32Type.instance; + } }; } diff --git a/src/java/org/apache/cassandra/cql3/constraints/ColumnConstraint.java b/src/java/org/apache/cassandra/cql3/constraints/ColumnConstraint.java index ddcca653ea30..bddea571aa4c 100644 --- a/src/java/org/apache/cassandra/cql3/constraints/ColumnConstraint.java +++ b/src/java/org/apache/cassandra/cql3/constraints/ColumnConstraint.java @@ -30,6 +30,7 @@ import org.apache.cassandra.db.marshal.AbstractType; import org.apache.cassandra.schema.ColumnMetadata; import org.apache.cassandra.tcm.serialization.MetadataSerializer; +import org.apache.cassandra.utils.ByteBufferUtil; import static java.lang.String.format; @@ -39,9 +40,9 @@ */ public abstract class ColumnConstraint { - protected final ColumnIdentifier columnName; + protected ColumnIdentifier columnName; - public ColumnConstraint(ColumnIdentifier columnName) + public void setColumnName(ColumnIdentifier columnName) { this.columnName = columnName; } @@ -116,8 +117,10 @@ public String fullName() */ public void evaluate(AbstractType valueType, ByteBuffer columnValue) throws ConstraintViolationException { - if (columnValue.capacity() == 0) + if (columnValue == ByteBufferUtil.EMPTY_BYTE_BUFFER) throw new ConstraintViolationException("Column value does not satisfy value constraint for column '" + columnName + "' as it is null."); + else if (valueType.isEmptyValueMeaningless() && columnValue.capacity() == 0) + throw new ConstraintViolationException("Column value does not satisfy value constraint for column '" + columnName + "' as it is empty."); internalEvaluate(valueType, columnValue); } diff --git a/src/java/org/apache/cassandra/cql3/constraints/ColumnConstraints.java b/src/java/org/apache/cassandra/cql3/constraints/ColumnConstraints.java index 0acace098f46..900fb7047be9 100644 --- a/src/java/org/apache/cassandra/cql3/constraints/ColumnConstraints.java +++ b/src/java/org/apache/cassandra/cql3/constraints/ColumnConstraints.java @@ -27,6 +27,9 @@ import java.util.Set; import java.util.TreeSet; +import com.google.common.annotations.VisibleForTesting; + +import org.apache.cassandra.cql3.ColumnIdentifier; import org.apache.cassandra.cql3.CqlBuilder; import org.apache.cassandra.db.TypeSizes; import org.apache.cassandra.db.marshal.AbstractType; @@ -48,10 +51,17 @@ public class ColumnConstraints extends ColumnConstraint public ColumnConstraints(List> constraints) { - super(null); this.constraints = constraints; } + @Override + public void setColumnName(ColumnIdentifier columnName) + { + this.columnName = columnName; + for (ColumnConstraint constraint : constraints) + constraint.setColumnName(columnName); + } + @Override public String name() { @@ -108,7 +118,7 @@ public int getSize() // Checks if there is at least one constraint that will perform checks public boolean hasRelevantConstraints() { - for (ColumnConstraint c : constraints) + for (ColumnConstraint c : constraints) { if (c != ColumnConstraints.NO_OP) return true; @@ -116,13 +126,27 @@ public boolean hasRelevantConstraints() return false; } + public boolean containsNotNullConstraint() + { + for (ColumnConstraint c : constraints) + { + if (c.toString().equals(NotNullConstraint.CQL_FUNCTION_NAME)) + return true; + } + + return false; + } + @Override public void validate(ColumnMetadata columnMetadata) throws InvalidConstraintDefinitionException { if (!columnMetadata.type.isConstrainable()) + { throw new InvalidConstraintDefinitionException("Constraint cannot be defined on the column " + columnMetadata.name + " of type " + columnMetadata.type.asCQL3Type() - + " for the table " + columnMetadata.ksName + "." + columnMetadata.cfName); + + " for the table " + columnMetadata.ksName + '.' + columnMetadata.cfName + '.' + + (columnMetadata.type.isCollection() ? " When using collections, constraints can be used only of frozen collections." : "")); + } // this will look at constraints as a whole, // checking if combinations of a particular constraint make sense (duplicities, satisfiability etc.). @@ -207,17 +231,33 @@ public Raw() this.constraints = Collections.emptyList(); } - public ColumnConstraints prepare() + public ColumnConstraints prepare(ColumnIdentifier column) { if (constraints.isEmpty()) return NO_OP; - return new ColumnConstraints(constraints); + + for (ColumnConstraint constraint : constraints) + { + // We only check scalar constraints column name, as the rest of the constraints + // imply the name from the column they are defined at + if (constraint.getConstraintType() == ConstraintType.SCALAR) + { + if (!column.equals(constraint.columnName)) + { + throw new InvalidConstraintDefinitionException(format("Constraint %s was not specified on a column it operates on: %s but on: %s", + constraint, column.toCQLString(), constraint.columnName)); + } + } + } + + ColumnConstraints columnConstraints = new ColumnConstraints(constraints); + columnConstraints.setColumnName(column); + return columnConstraints; } } public static class Serializer implements MetadataSerializer { - @Override public void serialize(ColumnConstraints columnConstraint, DataOutputPlus out, Version version) throws IOException { @@ -236,13 +276,11 @@ public ColumnConstraints deserialize(DataInputPlus in, Version version) throws I List> columnConstraints = new ArrayList<>(); int numberOfConstraints = in.readInt(); for (int i = 0; i < numberOfConstraints; i++) - { - int serializerPosition = in.readShort(); - ColumnConstraint constraint = (ColumnConstraint) ConstraintType - .getSerializer(serializerPosition) - .deserialize(in, version); - columnConstraints.add(constraint); - } + columnConstraints.add(deserializeConstraint(in, in.readShort(), version)); + + // we are not setting column name here on purpose + // that is deffered in ColumnMetadata's constructor, + // we do not have the access to a column name here anyway return new ColumnConstraints(columnConstraints); } @@ -257,6 +295,14 @@ public long serializedSize(ColumnConstraints columnConstraint, Version version) } return constraintsSize; } + + @VisibleForTesting + public ColumnConstraint deserializeConstraint(DataInputPlus in, int serializerPosition, Version version) throws IOException + { + return (ColumnConstraint) ConstraintType + .getSerializer(serializerPosition) + .deserialize(in, version); + } } @Override diff --git a/src/java/org/apache/cassandra/cql3/constraints/ConstraintFunction.java b/src/java/org/apache/cassandra/cql3/constraints/ConstraintFunction.java index 9952ab32d94f..e7837c74f76a 100644 --- a/src/java/org/apache/cassandra/cql3/constraints/ConstraintFunction.java +++ b/src/java/org/apache/cassandra/cql3/constraints/ConstraintFunction.java @@ -19,13 +19,17 @@ package org.apache.cassandra.cql3.constraints; import java.nio.ByteBuffer; +import java.util.ArrayList; import java.util.List; import org.apache.cassandra.cql3.ColumnIdentifier; import org.apache.cassandra.cql3.Operator; +import org.apache.cassandra.cql3.functions.types.ParseUtils; import org.apache.cassandra.db.marshal.AbstractType; import org.apache.cassandra.schema.ColumnMetadata; +import org.apache.cassandra.utils.ByteBufferUtil; +import static java.lang.String.format; import static org.apache.cassandra.cql3.Operator.EQ; import static org.apache.cassandra.cql3.Operator.GT; import static org.apache.cassandra.cql3.Operator.GTE; @@ -40,13 +44,22 @@ public abstract class ConstraintFunction { public static final List DEFAULT_FUNCTION_OPERATORS = List.of(EQ, NEQ, GTE, GT, LTE, LT); - protected final ColumnIdentifier columnName; + protected ColumnIdentifier columnName; protected final String name; + protected final List args; + // args as propagated from cql + protected final List rawArgs; - public ConstraintFunction(ColumnIdentifier columnName, String name) + public ConstraintFunction(String name, List args) { - this.columnName = columnName; this.name = name; + this.rawArgs = args; + this.args = unquote(args); + } + + public List arguments() + { + return args; } /** @@ -55,8 +68,10 @@ public ConstraintFunction(ColumnIdentifier columnName, String name) */ public void evaluate(AbstractType valueType, Operator relationType, String term, ByteBuffer columnValue) throws ConstraintViolationException { - if (columnValue.capacity() == 0) + if (columnValue == ByteBufferUtil.EMPTY_BYTE_BUFFER) throw new ConstraintViolationException("Column value does not satisfy value constraint for column '" + columnName + "' as it is null."); + else if (valueType.isEmptyValueMeaningless() && columnValue.capacity() == 0) + throw new ConstraintViolationException("Column value does not satisfy value constraint for column '" + columnName + "' as it is empty."); internalEvaluate(valueType, relationType, term, columnValue); } @@ -81,6 +96,7 @@ public void evaluate(AbstractType valueType, ByteBuffer columnValue) throws C */ public void validate(ColumnMetadata columnMetadata, String term) throws InvalidConstraintDefinitionException { + maybeThrowOnNonEmptyArguments(name); } /** @@ -88,10 +104,7 @@ public void validate(ColumnMetadata columnMetadata, String term) throws InvalidC * * @return list of operators this function is allowed to have. */ - public List getSupportedOperators() - { - return List.of(); - } + public abstract List getSupportedOperators(); /** * Tells what types of columns are supported by this constraint. @@ -100,4 +113,44 @@ public List getSupportedOperators() * @return supported types for given constraint */ public abstract List> getSupportedTypes(); + + /** + * Tells whether implementation supports specifying arguments on its function. + *
+ * In this case, this function will return "true" + *

+     *     val int check length() < 1024
+     * 
+ * + * In this case, this function will return "false" + *
+     *     val int check someconstraint('abc', 'def')
+     * 
+ * @return true if this constraint does not accept any parameters, false otherwise. + */ + public boolean isParameterless() { return true; } + + @Override + public String toString() + { + return name; + } + + protected void maybeThrowOnNonEmptyArguments(String constraintName) + { + if (!isParameterless()) + return; + + if (args != null && !args.isEmpty()) + throw new InvalidConstraintDefinitionException(format("Constraint %s does not accept any arguments.", constraintName)); + } + + private List unquote(List quotedArgs) + { + List unquotedArgs = new ArrayList<>(); + for (String quotedArg : quotedArgs) + unquotedArgs.add(ParseUtils.unquote(quotedArg)); + + return unquotedArgs; + } } diff --git a/src/java/org/apache/cassandra/cql3/constraints/FunctionColumnConstraint.java b/src/java/org/apache/cassandra/cql3/constraints/FunctionColumnConstraint.java index a94b4bd0bdcb..a25553bd7bbc 100644 --- a/src/java/org/apache/cassandra/cql3/constraints/FunctionColumnConstraint.java +++ b/src/java/org/apache/cassandra/cql3/constraints/FunctionColumnConstraint.java @@ -20,6 +20,7 @@ import java.io.IOException; import java.nio.ByteBuffer; +import java.util.ArrayList; import java.util.List; import java.util.function.Function; @@ -45,21 +46,21 @@ public class FunctionColumnConstraint extends AbstractFunctionConstraint arguments, Operator relationType, String term) { this.relationType = relationType; - this.columnName = columnName; this.term = term; - function = createConstraintFunction(functionName.toCQLString(), columnName); + if (arguments == null) + arguments = new ArrayList<>(); + function = createConstraintFunction(functionName.toCQLString(), arguments); } public FunctionColumnConstraint prepare() { - return new FunctionColumnConstraint(function, columnName, relationType, term); + return new FunctionColumnConstraint(function, relationType, term); } } @@ -81,23 +82,31 @@ public enum Functions OCTET_LENGTH(OctetLengthConstraint::new), REGEXP(RegexpConstraint::new); - private final Function functionCreator; + private final Function, ConstraintFunction> functionCreator; - Functions(Function functionCreator) + Functions(Function, ConstraintFunction> functionCreator) { this.functionCreator = functionCreator; } } - private static ConstraintFunction createConstraintFunction(String functionName, ColumnIdentifier columnName) + private static ConstraintFunction createConstraintFunction(String functionName, List args) { - return getEnum(Functions.class, functionName).functionCreator.apply(columnName); + return getEnum(Functions.class, functionName).functionCreator.apply(args); } - private FunctionColumnConstraint(ConstraintFunction function, ColumnIdentifier columnName, Operator relationType, String term) + private FunctionColumnConstraint(ConstraintFunction function, Operator relationType, String term) { - super(columnName, relationType, term); + super(relationType, term); this.function = function; + this.columnName = function.columnName; + } + + @Override + public void setColumnName(ColumnIdentifier columnName) + { + this.columnName = columnName; + this.function.columnName = columnName; } public ConstraintFunction function() @@ -156,7 +165,6 @@ protected void internalEvaluate(AbstractType valueType, ByteBuffer columnValu @Override public void validate(ColumnMetadata columnMetadata) { - validateArgs(columnMetadata); validateTypes(columnMetadata); function.validate(columnMetadata, term); } @@ -167,18 +175,11 @@ public ConstraintType getConstraintType() return ConstraintType.FUNCTION; } - void validateArgs(ColumnMetadata columnMetadata) - { - if (!columnMetadata.name.equals(columnName)) - throw new InvalidConstraintDefinitionException(String.format("Parameter of %s constraint should be the column name (%s)", - name(), - columnMetadata.name)); - } - @Override public String toString() { - return function.name + "(" + columnName + ") " + relationType + " " + term; + String arguments = String.join(",", function.rawArgs); + return function.name + '(' + arguments + ") " + relationType + ' ' + term; } public static class Serializer implements MetadataSerializer @@ -187,7 +188,12 @@ public static class Serializer implements MetadataSerializer args = new ArrayList<>(); + int argsSize = in.readInt(); + for (int i = 0; i < argsSize; i++) + args.add(in.readUTF()); + ConstraintFunction function; - String columnNameString = in.readUTF(); - ColumnIdentifier columnName = new ColumnIdentifier(columnNameString, true); try { - function = createConstraintFunction(functionName, columnName); + function = createConstraintFunction(functionName, args); } catch (Exception e) { @@ -209,14 +219,19 @@ public FunctionColumnConstraint deserialize(DataInputPlus in, Version version) t } Operator relationType = Operator.readFrom(in); final String term = in.readUTF(); - return new FunctionColumnConstraint(function, columnName, relationType, term); + return new FunctionColumnConstraint(function, relationType, term); } @Override public long serializedSize(FunctionColumnConstraint columnConstraint, Version version) { + int argsSizes = 0; + for (String arg : columnConstraint.function.args) + argsSizes += TypeSizes.sizeof(arg); + return TypeSizes.sizeof(columnConstraint.function.getClass().getName()) - + TypeSizes.sizeof(columnConstraint.columnName.toCQLString()) + + TypeSizes.sizeof(columnConstraint.function.args.size()) + + argsSizes + TypeSizes.sizeof(columnConstraint.term) + Operator.serializedSize(); } diff --git a/src/java/org/apache/cassandra/cql3/constraints/JsonConstraint.java b/src/java/org/apache/cassandra/cql3/constraints/JsonConstraint.java index 99aeb6734e63..15a19d7954fc 100644 --- a/src/java/org/apache/cassandra/cql3/constraints/JsonConstraint.java +++ b/src/java/org/apache/cassandra/cql3/constraints/JsonConstraint.java @@ -21,7 +21,6 @@ import java.nio.ByteBuffer; import java.util.List; -import org.apache.cassandra.cql3.ColumnIdentifier; import org.apache.cassandra.cql3.Operator; import org.apache.cassandra.db.marshal.AbstractType; import org.apache.cassandra.db.marshal.AsciiType; @@ -31,20 +30,15 @@ import static java.lang.String.format; -public class JsonConstraint extends ConstraintFunction +public class JsonConstraint extends UnaryConstraintFunction { private static final List> SUPPORTED_TYPES = List.of(UTF8Type.instance, AsciiType.instance); public static final String FUNCTION_NAME = "JSON"; - public JsonConstraint(ColumnIdentifier columnName) + public JsonConstraint(List args) { - this(columnName, FUNCTION_NAME); - } - - public JsonConstraint(ColumnIdentifier columnName, String name) - { - super(columnName, name); + super(FUNCTION_NAME, args); } @Override @@ -57,7 +51,7 @@ public void internalEvaluate(AbstractType valueType, Operator relationType, S catch (MarshalException ex) { throw new ConstraintViolationException(format("Value for column '%s' violated %s constraint as it is not a valid JSON.", - columnName.toCQLString(), + columnName, name)); } } diff --git a/src/java/org/apache/cassandra/cql3/constraints/LengthConstraint.java b/src/java/org/apache/cassandra/cql3/constraints/LengthConstraint.java index 49954c28fb93..59d78afcaa39 100644 --- a/src/java/org/apache/cassandra/cql3/constraints/LengthConstraint.java +++ b/src/java/org/apache/cassandra/cql3/constraints/LengthConstraint.java @@ -21,7 +21,6 @@ import java.nio.ByteBuffer; import java.util.List; -import org.apache.cassandra.cql3.ColumnIdentifier; import org.apache.cassandra.cql3.Operator; import org.apache.cassandra.db.marshal.AbstractType; import org.apache.cassandra.db.marshal.AsciiType; @@ -35,9 +34,9 @@ public class LengthConstraint extends ConstraintFunction private static final String NAME = "LENGTH"; private static final List> SUPPORTED_TYPES = List.of(BytesType.instance, UTF8Type.instance, AsciiType.instance); - public LengthConstraint(ColumnIdentifier columnName) + public LengthConstraint(List args) { - super(columnName, NAME); + super(NAME, args); } @Override diff --git a/src/java/org/apache/cassandra/cql3/constraints/NotNullConstraint.java b/src/java/org/apache/cassandra/cql3/constraints/NotNullConstraint.java index fb9f7de95b2d..d0d050db4544 100644 --- a/src/java/org/apache/cassandra/cql3/constraints/NotNullConstraint.java +++ b/src/java/org/apache/cassandra/cql3/constraints/NotNullConstraint.java @@ -19,27 +19,30 @@ package org.apache.cassandra.cql3.constraints; import java.nio.ByteBuffer; +import java.util.Collections; import java.util.List; -import org.apache.cassandra.cql3.ColumnIdentifier; import org.apache.cassandra.cql3.Operator; import org.apache.cassandra.db.marshal.AbstractType; import org.apache.cassandra.schema.ColumnMetadata; import static java.lang.String.format; -public class NotNullConstraint extends ConstraintFunction +public class NotNullConstraint extends UnaryConstraintFunction { - public static final String FUNCTION_NAME = "NOT_NULL"; + public static final String FUNCTION_NAME = "NOT_NULL"; // as enum item + public static final String CQL_FUNCTION_NAME = "NOT NULL"; - public NotNullConstraint(ColumnIdentifier columnName) + private static final List emptyArguments = Collections.emptyList(); + + public NotNullConstraint() { - this(columnName, FUNCTION_NAME); + super(FUNCTION_NAME, emptyArguments); } - public NotNullConstraint(ColumnIdentifier columnName, String name) + public NotNullConstraint(List args) { - super(columnName, name); + super(FUNCTION_NAME, args); } @Override @@ -51,6 +54,7 @@ public void internalEvaluate(AbstractType valueType, Operator relationType, S @Override public void validate(ColumnMetadata columnMetadata, String term) throws InvalidConstraintDefinitionException { + super.validate(columnMetadata, term); if (columnMetadata.isPrimaryKeyColumn()) throw new InvalidConstraintDefinitionException(format("%s constraint can not be specified on a %s key column '%s'", name, @@ -64,6 +68,12 @@ public List> getSupportedTypes() return null; } + @Override + public String toString() + { + return CQL_FUNCTION_NAME; + } + @Override public boolean equals(Object o) { diff --git a/src/java/org/apache/cassandra/cql3/constraints/OctetLengthConstraint.java b/src/java/org/apache/cassandra/cql3/constraints/OctetLengthConstraint.java index 8147d37d62d2..b55b489465c9 100644 --- a/src/java/org/apache/cassandra/cql3/constraints/OctetLengthConstraint.java +++ b/src/java/org/apache/cassandra/cql3/constraints/OctetLengthConstraint.java @@ -21,7 +21,6 @@ import java.nio.ByteBuffer; import java.util.List; -import org.apache.cassandra.cql3.ColumnIdentifier; import org.apache.cassandra.cql3.Operator; import org.apache.cassandra.db.marshal.AbstractType; import org.apache.cassandra.db.marshal.AsciiType; @@ -34,9 +33,9 @@ public class OctetLengthConstraint extends ConstraintFunction { private static final List> SUPPORTED_TYPES = List.of(BytesType.instance, UTF8Type.instance, AsciiType.instance); - public OctetLengthConstraint(ColumnIdentifier columnName) + public OctetLengthConstraint(List args) { - super(columnName, "OCTET_LENGTH"); + super("OCTET_LENGTH", args); } @Override diff --git a/src/java/org/apache/cassandra/cql3/constraints/RegexpConstraint.java b/src/java/org/apache/cassandra/cql3/constraints/RegexpConstraint.java index a2e439585f05..062a5a1c6a97 100644 --- a/src/java/org/apache/cassandra/cql3/constraints/RegexpConstraint.java +++ b/src/java/org/apache/cassandra/cql3/constraints/RegexpConstraint.java @@ -23,7 +23,6 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; -import org.apache.cassandra.cql3.ColumnIdentifier; import org.apache.cassandra.cql3.Operator; import org.apache.cassandra.cql3.functions.types.ParseUtils; import org.apache.cassandra.db.marshal.AbstractType; @@ -43,9 +42,9 @@ public class RegexpConstraint extends ConstraintFunction private Pattern pattern; - public RegexpConstraint(ColumnIdentifier columnName) + public RegexpConstraint(List args) { - super(columnName, FUNCTION_NAME); + super(FUNCTION_NAME, args); } @Override @@ -84,6 +83,7 @@ public List getSupportedOperators() @Override public void validate(ColumnMetadata columnMetadata, String regexp) throws InvalidConstraintDefinitionException { + super.validate(columnMetadata, regexp); try { // compilation of a regexp every single time upon evaluation is not performance friendly diff --git a/src/java/org/apache/cassandra/cql3/constraints/ScalarColumnConstraint.java b/src/java/org/apache/cassandra/cql3/constraints/ScalarColumnConstraint.java index 80671a6bf39f..f230c6b2d61e 100644 --- a/src/java/org/apache/cassandra/cql3/constraints/ScalarColumnConstraint.java +++ b/src/java/org/apache/cassandra/cql3/constraints/ScalarColumnConstraint.java @@ -96,10 +96,12 @@ public void checkSatisfiability(List> constraints, ColumnMet } private ByteBuffer value; + private AbstractType returnType; - private ScalarColumnConstraint(ColumnIdentifier param, Operator relationType, String term) + private ScalarColumnConstraint(ColumnIdentifier columnName, Operator relationType, String term) { - super(param, relationType, term); + super(relationType, term); + setColumnName(columnName); } @Override @@ -125,11 +127,13 @@ protected void internalEvaluate(AbstractType valueType, ByteBuffer columnValu @Override public void validate(ColumnMetadata columnMetadata) throws InvalidConstraintDefinitionException { + returnType = columnMetadata.type; + validateTypes(columnMetadata); try { - value = columnMetadata.type.fromString(ParseUtils.unquote(term)); + value = returnType.fromString(ParseUtils.unquote(term)); } catch (Throwable t) { @@ -180,9 +184,9 @@ public void serialize(ScalarColumnConstraint columnConstraint, DataOutputPlus ou @Override public ScalarColumnConstraint deserialize(DataInputPlus in, Version version) throws IOException { - ColumnIdentifier param = new ColumnIdentifier(in.readUTF(), true); + ColumnIdentifier columnName = new ColumnIdentifier(in.readUTF(), true); Operator relationType = Operator.readFrom(in); - return new ScalarColumnConstraint(param, relationType, in.readUTF()); + return new ScalarColumnConstraint(columnName, relationType, in.readUTF()); } @Override diff --git a/src/java/org/apache/cassandra/cql3/constraints/UnaryConstraintFunction.java b/src/java/org/apache/cassandra/cql3/constraints/UnaryConstraintFunction.java new file mode 100644 index 000000000000..0e4b0ddd2de9 --- /dev/null +++ b/src/java/org/apache/cassandra/cql3/constraints/UnaryConstraintFunction.java @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.cql3.constraints; + +import java.util.List; + +import org.apache.cassandra.cql3.Operator; + +public abstract class UnaryConstraintFunction extends ConstraintFunction +{ + public UnaryConstraintFunction(String name, List args) + { + super(name, args); + } + + public List getSupportedOperators() + { + return List.of(); + } +} diff --git a/src/java/org/apache/cassandra/cql3/constraints/UnaryFunctionColumnConstraint.java b/src/java/org/apache/cassandra/cql3/constraints/UnaryFunctionColumnConstraint.java index 80fd443e0e4d..c8edd1189ea4 100644 --- a/src/java/org/apache/cassandra/cql3/constraints/UnaryFunctionColumnConstraint.java +++ b/src/java/org/apache/cassandra/cql3/constraints/UnaryFunctionColumnConstraint.java @@ -20,9 +20,12 @@ import java.io.IOException; import java.nio.ByteBuffer; +import java.util.ArrayList; import java.util.List; import java.util.function.Function; +import com.google.common.annotations.VisibleForTesting; + import org.apache.cassandra.cql3.ColumnIdentifier; import org.apache.cassandra.cql3.Operator; import org.apache.cassandra.cql3.constraints.SatisfiabilityChecker.UnaryFunctionSatisfiabilityChecker; @@ -45,17 +48,25 @@ public class UnaryFunctionColumnConstraint extends AbstractFunctionConstraint arguments) + { + function = createConstraintFunction(functionName.toString(), arguments); + } + + public Raw(ColumnIdentifier functionName) { - this.columnName = columnName; - function = createConstraintFunction(functionName.toCQLString(), columnName); + function = createConstraintFunction(functionName.toString(), List.of()); } public UnaryFunctionColumnConstraint prepare() { - return new UnaryFunctionColumnConstraint(function, columnName); + return new UnaryFunctionColumnConstraint(function); } } @@ -64,23 +75,31 @@ public enum Functions implements UnaryFunctionSatisfiabilityChecker NOT_NULL(NotNullConstraint::new), JSON(JsonConstraint::new); - private final Function functionCreator; + private final Function, ConstraintFunction> functionCreator; - Functions(Function functionCreator) + Functions(Function, ConstraintFunction> functionCreator) { this.functionCreator = functionCreator; } } - private static ConstraintFunction createConstraintFunction(String functionName, ColumnIdentifier columnName) + private static ConstraintFunction createConstraintFunction(String functionName, List arguments) { - return getEnum(Functions.class, functionName).functionCreator.apply(columnName); + return getEnum(Functions.class, functionName).functionCreator.apply(arguments); } - private UnaryFunctionColumnConstraint(ConstraintFunction function, ColumnIdentifier columnName) + public UnaryFunctionColumnConstraint(ConstraintFunction function) { - super(columnName, null, null); + super(null, null); this.function = function; + this.columnName = function.columnName; + } + + @Override + public void setColumnName(ColumnIdentifier columnName) + { + this.columnName = columnName; + this.function.columnName = columnName; } @Override @@ -89,6 +108,11 @@ public String name() return function.name; } + public ConstraintFunction function() + { + return function; + } + @Override public MetadataSerializer serializer() { @@ -122,7 +146,6 @@ public void internalEvaluate(AbstractType valueType, ByteBuffer columnValue) @Override public void validate(ColumnMetadata columnMetadata) throws InvalidConstraintDefinitionException { - validateArgs(columnMetadata); validateTypes(columnMetadata); function.validate(columnMetadata, term); } @@ -133,18 +156,18 @@ public ConstraintType getConstraintType() return UNARY_FUNCTION; } - void validateArgs(ColumnMetadata columnMetadata) - { - if (!columnMetadata.name.equals(columnName)) - throw new InvalidConstraintDefinitionException(String.format("Parameter of %s constraint should be the column name (%s)", - name(), - columnMetadata.name)); - } - @Override public String toString() { - return function.name + "(" + columnName + ")"; + if (function.isParameterless()) + { + return function.toString(); + } + else + { + String arguments = String.join(",", function.rawArgs); + return function.toString() + '(' + arguments + ')'; + } } public static class Serializer implements MetadataSerializer @@ -153,26 +176,40 @@ public static class Serializer implements MetadataSerializer args = new ArrayList<>(); + int argsSize = in.readInt(); + for (int i = 0; i < argsSize; i++) + args.add(in.readUTF()); + ConstraintFunction function; - String columnNameString = in.readUTF(); - ColumnIdentifier columnName = new ColumnIdentifier(columnNameString, true); try { - function = createConstraintFunction(functionName, columnName); + function = getConstraintFunction(functionName, args); } catch (Exception e) { throw new IOException(e); } - return new UnaryFunctionColumnConstraint(function, columnName); + return new UnaryFunctionColumnConstraint(function); + } + + @VisibleForTesting + public ConstraintFunction getConstraintFunction(String functionName, List args) + { + return createConstraintFunction(functionName, args); } @Override diff --git a/src/java/org/apache/cassandra/cql3/restrictions/ClusteringColumnRestrictions.java b/src/java/org/apache/cassandra/cql3/restrictions/ClusteringColumnRestrictions.java index 10b3864aec37..dd43bb002387 100644 --- a/src/java/org/apache/cassandra/cql3/restrictions/ClusteringColumnRestrictions.java +++ b/src/java/org/apache/cassandra/cql3/restrictions/ClusteringColumnRestrictions.java @@ -24,6 +24,7 @@ import com.google.common.collect.RangeSet; import org.apache.cassandra.db.guardrails.Guardrails; +import org.apache.cassandra.dht.IPartitioner; import org.apache.cassandra.schema.ColumnMetadata; import org.apache.cassandra.schema.TableMetadata; import org.apache.cassandra.cql3.QueryOptions; @@ -51,18 +52,22 @@ final class ClusteringColumnRestrictions extends RestrictionSetWrapper */ private final boolean allowFiltering; + private final IPartitioner partitioner; + public ClusteringColumnRestrictions(TableMetadata table, boolean allowFiltering) { - this(table.comparator, RestrictionSet.empty(), allowFiltering); + this(table.comparator, RestrictionSet.empty(), allowFiltering, table.partitioner); } private ClusteringColumnRestrictions(ClusteringComparator comparator, RestrictionSet restrictionSet, - boolean allowFiltering) + boolean allowFiltering, + IPartitioner partitioner) { super(restrictionSet); this.comparator = comparator; this.allowFiltering = allowFiltering; + this.partitioner = partitioner; } public ClusteringColumnRestrictions mergeWith(Restriction restriction, @Nullable IndexRegistry indexRegistry) throws InvalidRequestException @@ -89,7 +94,7 @@ public ClusteringColumnRestrictions mergeWith(Restriction restriction, @Nullable newRestrictionStart.name); } - return new ClusteringColumnRestrictions(this.comparator, newRestrictionSet, allowFiltering); + return new ClusteringColumnRestrictions(this.comparator, newRestrictionSet, allowFiltering, partitioner); } public NavigableSet> valuesAsClustering(QueryOptions options, ClientState state) throws InvalidRequestException @@ -123,7 +128,7 @@ public Slices slices(QueryOptions options) throws InvalidRequestException if (r.isSlice()) { RangeSet rangeSet = ClusteringElements.all(); - r.restrict(rangeSet, options); + r.restrict(rangeSet, options, partitioner); return builder.extend(rangeSet).buildSlices(); } diff --git a/src/java/org/apache/cassandra/cql3/restrictions/ClusteringElements.java b/src/java/org/apache/cassandra/cql3/restrictions/ClusteringElements.java index f8f04ebb5737..104c73d3ef19 100644 --- a/src/java/org/apache/cassandra/cql3/restrictions/ClusteringElements.java +++ b/src/java/org/apache/cassandra/cql3/restrictions/ClusteringElements.java @@ -34,6 +34,7 @@ import org.apache.cassandra.db.BufferClusteringBound; import org.apache.cassandra.db.ClusteringBound; import org.apache.cassandra.db.marshal.AbstractType; +import org.apache.cassandra.dht.IPartitioner; import org.apache.cassandra.schema.ColumnMetadata; /** @@ -94,7 +95,7 @@ public int compare(ClusteringElements a, ClusteringElements b) /** * The empty {@code ClusteringElements} instance used to avoid creating unecessary empty instances. */ - private static final ClusteringElements EMPTY = new ClusteringElements(ImmutableList.of(), ImmutableList.of()); + private static final ClusteringElements EMPTY = new ClusteringElements(ImmutableList.of(), ImmutableList.of(), false); /** * A range representing all {@code ClusteringElements}. @@ -112,7 +113,12 @@ public int compare(ClusteringElements a, ClusteringElements b) */ private final ImmutableList values; - private ClusteringElements(ImmutableList columns, ImmutableList values) + /** + * We need to special case token restrictions to properly handle MIN_TOKEN + */ + public final boolean token; + + private ClusteringElements(ImmutableList columns, ImmutableList values, boolean token) { if (columns.size() != values.size()) throw new IllegalArgumentException("columns and values should have the same size"); @@ -121,6 +127,7 @@ private ClusteringElements(ImmutableList columns, this.columns = columns; this.values = values; + this.token = token; } private static void checkColumnsOrder(ImmutableList columns) @@ -163,9 +170,9 @@ public static ClusteringElements of() * @param value the element value * @return a {@code ClusteringElements} with a single element. */ - public static ClusteringElements of(ColumnSpecification column, ByteBuffer value) + public static ClusteringElements of(ColumnSpecification column, ByteBuffer value, boolean onToken) { - return new ClusteringElements(ImmutableList.of(column), ImmutableList.of(value)); + return new ClusteringElements(ImmutableList.of(column), ImmutableList.of(value), onToken); } /** @@ -176,7 +183,7 @@ public static ClusteringElements of(ColumnSpecification column, ByteBuffer value */ public static ClusteringElements of(List columns, List values) { - return new ClusteringElements(ImmutableList.copyOf(columns), ImmutableList.copyOf(values)); + return new ClusteringElements(ImmutableList.copyOf(columns), ImmutableList.copyOf(values), false); } /** @@ -200,9 +207,9 @@ public ClusteringElements extend(ClusteringElements suffix) ImmutableList newColumns = concat(columns, suffix.columns); ImmutableList newValues = concat(values, suffix.values); - return suffix instanceof Top ? new Top(newColumns, newValues) - : suffix instanceof Bottom ? new Bottom(newColumns, newValues) - : new ClusteringElements(newColumns, newValues); + return suffix instanceof Top ? new Top(newColumns, newValues, token) + : suffix instanceof Bottom ? new Bottom(newColumns, newValues, token) + : new ClusteringElements(newColumns, newValues, token); } private void checkSuffix(ClusteringElements suffix) @@ -245,36 +252,36 @@ public static RangeSet all() * Returns a {@code RangeSet} that contains all values less than or equal to endpoint. * @return a {@code RangeSet} that contains all values less than or equal to endpoint. */ - public static RangeSet atMost(ClusteringElements endpoint) + public static RangeSet atMost(ClusteringElements endpoint, IPartitioner partitioner) { - return buildRangeSet(endpoint, true, BoundType.CLOSED); + return buildRangeSet(endpoint, true, BoundType.CLOSED, partitioner); } /** * Returns a {@code RangeSet} that contains all values less than endpoint. * @return a {@code RangeSet} that contains all values less than endpoint. */ - public static RangeSet lessThan(ClusteringElements endpoint) + public static RangeSet lessThan(ClusteringElements endpoint, IPartitioner partitioner) { - return buildRangeSet(endpoint, true, BoundType.OPEN); + return buildRangeSet(endpoint, true, BoundType.OPEN, partitioner); } /** * Returns a {@code RangeSet} that contains all values greater or equal to endpoint. * @return a {@code RangeSet} that contains all values greater or equal to endpoint. */ - public static RangeSet atLeast(ClusteringElements endpoint) + public static RangeSet atLeast(ClusteringElements endpoint, IPartitioner partitioner) { - return buildRangeSet(endpoint, false, BoundType.CLOSED); + return buildRangeSet(endpoint, false, BoundType.CLOSED, partitioner); } /** * Returns a {@code RangeSet} that contains all values greater than endpoint. * @return a {@code RangeSet} that contains all values greater than endpoint. */ - public static RangeSet greaterThan(ClusteringElements endpoint) + public static RangeSet greaterThan(ClusteringElements endpoint, IPartitioner partitioner) { - return buildRangeSet(endpoint, false, BoundType.OPEN); + return buildRangeSet(endpoint, false, BoundType.OPEN, partitioner); } public static Range notEqualTo(ClusteringElements endpoint) @@ -282,7 +289,7 @@ public static Range notEqualTo(ClusteringElements endpoint) return Range.closed(endpoint.bottom(), endpoint.top()); } - private static RangeSet buildRangeSet(ClusteringElements endpoint, boolean upperBound, BoundType boundType) + private static RangeSet buildRangeSet(ClusteringElements endpoint, boolean upperBound, BoundType boundType, IPartitioner partitioner) { TreeRangeSet rangeSet = TreeRangeSet.create(); boolean reversed = endpoint.columnType(0).isReversed(); @@ -312,12 +319,16 @@ private static RangeSet buildRangeSet(ClusteringElements end oppositeEndpoint = upperBound ? e.bottom() : e.top(); } } + boolean minToken = false; + if (endpoint.token && !upperBound) + minToken = partitioner.getTokenFactory().fromByteArray(endpoint.get(0)).isMinimum(); // We need to add the last range or the only one if there was no change of direction. Range range = upperBound ? Range.range(oppositeEndpoint, BoundType.CLOSED, boundType == BoundType.OPEN ? endpoint.bottom() : endpoint.top(), boundType) - : Range.range(boundType == BoundType.OPEN ? endpoint.top() : endpoint.bottom(), + : Range.range(minToken ? oppositeEndpoint + : boundType == BoundType.OPEN ? endpoint.top() : endpoint.bottom(), boundType, oppositeEndpoint, BoundType.CLOSED); @@ -331,7 +342,7 @@ private static RangeSet buildRangeSet(ClusteringElements end */ public ClusteringElements top() { - return new Top(columns, values); + return new Top(columns, values, token); } /** @@ -340,7 +351,7 @@ public ClusteringElements top() */ public ClusteringElements bottom() { - return new Bottom(columns, values); + return new Bottom(columns, values, token); } @Override @@ -472,9 +483,9 @@ private static E last(List elements) */ private static class Bottom extends ClusteringElements { - private Bottom(ImmutableList columns, ImmutableList values) + private Bottom(ImmutableList columns, ImmutableList values, boolean token) { - super(columns, values); + super(columns, values, token); } @Override @@ -491,9 +502,9 @@ public ClusteringBound toBound(boolean isStart, boolean isInclusive) */ private static class Top extends ClusteringElements { - private Top(ImmutableList columns, ImmutableList values) + private Top(ImmutableList columns, ImmutableList values, boolean token) { - super(columns, values); + super(columns, values, token); } @Override diff --git a/src/java/org/apache/cassandra/cql3/restrictions/MergedRestriction.java b/src/java/org/apache/cassandra/cql3/restrictions/MergedRestriction.java index 7976b78b7c01..9296e00aede3 100644 --- a/src/java/org/apache/cassandra/cql3/restrictions/MergedRestriction.java +++ b/src/java/org/apache/cassandra/cql3/restrictions/MergedRestriction.java @@ -29,6 +29,7 @@ import org.apache.cassandra.cql3.QueryOptions; import org.apache.cassandra.cql3.functions.Function; import org.apache.cassandra.db.filter.RowFilter; +import org.apache.cassandra.dht.IPartitioner; import org.apache.cassandra.index.Index; import org.apache.cassandra.index.IndexRegistry; import org.apache.cassandra.schema.ColumnMetadata; @@ -325,11 +326,11 @@ public List values(QueryOptions options) } @Override - public void restrict(RangeSet rangeSet, QueryOptions options) + public void restrict(RangeSet rangeSet, QueryOptions options, IPartitioner partitioner) { for (int i = 0, m = restrictions.size(); i < m; i++) { - restrictions.get(i).restrict(rangeSet, options); + restrictions.get(i).restrict(rangeSet, options, partitioner); } } diff --git a/src/java/org/apache/cassandra/cql3/restrictions/PartitionKeyRestrictions.java b/src/java/org/apache/cassandra/cql3/restrictions/PartitionKeyRestrictions.java index e4df4c7a6976..a33c32ae9aab 100644 --- a/src/java/org/apache/cassandra/cql3/restrictions/PartitionKeyRestrictions.java +++ b/src/java/org/apache/cassandra/cql3/restrictions/PartitionKeyRestrictions.java @@ -296,7 +296,7 @@ private RangeSet toRangeSet(IPartitioner partitioner, List toRangeSet(IPartitioner partitioner, SingleRestriction slice, QueryOptions options) { RangeSet rangeSet = ClusteringElements.all(); - slice.restrict(rangeSet, options); + slice.restrict(rangeSet, options, partitioner); ImmutableRangeSet.Builder builder = ImmutableRangeSet.builder(); diff --git a/src/java/org/apache/cassandra/cql3/restrictions/SimpleRestriction.java b/src/java/org/apache/cassandra/cql3/restrictions/SimpleRestriction.java index 8592fbbb7b17..b5bb2f43fa8d 100644 --- a/src/java/org/apache/cassandra/cql3/restrictions/SimpleRestriction.java +++ b/src/java/org/apache/cassandra/cql3/restrictions/SimpleRestriction.java @@ -34,6 +34,7 @@ import org.apache.cassandra.cql3.terms.Terms; import org.apache.cassandra.db.filter.RowFilter; import org.apache.cassandra.db.marshal.ListType; +import org.apache.cassandra.dht.IPartitioner; import org.apache.cassandra.index.Index; import org.apache.cassandra.index.IndexRegistry; import org.apache.cassandra.schema.ColumnMetadata; @@ -226,10 +227,10 @@ public List values(QueryOptions options) } @Override - public void restrict(RangeSet rangeSet, QueryOptions options) + public void restrict(RangeSet rangeSet, QueryOptions options, IPartitioner partitioner) { assert operator.isSlice() || operator == Operator.EQ; - operator.restrict(rangeSet, bindAndGetClusteringElements(options)); + operator.restrict(rangeSet, bindAndGetClusteringElements(options), partitioner); } private List bindAndGetClusteringElements(QueryOptions options) @@ -254,7 +255,7 @@ private List bindAndGetSingleTermClusteringElements(QueryOpt List elements = new ArrayList<>(values.size()); for (int i = 0; i < values.size(); i++) - elements.add(ClusteringElements.of(columnsExpression.columnSpecification(), values.get(i))); + elements.add(ClusteringElements.of(columnsExpression.columnSpecification(), values.get(i), isOnToken())); return elements; } diff --git a/src/java/org/apache/cassandra/cql3/restrictions/SingleRestriction.java b/src/java/org/apache/cassandra/cql3/restrictions/SingleRestriction.java index e317e8742da5..7720fb1bb660 100644 --- a/src/java/org/apache/cassandra/cql3/restrictions/SingleRestriction.java +++ b/src/java/org/apache/cassandra/cql3/restrictions/SingleRestriction.java @@ -22,6 +22,7 @@ import com.google.common.collect.RangeSet; import org.apache.cassandra.cql3.QueryOptions; +import org.apache.cassandra.dht.IPartitioner; import org.apache.cassandra.index.Index; /** @@ -103,7 +104,8 @@ default SingleRestriction mergeWith(SingleRestriction other) * * @param rangeSet the range set to add to * @param options the query options + * @param partitioner the partitioner, used to identify MIN_TOKEN when using token restrictions * @throws UnsupportedOperationException if the operator is not an operator selecting ranges of data. */ - void restrict(RangeSet rangeSet, QueryOptions options); + void restrict(RangeSet rangeSet, QueryOptions options, IPartitioner partitioner); } diff --git a/src/java/org/apache/cassandra/cql3/restrictions/StatementRestrictions.java b/src/java/org/apache/cassandra/cql3/restrictions/StatementRestrictions.java index e619993c6af6..f827942b5ce7 100644 --- a/src/java/org/apache/cassandra/cql3/restrictions/StatementRestrictions.java +++ b/src/java/org/apache/cassandra/cql3/restrictions/StatementRestrictions.java @@ -868,13 +868,23 @@ private void validateSecondaryIndexSelections() * * @return true if all the primary key columns are restricted by an equality relation. */ - public boolean hasAllPKColumnsRestrictedByEqualities() + public boolean hasAllPrimaryKeyColumnsRestrictedByEqualities() + { + return hasAllPartitionKeyColumnsRestrictedByEqualities() + && !hasUnrestrictedClusteringColumns() + && (clusteringColumnsRestrictions.hasOnlyEqualityRestrictions()); + } + + /** + * Checks that all the partition key columns are restricted by an equality relation ('=' or 'IN'). + * + * @return true if all the partition key columns are restricted by an equality relation. + */ + public boolean hasAllPartitionKeyColumnsRestrictedByEqualities() { return !isPartitionKeyRestrictionsOnToken() - && !partitionKeyRestrictions.hasUnrestrictedPartitionKeyComponents() - && (partitionKeyRestrictions.hasOnlyEqualityRestrictions()) - && !hasUnrestrictedClusteringColumns() - && (clusteringColumnsRestrictions.hasOnlyEqualityRestrictions()); + && !partitionKeyRestrictions.hasUnrestrictedPartitionKeyComponents() + && (partitionKeyRestrictions.hasOnlyEqualityRestrictions()); } /** diff --git a/src/java/org/apache/cassandra/cql3/selection/Selectable.java b/src/java/org/apache/cassandra/cql3/selection/Selectable.java index 56ee558ebb32..301a6306b433 100644 --- a/src/java/org/apache/cassandra/cql3/selection/Selectable.java +++ b/src/java/org/apache/cassandra/cql3/selection/Selectable.java @@ -18,12 +18,27 @@ */ package org.apache.cassandra.cql3.selection; -import java.util.*; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; import java.util.function.Predicate; import java.util.stream.Collectors; -import org.apache.cassandra.cql3.*; -import org.apache.cassandra.cql3.functions.*; +import org.apache.cassandra.cql3.AssignmentTestable; +import org.apache.cassandra.cql3.CQL3Type; +import org.apache.cassandra.cql3.ColumnIdentifier; +import org.apache.cassandra.cql3.ColumnSpecification; +import org.apache.cassandra.cql3.FieldIdentifier; +import org.apache.cassandra.cql3.VariableSpecifications; +import org.apache.cassandra.cql3.functions.AggregateFcts; +import org.apache.cassandra.cql3.functions.CastFcts; +import org.apache.cassandra.cql3.functions.Function; +import org.apache.cassandra.cql3.functions.FunctionName; +import org.apache.cassandra.cql3.functions.FunctionResolver; +import org.apache.cassandra.cql3.functions.OperationFcts; import org.apache.cassandra.cql3.selection.Selector.Factory; import org.apache.cassandra.cql3.terms.Constants; import org.apache.cassandra.cql3.terms.Lists; @@ -33,7 +48,18 @@ import org.apache.cassandra.cql3.terms.Tuples; import org.apache.cassandra.cql3.terms.UserTypes; import org.apache.cassandra.cql3.terms.Vectors; -import org.apache.cassandra.db.marshal.*; +import org.apache.cassandra.db.marshal.AbstractType; +import org.apache.cassandra.db.marshal.CollectionType; +import org.apache.cassandra.db.marshal.DurationType; +import org.apache.cassandra.db.marshal.Int32Type; +import org.apache.cassandra.db.marshal.ListType; +import org.apache.cassandra.db.marshal.LongType; +import org.apache.cassandra.db.marshal.MapType; +import org.apache.cassandra.db.marshal.ReversedType; +import org.apache.cassandra.db.marshal.SetType; +import org.apache.cassandra.db.marshal.TupleType; +import org.apache.cassandra.db.marshal.UserType; +import org.apache.cassandra.db.marshal.VectorType; import org.apache.cassandra.exceptions.InvalidRequestException; import org.apache.cassandra.schema.ColumnMetadata; import org.apache.cassandra.schema.TableMetadata; @@ -577,8 +603,8 @@ public boolean selectColumns(Predicate predicate) public static class Raw implements Selectable.Raw { - private final Selectable.Raw selected; - private final FieldIdentifier field; + public final Selectable.Raw selected; + public final FieldIdentifier field; public Raw(Selectable.Raw selected, FieldIdentifier field) { @@ -1402,6 +1428,11 @@ public FieldIdentifier toFieldIdentifier() : FieldIdentifier.forUnquoted(text); } + public String getText() + { + return text; + } + @Override public String toString() { @@ -1471,8 +1502,8 @@ public boolean selectColumns(Predicate predicate) public static class Raw implements Selectable.Raw { - private final Selectable.Raw selected; - private final Term.Raw element; + public final Selectable.Raw selected; + public final Term.Raw element; public Raw(Selectable.Raw selected, Term.Raw element) { diff --git a/src/java/org/apache/cassandra/cql3/selection/Selection.java b/src/java/org/apache/cassandra/cql3/selection/Selection.java index 743da6934e05..34d0bf9207ad 100644 --- a/src/java/org/apache/cassandra/cql3/selection/Selection.java +++ b/src/java/org/apache/cassandra/cql3/selection/Selection.java @@ -18,7 +18,12 @@ package org.apache.cassandra.cql3.selection; import java.nio.ByteBuffer; -import java.util.*; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Set; import com.google.common.base.MoreObjects; import com.google.common.base.Predicate; @@ -134,6 +139,11 @@ public ResultSet.ResultMetadata getResultMetadata() return resultMetadata; } + public static Selection.Selectors noopSelector() + { + return new SimpleSelectors(); + } + public static Selection wildcard(TableMetadata table, boolean isJson, boolean returnStaticContentOnPartitionWithNoRows) { List all = new ArrayList<>(table.columns().size()); @@ -346,55 +356,72 @@ private static List rowToJson(List row, return Arrays.asList(jsonRow); } - public static interface Selectors + public interface Selectors { /** * Returns the {@code ColumnFilter} corresponding to those selectors * * @return the {@code ColumnFilter} corresponding to those selectors */ - public ColumnFilter getColumnFilter(); + default ColumnFilter getColumnFilter() { return ColumnFilter.NONE; } /** * Checks if this Selectors perform some processing * @return {@code true} if this Selectors perform some processing, {@code false} otherwise. */ - public boolean hasProcessing(); + default boolean hasProcessing() { return false; } /** * Checks if one of the selectors perform some aggregations. * @return {@code true} if one of the selectors perform some aggregations, {@code false} otherwise. */ - public boolean isAggregate(); - - /** - * Returns the number of fetched columns - * @return the number of fetched columns - */ - public int numberOfFetchedColumns(); + default boolean isAggregate() { return false; } /** * Checks if one of the selectors collect TTLs. * @return {@code true} if one of the selectors collect TTLs, {@code false} otherwise. */ - public boolean collectTTLs(); + default boolean collectTTLs() { return false; } /** * Checks if one of the selectors collects write timestamps. * @return {@code true} if one of the selectors collects write timestamps, {@code false} otherwise. */ - public boolean collectWritetimes(); + default boolean collectWritetimes() { return false; } /** * Adds the current row of the specified ResultSetBuilder. * * @param input the input row */ - public void addInputRow(InputRow input); + void addInputRow(InputRow input); - public List getOutputRow(); + List getOutputRow(); - public void reset(); + void reset(); + } + + public static class SimpleSelectors implements Selectors + { + protected List current; + + @Override + public void addInputRow(InputRow input) + { + current = input.getValues(); + } + + @Override + public List getOutputRow() + { + return current; + } + + @Override + public void reset() + { + current = null; + } } // Special cased selection for when only columns are selected. @@ -466,15 +493,9 @@ public boolean isAggregate() public Selectors newSelectors(QueryOptions options) { - return new Selectors() + return new SimpleSelectors() { - private List current; - - public void reset() - { - current = null; - } - + @Override public List getOutputRow() { if (isJson) @@ -482,39 +503,6 @@ public List getOutputRow() return current; } - public void addInputRow(InputRow input) - { - current = input.getValues(); - } - - public boolean isAggregate() - { - return false; - } - - public boolean hasProcessing() - { - return false; - } - - @Override - public int numberOfFetchedColumns() - { - return getColumns().size(); - } - - @Override - public boolean collectTTLs() - { - return false; - } - - @Override - public boolean collectWritetimes() - { - return false; - } - @Override public ColumnFilter getColumnFilter() { @@ -615,12 +603,6 @@ public void addInputRow(InputRow input) selector.addInput(input); } - @Override - public int numberOfFetchedColumns() - { - return getColumns().size(); - } - @Override public boolean collectTTLs() { diff --git a/src/java/org/apache/cassandra/cql3/selection/Selector.java b/src/java/org/apache/cassandra/cql3/selection/Selector.java index fce2ef063407..fa22ea0bb5e2 100644 --- a/src/java/org/apache/cassandra/cql3/selection/Selector.java +++ b/src/java/org/apache/cassandra/cql3/selection/Selector.java @@ -414,7 +414,7 @@ private void add(ComplexColumnData ccd, long nowInSec) UserType udt = (UserType) type; int size = udt.size(); - values[index] = udt.serializeForNativeProtocol(ccd.iterator(), protocolVersion); + values[index] = udt.serializeForNativeProtocol(ccd.iterator()); short fieldPosition = 0; for (Cell cell : ccd) diff --git a/src/java/org/apache/cassandra/cql3/statements/BatchStatement.java b/src/java/org/apache/cassandra/cql3/statements/BatchStatement.java index 6c5b12199252..68b2885e2951 100644 --- a/src/java/org/apache/cassandra/cql3/statements/BatchStatement.java +++ b/src/java/org/apache/cassandra/cql3/statements/BatchStatement.java @@ -18,7 +18,16 @@ package org.apache.cassandra.cql3.statements; import java.nio.ByteBuffer; -import java.util.*; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.EnumSet; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; import java.util.concurrent.TimeUnit; import com.google.common.annotations.VisibleForTesting; @@ -32,19 +41,38 @@ import org.apache.cassandra.audit.AuditLogContext; import org.apache.cassandra.audit.AuditLogEntryType; -import org.apache.cassandra.db.guardrails.Guardrails; -import org.apache.cassandra.schema.TableId; -import org.apache.cassandra.schema.TableMetadata; -import org.apache.cassandra.schema.ColumnMetadata; import org.apache.cassandra.config.DatabaseDescriptor; -import org.apache.cassandra.cql3.*; -import org.apache.cassandra.db.*; +import org.apache.cassandra.cql3.Attributes; +import org.apache.cassandra.cql3.BatchQueryOptions; +import org.apache.cassandra.cql3.CQLStatement; +import org.apache.cassandra.cql3.ColumnSpecification; +import org.apache.cassandra.cql3.QueryOptions; +import org.apache.cassandra.cql3.ResultSet; +import org.apache.cassandra.cql3.VariableSpecifications; +import org.apache.cassandra.db.Clustering; +import org.apache.cassandra.db.ConsistencyLevel; +import org.apache.cassandra.db.DecoratedKey; +import org.apache.cassandra.db.IMutation; +import org.apache.cassandra.db.ReadCommand.PotentialTxnConflicts; +import org.apache.cassandra.db.RegularAndStaticColumns; +import org.apache.cassandra.db.Slice; +import org.apache.cassandra.db.Slices; +import org.apache.cassandra.db.guardrails.Guardrails; import org.apache.cassandra.db.partitions.PartitionUpdate; import org.apache.cassandra.db.rows.RowIterator; -import org.apache.cassandra.exceptions.*; +import org.apache.cassandra.exceptions.InvalidRequestException; +import org.apache.cassandra.exceptions.RequestExecutionException; +import org.apache.cassandra.exceptions.RequestValidationException; +import org.apache.cassandra.exceptions.UnauthorizedException; import org.apache.cassandra.metrics.BatchMetrics; import org.apache.cassandra.metrics.ClientRequestSizeMetrics; -import org.apache.cassandra.service.*; +import org.apache.cassandra.schema.ColumnMetadata; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.service.ClientState; +import org.apache.cassandra.service.ClientWarn; +import org.apache.cassandra.service.QueryState; +import org.apache.cassandra.service.StorageProxy; import org.apache.cassandra.tracing.Tracing; import org.apache.cassandra.transport.Dispatcher; import org.apache.cassandra.transport.messages.ResultMessage; @@ -53,13 +81,12 @@ import org.apache.cassandra.utils.Pair; import static java.util.function.Predicate.isEqual; - import static org.apache.cassandra.cql3.statements.RequestValidations.checkFalse; /** * A BATCH statement parsed from a CQL query. */ -public class BatchStatement implements CQLStatement +public class BatchStatement implements CQLStatement.CompositeCQLStatement { public enum Type { @@ -202,7 +229,7 @@ public void validate() throws InvalidRequestException for (ModificationStatement statement : statements) { if (timestampSet && statement.isTimestampSet()) - throw new InvalidRequestException("Timestamp must be set either on BATCH or individual statements"); + throw new InvalidRequestException("Timestamp must be set either on BATCH or individual statements: " + statement.source); if (statement.isCounter()) hasCounters = true; @@ -243,7 +270,7 @@ public void validate() throws InvalidRequestException for (ModificationStatement stmt : statements) { if (ksName != null && (!stmt.keyspace().equals(ksName) || !stmt.table().equals(cfName))) - throw new InvalidRequestException("Batch with conditions cannot span multiple tables"); + throw new InvalidRequestException("Batch with conditions cannot span multiple tables: " + stmt.source); ksName = stmt.keyspace(); cfName = stmt.table(); } @@ -268,6 +295,7 @@ public void validate(ClientState state) throws InvalidRequestException statement.validate(state); } + @Override public List getStatements() { return statements; @@ -327,7 +355,9 @@ public List getMutations(ClientState state, ClientWarn.instance.warn(MessageFormatter.arrayFormat(LOGGED_BATCH_LOW_GCGS_WARNING, new Object[] { suffix, tablesWithZeroGcGs }) .getMessage()); } - return collector.toMutations(state); + // local is either executeWithoutConditions modifying a virtual table (doesn't support txns) or executeLocal + // which is called by test or internal things that are bypassing distributed system modification/checks + return collector.toMutations(state, local ? PotentialTxnConflicts.ALLOW : PotentialTxnConflicts.DISALLOW); } /** @@ -617,7 +647,7 @@ public String toString() return String.format("BatchStatement(type=%s, statements=%s)", type, statements); } - public static class Parsed extends QualifiedStatement + public static class Parsed extends QualifiedStatement.Composite { private final Type type; private final Attributes.Raw attrs; @@ -625,21 +655,15 @@ public static class Parsed extends QualifiedStatement public Parsed(Type type, Attributes.Raw attrs, List parsedStatements) { - super(null); this.type = type; this.attrs = attrs; this.parsedStatements = parsedStatements; } - // Not doing this in the constructor since we only need this for prepared statements @Override - public boolean isFullyQualified() + protected Iterable getStatements() { - for (ModificationStatement.Parsed statement : parsedStatements) - if (!statement.isFullyQualified()) - return false; - - return true; + return parsedStatements; } @Override diff --git a/src/java/org/apache/cassandra/cql3/statements/BatchUpdatesCollector.java b/src/java/org/apache/cassandra/cql3/statements/BatchUpdatesCollector.java index 521cd2afa6e2..4bc0d909d2d0 100644 --- a/src/java/org/apache/cassandra/cql3/statements/BatchUpdatesCollector.java +++ b/src/java/org/apache/cassandra/cql3/statements/BatchUpdatesCollector.java @@ -32,6 +32,7 @@ import org.apache.cassandra.db.DecoratedKey; import org.apache.cassandra.db.IMutation; import org.apache.cassandra.db.Mutation; +import org.apache.cassandra.db.ReadCommand.PotentialTxnConflicts; import org.apache.cassandra.db.RegularAndStaticColumns; import org.apache.cassandra.db.commitlog.CommitLogSegment; import org.apache.cassandra.db.partitions.PartitionUpdate; @@ -137,14 +138,14 @@ private IMutationBuilder makeMutationBuilder(TableMetadata metadata, DecoratedKe * @return a collection containing all the mutations. */ @Override - public List toMutations(ClientState state) + public List toMutations(ClientState state, PotentialTxnConflicts potentialTxnConflicts) { List ms = new ArrayList<>(); for (Map ksMap : mutationBuilders.values()) { for (IMutationBuilder builder : ksMap.values()) { - IMutation mutation = builder.build(); + IMutation mutation = builder.build(potentialTxnConflicts); mutation.validateIndexedColumns(state); mutation.validateSize(MessagingService.current_version, CommitLogSegment.ENTRY_OVERHEAD_SIZE); ms.add(mutation); @@ -182,7 +183,7 @@ private interface IMutationBuilder /** * Build the immutable mutation */ - IMutation build(); + IMutation build(PotentialTxnConflicts potentialTxnConflicts); /** * Get the builder for the given tableId @@ -215,7 +216,7 @@ public MutationBuilder add(PartitionUpdate.Builder updateBuilder) return this; } - public Mutation build() + public Mutation build(PotentialTxnConflicts potentialTxnConflicts) { ImmutableMap.Builder updates = new ImmutableMap.Builder<>(); for (Map.Entry updateEntry : modifications.entrySet()) @@ -223,7 +224,7 @@ public Mutation build() PartitionUpdate update = updateEntry.getValue().build(); updates.put(updateEntry.getKey(), update); } - return new Mutation(keyspaceName, key, updates.build(), createdAt); + return new Mutation(keyspaceName, key, updates.build(), createdAt, potentialTxnConflicts); } public PartitionUpdate.Builder get(TableId tableId) @@ -263,9 +264,9 @@ public IMutationBuilder add(PartitionUpdate.Builder builder) return mutationBuilder.add(builder); } - public IMutation build() + public IMutation build(PotentialTxnConflicts potentialTxnConflicts) { - return new CounterMutation(mutationBuilder.build(), cl); + return new CounterMutation(mutationBuilder.build(potentialTxnConflicts), cl); } public PartitionUpdate.Builder get(TableId id) @@ -297,7 +298,7 @@ public VirtualMutationBuilder add(PartitionUpdate.Builder builder) } @Override - public VirtualMutation build() + public VirtualMutation build(PotentialTxnConflicts potentialTxnConflicts) { ImmutableMap.Builder updates = new ImmutableMap.Builder<>(); modifications.forEach((tableId, updateBuilder) -> updates.put(tableId, updateBuilder.build())); diff --git a/src/java/org/apache/cassandra/cql3/statements/CQL3CasRequest.java b/src/java/org/apache/cassandra/cql3/statements/CQL3CasRequest.java index 4db98459ec1f..816fc9a814f9 100644 --- a/src/java/org/apache/cassandra/cql3/statements/CQL3CasRequest.java +++ b/src/java/org/apache/cassandra/cql3/statements/CQL3CasRequest.java @@ -17,34 +17,84 @@ */ package org.apache.cassandra.cql3.statements; -import java.util.*; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.TreeMap; -import org.apache.cassandra.db.marshal.TimeUUIDType; -import org.apache.cassandra.index.IndexRegistry; -import org.apache.cassandra.schema.TableMetadata; -import org.apache.cassandra.cql3.*; +import org.apache.commons.lang3.builder.ToStringBuilder; +import org.apache.commons.lang3.builder.ToStringStyle; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import accord.api.Update; +import accord.primitives.Keys; +import accord.primitives.Txn; +import org.apache.cassandra.cql3.QueryOptions; +import org.apache.cassandra.cql3.UpdateParameters; import org.apache.cassandra.cql3.conditions.ColumnCondition; -import org.apache.cassandra.db.*; -import org.apache.cassandra.db.filter.*; -import org.apache.cassandra.db.rows.Row; +import org.apache.cassandra.db.Clustering; +import org.apache.cassandra.db.Columns; +import org.apache.cassandra.db.ConsistencyLevel; +import org.apache.cassandra.db.DecoratedKey; +import org.apache.cassandra.db.RegularAndStaticColumns; +import org.apache.cassandra.db.SinglePartitionReadCommand; +import org.apache.cassandra.db.Slice; +import org.apache.cassandra.db.Slices; +import org.apache.cassandra.db.filter.ClusteringIndexNamesFilter; +import org.apache.cassandra.db.filter.ClusteringIndexSliceFilter; +import org.apache.cassandra.db.filter.ColumnFilter; +import org.apache.cassandra.db.filter.DataLimits; +import org.apache.cassandra.db.filter.RowFilter; +import org.apache.cassandra.db.marshal.TimeUUIDType; import org.apache.cassandra.db.partitions.FilteredPartition; import org.apache.cassandra.db.partitions.Partition; import org.apache.cassandra.db.partitions.PartitionUpdate; +import org.apache.cassandra.db.rows.Row; import org.apache.cassandra.exceptions.InvalidRequestException; +import org.apache.cassandra.index.IndexRegistry; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.schema.TableParams; import org.apache.cassandra.service.CASRequest; import org.apache.cassandra.service.ClientState; +import org.apache.cassandra.service.accord.api.PartitionKey; +import org.apache.cassandra.service.accord.serializers.TableMetadatas; +import org.apache.cassandra.service.accord.serializers.TableMetadatasAndKeys; +import org.apache.cassandra.service.accord.txn.TxnCondition; +import org.apache.cassandra.service.accord.txn.TxnData; +import org.apache.cassandra.service.accord.txn.TxnDataKeyValue; +import org.apache.cassandra.service.accord.txn.TxnQuery; +import org.apache.cassandra.service.accord.txn.TxnRead; +import org.apache.cassandra.service.accord.txn.TxnReference; +import org.apache.cassandra.service.accord.txn.TxnResult; +import org.apache.cassandra.service.accord.txn.TxnUpdate; +import org.apache.cassandra.service.accord.txn.TxnWrite; import org.apache.cassandra.service.paxos.Ballot; +import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.transport.Dispatcher; import org.apache.cassandra.utils.TimeUUID; -import org.apache.commons.lang3.builder.ToStringBuilder; -import org.apache.commons.lang3.builder.ToStringStyle; +import static com.google.common.base.Preconditions.checkState; +import static org.apache.cassandra.service.StorageProxy.ConsensusAttemptResult; +import static org.apache.cassandra.service.StorageProxy.ConsensusAttemptResult.RETRY_NEW_PROTOCOL; +import static org.apache.cassandra.service.StorageProxy.ConsensusAttemptResult.casResult; +import static org.apache.cassandra.service.accord.txn.TxnData.TxnDataNameKind.CAS_READ; +import static org.apache.cassandra.service.accord.txn.TxnData.txnDataName; +import static org.apache.cassandra.service.accord.txn.TxnResult.Kind.retry_new_protocol; +import static org.apache.cassandra.service.consensus.migration.ConsensusRequestRouter.getTableMetadata; /** * Processed CAS conditions and update on potentially multiple rows of the same partition. */ public class CQL3CasRequest implements CASRequest { + @SuppressWarnings("unused") + private static final Logger logger = LoggerFactory.getLogger(CQL3CasRequest.class); + public final TableMetadata metadata; public final DecoratedKey key; private final RegularAndStaticColumns conditionColumns; @@ -149,7 +199,7 @@ public void addConditions(Clustering clustering, Collection } else if (!(condition instanceof ColumnsConditions)) { - throw new InvalidRequestException("Cannot mix IF conditions and IF NOT EXISTS for the same row"); + throw new InvalidRequestException("Cannot mix IF conditions and " + ((ToCQL) condition).toCQL() + " for the same row"); } ((ColumnsConditions)condition).addConditions(conds, options); } @@ -259,9 +309,9 @@ private static class CASUpdateParameters extends UpdateParameters final long timeUuidMsb; long timeUuidNanos; - public CASUpdateParameters(TableMetadata metadata, RegularAndStaticColumns updatedColumns, ClientState state, QueryOptions options, long timestamp, long nowInSec, int ttl, Map prefetchedRows, long timeUuidMsb, long timeUuidNanos) throws InvalidRequestException + public CASUpdateParameters(TableMetadata metadata, ClientState state, QueryOptions options, long timestamp, long nowInSec, int ttl, Map prefetchedRows, long timeUuidMsb, long timeUuidNanos) throws InvalidRequestException { - super(metadata, updatedColumns, state, options, timestamp, nowInSec, ttl, prefetchedRows); + super(metadata, state, options, timestamp, nowInSec, ttl, prefetchedRows); this.timeUuidMsb = timeUuidMsb; this.timeUuidNanos = timeUuidNanos; } @@ -299,7 +349,7 @@ long applyUpdates(FilteredPartition current, PartitionUpdate.Builder updateBuild { Map map = stmt.requiresRead() ? Collections.singletonMap(key, current) : null; CASUpdateParameters params = - new CASUpdateParameters(metadata, updateBuilder.columns(), state, options, timestamp, nowInSeconds, + new CASUpdateParameters(metadata, state, options, timestamp, nowInSeconds, stmt.getTimeToLive(options), map, timeUuidMsb, timeUuidNanos); stmt.addUpdateForKey(updateBuilder, clustering, params); return params.timeUuidNanos; @@ -329,7 +379,6 @@ void applyUpdates(FilteredPartition current, PartitionUpdate.Builder updateBuild Map map = stmt.requiresRead() ? Collections.singletonMap(key, current) : null; UpdateParameters params = new UpdateParameters(metadata, - updateBuilder.columns(), state, options, timestamp, @@ -350,9 +399,16 @@ protected RowCondition(Clustering clustering) } public abstract boolean appliesTo(FilteredPartition current) throws InvalidRequestException; + + public abstract TxnCondition asTxnCondition(); } - private static class NotExistCondition extends RowCondition + private interface ToCQL + { + String toCQL(); + } + + private static class NotExistCondition extends RowCondition implements ToCQL { private NotExistCondition(Clustering clustering) { @@ -363,9 +419,21 @@ public boolean appliesTo(FilteredPartition current) { return current.getRow(clustering) == null; } + + @Override + public String toCQL() + { + return "IF NOT EXISTS"; + } + + public TxnCondition asTxnCondition() + { + TxnReference txnReference = new TxnReference(txnDataName(CAS_READ), null, null); + return new TxnCondition.Exists(txnReference, TxnCondition.Kind.IS_NULL); + } } - private static class ExistCondition extends RowCondition + private static class ExistCondition extends RowCondition implements ToCQL { private ExistCondition(Clustering clustering) { @@ -376,6 +444,18 @@ public boolean appliesTo(FilteredPartition current) { return current.getRow(clustering) != null; } + + @Override + public String toCQL() + { + return "IF EXISTS"; + } + + public TxnCondition asTxnCondition() + { + TxnReference txnReference = new TxnReference(txnDataName(CAS_READ), null, null); + return new TxnCondition.Exists(txnReference, TxnCondition.Kind.IS_NOT_NULL); + } } private static class ColumnsConditions extends RowCondition @@ -405,6 +485,12 @@ public boolean appliesTo(FilteredPartition current) throws InvalidRequestExcepti } return true; } + + @Override + public TxnCondition asTxnCondition() + { + return new TxnCondition.ColumnConditionsAdapter(clustering, conditions); + } } @Override @@ -412,4 +498,83 @@ public String toString() { return ToStringBuilder.reflectionToString(this, ToStringStyle.SHORT_PREFIX_STYLE); } + + @Override + public Txn toAccordTxn(ClusterMetadata cm, ConsistencyLevel consistencyLevel, ConsistencyLevel commitConsistencyLevel, ClientState clientState, long nowInSecs) + { + SinglePartitionReadCommand readCommand = readCommand(nowInSecs); + TableMetadata metadata = getTableMetadata(cm, this.metadata.id); + TableMetadatas.Complete tables = TableMetadatas.of(metadata); + TableMetadatasAndKeys tablesAndKeys = new TableMetadatasAndKeys(tables, Keys.of(new PartitionKey(metadata.id, readCommand.partitionKey()))); + Update update = createUpdate(cm, tables, clientState, commitConsistencyLevel); + // If the write strategy is sending all writes through Accord there is no need to use the supplied consistency + // level since Accord will manage reading safely + TableParams tableParams = tables.getMetadata(metadata.id).params; + consistencyLevel = tableParams.transactionalMode.readCLForMode(tableParams.transactionalMigrationFrom, consistencyLevel, cm, metadata.id, readCommand.partitionKey().getToken()); + TxnRead read = TxnRead.createCasRead(readCommand, consistencyLevel, tablesAndKeys); + // In a CAS requesting only one key is supported and writes + // can't be dependent on any data that is read (only conditions) + // so the only relevant keys are the read key + return new Txn.InMemory(read.keys(), read, TxnQuery.CONDITION, update, tablesAndKeys); + } + + private Update createUpdate(ClusterMetadata cm, TableMetadatas.Complete tables, ClientState clientState, ConsistencyLevel commitConsistencyLevel) + { + // Potentially ignore commit consistency level if TransactionalMode is full + // since it is safe to match what non-SERIAL writes do + TableMetadata tableMetadata = tables.getMetadata(metadata.id); + TableParams tableParams = tableMetadata.params; + commitConsistencyLevel = tableParams.transactionalMode.commitCLForMode(tableParams.transactionalMigrationFrom, commitConsistencyLevel, cm, tableMetadata.id, key.getToken()); + // CAS requires using the new txn timestamp to correctly linearize some kinds of updates + return new TxnUpdate(tables, createWriteFragments(clientState), createCondition(), commitConsistencyLevel, false); + } + + private TxnCondition createCondition() + { + List txnConditions = new ArrayList<>(conditions.size() + (staticConditions == null ? 0 : 1)); + if (staticConditions != null) + { + txnConditions.add(staticConditions.asTxnCondition()); + } + for (RowCondition condition : conditions.values()) + txnConditions.add(condition.asTxnCondition()); + // CAS forbids empty conditions + checkState(!txnConditions.isEmpty()); + return conditions.size() == 1 ? txnConditions.get(0) : new TxnCondition.BooleanGroup(TxnCondition.Kind.AND, txnConditions); + } + + private List createWriteFragments(ClientState state) + { + PartitionKey partitionKey = new PartitionKey(metadata.id, key); + List fragments = new ArrayList<>(); + int idx = 0; + for (RowUpdate update : updates) + { + // Some operations may need to migrate to run in the transaction, so need to call forTxn to make sure this + // happens. + // see CASSANDRA-18337 + ModificationStatement modification = update.stmt.forTxn(); + QueryOptions options = update.options; + TxnWrite.Fragment fragment = modification.getTxnWriteFragment(idx++, state, options, partitionKey); + fragments.add(fragment); + } + for (RangeDeletion rangeDeletion : rangeDeletions) + { + ModificationStatement modification = rangeDeletion.stmt; + QueryOptions options = rangeDeletion.options; + TxnWrite.Fragment fragment = modification.getTxnWriteFragment(idx++, state, options, partitionKey); + fragments.add(fragment); + } + return fragments; + } + + @Override + public ConsensusAttemptResult toCasResult(TxnResult txnResult) + { + if (txnResult.kind() == retry_new_protocol) + return RETRY_NEW_PROTOCOL; + TxnData txnData = (TxnData)txnResult; + TxnDataKeyValue partition = (TxnDataKeyValue)txnData.get(txnDataName(CAS_READ)); + return casResult(partition != null ? partition.rowIterator(false) : null); + } } diff --git a/src/java/org/apache/cassandra/cql3/statements/DeleteStatement.java b/src/java/org/apache/cassandra/cql3/statements/DeleteStatement.java index 0bc22842556d..e34477dcc8cb 100644 --- a/src/java/org/apache/cassandra/cql3/statements/DeleteStatement.java +++ b/src/java/org/apache/cassandra/cql3/statements/DeleteStatement.java @@ -20,9 +20,19 @@ import java.util.Collections; import java.util.List; +import org.apache.commons.lang3.builder.ToStringBuilder; +import org.apache.commons.lang3.builder.ToStringStyle; + import org.apache.cassandra.audit.AuditLogContext; import org.apache.cassandra.audit.AuditLogEntryType; -import org.apache.cassandra.cql3.*; +import org.apache.cassandra.cql3.Attributes; +import org.apache.cassandra.cql3.Operation; +import org.apache.cassandra.cql3.Operations; +import org.apache.cassandra.cql3.QualifiedName; +import org.apache.cassandra.cql3.StatementSource; +import org.apache.cassandra.cql3.UpdateParameters; +import org.apache.cassandra.cql3.VariableSpecifications; +import org.apache.cassandra.cql3.WhereClause; import org.apache.cassandra.cql3.conditions.ColumnCondition; import org.apache.cassandra.cql3.conditions.Conditions; import org.apache.cassandra.cql3.restrictions.StatementRestrictions; @@ -33,8 +43,6 @@ import org.apache.cassandra.schema.ColumnMetadata; import org.apache.cassandra.schema.TableMetadata; import org.apache.cassandra.service.ClientState; -import org.apache.commons.lang3.builder.ToStringBuilder; -import org.apache.commons.lang3.builder.ToStringStyle; import static org.apache.cassandra.cql3.statements.RequestValidations.checkFalse; import static org.apache.cassandra.cql3.statements.RequestValidations.checkTrue; @@ -49,9 +57,16 @@ private DeleteStatement(VariableSpecifications bindVariables, Operations operations, StatementRestrictions restrictions, Conditions conditions, - Attributes attrs) + Attributes attrs, + StatementSource source) + { + super(StatementType.DELETE, bindVariables, cfm, operations, restrictions, conditions, attrs, source); + } + + @Override + protected ModificationStatement withOperations(Operations operations) { - super(StatementType.DELETE, bindVariables, cfm, operations, restrictions, conditions, attrs); + return new DeleteStatement(bindVariables, metadata, operations, restrictions, conditions, attrs, source); } @Override @@ -126,17 +141,21 @@ public static class Parsed extends ModificationStatement.Parsed { private final List deletions; private final WhereClause whereClause; + private final boolean isForTxn; public Parsed(QualifiedName name, Attributes.Raw attrs, List deletions, WhereClause whereClause, List conditions, - boolean ifExists) + boolean ifExists, + StatementSource source, + boolean isForTxn) { - super(name, StatementType.DELETE, attrs, conditions, false, ifExists); + super(name, StatementType.DELETE, attrs, conditions, false, ifExists, source); this.deletions = deletions; this.whereClause = whereClause; + this.isForTxn = isForTxn; } @@ -147,7 +166,7 @@ protected ModificationStatement prepareInternal(ClientState state, Conditions conditions, Attributes attrs) { - Operations operations = new Operations(type); + Operations operations = new Operations(type, isForTxn); for (Operation.RawDeletion deletion : deletions) { @@ -159,7 +178,7 @@ protected ModificationStatement prepareInternal(ClientState state, Operation op = deletion.prepare(metadata.keyspace, def, metadata); op.collectMarkerSpecification(bindVariables); - operations.add(op); + operations.add(op, metadata); } StatementRestrictions restrictions = newRestrictions(state, @@ -175,9 +194,10 @@ protected ModificationStatement prepareInternal(ClientState state, operations, restrictions, conditions, - attrs); + attrs, + source); - if (stmt.hasConditions() && !restrictions.hasAllPKColumnsRestrictedByEqualities()) + if (stmt.hasConditions() && !restrictions.hasAllPrimaryKeyColumnsRestrictedByEqualities()) { checkFalse(stmt.isVirtual(), "DELETE statements must restrict all PRIMARY KEY columns with equality relations"); diff --git a/src/java/org/apache/cassandra/cql3/statements/ModificationStatement.java b/src/java/org/apache/cassandra/cql3/statements/ModificationStatement.java index 21da99c14af4..90671c1bd6c9 100644 --- a/src/java/org/apache/cassandra/cql3/statements/ModificationStatement.java +++ b/src/java/org/apache/cassandra/cql3/statements/ModificationStatement.java @@ -18,45 +18,104 @@ package org.apache.cassandra.cql3.statements; import java.nio.ByteBuffer; -import java.util.*; - +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.EnumSet; +import java.util.HashMap; +import java.util.Iterator; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import java.util.NavigableSet; +import java.util.Set; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Preconditions; import com.google.common.collect.HashMultiset; import com.google.common.collect.ImmutableList; import com.google.common.collect.Iterables; +import com.google.common.collect.Lists; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.cassandra.auth.Permission; -import org.apache.cassandra.cql3.constraints.ConstraintViolationException; -import org.apache.cassandra.db.guardrails.Guardrails; -import org.apache.cassandra.dht.Token; -import org.apache.cassandra.locator.Replica; -import org.apache.cassandra.locator.ReplicaLayout; -import org.apache.cassandra.schema.ColumnMetadata; -import org.apache.cassandra.schema.Schema; -import org.apache.cassandra.schema.SchemaConstants; -import org.apache.cassandra.schema.TableMetadata; -import org.apache.cassandra.schema.ViewMetadata; -import org.apache.cassandra.cql3.*; +import org.apache.cassandra.cql3.Attributes; +import org.apache.cassandra.cql3.CQLStatement; +import org.apache.cassandra.cql3.ColumnIdentifier; +import org.apache.cassandra.cql3.ColumnSpecification; +import org.apache.cassandra.cql3.Operation; +import org.apache.cassandra.cql3.Operations; +import org.apache.cassandra.cql3.Ordering; +import org.apache.cassandra.cql3.QualifiedName; +import org.apache.cassandra.cql3.QueryOptions; +import org.apache.cassandra.cql3.QueryProcessor; +import org.apache.cassandra.cql3.ResultSet; +import org.apache.cassandra.cql3.StatementSource; +import org.apache.cassandra.cql3.UpdateParameters; +import org.apache.cassandra.cql3.Validation; +import org.apache.cassandra.cql3.VariableSpecifications; +import org.apache.cassandra.cql3.WhereClause; import org.apache.cassandra.cql3.conditions.ColumnCondition; import org.apache.cassandra.cql3.conditions.ColumnConditions; import org.apache.cassandra.cql3.conditions.Conditions; +import org.apache.cassandra.cql3.constraints.ConstraintViolationException; import org.apache.cassandra.cql3.functions.Function; import org.apache.cassandra.cql3.restrictions.StatementRestrictions; import org.apache.cassandra.cql3.selection.ResultSetBuilder; import org.apache.cassandra.cql3.selection.Selection; import org.apache.cassandra.cql3.selection.Selection.Selectors; -import org.apache.cassandra.db.*; -import org.apache.cassandra.db.filter.*; +import org.apache.cassandra.cql3.terms.Constants; +import org.apache.cassandra.cql3.transactions.ReferenceOperation; +import org.apache.cassandra.db.CBuilder; +import org.apache.cassandra.db.Clustering; +import org.apache.cassandra.db.ConsistencyLevel; +import org.apache.cassandra.db.DecoratedKey; +import org.apache.cassandra.db.IMutation; +import org.apache.cassandra.db.Keyspace; +import org.apache.cassandra.db.ReadCommand.PotentialTxnConflicts; +import org.apache.cassandra.db.ReadExecutionController; +import org.apache.cassandra.db.RegularAndStaticColumns; +import org.apache.cassandra.db.SinglePartitionReadCommand; +import org.apache.cassandra.db.SinglePartitionReadQuery; +import org.apache.cassandra.db.Slice; +import org.apache.cassandra.db.Slices; +import org.apache.cassandra.db.filter.ClusteringIndexFilter; +import org.apache.cassandra.db.filter.ClusteringIndexNamesFilter; +import org.apache.cassandra.db.filter.ClusteringIndexSliceFilter; +import org.apache.cassandra.db.filter.ColumnFilter; +import org.apache.cassandra.db.filter.DataLimits; +import org.apache.cassandra.db.filter.RowFilter; +import org.apache.cassandra.db.guardrails.Guardrails; import org.apache.cassandra.db.marshal.BooleanType; -import org.apache.cassandra.db.partitions.*; +import org.apache.cassandra.db.partitions.FilteredPartition; +import org.apache.cassandra.db.partitions.Partition; +import org.apache.cassandra.db.partitions.PartitionIterator; +import org.apache.cassandra.db.partitions.PartitionIterators; +import org.apache.cassandra.db.partitions.PartitionUpdate; import org.apache.cassandra.db.rows.RowIterator; import org.apache.cassandra.db.view.View; -import org.apache.cassandra.exceptions.*; +import org.apache.cassandra.dht.Token; +import org.apache.cassandra.exceptions.InvalidRequestException; +import org.apache.cassandra.exceptions.RequestExecutionException; +import org.apache.cassandra.exceptions.RequestValidationException; +import org.apache.cassandra.exceptions.UnauthorizedException; +import org.apache.cassandra.locator.Replica; +import org.apache.cassandra.locator.ReplicaLayout; import org.apache.cassandra.metrics.ClientRequestSizeMetrics; +import org.apache.cassandra.schema.ColumnMetadata; +import org.apache.cassandra.schema.Schema; +import org.apache.cassandra.schema.SchemaConstants; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.schema.ViewMetadata; import org.apache.cassandra.service.ClientState; import org.apache.cassandra.service.QueryState; import org.apache.cassandra.service.StorageProxy; +import org.apache.cassandra.service.accord.api.PartitionKey; +import org.apache.cassandra.service.accord.serializers.TableMetadatasAndKeys.KeyCollector; +import org.apache.cassandra.service.accord.txn.TxnReferenceOperation; +import org.apache.cassandra.service.accord.txn.TxnReferenceOperations; +import org.apache.cassandra.service.accord.txn.TxnWrite; import org.apache.cassandra.service.disk.usage.DiskUsageBroadcaster; import org.apache.cassandra.service.paxos.Ballot; import org.apache.cassandra.service.paxos.BallotGenerator; @@ -64,6 +123,7 @@ import org.apache.cassandra.transport.Dispatcher; import org.apache.cassandra.transport.messages.ResultMessage; import org.apache.cassandra.triggers.TriggerExecutor; +import org.apache.cassandra.utils.ByteBufferUtil; import org.apache.cassandra.utils.FBUtilities; import org.apache.cassandra.utils.MD5Digest; @@ -83,36 +143,43 @@ public abstract class ModificationStatement implements CQLStatement.SingleKeyspa public static final String CUSTOM_EXPRESSIONS_NOT_ALLOWED = "Custom index expressions cannot be used in WHERE clauses for UPDATE or DELETE statements"; - private static final ColumnIdentifier CAS_RESULT_COLUMN = new ColumnIdentifier("[applied]", false); + public static final ColumnIdentifier CAS_RESULT_COLUMN = new ColumnIdentifier("[applied]", false); protected final StatementType type; protected final VariableSpecifications bindVariables; public final TableMetadata metadata; - private final Attributes attrs; + protected final Attributes attrs; - private final StatementRestrictions restrictions; + protected final StatementRestrictions restrictions; private final Operations operations; private final RegularAndStaticColumns updatedColumns; - private final Conditions conditions; + protected final Conditions conditions; private final RegularAndStaticColumns conditionColumns; private final RegularAndStaticColumns requiresRead; + /** + * Used by {@link #forTxn()} to only compute a migrated copy of this statement for transactions + */ + private ModificationStatement txnStmt; private final List functions; + public final StatementSource source; + public ModificationStatement(StatementType type, VariableSpecifications bindVariables, TableMetadata metadata, Operations operations, StatementRestrictions restrictions, Conditions conditions, - Attributes attrs) + Attributes attrs, + StatementSource source) { this.type = type; this.bindVariables = bindVariables; @@ -121,6 +188,7 @@ public ModificationStatement(StatementType type, this.operations = operations; this.conditions = conditions; this.attrs = attrs; + this.source = source; if (!conditions.isEmpty()) { @@ -146,6 +214,15 @@ public ModificationStatement(StatementType type, requiresReadBuilder.add(operation.column); } } + for (ReferenceOperation operation : operations.allSubstitutions()) + { + ColumnMetadata receiver = operation.getReceiver(); + updatedColumnsBuilder.add(receiver); + // If the operation requires a read-before-write, make sure its receiver is selected by the auto-read the + // transaction creates during update creation. (see createSelectForTxn()) + if (operation.requiresRead()) + requiresReadBuilder.add(receiver); + } RegularAndStaticColumns modifiedColumns = updatedColumnsBuilder.build(); @@ -389,6 +466,11 @@ public List getStaticOperations() return operations.staticOperations(); } + public Collection allReferenceOperations() + { + return operations.allSubstitutions(); + } + public Iterable getColumnsWithConditions() { return conditions.getColumns(); @@ -451,7 +533,7 @@ public boolean requiresRead() // * Deleting list element by value // * Performing addition on a StringType (i.e. concatenation, only supported for CAS operations) // * Performing addition on a NumberType, again only supported for CAS operations. - return !requiresRead.isEmpty(); + return operations.requiresRead(); } private Map readRequiredLists(Collection partitionKeys, @@ -562,7 +644,8 @@ private ResultMessage executeWithoutCondition(QueryState queryState, QueryOption false, options.getTimestamp(queryState), options.getNowInSeconds(queryState), - requestTime); + requestTime + ); if (!mutations.isEmpty()) { StorageProxy.mutateWithTriggers(mutations, cl, false, requestTime); @@ -755,7 +838,7 @@ static RowIterator casInternal(ClientState state, CQL3CasRequest request, long t } if (!request.appliesTo(current)) - return current.rowIterator(); + return current.rowIterator(false); PartitionUpdate updates = request.makeUpdates(current, state, ballot); updates = TriggerExecutor.instance.execute(updates); @@ -775,28 +858,100 @@ static RowIterator casInternal(ClientState state, CQL3CasRequest request, long t * * @return list of the mutations */ - private List getMutations(ClientState state, - QueryOptions options, - boolean local, - long timestamp, - long nowInSeconds, - Dispatcher.RequestTime requestTime) + public List getMutations(ClientState state, + QueryOptions options, + boolean local, + long timestamp, + long nowInSeconds, + Dispatcher.RequestTime requestTime) { List keys = buildPartitionKeyNames(options, state); - if(keys.size() == 1) + + if (keys.size() == 1) { SingleTableSinglePartitionUpdatesCollector collector = new SingleTableSinglePartitionUpdatesCollector(metadata, updatedColumns); addUpdates(collector, keys, state, options, local, timestamp, nowInSeconds, requestTime); - return collector.toMutations(state); - } else + // local means this is test or internal things that are bypassing distributed system modification/checks + return collector.toMutations(state, local ? PotentialTxnConflicts.ALLOW : PotentialTxnConflicts.DISALLOW); + } + else { HashMultiset perPartitionKeyCounts = HashMultiset.create(keys); SingleTableUpdatesCollector collector = new SingleTableUpdatesCollector(metadata, updatedColumns, perPartitionKeyCounts); addUpdates(collector, keys, state, options, local, timestamp, nowInSeconds, requestTime); - return collector.toMutations(state); + // local means this is test or internal things that are bypassing distributed system modification/checks + return collector.toMutations(state, local ? PotentialTxnConflicts.ALLOW : PotentialTxnConflicts.DISALLOW); } } + public PartitionUpdate getTxnUpdate(ClientState state, QueryOptions options) + { + List mutations = getMutations(state, options, false, 0, 0, new Dispatcher.RequestTime(0, 0)); + // TODO: Temporary fix for CASSANDRA-20079 + if (mutations.isEmpty()) + return PartitionUpdate.emptyUpdate(metadata, metadata.partitioner.decorateKey(ByteBufferUtil.EMPTY_BYTE_BUFFER)); + if (mutations.size() != 1) + throw new IllegalArgumentException("When running withing a transaction, modification statements may only mutate a single partition"); + return Iterables.getOnlyElement(mutations.get(0).getPartitionUpdates()); + } + + private static List getTxnReferenceOps(List operations, QueryOptions options) + { + if (operations.isEmpty()) + return Collections.emptyList(); + + List result = new ArrayList<>(operations.size()); + for (ReferenceOperation operation : operations) + result.add(operation.bindAndGet(options)); + return result; + } + + public TxnReferenceOperations getTxnReferenceOps(QueryOptions options, ClientState state) + { + List regularOps = getTxnReferenceOps(operations.regularSubstitutions(), options); + List staticOps = getTxnReferenceOps(operations.staticSubstitutions(), options); + Clustering clustering = !regularOps.isEmpty() ? Iterables.getOnlyElement(createClustering(options, state)) : null; + return new TxnReferenceOperations(metadata, clustering, regularOps, staticOps); + } + + public ModificationStatement forTxn() + { + if (requiresRead.isEmpty()) return this; + ModificationStatement migrated = txnStmt; + if (migrated == null) + { + synchronized (requiresRead) + { + migrated = txnStmt; + if (migrated == null) + txnStmt = migrated = withOperations(operations.forTxn(metadata)); + } + } + return migrated; + } + + protected abstract ModificationStatement withOperations(Operations operations); + + @VisibleForTesting + public List getSubstitutions() + { + return operations.allSubstitutions(); + } + + public TxnWrite.Fragment getTxnWriteFragment(int index, ClientState state, QueryOptions options, PartitionKey partitionKey) + { + PartitionUpdate baseUpdate = getTxnUpdate(state, options); + TxnReferenceOperations referenceOps = getTxnReferenceOps(options, state); + return new TxnWrite.Fragment(partitionKey, index, baseUpdate, referenceOps); + } + + public TxnWrite.Fragment getTxnWriteFragment(int index, ClientState state, QueryOptions options, KeyCollector keyCollector) + { + PartitionUpdate baseUpdate = getTxnUpdate(state, options); + TxnReferenceOperations referenceOps = getTxnReferenceOps(options, state); + return new TxnWrite.Fragment(keyCollector.collect(baseUpdate.metadata(), baseUpdate.partitionKey()), index, baseUpdate, referenceOps); + } + final void addUpdates(UpdatesCollector collector, List keys, ClientState state, @@ -822,7 +977,8 @@ final void addUpdates(UpdatesCollector collector, local, timestamp, nowInSeconds, - requestTime); + requestTime + ); for (ByteBuffer key : keys) { Validation.validateKey(metadata(), key); @@ -835,7 +991,6 @@ final void addUpdates(UpdatesCollector collector, else for (Slice slice : slices) addUpdateForKey(updateBuilder, slice, params); - } } else @@ -915,7 +1070,8 @@ private UpdateParameters makeUpdateParameters(Collection keys, local, timestamp, nowInSeconds, - requestTime); + requestTime + ); return makeUpdateParameters(keys, new ClusteringIndexNamesFilter(clusterings, false), @@ -925,7 +1081,8 @@ private UpdateParameters makeUpdateParameters(Collection keys, local, timestamp, nowInSeconds, - requestTime); + requestTime + ); } private UpdateParameters makeUpdateParameters(Collection keys, @@ -949,7 +1106,6 @@ private UpdateParameters makeUpdateParameters(Collection keys, requestTime); return new UpdateParameters(metadata(), - updatedColumns(), state, options, getTimestamp(timestamp, options), @@ -965,13 +1121,15 @@ public static abstract class Parsed extends QualifiedStatement private final List conditions; private final boolean ifNotExists; private final boolean ifExists; + protected final StatementSource source; protected Parsed(QualifiedName name, StatementType type, Attributes.Raw attrs, List conditions, boolean ifNotExists, - boolean ifExists) + boolean ifExists, + StatementSource source) { super(name); this.type = type; @@ -979,6 +1137,7 @@ protected Parsed(QualifiedName name, this.conditions = conditions == null ? Collections.emptyList() : conditions; this.ifNotExists = ifNotExists; this.ifExists = ifExists; + this.source = source; } public ModificationStatement prepare(ClientState state) @@ -995,6 +1154,7 @@ public ModificationStatement prepare(ClientState state, VariableSpecifications b Conditions preparedConditions = prepareConditions(metadata, bindVariables); + // TODO: if this is a txn and has a read name, and updates non-static columns, confirm it selects an entire row return prepareInternal(state, metadata, bindVariables, preparedConditions, preparedAttributes); } @@ -1019,7 +1179,6 @@ private Conditions prepareConditions(TableMetadata metadata, VariableSpecificati if (ifNotExists) { assert conditions.isEmpty(); - assert !ifExists; return Conditions.IF_NOT_EXISTS_CONDITION; } @@ -1088,4 +1247,24 @@ public List getConditions() return conditions; } } + + private static final Constants.Value ONE = new Constants.Value(ByteBufferUtil.bytes(1)); + + public SelectStatement createSelectForTxn() + { + // TODO: get working with static-only updates that don't specify any/all primary key columns + Preconditions.checkState(getRestrictions().hasAllPrimaryKeyColumnsRestrictedByEqualities()); + Selection selection = Selection.forColumns(metadata, Lists.newArrayList(requiresRead), false); + return new SelectStatement(metadata, + bindVariables, + SelectStatement.defaultParameters, + selection, + getRestrictions(), + false, + null, + null, + ONE, + null, + StatementSource.INTERNAL); + } } diff --git a/src/java/org/apache/cassandra/cql3/statements/QualifiedStatement.java b/src/java/org/apache/cassandra/cql3/statements/QualifiedStatement.java index 4ed41d168888..c7183a9d1b54 100644 --- a/src/java/org/apache/cassandra/cql3/statements/QualifiedStatement.java +++ b/src/java/org/apache/cassandra/cql3/statements/QualifiedStatement.java @@ -78,4 +78,50 @@ public String toString() { return ToStringBuilder.reflectionToString(this, ToStringStyle.SHORT_PREFIX_STYLE); } + + public static abstract class Composite extends QualifiedStatement + { + Composite() + { + super(null); + } + + protected abstract Iterable getStatements(); + + @Override + public boolean isFullyQualified() + { + for (QualifiedStatement statement : getStatements()) + if (!statement.isFullyQualified()) + return false; + + return true; + } + + @Override + public void setKeyspace(ClientState state) + { + for (QualifiedStatement statement : getStatements()) + statement.setKeyspace(state); + } + + @Override + public void setKeyspace(String keyspace) + { + for (QualifiedStatement statement : getStatements()) + statement.setKeyspace(keyspace); + } + + @Override + public String keyspace() + { + return null; + } + + @Override + public String name() + { + return null; + } + } } diff --git a/src/java/org/apache/cassandra/cql3/statements/SelectStatement.java b/src/java/org/apache/cassandra/cql3/statements/SelectStatement.java index 209ba88f5260..7f025cf02f14 100644 --- a/src/java/org/apache/cassandra/cql3/statements/SelectStatement.java +++ b/src/java/org/apache/cassandra/cql3/statements/SelectStatement.java @@ -18,10 +18,20 @@ package org.apache.cassandra.cql3.statements; import java.nio.ByteBuffer; -import java.util.*; -import java.util.stream.Collectors; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.Comparator; +import java.util.EnumSet; +import java.util.Iterator; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.NavigableSet; +import java.util.Set; +import java.util.TreeMap; import java.util.concurrent.TimeUnit; - +import java.util.stream.Collectors; import javax.annotation.concurrent.ThreadSafe; import com.google.common.annotations.VisibleForTesting; @@ -30,24 +40,27 @@ import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import com.google.common.collect.Iterables; - +import org.apache.commons.lang3.builder.ToStringBuilder; +import org.apache.commons.lang3.builder.ToStringStyle; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.cassandra.audit.AuditLogContext; import org.apache.cassandra.audit.AuditLogEntryType; import org.apache.cassandra.auth.Permission; -import org.apache.cassandra.cql3.restrictions.SingleRestriction; -import org.apache.cassandra.cql3.terms.Term; -import org.apache.cassandra.db.guardrails.Guardrails; -import org.apache.cassandra.index.Index; -import org.apache.cassandra.schema.ColumnMetadata; -import org.apache.cassandra.schema.Schema; -import org.apache.cassandra.schema.SchemaConstants; -import org.apache.cassandra.schema.TableMetadata; -import org.apache.cassandra.schema.TableMetadataRef; -import org.apache.cassandra.cql3.*; +import org.apache.cassandra.cql3.CQLStatement; +import org.apache.cassandra.cql3.ColumnIdentifier; +import org.apache.cassandra.cql3.ColumnSpecification; +import org.apache.cassandra.cql3.Ordering; +import org.apache.cassandra.cql3.QualifiedName; +import org.apache.cassandra.cql3.QueryOptions; +import org.apache.cassandra.cql3.QueryProcessor; +import org.apache.cassandra.cql3.ResultSet; +import org.apache.cassandra.cql3.StatementSource; +import org.apache.cassandra.cql3.VariableSpecifications; +import org.apache.cassandra.cql3.WhereClause; import org.apache.cassandra.cql3.functions.Function; +import org.apache.cassandra.cql3.restrictions.SingleRestriction; import org.apache.cassandra.cql3.restrictions.StatementRestrictions; import org.apache.cassandra.cql3.selection.RawSelector; import org.apache.cassandra.cql3.selection.ResultSetBuilder; @@ -56,10 +69,32 @@ import org.apache.cassandra.cql3.selection.Selection; import org.apache.cassandra.cql3.selection.Selection.Selectors; import org.apache.cassandra.cql3.selection.Selector; -import org.apache.cassandra.db.*; +import org.apache.cassandra.cql3.terms.Marker; +import org.apache.cassandra.cql3.terms.Term; +import org.apache.cassandra.db.Clustering; +import org.apache.cassandra.db.ColumnFamilyStore; +import org.apache.cassandra.db.ConsistencyLevel; +import org.apache.cassandra.db.DataRange; +import org.apache.cassandra.db.DecoratedKey; +import org.apache.cassandra.db.Keyspace; +import org.apache.cassandra.db.PartitionPosition; +import org.apache.cassandra.db.PartitionRangeReadQuery; +import org.apache.cassandra.db.ReadCommand.PotentialTxnConflicts; +import org.apache.cassandra.db.ReadExecutionController; +import org.apache.cassandra.db.ReadQuery; +import org.apache.cassandra.db.SinglePartitionReadCommand; +import org.apache.cassandra.db.SinglePartitionReadQuery; +import org.apache.cassandra.db.Slice; +import org.apache.cassandra.db.Slices; import org.apache.cassandra.db.aggregation.AggregationSpecification; import org.apache.cassandra.db.aggregation.GroupMaker; -import org.apache.cassandra.db.filter.*; +import org.apache.cassandra.db.filter.ClusteringIndexFilter; +import org.apache.cassandra.db.filter.ClusteringIndexNamesFilter; +import org.apache.cassandra.db.filter.ClusteringIndexSliceFilter; +import org.apache.cassandra.db.filter.ColumnFilter; +import org.apache.cassandra.db.filter.DataLimits; +import org.apache.cassandra.db.filter.RowFilter; +import org.apache.cassandra.db.guardrails.Guardrails; import org.apache.cassandra.db.marshal.CompositeType; import org.apache.cassandra.db.marshal.Int32Type; import org.apache.cassandra.db.partitions.PartitionIterator; @@ -67,9 +102,20 @@ import org.apache.cassandra.db.rows.RowIterator; import org.apache.cassandra.db.view.View; import org.apache.cassandra.dht.AbstractBounds; -import org.apache.cassandra.exceptions.*; -import org.apache.cassandra.metrics.ClientRequestSizeMetrics; +import org.apache.cassandra.exceptions.InvalidRequestException; +import org.apache.cassandra.exceptions.ReadSizeAbortException; +import org.apache.cassandra.exceptions.RequestExecutionException; +import org.apache.cassandra.exceptions.RequestFailureReason; +import org.apache.cassandra.exceptions.RequestValidationException; +import org.apache.cassandra.exceptions.UnauthorizedException; +import org.apache.cassandra.index.Index; import org.apache.cassandra.index.IndexRegistry; +import org.apache.cassandra.metrics.ClientRequestSizeMetrics; +import org.apache.cassandra.schema.ColumnMetadata; +import org.apache.cassandra.schema.Schema; +import org.apache.cassandra.schema.SchemaConstants; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.schema.TableMetadataRef; import org.apache.cassandra.serializers.MarshalException; import org.apache.cassandra.service.ClientState; import org.apache.cassandra.service.ClientWarn; @@ -85,9 +131,6 @@ import org.apache.cassandra.utils.FBUtilities; import org.apache.cassandra.utils.NoSpamLogger; -import org.apache.commons.lang3.builder.ToStringBuilder; -import org.apache.commons.lang3.builder.ToStringStyle; - import static java.lang.String.format; import static org.apache.cassandra.cql3.restrictions.StatementRestrictions.requiresAllowFilteringIfNotSpecified; import static org.apache.cassandra.cql3.statements.RequestValidations.checkFalse; @@ -109,7 +152,7 @@ * Note that select statements can be accessed by multiple threads, so we cannot rely on mutable attributes. */ @ThreadSafe -public class SelectStatement implements CQLStatement.SingleKeyspaceCqlStatement +public class SelectStatement implements CQLStatement.SingleKeyspaceCqlStatement, CQLStatement.ReturningCQLStatement { private static final Logger logger = LoggerFactory.getLogger(SelectStatement.class); private static final NoSpamLogger noSpamLogger = NoSpamLogger.getLogger(SelectStatement.logger, 1, TimeUnit.MINUTES); @@ -146,8 +189,10 @@ public class SelectStatement implements CQLStatement.SingleKeyspaceCqlStatement */ private final ColumnComparator> orderingComparator; + public final StatementSource source; + // Used by forSelection below - private static final Parameters defaultParameters = new Parameters(Collections.emptyList(), + public static final Parameters defaultParameters = new Parameters(Collections.emptyList(), Collections.emptyList(), false, false, @@ -162,7 +207,8 @@ public SelectStatement(TableMetadata table, AggregationSpecification.Factory aggregationSpecFactory, ColumnComparator> orderingComparator, Term limit, - Term perPartitionLimit) + Term perPartitionLimit, + StatementSource source) { this.table = table; this.bindVariables = bindVariables; @@ -174,6 +220,7 @@ public SelectStatement(TableMetadata table, this.parameters = parameters; this.limit = limit; this.perPartitionLimit = perPartitionLimit; + this.source = source; } @Override @@ -240,9 +287,11 @@ static SelectStatement forSelection(TableMetadata table, Selection selection) null, null, null, - null); + null, + StatementSource.INTERNAL); } + @Override public ResultSet.ResultMetadata getResultMetadata() { return selection.getResultMetadata(); @@ -338,7 +387,7 @@ public ResultMessage.Rows execute(QueryState state, QueryOptions options, Dispat } } - ReadQuery query = getQuery(options, state.getClientState(), selectors.getColumnFilter(), nowInSec, limit); + ReadQuery query = getQuery(options, state.getClientState(), selectors.getColumnFilter(), nowInSec, limit, PotentialTxnConflicts.DISALLOW); if (options.isReadThresholdsEnabled()) query.trackWarnings(); @@ -374,6 +423,11 @@ public AggregationSpecification getAggregationSpec(QueryOptions options) return aggregationSpecFactory == null ? null : aggregationSpecFactory.newInstance(options); } + public boolean hasAggregation() + { + return aggregationSpecFactory != null; + } + public ReadQuery getQuery(QueryOptions options, long nowInSec) throws RequestValidationException { Selectors selectors = selection.newSelectors(options); @@ -384,7 +438,8 @@ public ReadQuery getQuery(QueryOptions options, long nowInSec) throws RequestVal getLimit(options), getPerPartitionLimit(options), options.getPageSize(), - getAggregationSpec(options)); + getAggregationSpec(options), + PotentialTxnConflicts.DISALLOW); } public ReadQuery getQuery(QueryOptions options, @@ -394,18 +449,20 @@ public ReadQuery getQuery(QueryOptions options, int userLimit, int perPartitionLimit, int pageSize, - AggregationSpecification aggregationSpec) + AggregationSpecification aggregationSpec, + PotentialTxnConflicts potentialTxnConflicts) { DataLimits limit = getDataLimits(userLimit, perPartitionLimit, pageSize, aggregationSpec); - return getQuery(options, state, columnFilter, nowInSec, limit); + return getQuery(options, state, columnFilter, nowInSec, limit, potentialTxnConflicts); } public ReadQuery getQuery(QueryOptions options, ClientState state, ColumnFilter columnFilter, long nowInSec, - DataLimits limit) + DataLimits limit, + PotentialTxnConflicts potentialTxnConflicts) { RowFilter rowFilter = getRowFilter(options, state); @@ -414,13 +471,13 @@ public ReadQuery getQuery(QueryOptions options, if (restrictions.usesSecondaryIndexing() && !SchemaConstants.isLocalSystemKeyspace(table.keyspace)) Guardrails.nonPartitionRestrictedIndexQueryEnabled.ensureEnabled(state); - return getRangeCommand(options, state, columnFilter, rowFilter, limit, nowInSec); + return getRangeCommand(options, state, columnFilter, rowFilter, limit, nowInSec, potentialTxnConflicts); } if (restrictions.usesSecondaryIndexing() && !rowFilter.isStrict()) - return getRangeCommand(options, state, columnFilter, rowFilter, limit, nowInSec); + return getRangeCommand(options, state, columnFilter, rowFilter, limit, nowInSec, potentialTxnConflicts); - return getSliceCommands(options, state, columnFilter, rowFilter, limit, nowInSec); + return getSliceCommands(options, state, columnFilter, rowFilter, limit, nowInSec, potentialTxnConflicts); } private ResultMessage.Rows execute(ReadQuery query, @@ -604,7 +661,8 @@ public ResultMessage.Rows executeInternal(QueryState state, userLimit, userPerPartitionLimit, pageSize, - aggregationSpec); + aggregationSpec, + PotentialTxnConflicts.ALLOW); try (ReadExecutionController executionController = query.executionController()) { @@ -651,7 +709,7 @@ public Map> executeRawInternal(QueryOptions options, Cli throw new IllegalStateException(); Selectors selectors = selection.newSelectors(options); - ReadQuery query = getQuery(options, state, selectors.getColumnFilter(), nowInSec, userLimit, userPerPartitionLimit, Integer.MAX_VALUE, null); + ReadQuery query = getQuery(options, state, selectors.getColumnFilter(), nowInSec, userLimit, userPerPartitionLimit, Integer.MAX_VALUE, null, PotentialTxnConflicts.ALLOW); Map> result = Collections.emptyMap(); try (ReadExecutionController executionController = query.executionController()) @@ -719,8 +777,13 @@ public StatementRestrictions getRestrictions() return restrictions; } + public boolean isPartitionRangeQuery() + { + return isForPartitionRange(restrictions); + } + private ReadQuery getSliceCommands(QueryOptions options, ClientState state, ColumnFilter columnFilter, - RowFilter rowFilter, DataLimits limit, long nowInSec) + RowFilter rowFilter, DataLimits limit, long nowInSec, PotentialTxnConflicts potentialTxnConflicts) { Collection keys = restrictions.getPartitionKeys(options, state); if (keys.isEmpty()) @@ -743,7 +806,7 @@ private ReadQuery getSliceCommands(QueryOptions options, ClientState state, Colu } SinglePartitionReadQuery.Group group = - SinglePartitionReadQuery.createGroup(table, nowInSec, columnFilter, rowFilter, limit, decoratedKeys, filter); + SinglePartitionReadQuery.createGroup(table, nowInSec, columnFilter, rowFilter, limit, decoratedKeys, filter, potentialTxnConflicts); // If there's a secondary index that the commands can use, have it validate the request parameters. group.maybeValidateIndex(); @@ -797,7 +860,7 @@ public RowFilter rowFilterForInternalCalls() } private ReadQuery getRangeCommand(QueryOptions options, ClientState state, ColumnFilter columnFilter, - RowFilter rowFilter, DataLimits limit, long nowInSec) + RowFilter rowFilter, DataLimits limit, long nowInSec, PotentialTxnConflicts potentialTxnConflicts) { ClusteringIndexFilter clusteringIndexFilter = makeClusteringIndexFilter(options, state, columnFilter); if (clusteringIndexFilter == null) @@ -810,7 +873,7 @@ private ReadQuery getRangeCommand(QueryOptions options, ClientState state, Colum return ReadQuery.empty(table); ReadQuery command = - PartitionRangeReadQuery.create(table, nowInSec, columnFilter, rowFilter, limit, new DataRange(keyBounds, clusteringIndexFilter)); + PartitionRangeReadQuery.create(table, nowInSec, columnFilter, rowFilter, limit, new DataRange(keyBounds, clusteringIndexFilter), potentialTxnConflicts); // If there's a secondary index that the command can use, have it validate the request parameters. command.maybeValidateIndex(); @@ -911,6 +974,11 @@ public int getLimit(QueryOptions options) return getLimit(limit, options); } + public boolean isLimitMarker() + { + return limit instanceof Marker; + } + /** * Returns the per partition limit specified by the user. * May be used by custom QueryHandler implementations @@ -1158,6 +1226,11 @@ private void orderResults(ResultSet cqlRows, QueryOptions options, ClientState s cqlRows.rows.sort(comparator); } + private static boolean isForPartitionRange(StatementRestrictions restrictions) + { + return restrictions.isKeyRange() || restrictions.usesSecondaryIndexing(); + } + public static class RawStatement extends QualifiedStatement { public final Parameters parameters; @@ -1166,13 +1239,15 @@ public static class RawStatement extends QualifiedStatement public final Term.Raw limit; public final Term.Raw perPartitionLimit; private ClientState state; + public final StatementSource source; public RawStatement(QualifiedName cfName, Parameters parameters, List selectClause, WhereClause whereClause, Term.Raw limit, - Term.Raw perPartitionLimit) + Term.Raw perPartitionLimit, + StatementSource source) { super(cfName); this.parameters = parameters; @@ -1180,16 +1255,35 @@ public RawStatement(QualifiedName cfName, this.whereClause = whereClause; this.limit = limit; this.perPartitionLimit = perPartitionLimit; + this.source = source; } public SelectStatement prepare(ClientState state) { // Cache locally for use by Guardrails this.state = state; - return prepare(state, false); + return prepare(state, false, bindVariables); + } + + public SelectStatement prepare(ClientState state, boolean forView) + { + return prepare(state, forView, bindVariables); } - public SelectStatement prepare(ClientState state, boolean forView) throws InvalidRequestException + public SelectStatement prepare(VariableSpecifications variableSpecifications) + { + return prepare(state, false, variableSpecifications); + } + + public SelectStatement prepare(boolean forView) + { + return prepare(state, forView, bindVariables); + } + + /** + * @throws InvalidRequestException if the statement being prepared is invalid + */ + public SelectStatement prepare(ClientState state, boolean forView, VariableSpecifications variableSpecifications) throws InvalidRequestException { TableMetadata table = Schema.instance.validateTable(keyspace(), name()); @@ -1197,7 +1291,7 @@ public SelectStatement prepare(ClientState state, boolean forView) throws Invali boolean containsOnlyStaticColumns = selectOnlyStaticColumns(table, selectables); List orderings = getOrderings(table); - StatementRestrictions restrictions = prepareRestrictions(state, table, bindVariables, orderings, containsOnlyStaticColumns, forView); + StatementRestrictions restrictions = prepareRestrictions(state, table, variableSpecifications, orderings, containsOnlyStaticColumns, forView); // If we order post-query, the sorted column needs to be in the ResultSet for sorting, // even if we don't ultimately ship them to the client (CASSANDRA-4911). @@ -1206,7 +1300,7 @@ public SelectStatement prepare(ClientState state, boolean forView) throws Invali Selection selection = prepareSelection(table, selectables, - bindVariables, + variableSpecifications, resultSetOrderingColumns, restrictions); @@ -1242,15 +1336,16 @@ public SelectStatement prepare(ClientState state, boolean forView) throws Invali checkNeedsFiltering(table, restrictions); return new SelectStatement(table, - bindVariables, + variableSpecifications, parameters, selection, restrictions, isReversed, aggregationSpecFactory, orderingComparator, - prepareLimit(bindVariables, limit, keyspace(), limitReceiver()), - prepareLimit(bindVariables, perPartitionLimit, keyspace(), perPartitionLimitReceiver())); + prepareLimit(variableSpecifications, limit, keyspace(), limitReceiver()), + prepareLimit(variableSpecifications, perPartitionLimit, keyspace(), perPartitionLimitReceiver()), + source); } private Set getResultSetOrdering(StatementRestrictions restrictions, Map orderingColumns) @@ -1581,7 +1676,7 @@ private boolean isReversed(TableMetadata table, Map or private void checkNeedsFiltering(TableMetadata table, StatementRestrictions restrictions) throws InvalidRequestException { // non-key-range non-indexed queries cannot involve filtering underneath - if (!parameters.allowFiltering && (restrictions.isKeyRange() || restrictions.usesSecondaryIndexing())) + if (!parameters.allowFiltering && isForPartitionRange(restrictions)) { // We will potentially filter data if the row filter is not the identity and there isn't any index group // supporting all the expressions in the filter. @@ -1620,18 +1715,30 @@ public static class Parameters public final boolean isDistinct; public final boolean allowFiltering; public final boolean isJson; + public final String refName; public Parameters(List orderings, List groups, boolean isDistinct, boolean allowFiltering, boolean isJson) + { + this(orderings, groups, isDistinct, allowFiltering, isJson, null); + } + + public Parameters(List orderings, + List groups, + boolean isDistinct, + boolean allowFiltering, + boolean isJson, + String refName) { this.orderings = orderings; this.groups = groups; this.isDistinct = isDistinct; this.allowFiltering = allowFiltering; this.isJson = isJson; + this.refName = refName; } } @@ -1780,7 +1887,7 @@ public String toString() private String loggableTokens(QueryOptions options, ClientState state) { - if (restrictions.isKeyRange() || restrictions.usesSecondaryIndexing()) + if (isPartitionRangeQuery()) { AbstractBounds bounds = restrictions.getPartitionKeyBounds(options); return "token range: " + (bounds.inclusiveLeft() ? '[' : '(') + @@ -1810,14 +1917,14 @@ private String loggableTokens(QueryOptions options, ClientState state) } } - private String asCQL(QueryOptions options, ClientState state) + public String asCQL(QueryOptions options, ClientState state) { ColumnFilter columnFilter = selection.newSelectors(options).getColumnFilter(); StringBuilder sb = new StringBuilder(); sb.append("SELECT ").append(queriedColumns().toCQLString()); sb.append(" FROM ").append(table.keyspace).append('.').append(table.name); - if (restrictions.isKeyRange() || restrictions.usesSecondaryIndexing()) + if (isPartitionRangeQuery()) { // partition range ClusteringIndexFilter clusteringIndexFilter = makeClusteringIndexFilter(options, state, columnFilter); diff --git a/src/java/org/apache/cassandra/cql3/statements/SingleTableSinglePartitionUpdatesCollector.java b/src/java/org/apache/cassandra/cql3/statements/SingleTableSinglePartitionUpdatesCollector.java index c650ef0370ea..eba1c27bfa38 100644 --- a/src/java/org/apache/cassandra/cql3/statements/SingleTableSinglePartitionUpdatesCollector.java +++ b/src/java/org/apache/cassandra/cql3/statements/SingleTableSinglePartitionUpdatesCollector.java @@ -25,6 +25,7 @@ import org.apache.cassandra.db.DecoratedKey; import org.apache.cassandra.db.IMutation; import org.apache.cassandra.db.Mutation; +import org.apache.cassandra.db.ReadCommand.PotentialTxnConflicts; import org.apache.cassandra.db.RegularAndStaticColumns; import org.apache.cassandra.db.commitlog.CommitLogSegment; import org.apache.cassandra.db.partitions.PartitionUpdate; @@ -78,16 +79,16 @@ public PartitionUpdate.Builder getPartitionUpdateBuilder(TableMetadata metadata, * Returns a collection containing all the mutations. */ @Override - public List toMutations(ClientState state) + public List toMutations(ClientState state, PotentialTxnConflicts potentialTxnConflicts) { // it is possible that a modification statement does not create any mutations // for example: DELETE FROM some_table WHERE part_key = 1 AND clust_key < 3 AND clust_key > 5 if (builder == null) return Collections.emptyList(); - return Collections.singletonList(createMutation(state, builder)); + return Collections.singletonList(createMutation(state, builder, potentialTxnConflicts)); } - private IMutation createMutation(ClientState state, PartitionUpdate.Builder builder) + private IMutation createMutation(ClientState state, PartitionUpdate.Builder builder, PotentialTxnConflicts potentialTxnConflicts) { IMutation mutation; @@ -96,7 +97,7 @@ private IMutation createMutation(ClientState state, PartitionUpdate.Builder buil else if (metadata.isCounter()) mutation = new CounterMutation(new Mutation(builder.build()), counterConsistencyLevel); else - mutation = new Mutation(builder.build()); + mutation = new Mutation(builder.build(), potentialTxnConflicts); mutation.validateIndexedColumns(state); mutation.validateSize(MessagingService.current_version, CommitLogSegment.ENTRY_OVERHEAD_SIZE); diff --git a/src/java/org/apache/cassandra/cql3/statements/SingleTableUpdatesCollector.java b/src/java/org/apache/cassandra/cql3/statements/SingleTableUpdatesCollector.java index 2da6b8918080..b2570e9d7005 100644 --- a/src/java/org/apache/cassandra/cql3/statements/SingleTableUpdatesCollector.java +++ b/src/java/org/apache/cassandra/cql3/statements/SingleTableUpdatesCollector.java @@ -31,6 +31,7 @@ import org.apache.cassandra.db.DecoratedKey; import org.apache.cassandra.db.IMutation; import org.apache.cassandra.db.Mutation; +import org.apache.cassandra.db.ReadCommand.PotentialTxnConflicts; import org.apache.cassandra.db.RegularAndStaticColumns; import org.apache.cassandra.db.commitlog.CommitLogSegment; import org.apache.cassandra.db.partitions.PartitionUpdate; @@ -95,24 +96,24 @@ public PartitionUpdate.Builder getPartitionUpdateBuilder(TableMetadata metadata, * @return a collection containing all the mutations. */ @Override - public List toMutations(ClientState state) + public List toMutations(ClientState state, PotentialTxnConflicts potentialTxnConflicts) { if (puBuilders.size() == 1) { PartitionUpdate.Builder builder = puBuilders.values().iterator().next(); - return Collections.singletonList(createMutation(state, builder)); + return Collections.singletonList(createMutation(state, builder, potentialTxnConflicts)); } List ms = new ArrayList<>(puBuilders.size()); for (PartitionUpdate.Builder builder : puBuilders.values()) { - IMutation mutation = createMutation(state, builder); + IMutation mutation = createMutation(state, builder, potentialTxnConflicts); ms.add(mutation); } return ms; } - private IMutation createMutation(ClientState state, PartitionUpdate.Builder builder) + private IMutation createMutation(ClientState state, PartitionUpdate.Builder builder, PotentialTxnConflicts potentialTxnConflicts) { IMutation mutation; @@ -121,7 +122,7 @@ private IMutation createMutation(ClientState state, PartitionUpdate.Builder buil else if (metadata.isCounter()) mutation = new CounterMutation(new Mutation(builder.build()), counterConsistencyLevel); else - mutation = new Mutation(builder.build()); + mutation = new Mutation(builder.build(), potentialTxnConflicts); mutation.validateIndexedColumns(state); mutation.validateSize(MessagingService.current_version, CommitLogSegment.ENTRY_OVERHEAD_SIZE); diff --git a/src/java/org/apache/cassandra/cql3/statements/TransactionStatement.java b/src/java/org/apache/cassandra/cql3/statements/TransactionStatement.java new file mode 100644 index 000000000000..f27c49cfcba6 --- /dev/null +++ b/src/java/org/apache/cassandra/cql3/statements/TransactionStatement.java @@ -0,0 +1,758 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.cql3.statements; + +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.SortedSet; +import java.util.stream.Collectors; +import java.util.stream.Stream; +import javax.annotation.Nullable; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Preconditions; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.Iterables; + +import accord.api.Key; +import accord.primitives.Keys; +import accord.primitives.Routable.Domain; +import accord.primitives.Txn; +import org.agrona.collections.Int2ObjectHashMap; +import org.apache.cassandra.audit.AuditLogContext; +import org.apache.cassandra.audit.AuditLogEntryType; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.cql3.CQLStatement; +import org.apache.cassandra.cql3.ColumnSpecification; +import org.apache.cassandra.cql3.QueryOptions; +import org.apache.cassandra.cql3.ResultSet; +import org.apache.cassandra.cql3.VariableSpecifications; +import org.apache.cassandra.cql3.selection.ResultSetBuilder; +import org.apache.cassandra.cql3.selection.Selection; +import org.apache.cassandra.cql3.transactions.ConditionStatement; +import org.apache.cassandra.cql3.transactions.ReferenceOperation; +import org.apache.cassandra.cql3.transactions.RowDataReference; +import org.apache.cassandra.cql3.transactions.SelectReferenceSource; +import org.apache.cassandra.db.ConsistencyLevel; +import org.apache.cassandra.db.SinglePartitionReadCommand; +import org.apache.cassandra.db.SinglePartitionReadQuery; +import org.apache.cassandra.db.filter.DataLimits; +import org.apache.cassandra.db.marshal.AbstractType; +import org.apache.cassandra.dht.Token; +import org.apache.cassandra.exceptions.InvalidRequestException; +import org.apache.cassandra.schema.ColumnMetadata; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.schema.TableParams; +import org.apache.cassandra.service.ClientState; +import org.apache.cassandra.service.QueryState; +import org.apache.cassandra.service.accord.AccordService; +import org.apache.cassandra.service.accord.api.PartitionKey; +import org.apache.cassandra.service.accord.serializers.TableMetadatas; +import org.apache.cassandra.service.accord.serializers.TableMetadatasAndKeys; +import org.apache.cassandra.service.accord.txn.AccordUpdate; +import org.apache.cassandra.service.accord.txn.TxnCondition; +import org.apache.cassandra.service.accord.txn.TxnData; +import org.apache.cassandra.service.accord.txn.TxnDataKeyValue; +import org.apache.cassandra.service.accord.txn.TxnNamedRead; +import org.apache.cassandra.service.accord.txn.TxnQuery; +import org.apache.cassandra.service.accord.txn.TxnRead; +import org.apache.cassandra.service.accord.txn.TxnReference; +import org.apache.cassandra.service.accord.txn.TxnResult; +import org.apache.cassandra.service.accord.txn.TxnUpdate; +import org.apache.cassandra.service.accord.txn.TxnWrite; +import org.apache.cassandra.service.consensus.TransactionalMode; +import org.apache.cassandra.service.consensus.migration.TransactionalMigrationFromMode; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.tcm.Epoch; +import org.apache.cassandra.transport.Dispatcher; +import org.apache.cassandra.transport.messages.ResultMessage; +import org.apache.cassandra.utils.FBUtilities; + +import static accord.primitives.Txn.Kind.Read; +import static com.google.common.base.Preconditions.checkArgument; +import static org.apache.cassandra.cql3.statements.RequestValidations.checkFalse; +import static org.apache.cassandra.cql3.statements.RequestValidations.checkNotNull; +import static org.apache.cassandra.cql3.statements.RequestValidations.checkTrue; +import static org.apache.cassandra.cql3.statements.RequestValidations.invalidRequest; +import static org.apache.cassandra.service.accord.txn.TxnData.TxnDataNameKind.AUTO_READ; +import static org.apache.cassandra.service.accord.txn.TxnData.TxnDataNameKind.RETURNING; +import static org.apache.cassandra.service.accord.txn.TxnData.TxnDataNameKind.USER; +import static org.apache.cassandra.service.accord.txn.TxnData.txnDataName; +import static org.apache.cassandra.service.accord.txn.TxnRead.createTxnRead; +import static org.apache.cassandra.service.accord.txn.TxnResult.Kind.retry_new_protocol; +import static org.apache.cassandra.service.consensus.migration.ConsensusRequestRouter.shouldReadEphemerally; + +public class TransactionStatement implements CQLStatement.CompositeCQLStatement, CQLStatement.ReturningCQLStatement +{ + public static final String DUPLICATE_TUPLE_NAME_MESSAGE = "The name '%s' has already been used by a LET assignment."; + public static final String INCOMPLETE_PARTITION_KEY_SELECT_MESSAGE = "SELECT must specify either all partition key elements. Partition key elements must be always specified with equality operators; %s %s"; + public static final String INCOMPLETE_PRIMARY_KEY_SELECT_MESSAGE = "SELECT must specify either all primary key elements or all partition key elements and LIMIT 1. In both cases partition key elements must be always specified with equality operators; %s %s"; + public static final String NO_CONDITIONS_IN_UPDATES_MESSAGE = "Updates within transactions may not specify their own conditions; %s statement %s"; + public static final String NO_TIMESTAMPS_IN_UPDATES_MESSAGE = "Updates within transactions may not specify custom timestamps; %s statement %s"; + public static final String NO_TTLS_IN_UPDATES_MESSAGE = "Updates within transactions may not specify custom ttls; %s statement %s"; + public static final String TRANSACTIONS_DISABLED_ON_TABLE_MESSAGE = "Accord transactions are disabled on table (See transactional_mode in table options); %s statement %s"; + public static final String TRANSACTIONS_DISABLED_ON_TABLE_BEING_DROPPED_MESSAGE = "Accord transactions are disabled on table (table is being dropped); %s statement %s"; + public static final String NO_COUNTERS_IN_TXNS_MESSAGE = "Counter columns cannot be accessed within a transaction; %s statement %s"; + public static final String NO_AGGREGATION_IN_TXNS_MESSAGE = "No aggregation functions allowed within a transaction; %s statement %s"; + public static final String NO_ORDER_BY_IN_TXNS_MESSAGE = "No ORDER BY clause allowed within a transaction; %s statement %s"; + public static final String NO_GROUP_BY_IN_TXNS_MESSAGE = "No GROUP BY clause allowed within a transaction; %s statement %s"; + public static final String EMPTY_TRANSACTION_MESSAGE = "Transaction contains no reads or writes"; + public static final String SELECT_REFS_NEED_COLUMN_MESSAGE = "SELECT references must specify a column."; + public static final String TRANSACTIONS_DISABLED_MESSAGE = "Accord transactions are disabled. (See accord.enabled in cassandra.yaml)"; + public static final String ILLEGAL_RANGE_QUERY_MESSAGE = "Range queries are not allowed for reads within a transaction; %s %s"; + public static final String UNSUPPORTED_MIGRATION = "Transaction Statement is unsupported when migrating away from Accord or before migration to Accord is complete for a range"; + public static final String NO_PARTITION_IN_CLAUSE_WITH_LIMIT = "Partition key is present in IN clause and there is a LIMIT... this is currently not supported; %s statement %s"; + + static class NamedSelect + { + final int name; + final SelectStatement select; + + public NamedSelect(int name, SelectStatement select) + { + this.name = name; + this.select = select; + } + } + + private final List assignments; + private final NamedSelect returningSelect; + private final List returningReferences; + private final List updates; + private final List conditions; + + private final VariableSpecifications bindVariables; + private final ResultSet.ResultMetadata resultMetadata; + + private long minEpoch = Epoch.EMPTY.getEpoch(); + + public TransactionStatement(List assignments, + NamedSelect returningSelect, + List returningReferences, + List updates, + List conditions, + VariableSpecifications bindVariables) + { + this.assignments = assignments; + this.returningSelect = returningSelect; + this.returningReferences = returningReferences; + this.updates = updates; + this.conditions = conditions; + this.bindVariables = bindVariables; + + if (returningSelect != null) + { + resultMetadata = returningSelect.select.getResultMetadata(); + } + else if (returningReferences != null && !returningReferences.isEmpty()) + { + List names = new ArrayList<>(returningReferences.size()); + for (RowDataReference reference : returningReferences) + names.add(reference.toResultMetadata()); + resultMetadata = new ResultSet.ResultMetadata(names); + } + else + { + resultMetadata = ResultSet.ResultMetadata.EMPTY; + } + } + + public List getUpdates() + { + return updates; + } + + @Override + public ImmutableList getBindVariables() + { + return bindVariables.getImmutableBindVariables(); + } + + @Override + public void authorize(ClientState state) + { + // Assess read permissions for all data from both explicit LET statements and generated reads. + for (NamedSelect let : assignments) + let.select.authorize(state); + + if (returningSelect != null) + returningSelect.select.authorize(state); + + for (ModificationStatement update : updates) + update.authorize(state); + } + + @Override + public void validate(ClientState state) + { + for (NamedSelect statement : assignments) + statement.select.validate(state); + if (returningSelect != null) + returningSelect.select.validate(state); + for (ModificationStatement statement : updates) + statement.validate(state); + } + + @Override + public Iterable getStatements() + { + return () -> { + Stream stream = assignments.stream().map(n -> n.select); + if (returningSelect != null) + stream = Stream.concat(stream, Stream.of(returningSelect.select)); + stream = Stream.concat(stream, updates.stream()); + return stream.iterator(); + }; + } + + @Override + public ResultSet.ResultMetadata getResultMetadata() + { + return resultMetadata; + } + + TxnNamedRead createNamedRead(NamedSelect namedSelect, QueryOptions options, TableMetadatasAndKeys.KeyCollector keyCollector) + { + SelectStatement select = namedSelect.select; + // We reject reads from both LET and SELECT that do not specify a single row. + @SuppressWarnings("unchecked") + SinglePartitionReadQuery.Group selectQuery = (SinglePartitionReadQuery.Group) select.getQuery(options, 0); + + if (selectQuery.queries.size() != 1) + throw new IllegalArgumentException("Within a transaction, SELECT statements must select a single partition; found " + selectQuery.queries.size() + " partitions"); + + SinglePartitionReadCommand command = Iterables.getOnlyElement(selectQuery.queries); + return new TxnNamedRead(namedSelect.name, keyCollector.collect(command.metadata(), command.partitionKey()), command, keyCollector.tables); + } + + List createNamedReads(NamedSelect namedSelect, QueryOptions options, TableMetadatasAndKeys.KeyCollector keyCollector) + { + SelectStatement select = namedSelect.select; + // We reject reads from both LET and SELECT that do not specify a single row. + @SuppressWarnings("unchecked") + SinglePartitionReadQuery.Group selectQuery = (SinglePartitionReadQuery.Group) select.getQuery(options, 0); + + if (selectQuery.queries.size() == 1) + return Collections.singletonList(new TxnNamedRead(namedSelect.name, keyCollector.collect(select.table, selectQuery.queries.get(0).partitionKey()), selectQuery.queries.get(0), keyCollector.tables)); + + List list = new ArrayList<>(selectQuery.queries.size()); + for (int i = 0; i < selectQuery.queries.size(); i++) + { + SinglePartitionReadCommand readCommand = selectQuery.queries.get(i); + list.add(new TxnNamedRead(txnDataName(RETURNING, i), keyCollector.collect(readCommand.metadata(), readCommand.partitionKey()), readCommand, keyCollector.tables)); + } + return list; + } + + private List createNamedReads(QueryOptions options, @Nullable Int2ObjectHashMap autoReads, TableMetadatasAndKeys.KeyCollector keyCollector) + { + List reads = new ArrayList<>(assignments.size() + 1); + + for (NamedSelect select : assignments) + { + TxnNamedRead read = createNamedRead(select, options, keyCollector); + minEpoch = Math.max(minEpoch, select.select.table.epoch.getEpoch()); + reads.add(read); + } + + if (returningSelect != null) + { + for (TxnNamedRead read : createNamedReads(returningSelect, options, keyCollector)) + { + minEpoch = Math.max(minEpoch, returningSelect.select.table.epoch.getEpoch()); + reads.add(read); + } + } + + if (autoReads != null) + { + for (NamedSelect select : autoReads.values()) + { + TxnNamedRead read = createNamedRead(select, options, keyCollector); + reads.add(read); + } + } + + return reads; + } + + TxnCondition createCondition(QueryOptions options) + { + if (conditions.isEmpty()) + return TxnCondition.none(); + if (conditions.size() == 1) + return conditions.get(0).createCondition(options); + + List result = new ArrayList<>(conditions.size()); + for (ConditionStatement condition : conditions) + result.add(condition.createCondition(options)); + + // TODO: OR support + return new TxnCondition.BooleanGroup(TxnCondition.Kind.AND, result); + } + + TableMetadatas.Complete collectTables() + { + TableMetadatas.Collector collector = new TableMetadatas.Collector(); + if (updates != null) + { + for (ModificationStatement modification : updates) + collector.add(modification.metadata); + } + if (assignments != null) + { + for (NamedSelect select : assignments) + collector.add(select.select.table); + } + if (returningSelect != null) + { + collector.add(returningSelect.select.table); + } + if (returningReferences != null) + { + for (RowDataReference ref : returningReferences) + collector.add(ref.table()); + } + return collector.build(); + } + + private Keys toKeys(SortedSet keySet) + { + return new Keys(keySet); + } + + List createWriteFragments(ClientState state, QueryOptions options, Map autoReads, TableMetadatasAndKeys.KeyCollector keyCollector) + { + List fragments = new ArrayList<>(updates.size()); + int idx = 0; + for (ModificationStatement modification : updates) + { + TxnWrite.Fragment fragment = modification.getTxnWriteFragment(idx, state, options, keyCollector); + minEpoch = Math.max(minEpoch, fragment.baseUpdate.metadata().epoch.getEpoch()); + fragments.add(fragment); + + if (modification.allReferenceOperations().stream().anyMatch(ReferenceOperation::requiresRead)) + { + // Reads are not merged by partition here due to potentially differing columns retrieved, etc. + int partitionName = txnDataName(AUTO_READ, idx); + if (!autoReads.containsKey(partitionName)) + autoReads.put(partitionName, new NamedSelect(partitionName, modification.createSelectForTxn())); + } + + idx++; + } + return fragments; + } + + private ConsistencyLevel consistencyLevelForAccordRead(ClusterMetadata cm, TableMetadatas.Complete tables, Keys keys, @Nullable ConsistencyLevel consistencyLevel) + { + // Write transactions are read/write so it creates a read and ends up needing a consistency level + // which is fine to leave null + if (keys.isEmpty()) + return null; + + // Null means no specific consistency behavior is required from Accord, it's functionally similar to + // reading at ONE if you are reading data that wasn't written via Accord + if (consistencyLevel == null) + return null; + + for (Key key : keys) + { + // readCLForMode should return either null or the supplied consistency level + // in which case we will read everything at that CL since Accord doesn't support per table + // read consistency + ConsistencyLevel readCL = consistencyLevelForAccordRead(cm, tables, key, consistencyLevel); + if (readCL != null) + return readCL; + } + return null; + } + + private ConsistencyLevel consistencyLevelForAccordRead(ClusterMetadata cm, TableMetadatas.Complete tables, Key key, ConsistencyLevel consistencyLevel) + { + // Null means no specific consistency behavior is required from Accord, it's functionally similar to + // reading at ONE if you are reading data that wasn't written via Accord + if (consistencyLevel == null) + return null; + + PartitionKey pk = (PartitionKey)key; + TableId tableId = pk.table(); + Token token = pk.token(); + TableParams tableParams = tables.getMetadata(tableId).params; + TransactionalMode mode = tableParams.transactionalMode; + TransactionalMigrationFromMode migrationFromMode = tableParams.transactionalMigrationFrom; + return mode.readCLForMode(migrationFromMode, consistencyLevel, cm, tableId, token); + } + + private static ConsistencyLevel consistencyLevelForAccordCommit(ClusterMetadata cm, TableMetadatas.Complete tables, TableMetadatasAndKeys.KeyCollector keys, @Nullable ConsistencyLevel consistencyLevel) + { + checkArgument(!keys.isEmpty(), "keys should not be empty"); + // Null means no specific consistency behavior is required from Accord, it's functionally similar to ANY + // if you aren't reading the result back via Accord + if (consistencyLevel == null) + return null; + + for (Key key : keys) + { + // commitCLForMode should return either null or the supplied consistency level + // in which case we will commit everything at that CL since Accord doesn't support per table + // commit consistency + ConsistencyLevel commitCL = consistencyLevelForAccordCommit(cm, tables, key, consistencyLevel); + if (commitCL != null) + return commitCL; + } + return null; + } + + private static ConsistencyLevel consistencyLevelForAccordCommit(ClusterMetadata cm, TableMetadatas.Complete tables, Key key, @Nullable ConsistencyLevel consistencyLevel) + { + // Null means no specific consistency behavior is required from Accord, it's functionally similar to ANY + // if you aren't reading the result back via Accord + if (consistencyLevel == null) + return null; + + PartitionKey pk = (PartitionKey)key; + TableId tableId = pk.table(); + Token token = pk.token(); + TableParams tableParams = tables.getMetadata(tableId).params; + TransactionalMode mode = tableParams.transactionalMode; + TransactionalMigrationFromMode migrationFromMode = tableParams.transactionalMigrationFrom; + // commitCLForMode should return either null or the supplied consistency level + // in which case we will commit everything at that CL since Accord doesn't support per table + // commit consistency + return mode.commitCLForMode(migrationFromMode, consistencyLevel, cm, tableId, token); + } + + @VisibleForTesting + public Txn createTxn(ClientState state, QueryOptions options) + { + ClusterMetadata cm = ClusterMetadata.current(); + TableMetadatas.Complete tables = collectTables(); + TableMetadatasAndKeys.KeyCollector keyCollector = new TableMetadatasAndKeys.KeyCollector(tables); + + if (updates.isEmpty()) + { + // TODO: Test case around this... + Preconditions.checkState(conditions.isEmpty(), "No condition should exist without updates present"); + List reads = createNamedReads(options, null, keyCollector); + Keys keys = keyCollector.build(); + TxnRead read = createTxnRead(tables, reads, consistencyLevelForAccordRead(cm, tables, keys, options.getSerialConsistency()), Domain.Key); + Txn.Kind kind = shouldReadEphemerally(keys, tables.getMetadata((TableId)keys.get(0).prefix()).params, Read); + return new Txn.InMemory(kind, keys, read, TxnQuery.ALL, null, new TableMetadatasAndKeys(tables, keys)); + } + else + { + Int2ObjectHashMap autoReads = new Int2ObjectHashMap<>(); + List writeFragments = createWriteFragments(state, options, autoReads, keyCollector); + ConsistencyLevel commitCL = consistencyLevelForAccordCommit(cm, tables, keyCollector, options.getConsistency()); + List reads = createNamedReads(options, autoReads, keyCollector); + Keys keys = keyCollector.build(); + AccordUpdate update = new TxnUpdate(tables, writeFragments, createCondition(options), commitCL, false); + TxnRead read = createTxnRead(tables, reads, null, Domain.Key); + return new Txn.InMemory(keys, read, TxnQuery.ALL, update, new TableMetadatasAndKeys(tables, keys)); + } + } + + /** + * Returns {@code true} only if the statement selects multiple clusterings in a partition + */ + private static boolean isSelectingMultipleClusterings(SelectStatement select, @Nullable QueryOptions options) + { + if (select.getRestrictions().hasAllPrimaryKeyColumnsRestrictedByEqualities()) + return false; + + if (options == null) + { + // if the limit is a non-terminal marker (because we're preparing), defer validation until execution (when options != null) + if (select.isLimitMarker()) + return false; + + options = QueryOptions.DEFAULT; + } + + return select.getLimit(options) != 1; + } + + @Override + public ResultMessage execute(QueryState state, QueryOptions options, Dispatcher.RequestTime requestTime) + { + checkTrue(DatabaseDescriptor.getAccordTransactionsEnabled(), TRANSACTIONS_DISABLED_MESSAGE); + + // check again since now we have query options; note that statements are quaranted to be single partition reads at this point + for (NamedSelect assignment : assignments) + { + checkFalse(isSelectingMultipleClusterings(assignment.select, options), INCOMPLETE_PRIMARY_KEY_SELECT_MESSAGE, "LET assignment", assignment.select.source); + if (assignment.select.getRestrictions().keyIsInRelation()) + checkTrue(assignment.select.getLimit(options) == DataLimits.NO_LIMIT, NO_PARTITION_IN_CLAUSE_WITH_LIMIT, "SELECT", assignment.select.source); + } + if (returningSelect != null && returningSelect.select.getRestrictions().keyIsInRelation()) + { + checkTrue(returningSelect.select.getLimit(options) == DataLimits.NO_LIMIT, NO_PARTITION_IN_CLAUSE_WITH_LIMIT, "SELECT", returningSelect.select.source); + } + + Txn txn = createTxn(state.getClientState(), options); + + TxnResult txnResult = AccordService.instance().coordinate(minEpoch, txn, options.getConsistency(), requestTime); + if (txnResult.kind() == retry_new_protocol) + throw new InvalidRequestException(UNSUPPORTED_MIGRATION); + TxnData data = (TxnData)txnResult; + + if (returningSelect != null) + { + @SuppressWarnings("unchecked") + SinglePartitionReadQuery.Group selectQuery = (SinglePartitionReadQuery.Group) returningSelect.select.getQuery(options, 0); + Selection.Selectors selectors = returningSelect.select.getSelection().newSelectors(options); + ResultSetBuilder result = new ResultSetBuilder(resultMetadata, selectors, false); + if (selectQuery.queries.size() == 1) + { + TxnDataKeyValue partition = (TxnDataKeyValue)data.get(txnDataName(RETURNING)); + boolean reversed = selectQuery.queries.get(0).isReversed(); + if (partition != null) + returningSelect.select.processPartition(partition.rowIterator(reversed), options, result, FBUtilities.nowInSeconds()); + } + else + { + long nowInSec = FBUtilities.nowInSeconds(); + for (int i = 0; i < selectQuery.queries.size(); i++) + { + TxnDataKeyValue partition = (TxnDataKeyValue)data.get(txnDataName(RETURNING, i)); + boolean reversed = selectQuery.queries.get(i).isReversed(); + if (partition != null) + returningSelect.select.processPartition(partition.rowIterator(reversed), options, result, nowInSec); + } + } + return new ResultMessage.Rows(result.build()); + } + + if (returningReferences != null) + { + List> resultType = new ArrayList<>(returningReferences.size()); + List columns = new ArrayList<>(returningReferences.size()); + + for (RowDataReference reference : returningReferences) + { + ColumnMetadata forMetadata = reference.toResultMetadata(); + resultType.add(forMetadata.type); + columns.add(reference.column()); + } + + ResultSetBuilder result = new ResultSetBuilder(resultMetadata, Selection.noopSelector(), false); + result.newRow(options.getProtocolVersion(), null, null, columns); + + for (int i = 0; i < returningReferences.size(); i++) + { + RowDataReference reference = returningReferences.get(i); + TxnReference txnReference = reference.toTxnReference(options); + ByteBuffer buffer = txnReference.toByteBuffer(data, resultType.get(i)); + result.add(buffer); + } + + return new ResultMessage.Rows(result.build()); + } + + // In the case of a write-only transaction, just return and empty result. + // TODO: This could be modified to return an indication of whether a condition (if present) succeeds. + return new ResultMessage.Void(); + } + + @Override + public ResultMessage executeLocally(QueryState state, QueryOptions options) + { + return execute(state, options, Dispatcher.RequestTime.forImmediateExecution()); + } + + @Override + public AuditLogContext getAuditLogContext() + { + return new AuditLogContext(AuditLogEntryType.TRANSACTION); + } + + @Override + public boolean eligibleAsPreparedStatement() + { + // false is the default, but still best to be explicit. + return false; + } + + private static void validate(SelectStatement.RawStatement select) + { + if (select.parameters.orderings != null && !select.parameters.orderings.isEmpty()) + throw invalidRequest(NO_ORDER_BY_IN_TXNS_MESSAGE, "SELECT", select.source); + if (select.parameters.groups != null && !select.parameters.groups.isEmpty()) + throw invalidRequest(NO_GROUP_BY_IN_TXNS_MESSAGE, "SELECT", select.source); + } + + private static void validate(SelectStatement prepared) + { + if (!prepared.table.isAccordEnabled()) + throw invalidRequest(TRANSACTIONS_DISABLED_ON_TABLE_MESSAGE, "SELECT", prepared.source); + if (prepared.table.params.pendingDrop) + throw invalidRequest(TRANSACTIONS_DISABLED_ON_TABLE_BEING_DROPPED_MESSAGE, "SELECT", prepared.source); + if (prepared.table.isCounter()) + throw invalidRequest(NO_COUNTERS_IN_TXNS_MESSAGE, "SELECT", prepared.source); + if (prepared.hasAggregation()) + throw invalidRequest(NO_AGGREGATION_IN_TXNS_MESSAGE, "SELECT", prepared.source); + + // when "LIMIT ?" this check can't be performed, so need to do again once the options are known + if (prepared.getRestrictions().keyIsInRelation()) + checkTrue(prepared.isLimitMarker() || prepared.getLimit(null) == DataLimits.NO_LIMIT, NO_PARTITION_IN_CLAUSE_WITH_LIMIT, "SELECT", prepared.source); + } + + public static class Parsed extends QualifiedStatement.Composite + { + private final List assignments; + private final SelectStatement.RawStatement select; + private final List returning; + private final List updates; + private final List conditions; + private final List dataReferences; + + public Parsed(List assignments, + SelectStatement.RawStatement select, + List returning, + List updates, + List conditions, + List dataReferences) + { + this.assignments = assignments; + this.select = select; + this.returning = returning; + this.updates = updates; + this.conditions = conditions != null ? conditions : Collections.emptyList(); + this.dataReferences = dataReferences; + } + + @Override + protected Iterable getStatements() + { + Iterable group = Iterables.concat(assignments, updates); + if (select != null) + group = Iterables.concat(group, Collections.singleton(select)); + return group; + } + + @Override + public CQLStatement prepare(ClientState state) + { + checkFalse(updates.isEmpty() && returning == null && select == null, EMPTY_TRANSACTION_MESSAGE); + + if (select != null || returning != null) + checkTrue(select != null ^ returning != null, "Cannot specify both a full SELECT and a SELECT w/ LET references."); + + List preparedAssignments = new ArrayList<>(assignments.size()); + Map refSources = new HashMap<>(); + Set selectNames = new HashSet<>(); + + int userReadIndex = 0; + Map nameToTxnDataName = new HashMap<>(); + for (SelectStatement.RawStatement select : assignments) + { + checkNotNull(select.parameters.refName, "Assignments must be named"); + int name = txnDataName(USER, userReadIndex++); + nameToTxnDataName.put(select.parameters.refName, name); + checkTrue(selectNames.add(select.parameters.refName), DUPLICATE_TUPLE_NAME_MESSAGE, select.parameters.refName); + validate(select); + + SelectStatement prepared = select.prepare(bindVariables); + validate(prepared); + + NamedSelect namedSelect = new NamedSelect(name, prepared); + checkAtMostOneRowSpecified(namedSelect.select, "LET assignment " + select.parameters.refName); + preparedAssignments.add(namedSelect); + refSources.put(name, new SelectReferenceSource(prepared)); + } + + if (dataReferences != null) + for (RowDataReference.Raw reference : dataReferences) + reference.resolveReference(refSources, nameToTxnDataName, userReadIndex++); + + NamedSelect returningSelect = null; + if (select != null) + { + validate(select); + SelectStatement prepared = select.prepare(bindVariables); + validate(prepared); + returningSelect = new NamedSelect(txnDataName(RETURNING), prepared); + checkAtMostOnePartitionSpecified(returningSelect.select, "returning select"); + } + + List returningReferences = null; + + if (returning != null) + { + // TODO: Eliminate/modify this check if we allow full tuple selections. + returningReferences = returning.stream().peek(raw -> checkTrue(raw.column() != null, SELECT_REFS_NEED_COLUMN_MESSAGE)) + .map(RowDataReference.Raw::prepareAsReceiver) + .collect(Collectors.toList()); + } + + List preparedUpdates = new ArrayList<>(updates.size()); + + // check for any read-before-write updates + for (int i = 0; i < updates.size(); i++) + { + ModificationStatement.Parsed parsed = updates.get(i); + + ModificationStatement prepared = parsed.prepare(state, bindVariables); + checkTrue(prepared.metadata().isAccordEnabled(), TRANSACTIONS_DISABLED_ON_TABLE_MESSAGE, prepared.type, prepared.source); + checkFalse(prepared.metadata().params.pendingDrop, TRANSACTIONS_DISABLED_ON_TABLE_BEING_DROPPED_MESSAGE, prepared.type, prepared.source); + checkFalse(prepared.hasConditions(), NO_CONDITIONS_IN_UPDATES_MESSAGE, prepared.type, prepared.source); + checkFalse(prepared.isTimestampSet(), NO_TIMESTAMPS_IN_UPDATES_MESSAGE, prepared.type, prepared.source); + checkFalse(prepared.attrs.isTimeToLiveSet(), NO_TTLS_IN_UPDATES_MESSAGE, prepared.type, prepared.source); + + if (prepared.metadata().isCounter()) + throw invalidRequest(NO_COUNTERS_IN_TXNS_MESSAGE, prepared.type, prepared.source); + + preparedUpdates.add(prepared); + } + + List preparedConditions = new ArrayList<>(conditions.size()); + for (ConditionStatement.Raw condition : conditions) + // TODO: If we eventually support IF ks.function(ref) THEN, the keyspace will have to be provided here + preparedConditions.add(condition.prepare("[txn]", bindVariables)); + + return new TransactionStatement(preparedAssignments, returningSelect, returningReferences, preparedUpdates, preparedConditions, bindVariables); + } + + /** + * Do not use this method in execution!!! It is only allowed during prepare because it outputs a query raw text. + * We don't want it print it for a user who provided an identifier of someone's else prepared statement. + */ + private static void checkAtMostOnePartitionSpecified(SelectStatement select, String name) + { + checkTrue(select.getRestrictions().hasPartitionKeyRestrictions(), INCOMPLETE_PARTITION_KEY_SELECT_MESSAGE, name, select.source); + } + + /** + * Do not use this method in execution!!! It is only allowed during prepare because it outputs a query raw text. + * We don't want it print it for a user who provided an identifier of someone's else prepared statement. + */ + private static void checkAtMostOneRowSpecified(SelectStatement select, String name) + { + checkFalse(select.isPartitionRangeQuery(), ILLEGAL_RANGE_QUERY_MESSAGE, name, select.source); + checkFalse(isSelectingMultipleClusterings(select, null), INCOMPLETE_PRIMARY_KEY_SELECT_MESSAGE, name, select.source); + } + } +} diff --git a/src/java/org/apache/cassandra/cql3/statements/UpdateStatement.java b/src/java/org/apache/cassandra/cql3/statements/UpdateStatement.java index d8310cdd2f8d..c48483d0ba0c 100644 --- a/src/java/org/apache/cassandra/cql3/statements/UpdateStatement.java +++ b/src/java/org/apache/cassandra/cql3/statements/UpdateStatement.java @@ -18,19 +18,37 @@ package org.apache.cassandra.cql3.statements; import java.nio.ByteBuffer; +import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.List; +import com.google.common.base.Preconditions; +import org.apache.commons.lang3.builder.ToStringBuilder; +import org.apache.commons.lang3.builder.ToStringStyle; + import org.apache.cassandra.audit.AuditLogContext; import org.apache.cassandra.audit.AuditLogEntryType; -import org.apache.cassandra.cql3.*; +import org.apache.cassandra.cql3.Attributes; +import org.apache.cassandra.cql3.ColumnIdentifier; +import org.apache.cassandra.cql3.Relation; +import org.apache.cassandra.cql3.terms.Constants; +import org.apache.cassandra.cql3.Json; +import org.apache.cassandra.cql3.Operation; +import org.apache.cassandra.cql3.Operations; +import org.apache.cassandra.cql3.Operator; +import org.apache.cassandra.cql3.QualifiedName; +import org.apache.cassandra.cql3.StatementSource; +import org.apache.cassandra.cql3.terms.Term; +import org.apache.cassandra.cql3.UpdateParameters; +import org.apache.cassandra.cql3.VariableSpecifications; +import org.apache.cassandra.cql3.WhereClause; import org.apache.cassandra.cql3.conditions.ColumnCondition; import org.apache.cassandra.cql3.conditions.Conditions; import org.apache.cassandra.cql3.constraints.ColumnConstraint; import org.apache.cassandra.cql3.restrictions.StatementRestrictions; -import org.apache.cassandra.cql3.terms.Constants; -import org.apache.cassandra.cql3.terms.Term; +import org.apache.cassandra.cql3.transactions.ReferenceOperation; +import org.apache.cassandra.cql3.transactions.ReferenceValue; import org.apache.cassandra.db.Clustering; import org.apache.cassandra.db.Slice; import org.apache.cassandra.db.partitions.PartitionUpdate; @@ -39,10 +57,9 @@ import org.apache.cassandra.schema.ColumnMetadata; import org.apache.cassandra.schema.TableMetadata; import org.apache.cassandra.service.ClientState; +import org.apache.cassandra.service.accord.txn.TxnReferenceOperation; import org.apache.cassandra.utils.ByteBufferUtil; import org.apache.cassandra.utils.Pair; -import org.apache.commons.lang3.builder.ToStringBuilder; -import org.apache.commons.lang3.builder.ToStringStyle; import static org.apache.cassandra.cql3.statements.RequestValidations.checkContainsNoDuplicates; import static org.apache.cassandra.cql3.statements.RequestValidations.checkFalse; @@ -53,6 +70,9 @@ */ public class UpdateStatement extends ModificationStatement { + public static final String UPDATING_PRIMARY_KEY_MESSAGE = "PRIMARY KEY part %s found in SET part"; + public static final String CANNOT_SET_KEY_WITH_REFERENCE_MESSAGE = "Value reference %s cannot be used to insert PRIMARY KEY column %s"; + private static final Constants.Value EMPTY = new Constants.Value(ByteBufferUtil.EMPTY_BYTE_BUFFER); private UpdateStatement(StatementType type, @@ -61,9 +81,16 @@ private UpdateStatement(StatementType type, Operations operations, StatementRestrictions restrictions, Conditions conditions, - Attributes attrs) + Attributes attrs, + StatementSource source) + { + super(type, bindVariables, metadata, operations, restrictions, conditions, attrs, source); + } + + @Override + protected ModificationStatement withOperations(Operations operations) { - super(type, bindVariables, metadata, operations, restrictions, conditions, attrs); + return new UpdateStatement(type, bindVariables, metadata, operations, restrictions, conditions, attrs, source); } @Override @@ -122,6 +149,7 @@ public static class ParsedInsert extends ModificationStatement.Parsed { private final List columnNames; private final List columnValues; + private final boolean isForTxn; /** * A parsed INSERT statement. @@ -136,11 +164,14 @@ public ParsedInsert(QualifiedName name, Attributes.Raw attrs, List columnNames, List columnValues, - boolean ifNotExists) + boolean ifNotExists, + StatementSource source, + boolean isForTxn) { - super(name, StatementType.INSERT, attrs, null, ifNotExists, false); + super(name, StatementType.INSERT, attrs, null, ifNotExists, false, source); this.columnNames = columnNames; this.columnValues = columnValues; + this.isForTxn = isForTxn; } @Override @@ -160,7 +191,7 @@ protected ModificationStatement prepareInternal(ClientState state, checkContainsNoDuplicates(columnNames, "The column names contains duplicates"); WhereClause.Builder whereClause = new WhereClause.Builder(); - Operations operations = new Operations(type); + Operations operations = new Operations(type, isForTxn); boolean hasClusteringColumnsSet = false; for (int i = 0; i < columnNames.size(); i++) @@ -174,13 +205,21 @@ protected ModificationStatement prepareInternal(ClientState state, if (def.isPrimaryKeyColumn()) { + checkFalse(value instanceof ReferenceValue.Raw, String.format(CANNOT_SET_KEY_WITH_REFERENCE_MESSAGE, value, def)); whereClause.add(Relation.singleColumn(columnNames.get(i), Operator.EQ, value)); } + else if (value instanceof ReferenceValue.Raw) + { + ReferenceValue.Raw raw = (ReferenceValue.Raw) value; + ReferenceValue referenceValue = raw.prepare(def, bindVariables); + ReferenceOperation operation = new ReferenceOperation(def, metadata, TxnReferenceOperation.Kind.setterFor(def), null, null, referenceValue); + operations.add(def, operation); + } else { Operation operation = new Operation.SetValue(value).prepare(metadata, def, !conditions.isEmpty()); operation.collectMarkerSpecification(bindVariables); - operations.add(operation); + operations.add(operation, metadata); } } @@ -202,7 +241,8 @@ protected ModificationStatement prepareInternal(ClientState state, operations, restrictions, conditions, - attrs); + attrs, + source); } } @@ -213,12 +253,14 @@ public static class ParsedInsertJson extends ModificationStatement.Parsed { private final Json.Raw jsonValue; private final boolean defaultUnset; + private final boolean isForTxn; - public ParsedInsertJson(QualifiedName name, Attributes.Raw attrs, Json.Raw jsonValue, boolean defaultUnset, boolean ifNotExists) + public ParsedInsertJson(QualifiedName name, Attributes.Raw attrs, Json.Raw jsonValue, boolean defaultUnset, boolean ifNotExists, StatementSource source, boolean isForTxn) { - super(name, StatementType.INSERT, attrs, null, ifNotExists, false); + super(name, StatementType.INSERT, attrs, null, ifNotExists, false, source); this.jsonValue = jsonValue; this.defaultUnset = defaultUnset; + this.isForTxn = isForTxn; } @Override @@ -234,7 +276,7 @@ protected ModificationStatement prepareInternal(ClientState state, Json.Prepared prepared = jsonValue.prepareAndCollectMarkers(metadata, defs, bindVariables); WhereClause.Builder whereClause = new WhereClause.Builder(); - Operations operations = new Operations(type); + Operations operations = new Operations(type, isForTxn); boolean hasClusteringColumnsSet = false; for (ColumnMetadata def : defs) @@ -251,7 +293,7 @@ protected ModificationStatement prepareInternal(ClientState state, { Operation operation = new Operation.SetValue(raw).prepare(metadata, def, !conditions.isEmpty()); operation.collectMarkerSpecification(bindVariables); - operations.add(operation); + operations.add(operation, metadata); } } @@ -273,15 +315,64 @@ protected ModificationStatement prepareInternal(ClientState state, operations, restrictions, conditions, - attrs); + attrs, + source); + } + } + + public static class OperationCollector + { + public final List> operations = new ArrayList<>(); + public final List> referenceOps = new ArrayList<>(); + + public boolean conflictsWithExistingUpdate(ColumnIdentifier column, Operation.RawUpdate update) + { + for (Pair p : operations) + { + if (p.left.equals(column) && !p.right.isCompatibleWith(update)) + return true; + } + return false; + } + + public boolean conflictsWithExistingSubstitution(ColumnIdentifier column) + { + for (Pair p : referenceOps) + { + if (p.left.equals(column)) + return true; + } + return false; + } + + public void addRawUpdate(ColumnIdentifier column, Operation.RawUpdate update) + { + operations.add(Pair.create(column, update)); + } + + public boolean conflictsWithExistingUpdate(ColumnIdentifier column) + { + for (Pair p : operations) + { + if (p.left.equals(column)) + return true; + } + return false; + } + + public void addRawReferenceOperation(ColumnIdentifier column, ReferenceOperation.Raw substitution) + { + // TODO: Make sure there's more than a tuple name here...i.e. an actual reference column? + referenceOps.add(Pair.create(column, substitution)); } } public static class ParsedUpdate extends ModificationStatement.Parsed { // Provided for an UPDATE - private final List> updates; + private final OperationCollector updates; private final WhereClause whereClause; + private final boolean isForTxn; /** * Creates a new UpdateStatement from a column family name, columns map, consistency @@ -295,14 +386,17 @@ public static class ParsedUpdate extends ModificationStatement.Parsed * */ public ParsedUpdate(QualifiedName name, Attributes.Raw attrs, - List> updates, + OperationCollector updates, WhereClause whereClause, List conditions, - boolean ifExists) + boolean ifExists, + boolean isForTxn, + StatementSource source) { - super(name, StatementType.UPDATE, attrs, conditions, false, ifExists); + super(name, StatementType.UPDATE, attrs, conditions, false, ifExists, source); this.updates = updates; this.whereClause = whereClause; + this.isForTxn = isForTxn; } @Override @@ -312,17 +406,24 @@ protected ModificationStatement prepareInternal(ClientState state, Conditions conditions, Attributes attrs) { - Operations operations = new Operations(type); + Operations operations = new Operations(type, isForTxn); - for (Pair entry : updates) + for (Pair entry : updates.operations) { ColumnMetadata def = metadata.getExistingColumn(entry.left); - - checkFalse(def.isPrimaryKeyColumn(), "PRIMARY KEY part %s found in SET part", def.name); - - Operation operation = entry.right.prepare(metadata, def, !conditions.isEmpty()); + checkFalse(def.isPrimaryKeyColumn(), UPDATING_PRIMARY_KEY_MESSAGE, def.name); + Operation operation = entry.right.prepare(metadata, def, !conditions.isEmpty() || isForTxn); operation.collectMarkerSpecification(bindVariables); - operations.add(operation); + operations.add(operation, metadata); + } + + Preconditions.checkState(updates.referenceOps.isEmpty() || isForTxn); + for (Pair entry : updates.referenceOps) + { + ColumnMetadata def = metadata.getExistingColumn(entry.left); + checkFalse(def.isPrimaryKeyColumn(), UPDATING_PRIMARY_KEY_MESSAGE, def.name); + ReferenceOperation operation = entry.right.prepare(metadata, bindVariables); + operations.add(def, operation); } StatementRestrictions restrictions = newRestrictions(state, @@ -339,7 +440,8 @@ protected ModificationStatement prepareInternal(ClientState state, operations, restrictions, conditions, - attrs); + attrs, + source); } } @@ -371,9 +473,7 @@ public static void evaluateConstraintsForRow(Row row, TableMetadata metadata) public static void evaluateConstraint(ColumnMetadata columnMetadata, ByteBuffer cellData) { - for (ColumnConstraint constraint : columnMetadata.getColumnConstraints().getConstraints()) - { + for (ColumnConstraint constraint : columnMetadata.getColumnConstraints().getConstraints()) constraint.evaluate(columnMetadata.type, cellData); - } } } diff --git a/src/java/org/apache/cassandra/cql3/statements/UpdatesCollector.java b/src/java/org/apache/cassandra/cql3/statements/UpdatesCollector.java index 40b75ab5faba..a3867b608ffa 100644 --- a/src/java/org/apache/cassandra/cql3/statements/UpdatesCollector.java +++ b/src/java/org/apache/cassandra/cql3/statements/UpdatesCollector.java @@ -23,6 +23,7 @@ import org.apache.cassandra.db.ConsistencyLevel; import org.apache.cassandra.db.DecoratedKey; import org.apache.cassandra.db.IMutation; +import org.apache.cassandra.db.ReadCommand.PotentialTxnConflicts; import org.apache.cassandra.db.partitions.PartitionUpdate; import org.apache.cassandra.schema.TableMetadata; import org.apache.cassandra.service.ClientState; @@ -30,5 +31,5 @@ public interface UpdatesCollector { PartitionUpdate.Builder getPartitionUpdateBuilder(TableMetadata metadata, DecoratedKey dk, ConsistencyLevel consistency); - List toMutations(ClientState state); + List toMutations(ClientState state, PotentialTxnConflicts allowPotentialTxnConflicts); } diff --git a/src/java/org/apache/cassandra/cql3/statements/schema/AlterSchemaStatement.java b/src/java/org/apache/cassandra/cql3/statements/schema/AlterSchemaStatement.java index 0282cbd40943..3901e65cfb86 100644 --- a/src/java/org/apache/cassandra/cql3/statements/schema/AlterSchemaStatement.java +++ b/src/java/org/apache/cassandra/cql3/statements/schema/AlterSchemaStatement.java @@ -40,6 +40,7 @@ import org.apache.cassandra.service.ClientState; import org.apache.cassandra.service.ClientWarn; import org.apache.cassandra.service.QueryState; +import org.apache.cassandra.service.accord.AccordTopology; import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.transport.Dispatcher; import org.apache.cassandra.transport.Event.SchemaChange; @@ -180,7 +181,7 @@ public ResultMessage execute(QueryState state) // cluster, as config can be heterogenous falling back to safe defaults may occur on some nodes. ClusterMetadata metadata = ClusterMetadata.current(); apply(metadata); - ClusterMetadata result = Schema.instance.submit(this); + ClusterMetadata result = commit(metadata); KeyspacesDiff diff = Keyspaces.diff(metadata.schema.getKeyspaces(), result.schema.getKeyspaces()); clientWarnings(diff).forEach(ClientWarn.instance::warn); @@ -199,9 +200,17 @@ public ResultMessage execute(QueryState state) if (null != user && !user.isAnonymous()) createdResources(diff).forEach(r -> grantPermissionsOnResource(r, user)); + // if the changes affected accord, wait for accord to apply them + AccordTopology.awaitTopologyReadiness(diff, result.epoch); + return new ResultMessage.SchemaChange(schemaChangeEvent(diff)); } + protected ClusterMetadata commit(ClusterMetadata metadata) + { + return Schema.instance.submit(this); + } + private void validateKeyspaceName() { if (!SchemaConstants.isValidName(keyspaceName)) diff --git a/src/java/org/apache/cassandra/cql3/statements/schema/AlterTableStatement.java b/src/java/org/apache/cassandra/cql3/statements/schema/AlterTableStatement.java index fc2ab582f754..c7a57139d129 100644 --- a/src/java/org/apache/cassandra/cql3/statements/schema/AlterTableStatement.java +++ b/src/java/org/apache/cassandra/cql3/statements/schema/AlterTableStatement.java @@ -59,12 +59,16 @@ import org.apache.cassandra.schema.Keyspaces; import org.apache.cassandra.schema.Keyspaces.KeyspacesDiff; import org.apache.cassandra.schema.MemtableParams; +import org.apache.cassandra.schema.SchemaConstants; import org.apache.cassandra.schema.TableMetadata; import org.apache.cassandra.schema.TableParams; +import org.apache.cassandra.schema.TableParams.Option; import org.apache.cassandra.schema.UserFunctions; import org.apache.cassandra.schema.ViewMetadata; import org.apache.cassandra.schema.Views; import org.apache.cassandra.service.ClientState; +import org.apache.cassandra.service.consensus.TransactionalMode; +import org.apache.cassandra.service.consensus.migration.TransactionalMigrationFromMode; import org.apache.cassandra.service.reads.repair.ReadRepairStrategy; import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.tcm.Epoch; @@ -83,6 +87,11 @@ public abstract class AlterTableStatement extends AlterSchemaStatement { + private static final Logger logger = LoggerFactory.getLogger(AlterTableStatement.class); + + public static final String ACCORD_COUNTER_TABLES_UNSUPPORTED = "Counters are not supported with Accord for table %s.%s"; + public static final String ACCORD_COUNTER_COLUMN_UNSUPPORTED = "Cannot add a counter column to Accord table %s.%s with transactional mode %s and transactional migration from %s"; + protected final String tableName; private final boolean ifExists; protected ClientState state; @@ -119,6 +128,9 @@ public Keyspaces apply(ClusterMetadata metadata) return schema; } + if (table.params.pendingDrop) + throw ire("Cannot use ALTER TABLE on a table that is being dropped."); + if (table.isView()) throw ire("Cannot use ALTER TABLE on a materialized view; use ALTER MATERIALIZED VIEW instead"); @@ -257,13 +269,18 @@ private static class Column private final boolean isStatic; @Nullable private final ColumnMask.Raw mask; + @Nullable + private final ColumnConstraints.Raw constraints; - Column(ColumnIdentifier name, CQL3Type.Raw type, boolean isStatic, @Nullable ColumnMask.Raw mask) + Column(ColumnIdentifier name, CQL3Type.Raw type, boolean isStatic, @Nullable ColumnMask.Raw mask, @Nullable ColumnConstraints.Raw constraints) { this.name = name; this.type = type; this.isStatic = isStatic; this.mask = mask; + if (constraints != null) + constraints.prepare(name); + this.constraints = constraints; } } @@ -311,6 +328,7 @@ private void addColumn(KeyspaceMetadata keyspace, AbstractType type = column.type.prepare(keyspaceName, keyspace.types).getType(); boolean isStatic = column.isStatic; ColumnMask mask = column.mask == null ? null : column.mask.prepare(keyspaceName, tableName, name, type, keyspace.userFunctions); + ColumnConstraints columnConstraints = column.constraints == null ? ColumnConstraints.NO_OP : column.constraints.prepare(name); if (null != tableBuilder.getColumn(name)) { if (!ifColumnNotExists) @@ -318,6 +336,9 @@ private void addColumn(KeyspaceMetadata keyspace, return; } + if (type.isCounter() && (table.params.transactionalMode.accordIsEnabled || table.params.transactionalMigrationFrom.migratingFromAccord())) + throw ire(format(ACCORD_COUNTER_COLUMN_UNSUPPORTED, keyspaceName, tableName, table.params.transactionalMode, table.params.transactionalMigrationFrom)); + if (table.isCompactTable()) throw ire("Cannot add new column to a COMPACT STORAGE table"); @@ -361,9 +382,9 @@ private void addColumn(KeyspaceMetadata keyspace, } if (isStatic) - tableBuilder.addStaticColumn(name, type, mask); + tableBuilder.addStaticColumn(name, type, mask, columnConstraints); else - tableBuilder.addRegularColumn(name, type, mask); + tableBuilder.addRegularColumn(name, type, mask, columnConstraints); if (!isStatic) { @@ -371,8 +392,9 @@ private void addColumn(KeyspaceMetadata keyspace, { if (view.includeAllColumns) { - ColumnMetadata viewColumn = ColumnMetadata.regularColumn(view.metadata, name.bytes, type) - .withNewMask(mask); + ColumnMetadata viewColumn = ColumnMetadata.regularColumn(view.metadata, name.bytes, type, ColumnMetadata.NO_UNIQUE_ID) + .withNewMask(mask) + .withNewColumnConstraints(columnConstraints); viewsBuilder.put(viewsBuilder.get(view.name()).withAddedRegularColumn(viewColumn)); } } @@ -575,9 +597,50 @@ public void validate(ClientState state) MemtableParams.get(attrs.getString(TableParams.Option.MEMTABLE.toString())); Guardrails.tableProperties.guard(attrs.updatedProperties(), attrs::removeProperty, state); - validateDefaultTimeToLive(attrs.asNewTableParams()); + validateDefaultTimeToLive(attrs.asNewTableParams(keyspaceName)); + } + + private TableParams validateAndUpdateTransactionalMigration(boolean isCounter, TableParams prev, TableParams next) + { + if (next.transactionalMode.accordIsEnabled && SchemaConstants.isSystemKeyspace(keyspaceName)) + throw ire("Cannot enable accord on system tables (%s.%s)", keyspaceName, tableName); + + boolean modeChange = prev.transactionalMode != next.transactionalMode; + boolean wasMigrating = prev.transactionalMigrationFrom.isMigrating(); + boolean explicitlySetMigrationFrom = attrs.hasOption(Option.TRANSACTIONAL_MIGRATION_FROM); + // set table to migrating + TransactionalMigrationFromMode newMigrateFrom = TransactionalMigrationFromMode.fromMode(prev.transactionalMode, next.transactionalMode); + + if (isCounter && (next.transactionalMode != TransactionalMode.off || newMigrateFrom != TransactionalMigrationFromMode.none || next.transactionalMigrationFrom != TransactionalMigrationFromMode.none)) + throw ire(format(ACCORD_COUNTER_TABLES_UNSUPPORTED, keyspaceName, tableName)); + + boolean forceMigrationChange = modeChange && explicitlySetMigrationFrom && next.transactionalMigrationFrom != newMigrateFrom; + + if (modeChange && next.transactionalMode.accordIsEnabled && !DatabaseDescriptor.getAccordTransactionsEnabled()) + throw ire(format("Cannot change transactional mode to %s for %s.%s with accord_transactions_enabled set to false", + next.transactionalMode, keyspaceName, tableName)); + + // user is manually updating migration mode, don't interfere + if (forceMigrationChange) + { + logger.warn("Forcing unsafe migration change from {} to {} with transaction mode {}", prev.transactionalMigrationFrom, next.transactionalMigrationFrom, next.transactionalMode); + return next; + } + + if (!modeChange) + return next; + + // if the user is trying to revert to the mode being migrated from, allow it. The migration states will be inverted when + // the transformation is applied. Otherwise throw + if (wasMigrating && next.transactionalMode != prev.transactionalMigrationFrom.from) + throw ire(format("Cannot change transactional mode from %s to %s for %s.%s before transactional migration has completed", + prev.transactionalMode, next.transactionalMode, + keyspaceName, tableName)); + + return next.unbuild().transactionalMigrationFrom(newMigrateFrom).build(); } + public KeyspaceMetadata apply(Epoch epoch, KeyspaceMetadata keyspace, TableMetadata table, ClusterMetadata metadata) { attrs.validate(); @@ -605,6 +668,8 @@ public KeyspaceMetadata apply(Epoch epoch, KeyspaceMetadata keyspace, TableMetad if (!params.compression.isEnabled()) Guardrails.uncompressedTablesEnabled.ensureEnabled(state); + params = validateAndUpdateTransactionalMigration(table.isCounter(), table.params, params); + return keyspace.withSwapped(keyspace.tables.withSwapped(table.withSwapped(params))); } } @@ -732,7 +797,7 @@ public KeyspaceMetadata apply(Epoch epoch, KeyspaceMetadata keyspace, TableMetad if (column != null) { ColumnConstraints oldConstraints = column.getColumnConstraints(); - ColumnConstraints newConstraints = constraints == null ? ColumnConstraints.NO_OP : constraints.prepare(); + ColumnConstraints newConstraints = constraints == null ? ColumnConstraints.NO_OP : constraints.prepare(columnName); if (Objects.equals(oldConstraints, newConstraints)) return keyspace; newConstraints.validate(column); @@ -837,10 +902,10 @@ public void mask(ColumnIdentifier name, ColumnMask.Raw mask) rawMask = mask; } - public void add(ColumnIdentifier name, CQL3Type.Raw type, boolean isStatic, @Nullable ColumnMask.Raw mask) + public void add(ColumnIdentifier name, CQL3Type.Raw type, boolean isStatic, @Nullable ColumnMask.Raw mask, @Nullable ColumnConstraints.Raw constraints) { kind = Kind.ADD_COLUMNS; - addedColumns.add(new AddColumns.Column(name, type, isStatic, mask)); + addedColumns.add(new AddColumns.Column(name, type, isStatic, mask, constraints)); } public void drop(ColumnIdentifier name) @@ -868,6 +933,8 @@ public void constraint(ColumnIdentifier name, ColumnConstraints.Raw rawConstrain { kind = Kind.ALTER_CONSTRAINTS; this.constraintName = name; + if (rawConstraints != null) + rawConstraints.prepare(constraintName); this.constraints = rawConstraints; } diff --git a/src/java/org/apache/cassandra/cql3/statements/schema/CopyTableStatement.java b/src/java/org/apache/cassandra/cql3/statements/schema/CopyTableStatement.java index 3a79311dc771..c5b38e5e0850 100644 --- a/src/java/org/apache/cassandra/cql3/statements/schema/CopyTableStatement.java +++ b/src/java/org/apache/cassandra/cql3/statements/schema/CopyTableStatement.java @@ -236,7 +236,7 @@ public void validate(ClientState state) .sum(); Guardrails.tables.guard(totalUserTables + 1, targetTableName, false, state); } - validateDefaultTimeToLive(attrs.asNewTableParams()); + validateDefaultTimeToLive(attrs.asNewTableParams(keyspaceName)); } private void maybeCopyIndexes(TableMetadata.Builder builder, TableMetadata sourceTableMeta, KeyspaceMetadata targetKeyspaceMeta) diff --git a/src/java/org/apache/cassandra/cql3/statements/schema/CreateTableStatement.java b/src/java/org/apache/cassandra/cql3/statements/schema/CreateTableStatement.java index 997a40200077..1f12366677ac 100644 --- a/src/java/org/apache/cassandra/cql3/statements/schema/CreateTableStatement.java +++ b/src/java/org/apache/cassandra/cql3/statements/schema/CreateTableStatement.java @@ -17,13 +17,20 @@ */ package org.apache.cassandra.cql3.statements.schema; -import java.util.*; +import java.util.ArrayList; +import java.util.Collections; +import java.util.EnumSet; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.TreeMap; import java.util.stream.Collectors; - import javax.annotation.Nullable; import com.google.common.collect.ImmutableSet; - import org.apache.commons.lang3.StringUtils; import org.apache.cassandra.audit.AuditLogContext; @@ -31,24 +38,48 @@ import org.apache.cassandra.auth.DataResource; import org.apache.cassandra.auth.IResource; import org.apache.cassandra.auth.Permission; -import org.apache.cassandra.cql3.*; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.cql3.constraints.ColumnConstraint; import org.apache.cassandra.cql3.constraints.ColumnConstraints; +import org.apache.cassandra.cql3.CQL3Type; +import org.apache.cassandra.cql3.CQLFragmentParser; +import org.apache.cassandra.cql3.CQLStatement; +import org.apache.cassandra.cql3.ColumnIdentifier; +import org.apache.cassandra.cql3.CqlParser; +import org.apache.cassandra.cql3.QualifiedName; +import org.apache.cassandra.cql3.constraints.NotNullConstraint; +import org.apache.cassandra.cql3.constraints.UnaryFunctionColumnConstraint; import org.apache.cassandra.cql3.functions.masking.ColumnMask; import org.apache.cassandra.db.guardrails.Guardrails; -import org.apache.cassandra.db.marshal.*; +import org.apache.cassandra.db.marshal.AbstractType; +import org.apache.cassandra.db.marshal.BytesType; +import org.apache.cassandra.db.marshal.CounterColumnType; +import org.apache.cassandra.db.marshal.EmptyType; +import org.apache.cassandra.db.marshal.ReversedType; +import org.apache.cassandra.db.marshal.UTF8Type; +import org.apache.cassandra.db.marshal.UserType; import org.apache.cassandra.exceptions.AlreadyExistsException; -import org.apache.cassandra.schema.*; +import org.apache.cassandra.schema.KeyspaceMetadata; +import org.apache.cassandra.schema.Keyspaces; import org.apache.cassandra.schema.Keyspaces.KeyspacesDiff; +import org.apache.cassandra.schema.MemtableParams; +import org.apache.cassandra.schema.Schema; +import org.apache.cassandra.schema.SchemaConstants; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.schema.TableParams; +import org.apache.cassandra.schema.Types; +import org.apache.cassandra.schema.UserFunctions; import org.apache.cassandra.service.ClientState; -import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.service.reads.repair.ReadRepairStrategy; +import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.transport.Event.SchemaChange; import org.apache.cassandra.transport.Event.SchemaChange.Change; import org.apache.cassandra.transport.Event.SchemaChange.Target; -import static java.util.Comparator.comparing; - import static com.google.common.collect.Iterables.concat; +import static java.lang.String.format; +import static java.util.Comparator.comparing; public final class CreateTableStatement extends AlterSchemaStatement { @@ -145,6 +176,16 @@ public Keyspaces apply(ClusterMetadata metadata) if (!table.params.compression.isEnabled()) Guardrails.uncompressedTablesEnabled.ensureEnabled(state); + if (table.params.transactionalMode.accordIsEnabled && SchemaConstants.isSystemKeyspace(keyspaceName)) + throw ire("Cannot enable accord on system tables (%s.%s)", keyspaceName, tableName); + + if (table.params.transactionalMode.accordIsEnabled && !DatabaseDescriptor.getAccordTransactionsEnabled()) + throw ire(format("Cannot create table %s.%s with transactional mode %s with accord.enabled set to false", + keyspaceName, tableName, table.params.transactionalMode)); + + if (table.params.transactionalMigrationFrom.isMigrating()) + throw ire("Cannot set transactional migration on new tables (%s.%s), %s", keyspaceName, tableName, table.params.transactionalMigrationFrom); + return schema.withAddedOrUpdated(keyspace.withSwapped(keyspace.tables.with(table))); } @@ -177,7 +218,7 @@ public void validate(ClientState state) if (useCompactStorage) Guardrails.compactTablesEnabled.ensureEnabled(state); - validateDefaultTimeToLive(attrs.asNewTableParams()); + validateDefaultTimeToLive(attrs.asNewTableParams(keyspaceName)); rawColumns.forEach((name, raw) -> raw.validate(state, name)); } @@ -212,7 +253,7 @@ public String toString() public TableMetadata.Builder builder(Types types, UserFunctions functions) { attrs.validate(); - TableParams params = attrs.asNewTableParams(); + TableParams params = attrs.asNewTableParams(keyspaceName); // use a TreeMap to preserve ordering across JDK versions (see CASSANDRA-9492) - important for stable unit tests Map columns = new TreeMap<>(comparing(o -> o.bytes)); @@ -568,17 +609,33 @@ public String table() return name.getName(); } - public void addColumn(ColumnIdentifier column, CQL3Type.Raw type, boolean isStatic, ColumnMask.Raw mask, ColumnConstraints.Raw constraints) + public void addColumn(ColumnIdentifier column, CQL3Type.Raw type, boolean isStatic, boolean isNotNull, ColumnMask.Raw mask, ColumnConstraints.Raw constraints) { if (null != rawColumns.put(column, new ColumnProperties.Raw(type, mask))) throw ire("Duplicate column '%s' declaration for table '%s'", column, name); if (isStatic) staticColumns.add(column); - if (null == constraints) - columnConstraints.put(column, ColumnConstraints.NO_OP); - else - columnConstraints.put(column, constraints.prepare()); + + ColumnConstraints preparedConstraints = constraints == null ? ColumnConstraints.NO_OP : constraints.prepare(column); + + if (isNotNull) + { + if (preparedConstraints.containsNotNullConstraint()) + throw ire("Duplicate definition of NOT NULL constraint"); + + List> checkConstraints = new ArrayList<>(preparedConstraints.getConstraints()); + checkConstraints.add(new UnaryFunctionColumnConstraint(new NotNullConstraint())); + preparedConstraints = new ColumnConstraints(checkConstraints); + preparedConstraints.setColumnName(column); + } + + columnConstraints.put(column, preparedConstraints); + } + + public void addColumn(ColumnIdentifier column, CQL3Type.Raw type, boolean isStatic, ColumnMask.Raw mask, ColumnConstraints.Raw constraints) + { + addColumn(column, type, isStatic, false, mask, constraints); } public void setCompactStorage() diff --git a/src/java/org/apache/cassandra/cql3/statements/schema/CreateViewStatement.java b/src/java/org/apache/cassandra/cql3/statements/schema/CreateViewStatement.java index 40cbee967b60..284edf112b50 100644 --- a/src/java/org/apache/cassandra/cql3/statements/schema/CreateViewStatement.java +++ b/src/java/org/apache/cassandra/cql3/statements/schema/CreateViewStatement.java @@ -17,7 +17,12 @@ */ package org.apache.cassandra.cql3.statements.schema; -import java.util.*; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashSet; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Set; import com.google.common.collect.ImmutableSet; import com.google.common.collect.Iterables; @@ -27,7 +32,11 @@ import org.apache.cassandra.audit.AuditLogEntryType; import org.apache.cassandra.auth.Permission; import org.apache.cassandra.config.DatabaseDescriptor; -import org.apache.cassandra.cql3.*; +import org.apache.cassandra.cql3.CQLStatement; +import org.apache.cassandra.cql3.ColumnIdentifier; +import org.apache.cassandra.cql3.QualifiedName; +import org.apache.cassandra.cql3.VariableSpecifications; +import org.apache.cassandra.cql3.WhereClause; import org.apache.cassandra.cql3.restrictions.StatementRestrictions; import org.apache.cassandra.cql3.selection.RawSelector; import org.apache.cassandra.cql3.selection.Selectable; @@ -38,19 +47,24 @@ import org.apache.cassandra.db.view.View; import org.apache.cassandra.exceptions.AlreadyExistsException; import org.apache.cassandra.exceptions.InvalidRequestException; -import org.apache.cassandra.schema.*; +import org.apache.cassandra.schema.ColumnMetadata; +import org.apache.cassandra.schema.KeyspaceMetadata; +import org.apache.cassandra.schema.Keyspaces; import org.apache.cassandra.schema.Keyspaces.KeyspacesDiff; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.schema.TableParams; +import org.apache.cassandra.schema.ViewMetadata; import org.apache.cassandra.service.ClientState; import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.transport.Event.SchemaChange; import org.apache.cassandra.transport.Event.SchemaChange.Change; import org.apache.cassandra.transport.Event.SchemaChange.Target; -import static java.lang.String.join; - import static com.google.common.collect.Iterables.concat; import static com.google.common.collect.Iterables.filter; import static com.google.common.collect.Iterables.transform; +import static java.lang.String.join; import static org.apache.cassandra.config.CassandraRelevantProperties.MV_ALLOW_FILTERING_NONKEY_COLUMNS_UNSAFE; public final class CreateViewStatement extends AlterSchemaStatement @@ -173,6 +187,11 @@ public Keyspaces apply(ClusterMetadata metadata) viewName, tableName); } + if (table.params.pendingDrop) + throw ire("Cannot create materialized view '%s' for base table " + + "'%s' as it is being dropped.", + viewName, tableName); + /* * Process SELECT clause */ @@ -329,7 +348,7 @@ public Keyspaces apply(ClusterMetadata metadata) else if (!builder.hasId()) builder.id(TableId.get(metadata)); - builder.params(attrs.asNewTableParams()) + builder.params(attrs.asNewTableParams(keyspaceName)) .kind(TableMetadata.Kind.VIEW); partitionKeyColumns.stream() diff --git a/src/java/org/apache/cassandra/cql3/statements/schema/DropKeyspaceStatement.java b/src/java/org/apache/cassandra/cql3/statements/schema/DropKeyspaceStatement.java index e074da54a33b..e7159cda8d41 100644 --- a/src/java/org/apache/cassandra/cql3/statements/schema/DropKeyspaceStatement.java +++ b/src/java/org/apache/cassandra/cql3/statements/schema/DropKeyspaceStatement.java @@ -17,13 +17,18 @@ */ package org.apache.cassandra.cql3.statements.schema; +import java.util.List; +import java.util.stream.Collectors; + import org.apache.cassandra.audit.AuditLogContext; import org.apache.cassandra.audit.AuditLogEntryType; import org.apache.cassandra.auth.Permission; import org.apache.cassandra.cql3.CQLStatement; import org.apache.cassandra.db.guardrails.Guardrails; +import org.apache.cassandra.schema.KeyspaceMetadata; import org.apache.cassandra.schema.Keyspaces; import org.apache.cassandra.schema.Keyspaces.KeyspacesDiff; +import org.apache.cassandra.schema.TableMetadata; import org.apache.cassandra.service.ClientState; import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.transport.Event.SchemaChange; @@ -45,8 +50,30 @@ public Keyspaces apply(ClusterMetadata metadata) Guardrails.dropKeyspaceEnabled.ensureEnabled(state); Keyspaces schema = metadata.schema.getKeyspaces(); - if (schema.containsKeyspace(keyspaceName)) + KeyspaceMetadata keyspace = schema.getNullable(keyspaceName); + if (keyspace != null) + { + // check that no accord tables in the keyspace are currently in the process of being dropped + List pendingDrop = keyspace.tables.stream() + .filter(t -> t.params.pendingDrop) + .collect(Collectors.toList()); + if (!pendingDrop.isEmpty()) + throw ire("Cannot drop keyspace '%s' as it contains accord tables which are currently being dropped. " + + "Please wait for those operations to complete before dropping the keyspace. (%s)", + keyspaceName, pendingDrop.stream() + .map(Object::toString) + .collect(Collectors.joining(","))); + + List accordTables = keyspace.tables.stream() + .filter(TableMetadata::isAccordEnabled) + .collect(Collectors.toList()); + if (!accordTables.isEmpty()) + throw ire("Cannot drop keyspace '%s' as it contains accord tables. (%s)", + keyspaceName, accordTables.stream() + .map(Object::toString) + .collect(Collectors.joining(","))); return schema.without(keyspaceName); + } if (ifExists) return schema; diff --git a/src/java/org/apache/cassandra/cql3/statements/schema/DropTableStatement.java b/src/java/org/apache/cassandra/cql3/statements/schema/DropTableStatement.java index 56848a8c2275..a008b4548823 100644 --- a/src/java/org/apache/cassandra/cql3/statements/schema/DropTableStatement.java +++ b/src/java/org/apache/cassandra/cql3/statements/schema/DropTableStatement.java @@ -27,6 +27,10 @@ import org.apache.cassandra.schema.Keyspaces.KeyspacesDiff; import org.apache.cassandra.service.ClientState; import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.tcm.ClusterMetadataService; +import org.apache.cassandra.tcm.sequences.DropAccordTable.TableReference; +import org.apache.cassandra.tcm.sequences.InProgressSequences; +import org.apache.cassandra.tcm.transformations.PrepareDropAccordTable; import org.apache.cassandra.transport.Event.SchemaChange; import org.apache.cassandra.transport.Event.SchemaChange.Change; import org.apache.cassandra.transport.Event.SchemaChange.Target; @@ -48,6 +52,26 @@ public DropTableStatement(String keyspaceName, String tableName, boolean ifExist this.ifExists = ifExists; } + @Override + protected ClusterMetadata commit(ClusterMetadata metadata) + { + KeyspaceMetadata keyspace = metadata.schema.getKeyspaces().getNullable(keyspaceName); + TableMetadata table = null == keyspace + ? null + : keyspace.getTableOrViewNullable(tableName); + if (table == null // this can happen when ifExists=true... since its already been validated can skip + || !table.isAccordEnabled()) + return super.commit(metadata); + + // Multi-Step Operation + // 1) mark the table as pending delete + // 2) await for Accord to finish transactions + // 3) drop table + TableReference ref = TableReference.from(table); + ClusterMetadataService.instance().commit(new PrepareDropAccordTable(ref)); + return InProgressSequences.finishInProgressSequences(ref); + } + public Keyspaces apply(ClusterMetadata metadata) { Guardrails.dropTruncateTableEnabled.ensureEnabled(state); @@ -70,6 +94,9 @@ public Keyspaces apply(ClusterMetadata metadata) if (table.isView()) throw ire("Cannot use DROP TABLE on a materialized view. Please use DROP MATERIALIZED VIEW instead."); + if (table.isAccordEnabled() && table.params.pendingDrop) + throw ire("Table '%s.%s' is already being dropped", keyspaceName, tableName); + Iterable views = keyspace.views.forTable(table.id); if (!isEmpty(views)) { diff --git a/src/java/org/apache/cassandra/cql3/statements/schema/KeyspaceAttributes.java b/src/java/org/apache/cassandra/cql3/statements/schema/KeyspaceAttributes.java index d4d5b984b3c3..0be8b882dd79 100644 --- a/src/java/org/apache/cassandra/cql3/statements/schema/KeyspaceAttributes.java +++ b/src/java/org/apache/cassandra/cql3/statements/schema/KeyspaceAttributes.java @@ -23,9 +23,11 @@ import org.apache.cassandra.cql3.statements.PropertyDefinitions; import org.apache.cassandra.exceptions.ConfigurationException; +import org.apache.cassandra.exceptions.SyntaxException; import org.apache.cassandra.schema.KeyspaceParams; import org.apache.cassandra.schema.KeyspaceParams.Option; import org.apache.cassandra.schema.ReplicationParams; +import org.apache.cassandra.service.accord.fastpath.FastPathStrategy; public final class KeyspaceAttributes extends PropertyDefinitions { @@ -48,6 +50,10 @@ public void validate() Map replicationOptions = getAllReplicationOptions(); if (!replicationOptions.isEmpty() && !replicationOptions.containsKey(ReplicationParams.CLASS)) throw new ConfigurationException("Missing replication strategy class"); + + FastPathStrategy strategy = getFastPathStrategy(); + if (strategy != null && strategy.kind() == FastPathStrategy.Kind.INHERIT_KEYSPACE) + throw new ConfigurationException("Cannot use keyspace inheriting fast path strategy with keyspaces"); } public String getReplicationStrategyClass() @@ -63,10 +69,26 @@ private Map getAllReplicationOptions() : replication; } + private FastPathStrategy getFastPathStrategy() + { + if (!hasOption(Option.FAST_PATH)) + return null; + + try + { + return FastPathStrategy.fromMap(getMap(Option.FAST_PATH.toString())); + } + catch (SyntaxException e) + { + return FastPathStrategy.keyspaceStrategyFromString(getString(Option.FAST_PATH.toString())); + } + } + KeyspaceParams asNewKeyspaceParams() { boolean durableWrites = getBoolean(Option.DURABLE_WRITES.toString(), KeyspaceParams.DEFAULT_DURABLE_WRITES); - return KeyspaceParams.create(durableWrites, getAllReplicationOptions()); + FastPathStrategy fastPath = getFastPathStrategy(); + return KeyspaceParams.create(durableWrites, getAllReplicationOptions(), fastPath != null ? fastPath : FastPathStrategy.simple()); } KeyspaceParams asAlteredKeyspaceParams(KeyspaceParams previous) @@ -76,7 +98,8 @@ KeyspaceParams asAlteredKeyspaceParams(KeyspaceParams previous) ReplicationParams replication = getReplicationStrategyClass() == null ? previous.replication : ReplicationParams.fromMapWithDefaults(getAllReplicationOptions(), previousOptions); - return new KeyspaceParams(durableWrites, replication); + FastPathStrategy fastPath = getFastPathStrategy(); + return new KeyspaceParams(durableWrites, replication, fastPath != null ? fastPath : previous.fastPath); } public boolean hasOption(Option option) diff --git a/src/java/org/apache/cassandra/cql3/statements/schema/TableAttributes.java b/src/java/org/apache/cassandra/cql3/statements/schema/TableAttributes.java index 87af6b840b00..9ec04f502bc6 100644 --- a/src/java/org/apache/cassandra/cql3/statements/schema/TableAttributes.java +++ b/src/java/org/apache/cassandra/cql3/statements/schema/TableAttributes.java @@ -23,20 +23,45 @@ import com.google.common.collect.ImmutableSet; import com.google.common.collect.Sets; +import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.cql3.statements.PropertyDefinitions; import org.apache.cassandra.exceptions.ConfigurationException; +import org.apache.cassandra.exceptions.SyntaxException; +import org.apache.cassandra.schema.AutoRepairParams; import org.apache.cassandra.schema.CachingParams; import org.apache.cassandra.schema.CompactionParams; import org.apache.cassandra.schema.CompressionParams; import org.apache.cassandra.schema.MemtableParams; +import org.apache.cassandra.schema.Schema; +import org.apache.cassandra.schema.SchemaConstants; import org.apache.cassandra.schema.TableId; import org.apache.cassandra.schema.TableParams; import org.apache.cassandra.schema.TableParams.Option; +import org.apache.cassandra.service.accord.fastpath.FastPathStrategy; +import org.apache.cassandra.service.consensus.TransactionalMode; +import org.apache.cassandra.service.consensus.migration.TransactionalMigrationFromMode; import org.apache.cassandra.service.reads.SpeculativeRetryPolicy; import org.apache.cassandra.service.reads.repair.ReadRepairStrategy; import static java.lang.String.format; -import static org.apache.cassandra.schema.TableParams.Option.*; +import static org.apache.cassandra.schema.TableParams.Option.ADDITIONAL_WRITE_POLICY; +import static org.apache.cassandra.schema.TableParams.Option.ALLOW_AUTO_SNAPSHOT; +import static org.apache.cassandra.schema.TableParams.Option.BLOOM_FILTER_FP_CHANCE; +import static org.apache.cassandra.schema.TableParams.Option.CACHING; +import static org.apache.cassandra.schema.TableParams.Option.CDC; +import static org.apache.cassandra.schema.TableParams.Option.COMMENT; +import static org.apache.cassandra.schema.TableParams.Option.COMPACTION; +import static org.apache.cassandra.schema.TableParams.Option.COMPRESSION; +import static org.apache.cassandra.schema.TableParams.Option.CRC_CHECK_CHANCE; +import static org.apache.cassandra.schema.TableParams.Option.DEFAULT_TIME_TO_LIVE; +import static org.apache.cassandra.schema.TableParams.Option.GC_GRACE_SECONDS; +import static org.apache.cassandra.schema.TableParams.Option.INCREMENTAL_BACKUPS; +import static org.apache.cassandra.schema.TableParams.Option.MAX_INDEX_INTERVAL; +import static org.apache.cassandra.schema.TableParams.Option.MEMTABLE_FLUSH_PERIOD_IN_MS; +import static org.apache.cassandra.schema.TableParams.Option.MIN_INDEX_INTERVAL; +import static org.apache.cassandra.schema.TableParams.Option.READ_REPAIR; +import static org.apache.cassandra.schema.TableParams.Option.SPECULATIVE_RETRY; +import static org.apache.cassandra.schema.TableParams.Option.TRANSACTIONAL_MODE; public final class TableAttributes extends PropertyDefinitions { @@ -60,9 +85,12 @@ public void validate() build(TableParams.builder()).validate(); } - TableParams asNewTableParams() + TableParams asNewTableParams(String keyspaceName) { - return build(TableParams.builder()); + TableParams.Builder builder = TableParams.builder(); + if (!hasOption(TRANSACTIONAL_MODE) && !SchemaConstants.isSystemKeyspace(keyspaceName) && Schema.instance.distributedKeyspaces().names().contains(keyspaceName)) + builder.transactionalMode(DatabaseDescriptor.defaultTransactionalMode()); + return build(builder); } TableParams asAlteredTableParams(TableParams previous) @@ -151,6 +179,27 @@ private TableParams build(TableParams.Builder builder) if (hasOption(READ_REPAIR)) builder.readRepair(ReadRepairStrategy.fromString(getString(READ_REPAIR))); + if (hasOption(Option.FAST_PATH)) + { + try + { + builder.fastPath(FastPathStrategy.fromMap(getMap(Option.FAST_PATH))); + } + catch (SyntaxException e) + { + builder.fastPath(FastPathStrategy.tableStrategyFromString(getString(Option.FAST_PATH))); + } + } + + if (hasOption(Option.TRANSACTIONAL_MODE)) + builder.transactionalMode(TransactionalMode.fromString(getString(Option.TRANSACTIONAL_MODE))); + + if (hasOption(Option.TRANSACTIONAL_MIGRATION_FROM)) + builder.transactionalMigrationFrom(TransactionalMigrationFromMode.fromString(getString(Option.TRANSACTIONAL_MIGRATION_FROM))); + + if (hasOption(Option.AUTO_REPAIR)) + builder.automatedRepair(AutoRepairParams.fromMap(getMap(Option.AUTO_REPAIR))); + return builder.build(); } diff --git a/src/java/org/apache/cassandra/cql3/terms/Constants.java b/src/java/org/apache/cassandra/cql3/terms/Constants.java index 3b9ae6b4c90e..a912f5556d7f 100644 --- a/src/java/org/apache/cassandra/cql3/terms/Constants.java +++ b/src/java/org/apache/cassandra/cql3/terms/Constants.java @@ -20,7 +20,6 @@ import java.math.BigDecimal; import java.math.BigInteger; import java.nio.ByteBuffer; -import java.nio.charset.StandardCharsets; import org.apache.cassandra.cql3.AssignmentTestable; import org.apache.cassandra.cql3.CQL3Type; @@ -30,7 +29,24 @@ import org.apache.cassandra.cql3.UpdateParameters; import org.apache.cassandra.db.Clustering; import org.apache.cassandra.db.DecoratedKey; -import org.apache.cassandra.db.marshal.*; +import org.apache.cassandra.db.marshal.AbstractType; +import org.apache.cassandra.db.marshal.AsciiType; +import org.apache.cassandra.db.marshal.BooleanType; +import org.apache.cassandra.db.marshal.ByteType; +import org.apache.cassandra.db.marshal.BytesType; +import org.apache.cassandra.db.marshal.CounterColumnType; +import org.apache.cassandra.db.marshal.DecimalType; +import org.apache.cassandra.db.marshal.DoubleType; +import org.apache.cassandra.db.marshal.DurationType; +import org.apache.cassandra.db.marshal.Int32Type; +import org.apache.cassandra.db.marshal.IntegerType; +import org.apache.cassandra.db.marshal.LongType; +import org.apache.cassandra.db.marshal.NumberType; +import org.apache.cassandra.db.marshal.ReversedType; +import org.apache.cassandra.db.marshal.StringType; +import org.apache.cassandra.db.marshal.TimeUUIDType; +import org.apache.cassandra.db.marshal.UTF8Type; +import org.apache.cassandra.db.marshal.UUIDType; import org.apache.cassandra.db.rows.Cell; import org.apache.cassandra.db.rows.Row; import org.apache.cassandra.exceptions.InvalidRequestException; @@ -39,12 +55,13 @@ import org.apache.cassandra.utils.ByteBufferUtil; import org.apache.cassandra.utils.FastByteOperations; +import static java.nio.charset.StandardCharsets.US_ASCII; + /** * Static helper methods and classes for constants. */ public abstract class Constants { - private static ByteBuffer getCurrentCellBuffer(ColumnMetadata column, DecoratedKey key, UpdateParameters params) { Row currentRow = params.getPrefetchedRow(key, column.isStatic() ? Clustering.STATIC_CLUSTERING : params.currentClustering()); @@ -59,7 +76,7 @@ public enum Type @Override public AbstractType getPreferedTypeFor(String text) { - if (StandardCharsets.US_ASCII.newEncoder().canEncode(text)) + if (US_ASCII.newEncoder().canEncode(text)) { return AsciiType.instance; } @@ -272,6 +289,7 @@ public static Literal duration(String text) return new Literal(Type.DURATION, text); } + @Override public Value prepare(String keyspace, ColumnSpecification receiver) throws InvalidRequestException { if (!testAssignment(keyspace, receiver).isAssignable()) diff --git a/src/java/org/apache/cassandra/cql3/terms/Lists.java b/src/java/org/apache/cassandra/cql3/terms/Lists.java index 1e95fcfd83c4..579b783d8d52 100644 --- a/src/java/org/apache/cassandra/cql3/terms/Lists.java +++ b/src/java/org/apache/cassandra/cql3/terms/Lists.java @@ -27,6 +27,10 @@ import java.util.stream.Collectors; import java.util.stream.StreamSupport; +import com.google.common.annotations.VisibleForTesting; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import org.apache.cassandra.cql3.AssignmentTestable; import org.apache.cassandra.cql3.ColumnIdentifier; import org.apache.cassandra.cql3.ColumnSpecification; @@ -34,21 +38,23 @@ import org.apache.cassandra.cql3.QueryOptions; import org.apache.cassandra.cql3.UpdateParameters; import org.apache.cassandra.cql3.VariableSpecifications; +import org.apache.cassandra.db.DecoratedKey; import org.apache.cassandra.db.guardrails.Guardrails; -import org.apache.cassandra.db.marshal.MultiElementType; -import org.apache.cassandra.schema.ColumnMetadata; -import com.google.common.annotations.VisibleForTesting; -import org.apache.cassandra.db.*; -import org.apache.cassandra.db.rows.*; import org.apache.cassandra.db.marshal.AbstractType; import org.apache.cassandra.db.marshal.Int32Type; import org.apache.cassandra.db.marshal.ListType; +import org.apache.cassandra.db.marshal.MultiElementType; +import org.apache.cassandra.db.rows.Cell; +import org.apache.cassandra.db.rows.CellPath; +import org.apache.cassandra.db.rows.ComplexColumnData; +import org.apache.cassandra.db.rows.Row; import org.apache.cassandra.exceptions.InvalidRequestException; +import org.apache.cassandra.schema.ColumnMetadata; import org.apache.cassandra.utils.ByteBufferUtil; -import static org.apache.cassandra.cql3.terms.Constants.UNSET_VALUE; import static org.apache.cassandra.cql3.statements.RequestValidations.checkFalse; import static org.apache.cassandra.cql3.statements.RequestValidations.invalidRequest; +import static org.apache.cassandra.cql3.terms.Constants.UNSET_VALUE; import static org.apache.cassandra.utils.Clock.Global.currentTimeMillis; import static org.apache.cassandra.utils.TimeUUID.Generator.atUnixMillisAsBytes; @@ -57,6 +63,9 @@ */ public abstract class Lists { + @SuppressWarnings("unused") + private static final Logger logger = LoggerFactory.getLogger(Lists.class); + private Lists() {} public static ColumnSpecification indexSpecOf(ColumnSpecification column) @@ -288,6 +297,12 @@ public Setter(ColumnMetadata column, Term t) super(column, t); } + @Override + public boolean requiresTimestamp() + { + return true; + } + public void execute(DecoratedKey partitionKey, UpdateParameters params) throws InvalidRequestException { Term.Terminal value = t.bind(params.options); @@ -312,7 +327,7 @@ private static int existingSize(Row row, ColumnMetadata column) public static class SetterByIndex extends Operation { - private final Term idx; + public final Term idx; public SetterByIndex(ColumnMetadata column, Term idx, Term t) { @@ -379,6 +394,12 @@ public void execute(DecoratedKey partitionKey, UpdateParameters params) throws I doAppend(value, column, params); } + @Override + public boolean requiresTimestamp() + { + return true; + } + static void doAppend(Term.Terminal value, ColumnMetadata column, UpdateParameters params) throws InvalidRequestException { ListType type = (ListType) column.type; @@ -409,8 +430,8 @@ static void doAppend(Term.Terminal value, ColumnMetadata column, UpdateParameter int dataSize = 0; for (ByteBuffer buffer : elements) { - ByteBuffer uuid = ByteBuffer.wrap(params.nextTimeUUIDAsBytes()); - Cell cell = params.addCell(column, CellPath.create(uuid), buffer); + ByteBuffer cellPath = ByteBuffer.wrap(params.nextTimeUUIDAsBytes()); + Cell cell = params.addCell(column, CellPath.create(cellPath), buffer); dataSize += cell.dataSize(); } Guardrails.collectionListSize.guard(dataSize, column.name.toString(), false, params.clientState); @@ -431,6 +452,12 @@ public Prepender(ColumnMetadata column, Term t) super(column, t); } + @Override + public boolean requiresTimestamp() + { + return true; + } + public void execute(DecoratedKey partitionKey, UpdateParameters params) throws InvalidRequestException { assert column.type.isMultiCell() : "Attempted to prepend to a frozen list"; diff --git a/src/java/org/apache/cassandra/cql3/terms/Maps.java b/src/java/org/apache/cassandra/cql3/terms/Maps.java index 4355e8a3bf5c..b21d84bf81ed 100644 --- a/src/java/org/apache/cassandra/cql3/terms/Maps.java +++ b/src/java/org/apache/cassandra/cql3/terms/Maps.java @@ -267,7 +267,7 @@ public void execute(DecoratedKey partitionKey, UpdateParameters params) throws I public static class SetterByKey extends Operation { - private final Term k; + public final Term k; public SetterByKey(ColumnMetadata column, Term k, Term t) { diff --git a/src/java/org/apache/cassandra/cql3/terms/UserTypes.java b/src/java/org/apache/cassandra/cql3/terms/UserTypes.java index 08c6abb722cb..85b33efba2c3 100644 --- a/src/java/org/apache/cassandra/cql3/terms/UserTypes.java +++ b/src/java/org/apache/cassandra/cql3/terms/UserTypes.java @@ -255,7 +255,7 @@ public void execute(DecoratedKey partitionKey, UpdateParameters params) throws I public static class SetterByField extends Operation { - private final FieldIdentifier field; + public final FieldIdentifier field; public SetterByField(ColumnMetadata column, FieldIdentifier field, Term t) { diff --git a/src/java/org/apache/cassandra/cql3/transactions/ConditionStatement.java b/src/java/org/apache/cassandra/cql3/transactions/ConditionStatement.java new file mode 100644 index 000000000000..2ce6f3350236 --- /dev/null +++ b/src/java/org/apache/cassandra/cql3/transactions/ConditionStatement.java @@ -0,0 +1,148 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.cql3.transactions; + +import com.google.common.base.Preconditions; + +import org.apache.cassandra.cql3.ColumnSpecification; +import org.apache.cassandra.cql3.QueryOptions; +import org.apache.cassandra.cql3.terms.Term; +import org.apache.cassandra.cql3.VariableSpecifications; +import org.apache.cassandra.service.accord.txn.TxnCondition; + +public class ConditionStatement +{ + public enum Kind + { + IS_NOT_NULL(TxnCondition.Kind.IS_NOT_NULL, null), + IS_NULL(TxnCondition.Kind.IS_NULL, null), + EQ(TxnCondition.Kind.EQUAL, TxnCondition.Kind.EQUAL), + NEQ(TxnCondition.Kind.NOT_EQUAL, TxnCondition.Kind.NOT_EQUAL), + GT(TxnCondition.Kind.GREATER_THAN, TxnCondition.Kind.LESS_THAN), + GTE(TxnCondition.Kind.GREATER_THAN_OR_EQUAL, TxnCondition.Kind.LESS_THAN_OR_EQUAL), + LT(TxnCondition.Kind.LESS_THAN, TxnCondition.Kind.GREATER_THAN), + LTE(TxnCondition.Kind.LESS_THAN_OR_EQUAL, TxnCondition.Kind.GREATER_THAN_OR_EQUAL); + + // TODO: Support for IN, CONTAINS, CONTAINS KEY + + private final TxnCondition.Kind kind; + private final TxnCondition.Kind reversedKind; + + Kind(TxnCondition.Kind kind, TxnCondition.Kind reversedKind) + { + this.kind = kind; + this.reversedKind = reversedKind; + } + + TxnCondition.Kind toTxnKind(boolean reversed) + { + return reversed ? reversedKind : kind; + } + } + + private final RowDataReference reference; + private final Kind kind; + private final Term value; + private final boolean reversed; + + public ConditionStatement(RowDataReference reference, Kind kind, Term value, boolean reversed) + { + this.reference = reference; + this.kind = kind; + this.value = value; + this.reversed = reversed; + } + + public static class Raw + { + private final Term.Raw lhs; + private final Kind kind; + private final Term.Raw rhs; + + public Raw(Term.Raw lhs, Kind kind, Term.Raw rhs) + { + Preconditions.checkArgument(lhs != null); + Preconditions.checkArgument((rhs == null) == (kind == Kind.IS_NOT_NULL || kind == Kind.IS_NULL)); + this.lhs = lhs; + this.kind = kind; + this.rhs = rhs; + } + + public ConditionStatement prepare(String keyspace, VariableSpecifications bindVariables) + { + if (rhs == null) + { + // In the IS NULL/IS NOT NULL case, the reference will always be on the LHS + RowDataReference reference = ((RowDataReference.Raw) lhs).prepareAsReceiver(); + reference.collectMarkerSpecification(bindVariables); + return new ConditionStatement(reference, kind, null, false); + } + + RowDataReference reference; + Term value; + boolean reversed = false; + + if (lhs instanceof RowDataReference.Raw) + { + reference = ((RowDataReference.Raw) lhs).prepareAsReceiver(); + ColumnSpecification receiver = reference.getValueReceiver(); + value = rhs.prepare(keyspace, receiver); + } + else if (rhs instanceof RowDataReference.Raw) + { + reference = ((RowDataReference.Raw) rhs).prepareAsReceiver(); + ColumnSpecification receiver = reference.getValueReceiver(); + value = lhs.prepare(keyspace, receiver); + // TxnCondition expects the reference to be on the LHS, so reverse the operator. + reversed = true; + } + else + { + throw new IllegalStateException("Either the left-hand or right-hand side must be a reference!"); + } + + reference.collectMarkerSpecification(bindVariables); + value.collectMarkerSpecification(bindVariables); + return new ConditionStatement(reference, kind, value, reversed); + } + } + + public TxnCondition createCondition(QueryOptions options) + { + switch (kind) + { + case IS_NOT_NULL: + case IS_NULL: + return new TxnCondition.Exists(reference.toTxnReference(options), kind.toTxnKind(reversed)); + case EQ: + case NEQ: + case GT: + case GTE: + case LT: + case LTE: + // TODO: Support for references on LHS and RHS + return new TxnCondition.Value(reference.toTxnReference(options), + kind.toTxnKind(reversed), + value.bindAndGet(options), + options.getProtocolVersion()); + default: + throw new IllegalStateException(); + } + } +} diff --git a/src/java/org/apache/cassandra/cql3/transactions/ReferenceOperation.java b/src/java/org/apache/cassandra/cql3/transactions/ReferenceOperation.java new file mode 100644 index 000000000000..2b4b6f999f0f --- /dev/null +++ b/src/java/org/apache/cassandra/cql3/transactions/ReferenceOperation.java @@ -0,0 +1,180 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.cql3.transactions; + +import org.apache.cassandra.cql3.ColumnIdentifier; +import org.apache.cassandra.cql3.FieldIdentifier; +import org.apache.cassandra.cql3.Operation; +import org.apache.cassandra.cql3.QueryOptions; +import org.apache.cassandra.cql3.VariableSpecifications; +import org.apache.cassandra.cql3.terms.Lists; +import org.apache.cassandra.cql3.terms.Maps; +import org.apache.cassandra.cql3.terms.Term; +import org.apache.cassandra.cql3.terms.UserTypes; +import org.apache.cassandra.db.marshal.AbstractType; +import org.apache.cassandra.db.marshal.CollectionType; +import org.apache.cassandra.db.marshal.MapType; +import org.apache.cassandra.db.marshal.SetType; +import org.apache.cassandra.db.marshal.UserType; +import org.apache.cassandra.db.rows.CellPath; +import org.apache.cassandra.schema.ColumnMetadata; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.service.accord.txn.TxnReferenceOperation; +import org.apache.cassandra.utils.ByteBufferUtil; + +import static org.apache.cassandra.cql3.statements.RequestValidations.checkTrue; +import static org.apache.cassandra.db.marshal.CollectionType.Kind.MAP; +import static org.apache.cassandra.schema.TableMetadata.UNDEFINED_COLUMN_NAME_MESSAGE; + +public class ReferenceOperation +{ + private final ColumnMetadata receiver; + private final TableMetadata table; + private final TxnReferenceOperation.Kind kind; + private final FieldIdentifier field; + private final Term key; + private final ReferenceValue value; + + public ReferenceOperation(ColumnMetadata receiver, TableMetadata table, TxnReferenceOperation.Kind kind, Term key, FieldIdentifier field, ReferenceValue value) + { + this.receiver = receiver; + this.table = table; + this.kind = kind; + this.key = key; + this.field = field; + this.value = value; + } + + /** + * Creates a {@link ReferenceOperation} from the given {@link Operation} for the purpose of defering execution + * within a transaction. When the language sees an Operation using a reference one is created already, but for cases + * that needs to defer execution (such as when {@link Operation#requiresRead()} is true), this method can be used. + */ + public static ReferenceOperation create(Operation operation, TableMetadata table) + { + TxnReferenceOperation.Kind kind = TxnReferenceOperation.Kind.from(operation); + ColumnMetadata receiver = operation.column; + + // We already have a prepared reference value, so there is no need to inspect the value type. + ReferenceValue value = new ReferenceValue.Constant(operation.term()); + Term key = extractKeyOrIndex(operation); + FieldIdentifier field = extractField(operation); + return new ReferenceOperation(receiver, table, kind, key, field, value); + } + + public TxnReferenceOperation.Kind getKind() + { + return kind; + } + + public ReferenceValue getValue() + { + return value; + } + + public ColumnMetadata getReceiver() + { + return receiver; + } + + public boolean requiresRead() + { + // TODO: Find a better way than delegating to the operation? + return kind.toOperation(receiver, null, null, null).requiresRead(); + } + + public TxnReferenceOperation bindAndGet(QueryOptions options) + { + return new TxnReferenceOperation(kind, + receiver, table, + key != null ? key.bindAndGet(options) : null, + field != null ? field.bytes : null, + value.bindAndGet(options)); + } + + public static class Raw + { + private final Operation.RawUpdate rawUpdate; + public final ColumnIdentifier column; + private final ReferenceValue.Raw value; + + public Raw(Operation.RawUpdate rawUpdate, ColumnIdentifier column, ReferenceValue.Raw value) + { + this.rawUpdate = rawUpdate; + this.column = column; + this.value = value; + } + + public ReferenceOperation prepare(TableMetadata metadata, VariableSpecifications bindVariables) + { + ColumnMetadata receiver = metadata.getColumn(column); + Operation operation = rawUpdate.prepare(metadata, receiver, true); + TxnReferenceOperation.Kind kind = TxnReferenceOperation.Kind.from(operation); + Term key = extractKeyOrIndex(operation); + + checkTrue(receiver != null, UNDEFINED_COLUMN_NAME_MESSAGE, column.toCQLString(), metadata); + AbstractType type = receiver.type; + ColumnMetadata valueReceiver = receiver; + + if (type.isCollection()) + { + CollectionType collectionType = (CollectionType) type; + + // The value for a map subtraction is actually a set (see Operation.Substraction) + if (kind == TxnReferenceOperation.Kind.SetDiscarder && collectionType.kind == MAP) + valueReceiver = valueReceiver.withNewType(SetType.getInstance(((MapType) type).getKeysType(), true)); + + if (kind == TxnReferenceOperation.Kind.ListSetterByIndex || kind == TxnReferenceOperation.Kind.MapSetterByKey) + valueReceiver = valueReceiver.withNewType(collectionType.valueComparator()); + } + + FieldIdentifier field = extractField(operation); + + if (type.isUDT()) + { + if (kind == TxnReferenceOperation.Kind.UserTypeSetterByField) + { + @SuppressWarnings("ConstantConditions") UserType userType = (UserType) type; + CellPath fieldPath = userType.cellPathForField(field); + int i = ByteBufferUtil.getUnsignedShort(fieldPath.get(0), 0); + valueReceiver = valueReceiver.withNewType(userType.fieldType(i)); + } + } + + return new ReferenceOperation(receiver, metadata, kind, key, field, value.prepare(valueReceiver, bindVariables)); + } + } + + private static FieldIdentifier extractField(Operation operation) + { + if (operation instanceof UserTypes.SetterByField) + return ((UserTypes.SetterByField) operation).field; + return null; + } + + private static Term extractKeyOrIndex(Operation operation) + { + // TODO: Is there a way to do this without exposing k and idx? + if (operation instanceof Maps.SetterByKey) + return ((Maps.SetterByKey) operation).k; + else if (operation instanceof Lists.SetterByIndex) + return ((Lists.SetterByIndex) operation).idx; + return null; + } +} diff --git a/src/java/org/apache/cassandra/cql3/transactions/ReferenceValue.java b/src/java/org/apache/cassandra/cql3/transactions/ReferenceValue.java new file mode 100644 index 000000000000..d6a4ab8acf4a --- /dev/null +++ b/src/java/org/apache/cassandra/cql3/transactions/ReferenceValue.java @@ -0,0 +1,155 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.cql3.transactions; + +import org.apache.cassandra.cql3.ColumnSpecification; +import org.apache.cassandra.cql3.QueryOptions; +import org.apache.cassandra.cql3.terms.Term; +import org.apache.cassandra.cql3.VariableSpecifications; +import org.apache.cassandra.db.marshal.AbstractType; +import org.apache.cassandra.exceptions.InvalidRequestException; +import org.apache.cassandra.schema.ColumnMetadata; +import org.apache.cassandra.service.accord.txn.TxnReferenceValue; + +import static org.apache.cassandra.cql3.statements.RequestValidations.checkTrue; + +public abstract class ReferenceValue +{ + public abstract TxnReferenceValue bindAndGet(QueryOptions options); + + public static abstract class Raw extends Term.Raw + { + public abstract ReferenceValue prepare(ColumnMetadata receiver, VariableSpecifications bindVariables); + } + + public static class Constant extends ReferenceValue + { + private final Term term; + + public Constant(Term term) + { + this.term = term; + } + + @Override + public TxnReferenceValue bindAndGet(QueryOptions options) + { + return new TxnReferenceValue.Constant(term.bindAndGet(options)); + } + + public static class Raw extends ReferenceValue.Raw + { + private final Term.Raw term; + + public Raw(Term.Raw term) + { + this.term = term; + } + + @Override + public ReferenceValue prepare(ColumnMetadata receiver, VariableSpecifications bindVariables) + { + return new Constant(term.prepare(receiver.ksName, receiver)); + } + + @Override + public TestResult testAssignment(String keyspace, ColumnSpecification receiver) + { + return term.testAssignment(keyspace, receiver); + } + + @Override + public Term prepare(String keyspace, ColumnSpecification receiver) throws InvalidRequestException + { + return term.prepare(keyspace, receiver); + } + + @Override + public String getText() + { + return term.getText(); + } + + @Override + public AbstractType getExactTypeIfKnown(String keyspace) + { + return term.getExactTypeIfKnown(keyspace); + } + } + } + + public static class Substitution extends ReferenceValue + { + private final RowDataReference reference; + + public Substitution(RowDataReference reference) + { + this.reference = reference; + } + + @Override + public TxnReferenceValue bindAndGet(QueryOptions options) + { + return new TxnReferenceValue.Substitution(reference.toTxnReference(options)); + } + + public static class Raw extends ReferenceValue.Raw + { + private final RowDataReference.Raw reference; + + public Raw(RowDataReference.Raw reference) + { + this.reference = reference; + } + + + @Override + public ReferenceValue prepare(ColumnMetadata receiver, VariableSpecifications bindVariables) + { + reference.checkResolved(); + checkTrue(reference.column() != null, "substitution references must reference a column (%s)", reference); + return new Substitution((RowDataReference) reference.prepare(null, receiver)); + } + + @Override + public TestResult testAssignment(String keyspace, ColumnSpecification receiver) + { + return reference.testAssignment(keyspace, receiver); + } + + @Override + public Term prepare(String keyspace, ColumnSpecification receiver) throws InvalidRequestException + { + return reference.prepare(keyspace, receiver); + } + + @Override + public String getText() + { + return reference.getText(); + } + + @Override + public AbstractType getExactTypeIfKnown(String keyspace) + { + return reference.getExactTypeIfKnown(keyspace); + } + } + } +} diff --git a/src/java/org/apache/cassandra/cql3/transactions/RowDataReference.java b/src/java/org/apache/cassandra/cql3/transactions/RowDataReference.java new file mode 100644 index 000000000000..038bd459633d --- /dev/null +++ b/src/java/org/apache/cassandra/cql3/transactions/RowDataReference.java @@ -0,0 +1,419 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.cql3.transactions; + +import java.util.List; +import java.util.Map; + +import com.google.common.base.Preconditions; + +import org.apache.cassandra.cql3.AssignmentTestable; +import org.apache.cassandra.cql3.ColumnIdentifier; +import org.apache.cassandra.cql3.ColumnSpecification; +import org.apache.cassandra.cql3.FieldIdentifier; +import org.apache.cassandra.cql3.QueryOptions; +import org.apache.cassandra.cql3.VariableSpecifications; +import org.apache.cassandra.cql3.functions.Function; +import org.apache.cassandra.cql3.functions.types.utils.Bytes; +import org.apache.cassandra.cql3.selection.Selectable; +import org.apache.cassandra.cql3.terms.Lists; +import org.apache.cassandra.cql3.terms.Maps; +import org.apache.cassandra.cql3.terms.Sets; +import org.apache.cassandra.cql3.terms.Term; +import org.apache.cassandra.cql3.terms.UserTypes; +import org.apache.cassandra.db.marshal.AbstractType; +import org.apache.cassandra.db.marshal.CollectionType; +import org.apache.cassandra.db.marshal.ListType; +import org.apache.cassandra.db.marshal.MapType; +import org.apache.cassandra.db.marshal.SetType; +import org.apache.cassandra.db.marshal.UserType; +import org.apache.cassandra.db.rows.CellPath; +import org.apache.cassandra.exceptions.InvalidRequestException; +import org.apache.cassandra.schema.ColumnMetadata; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.service.accord.txn.TxnReference; +import org.apache.cassandra.utils.ByteBufferUtil; + +import static org.apache.cassandra.cql3.statements.RequestValidations.checkNotNull; + +public class RowDataReference extends Term.NonTerminal +{ + public static final String CANNOT_FIND_TUPLE_MESSAGE = "Cannot resolve reference to tuple '%s'."; + public static final String COLUMN_NOT_IN_TUPLE_MESSAGE = "Column '%s' does not exist in tuple '%s'."; + + private final String selectName; + private final int txnDataName; + private final ColumnMetadata column; + private final TableMetadata table; + private final Term elementPath; + private final CellPath fieldPath; + + public RowDataReference(String selectName, int txnDataName, ColumnMetadata column, TableMetadata table, Term elementPath, CellPath fieldPath) + { + Preconditions.checkArgument(elementPath == null || fieldPath == null, "Cannot specify both element and field paths"); + + this.selectName = selectName; + this.txnDataName = txnDataName; + this.column = column; + this.table = table; + this.elementPath = elementPath; + this.fieldPath = fieldPath; + } + + @Override + public void collectMarkerSpecification(VariableSpecifications boundNames) + { + if (elementPath != null) + elementPath.collectMarkerSpecification(boundNames); + } + + @Override + public Terminal bind(QueryOptions options) throws InvalidRequestException + { + throw new UnsupportedOperationException(); + } + + @Override + public boolean containsBindMarker() + { + return elementPath != null && elementPath.containsBindMarker(); + } + + @Override + public void addFunctionsTo(List functions) + { + throw new UnsupportedOperationException("Functions are not currently supported w/ reference terms."); + } + + public ColumnMetadata toResultMetadata() + { + ColumnIdentifier fullName = getFullyQualifiedName(); + ColumnMetadata forMetadata = column.withNewName(fullName); + + if (isElementSelection()) + { + if (forMetadata.type instanceof ListType) + forMetadata = forMetadata.withNewType(((ListType) forMetadata.type).valueComparator()); + else if (forMetadata.type instanceof SetType) + forMetadata = forMetadata.withNewType(((SetType) forMetadata.type).nameComparator()); + else if (forMetadata.type instanceof MapType) + forMetadata = forMetadata.withNewType(((MapType) forMetadata.type).valueComparator()); + } + else if (isFieldSelection()) + { + forMetadata = forMetadata.withNewType(getFieldSelectionType()); + } + return forMetadata; + } + + public ColumnSpecification getValueReceiver() + { + if (isElementSelection()) + { + CollectionType.Kind collectionKind = ((CollectionType) column.type).kind; + switch (collectionKind) + { + case LIST: + return Lists.valueSpecOf(column); + case MAP: + return Maps.valueSpecOf(column); + default: + throw new InvalidRequestException(String.format("Element selection not supported for column %s of type %s" , + column.name, collectionKind)); + } + } + else if (isFieldSelection()) + { + return getFieldSelectionSpec(); + } + + return column; + } + + public boolean isElementSelection() + { + return elementPath != null && column.type.isCollection(); + } + + public boolean isFieldSelection() + { + return fieldPath != null && column.type.isUDT(); + } + + private AbstractType getFieldSelectionType() + { + assert isFieldSelection() : "No field selection type exists"; + return getFieldSelectionType(column, fieldPath); + } + + private static AbstractType getFieldSelectionType(ColumnMetadata column, CellPath fieldPath) + { + return ((UserType) column.type).fieldType(fieldPath); + } + + public ColumnSpecification getFieldSelectionSpec() + { + assert isFieldSelection() : "No field selection type exists"; + int field = ByteBufferUtil.getUnsignedShort(fieldPath.get(0), 0); + return UserTypes.fieldSpecOf(column, field); + } + + private CellPath bindCellPath(QueryOptions options) + { + if (fieldPath != null) + return fieldPath; + + return elementPath != null ? CellPath.create(elementPath.bindAndGet(options)) : null; + } + + public TxnReference toTxnReference(QueryOptions options) + { + Preconditions.checkState(elementPath == null || column.isComplex() || column.type.isFrozenCollection()); + Preconditions.checkState(fieldPath == null || column.isComplex() || column.type.isUDT()); + + return new TxnReference(txnDataName, table, column, bindCellPath(options)); + } + + public ColumnIdentifier getFullyQualifiedName() + { + // TODO: Make this more user-friendly... + String path = fieldPath != null ? '.' + Bytes.toHexString(fieldPath.get(0)) : (elementPath == null ? "" : "[0x" + elementPath + ']'); + String fullName = selectName + '.' + column.name.toString() + path; + return new ColumnIdentifier(fullName, true); + } + + public ColumnMetadata column() + { + return column; + } + + public TableMetadata table() + { + return table; + } + + public static class Raw extends Term.Raw + { + private final Selectable.RawIdentifier tuple; + private final Selectable.RawIdentifier selected; + private final Object fieldOrElement; + + private boolean isResolved = false; + + private int tupleName; + private ColumnMetadata column; + private TableMetadata table; + private Term elementPath = null; + private CellPath fieldPath = null; + + public Raw(Selectable.RawIdentifier tuple, Selectable.Raw selected, Object fieldOrElement) + { + Preconditions.checkArgument(tuple != null, "tuple is null"); + Preconditions.checkArgument(selected == null || selected instanceof Selectable.RawIdentifier, "selected is not a Selectable.RawIdentifier: " + selected); + this.tuple = tuple; + this.selected = (Selectable.RawIdentifier) selected; + this.fieldOrElement = fieldOrElement; + } + + public static Raw fromSelectable(Selectable.RawIdentifier tuple, Selectable.Raw selectable) + { + if (selectable == null) + return new RowDataReference.Raw(tuple, null, null); + + // TODO: Ideally it would be nice not to have to make items in the Selectables public + if (selectable instanceof Selectable.WithFieldSelection.Raw) + { + Selectable.WithFieldSelection.Raw selection = (Selectable.WithFieldSelection.Raw) selectable; + return new RowDataReference.Raw(tuple, selection.selected, selection.field); + } + else if (selectable instanceof Selectable.WithElementSelection.Raw) + { + Selectable.WithElementSelection.Raw elementSelection = (Selectable.WithElementSelection.Raw) selectable; + return new RowDataReference.Raw(tuple, elementSelection.selected, elementSelection.element); + } + else if (selectable instanceof Selectable.RawIdentifier) + { + Selectable.RawIdentifier selection = (Selectable.RawIdentifier) selectable; + return new RowDataReference.Raw(tuple, selection, null); + } + + throw new UnsupportedOperationException("Cannot create column reference from selectable: " + selectable); + } + + private void resolveFinished() + { + isResolved = true; + } + + public void resolveReference(Map sources, Map nameToTxnDataName, int userReadIndex) + { + if (isResolved) + return; + + // root level name, use the one that was already created if it exists otherwise generate a new one + String rawTupleName = tuple.getText(); + tupleName = nameToTxnDataName.getOrDefault(rawTupleName, userReadIndex); + ReferenceSource source = sources.get(tupleName); + checkNotNull(source, CANNOT_FIND_TUPLE_MESSAGE, rawTupleName); + + if (selected == null) + { + resolveFinished(); + return; + } + + column = source.getColumn(selected.toString()); + table = source.getTable(); + checkNotNull(column, COLUMN_NOT_IN_TUPLE_MESSAGE, selected.toString(), rawTupleName); + + // TODO: confirm update partition key terms don't contain column references. This can't be done in prepare + // because there can be intermediate functions (ie: pk=row.v+1 or pk=_add(row.v, 5)). Need a recursive Term visitor + + if (fieldOrElement == null) + { + resolveFinished(); + return; + } + + if (column.type.isCollection()) + { + Term.Raw element = (Term.Raw) fieldOrElement; + elementPath = element.prepare(column.ksName, specForElementOrSlice(column)); + } + else if (column.type.isUDT()) + { + FieldIdentifier field = (FieldIdentifier) fieldOrElement; + UserType userType = (UserType) column.type; + fieldPath = userType.cellPathForField(field); + } + + resolveFinished(); + } + + private ColumnSpecification specForElementOrSlice(ColumnSpecification receiver) + { + switch (((CollectionType) receiver.type).kind) + { + case LIST: return Lists.indexSpecOf(receiver); + case SET: return Sets.valueSpecOf(receiver); + case MAP: return Maps.keySpecOf(receiver); + default: throw new AssertionError("Unknown collection type: " + receiver.type); + } + } + + public void checkResolved() + { + if (!isResolved) + throw new IllegalStateException(); + } + + @Override + public AssignmentTestable.TestResult testAssignment(String keyspace, ColumnSpecification receiver) + { + checkResolved(); + + AbstractType type = column.type; + + if (elementPath != null) + { + CollectionType collectionType = (CollectionType) type; + type = collectionType.kind == CollectionType.Kind.SET ? collectionType.nameComparator() : collectionType.valueComparator(); + } + else if (fieldPath != null) + { + type = RowDataReference.getFieldSelectionType(column, fieldPath); + } + + return type.testAssignment(receiver.type); + } + + @Override + public Term prepare(String keyspace, ColumnSpecification receiver) throws InvalidRequestException + { + return prepare(keyspace, receiver, tupleName, column, elementPath, fieldPath); + } + + public RowDataReference prepareAsReceiver() + { + checkResolved(); + return new RowDataReference(tuple.toString(), tupleName, column, table, elementPath, fieldPath); + } + + private RowDataReference prepare(String keyspace, + ColumnSpecification receiver, + int txnDataName, + ColumnMetadata column, + Term elementPath, + CellPath fieldPath) + { + if (!testAssignment(keyspace, receiver).isAssignable()) + throw new InvalidRequestException(String.format("Invalid reference type %s (%s) for \"%s\" of type %s", + column.type, column.name, receiver.name, receiver.type.asCQL3Type())); + + return new RowDataReference(tuple.toString(), txnDataName, column, table, elementPath, fieldPath); + } + + @Override + public String getText() + { + StringBuilder text = new StringBuilder(tuple.toString()); + + if (selected != null) + text.append('.').append(selected); + + if (fieldOrElement != null) + { + if (fieldOrElement instanceof Term.Raw) + { + Term.Raw element = (Term.Raw) fieldOrElement; + text.append('.').append(element.getText()); + } + else if (fieldOrElement instanceof FieldIdentifier) + { + FieldIdentifier field = (FieldIdentifier) fieldOrElement; + text.append('.').append(field); + } + else + { + throw new IllegalStateException("Field or element is neither a raw term nor a field identifier"); + } + } + + return text.toString(); + } + + @Override + public AbstractType getExactTypeIfKnown(String keyspace) + { + checkResolved(); + return column.type; + } + + public ColumnMetadata column() + { + return column; + } + } + + public interface ReferenceSource + { + ColumnMetadata getColumn(String name); + TableMetadata getTable(); + } +} diff --git a/src/java/org/apache/cassandra/cql3/transactions/SelectReferenceSource.java b/src/java/org/apache/cassandra/cql3/transactions/SelectReferenceSource.java new file mode 100644 index 000000000000..b8cca735899b --- /dev/null +++ b/src/java/org/apache/cassandra/cql3/transactions/SelectReferenceSource.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.cql3.transactions; + +import java.util.HashSet; +import java.util.Set; + +import org.apache.cassandra.cql3.ColumnIdentifier; +import org.apache.cassandra.cql3.statements.SelectStatement; +import org.apache.cassandra.schema.ColumnMetadata; +import org.apache.cassandra.schema.TableMetadata; + +import static org.apache.cassandra.cql3.statements.RequestValidations.checkTrue; + +public class SelectReferenceSource implements RowDataReference.ReferenceSource +{ + public static final String COLUMN_NOT_IN_SELECT_MESSAGE = "%s references a column not included in the select"; + + private final SelectStatement statement; + + public SelectReferenceSource(SelectStatement statement) + { + this.statement = statement; + } + + @Override + public ColumnMetadata getColumn(String name) + { + ColumnMetadata column = statement.table.getColumn(new ColumnIdentifier(name, true)); + if (column != null) + { + Set selectedColumns = new HashSet<>(statement.getSelection().getColumns()); + checkTrue(selectedColumns.contains(column), COLUMN_NOT_IN_SELECT_MESSAGE, statement); + } + return column; + } + + public TableMetadata getTable() + { + return statement.table; + } +} \ No newline at end of file diff --git a/src/java/org/apache/cassandra/db/AbstractMutationVerbHandler.java b/src/java/org/apache/cassandra/db/AbstractMutationVerbHandler.java index cfea7eb45c5f..05512e5a8e3f 100644 --- a/src/java/org/apache/cassandra/db/AbstractMutationVerbHandler.java +++ b/src/java/org/apache/cassandra/db/AbstractMutationVerbHandler.java @@ -27,18 +27,23 @@ import org.apache.cassandra.db.partitions.PartitionUpdate; import org.apache.cassandra.exceptions.CoordinatorBehindException; import org.apache.cassandra.exceptions.InvalidRoutingException; +import org.apache.cassandra.exceptions.RetryOnDifferentSystemException; import org.apache.cassandra.locator.InetAddressAndPort; import org.apache.cassandra.metrics.TCMMetrics; import org.apache.cassandra.net.IVerbHandler; import org.apache.cassandra.net.Message; +import org.apache.cassandra.net.MessagingService; import org.apache.cassandra.schema.SchemaConstants; import org.apache.cassandra.service.StorageService; import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.tcm.ClusterMetadataService; import org.apache.cassandra.tcm.Epoch; import org.apache.cassandra.tcm.ownership.VersionedEndpoints; +import org.apache.cassandra.tracing.Tracing; import org.apache.cassandra.utils.NoSpamLogger; +import static org.apache.cassandra.exceptions.RequestFailureReason.RETRY_ON_DIFFERENT_TRANSACTION_SYSTEM; + public abstract class AbstractMutationVerbHandler implements IVerbHandler { private static final Logger logger = LoggerFactory.getLogger(AbstractMutationVerbHandler.class); @@ -54,15 +59,25 @@ protected void processMessage(Message message, InetAddressAndPort respondTo) if (message.epoch().isAfter(Epoch.EMPTY)) { ClusterMetadata metadata = ClusterMetadata.current(); - metadata = checkTokenOwnership(metadata, message); - metadata = checkSchemaVersion(metadata, message); + metadata = checkTokenOwnership(metadata, message, respondTo); + metadata = checkSchemaVersion(metadata, message, respondTo); + } + + try + { + applyMutation(message, respondTo); + } + catch (RetryOnDifferentSystemException e) + { + logger.debug("Responding with retry on different system"); + MessagingService.instance().respondWithFailure(RETRY_ON_DIFFERENT_TRANSACTION_SYSTEM, message); + Tracing.trace("Payload application resulted in RetryOnDifferentSysten"); } - applyMutation(message, respondTo); } abstract void applyMutation(Message message, InetAddressAndPort respondToAddress); - private ClusterMetadata checkTokenOwnership(ClusterMetadata metadata, Message message) + private ClusterMetadata checkTokenOwnership(ClusterMetadata metadata, Message message, InetAddressAndPort respondTo) { String keyspace = message.payload.getKeyspaceName(); DecoratedKey key = message.payload.key(); @@ -75,13 +90,13 @@ private ClusterMetadata checkTokenOwnership(ClusterMetadata metadata, Message // since coordinator's routing may be more recent. if (!forToken.get().containsSelf()) { - metadata = ClusterMetadataService.instance().fetchLogFromPeerOrCMS(metadata, message.from(), message.epoch()); + metadata = ClusterMetadataService.instance().fetchLogFromPeerOrCMS(metadata, respondTo, message.epoch()); forToken = writePlacements(metadata, keyspace, key); } // Otherwise, coordinator and the replica agree about the placement of the givent token, so catch-up can be async else { - ClusterMetadataService.instance().fetchLogFromPeerOrCMSAsync(metadata, message.from(), message.epoch()); + ClusterMetadataService.instance().fetchLogFromPeerOrCMSAsync(metadata, respondTo, message.epoch()); } } @@ -89,8 +104,8 @@ private ClusterMetadata checkTokenOwnership(ClusterMetadata metadata, Message { StorageService.instance.incOutOfRangeOperationCount(); Keyspace.open(message.payload.getKeyspaceName()).metric.outOfRangeTokenWrites.inc(); - NoSpamLogger.log(logger, NoSpamLogger.Level.WARN, 1, TimeUnit.SECONDS, logMessageTemplate, message.from(), key.getToken(), message.payload.getKeyspaceName()); - throw InvalidRoutingException.forWrite(message.from(), key.getToken(), metadata.epoch, message.payload); + NoSpamLogger.log(logger, NoSpamLogger.Level.WARN, 1, TimeUnit.SECONDS, logMessageTemplate, respondTo, key.getToken(), message.payload.getKeyspaceName()); + throw InvalidRoutingException.forWrite(respondTo, key.getToken(), metadata.epoch, message.payload); } if (forToken.lastModified().isAfter(message.epoch())) @@ -103,7 +118,7 @@ private ClusterMetadata checkTokenOwnership(ClusterMetadata metadata, Message return metadata; } - private ClusterMetadata checkSchemaVersion(ClusterMetadata metadata, Message message) + private ClusterMetadata checkSchemaVersion(ClusterMetadata metadata, Message message, InetAddressAndPort respondTo) { if (SchemaConstants.isSystemKeyspace(message.payload.getKeyspaceName()) || message.epoch().is(metadata.epoch)) return metadata; @@ -121,10 +136,10 @@ private ClusterMetadata checkSchemaVersion(ClusterMetadata metadata, Message { // the partition update was serialized after the epoch we currently know, catch up and // make sure we've seen the epoch it has seen, otherwise fail request. - metadata = ClusterMetadataService.instance().fetchLogFromPeerOrCMS(metadata, message.from(), message.epoch()); + metadata = ClusterMetadataService.instance().fetchLogFromPeerOrCMS(metadata, respondTo, message.epoch()); if (pu.serializedAtEpoch.isAfter(metadata.epoch)) throw new IllegalStateException(String.format("Coordinator %s is still ahead after fetching log, our epoch = %s, their epoch = %s", - message.from(), + respondTo, metadata.epoch, message.epoch())); } } @@ -143,7 +158,7 @@ else if (message.epoch().isBefore(metadata.schema.lastModified())) { TCMMetrics.instance.coordinatorBehindSchema.mark(); throw new CoordinatorBehindException(String.format("Coordinator %s is behind, our epoch = %s, their epoch = %s", - message.from(), + respondTo, metadata.epoch, message.epoch())); } } @@ -151,7 +166,7 @@ else if (message.epoch().isBefore(metadata.schema.lastModified())) { TCMMetrics.instance.coordinatorBehindSchema.mark(); throw new CoordinatorBehindException(String.format("Schema mismatch, coordinator %s is behind, we're missing table %s.%s, our epoch = %s, their epoch = %s", - message.from(), + respondTo, pu.metadata().keyspace, pu.metadata().name, metadata.epoch, message.epoch())); @@ -165,13 +180,13 @@ else if (message.epoch().isBefore(metadata.schema.lastModified())) { TCMMetrics.instance.coordinatorBehindSchema.mark(); throw new CoordinatorBehindException(String.format("Schema mismatch, coordinator %s is behind, we're missing keyspace %s, our epoch = %s, their epoch = %s", - message.from(), + respondTo, keyspace, metadata.epoch, message.epoch())); } else { - metadata = ClusterMetadataService.instance().fetchLogFromPeerOrCMS(metadata, message.from(), message.epoch()); + metadata = ClusterMetadataService.instance().fetchLogFromPeerOrCMS(metadata, respondTo, message.epoch()); } } diff --git a/src/java/org/apache/cassandra/db/AbstractReadQuery.java b/src/java/org/apache/cassandra/db/AbstractReadQuery.java index 448069cfca10..2e72c7ec4fc5 100644 --- a/src/java/org/apache/cassandra/db/AbstractReadQuery.java +++ b/src/java/org/apache/cassandra/db/AbstractReadQuery.java @@ -118,4 +118,16 @@ public String toCQLString() } protected abstract void appendCQLWhereClause(StringBuilder sb); + + @Override + public String monitoredOnKeyspace() + { + return metadata().keyspace; + } + + @Override + public String monitoredOnTable() + { + return metadata().name; + } } \ No newline at end of file diff --git a/src/java/org/apache/cassandra/db/ArrayClustering.java b/src/java/org/apache/cassandra/db/ArrayClustering.java index b04910c434cb..752b3db99517 100644 --- a/src/java/org/apache/cassandra/db/ArrayClustering.java +++ b/src/java/org/apache/cassandra/db/ArrayClustering.java @@ -48,6 +48,12 @@ public long unsharedHeapSizeExcludingData() return EMPTY_SIZE + ObjectSizes.sizeOfArray(values); } + @Override + public Clustering ensureAccessorFactorySupport() + { + return this; + } + public static ArrayClustering make(byte[]... values) { return new ArrayClustering(values); diff --git a/src/java/org/apache/cassandra/db/BufferClustering.java b/src/java/org/apache/cassandra/db/BufferClustering.java index 6cacbd14c910..205036505220 100644 --- a/src/java/org/apache/cassandra/db/BufferClustering.java +++ b/src/java/org/apache/cassandra/db/BufferClustering.java @@ -59,4 +59,10 @@ public static BufferClustering make(ByteBuffer... values) { return new BufferClustering(values); } + + @Override + public Clustering ensureAccessorFactorySupport() + { + return this; + } } diff --git a/src/java/org/apache/cassandra/db/CassandraKeyspaceWriteHandler.java b/src/java/org/apache/cassandra/db/CassandraKeyspaceWriteHandler.java index ef9d0d137778..825ef12292cd 100644 --- a/src/java/org/apache/cassandra/db/CassandraKeyspaceWriteHandler.java +++ b/src/java/org/apache/cassandra/db/CassandraKeyspaceWriteHandler.java @@ -86,10 +86,10 @@ private CommitLogPosition addToCommitLog(Mutation mutation) Set ids = new HashSet<>(); for (PartitionUpdate update : mutation.getPartitionUpdates()) { - if (update.metadata().params.memtable.factory().writesShouldSkipCommitLog()) + if (!update.metadata().params.memtable.factory().writesShouldSkipCommitLog()) ids.add(update.metadata().id); } - mutation = mutation.without(ids); + mutation = mutation.filter(ids::contains); } } // Note: It may be a good idea to precalculate none/all for the set of all tables in the keyspace, diff --git a/src/java/org/apache/cassandra/db/Clustering.java b/src/java/org/apache/cassandra/db/Clustering.java index 426d3279f97d..3e42e4a361b7 100644 --- a/src/java/org/apache/cassandra/db/Clustering.java +++ b/src/java/org/apache/cassandra/db/Clustering.java @@ -190,4 +190,11 @@ public Clustering deserialize(ByteBuffer in, int version, List ensureAccessorFactorySupport(); } \ No newline at end of file diff --git a/src/java/org/apache/cassandra/db/ClusteringPrefix.java b/src/java/org/apache/cassandra/db/ClusteringPrefix.java index 167d89c6a485..c7687bb80f3e 100644 --- a/src/java/org/apache/cassandra/db/ClusteringPrefix.java +++ b/src/java/org/apache/cassandra/db/ClusteringPrefix.java @@ -258,6 +258,25 @@ default boolean isEmpty() */ public V get(int i); + /** + * The method is introduced to allow to avoid a value object retrieval/allocation for simple checks + */ + public default boolean isNull(int i) + { + return get(i) == null; + } + + /** + * The method is introduced to allow to avoid a value object retrieval/allocation for simple checks + */ + public default boolean isEmpty(int i) + { + V v = get(i); + if (v == null) + return true; + return accessor().isEmpty(v); + } + public ValueAccessor accessor(); default ByteBuffer bufferAt(int i) @@ -402,7 +421,7 @@ public default String clusteringString(List> types) * memory (i.e. in memtables) to minimized on-heap versions. * If the object is already in minimal form, no action will be taken. */ - public ClusteringPrefix retainable(); + public ClusteringPrefix retainable(); public static class Serializer { @@ -549,14 +568,12 @@ void skipValuesWithoutSize(DataInputPlus in, int size, int version, List long makeHeader(ClusteringPrefix clustering, int offset, int limit) { long header = 0; - ValueAccessor accessor = clustering.accessor(); for (int i = offset ; i < limit ; i++) { - V v = clustering.get(i); // no need to do modulo arithmetic for i, since the left-shift execute on the modulus of RH operand by definition - if (v == null) + if (clustering.isNull(i)) header |= (1L << (i * 2) + 1); - else if (accessor.isEmpty(v)) + else if (clustering.isEmpty(i)) header |= (1L << (i * 2)); } return header; diff --git a/src/java/org/apache/cassandra/db/ColumnFamilyStore.java b/src/java/org/apache/cassandra/db/ColumnFamilyStore.java index 91a3a40152af..e81c0d7be8e6 100644 --- a/src/java/org/apache/cassandra/db/ColumnFamilyStore.java +++ b/src/java/org/apache/cassandra/db/ColumnFamilyStore.java @@ -236,6 +236,8 @@ public enum FlushReason ANTICOMPACTION, SCHEMA_CHANGE, OWNED_RANGES_CHANGE, + ACCORD, + ACCORD_TXN_GC, UNIT_TESTS // explicitly requested flush needed for a test } @@ -512,7 +514,7 @@ public ColumnFamilyStore(Keyspace keyspace, { Directories.SSTableLister sstableFiles = directories.sstableLister(Directories.OnTxnErr.IGNORE).skipTemporary(true); sstables = SSTableReader.openAll(this, sstableFiles.list().entrySet(), metadata); - data.addInitialSSTablesWithoutUpdatingSize(sstables, this); + data.addInitialSSTablesWithoutUpdatingSize(sstables); } // compaction strategy should be created after the CFS has been prepared @@ -3338,4 +3340,9 @@ public TableMetrics getMetrics() { return metric; } + + public TableId getTableId() + { + return metadata().id; + } } diff --git a/src/java/org/apache/cassandra/db/Columns.java b/src/java/org/apache/cassandra/db/Columns.java index 60852c5c5667..3e014d3254f7 100644 --- a/src/java/org/apache/cassandra/db/Columns.java +++ b/src/java/org/apache/cassandra/db/Columns.java @@ -61,6 +61,7 @@ public class Columns extends AbstractCollection implements Colle "", ColumnIdentifier.getInterned(ByteBufferUtil.EMPTY_BYTE_BUFFER, UTF8Type.instance), SetType.getInstance(UTF8Type.instance, true), + ColumnMetadata.NO_UNIQUE_ID, ColumnMetadata.NO_POSITION, ColumnMetadata.Kind.STATIC, null, @@ -71,6 +72,7 @@ public class Columns extends AbstractCollection implements Colle "", ColumnIdentifier.getInterned(ByteBufferUtil.EMPTY_BYTE_BUFFER, UTF8Type.instance), SetType.getInstance(UTF8Type.instance, true), + ColumnMetadata.NO_UNIQUE_ID, ColumnMetadata.NO_POSITION, ColumnMetadata.Kind.REGULAR, null, @@ -121,6 +123,12 @@ public static Columns from(Row row) } public static Columns from(BTree.Builder builder) + { + Object[] tree = builder.build(); + return new Columns(tree, findFirstComplexIdx(tree)); + } + + public static Columns from(BTree.FastBuilder builder) { Object[] tree = builder.build(); return new Columns(tree, findFirstComplexIdx(tree)); diff --git a/src/java/org/apache/cassandra/db/CounterMutation.java b/src/java/org/apache/cassandra/db/CounterMutation.java index 14d7a010ffe2..c5ed2364be5d 100644 --- a/src/java/org/apache/cassandra/db/CounterMutation.java +++ b/src/java/org/apache/cassandra/db/CounterMutation.java @@ -18,10 +18,15 @@ package org.apache.cassandra.db; import java.io.IOException; -import java.util.*; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Iterator; +import java.util.List; import java.util.concurrent.TimeUnit; import java.util.concurrent.locks.Lock; +import java.util.function.Predicate; import java.util.function.Supplier; +import javax.annotation.Nullable; import com.google.common.base.Function; import com.google.common.base.Objects; @@ -31,11 +36,17 @@ import com.google.common.util.concurrent.Striped; import org.apache.cassandra.config.DatabaseDescriptor; -import org.apache.cassandra.db.marshal.ByteBufferAccessor; -import org.apache.cassandra.db.rows.*; -import org.apache.cassandra.db.filter.*; -import org.apache.cassandra.db.partitions.*; +import org.apache.cassandra.db.ReadCommand.PotentialTxnConflicts; import org.apache.cassandra.db.context.CounterContext; +import org.apache.cassandra.db.filter.ClusteringIndexNamesFilter; +import org.apache.cassandra.db.filter.ColumnFilter; +import org.apache.cassandra.db.marshal.ByteBufferAccessor; +import org.apache.cassandra.db.partitions.PartitionUpdate; +import org.apache.cassandra.db.rows.Cell; +import org.apache.cassandra.db.rows.ColumnData; +import org.apache.cassandra.db.rows.Row; +import org.apache.cassandra.db.rows.RowIterator; +import org.apache.cassandra.db.rows.UnfilteredRowIterators; import org.apache.cassandra.exceptions.WriteTimeoutException; import org.apache.cassandra.io.IVersionedSerializer; import org.apache.cassandra.io.util.DataInputPlus; @@ -44,10 +55,11 @@ import org.apache.cassandra.schema.TableId; import org.apache.cassandra.service.CacheService; import org.apache.cassandra.tracing.Tracing; -import org.apache.cassandra.utils.*; +import org.apache.cassandra.utils.CounterId; +import org.apache.cassandra.utils.FBUtilities; import org.apache.cassandra.utils.btree.BTreeSet; -import static java.util.concurrent.TimeUnit.*; +import static java.util.concurrent.TimeUnit.NANOSECONDS; import static org.apache.cassandra.net.MessagingService.VERSION_40; import static org.apache.cassandra.net.MessagingService.VERSION_50; import static org.apache.cassandra.net.MessagingService.VERSION_51; @@ -83,6 +95,12 @@ public Collection getPartitionUpdates() return mutation.getPartitionUpdates(); } + @Override + public boolean hasUpdateForTable(TableId tableId) + { + return mutation.hasUpdateForTable(tableId); + } + @Override public Supplier hintOnFailure() { @@ -156,6 +174,27 @@ public void apply() applyCounterMutation(); } + @Override + public @Nullable CounterMutation filter(Predicate test) + { + Mutation m = mutation.filter(test); + if (m == null) + return null; + if (m == mutation) + return this; + return new CounterMutation(m, consistency); + } + + /* + * Accord currently doesn't support interoperability with counters so no Accord transactions should read them + * anyways and it's safe to continue non-transactionally updating them + */ + @Override + public PotentialTxnConflicts potentialTxnConflicts() + { + return PotentialTxnConflicts.ALLOW; + } + private void grabCounterLocks(Keyspace keyspace, List locks) throws WriteTimeoutException { long startTime = nanoTime(); diff --git a/src/java/org/apache/cassandra/db/DecoratedKey.java b/src/java/org/apache/cassandra/db/DecoratedKey.java index 03d6374112a4..a974de81d645 100644 --- a/src/java/org/apache/cassandra/db/DecoratedKey.java +++ b/src/java/org/apache/cassandra/db/DecoratedKey.java @@ -90,6 +90,11 @@ public int compareTo(PartitionPosition pos) return cmp == 0 ? ByteBufferUtil.compareUnsigned(getKey(), otherKey.getKey()) : cmp; } + public int compareBytesOnly(DecoratedKey that) + { + return ByteBufferUtil.compareUnsigned(getKey(), that.getKey()); + } + public static int compareTo(IPartitioner partitioner, ByteBuffer key, PartitionPosition position) { // delegate to Token.KeyBound if needed @@ -97,7 +102,7 @@ public static int compareTo(IPartitioner partitioner, ByteBuffer key, PartitionP return -position.compareTo(partitioner.decorateKey(key)); DecoratedKey otherKey = (DecoratedKey) position; - int cmp = partitioner.getToken(key).compareTo(otherKey.getToken()); + int cmp = partitioner.compareToken(key, otherKey.getToken()); return cmp == 0 ? ByteBufferUtil.compareUnsigned(key, otherKey.getKey()) : cmp; } diff --git a/src/java/org/apache/cassandra/db/DiskBoundaryManager.java b/src/java/org/apache/cassandra/db/DiskBoundaryManager.java index 5c6b59a0b165..4aad348e4ac4 100644 --- a/src/java/org/apache/cassandra/db/DiskBoundaryManager.java +++ b/src/java/org/apache/cassandra/db/DiskBoundaryManager.java @@ -195,7 +195,7 @@ private static List getDiskBoundaries(RangesAtEndpoint replic List diskBoundaries = new ArrayList<>(); for (int i = 0; i < boundaries.size() - 1; i++) diskBoundaries.add(boundaries.get(i).maxKeyBound()); - diskBoundaries.add(partitioner.getMaximumToken().maxKeyBound()); + diskBoundaries.add(partitioner.getMaximumTokenForSplitting().maxKeyBound()); return diskBoundaries; } } diff --git a/src/java/org/apache/cassandra/db/IMutation.java b/src/java/org/apache/cassandra/db/IMutation.java index 1998e2c0353c..282db2891694 100644 --- a/src/java/org/apache/cassandra/db/IMutation.java +++ b/src/java/org/apache/cassandra/db/IMutation.java @@ -19,9 +19,12 @@ import java.util.Collection; import java.util.concurrent.TimeUnit; +import java.util.function.Predicate; import java.util.function.Supplier; +import javax.annotation.Nullable; import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.db.ReadCommand.PotentialTxnConflicts; import org.apache.cassandra.db.partitions.PartitionUpdate; import org.apache.cassandra.schema.TableId; import org.apache.cassandra.service.ClientState; @@ -37,6 +40,7 @@ public interface IMutation long getTimeout(TimeUnit unit); String toString(boolean shallow); Collection getPartitionUpdates(); + boolean hasUpdateForTable(TableId tableId); Supplier hintOnFailure(); default void validateIndexedColumns(ClientState state) @@ -70,4 +74,24 @@ static long dataSize(Collection mutations) } return size; } + + /** + * True if this mutation is being applied by a transaction system or doesn't need to be + * and conflicts between this mutation and transactions systems that are managing all or part of this table + * should be assumed to be handled already (by either Paxos or Accord) and the mutation should be applied. + * + * This causes mutations against tables to fail if they are from a non-transaction sub-system such as mutations, + * logged and unlogged batches, hints, and read repair against tables that are being managed by a transaction system + * like Accord that can't safely read data that is written non-transactionally. + * + */ + default PotentialTxnConflicts potentialTxnConflicts() + { + return PotentialTxnConflicts.DISALLOW; + } + + // Construct replacement mutation that is identical except it only includes updates for the specified tables + @Nullable IMutation filter(Predicate predicate); + + default void clearCachedSerializationsForRetry() {} } diff --git a/src/java/org/apache/cassandra/db/Keyspace.java b/src/java/org/apache/cassandra/db/Keyspace.java index f731139e442f..f31662660bc5 100644 --- a/src/java/org/apache/cassandra/db/Keyspace.java +++ b/src/java/org/apache/cassandra/db/Keyspace.java @@ -47,8 +47,8 @@ import org.apache.cassandra.exceptions.WriteTimeoutException; import org.apache.cassandra.index.Index; import org.apache.cassandra.index.SecondaryIndexManager; -import org.apache.cassandra.io.util.File; import org.apache.cassandra.io.sstable.format.SSTableReader; +import org.apache.cassandra.io.util.File; import org.apache.cassandra.locator.AbstractReplicationStrategy; import org.apache.cassandra.metrics.KeyspaceMetrics; import org.apache.cassandra.repair.KeyspaceRepairManager; @@ -58,14 +58,15 @@ import org.apache.cassandra.schema.SchemaProvider; import org.apache.cassandra.schema.TableId; import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.service.consensus.migration.ConsensusMigrationMutationHelper; import org.apache.cassandra.tracing.Tracing; import org.apache.cassandra.utils.ByteBufferUtil; import org.apache.cassandra.utils.JVMStabilityInspector; import org.apache.cassandra.utils.concurrent.AsyncPromise; import org.apache.cassandra.utils.concurrent.Future; import org.apache.cassandra.utils.concurrent.OpOrder; -import org.apache.cassandra.utils.concurrent.UncheckedInterruptedException; import org.apache.cassandra.utils.concurrent.Promise; +import org.apache.cassandra.utils.concurrent.UncheckedInterruptedException; import static java.util.concurrent.TimeUnit.MILLISECONDS; import static java.util.concurrent.TimeUnit.NANOSECONDS; @@ -545,6 +546,7 @@ else if (isDeferrable) } try (WriteContext ctx = getWriteHandler().beginWrite(mutation, makeDurable)) { + ConsensusMigrationMutationHelper.validateSafeToExecuteNonTransactionally(mutation); for (PartitionUpdate upd : mutation.getPartitionUpdates()) { ColumnFamilyStore cfs = columnFamilyStores.get(upd.metadata().id); @@ -696,7 +698,7 @@ public static Iterable nonLocalStrategy() public static Iterable system() { - return Iterables.transform(SchemaConstants.LOCAL_SYSTEM_KEYSPACE_NAMES, Keyspace::open); + return Iterables.transform(Schema.instance.localKeyspaces().names(), Keyspace::open); } @Override diff --git a/src/java/org/apache/cassandra/db/LivenessInfo.java b/src/java/org/apache/cassandra/db/LivenessInfo.java index 987c5fdf90fd..168473add552 100644 --- a/src/java/org/apache/cassandra/db/LivenessInfo.java +++ b/src/java/org/apache/cassandra/db/LivenessInfo.java @@ -206,7 +206,10 @@ public int dataSize() * supersedes, ie. tombstone supersedes. * * If timestamps are the same and both of them are expired livenessInfo(Ideally it shouldn't happen), - * greater localDeletionTime wins. + * greater localDeletionTime wins. If the localDeletion times are the same, prefer the + * lower TTL to make the merge deterministic (it is likely that the row has been rewritten with + * USING TTL/TIMESTAMP with an updated TTL that computes to the same local deletion time -- perhaps + * from rerunning a process to migrate user data between clusters or tables). * * @param other * the {@code LivenessInfo} to compare this info to. @@ -220,7 +223,11 @@ public boolean supersedes(LivenessInfo other) if (isExpired() ^ other.isExpired()) return isExpired(); if (isExpiring() == other.isExpiring()) - return localExpirationTime() > other.localExpirationTime(); + { + return localExpirationTime() > other.localExpirationTime() || + (localExpirationTime() == other.localExpirationTime() && ttl() < other.ttl()); + } + return isExpiring(); } diff --git a/src/java/org/apache/cassandra/db/MutableDeletionInfo.java b/src/java/org/apache/cassandra/db/MutableDeletionInfo.java index c8d9fd18116d..3bbfe163b3c8 100644 --- a/src/java/org/apache/cassandra/db/MutableDeletionInfo.java +++ b/src/java/org/apache/cassandra/db/MutableDeletionInfo.java @@ -23,7 +23,8 @@ import com.google.common.base.Objects; import org.apache.cassandra.config.DatabaseDescriptor; -import org.apache.cassandra.db.rows.*; +import org.apache.cassandra.db.rows.EncodingStats; +import org.apache.cassandra.db.rows.RangeTombstoneMarker; import org.apache.cassandra.utils.ObjectSizes; import org.apache.cassandra.utils.memory.ByteBufferCloner; @@ -230,6 +231,16 @@ public DeletionInfo updateAllTimestamp(long timestamp) return this; } + public DeletionInfo updateAllTimestampAndLocalDeletionTime(long timestamp, long localDeletionTime) + { + if (partitionDeletion.markedForDeleteAt() != Long.MIN_VALUE) + partitionDeletion = DeletionTime.build(timestamp, localDeletionTime); + + if (ranges != null) + ranges.updateAllTimestampAndLocalDeletionTime(timestamp, localDeletionTime); + return this; + } + @Override public boolean equals(Object o) { diff --git a/src/java/org/apache/cassandra/db/Mutation.java b/src/java/org/apache/cassandra/db/Mutation.java index 0861bb64c41f..441163c8bd68 100644 --- a/src/java/org/apache/cassandra/db/Mutation.java +++ b/src/java/org/apache/cassandra/db/Mutation.java @@ -18,10 +18,18 @@ package org.apache.cassandra.db; import java.io.IOException; -import java.util.*; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicLongFieldUpdater; +import java.util.function.Predicate; import java.util.function.Supplier; +import javax.annotation.Nullable; import com.google.common.base.Preconditions; import com.google.common.collect.ImmutableCollection; @@ -31,6 +39,7 @@ import org.apache.cassandra.concurrent.Stage; import org.apache.cassandra.config.CassandraRelevantProperties; import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.db.ReadCommand.PotentialTxnConflicts; import org.apache.cassandra.db.commitlog.CommitLog; import org.apache.cassandra.db.partitions.PartitionUpdate; import org.apache.cassandra.db.rows.DeserializationHelper; @@ -48,6 +57,7 @@ import org.apache.cassandra.utils.ByteBufferUtil; import org.apache.cassandra.utils.concurrent.Future; +import static com.google.common.base.Preconditions.checkState; import static org.apache.cassandra.net.MessagingService.VERSION_40; import static org.apache.cassandra.net.MessagingService.VERSION_50; import static org.apache.cassandra.net.MessagingService.VERSION_51; @@ -56,6 +66,8 @@ public class Mutation implements IMutation, Supplier { public static final MutationSerializer serializer = new MutationSerializer(); + public static final int ALLOW_POTENTIAL_TRANSACTION_CONFLICTS = 0x01; + // todo this is redundant // when we remove it, also restore SerializationsTest.testMutationRead to not regenerate new Mutations each test @@ -77,30 +89,41 @@ public class Mutation implements IMutation, Supplier private static final int SERIALIZATION_VERSION_COUNT = MessagingService.Version.values().length; // Contains serialized representations of this mutation. - // Note: there is no functionality to clear/remove serialized instances, because a mutation must never - // be modified (e.g. calling add(PartitionUpdate)) when it's being serialized. + // Note: The cached serializations can be cleared when CoordinatorBehindException is being retried private final Serialization[] cachedSerializations = new Serialization[SERIALIZATION_VERSION_COUNT]; /** @see CassandraRelevantProperties#CACHEABLE_MUTATION_SIZE_LIMIT */ private static final long CACHEABLE_MUTATION_SIZE_LIMIT = CassandraRelevantProperties.CACHEABLE_MUTATION_SIZE_LIMIT.getLong(); + // Paxos & Accord manage conflicts directly and needs to apply mutations to tables/ranges + // that are only safe to write to from a transaction system. + // Don't refuse to apply this mutation because it should go through a transaction system + // because it is being applied by one or in a context where transaction conflicts don't occur + private PotentialTxnConflicts potentialTxnConflicts; + public Mutation(PartitionUpdate update) { - this(update.metadata().keyspace, update.partitionKey(), ImmutableMap.of(update.metadata().id, update), approxTime.now(), update.metadata().params.cdc); + this(update, PotentialTxnConflicts.DISALLOW); + } + + public Mutation(PartitionUpdate update, PotentialTxnConflicts potentialTxnConflicts) + { + this(update.metadata().keyspace, update.partitionKey(), ImmutableMap.of(update.metadata().id, update), approxTime.now(), update.metadata().params.cdc, potentialTxnConflicts); } - public Mutation(String keyspaceName, DecoratedKey key, ImmutableMap modifications, long approxCreatedAtNanos) + public Mutation(String keyspaceName, DecoratedKey key, ImmutableMap modifications, long approxCreatedAtNanos, PotentialTxnConflicts potentialTxnConflicts) { - this(keyspaceName, key, modifications, approxCreatedAtNanos, cdcEnabled(modifications.values())); + this(keyspaceName, key, modifications, approxCreatedAtNanos, cdcEnabled(modifications.values()), potentialTxnConflicts); } - public Mutation(String keyspaceName, DecoratedKey key, ImmutableMap modifications, long approxCreatedAtNanos, boolean cdcEnabled) + public Mutation(String keyspaceName, DecoratedKey key, ImmutableMap modifications, long approxCreatedAtNanos, boolean cdcEnabled, PotentialTxnConflicts potentialTxnConflicts) { this.keyspaceName = keyspaceName; this.key = key; this.modifications = modifications; this.cdcEnabled = cdcEnabled; this.approxCreatedAtNanos = approxCreatedAtNanos; + this.potentialTxnConflicts = potentialTxnConflicts; } private static boolean cdcEnabled(Iterable modifications) @@ -111,26 +134,35 @@ private static boolean cdcEnabled(Iterable modifications) return cdc; } - public Mutation without(Set tableIds) + @Override + public @Nullable Mutation filter(Predicate predicate) { - if (tableIds.isEmpty()) + boolean allMatch = true; + boolean noneMatch = true; + for (TableId tableId : modifications.keySet()) + { + boolean test = predicate.test(tableId); + allMatch &= test; + noneMatch &= !test; + } + if (allMatch) return this; + if (noneMatch) + return null; ImmutableMap.Builder builder = new ImmutableMap.Builder<>(); for (Map.Entry update : modifications.entrySet()) - { - if (!tableIds.contains(update.getKey())) - { + if (predicate.test(update.getKey())) builder.put(update); - } - } - return new Mutation(keyspaceName, key, builder.build(), approxCreatedAtNanos); + Map updates = builder.build(); + checkState(!updates.isEmpty(), "Updates should not be empty"); + return new Mutation(keyspaceName, key, builder.build(), approxCreatedAtNanos, potentialTxnConflicts); } - public Mutation without(TableId tableId) + public @Nullable Mutation without(TableId tableId) { - return without(Collections.singleton(tableId)); + return filter(otherTableId -> !tableId.equals(otherTableId)); } public String getKeyspaceName() @@ -158,6 +190,12 @@ public long getApproxCreatedAtNanos() return approxCreatedAtNanos; } + @Override + public boolean hasUpdateForTable(TableId tableId) + { + return modifications.containsKey(tableId); + } + @Override public Supplier hintOnFailure() { @@ -201,18 +239,22 @@ public boolean isEmpty() * @throws IllegalArgumentException if not all the mutations are on the same * keyspace and key. */ - public static Mutation merge(List mutations) + public static Mutation merge(Collection mutations) { assert !mutations.isEmpty(); if (mutations.size() == 1) - return mutations.get(0); + return mutations.iterator().next(); Set updatedTables = new HashSet<>(); String ks = null; DecoratedKey key = null; + PotentialTxnConflicts potentialTxnConflicts = null; for (Mutation mutation : mutations) { + if (potentialTxnConflicts != null && potentialTxnConflicts != mutation.potentialTxnConflicts) + throw new IllegalArgumentException("Can't merge mutations with differing policies on allowing potential transaction conflicts"); + potentialTxnConflicts = mutation.potentialTxnConflicts; updatedTables.addAll(mutation.modifications.keySet()); if (ks != null && !ks.equals(mutation.keyspaceName)) throw new IllegalArgumentException(); @@ -239,7 +281,7 @@ public static Mutation merge(List mutations) modifications.put(table, updates.size() == 1 ? updates.get(0) : PartitionUpdate.merge(updates)); updates.clear(); } - return new Mutation(ks, key, modifications.build(), approxTime.now()); + return new Mutation(ks, key, modifications.build(), approxTime.now(), potentialTxnConflicts); } public Future applyFuture() @@ -296,11 +338,39 @@ public boolean trackedByCDC() return cdcEnabled; } + public void allowPotentialTransactionConflicts() + { + potentialTxnConflicts = PotentialTxnConflicts.ALLOW; + Arrays.fill(cachedSerializations, null); + } + + @Override + public PotentialTxnConflicts potentialTxnConflicts() + { + return potentialTxnConflicts; + } + + private static int potentialTxnConflictsFlag(PotentialTxnConflicts potentialTxnConflicts) + { + return potentialTxnConflicts.allowed ? ALLOW_POTENTIAL_TRANSACTION_CONFLICTS : 0; + } + + public static PotentialTxnConflicts potentialTxnConflicts(int flags) + { + return (flags & ALLOW_POTENTIAL_TRANSACTION_CONFLICTS) != 0 ? PotentialTxnConflicts.ALLOW : PotentialTxnConflicts.DISALLOW; + } + public String toString() { return toString(false); } + @Override + public void clearCachedSerializationsForRetry() + { + Arrays.fill(cachedSerializations, null); + } + public String toString(boolean shallow) { StringBuilder buff = new StringBuilder("Mutation("); @@ -369,6 +439,13 @@ public static SimpleBuilder simpleBuilder(String keyspaceName, DecoratedKey part */ public interface SimpleBuilder { + /** + * Assume any potential transaction conflicts that might occur by applying this mutation are already + * being handled by the caller + * @return this builder + */ + public SimpleBuilder allowPotentialTxnConflicts(); + /** * Sets the timestamp to use for the following additions to this builder or any derived (update or row) builder. * @@ -481,6 +558,13 @@ static void serializeInternal(PartitionUpdate.PartitionUpdateSerializer serializ { Map modifications = mutation.modifications; + if (version >= VERSION_51) + { + int flags = 0; + flags |= potentialTxnConflictsFlag(mutation.potentialTxnConflicts); + out.write(flags); + } + /* serialize the modifications in the mutation */ int size = modifications.size(); out.writeUnsignedVInt32(size); @@ -500,13 +584,19 @@ public Mutation deserialize(DataInputPlus in, int version, DeserializationHelper { teeIn = new TeeDataInputPlus(in, dob, CACHEABLE_MUTATION_SIZE_LIMIT); + PotentialTxnConflicts potentialTxnConflicts = PotentialTxnConflicts.DISALLOW; + if (version >= VERSION_51) + { + int flags = teeIn.readByte(); + potentialTxnConflicts = potentialTxnConflicts(flags); + } int size = teeIn.readUnsignedVInt32(); assert size > 0; PartitionUpdate update = PartitionUpdate.serializer.deserialize(teeIn, version, flag); if (size == 1) { - m = new Mutation(update); + m = new Mutation(update, potentialTxnConflicts); } else { @@ -519,7 +609,7 @@ public Mutation deserialize(DataInputPlus in, int version, DeserializationHelper update = PartitionUpdate.serializer.deserialize(teeIn, version, flag); modifications.put(update.metadata().id, update); } - m = new Mutation(update.metadata().keyspace, dk, modifications.build(), approxTime.now()); + m = new Mutation(update.metadata().keyspace, dk, modifications.build(), approxTime.now(), potentialTxnConflicts); } //Only cache serializations that don't hit the limit @@ -597,7 +687,9 @@ long serializedSize(PartitionUpdate.PartitionUpdateSerializer serializer, Mutati long size = this.size; if (size == 0L) { - size = TypeSizes.sizeofUnsignedVInt(mutation.modifications.size()); + if (version >= VERSION_51) + size += TypeSizes.sizeof((byte)ALLOW_POTENTIAL_TRANSACTION_CONFLICTS); // flags + size += TypeSizes.sizeofUnsignedVInt(mutation.modifications.size()); for (PartitionUpdate partitionUpdate : mutation.modifications.values()) size += serializer.serializedSize(partitionUpdate, version); this.size = size; @@ -617,16 +709,28 @@ public static class PartitionUpdateCollector private final long approxCreatedAtNanos = approxTime.now(); private boolean empty = true; + private PotentialTxnConflicts potentialTxnConflicts; + public PartitionUpdateCollector(String keyspaceName, DecoratedKey key) + { + this(keyspaceName, key, PotentialTxnConflicts.DISALLOW); + } + + public PartitionUpdateCollector(String keyspaceName, DecoratedKey key, PotentialTxnConflicts potentialTxnConflicts) { this.keyspaceName = keyspaceName; this.key = key; + this.potentialTxnConflicts = potentialTxnConflicts; } public PartitionUpdateCollector add(PartitionUpdate partitionUpdate) { - assert partitionUpdate != null; - assert partitionUpdate.partitionKey().getPartitioner() == key.getPartitioner(); + assert partitionUpdate != null : "Null updates are not allowed"; + assert partitionUpdate.partitionKey().getPartitioner() == key.getPartitioner(): String.format("Update to key %s with partitioner %s (%s) had an update (%s) with a different partitioner! %s (%s)", + key, + key.getPartitioner(), key.getPartitioner().getClass(), + partitionUpdate, + partitionUpdate.partitionKey().getPartitioner(), partitionUpdate.partitionKey().getPartitioner().getClass()); // note that ImmutableMap.Builder only allows put:ing the same key once, it will fail during build() below otherwise modifications.put(partitionUpdate.metadata().id, partitionUpdate); empty = false; @@ -650,7 +754,7 @@ public boolean isEmpty() public Mutation build() { - return new Mutation(keyspaceName, key, modifications.build(), approxCreatedAtNanos); + return new Mutation(keyspaceName, key, modifications.build(), approxCreatedAtNanos, potentialTxnConflicts); } } } diff --git a/src/java/org/apache/cassandra/db/MutationVerbHandler.java b/src/java/org/apache/cassandra/db/MutationVerbHandler.java index c30fae63b42b..3312a12b65b0 100644 --- a/src/java/org/apache/cassandra/db/MutationVerbHandler.java +++ b/src/java/org/apache/cassandra/db/MutationVerbHandler.java @@ -19,7 +19,10 @@ import org.apache.cassandra.exceptions.WriteTimeoutException; import org.apache.cassandra.locator.InetAddressAndPort; -import org.apache.cassandra.net.*; +import org.apache.cassandra.net.ForwardingInfo; +import org.apache.cassandra.net.Message; +import org.apache.cassandra.net.MessagingService; +import org.apache.cassandra.net.ParamType; import org.apache.cassandra.tracing.Tracing; import static java.util.concurrent.TimeUnit.NANOSECONDS; diff --git a/src/java/org/apache/cassandra/db/NativeClustering.java b/src/java/org/apache/cassandra/db/NativeClustering.java index e7c7e8893a17..f51ea90a8245 100644 --- a/src/java/org/apache/cassandra/db/NativeClustering.java +++ b/src/java/org/apache/cassandra/db/NativeClustering.java @@ -21,16 +21,20 @@ import java.nio.ByteBuffer; import java.nio.ByteOrder; +import org.apache.cassandra.db.marshal.AddressBasedNativeData; import org.apache.cassandra.db.marshal.ByteBufferAccessor; +import org.apache.cassandra.db.marshal.NativeAccessor; +import org.apache.cassandra.db.marshal.NativeData; import org.apache.cassandra.db.marshal.ValueAccessor; import org.apache.cassandra.utils.FBUtilities; import org.apache.cassandra.utils.ObjectSizes; import org.apache.cassandra.utils.concurrent.OpOrder; import org.apache.cassandra.utils.memory.HeapCloner; import org.apache.cassandra.utils.memory.MemoryUtil; +import org.apache.cassandra.utils.memory.NativeEndianMemoryUtil; import org.apache.cassandra.utils.memory.NativeAllocator; -public class NativeClustering implements Clustering +public class NativeClustering implements Clustering { private static final long EMPTY_SIZE = ObjectSizes.measure(new NativeClustering()); @@ -50,30 +54,30 @@ public NativeClustering(NativeAllocator allocator, OpOrder.Group writeOp, Cluste peer = allocator.allocate(metadataSize + dataSize + bitmapSize, writeOp); long bitmapStart = peer + metadataSize; - MemoryUtil.setShort(peer, (short) count); - MemoryUtil.setShort(peer + (metadataSize - 2), (short) dataSize); // goes at the end of the other offsets + NativeEndianMemoryUtil.setShort(peer, (short) count); + NativeEndianMemoryUtil.setShort(peer + (metadataSize - 2), (short) dataSize); // goes at the end of the other offsets - MemoryUtil.setByte(bitmapStart, bitmapSize, (byte) 0); + NativeEndianMemoryUtil.setByte(bitmapStart, bitmapSize, (byte) 0); long dataStart = peer + metadataSize + bitmapSize; int dataOffset = 0; for (int i = 0 ; i < count ; i++) { - MemoryUtil.setShort(peer + 2 + i * 2, (short) dataOffset); + NativeEndianMemoryUtil.setShort(peer + 2 + i * 2, (short) dataOffset); ByteBuffer value = clustering.bufferAt(i); if (value == null) { long boffset = bitmapStart + (i >>> 3); - int b = MemoryUtil.getByte(boffset); + int b = NativeEndianMemoryUtil.getByte(boffset); b |= 1 << (i & 7); - MemoryUtil.setByte(boffset, (byte) b); + NativeEndianMemoryUtil.setByte(boffset, (byte) b); continue; } assert value.order() == ByteOrder.BIG_ENDIAN; int size = value.remaining(); - MemoryUtil.setBytes(dataStart + dataOffset, value); + NativeEndianMemoryUtil.setBytes(dataStart + dataOffset, value); dataOffset += size; } } @@ -83,25 +87,66 @@ public Kind kind() return Kind.CLUSTERING; } - public ClusteringPrefix clustering() + public ClusteringPrefix clustering() { return this; } public int size() { - return MemoryUtil.getShort(peer); + return NativeEndianMemoryUtil.getUnsignedShort(peer); } public int dataSize() { int dataSizeOffset = (size() * 2) + 2; // metadataSize - 2 - return MemoryUtil.getShort(peer + dataSizeOffset); + return NativeEndianMemoryUtil.getUnsignedShort(peer + dataSizeOffset); } - public ByteBuffer get(int i) + public NativeData get(int i) + { + return buildDataObject(i, AddressBasedNativeData::new); + } + + public boolean isNull(int i) + { + return isNull(peer, size(), i); + } + + private static boolean isNull(long peer, int size, int i) + { + if (i >= size) + throw new IndexOutOfBoundsException(); + + int metadataSize = (size * 2) + 4; + long bitmapStart = peer + metadataSize; + int b = NativeEndianMemoryUtil.getByte(bitmapStart + (i >>> 3)); + return ((b & (1 << (i & 7))) != 0); + } + + public boolean isEmpty(int i) + { + int size = size(); + if (isNull(peer, size, i)) + return true; + + int startOffset = NativeEndianMemoryUtil.getUnsignedShort(peer + 2 + i * 2); + int endOffset = NativeEndianMemoryUtil.getUnsignedShort(peer + 4 + i * 2); + return (endOffset - startOffset) == 0; + } + + + private ByteBuffer getByteBuffer(int i) + { + return buildDataObject(i, (long address, int length) -> MemoryUtil.getByteBuffer(address, length, ByteOrder.BIG_ENDIAN)); + } + + private interface DataObjectBuilder { + D build(long address, int length); + } + + private D buildDataObject(int i, DataObjectBuilder builder) { - // offset at which we store the dataOffset int size = size(); if (i >= size) throw new IndexOutOfBoundsException(); @@ -109,20 +154,21 @@ public ByteBuffer get(int i) int metadataSize = (size * 2) + 4; int bitmapSize = ((size + 7) >>> 3); long bitmapStart = peer + metadataSize; - int b = MemoryUtil.getByte(bitmapStart + (i >>> 3)); + int b = NativeEndianMemoryUtil.getByte(bitmapStart + (i >>> 3)); if ((b & (1 << (i & 7))) != 0) return null; - int startOffset = MemoryUtil.getShort(peer + 2 + i * 2); - int endOffset = MemoryUtil.getShort(peer + 4 + i * 2); - return MemoryUtil.getByteBuffer(bitmapStart + bitmapSize + startOffset, - endOffset - startOffset, - ByteOrder.BIG_ENDIAN); + int startOffset = NativeEndianMemoryUtil.getUnsignedShort(peer + 2 + i * 2); + int endOffset = NativeEndianMemoryUtil.getUnsignedShort(peer + 4 + i * 2); + + long address = bitmapStart + bitmapSize + startOffset; + int length = endOffset - startOffset; + return builder.build(address, length); } - public ByteBuffer[] getRawValues() + public NativeData[] getRawValues() { - ByteBuffer[] values = new ByteBuffer[size()]; + NativeData[] values = new NativeData[size()]; for (int i = 0 ; i < values.length ; i++) values[i] = get(i); return values; @@ -130,13 +176,15 @@ public ByteBuffer[] getRawValues() public ByteBuffer[] getBufferArray() { - return getRawValues(); + ByteBuffer[] values = new ByteBuffer[size()]; + for (int i = 0 ; i < values.length ; i++) + values[i] = getByteBuffer(i); + return values; } - public ValueAccessor accessor() + public ValueAccessor accessor() { - // TODO: add a native accessor - return ByteBufferAccessor.instance; + return NativeAccessor.instance; } public long unsharedHeapSize() @@ -149,6 +197,12 @@ public long unsharedHeapSizeExcludingData() return EMPTY_SIZE; } + @Override + public Clustering ensureAccessorFactorySupport() + { + return retainable(); + } + @Override public final int hashCode() { @@ -161,8 +215,9 @@ public final boolean equals(Object o) return ClusteringPrefix.equals(this, o); } + // data are copied to heap byte buffers to detach from a NativeAllocator lifecycle @Override - public ClusteringPrefix retainable() + public Clustering retainable() { assert kind() == Kind.CLUSTERING; // tombstones are never stored natively @@ -170,10 +225,10 @@ public ClusteringPrefix retainable() ByteBuffer[] values = new ByteBuffer[size()]; for (int i = 0; i < values.length; ++i) { - ByteBuffer value = get(i); + ByteBuffer value = getByteBuffer(i); values[i] = value != null ? HeapCloner.instance.clone(value) : null; } - return accessor().factory().clustering(values); + return ByteBufferAccessor.instance.factory().clustering(values); } } diff --git a/src/java/org/apache/cassandra/db/NativeDecoratedKey.java b/src/java/org/apache/cassandra/db/NativeDecoratedKey.java index bc149084d852..76b2367ae210 100644 --- a/src/java/org/apache/cassandra/db/NativeDecoratedKey.java +++ b/src/java/org/apache/cassandra/db/NativeDecoratedKey.java @@ -26,6 +26,7 @@ import org.apache.cassandra.utils.concurrent.OpOrder; import org.apache.cassandra.utils.memory.MemoryUtil; import org.apache.cassandra.utils.memory.NativeAllocator; +import org.apache.cassandra.utils.memory.NativeEndianMemoryUtil; public class NativeDecoratedKey extends DecoratedKey { @@ -39,7 +40,7 @@ public NativeDecoratedKey(Token token, NativeAllocator allocator, OpOrder.Group int size = key.remaining(); this.peer = allocator.allocate(4 + size, writeOp); - MemoryUtil.setInt(peer, size); + NativeEndianMemoryUtil.setInt(peer, size); MemoryUtil.setBytes(peer + 4, key); } @@ -50,14 +51,14 @@ public NativeDecoratedKey(Token token, NativeAllocator allocator, OpOrder.Group int size = keyBytes.length; this.peer = allocator.allocate(4 + size, writeOp); - MemoryUtil.setInt(peer, size); + NativeEndianMemoryUtil.setInt(peer, size); MemoryUtil.setBytes(peer + 4, keyBytes, 0, size); } @Inline int length() { - return MemoryUtil.getInt(peer); + return NativeEndianMemoryUtil.getInt(peer); } @Inline @@ -75,7 +76,7 @@ public ByteBuffer getKey() @Override public int getKeyLength() { - return MemoryUtil.getInt(peer); + return NativeEndianMemoryUtil.getInt(peer); } @Override diff --git a/src/java/org/apache/cassandra/db/PartitionPosition.java b/src/java/org/apache/cassandra/db/PartitionPosition.java index e8d29cbd9464..d936e566a7cf 100644 --- a/src/java/org/apache/cassandra/db/PartitionPosition.java +++ b/src/java/org/apache/cassandra/db/PartitionPosition.java @@ -29,7 +29,7 @@ public interface PartitionPosition extends RingPosition, ByteComparable { - public static enum Kind + public enum Kind { // Only add new values to the end of the enum, the ordinal is used // during serialization diff --git a/src/java/org/apache/cassandra/db/PartitionRangeReadCommand.java b/src/java/org/apache/cassandra/db/PartitionRangeReadCommand.java index 4926061cb870..131309b87698 100644 --- a/src/java/org/apache/cassandra/db/PartitionRangeReadCommand.java +++ b/src/java/org/apache/cassandra/db/PartitionRangeReadCommand.java @@ -53,6 +53,7 @@ import org.apache.cassandra.schema.TableMetadata; import org.apache.cassandra.service.ClientState; import org.apache.cassandra.service.StorageProxy; +import org.apache.cassandra.service.reads.ReadCoordinator; import org.apache.cassandra.tcm.Epoch; import org.apache.cassandra.tracing.Tracing; import org.apache.cassandra.transport.Dispatcher; @@ -71,6 +72,7 @@ protected PartitionRangeReadCommand(Epoch serializedAtEpoch, boolean isDigest, int digestVersion, boolean acceptsTransient, + PotentialTxnConflicts potentialTxnConflicts, TableMetadata metadata, long nowInSec, ColumnFilter columnFilter, @@ -80,7 +82,7 @@ protected PartitionRangeReadCommand(Epoch serializedAtEpoch, Index.QueryPlan indexQueryPlan, boolean trackWarnings) { - super(serializedAtEpoch, Kind.PARTITION_RANGE, isDigest, digestVersion, acceptsTransient, metadata, nowInSec, columnFilter, rowFilter, limits, indexQueryPlan, trackWarnings, dataRange); + super(serializedAtEpoch, Kind.PARTITION_RANGE, isDigest, digestVersion, acceptsTransient, potentialTxnConflicts, metadata, nowInSec, columnFilter, rowFilter, limits, indexQueryPlan, trackWarnings, dataRange); this.requestedSlices = dataRange.clusteringIndexFilter.getSlices(metadata()); } @@ -88,6 +90,7 @@ private static PartitionRangeReadCommand create(Epoch serializedAtEpoch, boolean isDigest, int digestVersion, boolean acceptsTransient, + PotentialTxnConflicts potentialTxnConflicts, TableMetadata metadata, long nowInSec, ColumnFilter columnFilter, @@ -115,6 +118,7 @@ private static PartitionRangeReadCommand create(Epoch serializedAtEpoch, isDigest, digestVersion, acceptsTransient, + potentialTxnConflicts, metadata, nowInSec, columnFilter, @@ -136,6 +140,30 @@ public static PartitionRangeReadCommand create(TableMetadata metadata, false, 0, false, + PotentialTxnConflicts.DISALLOW, + metadata, + nowInSec, + columnFilter, + rowFilter, + limits, + dataRange, + findIndexQueryPlan(metadata, rowFilter), + false); + } + + public static PartitionRangeReadCommand create(TableMetadata metadata, + long nowInSec, + ColumnFilter columnFilter, + RowFilter rowFilter, + DataLimits limits, + DataRange dataRange, + PotentialTxnConflicts potentialTxnConflicts) + { + return create(metadata.epoch, + false, + 0, + false, + potentialTxnConflicts, metadata, nowInSec, columnFilter, @@ -160,6 +188,7 @@ public static PartitionRangeReadCommand allDataRead(TableMetadata metadata, long false, 0, false, + PotentialTxnConflicts.DISALLOW, metadata, nowInSec, ColumnFilter.all(metadata), @@ -206,6 +235,7 @@ public PartitionRangeReadCommand forSubRange(AbstractBounds r isDigestQuery(), digestVersion(), acceptsTransient(), + potentialTxnConflicts(), metadata(), nowInSec(), columnFilter(), @@ -216,12 +246,52 @@ public PartitionRangeReadCommand forSubRange(AbstractBounds r isTrackingWarnings()); } + public PartitionRangeReadCommand withTransactionalSettings(long nowInSec, AbstractBounds range, boolean isRangeContinuation, boolean withoutReconciliation) + { + // If we're not a continuation of whatever range we've previously queried, we should ignore the states of the + // DataLimits as it's either useless, or misleading. This is particularly important for GROUP BY queries, where + // DataLimits.CQLGroupByLimits.GroupByAwareCounter assumes that if GroupingState.hasClustering(), then we're in + // the middle of a group, but we can't make that assumption if we query and range "in advance" of where we are + // on the ring. + return create(serializedAtEpoch(), + isDigestQuery(), + digestVersion(), + acceptsTransient(), + PotentialTxnConflicts.ALLOW, + metadata(), + nowInSec, + columnFilter(), + withoutReconciliation ? rowFilter().withoutReconciliation() : rowFilter(), + isRangeContinuation ? limits() : limits().withoutState(), + dataRange().forSubRange(range), + indexQueryPlan(), + isTrackingWarnings()); + } + + public PartitionRangeReadCommand withTxnReadName(int txnReadName) + { + return create(serializedAtEpoch(), + isDigestQuery(), + digestVersion(), + acceptsTransient(), + potentialTxnConflicts(), + metadata(), + txnReadName, + columnFilter(), + rowFilter(), + limits(), + dataRange(), + indexQueryPlan(), + isTrackingWarnings()); + } + public PartitionRangeReadCommand copy() { return create(serializedAtEpoch(), isDigestQuery(), digestVersion(), acceptsTransient(), + potentialTxnConflicts(), metadata(), nowInSec(), columnFilter(), @@ -239,6 +309,7 @@ protected PartitionRangeReadCommand copyAsDigestQuery() true, digestVersion(), false, + potentialTxnConflicts(), metadata(), nowInSec(), columnFilter(), @@ -256,6 +327,7 @@ protected PartitionRangeReadCommand copyAsTransientQuery() false, 0, true, + potentialTxnConflicts(), metadata(), nowInSec(), columnFilter(), @@ -273,6 +345,7 @@ public PartitionRangeReadCommand withUpdatedLimit(DataLimits newLimits) isDigestQuery(), digestVersion(), acceptsTransient(), + potentialTxnConflicts(), metadata(), nowInSec(), columnFilter(), @@ -290,6 +363,7 @@ public PartitionRangeReadCommand withUpdatedLimitsAndDataRange(DataLimits newLim isDigestQuery(), digestVersion(), acceptsTransient(), + potentialTxnConflicts(), metadata(), nowInSec(), columnFilter(), @@ -312,7 +386,7 @@ public boolean isReversed() public PartitionIterator execute(ConsistencyLevel consistency, ClientState state, Dispatcher.RequestTime requestTime) throws RequestExecutionException { - return StorageProxy.getRangeSlice(this, consistency, requestTime); + return StorageProxy.getRangeSlice(this, consistency, ReadCoordinator.DEFAULT, requestTime); } protected void recordLatency(TableMetrics metric, long latencyNanos) @@ -494,6 +568,11 @@ protected void serializeSelection(DataOutputPlus out, int version) throws IOExce DataRange.serializer.serialize(dataRange(), out, version, metadata()); } + protected void serializeSelectionWithoutKey(DataOutputPlus out, int version) throws IOException + { + serializeSelection(out, version); + } + protected long selectionSerializedSize(int version) { return DataRange.serializer.serializedSize(dataRange(), version, metadata()); @@ -525,6 +604,7 @@ public ReadCommand deserialize(DataInputPlus in, boolean isDigest, int digestVersion, boolean acceptsTransient, + PotentialTxnConflicts potentialTxnConflicts, TableMetadata metadata, long nowInSec, ColumnFilter columnFilter, @@ -534,7 +614,7 @@ public ReadCommand deserialize(DataInputPlus in, throws IOException { DataRange range = DataRange.serializer.deserialize(in, version, metadata); - return PartitionRangeReadCommand.create(serializedAtEpoch, isDigest, digestVersion, acceptsTransient, metadata, nowInSec, columnFilter, rowFilter, limits, range, indexQueryPlan, false); + return PartitionRangeReadCommand.create(serializedAtEpoch, isDigest, digestVersion, acceptsTransient, potentialTxnConflicts, metadata, nowInSec, columnFilter, rowFilter, limits, range, indexQueryPlan, false); } } @@ -552,7 +632,7 @@ private VirtualTablePartitionRangeReadCommand(boolean isDigest, Index.QueryPlan indexQueryPlan, boolean trackWarnings) { - super(metadata.epoch, isDigest, digestVersion, acceptsTransient, metadata, nowInSec, columnFilter, rowFilter, limits, dataRange, indexQueryPlan, trackWarnings); + super(metadata.epoch, isDigest, digestVersion, acceptsTransient, PotentialTxnConflicts.ALLOW, metadata, nowInSec, columnFilter, rowFilter, limits, dataRange, indexQueryPlan, trackWarnings); } @Override diff --git a/src/java/org/apache/cassandra/db/PartitionRangeReadQuery.java b/src/java/org/apache/cassandra/db/PartitionRangeReadQuery.java index d91930125d6f..934c33eafa03 100644 --- a/src/java/org/apache/cassandra/db/PartitionRangeReadQuery.java +++ b/src/java/org/apache/cassandra/db/PartitionRangeReadQuery.java @@ -17,6 +17,7 @@ */ package org.apache.cassandra.db; +import org.apache.cassandra.db.ReadCommand.PotentialTxnConflicts; import org.apache.cassandra.db.filter.ColumnFilter; import org.apache.cassandra.db.filter.DataLimits; import org.apache.cassandra.db.filter.RowFilter; @@ -36,9 +37,10 @@ static ReadQuery create(TableMetadata table, ColumnFilter columnFilter, RowFilter rowFilter, DataLimits limits, - DataRange dataRange) + DataRange dataRange, + PotentialTxnConflicts potentialTxnConflicts) { - return PartitionRangeReadCommand.create(table, nowInSec, columnFilter, rowFilter, limits, dataRange); + return PartitionRangeReadCommand.create(table, nowInSec, columnFilter, rowFilter, limits, dataRange, potentialTxnConflicts); } /** diff --git a/src/java/org/apache/cassandra/db/RangeTombstoneList.java b/src/java/org/apache/cassandra/db/RangeTombstoneList.java index 8b8cee2d39bd..963985788a9e 100644 --- a/src/java/org/apache/cassandra/db/RangeTombstoneList.java +++ b/src/java/org/apache/cassandra/db/RangeTombstoneList.java @@ -22,14 +22,14 @@ import java.util.Collections; import java.util.Iterator; -import org.apache.cassandra.config.DatabaseDescriptor; -import org.apache.cassandra.utils.AbstractIterator; -import org.apache.cassandra.utils.CassandraUInt; - import com.google.common.collect.Iterators; import org.apache.cassandra.cache.IMeasurableMemory; -import org.apache.cassandra.db.rows.*; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.db.rows.Cell; +import org.apache.cassandra.db.rows.EncodingStats; +import org.apache.cassandra.utils.AbstractIterator; +import org.apache.cassandra.utils.CassandraUInt; import org.apache.cassandra.utils.ObjectSizes; import org.apache.cassandra.utils.memory.ByteBufferCloner; @@ -328,6 +328,16 @@ public void updateAllTimestamp(long timestamp) markedAts[i] = timestamp; } + public void updateAllTimestampAndLocalDeletionTime(long timestamp, long localDeletionTime) + { + int unsignedLocalDeletionTime = Cell.deletionTimeLongToUnsignedInteger(localDeletionTime); + for (int i = 0; i < size; i++) + { + markedAts[i] = timestamp; + delTimesUnsignedIntegers[i] = unsignedLocalDeletionTime; + } + } + private RangeTombstone rangeTombstone(int idx) { return new RangeTombstone(Slice.make(starts[idx], ends[idx]), DeletionTime.buildUnsafeWithUnsignedInteger(markedAts[idx], delTimesUnsignedIntegers[idx])); diff --git a/src/java/org/apache/cassandra/db/ReadCommand.java b/src/java/org/apache/cassandra/db/ReadCommand.java index e4ea5f12d74b..ae53eb292c9a 100644 --- a/src/java/org/apache/cassandra/db/ReadCommand.java +++ b/src/java/org/apache/cassandra/db/ReadCommand.java @@ -18,44 +18,64 @@ package org.apache.cassandra.db; import java.io.IOException; -import java.util.*; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Objects; +import java.util.Set; import java.util.concurrent.TimeUnit; import java.util.function.BiFunction; -import java.util.function.LongPredicate; import java.util.function.Function; +import java.util.function.LongPredicate; import java.util.stream.Collectors; - import javax.annotation.Nullable; import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Preconditions; import com.google.common.collect.Iterables; import com.google.common.collect.Sets; - import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import accord.primitives.Seekable; +import accord.utils.Invariants; import io.netty.util.concurrent.FastThreadLocal; -import org.apache.cassandra.config.*; -import org.apache.cassandra.db.filter.*; +import org.apache.cassandra.config.CassandraRelevantProperties; +import org.apache.cassandra.config.Config; +import org.apache.cassandra.config.DataStorageSpec; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.db.filter.ClusteringIndexFilter; +import org.apache.cassandra.db.filter.ColumnFilter; +import org.apache.cassandra.db.filter.DataLimits; +import org.apache.cassandra.db.filter.LocalReadSizeTooLargeException; +import org.apache.cassandra.db.filter.RowFilter; +import org.apache.cassandra.db.filter.TombstoneOverwhelmingException; +import org.apache.cassandra.db.partitions.PurgeFunction; +import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator; +import org.apache.cassandra.db.partitions.UnfilteredPartitionIterators; +import org.apache.cassandra.db.rows.Cell; +import org.apache.cassandra.db.rows.RangeTombstoneBoundMarker; +import org.apache.cassandra.db.rows.RangeTombstoneMarker; +import org.apache.cassandra.db.rows.Row; +import org.apache.cassandra.db.rows.Rows; +import org.apache.cassandra.db.rows.UnfilteredRowIterator; +import org.apache.cassandra.db.rows.UnfilteredRowIterators; import org.apache.cassandra.db.transform.BasePartitions; import org.apache.cassandra.db.transform.BaseRows; +import org.apache.cassandra.db.transform.RTBoundCloser; +import org.apache.cassandra.db.transform.RTBoundValidator; +import org.apache.cassandra.db.transform.RTBoundValidator.Stage; +import org.apache.cassandra.db.transform.StoppingTransformation; +import org.apache.cassandra.db.transform.Transformation; import org.apache.cassandra.exceptions.CoordinatorBehindException; import org.apache.cassandra.exceptions.QueryCancelledException; +import org.apache.cassandra.exceptions.UnknownIndexException; import org.apache.cassandra.exceptions.UnknownTableException; import org.apache.cassandra.metrics.TCMMetrics; import org.apache.cassandra.net.MessageFlag; import org.apache.cassandra.net.MessagingService; import org.apache.cassandra.net.ParamType; import org.apache.cassandra.net.Verb; -import org.apache.cassandra.db.partitions.*; -import org.apache.cassandra.db.rows.*; -import org.apache.cassandra.db.transform.RTBoundCloser; -import org.apache.cassandra.db.transform.RTBoundValidator; -import org.apache.cassandra.db.transform.RTBoundValidator.Stage; -import org.apache.cassandra.db.transform.StoppingTransformation; -import org.apache.cassandra.db.transform.Transformation; -import org.apache.cassandra.exceptions.UnknownIndexException; import org.apache.cassandra.index.Index; import org.apache.cassandra.io.IVersionedSerializer; import org.apache.cassandra.io.sstable.format.SSTableReader; @@ -67,25 +87,28 @@ import org.apache.cassandra.schema.IndexMetadata; import org.apache.cassandra.schema.Schema; import org.apache.cassandra.schema.SchemaConstants; +import org.apache.cassandra.schema.SchemaProvider; import org.apache.cassandra.schema.TableId; import org.apache.cassandra.schema.TableMetadata; -import org.apache.cassandra.schema.SchemaProvider; import org.apache.cassandra.service.ActiveRepairService; import org.apache.cassandra.service.ClientWarn; +import org.apache.cassandra.service.accord.serializers.TableMetadatas; +import org.apache.cassandra.service.consensus.migration.ConsensusRequestRouter; import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.tcm.Epoch; import org.apache.cassandra.tracing.Tracing; -import org.apache.cassandra.utils.CassandraUInt; import org.apache.cassandra.transport.Dispatcher; +import org.apache.cassandra.utils.CassandraUInt; import org.apache.cassandra.utils.FBUtilities; import org.apache.cassandra.utils.NoSpamLogger; import org.apache.cassandra.utils.ObjectSizes; import org.apache.cassandra.utils.TimeUUID; +import static com.google.common.base.Preconditions.checkArgument; import static com.google.common.collect.Iterables.any; import static com.google.common.collect.Iterables.filter; -import static org.apache.cassandra.utils.Clock.Global.nanoTime; import static org.apache.cassandra.db.partitions.UnfilteredPartitionIterators.MergeListener.NOOP; +import static org.apache.cassandra.utils.Clock.Global.nanoTime; import static org.apache.cassandra.utils.MonotonicClock.Global.approxTime; /** @@ -99,8 +122,32 @@ public abstract class ReadCommand extends AbstractReadQuery private static final int TEST_ITERATION_DELAY_MILLIS = CassandraRelevantProperties.TEST_READ_ITERATION_DELAY_MS.getInt(); protected static final Logger logger = LoggerFactory.getLogger(ReadCommand.class); - public static final IVersionedSerializer serializer = new Serializer(); + public static final Serializer serializer = new Serializer(); + public enum PotentialTxnConflicts + { + /** + * Check for and raise an error if this operation should have been transactionally managed. For use + * by queries that aren't issued by a transaction system managing potential conflicts in contexts where + * conflicts would be a problem. + */ + DISALLOW(false), + + /** + * Don't check or raise an error if this operation could conflict with transactions. For use when the thing + * being managed doesn't support transactions or the operation is being done by a transaction that is already + * managing any potential conflicts. + */ + ALLOW(true); + + public final boolean allowed; + + PotentialTxnConflicts(boolean allowed) + { + this.allowed = allowed; + } + } + // Expose the active command running so transitive calls can lookup this command. // This is useful for a few reasons, but mainly because the CQL query is here. private static final FastThreadLocal COMMAND = new FastThreadLocal<>(); @@ -110,6 +157,8 @@ public abstract class ReadCommand extends AbstractReadQuery private final boolean isDigestQuery; private final boolean acceptsTransient; private final Epoch serializedAtEpoch; + private final PotentialTxnConflicts potentialTxnConflicts; + // if a digest query, the version for which the digest is expected. Ignored if not a digest. private int digestVersion; @@ -128,6 +177,7 @@ public abstract ReadCommand deserialize(DataInputPlus in, boolean isDigest, int digestVersion, boolean acceptsTransient, + PotentialTxnConflicts potentialTxnConflicts, TableMetadata metadata, long nowInSec, ColumnFilter columnFilter, @@ -138,14 +188,16 @@ public abstract ReadCommand deserialize(DataInputPlus in, protected enum Kind { - SINGLE_PARTITION (SinglePartitionReadCommand.selectionDeserializer), - PARTITION_RANGE (PartitionRangeReadCommand.selectionDeserializer); + SINGLE_PARTITION (SinglePartitionReadCommand.selectionDeserializer, SinglePartitionReadCommand.accordSelectionDeserializer), + PARTITION_RANGE (PartitionRangeReadCommand.selectionDeserializer, ignore -> PartitionRangeReadCommand.selectionDeserializer); private final SelectionDeserializer selectionDeserializer; + private final Function accordSelectionDeserializer; - Kind(SelectionDeserializer selectionDeserializer) + Kind(SelectionDeserializer selectionDeserializer, Function accordSelectionDeserializer) { this.selectionDeserializer = selectionDeserializer; + this.accordSelectionDeserializer = accordSelectionDeserializer; } } @@ -154,6 +206,7 @@ protected ReadCommand(Epoch serializedAtEpoch, boolean isDigestQuery, int digestVersion, boolean acceptsTransient, + PotentialTxnConflicts potentialTxnConflicts, TableMetadata metadata, long nowInSec, ColumnFilter columnFilter, @@ -172,6 +225,7 @@ protected ReadCommand(Epoch serializedAtEpoch, this.digestVersion = digestVersion; this.acceptsTransient = acceptsTransient; this.indexQueryPlan = indexQueryPlan; + this.potentialTxnConflicts = potentialTxnConflicts; this.trackWarnings = trackWarnings; this.serializedAtEpoch = serializedAtEpoch; this.dataRange = dataRange; @@ -183,6 +237,7 @@ public static ReadCommand getCommand() } protected abstract void serializeSelection(DataOutputPlus out, int version) throws IOException; + protected abstract void serializeSelectionWithoutKey(DataOutputPlus out, int version) throws IOException; protected abstract long selectionSerializedSize(int version); public abstract boolean isLimitedToOnePartition(); @@ -323,7 +378,7 @@ public DataRange dataRange() */ public ReadCommand copyAsTransientQuery(Replica replica) { - Preconditions.checkArgument(replica.isTransient(), + checkArgument(replica.isTransient(), "Can't make a transient request on a full replica: " + replica); return copyAsTransientQuery(); } @@ -345,7 +400,7 @@ public ReadCommand copyAsTransientQuery(Iterable replicas) */ public ReadCommand copyAsDigestQuery(Replica replica) { - Preconditions.checkArgument(replica.isFull(), + checkArgument(replica.isFull(), "Can't make a digest request on a transient replica " + replica); return copyAsDigestQuery(); } @@ -440,6 +495,8 @@ public UnfilteredPartitionIterator executeLocally(ReadExecutionController execut try { ColumnFamilyStore cfs = Keyspace.openAndGetStore(metadata()); + if (!potentialTxnConflicts.allowed) + ConsensusRequestRouter.validateSafeToReadNonTransactionally(this); Index.QueryPlan indexQueryPlan = indexQueryPlan(); Index.Searcher searcher = null; @@ -528,6 +585,11 @@ public ReadExecutionController executionController() return ReadExecutionController.forCommand(this, false); } + public PotentialTxnConflicts potentialTxnConflicts() + { + return potentialTxnConflicts; + } + /** * Wraps the provided iterator so that metrics on what is scanned by the command are recorded. * This also log warning/trow TombstoneOverwhelmingException if appropriate. @@ -874,7 +936,7 @@ protected boolean hasPartitionLevelDeletions(SSTableReader sstable) // Skip purgeable tombstones. We do this because it's safe to do (post-merge of the memtable and sstable at least), it // can save us some bandwith, and avoid making us throw a TombstoneOverwhelmingException for purgeable tombstones (which // are to some extend an artefact of compaction lagging behind and hence counting them is somewhat unintuitive). - protected UnfilteredPartitionIterator withoutPurgeableTombstones(UnfilteredPartitionIterator iterator, + protected UnfilteredPartitionIterator withoutPurgeableTombstones(UnfilteredPartitionIterator iterator, ColumnFamilyStore cfs, ReadExecutionController controller) { @@ -1217,6 +1279,7 @@ public static class Serializer implements IVersionedSerializer private static final int HAS_INDEX = 0x04; private static final int ACCEPTS_TRANSIENT = 0x08; private static final int NEEDS_RECONCILIATION = 0x10; + private static final int ALLOWS_POTENTIAL_TXN_CONFLICTS = 0x20; private final SchemaProvider schema; @@ -1281,21 +1344,30 @@ private static boolean needsReconciliation(int flags) return (flags & NEEDS_RECONCILIATION) != 0; } - public void serialize(ReadCommand command, DataOutputPlus out, int version) throws IOException + private static int potentialTxnConflicts(PotentialTxnConflicts potentialTxnConflicts) + { + return potentialTxnConflicts.allowed ? ALLOWS_POTENTIAL_TXN_CONFLICTS : 0; + } + + private static PotentialTxnConflicts potentialTxnConflicts(int flags) + { + return (flags & ALLOWS_POTENTIAL_TXN_CONFLICTS) != 0 ? PotentialTxnConflicts.ALLOW : PotentialTxnConflicts.DISALLOW; + } + + private void serializeHeader(ReadCommand command, DataOutputPlus out, int version) throws IOException { out.writeByte(command.kind.ordinal()); out.writeByte( - digestFlag(command.isDigestQuery()) - | indexFlag(null != command.indexQueryPlan()) - | acceptsTransientFlag(command.acceptsTransient()) - | needsReconciliationFlag(command.rowFilter().needsReconciliation()) + digestFlag(command.isDigestQuery()) + | indexFlag(null != command.indexQueryPlan()) + | acceptsTransientFlag(command.acceptsTransient()) + | needsReconciliationFlag(command.rowFilter().needsReconciliation()) + | potentialTxnConflicts(command.potentialTxnConflicts) ); - if (command.isDigestQuery()) - out.writeUnsignedVInt32(command.digestVersion()); - command.metadata().id.serialize(out); - if (version >= MessagingService.VERSION_51) - Epoch.serializer.serialize(command.serializedAtEpoch, out); - out.writeInt(version >= MessagingService.VERSION_50 ? CassandraUInt.fromLong(command.nowInSec()) : (int) command.nowInSec()); + } + + private void serializeFiltersAndLimits(ReadCommand command, DataOutputPlus out, int version) throws IOException + { ColumnFilter.serializer.serialize(command.columnFilter(), out, version); RowFilter.serializer.serialize(command.rowFilter(), out, version); DataLimits.serializer.serialize(command.limits(), out, version, command.metadata().comparator); @@ -1304,16 +1376,57 @@ public void serialize(ReadCommand command, DataOutputPlus out, int version) thro // from the index name. if (null != command.indexQueryPlan) IndexMetadata.serializer.serialize(command.indexQueryPlan.getFirst().getIndexMetadata(), out, version); + } + public void serialize(ReadCommand command, DataOutputPlus out, int version) throws IOException + { + serializeHeader(command, out, version); + if (command.isDigestQuery()) + out.writeUnsignedVInt32(command.digestVersion()); + command.metadata().id.serialize(out); + if (version >= MessagingService.VERSION_51) + Epoch.serializer.serialize(command.serializedAtEpoch, out); + out.writeInt(version >= MessagingService.VERSION_50 ? CassandraUInt.fromLong(command.nowInSec()) : (int) command.nowInSec()); + serializeFiltersAndLimits(command, out, version); command.serializeSelection(out, version); } + public void serializeForAccord(ReadCommand command, TableMetadatas tables, DataOutputPlus out, int version) throws IOException + { + Invariants.require(!command.isDigestQuery); + serializeHeader(command, out, version); + tables.serialize(command.metadata(), out); + serializeFiltersAndLimits(command, out, version); + command.serializeSelectionWithoutKey(out, version); + } + + private ReadCommand deserialize(SelectionDeserializer deserializer, int flags, Epoch schemaVersion, int digestVersion, long nowInSec, TableMetadata tableMetadata, DataInputPlus in, int version) throws IOException + { + boolean isDigest = isDigest(flags); + boolean acceptsTransient = acceptsTransient(flags); + PotentialTxnConflicts potentialTxnConflicts = potentialTxnConflicts(flags); + boolean hasIndex = hasIndex(flags); + boolean needsReconciliation = needsReconciliation(flags); + + ColumnFilter columnFilter = ColumnFilter.serializer.deserialize(in, version, tableMetadata); + RowFilter rowFilter = RowFilter.serializer.deserialize(in, version, tableMetadata, needsReconciliation); + DataLimits limits = DataLimits.serializer.deserialize(in, version, tableMetadata); + Index.QueryPlan indexQueryPlan = null; + if (hasIndex) + { + IndexMetadata index = deserializeIndexMetadata(in, version, tableMetadata); + Index.Group indexGroup = Keyspace.openAndGetStore(tableMetadata).indexManager.getIndexGroup(index); + if (indexGroup != null) + indexQueryPlan = indexGroup.queryPlanFor(rowFilter); + } + + return deserializer.deserialize(in, version, schemaVersion, isDigest, digestVersion, acceptsTransient, potentialTxnConflicts, tableMetadata, nowInSec, columnFilter, rowFilter, limits, indexQueryPlan); + } + public ReadCommand deserialize(DataInputPlus in, int version) throws IOException { Kind kind = Kind.values()[in.readByte()]; int flags = in.readByte(); - boolean isDigest = isDigest(flags); - boolean acceptsTransient = acceptsTransient(flags); // Shouldn't happen or it's a user error (see comment above) but // better complain loudly than doing the wrong thing. if (isForThrift(flags)) @@ -1322,9 +1435,7 @@ public ReadCommand deserialize(DataInputPlus in, int version) throws IOException + "which is unsupported. Make sure to stop using thrift before " + "upgrading to 4.0"); - boolean hasIndex = hasIndex(flags); - int digestVersion = isDigest ? (int)in.readUnsignedVInt() : 0; - boolean needsReconciliation = needsReconciliation(flags); + int digestVersion = isDigest(flags) ? in.readUnsignedVInt32() : 0; TableId tableId = TableId.deserialize(in); Epoch schemaVersion = Epoch.EMPTY; @@ -1347,19 +1458,19 @@ public ReadCommand deserialize(DataInputPlus in, int version) throws IOException throw e; } long nowInSec = version >= MessagingService.VERSION_50 ? CassandraUInt.toLong(in.readInt()) : in.readInt(); - ColumnFilter columnFilter = ColumnFilter.serializer.deserialize(in, version, tableMetadata); - RowFilter rowFilter = RowFilter.serializer.deserialize(in, version, tableMetadata, needsReconciliation); - DataLimits limits = DataLimits.serializer.deserialize(in, version, tableMetadata); - Index.QueryPlan indexQueryPlan = null; - if (hasIndex) - { - IndexMetadata index = deserializeIndexMetadata(in, version, tableMetadata); - Index.Group indexGroup = Keyspace.openAndGetStore(tableMetadata).indexManager.getIndexGroup(index); - if (indexGroup != null) - indexQueryPlan = indexGroup.queryPlanFor(rowFilter); - } + return deserialize(kind.selectionDeserializer, flags, schemaVersion, digestVersion, nowInSec, tableMetadata, in, version); + } - return kind.selectionDeserializer.deserialize(in, version, schemaVersion, isDigest, digestVersion, acceptsTransient, tableMetadata, nowInSec, columnFilter, rowFilter, limits, indexQueryPlan); + public ReadCommand deserializeForAccord(Seekable key, TableMetadatas tables, DataInputPlus in, int version) throws IOException + { + Kind kind = Kind.values()[in.readByte()]; + int flags = in.readByte(); + if (isDigest(flags) || isForThrift(flags) || acceptsTransient(flags)) + throw new IllegalStateException("Received an Accord command with a digest/thrift/transient flag set."); + + TableMetadata tableMetadata = tables.deserialize(in); + + return deserialize(kind.accordSelectionDeserializer.apply(key), flags, tableMetadata.epoch, 0, 0, tableMetadata, in, version); } private IndexMetadata deserializeIndexMetadata(DataInputPlus in, int version, TableMetadata metadata) throws IOException @@ -1392,5 +1503,16 @@ public long serializedSize(ReadCommand command, int version) + command.selectionSerializedSize(version) + command.indexSerializedSize(version); } + + public long serializedSizeForAccord(ReadCommand command, TableMetadatas tables, int version) + { + return 2 // kind + flags + + tables.serializedSize(command.metadata()) + + ColumnFilter.serializer.serializedSize(command.columnFilter(), version) + + RowFilter.serializer.serializedSize(command.rowFilter(), version) + + DataLimits.serializer.serializedSize(command.limits(), version, command.metadata().comparator) + + command.selectionSerializedSize(version) + + command.indexSerializedSize(version); + } } } \ No newline at end of file diff --git a/src/java/org/apache/cassandra/db/ReadCommandVerbHandler.java b/src/java/org/apache/cassandra/db/ReadCommandVerbHandler.java index 5d430f32cf72..8ff89f4e9b5d 100644 --- a/src/java/org/apache/cassandra/db/ReadCommandVerbHandler.java +++ b/src/java/org/apache/cassandra/db/ReadCommandVerbHandler.java @@ -22,26 +22,28 @@ import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator; -import org.apache.cassandra.exceptions.CoordinatorBehindException; -import org.apache.cassandra.exceptions.InvalidRoutingException; -import org.apache.cassandra.exceptions.QueryCancelledException; import org.apache.cassandra.dht.AbstractBounds; import org.apache.cassandra.dht.Token; +import org.apache.cassandra.exceptions.CoordinatorBehindException; import org.apache.cassandra.exceptions.InvalidRequestException; +import org.apache.cassandra.exceptions.InvalidRoutingException; +import org.apache.cassandra.exceptions.QueryCancelledException; +import org.apache.cassandra.exceptions.RetryOnDifferentSystemException; import org.apache.cassandra.locator.Replica; import org.apache.cassandra.metrics.TCMMetrics; -import org.apache.cassandra.schema.SchemaConstants; -import org.apache.cassandra.service.StorageService; -import org.apache.cassandra.tcm.ClusterMetadataService; import org.apache.cassandra.net.IVerbHandler; import org.apache.cassandra.net.Message; import org.apache.cassandra.net.MessagingService; +import org.apache.cassandra.schema.SchemaConstants; +import org.apache.cassandra.service.StorageService; import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.tcm.ClusterMetadataService; import org.apache.cassandra.tcm.Epoch; import org.apache.cassandra.tracing.Tracing; import org.apache.cassandra.utils.FBUtilities; import static java.util.concurrent.TimeUnit.NANOSECONDS; +import static org.apache.cassandra.exceptions.RequestFailureReason.RETRY_ON_DIFFERENT_TRANSACTION_SYSTEM; public class ReadCommandVerbHandler implements IVerbHandler { @@ -49,6 +51,18 @@ public class ReadCommandVerbHandler implements IVerbHandler private static final Logger logger = LoggerFactory.getLogger(ReadCommandVerbHandler.class); + public ReadResponse doRead(ReadCommand command, boolean trackRepairedData) + { + ReadResponse response; + try (ReadExecutionController controller = command.executionController(trackRepairedData); + UnfilteredPartitionIterator iterator = command.executeLocally(controller)) + { + response = command.createResponse(iterator, controller.getRepairedDataInfo()); + } + + return response; + } + public void doVerb(Message message) { if (message.epoch().isAfter(Epoch.EMPTY)) @@ -68,10 +82,9 @@ public void doVerb(Message message) command.trackWarnings(); ReadResponse response; - try (ReadExecutionController controller = command.executionController(message.trackRepairedData()); - UnfilteredPartitionIterator iterator = command.executeLocally(controller)) + try { - response = command.createResponse(iterator, controller.getRepairedDataInfo()); + response = doRead(command, message.trackRepairedData()); } catch (RejectException e) { @@ -88,6 +101,13 @@ public void doVerb(Message message) MessagingService.instance().send(reply, message.from()); return; } + catch (RetryOnDifferentSystemException e) + { + logger.debug("Responding with retry on different system"); + MessagingService.instance().respondWithFailure(RETRY_ON_DIFFERENT_TRANSACTION_SYSTEM, message); + Tracing.trace("Payload application resulted in RetryOnDifferentSysten"); + return; + } catch (AssertionError t) { throw new AssertionError(String.format("Caught an error while trying to process the command: %s", command.toCQLString()), t); @@ -147,15 +167,21 @@ else if (localComparisonEpoch.isAfter(readCommand.serializedAtEpoch())) private ClusterMetadata checkTokenOwnership(ClusterMetadata metadata, Message message) { ReadCommand command = message.payload; + if (command.metadata().isVirtual()) return metadata; + // Some read commands may be sent using an older Epoch intentionally so validating using the current Epoch + // doesn't work + if (command.potentialTxnConflicts().allowed) + return metadata; + if (command.isTopK()) return metadata; if (command instanceof SinglePartitionReadCommand) { - Token token = ((SinglePartitionReadCommand) command).partitionKey().getToken(); + Token token = ((SinglePartitionReadCommand)command).partitionKey().getToken(); Replica localReplica = getLocalReplica(metadata, token, command.metadata().keyspace); if (localReplica == null) { diff --git a/src/java/org/apache/cassandra/db/ReadRepairVerbHandler.java b/src/java/org/apache/cassandra/db/ReadRepairVerbHandler.java index 8ca29eba1351..d40359c14472 100644 --- a/src/java/org/apache/cassandra/db/ReadRepairVerbHandler.java +++ b/src/java/org/apache/cassandra/db/ReadRepairVerbHandler.java @@ -25,9 +25,14 @@ public class ReadRepairVerbHandler extends AbstractMutationVerbHandler { public static final ReadRepairVerbHandler instance = new ReadRepairVerbHandler(); + public void applyMutation(Mutation mutation) + { + mutation.apply(); + } + void applyMutation(Message message, InetAddressAndPort respondToAddress) { - message.payload.apply(); + applyMutation(message.payload); MessagingService.instance().send(message.emptyResponse(), respondToAddress); } } diff --git a/src/java/org/apache/cassandra/db/ReadResponse.java b/src/java/org/apache/cassandra/db/ReadResponse.java index d4906b2dd563..65e17a6920b9 100644 --- a/src/java/org/apache/cassandra/db/ReadResponse.java +++ b/src/java/org/apache/cassandra/db/ReadResponse.java @@ -17,14 +17,19 @@ */ package org.apache.cassandra.db; -import java.io.*; +import java.io.IOException; import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.List; import com.google.common.annotations.VisibleForTesting; import org.apache.cassandra.db.filter.ColumnFilter; -import org.apache.cassandra.db.partitions.*; -import org.apache.cassandra.db.rows.*; +import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator; +import org.apache.cassandra.db.partitions.UnfilteredPartitionIterators; +import org.apache.cassandra.db.rows.DeserializationHelper; +import org.apache.cassandra.db.rows.Rows; +import org.apache.cassandra.db.rows.UnfilteredRowIterator; import org.apache.cassandra.io.IVersionedSerializer; import org.apache.cassandra.io.util.DataInputBuffer; import org.apache.cassandra.io.util.DataInputPlus; @@ -132,6 +137,37 @@ private String toDebugString(UnfilteredRowIterator partition, TableMetadata meta return sb.toString(); } + /** + * For range reads Accord generates multiple responses per node because each command store executes + * the reads independently. The responses are already sorted in token order so the iterators or digests can be + * merged and still produce a consistent result across different nodes. + * + * This can *only* be called from the node producing the results not the coordinator because isEmptyDigest is + * not serialized + */ + public static ReadResponse merge(List responses, ReadCommand command) + { + if (responses.get(0).isDigestResponse()) + { + Digest digest = Digest.forReadResponse(); + for (ReadResponse response : responses) + digest.update(((DigestResponse)response).digest); + return new DigestResponse(ByteBuffer.wrap(digest.digest())); + } + else + { + List iterators = new ArrayList<>(responses.size()); + for (ReadResponse response : responses) + iterators.add(response.makeIterator(command)); + + // Range responses will not respect the limit because each command store returns a separate response + // so we effectively deserialize and then reserialize in order to apply the limits + // Wasteful, but better than sending it to the coordinator to do it + UnfilteredPartitionIterator filtered = command.limits().filter(UnfilteredPartitionIterators.concat(iterators), 0, command.selectsFullPartition()); + return new LocalDataResponse(filtered, command, NO_OP_REPAIRED_DATA_INFO); + } + } + protected static ByteBuffer makeDigest(UnfilteredPartitionIterator iterator, ReadCommand command) { Digest digest = Digest.forReadResponse(); diff --git a/src/java/org/apache/cassandra/db/SerializationHeader.java b/src/java/org/apache/cassandra/db/SerializationHeader.java index 841f7b305198..0c5fc53beb71 100644 --- a/src/java/org/apache/cassandra/db/SerializationHeader.java +++ b/src/java/org/apache/cassandra/db/SerializationHeader.java @@ -23,6 +23,7 @@ import com.google.common.collect.ImmutableList; +import accord.utils.Invariants; import org.apache.cassandra.db.filter.ColumnFilter; import org.apache.cassandra.db.marshal.AbstractType; import org.apache.cassandra.db.marshal.UTF8Type; @@ -39,6 +40,7 @@ import org.apache.cassandra.schema.TableMetadata; import org.apache.cassandra.serializers.AbstractTypeSerializer; import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.btree.BTree; public class SerializationHeader { @@ -384,6 +386,97 @@ public EncodingStats getEncodingStats() } } + public interface ParameterizedSerializer

+ { + SerializationHeader deserialize(DataInputPlus in, TableMetadata metadata, boolean hasStatic, P param) throws IOException; + void serialize(DataOutputPlus out, SerializationHeader header, boolean hasStatic, P param) throws IOException; + long serializedSize(SerializationHeader header, boolean hasStatic, P param); + } + + public static class StableHeaderSerializer implements ParameterizedSerializer + { + public static StableHeaderSerializer STABLE = new StableHeaderSerializer(); + + @Override + public SerializationHeader deserialize(DataInputPlus in, TableMetadata metadata, boolean hasStatic, Object param) throws IOException + { + int count = in.readUnsignedVInt32(); + boolean isStatic = true; + Columns staticColumns = Columns.NONE, regularColumns = Columns.NONE; + try (BTree.FastBuilder builder = BTree.fastBuilder()) + { + while (count-- > 0) + { + ColumnMetadata next = metadata.getColumnById(in.readUnsignedVInt32()); + if (isStatic != next.isStatic()) + { + Invariants.require(isStatic); + if (!builder.isEmpty()) + { + staticColumns = Columns.from(builder); + if (count > 0) + builder.reset(); + } + isStatic = false; + } + builder.add(next); + } + Columns columns = Columns.from(builder); + if (isStatic) staticColumns = columns; + else regularColumns = columns; + } + RegularAndStaticColumns columns = new RegularAndStaticColumns(staticColumns, regularColumns); + EncodingStats stats = EncodingStats.serializer.deserialize(in); + return new SerializationHeader(false, metadata, columns, stats); + } + + @Override + public void serialize(DataOutputPlus out, SerializationHeader header, boolean hasStatic, Object param) throws IOException + { + out.writeUnsignedVInt32(header.columns.size()); + for (ColumnMetadata c : header.columns.statics) + out.writeUnsignedVInt32(c.uniqueId); + for (ColumnMetadata c : header.columns.regulars) + out.writeUnsignedVInt32(c.uniqueId); + EncodingStats.serializer.serialize(header.stats, out); + } + + @Override + public long serializedSize(SerializationHeader header, boolean hasStatic, Object param) + { + long size = TypeSizes.sizeofUnsignedVInt(header.columns.size()); + for (ColumnMetadata c : header.columns.statics) + size += TypeSizes.sizeofUnsignedVInt(c.uniqueId); + for (ColumnMetadata c : header.columns.regulars) + size += TypeSizes.sizeofUnsignedVInt(c.uniqueId); + size += EncodingStats.serializer.serializedSize(header.stats); + return size; + } + } + + public static class MessagingHeaderSerializer implements ParameterizedSerializer + { + public static MessagingHeaderSerializer MESSAGING = new MessagingHeaderSerializer(); + + @Override + public SerializationHeader deserialize(DataInputPlus in, TableMetadata metadata, boolean hasStatic, ColumnFilter param) throws IOException + { + return serializer.deserializeForMessaging(in, metadata, param, hasStatic); + } + + @Override + public void serialize(DataOutputPlus out, SerializationHeader header, boolean hasStatic, ColumnFilter param) throws IOException + { + serializer.serializeForMessaging(header, param, out, hasStatic); + } + + @Override + public long serializedSize(SerializationHeader header, boolean hasStatic, ColumnFilter param) + { + return serializer.serializedSizeForMessaging(header, param, hasStatic); + } + } + public static class Serializer implements IMetadataComponentSerializer { private final AbstractTypeSerializer typeSerializer = new AbstractTypeSerializer(); diff --git a/src/java/org/apache/cassandra/db/SimpleBuilders.java b/src/java/org/apache/cassandra/db/SimpleBuilders.java index 3564eb1f100a..7139b6c8b21c 100644 --- a/src/java/org/apache/cassandra/db/SimpleBuilders.java +++ b/src/java/org/apache/cassandra/db/SimpleBuilders.java @@ -18,21 +18,31 @@ package org.apache.cassandra.db; import java.nio.ByteBuffer; -import java.util.*; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; -import org.apache.cassandra.schema.ColumnMetadata; -import org.apache.cassandra.schema.TableId; -import org.apache.cassandra.schema.TableMetadata; -import org.apache.cassandra.schema.Schema; import org.apache.cassandra.cql3.ColumnIdentifier; +import org.apache.cassandra.db.ReadCommand.PotentialTxnConflicts; import org.apache.cassandra.db.context.CounterContext; +import org.apache.cassandra.db.marshal.AbstractType; +import org.apache.cassandra.db.marshal.CollectionType; +import org.apache.cassandra.db.marshal.ListType; +import org.apache.cassandra.db.marshal.MapType; +import org.apache.cassandra.db.marshal.SetType; import org.apache.cassandra.db.partitions.PartitionUpdate; import org.apache.cassandra.db.rows.BTreeRow; import org.apache.cassandra.db.rows.BufferCell; import org.apache.cassandra.db.rows.Cell; import org.apache.cassandra.db.rows.CellPath; import org.apache.cassandra.db.rows.Row; -import org.apache.cassandra.db.marshal.*; +import org.apache.cassandra.schema.ColumnMetadata; +import org.apache.cassandra.schema.Schema; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.schema.TableMetadata; import org.apache.cassandra.utils.ByteBufferUtil; import org.apache.cassandra.utils.CounterId; import org.apache.cassandra.utils.FBUtilities; @@ -111,12 +121,20 @@ public static class MutationBuilder extends AbstractBuilder updateBuilders = new HashMap<>(); + private PotentialTxnConflicts potentialTxnConflicts = PotentialTxnConflicts.DISALLOW; + public MutationBuilder(String keyspaceName, DecoratedKey key) { this.keyspaceName = keyspaceName; this.key = key; } + public MutationBuilder allowPotentialTxnConflicts() + { + potentialTxnConflicts = PotentialTxnConflicts.ALLOW; + return this; + } + public PartitionUpdate.SimpleBuilder update(TableMetadata metadata) { assert metadata.keyspace.equals(keyspaceName); @@ -145,9 +163,9 @@ public Mutation build() assert !updateBuilders.isEmpty() : "Cannot create empty mutation"; if (updateBuilders.size() == 1) - return new Mutation(updateBuilders.values().iterator().next().build()); + return new Mutation(updateBuilders.values().iterator().next().build(), potentialTxnConflicts); - Mutation.PartitionUpdateCollector mutationBuilder = new Mutation.PartitionUpdateCollector(keyspaceName, key); + Mutation.PartitionUpdateCollector mutationBuilder = new Mutation.PartitionUpdateCollector(keyspaceName, key, potentialTxnConflicts); for (PartitionUpdateBuilder builder : updateBuilders.values()) mutationBuilder.add(builder.build()); return mutationBuilder.build(); diff --git a/src/java/org/apache/cassandra/db/SinglePartitionReadCommand.java b/src/java/org/apache/cassandra/db/SinglePartitionReadCommand.java index ad6da1e88a5f..ec3b383bb91b 100644 --- a/src/java/org/apache/cassandra/db/SinglePartitionReadCommand.java +++ b/src/java/org/apache/cassandra/db/SinglePartitionReadCommand.java @@ -26,11 +26,14 @@ import java.util.NavigableSet; import java.util.TreeSet; import java.util.concurrent.TimeUnit; +import java.util.function.Function; import java.util.stream.Collectors; import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.Sets; +import accord.primitives.Seekable; +import accord.primitives.Seekables; import org.apache.cassandra.cache.IRowCacheEntry; import org.apache.cassandra.cache.RowCacheKey; import org.apache.cassandra.cache.RowCacheSentinel; @@ -77,6 +80,7 @@ import org.apache.cassandra.service.CacheService; import org.apache.cassandra.service.ClientState; import org.apache.cassandra.service.StorageProxy; +import org.apache.cassandra.service.accord.api.PartitionKey; import org.apache.cassandra.tcm.Epoch; import org.apache.cassandra.tracing.Tracing; import org.apache.cassandra.transport.Dispatcher; @@ -89,6 +93,7 @@ public class SinglePartitionReadCommand extends ReadCommand implements SinglePartitionReadQuery { protected static final SelectionDeserializer selectionDeserializer = new Deserializer(); + protected static final Function accordSelectionDeserializer = AccordDeserializer::new; protected final DecoratedKey partitionKey; protected final ClusteringIndexFilter clusteringIndexFilter; @@ -98,6 +103,7 @@ protected SinglePartitionReadCommand(Epoch serializedAtEpoch, boolean isDigest, int digestVersion, boolean acceptsTransient, + PotentialTxnConflicts potentialTxnConflicts, TableMetadata metadata, long nowInSec, ColumnFilter columnFilter, @@ -109,7 +115,7 @@ protected SinglePartitionReadCommand(Epoch serializedAtEpoch, boolean trackWarnings, DataRange dataRange) { - super(serializedAtEpoch, Kind.SINGLE_PARTITION, isDigest, digestVersion, acceptsTransient, metadata, nowInSec, columnFilter, rowFilter, limits, indexQueryPlan, trackWarnings, dataRange); + super(serializedAtEpoch, Kind.SINGLE_PARTITION, isDigest, digestVersion, acceptsTransient, potentialTxnConflicts, metadata, nowInSec, columnFilter, rowFilter, limits, indexQueryPlan, trackWarnings, dataRange); assert partitionKey.getPartitioner() == metadata.partitioner; this.partitionKey = partitionKey; this.clusteringIndexFilter = clusteringIndexFilter; @@ -119,6 +125,7 @@ private static SinglePartitionReadCommand create(Epoch serializedAtEpoch, boolean isDigest, int digestVersion, boolean acceptsTransient, + PotentialTxnConflicts potentialTxnConflicts, TableMetadata metadata, long nowInSec, ColumnFilter columnFilter, @@ -152,6 +159,7 @@ private static SinglePartitionReadCommand create(Epoch serializedAtEpoch, isDigest, digestVersion, acceptsTransient, + potentialTxnConflicts, metadata, nowInSec, columnFilter, @@ -191,6 +199,7 @@ public static SinglePartitionReadCommand create(TableMetadata metadata, false, 0, false, + PotentialTxnConflicts.DISALLOW, metadata, nowInSec, columnFilter, @@ -202,6 +211,44 @@ public static SinglePartitionReadCommand create(TableMetadata metadata, false); } + /** + * Creates a new read command on a single partition. + * + * @param metadata the table to query. + * @param nowInSec the time in seconds to use are "now" for this query. + * @param columnFilter the column filter to use for the query. + * @param rowFilter the row filter to use for the query. + * @param limits the limits to use for the query. + * @param partitionKey the partition key for the partition to query. + * @param clusteringIndexFilter the clustering index filter to use for the query. + * + * @return a newly created read command. + */ + public static SinglePartitionReadCommand create(TableMetadata metadata, + long nowInSec, + ColumnFilter columnFilter, + RowFilter rowFilter, + DataLimits limits, + DecoratedKey partitionKey, + ClusteringIndexFilter clusteringIndexFilter, + PotentialTxnConflicts potentialTxnConflicts) + { + return create(metadata.epoch, + false, + 0, + false, + potentialTxnConflicts, + metadata, + nowInSec, + columnFilter, + rowFilter, + limits, + partitionKey, + clusteringIndexFilter, + findIndexQueryPlan(metadata, rowFilter), + false); + } + /** * Creates a new read command on a single partition. * @@ -369,6 +416,7 @@ public SinglePartitionReadCommand copy() isDigestQuery(), digestVersion(), acceptsTransient(), + potentialTxnConflicts(), metadata(), nowInSec(), columnFilter(), @@ -387,6 +435,7 @@ protected SinglePartitionReadCommand copyAsDigestQuery() true, digestVersion(), acceptsTransient(), + potentialTxnConflicts(), metadata(), nowInSec(), columnFilter(), @@ -405,6 +454,7 @@ protected SinglePartitionReadCommand copyAsTransientQuery() false, 0, true, + potentialTxnConflicts(), metadata(), nowInSec(), columnFilter(), @@ -423,6 +473,7 @@ public SinglePartitionReadCommand withUpdatedLimit(DataLimits newLimits) isDigestQuery(), digestVersion(), acceptsTransient(), + potentialTxnConflicts(), metadata(), nowInSec(), columnFilter(), @@ -492,7 +543,9 @@ protected void recordLatency(TableMetrics metric, long latencyNanos) metric.readLatency.addNano(latencyNanos); } - protected UnfilteredPartitionIterator queryStorage(final ColumnFamilyStore cfs, ReadExecutionController executionController) + @VisibleForTesting + @SuppressWarnings("resource") // we close the created iterator through closing the result of this method (and SingletonUnfilteredPartitionIterator ctor cannot fail) + public UnfilteredPartitionIterator queryStorage(final ColumnFamilyStore cfs, ReadExecutionController executionController) { // skip the row cache and go directly to sstables/memtable if repaired status of // data is being tracked. This is only requested after an initial digest mismatch @@ -1229,12 +1282,23 @@ protected void serializeSelection(DataOutputPlus out, int version) throws IOExce ClusteringIndexFilter.serializer.serialize(clusteringIndexFilter(), out, version); } + protected void serializeSelectionWithoutKey(DataOutputPlus out, int version) throws IOException + { + ClusteringIndexFilter.serializer.serialize(clusteringIndexFilter(), out, version); + } + protected long selectionSerializedSize(int version) { return metadata().partitionKeyType.writtenLength(partitionKey().getKey()) + ClusteringIndexFilter.serializer.serializedSize(clusteringIndexFilter(), version); } + protected long selectionSerializedSize(Seekables seekables, int version) + { + return metadata().partitionKeyType.writtenLength(partitionKey().getKey()) + + ClusteringIndexFilter.serializer.serializedSize(clusteringIndexFilter(), version); + } + public boolean isLimitedToOnePartition() { return true; @@ -1245,6 +1309,29 @@ public boolean isRangeRequest() return false; } + /* + * When running transactionally we need to use the txn system nowInSeconds, and set whether reconciliation + * should be performed based on whether it's part of a multiple replica read. We also allow potential txn conflicts + * because we manage those conflicts from the txn system + */ + public SinglePartitionReadCommand withTransactionalSettings(boolean withoutReconciliation, long nowInSeconds) + { + return create(serializedAtEpoch(), + isDigestQuery(), + digestVersion(), + acceptsTransient(), + PotentialTxnConflicts.ALLOW, + metadata(), + nowInSeconds, + columnFilter(), + withoutReconciliation ? rowFilter().withoutReconciliation() : rowFilter(), + limits(), + partitionKey(), + clusteringIndexFilter(), + indexQueryPlan(), + isTrackingWarnings()); + } + /** * Groups multiple single partition read commands. */ @@ -1256,7 +1343,8 @@ public static Group create(TableMetadata metadata, RowFilter rowFilter, DataLimits limits, List partitionKeys, - ClusteringIndexFilter clusteringIndexFilter) + ClusteringIndexFilter clusteringIndexFilter, + PotentialTxnConflicts potentialTxnConflicts) { List commands = new ArrayList<>(partitionKeys.size()); for (DecoratedKey partitionKey : partitionKeys) @@ -1267,7 +1355,8 @@ public static Group create(TableMetadata metadata, rowFilter, limits, partitionKey, - clusteringIndexFilter)); + clusteringIndexFilter, + potentialTxnConflicts)); } return create(commands, limits); @@ -1323,6 +1412,7 @@ public ReadCommand deserialize(DataInputPlus in, boolean isDigest, int digestVersion, boolean acceptsTransient, + PotentialTxnConflicts potentialTxnConflicts, TableMetadata metadata, long nowInSec, ColumnFilter columnFilter, @@ -1333,7 +1423,36 @@ public ReadCommand deserialize(DataInputPlus in, { DecoratedKey key = metadata.partitioner.decorateKey(metadata.partitionKeyType.readBuffer(in, DatabaseDescriptor.getMaxValueSize())); ClusteringIndexFilter filter = ClusteringIndexFilter.serializer.deserialize(in, version, metadata); - return SinglePartitionReadCommand.create(serializedAtEpoch, isDigest, digestVersion, acceptsTransient, metadata, nowInSec, columnFilter, rowFilter, limits, key, filter, indexQueryPlan, false); + return SinglePartitionReadCommand.create(serializedAtEpoch, isDigest, digestVersion, acceptsTransient, potentialTxnConflicts, metadata, nowInSec, columnFilter, rowFilter, limits, key, filter, indexQueryPlan, false); + } + } + + private static class AccordDeserializer extends SelectionDeserializer + { + final DecoratedKey key; + + private AccordDeserializer(Seekable seekable) + { + this.key = ((PartitionKey)seekable).partitionKey(); + } + + public ReadCommand deserialize(DataInputPlus in, + int version, + Epoch serializedAtEpoch, + boolean isDigest, + int digestVersion, + boolean acceptsTransient, + PotentialTxnConflicts potentialTxnConflicts, + TableMetadata metadata, + long nowInSec, + ColumnFilter columnFilter, + RowFilter rowFilter, + DataLimits limits, + Index.QueryPlan indexQueryPlan) + throws IOException + { + ClusteringIndexFilter filter = ClusteringIndexFilter.serializer.deserialize(in, version, metadata); + return SinglePartitionReadCommand.create(serializedAtEpoch, isDigest, digestVersion, acceptsTransient, potentialTxnConflicts, metadata, nowInSec, columnFilter, rowFilter, limits, key, filter, indexQueryPlan, false); } } @@ -1381,7 +1500,7 @@ protected VirtualTableSinglePartitionReadCommand(boolean isDigest, boolean trackWarnings, DataRange dataRange) { - super(metadata.epoch, isDigest, digestVersion, acceptsTransient, metadata, nowInSec, columnFilter, + super(metadata.epoch, isDigest, digestVersion, acceptsTransient, PotentialTxnConflicts.ALLOW, metadata, nowInSec, columnFilter, rowFilter, limits, partitionKey, clusteringIndexFilter, indexQueryPlan, trackWarnings, dataRange); } diff --git a/src/java/org/apache/cassandra/db/SinglePartitionReadQuery.java b/src/java/org/apache/cassandra/db/SinglePartitionReadQuery.java index 5409cde8c493..0ec29618612c 100644 --- a/src/java/org/apache/cassandra/db/SinglePartitionReadQuery.java +++ b/src/java/org/apache/cassandra/db/SinglePartitionReadQuery.java @@ -23,9 +23,9 @@ import java.util.stream.Collectors; import com.google.common.collect.Iterables; - import org.apache.commons.lang3.tuple.Pair; +import org.apache.cassandra.db.ReadCommand.PotentialTxnConflicts; import org.apache.cassandra.db.filter.ClusteringIndexFilter; import org.apache.cassandra.db.filter.ColumnFilter; import org.apache.cassandra.db.filter.DataLimits; @@ -51,9 +51,10 @@ public static Group createGroup(TableMetadat RowFilter rowFilter, DataLimits limits, List partitionKeys, - ClusteringIndexFilter clusteringIndexFilter) + ClusteringIndexFilter clusteringIndexFilter, + PotentialTxnConflicts potentialTxnConflicts) { - return SinglePartitionReadCommand.Group.create(metadata, nowInSec, columnFilter, rowFilter, limits, partitionKeys, clusteringIndexFilter); + return SinglePartitionReadCommand.Group.create(metadata, nowInSec, columnFilter, rowFilter, limits, partitionKeys, clusteringIndexFilter, potentialTxnConflicts); } diff --git a/src/java/org/apache/cassandra/db/SystemKeyspace.java b/src/java/org/apache/cassandra/db/SystemKeyspace.java index 64436a940339..dd38684b2da4 100644 --- a/src/java/org/apache/cassandra/db/SystemKeyspace.java +++ b/src/java/org/apache/cassandra/db/SystemKeyspace.java @@ -54,6 +54,7 @@ import org.slf4j.LoggerFactory; import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.cql3.QueryHandler.Prepared; import org.apache.cassandra.cql3.QueryProcessor; import org.apache.cassandra.cql3.UntypedResultSet; import org.apache.cassandra.cql3.statements.schema.CreateTableStatement; @@ -79,8 +80,10 @@ import org.apache.cassandra.io.sstable.SSTableId; import org.apache.cassandra.io.sstable.SequenceBasedSSTableId; import org.apache.cassandra.io.util.DataInputBuffer; +import org.apache.cassandra.io.util.DataInputPlus; import org.apache.cassandra.io.util.DataOutputBuffer; import org.apache.cassandra.io.util.File; +import org.apache.cassandra.io.util.FileUtils; import org.apache.cassandra.io.util.RebufferingInputStream; import org.apache.cassandra.locator.InetAddressAndPort; import org.apache.cassandra.locator.MetaStrategy; @@ -132,9 +135,11 @@ import static java.util.concurrent.TimeUnit.MICROSECONDS; import static org.apache.cassandra.config.Config.PaxosStatePurging.legacy; import static org.apache.cassandra.config.DatabaseDescriptor.paxosStatePurging; +import static org.apache.cassandra.cql3.QueryProcessor.PREPARED_STATEMENT_CACHE_SIZE_BYTES; import static org.apache.cassandra.cql3.QueryProcessor.executeInternal; import static org.apache.cassandra.cql3.QueryProcessor.executeInternalWithNowInSec; import static org.apache.cassandra.cql3.QueryProcessor.executeOnceInternal; +import static org.apache.cassandra.cql3.QueryProcessor.executeOnceInternalWithPaging; import static org.apache.cassandra.gms.ApplicationState.DC; import static org.apache.cassandra.gms.ApplicationState.HOST_ID; import static org.apache.cassandra.gms.ApplicationState.INTERNAL_ADDRESS_AND_PORT; @@ -143,6 +148,8 @@ import static org.apache.cassandra.gms.ApplicationState.RELEASE_VERSION; import static org.apache.cassandra.gms.ApplicationState.STATUS_WITH_PORT; import static org.apache.cassandra.gms.ApplicationState.TOKENS; +import org.apache.cassandra.service.consensus.migration.ConsensusMigratedAt; +import org.apache.cassandra.service.consensus.migration.ConsensusMigrationTarget; import static org.apache.cassandra.service.paxos.Commit.latest; import static org.apache.cassandra.service.snapshot.SnapshotOptions.systemSnapshot; import static org.apache.cassandra.utils.CassandraVersion.NULL_VERSION; @@ -161,6 +168,7 @@ private SystemKeyspace() public static final String BATCHES = "batches"; public static final String PAXOS = "paxos"; + public static final String CONSENSUS_MIGRATION_STATE = "consensus_migration_state"; public static final String PAXOS_REPAIR_HISTORY = "paxos_repair_history"; public static final String PAXOS_REPAIR_STATE = "_paxos_repair_state"; public static final String BUILT_INDEXES = "IndexInfo"; @@ -189,6 +197,7 @@ private SystemKeyspace() */ public static final Set TABLES_SPLIT_ACROSS_MULTIPLE_DISKS = ImmutableSet.of(BATCHES, PAXOS, + CONSENSUS_MIGRATION_STATE, COMPACTION_HISTORY, PREPARED_STATEMENTS, REPAIRS); @@ -214,14 +223,14 @@ private SystemKeyspace() TABLE_ESTIMATES_TYPE_LOCAL_PRIMARY, AVAILABLE_RANGES_V2, TRANSFERRED_RANGES_V2, VIEW_BUILDS_IN_PROGRESS, BUILT_VIEWS, PREPARED_STATEMENTS, REPAIRS, TOP_PARTITIONS, LEGACY_PEERS, LEGACY_PEER_EVENTS, LEGACY_TRANSFERRED_RANGES, LEGACY_AVAILABLE_RANGES, LEGACY_SIZE_ESTIMATES, LEGACY_SSTABLE_ACTIVITY, - METADATA_LOG, SNAPSHOT_TABLE_NAME); + METADATA_LOG, SNAPSHOT_TABLE_NAME, CONSENSUS_MIGRATION_STATE); public static final Set TABLE_NAMES = ImmutableSet.of( - BATCHES, PAXOS, PAXOS_REPAIR_HISTORY, BUILT_INDEXES, LOCAL, PEERS_V2, PEER_EVENTS_V2, - COMPACTION_HISTORY, SSTABLE_ACTIVITY_V2, TABLE_ESTIMATES, AVAILABLE_RANGES_V2, TRANSFERRED_RANGES_V2, VIEW_BUILDS_IN_PROGRESS, - BUILT_VIEWS, PREPARED_STATEMENTS, REPAIRS, TOP_PARTITIONS, LEGACY_PEERS, LEGACY_PEER_EVENTS, + BATCHES, PAXOS, PAXOS_REPAIR_HISTORY, BUILT_INDEXES, LOCAL, PEERS_V2, PEER_EVENTS_V2, + COMPACTION_HISTORY, SSTABLE_ACTIVITY_V2, TABLE_ESTIMATES, AVAILABLE_RANGES_V2, TRANSFERRED_RANGES_V2, VIEW_BUILDS_IN_PROGRESS, + BUILT_VIEWS, PREPARED_STATEMENTS, REPAIRS, TOP_PARTITIONS, LEGACY_PEERS, LEGACY_PEER_EVENTS, LEGACY_TRANSFERRED_RANGES, LEGACY_AVAILABLE_RANGES, LEGACY_SIZE_ESTIMATES, LEGACY_SSTABLE_ACTIVITY, - METADATA_LOG, SNAPSHOT_TABLE_NAME); + METADATA_LOG, SNAPSHOT_TABLE_NAME, CONSENSUS_MIGRATION_STATE); public static final TableMetadata Batches = parse(BATCHES, @@ -254,6 +263,25 @@ private SystemKeyspace() .indexes(PaxosUncommittedIndex.indexes()) .build(); + private static final TableMetadata ConsensusMigrationState = + parse(CONSENSUS_MIGRATION_STATE, + "Keys that have been migrated to another consensus protocol", + "CREATE TABLE %s (" + + "row_key blob, " + + "cf_id UUID, " + + "consensus_migrated_at_epoch bigint, " + + "consensus_target tinyint, " + + "PRIMARY KEY ((row_key), cf_id, consensus_migrated_at_epoch)) " + + "WITH CLUSTERING ORDER BY (cf_id ASC, consensus_migrated_at_epoch DESC)") + .compaction(CompactionParams.twcs( + ImmutableMap.of( + "compaction_window_unit", "MINUTES", + "compaction_window_size", + // 7 days divided into 30 windows + String.valueOf((7 * 24 * 60) / 30)))) + .defaultTimeToLive((int)TimeUnit.DAYS.toSeconds(7)) + .build(); + private static final TableMetadata BuiltIndexes = parse(BUILT_INDEXES, "built column indexes", @@ -601,11 +629,14 @@ private static Tables tables() Repairs, TopPartitions, LocalMetadataLog, - Snapshots); + Snapshots, + ConsensusMigrationState); } private static volatile Map> truncationRecords; + private static final Object truncationRecordLock = new Object(); + public enum BootstrapState { NEEDS_BOOTSTRAP, @@ -801,27 +832,33 @@ public static Map, Pair> getViewBuildStatus(String ksn return status; } - public static synchronized void saveTruncationRecord(ColumnFamilyStore cfs, long truncatedAt, CommitLogPosition position) + public static void saveTruncationRecord(ColumnFamilyStore cfs, long truncatedAt, CommitLogPosition position) { - String req = "UPDATE system.%s SET truncated_at = truncated_at + ? WHERE key = '%s'"; - executeInternal(format(req, LOCAL, LOCAL), truncationAsMapEntry(cfs, truncatedAt, position)); - truncationRecords = null; - forceBlockingFlush(LOCAL); + synchronized (truncationRecordLock) + { + String req = "UPDATE system.%s SET truncated_at = truncated_at + ? WHERE key = '%s'"; + executeInternal(format(req, LOCAL, LOCAL), truncationAsMapEntry(cfs, truncatedAt, position)); + truncationRecords = null; + forceBlockingFlush(LOCAL); + } } /** * This method is used to remove information about truncation time for specified column family */ - public static synchronized void removeTruncationRecord(TableId id) + public static void removeTruncationRecord(TableId id) { - Pair truncationRecord = getTruncationRecord(id); - if (truncationRecord == null) - return; - - String req = "DELETE truncated_at[?] from system.%s WHERE key = '%s'"; - executeInternal(format(req, LOCAL, LOCAL), id.asUUID()); - truncationRecords = null; - forceBlockingFlush(LOCAL); + synchronized (truncationRecordLock) + { + Pair truncationRecord = getTruncationRecord(id); + if (truncationRecord == null) + return; + + String req = "DELETE truncated_at[?] from system.%s WHERE key = '%s'"; + executeInternal(format(req, LOCAL, LOCAL), id.asUUID()); + truncationRecords = null; + forceBlockingFlush(LOCAL); + } } private static Map truncationAsMapEntry(ColumnFamilyStore cfs, long truncatedAt, CommitLogPosition position) @@ -850,11 +887,14 @@ public static long getTruncatedAt(TableId id) return record == null ? Long.MIN_VALUE : record.right; } - private static synchronized Pair getTruncationRecord(TableId id) + private static Pair getTruncationRecord(TableId id) { - if (truncationRecords == null) - truncationRecords = readTruncationRecords(); - return truncationRecords.get(id); + synchronized (truncationRecordLock) + { + if (truncationRecords == null) + truncationRecords = readTruncationRecords(); + return truncationRecords.get(id); + } } private static Map> readTruncationRecords() @@ -964,6 +1004,13 @@ public static synchronized void updateRack(String rack) executeInternal(format(req, LOCAL, LOCAL), rack); } + public static synchronized void updateLocation(Location location) + { + String req = "INSERT INTO system.%s (key, data_center, rack) VALUES ('%s', ?, ?)"; + executeInternal(format(req, LOCAL, LOCAL), location.datacenter, location.rack); + forceBlockingFlush(LOCAL); + } + public static Set tokensAsSet(Collection tokens) { if (tokens.isEmpty()) @@ -1579,6 +1626,27 @@ public static PaxosRepairHistory loadPaxosRepairHistory(String keyspace, String return PaxosRepairHistory.fromTupleBufferList(keyspace, table, points); } + public static void saveConsensusKeyMigrationState(ByteBuffer partitionKey, UUID cfId, ConsensusMigratedAt consensusMigratedAt) + { + String cql = "UPDATE system." + CONSENSUS_MIGRATION_STATE + " SET consensus_target = ? WHERE row_key = ? AND cf_id = ? AND consensus_migrated_at_epoch = ?"; + executeInternal(cql, consensusMigratedAt.migratedAtTarget.value, partitionKey, cfId, consensusMigratedAt.migratedAtEpoch.getEpoch()); + } + + public static ConsensusMigratedAt loadConsensusKeyMigrationState(ByteBuffer partitionKey, UUID cfId) + { + String cql = "SELECT consensus_migrated_at_epoch, consensus_target FROM system." + CONSENSUS_MIGRATION_STATE + " WHERE row_key = ? AND cf_id = ? LIMIT 1"; + UntypedResultSet results = executeInternal(cql, partitionKey, cfId); + + if (results.isEmpty()) + return null; + + UntypedResultSet.Row row = results.one(); + // TODO Period won't be necessary eventually + Epoch migratedAtEpoch = Epoch.create(row.getLong("consensus_migrated_at_epoch")); + ConsensusMigrationTarget target = ConsensusMigrationTarget.fromValue(row.getByte("consensus_target")); + return new ConsensusMigratedAt(migratedAtEpoch, target); + } + /** * Returns a RestorableMeter tracking the average read rate of a particular SSTable, restoring the last-seen rate * from values in system.sstable_activity if present. @@ -1837,9 +1905,9 @@ public static void snapshotOnVersionChange() if (!previous.equals(NULL_VERSION.toString()) && !previous.equals(next)) { List entities = new ArrayList<>(); - for (String keyspace : SchemaConstants.LOCAL_SYSTEM_KEYSPACE_NAMES) + for (Keyspace keyspace : Keyspace.system()) { - for (ColumnFamilyStore cfs : Keyspace.open(keyspace).getColumnFamilyStores()) + for (ColumnFamilyStore cfs : keyspace.getColumnFamilyStores()) entities.add(cfs.getKeyspaceTableName()); } @@ -1916,12 +1984,10 @@ public static ByteBuffer rangeToBytes(Range range) @SuppressWarnings("unchecked") private static Range byteBufferToRange(ByteBuffer rawRange, IPartitioner partitioner) { - try + try (DataInputPlus.DataInputStreamPlus in = new DataInputBuffer(ByteBufferUtil.getArray(rawRange))) { // See rangeToBytes above for why version is 0. - return (Range) Range.tokenSerializer.deserialize(new DataInputBuffer(ByteBufferUtil.getArray(rawRange)), - partitioner, - 0); + return (Range) Range.tokenSerializer.deserialize(in, partitioner, 0); } catch (IOException e) { @@ -1929,17 +1995,17 @@ private static Range byteBufferToRange(ByteBuffer rawRange, IPartitioner } } - public static void writePreparedStatement(String loggedKeyspace, MD5Digest key, String cql) + public static void writePreparedStatement(String loggedKeyspace, MD5Digest key, String cql, long timestamp) { - executeInternal(format("INSERT INTO %s (logged_keyspace, prepared_id, query_string) VALUES (?, ?, ?)", + executeInternal(format("INSERT INTO %s (logged_keyspace, prepared_id, query_string) VALUES (?, ?, ?) USING TIMESTAMP ?", PreparedStatements.toString()), - loggedKeyspace, key.byteBuffer(), cql); + loggedKeyspace, key.byteBuffer(), cql, timestamp); logger.debug("stored prepared statement for logged keyspace '{}': '{}'", loggedKeyspace, cql); } public static void removePreparedStatement(MD5Digest key) { - executeInternal(format("DELETE FROM %s WHERE prepared_id = ?", PreparedStatements.toString()), + executeInternal(format("DELETE FROM %s WHERE prepared_id = ?", PreparedStatements), key.byteBuffer()); } @@ -1949,17 +2015,50 @@ public static void resetPreparedStatements() preparedStatements.truncateBlockingWithoutSnapshot(); } - public static int loadPreparedStatements(TriFunction onLoaded) + public static int loadPreparedStatements(TriFunction onLoaded) + { + return loadPreparedStatements(onLoaded, QueryProcessor.PRELOAD_PREPARED_STATEMENTS_FETCH_SIZE); + } + + public static int loadPreparedStatements(TriFunction onLoaded, int pageSize) { String query = String.format("SELECT prepared_id, logged_keyspace, query_string FROM %s.%s", SchemaConstants.SYSTEM_KEYSPACE_NAME, PREPARED_STATEMENTS); - UntypedResultSet resultSet = executeOnceInternal(query); + UntypedResultSet resultSet = executeOnceInternalWithPaging(query, pageSize); int counter = 0; + + // As the cache size may be briefly exceeded before statements are evicted, we allow loading 110% the cache size + // to avoid logging early. + long preparedBytesLoadThreshold = (long) (PREPARED_STATEMENT_CACHE_SIZE_BYTES * 1.1); + long preparedBytesLoaded = 0L; for (UntypedResultSet.Row row : resultSet) { - if (onLoaded.apply(MD5Digest.wrap(row.getByteArray("prepared_id")), - row.getString("query_string"), - row.has("logged_keyspace") ? row.getString("logged_keyspace") : null)) + Prepared prepared = onLoaded.apply(MD5Digest.wrap(row.getByteArray("prepared_id")), + row.getString("query_string"), + row.has("logged_keyspace") ? row.getString("logged_keyspace") : null); + if (prepared != null) + { counter++; + preparedBytesLoaded += Math.max(0, prepared.pstmntSize); + + if (preparedBytesLoaded > preparedBytesLoadThreshold) + { + // In the event that we detect that we have loaded more bytes than the cache size return early to + // prevent an indefinite startup time. This is almost certainly caused by the prepared statement cache + // leaking (CASSANDRA-19703) which should not recur after being on a version running this code. + // In such a case it's better to warn and continue startup than to continually page over millions of + // prepared statements that would be immediately evicted. + logger.warn("Detected prepared statement cache filling up during preload after preparing {} " + + "statements (loaded {} with prepared_statements_cache_size being {}). " + + "This could be an indication that prepared statements leaked prior to CASSANDRA-19703 " + + "being fixed. Returning early to prevent indefinite startup. " + + "Consider truncating {}.{} to clear out leaked prepared statements.", + counter, + FileUtils.stringifyFileSize(preparedBytesLoaded), + FileUtils.stringifyFileSize(PREPARED_STATEMENT_CACHE_SIZE_BYTES), + SchemaConstants.SYSTEM_KEYSPACE_NAME, PREPARED_STATEMENTS); + break; + } + } } return counter; } diff --git a/src/java/org/apache/cassandra/db/WriteType.java b/src/java/org/apache/cassandra/db/WriteType.java index 11909e747614..3d0077046515 100644 --- a/src/java/org/apache/cassandra/db/WriteType.java +++ b/src/java/org/apache/cassandra/db/WriteType.java @@ -17,6 +17,10 @@ */ package org.apache.cassandra.db; +/** + * Identifier for what type of operation timed out. This type is driver facing as a String, but some drivers convert + * this to an enum, meaning any changes to this type require protocol changes and driver support. + */ public enum WriteType { SIMPLE, @@ -26,5 +30,6 @@ public enum WriteType BATCH_LOG, CAS, VIEW, - CDC; + CDC + //TODO update client protocol to support "TRANSACTION" } diff --git a/src/java/org/apache/cassandra/db/commitlog/AbstractCommitLogSegmentManager.java b/src/java/org/apache/cassandra/db/commitlog/AbstractCommitLogSegmentManager.java index dcd791caf306..33489688818b 100644 --- a/src/java/org/apache/cassandra/db/commitlog/AbstractCommitLogSegmentManager.java +++ b/src/java/org/apache/cassandra/db/commitlog/AbstractCommitLogSegmentManager.java @@ -56,7 +56,7 @@ import org.apache.cassandra.utils.concurrent.WaitQueue; import static org.apache.cassandra.concurrent.ExecutorFactory.Global.executorFactory; -import static org.apache.cassandra.concurrent.InfiniteLoopExecutor.Daemon.NON_DAEMON; +import static org.apache.cassandra.concurrent.ExecutorFactory.SystemThreadTag.NON_DAEMON; import static org.apache.cassandra.concurrent.InfiniteLoopExecutor.Interrupts.SYNCHRONIZED; import static org.apache.cassandra.concurrent.InfiniteLoopExecutor.SimulatorSafe.SAFE; import static org.apache.cassandra.db.commitlog.CommitLogSegment.Allocation; diff --git a/src/java/org/apache/cassandra/db/commitlog/AbstractCommitLogService.java b/src/java/org/apache/cassandra/db/commitlog/AbstractCommitLogService.java index cd3eb56105d6..7bba9c49110d 100644 --- a/src/java/org/apache/cassandra/db/commitlog/AbstractCommitLogService.java +++ b/src/java/org/apache/cassandra/db/commitlog/AbstractCommitLogService.java @@ -38,7 +38,7 @@ import static java.util.concurrent.TimeUnit.NANOSECONDS; import static org.apache.cassandra.concurrent.ExecutorFactory.Global.executorFactory; -import static org.apache.cassandra.concurrent.InfiniteLoopExecutor.Daemon.NON_DAEMON; +import static org.apache.cassandra.concurrent.ExecutorFactory.SystemThreadTag.NON_DAEMON; import static org.apache.cassandra.concurrent.InfiniteLoopExecutor.Interrupts.SYNCHRONIZED; import static org.apache.cassandra.concurrent.InfiniteLoopExecutor.SimulatorSafe.SAFE; import static org.apache.cassandra.concurrent.Interruptible.State.NORMAL; diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionIterator.java b/src/java/org/apache/cassandra/db/compaction/CompactionIterator.java index eb1e761493d0..53b4d57f2e2f 100644 --- a/src/java/org/apache/cassandra/db/compaction/CompactionIterator.java +++ b/src/java/org/apache/cassandra/db/compaction/CompactionIterator.java @@ -17,6 +17,9 @@ */ package org.apache.cassandra.db.compaction; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.ArrayDeque; import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; @@ -24,11 +27,24 @@ import java.util.Map; import java.util.concurrent.TimeUnit; import java.util.function.LongPredicate; +import java.util.function.Supplier; +import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.ImmutableSet; import com.google.common.collect.Ordering; - +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import accord.local.Cleanup; +import accord.local.DurableBefore; +import accord.local.RedundantBefore; +import accord.utils.Invariants; +import accord.utils.UnhandledEnum; +import accord.utils.btree.BTree; +import accord.utils.btree.BulkIterator; +import accord.utils.btree.UpdateFunction; import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.cql3.ColumnIdentifier; import org.apache.cassandra.db.AbstractCompactionController; import org.apache.cassandra.db.ColumnFamilyStore; import org.apache.cassandra.db.Columns; @@ -38,11 +54,16 @@ import org.apache.cassandra.db.Keyspace; import org.apache.cassandra.db.RegularAndStaticColumns; import org.apache.cassandra.db.SystemKeyspace; -import org.apache.cassandra.db.transform.DuplicateRowChecker; import org.apache.cassandra.db.filter.ColumnFilter; +import org.apache.cassandra.db.marshal.Int32Type; +import org.apache.cassandra.db.marshal.LongType; +import org.apache.cassandra.db.partitions.PartitionUpdate; import org.apache.cassandra.db.partitions.PurgeFunction; import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator; import org.apache.cassandra.db.partitions.UnfilteredPartitionIterators; +import org.apache.cassandra.db.rows.BTreeRow; +import org.apache.cassandra.db.rows.BufferCell; +import org.apache.cassandra.db.rows.ColumnData; import org.apache.cassandra.db.rows.RangeTombstoneBoundMarker; import org.apache.cassandra.db.rows.RangeTombstoneMarker; import org.apache.cassandra.db.rows.Row; @@ -51,25 +72,51 @@ import org.apache.cassandra.db.rows.UnfilteredRowIterator; import org.apache.cassandra.db.rows.UnfilteredRowIterators; import org.apache.cassandra.db.rows.WrappingUnfilteredRowIterator; +import org.apache.cassandra.db.transform.DuplicateRowChecker; import org.apache.cassandra.db.transform.Transformation; import org.apache.cassandra.dht.Token; +import org.apache.cassandra.exceptions.UnknownTableException; import org.apache.cassandra.index.transactions.CompactionTransaction; import org.apache.cassandra.index.transactions.IndexTransaction; import org.apache.cassandra.io.sstable.ISSTableScanner; import org.apache.cassandra.io.sstable.format.SSTableReader; +import org.apache.cassandra.io.util.DataInputBuffer; +import org.apache.cassandra.io.util.DataOutputBuffer; import org.apache.cassandra.metrics.TopPartitionTracker; +import org.apache.cassandra.schema.ColumnMetadata; import org.apache.cassandra.schema.CompactionParams.TombstoneOption; import org.apache.cassandra.schema.Schema; import org.apache.cassandra.schema.SchemaConstants; import org.apache.cassandra.schema.TableId; import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.service.ClientState; +import org.apache.cassandra.service.accord.AccordJournal; +import org.apache.cassandra.service.accord.AccordJournalValueSerializers.FlyweightImage; +import org.apache.cassandra.service.accord.AccordJournalValueSerializers.FlyweightSerializer; +import org.apache.cassandra.service.accord.AccordKeyspace; +import org.apache.cassandra.service.accord.AccordKeyspace.CommandsForKeyAccessor; +import org.apache.cassandra.service.accord.AccordService; +import org.apache.cassandra.service.accord.IAccordService; +import org.apache.cassandra.service.accord.IAccordService.AccordCompactionInfo; +import org.apache.cassandra.service.accord.IAccordService.AccordCompactionInfos; +import org.apache.cassandra.service.accord.JournalKey; +import org.apache.cassandra.service.accord.api.TokenKey; +import org.apache.cassandra.service.accord.journal.AccordTopologyUpdate; +import org.apache.cassandra.service.accord.serializers.Version; import org.apache.cassandra.service.paxos.PaxosRepairHistory; import org.apache.cassandra.service.paxos.uncommitted.PaxosRows; +import org.apache.cassandra.utils.NoSpamLogger; +import org.apache.cassandra.utils.NoSpamLogger.NoSpamLogStatement; import org.apache.cassandra.utils.TimeUUID; +import static accord.local.Cleanup.Input.PARTIAL; +import static accord.local.Cleanup.NO; +import static com.google.common.base.Preconditions.checkState; import static java.util.concurrent.TimeUnit.MICROSECONDS; +import static java.util.concurrent.TimeUnit.MINUTES; import static org.apache.cassandra.config.Config.PaxosStatePurging.legacy; import static org.apache.cassandra.config.DatabaseDescriptor.paxosStatePurging; +import static org.apache.cassandra.service.accord.AccordKeyspace.CFKAccessor; /** * Merge multiple iterators over the content of sstable into a "compacted" iterator. @@ -89,7 +136,9 @@ */ public class CompactionIterator extends CompactionInfo.Holder implements UnfilteredPartitionIterator { - private static final long UNFILTERED_TO_UPDATE_PROGRESS = 100; + private static final Logger logger = LoggerFactory.getLogger(CompactionIterator.class); + private static final NoSpamLogStatement unknownTable = NoSpamLogger.getStatement(logger, "Unknown (probably dropped) TableId {} reading {}; skipping record", 1L, MINUTES); + private static final long UNFILTERED_TO_UPDATE_PROGRESS = 128; private final OperationType type; private final AbstractCompactionController controller; @@ -126,6 +175,35 @@ public CompactionIterator(OperationType type, TimeUUID compactionId, ActiveCompactionsTracker activeCompactions, TopPartitionTracker.Collector topPartitionCollector) + { + this(type, scanners, controller, nowInSec, compactionId, activeCompactions, topPartitionCollector, + AccordService.isSetup() ? AccordService.instance() : null); + } + + public CompactionIterator(OperationType type, + List scanners, + AbstractCompactionController controller, + long nowInSec, + TimeUUID compactionId, + ActiveCompactionsTracker activeCompactions, + TopPartitionTracker.Collector topPartitionCollector, + IAccordService accord) + { + this(type, scanners, controller, nowInSec, compactionId, activeCompactions, topPartitionCollector, + () -> accord.getCompactionInfo(), + () -> Version.fromVersion(accord.journalConfiguration().userVersion())); + } + + @VisibleForTesting + public CompactionIterator(OperationType type, + List scanners, + AbstractCompactionController controller, + long nowInSec, + TimeUUID compactionId, + ActiveCompactionsTracker activeCompactions, + TopPartitionTracker.Collector topPartitionCollector, + Supplier compactionInfos, + Supplier accordVersion) { this.controller = controller; this.type = type; @@ -151,14 +229,29 @@ public CompactionIterator(OperationType type, if (topPartitionCollector != null) // need to count tombstones before they are purged merged = Transformation.apply(merged, new TopPartitionTracker.TombstoneCounter(topPartitionCollector, nowInSec)); merged = Transformation.apply(merged, new GarbageSkipper(controller)); - Transformation purger = isPaxos(controller.cfs) && paxosStatePurging() != legacy - ? new PaxosPurger(nowInSec) - : new Purger(controller, nowInSec); + Transformation purger = purger(controller.cfs, compactionInfos, accordVersion); merged = Transformation.apply(merged, purger); merged = DuplicateRowChecker.duringCompaction(merged, type); compacted = Transformation.apply(merged, new AbortableUnfilteredPartitionTransformation(this)); } + private Transformation purger(ColumnFamilyStore cfs, Supplier compactionInfos, Supplier version) + { + if (isPaxos(cfs) && paxosStatePurging() != legacy) + return new PaxosPurger(); + + // Topologies uses regular deletion so it can use a regular Purger + if (!requiresAccordSpecificPurger(cfs)) + return new Purger(controller, nowInSec); + + if (isAccordJournal(cfs)) + return new AccordJournalPurger(compactionInfos.get(), version.get(), cfs); + if (isAccordCommandsForKey(cfs)) + return new AccordCommandsForKeyPurger(AccordKeyspace.CFKAccessor, compactionInfos); + + throw new IllegalArgumentException("Unhandled accord table: " + cfs.keyspace.getName() + '.' + cfs.name); + } + public TableMetadata metadata() { return controller.cfs.metadata(); @@ -634,19 +727,9 @@ protected UnfilteredRowIterator applyToPartition(UnfilteredRowIterator partition } } - private class PaxosPurger extends Transformation + private abstract class AbstractPurger extends Transformation { - - private final long nowInSec; - private final long paxosPurgeGraceMicros = DatabaseDescriptor.getPaxosPurgeGrace(MICROSECONDS); - private final Map tableIdToHistory = new HashMap<>(); - private Token currentToken; - private int compactedUnfiltered; - - private PaxosPurger(long nowInSec) - { - this.nowInSec = nowInSec; - } + int compactedUnfiltered; protected void onEmptyPartitionPostPurge(DecoratedKey key) { @@ -663,7 +746,7 @@ protected void updateProgress() @Override protected UnfilteredRowIterator applyToPartition(UnfilteredRowIterator partition) { - currentToken = partition.partitionKey().getToken(); + beginPartition(partition); UnfilteredRowIterator purged = Transformation.apply(partition, this); if (purged.isEmpty()) { @@ -675,10 +758,27 @@ protected UnfilteredRowIterator applyToPartition(UnfilteredRowIterator partition return purged; } + protected abstract void beginPartition(UnfilteredRowIterator partition); + } + + private class PaxosPurger extends AbstractPurger + { + private final long paxosPurgeGraceMicros = DatabaseDescriptor.getPaxosPurgeGrace(MICROSECONDS); + private final Map tableIdToHistory = new HashMap<>(); + + private Token token; + + @Override + protected void beginPartition(UnfilteredRowIterator partition) + { + this.token = partition.partitionKey().getToken(); + } + @Override protected Row applyToRow(Row row) { updateProgress(); + TableId tableId = PaxosRows.getTableId(row); switch (paxosStatePurging()) @@ -700,9 +800,341 @@ protected Row applyToRow(Row row) }); return history == null ? row : - row.purgeDataOlderThan(history.ballotForToken(currentToken).unixMicros() - paxosPurgeGraceMicros, false); + row.purgeDataOlderThan(history.ballotForToken(token).unixMicros() - paxosPurgeGraceMicros, false); + } + } + } + } + + class AccordCommandsForKeyPurger extends AbstractPurger + { + final CommandsForKeyAccessor accessor; + final AccordCompactionInfos compactionInfos; + + AccordCompactionInfo info; + int storeId; + TokenKey tokenKey; + + AccordCommandsForKeyPurger(CommandsForKeyAccessor accessor, Supplier compactionInfos) + { + this.accessor = accessor; + this.compactionInfos = compactionInfos.get(); + } + + protected void beginPartition(UnfilteredRowIterator partition) + { + ByteBuffer key = partition.partitionKey().getKey(); + storeId = CommandsForKeyAccessor.getCommandStoreId(key); + info = compactionInfos.get(storeId); + tokenKey = info == null ? null : CommandsForKeyAccessor.getUserTableKey(info.tableId, key); + } + + @Override + protected Row applyToRow(Row row) + { + updateProgress(); + + // TODO (required): if the store has been retired, this should return null + if (info == null) + return row; + + RedundantBefore redundantBefore = info.redundantBefore; + RedundantBefore.Bounds redundantBeforeEntry = redundantBefore.get(tokenKey.toUnseekable()); + if (redundantBeforeEntry == null) + return row; + + return CFKAccessor.withoutRedundantCommands(tokenKey, row, redundantBeforeEntry); + } + + @Override + protected Row applyToStatic(Row row) + { + checkState(row.isStatic() && row.isEmpty()); + return row; + } + } + + class AccordJournalPurger extends AbstractPurger + { + final AccordCompactionInfos infos; + final ColumnMetadata recordColumn; + final ColumnMetadata versionColumn; + + JournalKey key; + AccordRowCompactor compactor; + // Initialize topology serializer during compaction to avoid deserializing redundant epochs + FlyweightSerializer topologySerializer; + final Version userVersion; + + public AccordJournalPurger(AccordCompactionInfos compactionInfos, Version version, ColumnFamilyStore cfs) + { + this.userVersion = version; + + this.infos = compactionInfos; + this.recordColumn = cfs.metadata().getColumn(ColumnIdentifier.getInterned("record", false)); + this.versionColumn = cfs.metadata().getColumn(ColumnIdentifier.getInterned("user_version", false)); + this.topologySerializer = (FlyweightSerializer) (FlyweightSerializer) new AccordTopologyUpdate.AccumulatingSerializer(() -> infos.minEpoch); + } + + @SuppressWarnings("unchecked") + @Override + protected void beginPartition(UnfilteredRowIterator partition) + { + key = AccordKeyspace.JournalColumns.getJournalKey(partition.partitionKey()); + if (compactor == null || compactor.serializer != key.type.serializer) + { + switch (key.type) + { + case COMMAND_DIFF: + compactor = new AccordCommandRowCompactor(infos, userVersion, nowInSec); + break; + case TOPOLOGY_UPDATE: + compactor = new AccordMergingCompactor(topologySerializer, userVersion); + break; + default: + compactor = new AccordMergingCompactor(key.type.serializer, userVersion); + } + } + compactor.reset(key); + } + + @Override + protected UnfilteredRowIterator applyToPartition(UnfilteredRowIterator partition) + { + if (!partition.hasNext()) + return partition; + + try + { + beginPartition(partition); + while (partition.hasNext()) + collect((Row)partition.next()); + + return compactor.result(key, partition.partitionKey()); + } + catch (UnknownTableException e) + { + unknownTable.info(e.id, key); + return null; + } + catch (IOException e) + { + throw new RuntimeException(e); + } + } + + protected void collect(Row row) throws IOException + { + updateProgress(); + ByteBuffer bytes = row.getCell(recordColumn).buffer(); + Version userVersion = Version.fromVersion(Int32Type.instance.compose(row.getCell(versionColumn).buffer())); + compactor.collect(key, row, bytes, userVersion); + } + } + + static abstract class AccordRowCompactor + { + final FlyweightSerializer serializer; + + AccordRowCompactor(FlyweightSerializer serializer) + { + this.serializer = serializer; + } + + abstract void reset(JournalKey key); + abstract void collect(JournalKey key, Row row, ByteBuffer bytes, Version userVersion) throws IOException; + abstract UnfilteredRowIterator result(JournalKey journalKey, DecoratedKey partitionKey) throws IOException; + } + + static class AccordMergingCompactor extends AccordRowCompactor + { + final T builder; + final Version userVersion; + Object[] highestClustering; + long lastDescriptor; + int lastOffset; + + AccordMergingCompactor(FlyweightSerializer serializer, Version userVersion) + { + super(serializer); + this.builder = serializer.mergerFor(); + this.userVersion = userVersion; + } + + @Override + void reset(JournalKey key) + { + builder.reset(key); + lastDescriptor = -1; + lastOffset = -1; + highestClustering = null; + } + + @Override + protected void collect(JournalKey key, Row row, ByteBuffer bytes, Version userVersion) throws IOException + { + if (highestClustering == null) + highestClustering = row.clustering().getBufferArray(); + + long descriptor = LongType.instance.compose(row.clustering().bufferAt(0)); + int offset = Int32Type.instance.compose(row.clustering().bufferAt(1)); + + if (lastOffset != -1) + { + Invariants.require(descriptor <= lastDescriptor, + "Descriptors were accessed out of order: %d was accessed after %d", descriptor, lastDescriptor); + Invariants.require(descriptor != lastDescriptor || + offset < lastOffset, + "Offsets within %d were accessed out of order: %d was accessed after %s", descriptor, offset, lastOffset); + } + lastDescriptor = descriptor; + lastOffset = offset; + + try (DataInputBuffer in = new DataInputBuffer(bytes, false)) + { + serializer.deserialize(key, builder, in, userVersion); + } + } + + @Override + UnfilteredRowIterator result(JournalKey journalKey, DecoratedKey partitionKey) throws IOException + { + PartitionUpdate.SimpleBuilder newVersion = PartitionUpdate.simpleBuilder(AccordKeyspace.Journal, partitionKey); + try (DataOutputBuffer out = DataOutputBuffer.scratchBuffer.get()) + { + serializer.reserialize(journalKey, builder, out, userVersion); + newVersion.row(highestClustering) + .add("record", out.asNewBuffer()) + .add("user_version", userVersion.version); + } + + return newVersion.build().unfilteredIterator(); + } + } + + static class AccordCommandRowEntry + { + final AccordJournal.Builder builder = new AccordJournal.Builder(); + Row row; + boolean modified; + + void init(JournalKey key, Row row, ByteBuffer bytes, Version userVersion) throws IOException + { + this.row = row; + this.builder.reset(key); + try (DataInputBuffer in = new DataInputBuffer(bytes, false)) + { + builder.deserializeNext(in, userVersion); + } + } + + void clear() + { + row = null; + modified = false; + builder.clear(); + } + } + + static class AccordCommandRowCompactor extends AccordRowCompactor + { + static final Object[] rowTemplate = BTree.build(BulkIterator.of(new Object[2]), 2, UpdateFunction.noOp); + final long timestamp = ClientState.getTimestamp(); + final AccordCompactionInfos infos; + final Version userVersion; + final ColumnData userVersionCell; + final long nowInSec; + + final AccordJournal.Builder mainBuilder = new AccordJournal.Builder(); + final List entries = new ArrayList<>(); + final ArrayDeque reuseEntries = new ArrayDeque<>(); + AccordCompactionInfo info; + + AccordCommandRowCompactor(AccordCompactionInfos infos, Version userVersion, long nowInSec) + { + super((FlyweightSerializer) JournalKey.Type.COMMAND_DIFF.serializer); + this.infos = infos; + this.userVersion = userVersion; + this.userVersionCell = BufferCell.live(AccordKeyspace.JournalColumns.user_version, timestamp, Int32Type.instance.decompose(userVersion.version)); + this.nowInSec = nowInSec; + } + + @Override + void reset(JournalKey key) + { + mainBuilder.reset(key); + reuseEntries.addAll(entries); + for (int i = 0; i < entries.size() ; ++i) + entries.get(i).clear(); + entries.clear(); + } + + @Override + void collect(JournalKey key, Row row, ByteBuffer bytes, Version userVersion) throws IOException + { + AccordCommandRowEntry e = reuseEntries.pollLast(); + if (e == null) + e = new AccordCommandRowEntry(); + entries.add(e); + e.init(key, row, bytes, userVersion); + e.modified |= e.builder.clearSuperseded(false, mainBuilder); + mainBuilder.fillInMissingOrCleanup(false, e.builder); + } + + @Override + UnfilteredRowIterator result(JournalKey journalKey, DecoratedKey partitionKey) throws IOException + { + if (mainBuilder.isEmpty()) + return null; + + if (info != null && info.commandStoreId != journalKey.commandStoreId) info = null; + if (info == null) info = infos.get(journalKey.commandStoreId); + // TODO (required): should return null only if commandStore has been removed + if (info == null) + return null; + + DurableBefore durableBefore = infos.durableBefore; + Cleanup cleanup = mainBuilder.maybeCleanup(false, PARTIAL, info.redundantBefore, durableBefore); + if (cleanup != NO) + { + switch (cleanup) + { + default: throw new UnhandledEnum(cleanup); + case EXPUNGE: + return null; + case ERASE: + return PartitionUpdate.fullPartitionDelete(AccordKeyspace.Journal, partitionKey, Long.MAX_VALUE, nowInSec).unfilteredIterator(); + + case TRUNCATE: + case TRUNCATE_WITH_OUTCOME: + case INVALIDATE: + case VESTIGIAL: + for (int i = 0, size = entries.size(); i < size ; i++) + { + AccordCommandRowEntry entry = entries.get(i); + if (i == 0) entry.modified |= entry.builder.addCleanup(false, cleanup); + else entry.modified |= entry.builder.cleanup(false, cleanup); + } + } + } + + PartitionUpdate.Builder newVersion = new PartitionUpdate.Builder(AccordKeyspace.Journal, partitionKey, AccordKeyspace.JournalColumns.regular, entries.size()); + for (int i = 0, size = entries.size() ; i < size ; ++i) + { + AccordCommandRowEntry entry = entries.get(i); + if (!entry.modified) + { + newVersion.add(entry.row); + } + else if (entry.builder.flags() != 0) + { + Object[] newRow = rowTemplate.clone(); + newRow[0] = BufferCell.live(AccordKeyspace.JournalColumns.record, timestamp, entry.builder.asByteBuffer(userVersion)); + newRow[1] = userVersionCell; + newVersion.add(BTreeRow.create(entry.row.clustering(), entry.row.primaryKeyLivenessInfo(), entry.row.deletion(), newRow)); } } + return newVersion.build().unfilteredIterator(); } } @@ -745,4 +1177,26 @@ private static boolean isPaxos(ColumnFamilyStore cfs) { return cfs.name.equals(SystemKeyspace.PAXOS) && cfs.getKeyspaceName().equals(SchemaConstants.SYSTEM_KEYSPACE_NAME); } -} \ No newline at end of file + + private static boolean requiresAccordSpecificPurger(ColumnFamilyStore cfs) + { + return cfs.getKeyspaceName().equals(SchemaConstants.ACCORD_KEYSPACE_NAME) && + (cfs.getTableName().contains(AccordKeyspace.JOURNAL) || + AccordKeyspace.COMMANDS_FOR_KEY.equals(cfs.getTableName())); + } + + private static boolean isAccordTable(ColumnFamilyStore cfs, String name) + { + return cfs.name.equals(name) && cfs.getKeyspaceName().equals(SchemaConstants.ACCORD_KEYSPACE_NAME); + } + + private static boolean isAccordJournal(ColumnFamilyStore cfs) + { + return cfs.getKeyspaceName().equals(SchemaConstants.ACCORD_KEYSPACE_NAME) && cfs.name.startsWith(AccordKeyspace.JOURNAL); + } + + private static boolean isAccordCommandsForKey(ColumnFamilyStore cfs) + { + return isAccordTable(cfs, AccordKeyspace.COMMANDS_FOR_KEY); + } +} diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionManager.java b/src/java/org/apache/cassandra/db/compaction/CompactionManager.java index 1a3b2e705b3b..a36d0fc49b07 100644 --- a/src/java/org/apache/cassandra/db/compaction/CompactionManager.java +++ b/src/java/org/apache/cassandra/db/compaction/CompactionManager.java @@ -83,6 +83,8 @@ import org.apache.cassandra.db.rows.UnfilteredRowIterator; import org.apache.cassandra.db.view.ViewBuilderTask; import org.apache.cassandra.dht.AbstractBounds; +import org.apache.cassandra.dht.IPartitioner; +import org.apache.cassandra.dht.LocalPartitioner; import org.apache.cassandra.dht.Range; import org.apache.cassandra.dht.Token; import org.apache.cassandra.exceptions.ConfigurationException; @@ -102,6 +104,7 @@ import org.apache.cassandra.locator.InetAddressAndPort; import org.apache.cassandra.locator.MetaStrategy; import org.apache.cassandra.locator.RangesAtEndpoint; +import org.apache.cassandra.locator.Replica; import org.apache.cassandra.metrics.CompactionMetrics; import org.apache.cassandra.metrics.TableMetrics; import org.apache.cassandra.repair.NoSuchRepairSessionException; @@ -768,6 +771,11 @@ public AllSSTableOpStatus performCleanup(final ColumnFamilyStore cfStore, int jo DataPlacement placement = cm.placements.get(keyspace.getMetadata().params.replication); InetAddressAndPort local = FBUtilities.getBroadcastAddressAndPort(); RangesAtEndpoint localWrites = placement.writes.byEndpoint().get(local); + // TODO review: Hack to get local partitioner not to fail out because it's handled very poorly with data placements + IPartitioner partitioner = cfStore.getPartitioner(); + if (partitioner.getClass() == LocalPartitioner.class) + localWrites = RangesAtEndpoint.of(Replica.fullReplica(local, new Range<>(partitioner.getMinimumToken(), partitioner.getMinimumToken()))); + final Set> allRanges = new HashSet<>(localWrites.ranges()); final Set> transientRanges = new HashSet<>(localWrites.onlyTransient().ranges()); final Set> fullRanges = new HashSet<>(localWrites.onlyFull().ranges()); @@ -1007,7 +1015,7 @@ private static void mutateFullyContainedSSTables(ColumnFamilyStore cfs, Set fullyContainedSSTables = findSSTablesToAnticompact(sstableIterator, normalizedRanges, sessionID); - cfs.metric.bytesMutatedAnticompaction.inc(SSTableReader.getTotalBytes(fullyContainedSSTables)); + cfs.metric.bytesMutatedAnticompaction.mark(SSTableReader.getTotalBytes(fullyContainedSSTables)); cfs.getCompactionStrategyManager().mutateRepaired(fullyContainedSSTables, UNREPAIRED_SSTABLE, sessionID, isTransient); // since we're just re-writing the sstable metdata for the fully contained sstables, we don't want // them obsoleted when the anti-compaction is complete. So they're removed from the transaction here @@ -1854,7 +1862,7 @@ private void doAntiCompaction(ColumnFamilyStore cfs, // repairedAt values for these, we still avoid anti-compacting already repaired sstables, as we currently don't // make use of any actual repairedAt value and splitting up sstables just for that is not worth it at this point. Set unrepairedSSTables = sstables.stream().filter((s) -> !s.isRepaired()).collect(Collectors.toSet()); - cfs.metric.bytesAnticompacted.inc(SSTableReader.getTotalBytes(unrepairedSSTables)); + cfs.metric.bytesAnticompacted.mark(SSTableReader.getTotalBytes(unrepairedSSTables)); Collection> groupedSSTables = cfs.getCompactionStrategyManager().groupSSTablesForAntiCompaction(unrepairedSSTables); // iterate over sstables to check if the full / transient / unrepaired ranges intersect them. diff --git a/src/java/org/apache/cassandra/db/compaction/LeveledCompactionStrategy.java b/src/java/org/apache/cassandra/db/compaction/LeveledCompactionStrategy.java index a68efa120c18..1509aa2e0371 100644 --- a/src/java/org/apache/cassandra/db/compaction/LeveledCompactionStrategy.java +++ b/src/java/org/apache/cassandra/db/compaction/LeveledCompactionStrategy.java @@ -52,6 +52,7 @@ public class LeveledCompactionStrategy extends AbstractCompactionStrategy public static final String LEVEL_FANOUT_SIZE_OPTION = "fanout_size"; public static final String SINGLE_SSTABLE_UPLEVEL_OPTION = "single_sstable_uplevel"; public static final int DEFAULT_LEVEL_FANOUT_SIZE = 10; + public static final int DEFAULT_MAX_SSTABLE_SIZE_MIB = 160; @VisibleForTesting final LeveledManifest manifest; @@ -62,9 +63,9 @@ public class LeveledCompactionStrategy extends AbstractCompactionStrategy public LeveledCompactionStrategy(ColumnFamilyStore cfs, Map options) { super(cfs, options); - int configuredMaxSSTableSize = 160; + int configuredMaxSSTableSize = DEFAULT_MAX_SSTABLE_SIZE_MIB; int configuredLevelFanoutSize = DEFAULT_LEVEL_FANOUT_SIZE; - boolean configuredSingleSSTableUplevel = false; + boolean configuredSingleSSTableUplevel = true; SizeTieredCompactionStrategyOptions localOptions = new SizeTieredCompactionStrategyOptions(options); if (options != null) { diff --git a/src/java/org/apache/cassandra/db/compaction/SingleSSTableLCSTask.java b/src/java/org/apache/cassandra/db/compaction/SingleSSTableLCSTask.java index 2d9768924cfb..7a80451e2dcb 100644 --- a/src/java/org/apache/cassandra/db/compaction/SingleSSTableLCSTask.java +++ b/src/java/org/apache/cassandra/db/compaction/SingleSSTableLCSTask.java @@ -45,6 +45,11 @@ public SingleSSTableLCSTask(ColumnFamilyStore cfs, LifecycleTransaction txn, int this.level = level; } + protected int getLevel() + { + return level; + } + @Override protected void executeInternal(ActiveCompactionsTracker activeCompactions) { diff --git a/src/java/org/apache/cassandra/db/compaction/unified/Controller.java b/src/java/org/apache/cassandra/db/compaction/unified/Controller.java index 28684aa59645..a5b89e6ae244 100644 --- a/src/java/org/apache/cassandra/db/compaction/unified/Controller.java +++ b/src/java/org/apache/cassandra/db/compaction/unified/Controller.java @@ -518,14 +518,20 @@ public static Map validateOptions(Map options) t { try { - targetSSTableSize = FBUtilities.parseHumanReadableBytes(s); - if (targetSSTableSize < MIN_TARGET_SSTABLE_SIZE) + double targetSize = FBUtilities.parseHumanReadable(s, null, "B"); + if (targetSize >= Long.MAX_VALUE) { + throw new ConfigurationException(String.format("%s %s is out of range of Long.", + TARGET_SSTABLE_SIZE_OPTION, + s)); + } + if (targetSize < MIN_TARGET_SSTABLE_SIZE) { throw new ConfigurationException(String.format("%s %s is not acceptable, size must be at least %s", TARGET_SSTABLE_SIZE_OPTION, s, FBUtilities.prettyPrintMemory(MIN_TARGET_SSTABLE_SIZE))); } + targetSSTableSize = (long) Math.ceil(targetSize); } catch (NumberFormatException e) { @@ -622,12 +628,12 @@ public static Map validateOptions(Map options) t if (sizeInBytes < 0) throw new ConfigurationException(String.format("Invalid configuration, %s should be greater than or equal to 0 (zero)", MIN_SSTABLE_SIZE_OPTION)); - int limit = (int) Math.ceil(targetSSTableSize * INVERSE_SQRT_2); + long limit = (long) Math.ceil(targetSSTableSize * INVERSE_SQRT_2); if (sizeInBytes >= limit) - throw new ConfigurationException(String.format("Invalid configuration, %s (%s) should be less than the target size minimum: %s", + throw new ConfigurationException(String.format("Invalid configuration, %s (%s) should be less than 70%% of the targetSSTableSize (%s)", MIN_SSTABLE_SIZE_OPTION, FBUtilities.prettyPrintMemory(sizeInBytes), - FBUtilities.prettyPrintMemory(limit))); + FBUtilities.prettyPrintMemory(targetSSTableSize))); } catch (NumberFormatException e) { diff --git a/src/java/org/apache/cassandra/db/filter/ClusteringIndexNamesFilter.java b/src/java/org/apache/cassandra/db/filter/ClusteringIndexNamesFilter.java index a98e3bde99ba..f3a0904eeea5 100644 --- a/src/java/org/apache/cassandra/db/filter/ClusteringIndexNamesFilter.java +++ b/src/java/org/apache/cassandra/db/filter/ClusteringIndexNamesFilter.java @@ -150,6 +150,7 @@ public boolean intersects(ClusteringComparator comparator, Slice slice) return false; } + @Override public String toString(TableMetadata metadata) { StringBuilder sb = new StringBuilder(); @@ -197,6 +198,7 @@ public String toCQLString(TableMetadata metadata, RowFilter rowFilter) return sb.toString(); } + @Override public boolean equals(Object o) { if (this == o) return true; @@ -206,6 +208,7 @@ public boolean equals(Object o) Objects.equals(reversed, that.reversed); } + @Override public int hashCode() { return Objects.hash(clusterings, reversed); diff --git a/src/java/org/apache/cassandra/db/filter/ColumnFilter.java b/src/java/org/apache/cassandra/db/filter/ColumnFilter.java index ae043039e25e..bc17dd0158f1 100644 --- a/src/java/org/apache/cassandra/db/filter/ColumnFilter.java +++ b/src/java/org/apache/cassandra/db/filter/ColumnFilter.java @@ -189,6 +189,14 @@ public static ColumnFilter all(RegularAndStaticColumns columns) return new WildCardColumnFilter(columns); } + /** + * A filter that includes all columns for the provided table. + */ + public static ColumnFilter allEver(TableMetadata metadata) + { + return new WildCardColumnFilter(metadata.regularAndStaticAndDroppedColumns()); + } + /** * A filter that only fetches/queries the provided columns. *

@@ -682,7 +690,7 @@ public SelectionColumnFilter(FetchingStrategy fetchingStrategy, SortedSetMultimap subSelections) { assert queried != null; - assert fetched.includes(queried); + assert fetched.includes(queried) : String.format("Queries columns %s are not included in the fetch strategy %s", queried, fetched); this.fetchingStrategy = fetchingStrategy; this.queried = queried; diff --git a/src/java/org/apache/cassandra/db/filter/RowFilter.java b/src/java/org/apache/cassandra/db/filter/RowFilter.java index 4f6b7bd1d514..d60dd43b992b 100644 --- a/src/java/org/apache/cassandra/db/filter/RowFilter.java +++ b/src/java/org/apache/cassandra/db/filter/RowFilter.java @@ -21,8 +21,10 @@ import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.Collections; +import java.util.HashSet; import java.util.Iterator; import java.util.List; +import java.util.Set; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentMap; import java.util.concurrent.atomic.AtomicInteger; @@ -87,7 +89,7 @@ public class RowFilter implements Iterable private static final Logger logger = LoggerFactory.getLogger(RowFilter.class); public static final Serializer serializer = new Serializer(); - private static final RowFilter NONE = new RowFilter(Collections.emptyList(), false); + public static final RowFilter NONE = new RowFilter(Collections.emptyList(), false); protected final List expressions; @@ -172,15 +174,21 @@ public boolean isStrict() */ public boolean isMutableIntersection() { - int count = 0; + Set columns = null; for (Expression e : expressions) { if (e.column.isStatic() && expressions.size() > 1) return true; if (!e.column.isPrimaryKeyColumn()) - if (++count > 1) + { + if (columns == null) + columns = new HashSet<>(expressions.size()); + + columns.add(e.column); + if (columns.size() > 1) return true; + } } return false; } @@ -256,7 +264,10 @@ protected BaseRowIterator applyToPartition(BaseRowIterator partition) @Override public Row applyToRow(Row row) { - Row purged = row.purge(DeletionPurger.PURGE_ALL, nowInSec, metadata.enforceStrictLiveness()); + // If we purge deletions when reconciliation is required, we hide information replica filtering + // protection would require to filter rows that are no longer matches are the coordinator. + Row purged = needsReconciliation() ? row : row.purge(DeletionPurger.PURGE_ALL, nowInSec, metadata.enforceStrictLiveness()); + if (purged == null) return null; @@ -393,6 +404,13 @@ public RowFilter without(ColumnMetadata column, Operator op, ByteBuffer value) return withNewExpressions(newExpressions); } + public RowFilter withoutReconciliation() + { + if (needsReconciliation) + return new RowFilter(expressions, false); + return this; + } + public boolean hasNonKeyExpression() { for (Expression e : expressions) @@ -924,7 +942,7 @@ private static ColumnMetadata makeDefinition(TableMetadata table, IndexMetadata { // Similarly to how we handle non-defined columns in thift, we create a fake column definition to // represent the target index. This is definitely something that can be improved though. - return ColumnMetadata.regularColumn(table, ByteBuffer.wrap(index.name.getBytes()), BytesType.instance); + return ColumnMetadata.regularColumn(table, ByteBuffer.wrap(index.name.getBytes()), BytesType.instance, ColumnMetadata.NO_UNIQUE_ID); } public IndexMetadata getTargetIndex() diff --git a/src/java/org/apache/cassandra/db/lifecycle/LifecycleTransaction.java b/src/java/org/apache/cassandra/db/lifecycle/LifecycleTransaction.java index 79fe0b2923c3..15b0417b382f 100644 --- a/src/java/org/apache/cassandra/db/lifecycle/LifecycleTransaction.java +++ b/src/java/org/apache/cassandra/db/lifecycle/LifecycleTransaction.java @@ -21,9 +21,11 @@ import java.util.ArrayList; import java.util.Collection; import java.util.Collections; +import java.util.Comparator; import java.util.HashSet; import java.util.IdentityHashMap; import java.util.List; +import java.util.Map; import java.util.Set; import java.util.function.BiPredicate; @@ -32,6 +34,7 @@ import com.google.common.collect.ImmutableList; import com.google.common.collect.Iterables; import com.google.common.collect.Lists; +import com.google.common.collect.Maps; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -66,6 +69,7 @@ import static org.apache.cassandra.db.lifecycle.Helpers.select; import static org.apache.cassandra.db.lifecycle.Helpers.selectFirst; import static org.apache.cassandra.db.lifecycle.Helpers.setReplaced; +import static org.apache.cassandra.db.lifecycle.View.replaceSSTables; import static org.apache.cassandra.db.lifecycle.View.updateCompacting; import static org.apache.cassandra.db.lifecycle.View.updateLiveSet; import static org.apache.cassandra.utils.Throwables.maybeFail; @@ -294,7 +298,15 @@ public Throwable doAbort(Throwable accumulate) // replace all updated readers with a version restored to its original state List restored = restoreUpdatedOriginals(); List invalid = Lists.newArrayList(Iterables.concat(logged.update, logged.obsolete)); - accumulate = tracker.apply(updateLiveSet(logged.update, restored), accumulate); + + Map replacementMap = Collections.emptyMap(); + if (!isOffline()) + replacementMap = getReplacementMap(logged.update, restored); + if (!replacementMap.isEmpty()) + accumulate = tracker.apply(replaceSSTables(logged.update, restored, replacementMap, tracker.maybeGetSSTableIntervalTreeLatencyMetrics()), accumulate); + else + accumulate = tracker.apply(updateLiveSet(logged.update, restored, tracker.maybeGetSSTableIntervalTreeLatencyMetrics()), accumulate); + accumulate = tracker.notifySSTablesChanged(invalid, restored, OperationType.COMPACTION, accumulate); // setReplaced immediately preceding versions that have not been obsoleted accumulate = setReplaced(logged.update, accumulate); @@ -373,8 +385,15 @@ private Throwable checkpoint(Throwable accumulate) // ensure any new readers are in the compacting set, since we aren't done with them yet // and don't want anyone else messing with them // apply atomically along with updating the live set of readers - tracker.apply(compose(updateCompacting(emptySet(), fresh), - updateLiveSet(toUpdate, staged.update))); + Map replacementMap = Collections.emptyMap(); + if (!isOffline()) + replacementMap = getReplacementMap(toUpdate, staged.update); + if (!replacementMap.isEmpty()) + tracker.apply(compose(updateCompacting(emptySet(), fresh), + replaceSSTables(toUpdate, staged.update, replacementMap, tracker.maybeGetSSTableIntervalTreeLatencyMetrics()))); + else + tracker.apply(compose(updateCompacting(emptySet(), fresh), + updateLiveSet(toUpdate, staged.update, tracker.maybeGetSSTableIntervalTreeLatencyMetrics()))); // log the staged changes and our newly marked readers marked.addAll(fresh); @@ -389,6 +408,38 @@ private Throwable checkpoint(Throwable accumulate) return accumulate; } + // Match the SSTableReaders from the existing ones to the new one to be added (with same ranges) + // Returns the map of toRemove <-> toAdd. Return empty map if such 1-1 replacement doesn't exist + private static Map getReplacementMap(final Set remove, final Collection add) + { + if (remove.size() != add.size()) + return Collections.emptyMap(); + + List toAdds = new ArrayList<>(add); + List toRemoves = new ArrayList<>(remove); + // sort the SSTableReader list by (first, last, descriptor.id). The view is per cfs so id will be unique + Comparator comp = Comparator.comparing((SSTableReader s) -> s.getFirst()) + .thenComparing(s -> s.getLast()) + .thenComparing(SSTableReader.idComparator); + toRemoves.sort(comp); + toAdds.sort(comp); + + Map replacementMap = Maps.newHashMapWithExpectedSize(toAdds.size()); + // toAdd and toRemove have the same size + for (int i = 0; i < toAdds.size(); i++) + { + SSTableReader toRemove = toRemoves.get(i); + SSTableReader toAdd = toAdds.get(i); + // optimization: here we don't check the descriptor. If we're able to match those to be removed with those + // to be added, we ensure that the pairs have the same (first, last) range + if (toRemove.getFirst().equals(toAdd.getFirst()) && toRemove.getLast().equals(toAdd.getLast())) + replacementMap.put(toRemove, toAdd); + else + // stop and return empty map if toAdd and toRemove can't match + return Collections.emptyMap(); + } + return replacementMap; + } /** * update a reader: if !original, this is a reader that is being introduced by this transaction; diff --git a/src/java/org/apache/cassandra/db/lifecycle/LogFile.java b/src/java/org/apache/cassandra/db/lifecycle/LogFile.java index 9decc248b92c..13436b112a98 100644 --- a/src/java/org/apache/cassandra/db/lifecycle/LogFile.java +++ b/src/java/org/apache/cassandra/db/lifecycle/LogFile.java @@ -179,6 +179,10 @@ static boolean isLogFile(File file) this.id = id; } + /** + * Check a variety of the internals of the LogRecord as well as the state of the LogRecord vs. the files found on disk + * to ensure they remain correct and nothing was changed external to the process. + */ boolean verify() { records.clear(); @@ -245,6 +249,9 @@ LogRecord setErrorInReplicas(LogRecord record) return record; } + /** + * Sets the {@link LogRecord.Status#error} if something wrong is found with the record. + */ static void verifyRecord(LogRecord record, List existingFiles) { if (record.checksum != record.computeChecksum()) @@ -256,6 +263,7 @@ static void verifyRecord(LogRecord record, List existingFiles) return; } + // If it's not a removal we don't check it since we're not going to take action on it if (record.type != Type.REMOVE) return; @@ -269,6 +277,16 @@ static void verifyRecord(LogRecord record, List existingFiles) // we can have transaction files with mismatching updateTime resolutions due to switching between jdk8 and jdk11, truncate both to be consistent: if (truncateMillis(record.updateTime) != truncateMillis(record.status.onDiskRecord.updateTime) && record.status.onDiskRecord.updateTime > 0) { + // handle the case where we have existing broken transaction file on disk, where the update time is + // based on the stats file. This is just for the first upgrade, patched versions never base the update + // time on the stats file. + LogRecord statsIncluded = LogRecord.make(record.type, existingFiles, existingFiles.size(), record.absolutePath(), true); + if (truncateMillis(statsIncluded.updateTime) == truncateMillis(record.updateTime)) + { + logger.warn("Found a legacy log record {} with updateTime based on the stats file, ignoring to allow startup to continue", record); + return; + } + record.setError(String.format("Unexpected files detected for sstable [%s]: " + "last update time [%tc] (%d) should have been [%tc] (%d)", record.fileName(), @@ -276,7 +294,6 @@ static void verifyRecord(LogRecord record, List existingFiles) record.status.onDiskRecord.updateTime, record.updateTime, record.updateTime)); - } } diff --git a/src/java/org/apache/cassandra/db/lifecycle/LogRecord.java b/src/java/org/apache/cassandra/db/lifecycle/LogRecord.java index 5f45156eb6ad..d39b23bf9c04 100644 --- a/src/java/org/apache/cassandra/db/lifecycle/LogRecord.java +++ b/src/java/org/apache/cassandra/db/lifecycle/LogRecord.java @@ -20,7 +20,6 @@ */ package org.apache.cassandra.db.lifecycle; - import java.nio.file.Path; import java.util.ArrayList; import java.util.Arrays; @@ -38,14 +37,21 @@ import java.util.stream.Collectors; import java.util.zip.CRC32; +import com.google.common.annotations.VisibleForTesting; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import org.apache.cassandra.io.sstable.Component; +import org.apache.cassandra.io.sstable.Descriptor; import org.apache.cassandra.io.sstable.SSTable; +import org.apache.cassandra.io.sstable.format.SSTableFormat; import org.apache.cassandra.io.sstable.format.SSTableReader; import org.apache.cassandra.io.util.File; import org.apache.cassandra.io.util.FileUtils; import org.apache.cassandra.io.util.PathUtils; import org.apache.cassandra.utils.FBUtilities; +import static org.apache.cassandra.io.sstable.Descriptor.TMP_EXT; import static org.apache.cassandra.utils.LocalizeString.toUpperCaseLocalized; /** @@ -55,6 +61,10 @@ */ final class LogRecord { + private static final Logger logger = LoggerFactory.getLogger(LogRecord.class); + @VisibleForTesting + static boolean INCLUDE_STATS_FOR_TESTS = false; + public enum Type { UNKNOWN, // a record that cannot be parsed @@ -78,7 +88,10 @@ public boolean matches(LogRecord record) return this == record.type; } - public boolean isFinal() { return this == Type.COMMIT || this == Type.ABORT; } + public boolean isFinal() + { + return this == Type.COMMIT || this == Type.ABORT; + } } /** @@ -194,17 +207,65 @@ private static String absolutePath(File baseFile) public LogRecord withExistingFiles(List existingFiles) { + if (!absolutePath.isPresent()) + throw new IllegalStateException(String.format("Cannot create record from existing files for type %s - file is not present", type)); + return make(type, existingFiles, 0, absolutePath.get()); } + /** + * We create a LogRecord based on the files on disk; there's some subtlety around how we handle stats files as the + * timestamp can be mutated by the async completion of compaction if things race with node shutdown. To work around this, + * we don't take the stats file timestamp into account when calculating nor using the timestamps for all the components + * as we build the LogRecord. + */ public static LogRecord make(Type type, List files, int minFiles, String absolutePath) { + return make(type, files, minFiles, absolutePath, INCLUDE_STATS_FOR_TESTS); + } + + /** + * In most cases we skip including the stats file timestamp entirely as it can be mutated during anticompaction + * and thus "invalidate" the LogRecord. There is an edge case where we have a LogRecord that was written w/the wrong + * timestamp (i.e. included a mutated stats file) and we need the node to come up, so we need to expose the selective + * ability to either include the stats file timestamp or not. + * + * See {@link LogFile#verifyRecord} + */ + static LogRecord make(Type type, List files, int minFiles, String absolutePath, boolean includeStatsFile) + { + List toVerify; + File statsFile = null; + if (!includeStatsFile && !files.isEmpty()) + { + toVerify = new ArrayList<>(files.size() - 1); + for (File f : files) + { + if (!f.name().endsWith(TMP_EXT)) + { + if (Descriptor.componentFromFile(f) == SSTableFormat.Components.STATS) + statsFile = f; + else + toVerify.add(f); + } + } + } + else + { + toVerify = files; + } // CASSANDRA-11889: File.lastModified() returns a positive value only if the file exists, therefore // we filter by positive values to only consider the files that still exists right now, in case things // changed on disk since getExistingFiles() was called - List positiveModifiedTimes = files.stream().map(File::lastModified).filter(lm -> lm > 0).collect(Collectors.toList()); + List positiveModifiedTimes = toVerify.stream().map(File::lastModified).filter(lm -> lm > 0).collect(Collectors.toList()); long lastModified = positiveModifiedTimes.stream().reduce(0L, Long::max); - return new LogRecord(type, absolutePath, lastModified, Math.max(minFiles, positiveModifiedTimes.size())); + + // We need to preserve the file count for the number of existing files found on disk even though we ignored the + // stats file during our timestamp calculation. If the stats file still exists, we add in the count of it as + // a separate validation assumption that it's one of the files considered valid in this LogRecord. + boolean addStatTS = statsFile != null && statsFile.exists(); + int positiveTSCount = addStatTS ? positiveModifiedTimes.size() + 1 : positiveModifiedTimes.size(); + return new LogRecord(type, absolutePath, lastModified, Math.max(minFiles, positiveTSCount)); } private LogRecord(Type type, long updateTime) diff --git a/src/java/org/apache/cassandra/db/lifecycle/LogTransaction.java b/src/java/org/apache/cassandra/db/lifecycle/LogTransaction.java index 635ab38ec621..92766de3852c 100644 --- a/src/java/org/apache/cassandra/db/lifecycle/LogTransaction.java +++ b/src/java/org/apache/cassandra/db/lifecycle/LogTransaction.java @@ -549,6 +549,8 @@ static boolean removeUnfinishedLeftovers(Map.Entry> entry) try(LogFile txn = LogFile.make(entry.getKey(), entry.getValue())) { logger.info("Verifying logfile transaction {}", txn); + // We don't check / include the stats file timestamp on LogRecord creation / verification as that might + // be modified by a race in compaction notification and then needlessly fail subsequent node starts. if (txn.verify()) { Throwable failure = txn.removeUnfinishedLeftovers(null); diff --git a/src/java/org/apache/cassandra/db/lifecycle/SSTableIntervalTree.java b/src/java/org/apache/cassandra/db/lifecycle/SSTableIntervalTree.java index 4d5a87f3991d..0e88193856f5 100644 --- a/src/java/org/apache/cassandra/db/lifecycle/SSTableIntervalTree.java +++ b/src/java/org/apache/cassandra/db/lifecycle/SSTableIntervalTree.java @@ -1,5 +1,4 @@ /* - * * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -8,29 +7,32 @@ * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. + * http://www.apache.org/licenses/LICENSE-2.0 * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + package org.apache.cassandra.db.lifecycle; -import java.util.Arrays; +import java.util.ArrayList; import java.util.Collection; -import java.util.Collections; import java.util.List; +import java.util.Map; + +import com.google.common.collect.Iterables; import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.db.PartitionPosition; import org.apache.cassandra.io.sstable.format.SSTableReader; import org.apache.cassandra.utils.Interval; import org.apache.cassandra.utils.IntervalTree; +import org.apache.cassandra.utils.Pair; +import static com.google.common.base.Preconditions.checkArgument; import static com.google.common.base.Preconditions.checkState; public class SSTableIntervalTree extends IntervalTree> @@ -42,6 +44,11 @@ public class SSTableIntervalTree extends IntervalTree[] minOrder, Interval[] maxOrder) + { + super(head, modCount, minOrder, maxOrder); + } + private SSTableIntervalTree(Interval[] minOrder, Interval[] maxOrder) { super(minOrder, maxOrder); @@ -53,6 +60,25 @@ protected SSTableIntervalTree create(Interval[ return new SSTableIntervalTree(minOrder, maxOrder); } + @Override + protected SSTableIntervalTree create(IntervalNode head, int modCount, Interval[] minOrder, Interval[] maxOrder) + { + return new SSTableIntervalTree(head, modCount, minOrder, maxOrder); + } + + @Override + protected SSTableIntervalTree create(Collection> intervals) + { + return new SSTableIntervalTree(intervals); + } + + @Override + public SSTableIntervalTree replace(List, Interval>> replacements) + { + checkArgument(!replacements.isEmpty(), "Shouldn't call replace with no replacements"); + return (SSTableIntervalTree) super.replace(replacements); + } + public static SSTableIntervalTree empty() { return EMPTY; @@ -67,12 +93,14 @@ public static SSTableIntervalTree buildSSTableIntervalTree(Collection> buildIntervals(Collection sstables) { - if (sstables == null || sstables.isEmpty()) - return Collections.emptyList(); - return Arrays.asList(buildIntervalsArray(sstables)); + List> intervals = new ArrayList<>(Iterables.size(sstables)); + for (SSTableReader sstable : sstables) + intervals.add(Interval.create(sstable.getFirst(), sstable.getLast(), sstable)); + return intervals; } - public static Interval[] buildIntervalsArray(Collection sstables) + @SuppressWarnings("unchecked") + static Interval[] buildIntervalsArray(Collection sstables) { if (sstables == null || sstables.isEmpty()) return IntervalTree.EMPTY_ARRAY; @@ -107,4 +135,36 @@ public static SSTableIntervalTree update(SSTableIntervalTree tree, Collection replacementMap) + { + checkArgument(!replacementMap.isEmpty(), "Replacement map shouldn't be empty for SSTableIntervalTree.replace"); + List, Interval>> replacementIntervalsMap = new ArrayList<>(); + for (Map.Entry entry : replacementMap.entrySet()) + { + SSTableReader originalSSTable = entry.getKey(); + SSTableReader replacementSSTable = entry.getValue(); + Interval originalInterval = originalSSTable.getInterval(); + Interval replacementInterval = replacementSSTable.getInterval(); + replacementIntervalsMap.add(Pair.create(originalInterval, replacementInterval)); + } + return tree.replace(replacementIntervalsMap); + } + + public static SSTableIntervalTree addSSTables(SSTableIntervalTree tree, Collection additions) + { + return (SSTableIntervalTree) tree.add(buildIntervalsArray(additions)); + } +} \ No newline at end of file diff --git a/src/java/org/apache/cassandra/db/lifecycle/Tracker.java b/src/java/org/apache/cassandra/db/lifecycle/Tracker.java index a8b22d1df6d9..73c18328a3d6 100644 --- a/src/java/org/apache/cassandra/db/lifecycle/Tracker.java +++ b/src/java/org/apache/cassandra/db/lifecycle/Tracker.java @@ -45,6 +45,7 @@ import org.apache.cassandra.io.sstable.metadata.StatsMetadata; import org.apache.cassandra.io.util.File; import org.apache.cassandra.io.util.FileUtils; +import org.apache.cassandra.metrics.LatencyMetrics; import org.apache.cassandra.metrics.StorageMetrics; import org.apache.cassandra.notifications.INotification; import org.apache.cassandra.notifications.INotificationConsumer; @@ -251,14 +252,14 @@ public void addInitialSSTables(Collection sstables) addSSTablesInternal(sstables, true, false, true); } - public void addInitialSSTablesWithoutUpdatingSize(Collection sstables, ColumnFamilyStore cfs) + public void addInitialSSTablesWithoutUpdatingSize(Collection sstables) { if (!isDummy()) { for (SSTableReader reader : sstables) reader.setupOnline(); } - apply(updateLiveSet(emptySet(), sstables)); + apply(updateLiveSet(emptySet(), sstables, maybeGetSSTableIntervalTreeLatencyMetrics())); notifyAdded(sstables, true); } @@ -279,7 +280,7 @@ private void addSSTablesInternal(Collection sstables, { if (!isDummy()) setupOnline(sstables); - apply(updateLiveSet(emptySet(), sstables)); + apply(updateLiveSet(emptySet(), sstables, maybeGetSSTableIntervalTreeLatencyMetrics())); if(updateSize) maybeFail(updateSizeTracking(emptySet(), sstables, null)); if (maybeIncrementallyBackup) @@ -332,7 +333,7 @@ public Throwable dropSSTables(final Predicate remove, OperationTy { Pair result = apply(view -> { Set toremove = copyOf(filter(view.sstables, and(remove, notIn(view.compacting)))); - return updateLiveSet(toremove, emptySet()).apply(view); + return updateLiveSet(toremove, emptySet(), maybeGetSSTableIntervalTreeLatencyMetrics()).apply(view); }); Set removed = Sets.difference(result.left.sstables, result.right.sstables); @@ -434,7 +435,7 @@ public void replaceFlushed(Memtable memtable, Collection sstables { // sstable may be null if we flushed batchlog and nothing needed to be retained // if it's null, we don't care what state the cfstore is in, we just replace it and continue - apply(View.replaceFlushed(memtable, null)); + apply(View.replaceFlushed(memtable, null, maybeGetSSTableIntervalTreeLatencyMetrics())); return; } @@ -442,7 +443,7 @@ public void replaceFlushed(Memtable memtable, Collection sstables // back up before creating a new Snapshot (which makes the new one eligible for compaction) maybeIncrementallyBackup(sstables); - apply(View.replaceFlushed(memtable, sstables)); + apply(View.replaceFlushed(memtable, sstables, maybeGetSSTableIntervalTreeLatencyMetrics())); Throwable fail; fail = updateSizeTracking(emptySet(), sstables, null); @@ -625,6 +626,13 @@ public View getView() @VisibleForTesting public void removeUnsafe(Set toRemove) { - Pair result = apply(view -> updateLiveSet(toRemove, emptySet()).apply(view)); + Pair result = apply(view -> updateLiveSet(toRemove, emptySet(), maybeGetSSTableIntervalTreeLatencyMetrics()).apply(view)); + } + + public LatencyMetrics maybeGetSSTableIntervalTreeLatencyMetrics() + { + if (cfstore == null) + return null; + return cfstore.metric != null ? cfstore.metric.viewSSTableIntervalTree : null; } } diff --git a/src/java/org/apache/cassandra/db/lifecycle/View.java b/src/java/org/apache/cassandra/db/lifecycle/View.java index ba200d5d0bc1..c6fd7ed52af2 100644 --- a/src/java/org/apache/cassandra/db/lifecycle/View.java +++ b/src/java/org/apache/cassandra/db/lifecycle/View.java @@ -23,6 +23,7 @@ import java.util.List; import java.util.Map; import java.util.Set; +import javax.annotation.Nullable; import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Function; @@ -36,6 +37,8 @@ import org.apache.cassandra.db.memtable.Memtable; import org.apache.cassandra.dht.AbstractBounds; import org.apache.cassandra.io.sstable.format.SSTableReader; +import org.apache.cassandra.metrics.LatencyMetrics; +import org.apache.cassandra.utils.Clock; import org.apache.cassandra.utils.Interval; import static com.google.common.base.Predicates.equalTo; @@ -297,7 +300,7 @@ public boolean apply(View view) } // construct a function to change the liveset in a Snapshot - static Function updateLiveSet(final Set remove, final Collection add) + static Function updateLiveSet(final Set remove, final Collection add, @Nullable LatencyMetrics sstableIntervalTreeLatency) { if (remove.isEmpty() && Iterables.isEmpty(add)) return Functions.identity(); @@ -306,8 +309,28 @@ static Function updateLiveSet(final Set remove, final public View apply(View view) { Map sstableMap = replace(view.sstablesMap, remove, add); - return new View(view.liveMemtables, view.flushingMemtables, sstableMap, view.compactingMap, - SSTableIntervalTree.update(view.intervalTree, remove, add)); + long treeBuildStart = Clock.Global.nanoTime(); + SSTableIntervalTree sstableIntervalTree = SSTableIntervalTree.update(view.intervalTree, remove, add); + if (sstableIntervalTreeLatency != null) + sstableIntervalTreeLatency.addNano(Clock.Global.nanoTime() - treeBuildStart); + return new View(view.liveMemtables, view.flushingMemtables, sstableMap, view.compactingMap, sstableIntervalTree); + } + }; + } + + // construct a function to replace the SSTable that have the same [first,last] intervals + static Function replaceSSTables(final Set remove, final Iterable add, final Map replacementMap, LatencyMetrics sstableIntervalTreeLatency) + { + return new Function() + { + public View apply(View view) + { + Map sstableMap = replace(view.sstablesMap, remove, add); + long treeBuildStart = Clock.Global.nanoTime(); + SSTableIntervalTree sstableIntervalTree = SSTableIntervalTree.replace(view.intervalTree, replacementMap); + if (sstableIntervalTreeLatency != null) + sstableIntervalTreeLatency.addNano(Clock.Global.nanoTime() - treeBuildStart); + return new View(view.liveMemtables, view.flushingMemtables, sstableMap, view.compactingMap, sstableIntervalTree); } }; } @@ -346,7 +369,7 @@ public View apply(View view) } // called after flush: removes memtable from flushingMemtables, and inserts flushed into the live sstable set - static Function replaceFlushed(final Memtable memtable, final Collection flushed) + static Function replaceFlushed(final Memtable memtable, final Collection flushed, @Nullable LatencyMetrics sstableIntervalTreeLatency) { return new Function() { @@ -360,8 +383,11 @@ public View apply(View view) view.compactingMap, view.intervalTree); Map sstableMap = replace(view.sstablesMap, emptySet(), flushed); - return new View(view.liveMemtables, flushingMemtables, sstableMap, view.compactingMap, - SSTableIntervalTree.update(view.intervalTree, null, flushed)); + long treeBuildStart = Clock.Global.nanoTime(); + SSTableIntervalTree sstableIntervalTree = SSTableIntervalTree.addSSTables(view.intervalTree, flushed); + if (sstableIntervalTreeLatency != null) + sstableIntervalTreeLatency.addNano(Clock.Global.nanoTime() - treeBuildStart); + return new View(view.liveMemtables, flushingMemtables, sstableMap, view.compactingMap, sstableIntervalTree); } }; } diff --git a/src/java/org/apache/cassandra/db/marshal/AbstractCompositeType.java b/src/java/org/apache/cassandra/db/marshal/AbstractCompositeType.java index 737954d09bdb..60bbdd31ecec 100644 --- a/src/java/org/apache/cassandra/db/marshal/AbstractCompositeType.java +++ b/src/java/org/apache/cassandra/db/marshal/AbstractCompositeType.java @@ -98,6 +98,11 @@ public int compareCustom(VL left, ValueAccessor accessorL, VR right ++i; } + return compareCustomRemainder(left, accessorL, offsetL, right, accessorR, offsetR); + } + + protected int compareCustomRemainder(VL left, ValueAccessor accessorL, int offsetL, VR right, ValueAccessor accessorR, int offsetR) + { if (accessorL.isEmptyFromOffset(left, offsetL)) return accessorR.sizeFromOffset(right, offsetR) == 0 ? 0 : -1; diff --git a/src/java/org/apache/cassandra/db/marshal/AbstractType.java b/src/java/org/apache/cassandra/db/marshal/AbstractType.java index 5378a4cd3fba..42190e0c2e84 100644 --- a/src/java/org/apache/cassandra/db/marshal/AbstractType.java +++ b/src/java/org/apache/cassandra/db/marshal/AbstractType.java @@ -211,20 +211,12 @@ public void validate(V value, ValueAccessor accessor) throws MarshalExcep public void checkConstraints(ByteBuffer bytes, ColumnConstraints constraints) throws ConstraintViolationException { - if (constraints.isEmpty()) - return; - - T value = getSerializer().deserialize(bytes); - constraints.evaluate(this, bytes); + checkConstraints(bytes, constraints.getConstraints()); } - public void checkConstraints(ByteBuffer bytes, List constraints) throws ConstraintViolationException + public void checkConstraints(ByteBuffer bytes, List> constraints) throws ConstraintViolationException { - if (constraints.isEmpty()) - return; - - T value = getSerializer().deserialize(bytes); - for (ColumnConstraint constraint : constraints) + for (ColumnConstraint constraint : constraints) constraint.evaluate(this, bytes); } @@ -448,7 +440,7 @@ public boolean isFreezable() return false; } - public AbstractType freeze() + public AbstractType freeze() { return this; } diff --git a/src/java/org/apache/cassandra/db/marshal/AddressBasedNativeData.java b/src/java/org/apache/cassandra/db/marshal/AddressBasedNativeData.java new file mode 100644 index 000000000000..73e686028a32 --- /dev/null +++ b/src/java/org/apache/cassandra/db/marshal/AddressBasedNativeData.java @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.db.marshal; + +import java.nio.ByteBuffer; +import java.nio.ByteOrder; + +import org.apache.cassandra.utils.memory.MemoryUtil; + +public class AddressBasedNativeData implements NativeData +{ + // use a real address, just in case + private static final ByteBuffer EMPTY_NATIVE_BUFFER = ByteBuffer.allocateDirect(1); + private static final long EMPTY_VALUE_ADDRESS = MemoryUtil.getAddress(EMPTY_NATIVE_BUFFER); + public static final AddressBasedNativeData EMPTY = new AddressBasedNativeData(EMPTY_VALUE_ADDRESS, 0); + + private final long address; + private final int length; + + public AddressBasedNativeData(long address, int length) + { + this.address = address; + this.length = length; + } + + + @Override + public int nativeDataSize() + { + return length; + } + + @Override + public ByteBuffer asByteBuffer() + { + return MemoryUtil.getByteBuffer(address, length, ByteOrder.BIG_ENDIAN); + } + + @Override + public NativeData slice(int offset, int length) + { + if (offset < 0 || offset > this.length) + throw new IllegalArgumentException("offset must but be >= 0 and < parent length; " + + "offset: " + offset + + ", slice length: " + length + + ", data length: " + this.length); + if (length < 0 || offset + length > this.length) { + throw new IllegalArgumentException("length must but be >= 0 and offset + length > parent length; " + + "offset: " + offset + + ", slice length: " + length + + ", data length: " + this.length); + } + + if (length == 0) { + return EMPTY; + } + return new AddressBasedNativeData(address + offset, length); + } + + @Override + public long getAddress() + { + return address; + } +} diff --git a/src/java/org/apache/cassandra/db/marshal/ByteArrayAccessor.java b/src/java/org/apache/cassandra/db/marshal/ByteArrayAccessor.java index 55eae805124b..7035a251472f 100644 --- a/src/java/org/apache/cassandra/db/marshal/ByteArrayAccessor.java +++ b/src/java/org/apache/cassandra/db/marshal/ByteArrayAccessor.java @@ -25,6 +25,7 @@ import java.util.Arrays; import java.util.UUID; +import accord.utils.Invariants; import org.apache.cassandra.db.Digest; import org.apache.cassandra.db.TypeSizes; import org.apache.cassandra.io.util.DataInputPlus; @@ -107,6 +108,7 @@ public byte[] read(DataInputPlus in, int length) throws IOException @Override public byte[] slice(byte[] input, int offset, int length) { + Invariants.requireArgument(offset + length <= input.length); return Arrays.copyOfRange(input, offset, offset + length); } diff --git a/src/java/org/apache/cassandra/db/marshal/ByteBufferAccessor.java b/src/java/org/apache/cassandra/db/marshal/ByteBufferAccessor.java index 0ae208514b34..381bac4971b4 100644 --- a/src/java/org/apache/cassandra/db/marshal/ByteBufferAccessor.java +++ b/src/java/org/apache/cassandra/db/marshal/ByteBufferAccessor.java @@ -315,6 +315,35 @@ public int putFloat(ByteBuffer dst, int offset, float value) return TypeSizes.FLOAT_SIZE; } + @Override + public int putLeastSignificantBytes(ByteBuffer dst, int offset, long register, int bytes) + { + int pos = dst.position() + offset; + if (dst.limit() - pos < Long.BYTES) + { + return ValueAccessor.putLeastSignificantBytes(this, dst, offset, register, bytes); + } + else + { + dst.putLong(pos, register << (64 - (bytes * 8))); + } + return bytes; + } + + @Override + public long getLeastSignificantBytes(ByteBuffer dst, int offset, int bytes) + { + int pos = dst.position() + offset; + if (dst.limit() - pos < Long.BYTES) + { + return ValueAccessor.getLeastSignificantBytes(this, dst, offset, bytes); + } + else + { + return dst.getLong(pos) >>> (64 - (bytes * 8)); + } + } + @Override public ByteBuffer empty() { diff --git a/src/java/org/apache/cassandra/db/marshal/CollectionType.java b/src/java/org/apache/cassandra/db/marshal/CollectionType.java index 8c39dbab4f5b..c54ad31e5efb 100644 --- a/src/java/org/apache/cassandra/db/marshal/CollectionType.java +++ b/src/java/org/apache/cassandra/db/marshal/CollectionType.java @@ -171,6 +171,12 @@ public boolean isFreezable() return true; } + @Override + public boolean isConstrainable() + { + return isFrozenCollection(); + } + public ByteBuffer serializeForNativeProtocol(Iterator> cells) { assert isMultiCell(); @@ -344,15 +350,15 @@ V fromComparableBytesListOrSet(ValueAccessor accessor, } @Override - public ByteBuffer pack(List elements) + public V pack(List elements, ValueAccessor accessor) { - return getSerializer().pack(elements); + return getSerializer().pack(elements, accessor); } @Override - public List unpack(ByteBuffer input) + public List unpack(V value, ValueAccessor accessor) { - return getSerializer().unpack(input); + return getSerializer().unpack(value, accessor); } /** diff --git a/src/java/org/apache/cassandra/db/marshal/CompositeType.java b/src/java/org/apache/cassandra/db/marshal/CompositeType.java index df7ee99070de..c555d928fbad 100644 --- a/src/java/org/apache/cassandra/db/marshal/CompositeType.java +++ b/src/java/org/apache/cassandra/db/marshal/CompositeType.java @@ -41,6 +41,7 @@ import static com.google.common.collect.Iterables.any; import static com.google.common.collect.Iterables.transform; +import static org.apache.cassandra.utils.bytecomparable.ByteSource.END_OF_STREAM; /* * The encoding of a CompositeType column name should be: @@ -250,7 +251,7 @@ public ByteSource asComparableBytes(ValueAccessor accessor, V data, Versi if (i * 2 + 1 < srcs.length) srcs = Arrays.copyOfRange(srcs, 0, i * 2 + 1); - return ByteSource.withTerminatorMaybeLegacy(version, ByteSource.END_OF_STREAM, srcs); + return ByteSource.withTerminatorMaybeLegacy(version, END_OF_STREAM, srcs); } @Override diff --git a/src/java/org/apache/cassandra/db/marshal/ListType.java b/src/java/org/apache/cassandra/db/marshal/ListType.java index 94d302d05981..6c391b050e00 100644 --- a/src/java/org/apache/cassandra/db/marshal/ListType.java +++ b/src/java/org/apache/cassandra/db/marshal/ListType.java @@ -37,9 +37,9 @@ import org.apache.cassandra.serializers.ListSerializer; import org.apache.cassandra.serializers.MarshalException; import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.transport.ProtocolVersion; import org.apache.cassandra.utils.JsonUtils; import org.apache.cassandra.utils.TimeUUID; -import org.apache.cassandra.transport.ProtocolVersion; import org.apache.cassandra.utils.bytecomparable.ByteComparable.Version; import org.apache.cassandra.utils.bytecomparable.ByteSource; @@ -52,7 +52,7 @@ public class ListType extends CollectionType> private static final ConcurrentHashMap, ListType> frozenInstances = new ConcurrentHashMap<>(); private final AbstractType elements; - public final ListSerializer serializer; + private final ListSerializer serializer; private final boolean isMultiCell; public static ListType getInstance(TypeParser parser) throws ConfigurationException, SyntaxException @@ -131,7 +131,7 @@ public ListSerializer getSerializer() } @Override - public AbstractType freeze() + public ListType freeze() { // freeze elements to match org.apache.cassandra.cql3.CQL3Type.Raw.RawCollection.freeze return isMultiCell ? getInstance(this.elements.freeze(), false) : this; diff --git a/src/java/org/apache/cassandra/db/marshal/MapType.java b/src/java/org/apache/cassandra/db/marshal/MapType.java index f8ac6c00680e..8ed43c3c8e1c 100644 --- a/src/java/org/apache/cassandra/db/marshal/MapType.java +++ b/src/java/org/apache/cassandra/db/marshal/MapType.java @@ -44,10 +44,10 @@ import org.apache.cassandra.serializers.MarshalException; import org.apache.cassandra.transport.ProtocolVersion; import org.apache.cassandra.utils.JsonUtils; +import org.apache.cassandra.utils.Pair; import org.apache.cassandra.utils.bytecomparable.ByteComparable.Version; import org.apache.cassandra.utils.bytecomparable.ByteSource; import org.apache.cassandra.utils.bytecomparable.ByteSourceInverse; -import org.apache.cassandra.utils.Pair; public class MapType extends CollectionType> { @@ -153,7 +153,7 @@ public List> subTypes() } @Override - public AbstractType freeze() + public MapType freeze() { // freeze key/value to match org.apache.cassandra.cql3.CQL3Type.Raw.RawCollection.freeze return isMultiCell ? getInstance(this.keys.freeze(), this.values.freeze(), false) : this; @@ -456,10 +456,4 @@ public ByteBuffer getElement(@Nullable ColumnData columnData, ByteBuffer keyOrIn return getSerializer().getSerializedValue(((Cell) columnData).buffer(), keyOrIndex, getValuesType()); } - - @Override - public boolean isConstrainable() - { - return false; - } } diff --git a/src/java/org/apache/cassandra/db/marshal/MultiElementType.java b/src/java/org/apache/cassandra/db/marshal/MultiElementType.java index d9c229c9e9bd..99ce762d6abc 100644 --- a/src/java/org/apache/cassandra/db/marshal/MultiElementType.java +++ b/src/java/org/apache/cassandra/db/marshal/MultiElementType.java @@ -44,7 +44,18 @@ protected MultiElementType(ComparisonType comparisonType) * @param elements the serialized values of the elements * @return the serialized representation of the value composed of the specified elements. */ - public abstract ByteBuffer pack(List elements); + public abstract V pack(List elements, ValueAccessor accessor); + + /** + * Returns the serialized representation of the value composed of the specified elements. + * + * @param elements the serialized values of the elements + * @return the serialized representation of the value composed of the specified elements. + */ + public ByteBuffer pack(List elements) + { + return pack(elements, ByteBufferAccessor.instance); + } /** * Returns the serialized representation of the elements composing the specified value. @@ -52,7 +63,23 @@ protected MultiElementType(ComparisonType comparisonType) * @param value a serialized value of this type * @return the serialized representation of the elements composing the specified value. */ - public abstract List unpack(ByteBuffer value); + /** + * Returns the serialized representation of the elements composing the specified value. + * + * @param value a serialized value of this type + * @return the serialized representation of the elements composing the specified value. + */ + public abstract List unpack(V value, ValueAccessor accessor); + + public final List unpack(byte[] value) + { + return unpack(value, ByteArrayAccessor.instance); + } + + public final List unpack(ByteBuffer value) + { + return unpack(value, ByteBufferAccessor.instance); + } /** * Checks if this type supports bind markers for its elements when the type value is provided through a literal. diff --git a/src/java/org/apache/cassandra/db/marshal/NativeAccessor.java b/src/java/org/apache/cassandra/db/marshal/NativeAccessor.java new file mode 100644 index 000000000000..70d73041de1b --- /dev/null +++ b/src/java/org/apache/cassandra/db/marshal/NativeAccessor.java @@ -0,0 +1,485 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.db.marshal; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.charset.CharacterCodingException; +import java.nio.charset.Charset; +import java.util.UUID; + +import com.google.common.annotations.VisibleForTesting; + +import org.apache.cassandra.db.Digest; +import org.apache.cassandra.db.TypeSizes; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.service.paxos.Ballot; +import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.FastByteOperations; +import org.apache.cassandra.utils.TimeUUID; +import org.apache.cassandra.utils.UUIDGen; +import org.apache.cassandra.utils.memory.BigEndianMemoryUtil; +import org.apache.cassandra.utils.memory.MemoryUtil; + +/** + * ValueAccessor has a lot of different methods are grouped together in a single interface. + * Technically the methods can be classfied to 4 categories: + * 1) basic methods to deal with the existing data as an abstract read-only container of bytes + * 2) deserialization methods to decode the data into different data types + * 3) serialization methods to encode and write different data types into the value entity + * 4) Value object creation methods + * + * NativeAccessor provides a support for real NativeData objects (on top of off-heap memory) for 1-3 categories + * with a focus on 1) category and only emulates 4th category using ByteBufferSliceNativeData on top of heap ByteBuffers. + * We expect NativeData is used only to store data in Memtables with an explicit allocator and memory regions lifecycle + * and not used to create short-living Mutation requests and transfer them between nodes. + */ +public class NativeAccessor implements ValueAccessor +{ + public static final ValueAccessor instance = new NativeAccessor(); + + // ----------------------------------------------------------------------------- + // basic methods to deal with data as a read-only container of bytes + + @Override + public int size(NativeData value) + { + return value.nativeDataSize(); + } + + @Override + public void write(NativeData sourceValue, DataOutputPlus out) throws IOException + { + out.writeMemory(sourceValue.getAddress(), sourceValue.nativeDataSize()); + } + + @Override + public ByteBuffer toBuffer(NativeData value) + { + if (value == null) + return null; + return value.asByteBuffer(); + } + + @Override + public void write(NativeData value, ByteBuffer out) + { + int size = value.nativeDataSize(); + MemoryUtil.getBytes(value.getAddress(), out, size); + out.position(out.position() + size); + + } + + @Override + public int copyTo(NativeData src, int srcOffset, V2 dst, ValueAccessor dstAccessor, int dstOffset, int size) + { + if (dstAccessor == ByteArrayAccessor.instance) + MemoryUtil.getBytes(src.getAddress() + srcOffset, dstAccessor.toArray(dst), dstOffset, size); + else if (dstAccessor == ByteBufferAccessor.instance) + { + ByteBuffer dstBuffer = dstAccessor.toBuffer(dst); + MemoryUtil.getBytes(src.getAddress() + srcOffset, dstBuffer, dstOffset, size); + // note: position of dstBuffer expected to stay the same + } + else if (dstAccessor == NativeAccessor.instance) + MemoryUtil.setBytes(src.getAddress() + srcOffset, ((NativeData) dst).getAddress() + dstOffset, size); + else // just in case of new implementations of ValueAccessor appear + dstAccessor.copyByteBufferTo(src.asByteBuffer(), srcOffset, dst, dstOffset, size); + + return size; + } + + @Override + public int copyByteArrayTo(byte[] src, int srcOffset, NativeData dstNative, int dstOffset, int size) + { + MemoryUtil.setBytes(src, srcOffset, dstNative.getAddress() + dstOffset, size); + return size; + } + + @Override + public int copyByteBufferTo(ByteBuffer src, int srcOffset, NativeData dstNative, int dstOffset, int size) + { + MemoryUtil.setBytes(dstNative.getAddress() + dstOffset, src, srcOffset, size); + return size; + } + + @Override + public void digest(NativeData value, int offset, int size, Digest digest) + { + // not used for NativeData (we copy data to heap during a select) + // so, there is no much reason to optimize to avoid a ByteBuffer object allocation + ByteBuffer byteBuffer = value.asByteBuffer(); + digest.update(byteBuffer, byteBuffer.position() + offset, size); + } + + @Override + public NativeData slice(NativeData input, int offset, int length) + { + return input.slice(offset, length); + } + + @Override + public int compare(NativeData left, VR right, ValueAccessor accessorR) + { + + if (accessorR == ByteArrayAccessor.instance) + return -compareByteArrayTo(accessorR.toArray(right), left); + else if (accessorR == ByteBufferAccessor.instance) + return -compareByteBufferTo(accessorR.toBuffer(right), left); + if (accessorR == NativeAccessor.instance) + { + NativeData rightNative = (NativeData) right; + int leftSize = left.nativeDataSize(); + int rightSize = rightNative.nativeDataSize(); + return FastByteOperations.compareMemoryUnsigned(left.getAddress(), leftSize, rightNative.getAddress(), rightSize); + } else // just in case of new implementations of ValueAccessor appear + return ByteBufferUtil.compareUnsigned(left.asByteBuffer(), accessorR.toBuffer(right)); + } + + @Override + public int compareByteArrayTo(byte[] left, NativeData right) + { + return FastByteOperations.compareWithMemoryUnsigned(left, 0, left.length, right.getAddress(), right.nativeDataSize()); + } + + @Override + public int compareByteBufferTo(ByteBuffer left, NativeData right) + { + return FastByteOperations.compareWithMemoryUnsigned(left, right.getAddress(), right.nativeDataSize()); + } + + // ----------------------------------------------------------------------------- + // Data deserialization methods + + @Override + public byte[] toArray(NativeData value) + { + if (value == null) + return null; + int size = value.nativeDataSize(); + byte[] result = new byte[size]; + MemoryUtil.getBytes(value.getAddress(), result, 0, size); + return result; + } + + @Override + public byte[] toArray(NativeData value, int offset, int length) + { + if (value == null) + return null; + int size = value.nativeDataSize(); + if (length > size) + throw new IllegalArgumentException("length (" + length + ") cannot be more than the value size (" + size + ")"); + + byte[] result = new byte[length]; + MemoryUtil.getBytes(value.getAddress() + offset, result, 0, length); + return result; + } + + @Override + public String toString(NativeData value, Charset charset) throws CharacterCodingException + { + return ByteBufferUtil.string(value.asByteBuffer(), charset); + } + + @Override + public String toHex(NativeData value) + { + return ByteBufferUtil.bytesToHex(value.asByteBuffer()); + } + + @Override + public byte toByte(NativeData value) + { + return getByte(value, 0); + } + + @Override + public byte getByte(NativeData value, int offset) + { + return MemoryUtil.getByte(value.getAddress() + offset); + } + + @Override + public short toShort(NativeData value) + { + return getShort(value, 0); + } + + @Override + public short getShort(NativeData value, int offset) + { + return (short) BigEndianMemoryUtil.getUnsignedShort(value.getAddress() + offset); + } + + @Override + public int getUnsignedShort(NativeData value, int offset) + { + return BigEndianMemoryUtil.getUnsignedShort(value.getAddress() + offset); + } + + @Override + public int toInt(NativeData value) + { + return getInt(value, 0); + } + + @Override + public int getInt(NativeData value, int offset) + { + return BigEndianMemoryUtil.getInt(value.getAddress() + offset); + } + + @Override + public long toLong(NativeData value) + { + return getLong(value, 0); + } + + @Override + public long getLong(NativeData value, int offset) + { + return BigEndianMemoryUtil.getLong(value.getAddress() + offset); + } + + @Override + public float getFloat(NativeData value, int offset) + { + return Float.intBitsToFloat(BigEndianMemoryUtil.getInt(value.getAddress() + offset)); + } + + @Override + public double getDouble(NativeData value, int offset) + { + return Double.longBitsToDouble(BigEndianMemoryUtil.getLong(value.getAddress() + offset)); + } + + @Override + public float toFloat(NativeData value) + { + return getFloat(value, 0); + } + + @Override + public double toDouble(NativeData value) + { + return getDouble(value, 0); + } + + @Override + public UUID toUUID(NativeData value) + { + long mostSigBits = getLong(value, 0); + long leastSigBits = getLong(value, 8); + + return UUIDGen.getUUID(mostSigBits, leastSigBits); + } + + @Override + public TimeUUID toTimeUUID(NativeData value) + { + long mostSigBits = getLong(value, 0); + long leastSigBits = getLong(value, 8); + return TimeUUID.fromBytes(mostSigBits, leastSigBits); + } + + @Override + public Ballot toBallot(NativeData value) + { + long mostSigBits = getLong(value, 0); + long leastSigBits = getLong(value, 8); + return Ballot.fromBytes(mostSigBits, leastSigBits); + } + + @Override + public float[] toFloatArray(NativeData value, int dimension) + { + int arraySize = value.nativeDataSize() / Float.BYTES; + if (arraySize != dimension) + throw new IllegalArgumentException(String.format("Could not convert to a float[] with different dimension. " + + "Was expecting %d but got %d", dimension, arraySize)); + float[] floatArray = new float[arraySize]; + for (int i = 0; i < arraySize; i++) + { + floatArray[i] = Float.intBitsToFloat(getInt(value, i * Float.BYTES)); + } + return floatArray; + } + + + // ----------------------------------------------------------------------------- + // Data serialization methods + @Override + public int putByte(NativeData dstNative, int offset, byte value) + { + BigEndianMemoryUtil.setByte(dstNative.getAddress() + offset, value); + return TypeSizes.BYTE_SIZE; + } + + @Override + public int putShort(NativeData dstNative, int offset, short value) + { + BigEndianMemoryUtil.setShort(dstNative.getAddress() + offset, value); + return TypeSizes.SHORT_SIZE; + } + + @Override + public int putInt(NativeData dstNative, int offset, int value) + { + BigEndianMemoryUtil.setInt(dstNative.getAddress() + offset, value); + return TypeSizes.INT_SIZE; + } + + @Override + public int putLong(NativeData dstNative, int offset, long value) + { + BigEndianMemoryUtil.setLong(dstNative.getAddress() + offset, value); + return TypeSizes.LONG_SIZE; + } + + @Override + public int putFloat(NativeData dstNative, int offset, float value) + { + putInt(dstNative, offset, Float.floatToIntBits(value)); + return TypeSizes.FLOAT_SIZE; + } + + @Override + public NativeData[] createArray(int length) + { + return new NativeData[length]; + } + + // ----------------------------------------------------------------------------- + // Value object creation methods + // We do not expect the methods are used in real logic for NativeData, + // but they are needed to reuse existing unit tests written for other implementation of ValueAccessor. + + private static NativeDataAllocator allocator = NativeDataAllocator.UNSUPPORTED; + + @VisibleForTesting + public static void setNativeMemoryAllocator(NativeDataAllocator allocatorToSet) + { + allocator = allocatorToSet; + } + + @Override + public NativeData read(DataInputPlus in, int length) throws IOException + { + ByteBuffer data = ByteBufferUtil.read(in, length); + return allocator.allocateBasedOnBuffer(data); + } + + @Override + public NativeData empty() + { + return AddressBasedNativeData.EMPTY; + } + + @Override + public NativeData valueOf(byte[] bytes) + { + return allocator.allocateBasedOnBuffer(ByteBufferAccessor.instance.valueOf(bytes)); + } + + @Override + public NativeData valueOf(ByteBuffer bytes) + { + return allocator.allocateBasedOnBuffer(ByteBufferAccessor.instance.valueOf(bytes)); + } + + @Override + public NativeData valueOf(String s, Charset charset) + { + return allocator.allocateBasedOnBuffer(ByteBufferAccessor.instance.valueOf(s, charset)); + } + + @Override + public NativeData valueOf(UUID v) + { + return allocator.allocateBasedOnBuffer(ByteBufferAccessor.instance.valueOf(v)); + } + + @Override + public NativeData valueOf(boolean v) + { + return allocator.allocateBasedOnBuffer(ByteBufferAccessor.instance.valueOf(v)); + } + + @Override + public NativeData valueOf(byte v) + { + return allocator.allocateBasedOnBuffer(ByteBufferAccessor.instance.valueOf(v)); + } + + @Override + public NativeData valueOf(short v) + { + return allocator.allocateBasedOnBuffer(ByteBufferAccessor.instance.valueOf(v)); + } + + @Override + public NativeData valueOf(int v) + { + return allocator.allocateBasedOnBuffer(ByteBufferAccessor.instance.valueOf(v)); + } + + @Override + public NativeData valueOf(long v) + { + return allocator.allocateBasedOnBuffer(ByteBufferAccessor.instance.valueOf(v)); + } + + @Override + public NativeData valueOf(float v) + { + return allocator.allocateBasedOnBuffer(ByteBufferAccessor.instance.valueOf(v)); + } + + @Override + public NativeData valueOf(double v) + { + return allocator.allocateBasedOnBuffer(ByteBufferAccessor.instance.valueOf(v)); + } + + @Override + public NativeData convert(V2 src, ValueAccessor accessor) + { + if (accessor == NativeAccessor.instance) + return (NativeData) src; + return allocator.allocateBasedOnBuffer(accessor.toBuffer(src)); + } + + @Override + public NativeData allocate(int size) + { + return allocator.allocateBasedOnBuffer(ByteBufferAccessor.instance.allocate(size)); + } + + @Override + public ObjectFactory factory() + { + // The method is used to de-serialize and create different parts of a Mutation object + // to transfer it between Cassandra nodes. + // The current implementation of NativeData does not support creating of such objects in-flight + // because it requires to have a native memory pool/allocator and manage its lifecycle. + throw new UnsupportedOperationException(); + } +} diff --git a/src/java/org/apache/cassandra/db/marshal/NativeData.java b/src/java/org/apache/cassandra/db/marshal/NativeData.java new file mode 100644 index 000000000000..60ffb8ce515b --- /dev/null +++ b/src/java/org/apache/cassandra/db/marshal/NativeData.java @@ -0,0 +1,32 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.db.marshal; + +import java.nio.ByteBuffer; + +public interface NativeData +{ + int nativeDataSize(); + + ByteBuffer asByteBuffer(); + + NativeData slice(int offset, int length); + + public long getAddress(); +} diff --git a/src/java/org/apache/cassandra/db/marshal/NativeDataAllocator.java b/src/java/org/apache/cassandra/db/marshal/NativeDataAllocator.java new file mode 100644 index 000000000000..3d22f9951422 --- /dev/null +++ b/src/java/org/apache/cassandra/db/marshal/NativeDataAllocator.java @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.db.marshal; + +import java.nio.ByteBuffer; + +public interface NativeDataAllocator extends AutoCloseable +{ + NativeDataAllocator UNSUPPORTED = data -> { + throw new UnsupportedOperationException("The method is not expected to be used by NativeAccessor outside of tests. " + + "NativeData can be allocated only by a memtable NativeAllocator"); + }; + + NativeData allocateBasedOnBuffer(ByteBuffer data); + + @Override + default void close() {}; +} diff --git a/src/java/org/apache/cassandra/db/marshal/SetType.java b/src/java/org/apache/cassandra/db/marshal/SetType.java index c2fdf0042c7d..f2568e3cbc5d 100644 --- a/src/java/org/apache/cassandra/db/marshal/SetType.java +++ b/src/java/org/apache/cassandra/db/marshal/SetType.java @@ -18,7 +18,13 @@ package org.apache.cassandra.db.marshal; import java.nio.ByteBuffer; -import java.util.*; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; +import java.util.Set; +import java.util.SortedSet; +import java.util.TreeSet; import java.util.concurrent.ConcurrentHashMap; import java.util.function.Consumer; @@ -117,7 +123,7 @@ public boolean isMultiCell() } @Override - public AbstractType freeze() + public SetType freeze() { // freeze elements to match org.apache.cassandra.cql3.CQL3Type.Raw.RawCollection.freeze return isMultiCell ? getInstance(this.elements.freeze(), false) : this; diff --git a/src/java/org/apache/cassandra/db/marshal/TupleType.java b/src/java/org/apache/cassandra/db/marshal/TupleType.java index d6ce2da0f48d..47301b6e97a7 100644 --- a/src/java/org/apache/cassandra/db/marshal/TupleType.java +++ b/src/java/org/apache/cassandra/db/marshal/TupleType.java @@ -295,11 +295,6 @@ public V fromComparableBytes(ValueAccessor accessor, ByteSource.Peekable } @Override - public List unpack(ByteBuffer value) - { - return unpack(value, ByteBufferAccessor.instance); - } - public List unpack(V value, ValueAccessor accessor) { int numberOfElements = size(); @@ -351,6 +346,11 @@ protected String componentOrFieldName(int i) return "component"; } + public static V pack(ValueAccessor accessor, V... components) + { + return pack(accessor, Arrays.asList(components)); + } + public static V pack(ValueAccessor accessor, Collection components) { int totalLength = 0; @@ -376,9 +376,9 @@ public static V pack(ValueAccessor accessor, Collection components) } @Override - public ByteBuffer pack(List components) + public V pack(List elements, ValueAccessor accessor) { - return pack(ByteBufferAccessor.instance, components); + return pack(accessor, elements); } public ByteBuffer pack(ByteBuffer... components) diff --git a/src/java/org/apache/cassandra/db/marshal/UserType.java b/src/java/org/apache/cassandra/db/marshal/UserType.java index 804891448345..15ab78e82a2a 100644 --- a/src/java/org/apache/cassandra/db/marshal/UserType.java +++ b/src/java/org/apache/cassandra/db/marshal/UserType.java @@ -24,6 +24,7 @@ import javax.annotation.Nullable; import com.google.common.base.Objects; +import com.google.common.base.Preconditions; import com.google.common.collect.Lists; import org.slf4j.Logger; @@ -136,6 +137,12 @@ public AbstractType fieldType(int i) return type(i); } + public AbstractType fieldType(CellPath path) + { + int field = ByteBufferUtil.getUnsignedShort(path.get(0), 0); + return fieldType(field); + } + public List> fieldTypes() { return types; @@ -146,6 +153,11 @@ public FieldIdentifier fieldName(int i) return fieldNames.get(i); } + public FieldIdentifier fieldName(CellPath path) + { + return fieldNames.get(fieldPosition(path)); + } + public String fieldNameAsString(int i) { return stringFieldNames.get(i); @@ -166,6 +178,11 @@ public int fieldPosition(FieldIdentifier fieldName) return fieldNames.indexOf(fieldName); } + public int fieldPosition(CellPath path) + { + return Preconditions.checkElementIndex(ByteBufferUtil.getUnsignedShort(path.get(0), 0), fieldNames.size()); + } + public CellPath cellPathForField(FieldIdentifier fieldName) { // we use the field position instead of the field name to allow for field renaming in ALTER TYPE statements @@ -177,7 +194,7 @@ public ShortType nameComparator() return ShortType.instance; } - public ByteBuffer serializeForNativeProtocol(Iterator> cells, ProtocolVersion protocolVersion) + public ByteBuffer serializeForNativeProtocol(Iterator> cells) { assert isMultiCell; diff --git a/src/java/org/apache/cassandra/db/marshal/ValueAccessor.java b/src/java/org/apache/cassandra/db/marshal/ValueAccessor.java index b4f775de9522..4916c7326223 100644 --- a/src/java/org/apache/cassandra/db/marshal/ValueAccessor.java +++ b/src/java/org/apache/cassandra/db/marshal/ValueAccessor.java @@ -340,6 +340,11 @@ default int getVInt32(V value, int offset) return VIntCoding.getVInt32(value, this, offset); } + default long getLeastSignificantBytes(V value, int offset, int bytes) + { + return getLeastSignificantBytes(this, value, offset, bytes); + } + float getFloat(V value, int offset); double getDouble(V value, int offset); /** returns a long from offset 0 */ @@ -399,6 +404,18 @@ default int putBytes(V dst, int offset, byte[] src, int srcOffset, int length) return ByteArrayAccessor.instance.copyTo(src, srcOffset, dst, this, offset, length); } + /** + * An efficient way to write the type {@code bytes} of a long + * + * @param register - the long value to be written + * @param bytes - the number of bytes the register occupies. Valid values are between 1 and 8 inclusive. + * @throws IOException + */ + default int putLeastSignificantBytes(V dst, int offset, long register, int bytes) + { + return putLeastSignificantBytes(this, dst, offset, register, bytes); + } + default int putBytes(V dst, int offset, byte[] src) { return putBytes(dst, offset, src, 0, src.length); @@ -493,4 +510,72 @@ public static boolean equals(L left, ValueAccessor leftAccessor, R rig { return compare(left, leftAccessor, right, rightAccessor) == 0; } + + public static int putLeastSignificantBytes(ValueAccessor accessor, V dst, int offset, long register, int bytes) + { + switch (bytes) + { + case 0: + break; + case 1: + accessor.putByte(dst, offset, (byte)register); + break; + case 2: + accessor.putShort(dst, offset, (short)register); + break; + case 3: + accessor.putShort(dst, offset, (short)(register >>> 8)); + accessor.putByte(dst, offset + 2, (byte)register); + break; + case 4: + accessor.putInt(dst, offset, (int)register); + break; + case 5: + accessor.putInt(dst, offset, (int)(register >>> 8)); + accessor.putByte(dst, offset + 4, (byte)register); + break; + case 6: + accessor.putInt(dst, offset, (int)(register >>> 16)); + accessor.putShort(dst, offset + 4, (short)register); + break; + case 7: + accessor.putInt(dst, offset, (int)(register >>> 24)); + accessor.putShort(dst, offset + 4, (short)(register >> 8)); + accessor.putByte(dst, offset + 6, (byte)register); + break; + case 8: + accessor.putLong(dst, offset, register); + break; + default: + throw new IllegalArgumentException(); + } + return bytes; + } + + public static long getLeastSignificantBytes(ValueAccessor accessor, V dst, int offset, int bytes) + { + switch (bytes) + { + case 0: return 0; + case 1: return accessor.getByte(dst, offset) & 0xffL; + case 2: return accessor.getShort(dst, offset) & 0xffffL; + case 3: + return ((accessor.getShort(dst, offset) & 0xffffL) << 8) + | (accessor.getByte(dst, offset + 2) & 0xffL); + case 4: + return accessor.getInt(dst, offset) & 0xffffffffL; + case 5: + return ((accessor.getInt(dst, offset) & 0xffffffffL) << 8) + | (accessor.getByte(dst, offset + 4) & 0xffL); + case 6: + return ((accessor.getInt(dst, offset) & 0xffffffffL) << 16) + | (accessor.getShort(dst, offset + 4) & 0xffffL); + case 7: + return ((accessor.getInt(dst, offset) & 0xffffffffL) << 24) + | ((accessor.getShort(dst, offset + 4) & 0xffffL) << 8) + | (accessor.getByte(dst, offset + 6) & 0xffL); + case 8: return accessor.getLong(dst, offset); + default: throw new IllegalArgumentException(); + } + } } diff --git a/src/java/org/apache/cassandra/db/marshal/VectorType.java b/src/java/org/apache/cassandra/db/marshal/VectorType.java index ac4c0bfb94c1..e70857c5aafd 100644 --- a/src/java/org/apache/cassandra/db/marshal/VectorType.java +++ b/src/java/org/apache/cassandra/db/marshal/VectorType.java @@ -137,12 +137,6 @@ public VectorSerializer getSerializer() return serializer; } - @Override - public List unpack(ByteBuffer buffer) - { - return unpack(buffer, ByteBufferAccessor.instance); - } - public List unpack(V buffer, ValueAccessor accessor) { return getSerializer().unpack(buffer, accessor); @@ -193,11 +187,6 @@ public V decomposeAsFloat(ValueAccessor accessor, float[] value) return buffer; } - public ByteBuffer pack(List elements) - { - return pack(elements, ByteBufferAccessor.instance); - } - public V pack(List elements, ValueAccessor accessor) { return getSerializer().pack(elements, accessor); diff --git a/src/java/org/apache/cassandra/db/memtable/AbstractAllocatorMemtable.java b/src/java/org/apache/cassandra/db/memtable/AbstractAllocatorMemtable.java index b431d360ed10..2dbe41374f09 100644 --- a/src/java/org/apache/cassandra/db/memtable/AbstractAllocatorMemtable.java +++ b/src/java/org/apache/cassandra/db/memtable/AbstractAllocatorMemtable.java @@ -220,6 +220,12 @@ protected void runMayThrow() if (current instanceof AbstractAllocatorMemtable) ((AbstractAllocatorMemtable) current).flushIfPeriodExpired(); } + + @Override + public String toString() + { + return "Scheduled Flush of " + owner; + } }; ScheduledExecutors.scheduledTasks.scheduleSelfRecurring(runnable, period, TimeUnit.MILLISECONDS); } diff --git a/src/java/org/apache/cassandra/db/memtable/AbstractMemtable.java b/src/java/org/apache/cassandra/db/memtable/AbstractMemtable.java index 2f2c2a25516c..dd2254721b07 100644 --- a/src/java/org/apache/cassandra/db/memtable/AbstractMemtable.java +++ b/src/java/org/apache/cassandra/db/memtable/AbstractMemtable.java @@ -40,6 +40,8 @@ public abstract class AbstractMemtable implements Memtable { + private static final AtomicLong nextId = new AtomicLong(); + private final AtomicReference flushTransaction = new AtomicReference<>(null); protected final AtomicLong currentOperations = new AtomicLong(0); protected final ColumnsCollector columnsCollector; @@ -48,6 +50,7 @@ public abstract class AbstractMemtable implements Memtable protected AtomicLong minTimestamp = new AtomicLong(Long.MAX_VALUE); // The smallest local deletion time for all partitions in this memtable protected AtomicLong minLocalDeletionTime = new AtomicLong(Long.MAX_VALUE); + private final long id = nextId.incrementAndGet(); // Note: statsCollector has corresponding statistics to the two above, but starts with an epoch value which is not // correct for their usage. @@ -80,6 +83,12 @@ public long operationCount() return currentOperations.get(); } + @Override + public long getMemtableId() + { + return id; + } + @Override public long getMinTimestamp() { diff --git a/src/java/org/apache/cassandra/db/memtable/Memtable.java b/src/java/org/apache/cassandra/db/memtable/Memtable.java index f722ec20a907..dc0b7b3a1d54 100644 --- a/src/java/org/apache/cassandra/db/memtable/Memtable.java +++ b/src/java/org/apache/cassandra/db/memtable/Memtable.java @@ -30,6 +30,7 @@ import org.apache.cassandra.db.partitions.PartitionUpdate; import org.apache.cassandra.db.rows.EncodingStats; import org.apache.cassandra.db.rows.UnfilteredSource; +import org.apache.cassandra.dht.Token; import org.apache.cassandra.index.transactions.UpdateTransaction; import org.apache.cassandra.io.sstable.format.SSTableWriter; import org.apache.cassandra.schema.TableMetadata; @@ -178,6 +179,11 @@ interface Owner // Main write and read operations + default long put(PartitionUpdate update, UpdateTransaction indexer, OpOrder.Group opGroup) + { + return put(update, indexer, opGroup, false); + } + /** * Put new data in the memtable. This operation may block until enough memory is available in the memory pool. * @@ -185,12 +191,14 @@ interface Owner * @param indexer receives information about the update's effect * @param opGroup write operation group, used to permit the operation to complete if it is needed to complete a * flush to free space. + * @param assumeMissing if true, the implementation MAY clone the key and attempt putIfAbsent without first + * looking for the keys' presence * * @return the smallest timestamp delta between corresponding rows from existing and update. A * timestamp delta being computed as the difference between the cells and DeletionTimes from any existing partition * and those in {@code update}. See CASSANDRA-7979. */ - long put(PartitionUpdate update, UpdateTransaction indexer, OpOrder.Group opGroup); + long put(PartitionUpdate update, UpdateTransaction indexer, OpOrder.Group opGroup, boolean assumeMissing); // Read operations are provided by the UnfilteredSource interface. @@ -362,6 +370,8 @@ default boolean isEmpty() */ boolean accepts(OpOrder.Group opGroup, CommitLogPosition commitLogPosition); + long getMemtableId(); + /** Approximate commit log lower bound, <= getCommitLogLowerBound, used as a time stamp for ordering */ CommitLogPosition getApproximateCommitLogLowerBound(); @@ -441,4 +451,9 @@ public LastCommitLogPosition(CommitLogPosition copy) super(copy.segmentId, copy.position); } } + + default Token lastToken() + { + throw new UnsupportedOperationException("lastToken is not supported"); + } } diff --git a/src/java/org/apache/cassandra/db/memtable/ShardedSkipListMemtable.java b/src/java/org/apache/cassandra/db/memtable/ShardedSkipListMemtable.java index 92cdbbad9fe0..9b9a531a2b19 100644 --- a/src/java/org/apache/cassandra/db/memtable/ShardedSkipListMemtable.java +++ b/src/java/org/apache/cassandra/db/memtable/ShardedSkipListMemtable.java @@ -50,6 +50,7 @@ import org.apache.cassandra.dht.Bounds; import org.apache.cassandra.dht.IncludingExcludingBounds; import org.apache.cassandra.dht.Range; +import org.apache.cassandra.dht.Token; import org.apache.cassandra.index.transactions.UpdateTransaction; import org.apache.cassandra.io.sstable.SSTableReadsListener; import org.apache.cassandra.schema.TableMetadata; @@ -112,17 +113,36 @@ public boolean isClean() return true; } + @Override + public Token lastToken() + { + Token lastToken = null; + for (MemtableShard shard : shards) + { + Iterator ppIterator = shard.partitions.descendingKeySet().iterator(); + if (ppIterator.hasNext()) + { + Token token = ppIterator.next().getToken(); + if (lastToken == null) + lastToken = token; + else if (lastToken.compareTo(token) < 0) + lastToken = token; + } + } + return lastToken; + } + /** * Should only be called by ColumnFamilyStore.apply via Keyspace.apply, which supplies the appropriate * OpOrdering. * * commitLogSegmentPosition should only be null if this is a secondary index, in which case it is *expected* to be null */ - public long put(PartitionUpdate update, UpdateTransaction indexer, OpOrder.Group opGroup) + public long put(PartitionUpdate update, UpdateTransaction indexer, OpOrder.Group opGroup, boolean assumeMissing) { DecoratedKey key = update.partitionKey(); MemtableShard shard = shards[boundaries.getShardForKey(key)]; - return shard.put(key, update, indexer, opGroup); + return shard.put(key, update, indexer, opGroup, assumeMissing); } /** @@ -346,10 +366,10 @@ static class MemtableShard this.metadata = metadata; } - public long put(DecoratedKey key, PartitionUpdate update, UpdateTransaction indexer, OpOrder.Group opGroup) + public long put(DecoratedKey key, PartitionUpdate update, UpdateTransaction indexer, OpOrder.Group opGroup, boolean assumeMissing) { Cloner cloner = allocator.cloner(opGroup); - AtomicBTreePartition previous = partitions.get(key); + AtomicBTreePartition previous = assumeMissing ? null : partitions.get(key); long initialSize = 0; if (previous == null) @@ -484,13 +504,13 @@ static class Locking extends ShardedSkipListMemtable * * commitLogSegmentPosition should only be null if this is a secondary index, in which case it is *expected* to be null */ - public long put(PartitionUpdate update, UpdateTransaction indexer, OpOrder.Group opGroup) + public long put(PartitionUpdate update, UpdateTransaction indexer, OpOrder.Group opGroup, boolean assumeMissing) { DecoratedKey key = update.partitionKey(); MemtableShard shard = shards[boundaries.getShardForKey(key)]; synchronized (shard) { - return shard.put(key, update, indexer, opGroup); + return shard.put(key, update, indexer, opGroup, assumeMissing); } } diff --git a/src/java/org/apache/cassandra/db/memtable/SkipListMemtable.java b/src/java/org/apache/cassandra/db/memtable/SkipListMemtable.java index 8871b03bd69f..985dd310fdcb 100644 --- a/src/java/org/apache/cassandra/db/memtable/SkipListMemtable.java +++ b/src/java/org/apache/cassandra/db/memtable/SkipListMemtable.java @@ -50,6 +50,7 @@ import org.apache.cassandra.dht.Bounds; import org.apache.cassandra.dht.IncludingExcludingBounds; import org.apache.cassandra.dht.Range; +import org.apache.cassandra.dht.Token; import org.apache.cassandra.index.transactions.UpdateTransaction; import org.apache.cassandra.io.sstable.SSTableReadsListener; import org.apache.cassandra.schema.TableMetadata; @@ -97,6 +98,15 @@ public boolean isClean() return partitions.isEmpty(); } + @Override + public Token lastToken() + { + Iterator iterator = partitions.keySet().iterator(); + if (iterator.hasNext()) + return iterator.next().getToken(); + return null; + } + /** * Should only be called by ColumnFamilyStore.apply via Keyspace.apply, which supplies the appropriate * OpOrdering. @@ -104,15 +114,14 @@ public boolean isClean() * commitLogSegmentPosition should only be null if this is a secondary index, in which case it is *expected* to be null */ @Override - public long put(PartitionUpdate update, UpdateTransaction indexer, OpOrder.Group opGroup) + public long put(PartitionUpdate update, UpdateTransaction indexer, OpOrder.Group opGroup, boolean assumeMissing) { - Cloner cloner = allocator.cloner(opGroup); - AtomicBTreePartition previous = partitions.get(update.partitionKey()); - long initialSize = 0; + Cloner cloner = allocator.cloner(opGroup); + AtomicBTreePartition previous = assumeMissing ? null : partitions.get(update.partitionKey()); if (previous == null) { - final DecoratedKey cloneKey = cloner.clone(update.partitionKey()); + DecoratedKey cloneKey = cloner.clone(update.partitionKey()); AtomicBTreePartition empty = new AtomicBTreePartition(metadata, cloneKey, allocator); // We'll add the columns later. This avoids wasting works if we get beaten in the putIfAbsent previous = partitions.putIfAbsent(cloneKey, empty); diff --git a/src/java/org/apache/cassandra/db/memtable/TrieMemtable.java b/src/java/org/apache/cassandra/db/memtable/TrieMemtable.java index a8fc54b89131..2a2813d0ad87 100644 --- a/src/java/org/apache/cassandra/db/memtable/TrieMemtable.java +++ b/src/java/org/apache/cassandra/db/memtable/TrieMemtable.java @@ -180,7 +180,7 @@ public void discard() * commitLogSegmentPosition should only be null if this is a secondary index, in which case it is *expected* to be null */ @Override - public long put(PartitionUpdate update, UpdateTransaction indexer, OpOrder.Group opGroup) + public long put(PartitionUpdate update, UpdateTransaction indexer, OpOrder.Group opGroup, boolean assumeMissing) { try { diff --git a/src/java/org/apache/cassandra/db/monitoring/Monitorable.java b/src/java/org/apache/cassandra/db/monitoring/Monitorable.java index 10bd10438aa5..4288a667de3f 100644 --- a/src/java/org/apache/cassandra/db/monitoring/Monitorable.java +++ b/src/java/org/apache/cassandra/db/monitoring/Monitorable.java @@ -20,6 +20,8 @@ public interface Monitorable { + Monitorable NO_OP = new NoOp(); + String name(); long creationTimeNanos(); long timeoutNanos(); @@ -33,4 +35,76 @@ public interface Monitorable boolean abort(); boolean complete(); + + default String monitoredOnKeyspace() { return null; }; + default String monitoredOnTable() { return null; }; + + class NoOp implements Monitorable + { + @Override + public String name() + { + return null; + } + + @Override + public long creationTimeNanos() + { + return 0; + } + + @Override + public long timeoutNanos() + { + return 0; + } + + @Override + public long slowTimeoutNanos() + { + return 0; + } + + @Override + public boolean isInProgress() + { + return false; + } + + @Override + public boolean isAborted() + { + return false; + } + + @Override + public boolean isCompleted() + { + return false; + } + + @Override + public boolean isSlow() + { + return false; + } + + @Override + public boolean isCrossNode() + { + return false; + } + + @Override + public boolean abort() + { + return false; + } + + @Override + public boolean complete() + { + return false; + } + } } diff --git a/src/java/org/apache/cassandra/db/monitoring/MonitoringTask.java b/src/java/org/apache/cassandra/db/monitoring/MonitoringTask.java index 243569910b8a..4d6d995c77bf 100644 --- a/src/java/org/apache/cassandra/db/monitoring/MonitoringTask.java +++ b/src/java/org/apache/cassandra/db/monitoring/MonitoringTask.java @@ -19,6 +19,7 @@ package org.apache.cassandra.db.monitoring; import java.util.Arrays; +import java.util.Collection; import java.util.Collections; import java.util.HashMap; import java.util.List; @@ -32,9 +33,19 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.fasterxml.jackson.annotation.JsonIgnore; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.fasterxml.jackson.annotation.JsonSubTypes; +import com.fasterxml.jackson.annotation.JsonTypeInfo; +import com.fasterxml.jackson.core.type.TypeReference; import org.apache.cassandra.concurrent.ScheduledExecutors; import org.apache.cassandra.config.CassandraRelevantProperties; +import org.apache.cassandra.utils.Clock; +import org.apache.cassandra.utils.JsonUtils; import org.apache.cassandra.utils.NoSpamLogger; +import org.apache.cassandra.utils.logging.LoggingSupport; +import org.apache.cassandra.utils.logging.LoggingSupportFactory; +import org.apache.cassandra.utils.logging.SlowQueriesAppender; import static java.util.concurrent.TimeUnit.NANOSECONDS; import static org.apache.cassandra.config.CassandraRelevantProperties.MONITORING_MAX_OPERATIONS; @@ -47,8 +58,9 @@ * We also log timed out operations, see CASSANDRA-7392. * Since CASSANDRA-12403 we also log queries that were slow. */ -class MonitoringTask +public class MonitoringTask { + private static final String SLOW_OPERATIONS_LOGGER_NAME = "slow_queries"; private static final String LINE_SEPARATOR = CassandraRelevantProperties.LINE_SEPARATOR.getString(); private static final Logger logger = LoggerFactory.getLogger(MonitoringTask.class); private static final NoSpamLogger noSpamLogger = NoSpamLogger.getLogger(logger, 5L, TimeUnit.MINUTES); @@ -70,6 +82,8 @@ class MonitoringTask private final ScheduledFuture reportingTask; private final OperationsQueue failedOperationsQueue; private final OperationsQueue slowOperationsQueue; + private Logger slowOperationsLogger = logger; + private boolean slowOperationsLoggedToVirtualTable; private long approxLastLogTimeNanos; @@ -97,6 +111,15 @@ private MonitoringTask(int reportIntervalMillis, int maxOperations) reportIntervalMillis, reportIntervalMillis, TimeUnit.MILLISECONDS); + + LoggingSupport support = LoggingSupportFactory.getLoggingSupport(); + if (support.getLogger(SLOW_OPERATIONS_LOGGER_NAME).isPresent()) + { + if (support.getAppender(SlowQueriesAppender.class, SlowQueriesAppender.APPENDER_NAME).isPresent()) + slowOperationsLoggedToVirtualTable = true; + + slowOperationsLogger = LoggerFactory.getLogger(SLOW_OPERATIONS_LOGGER_NAME); + } } public void cancel() @@ -169,14 +192,30 @@ boolean logSlowOperations(long approxCurrentTimeNanos) if (!slowOperations.isEmpty()) { long approxElapsedNanos = approxCurrentTimeNanos - approxLastLogTimeNanos; - noSpamLogger.info("Some operations were slow, details available at debug level (debug.log)"); + noSpamLogger.info("Some operations were slow, details available at debug level (debug.log) or " + + "system_views.slow_queries virtual table (when enabled)."); + + if (slowOperationsLogger.isDebugEnabled()) + { + if (slowOperationsLoggedToVirtualTable) + { + // This is the crux of the patch for appending to vtable. + // Because we can send only Strings to debug method (or objects, on which toString() + // would be eventually called), we need to log a string in such a way that we can + // get Operation object(s) back "on the other side" when dealing with vtables and custom appenders + // as appenders work with LoggingEvent where message is just a string. + // It would be very hard / tricky / error-prone to parse customly crafted log message + // which appears in logs when no vtable appender is used. + slowOperationsLogger.debug(Operation.serialize(slowOperations.getOperations())); + } + else + slowOperationsLogger.debug("{} operations were slow in the last {} msecs:{}{}", + slowOperations.num(), + NANOSECONDS.toMillis(approxElapsedNanos), + LINE_SEPARATOR, + slowOperations.getLogMessage()); + } - if (logger.isDebugEnabled()) - logger.debug("{} operations were slow in the last {} msecs:{}{}", - slowOperations.num(), - NANOSECONDS.toMillis(approxElapsedNanos), - LINE_SEPARATOR, - slowOperations.getLogMessage()); return true; } return false; @@ -274,6 +313,12 @@ public long num() return operations.size() + numDropped; } + private Collection getOperations() + { + return operations.values(); + } + + @JsonIgnore String getLogMessage() { if (isEmpty()) @@ -307,9 +352,16 @@ private static void addOperation(StringBuilder ret, Operation operation) * same name (CQL query text) is reported and store the average, min and max * times. */ - protected abstract static class Operation + @JsonTypeInfo(use = JsonTypeInfo.Id.CLASS, property = "id") + @JsonSubTypes({ @JsonSubTypes.Type(value = SlowOperation.class) }) + @VisibleForTesting + public abstract static class Operation { + @JsonProperty + String id = getClass().getName(); + /** The operation that was reported as slow or timed out */ + @JsonIgnore final Monitorable operation; /** The number of times the operation was reported */ @@ -319,24 +371,50 @@ protected abstract static class Operation long totalTimeNanos; /** The maximum time spent by this operation */ - long maxTime; + long maxTimeNanos; /** The minimum time spent by this operation */ - long minTime; + long minTimeNanos; /** The name of the operation, i.e. the SELECT query CQL, * this is set lazily as it takes time to build the query CQL */ private String name; + /** + * creation time of this Operation object, in ms, + * this is different from operation's creationTimeNanos + * which does not follow wall clock and is useless for + * reporting purposes e.g. in virtual tables + */ + private final long timestampMs; + + // optional keyspace and table this operation acts on + // used upon deserialization + private String keyspace; + private String table; + private boolean crossNode; + Operation(Monitorable operation, long failedAtNanos) { this.operation = operation; numTimesReported = 1; totalTimeNanos = failedAtNanos - operation.creationTimeNanos(); - minTime = totalTimeNanos; - maxTime = totalTimeNanos; + minTimeNanos = totalTimeNanos; + maxTimeNanos = totalTimeNanos; + timestampMs = Clock.Global.currentTimeMillis() - (Clock.Global.nanoTime() - operation.creationTimeNanos()) / 1_000_000; + } + + void add(Operation operation) + { + numTimesReported++; + totalTimeNanos += operation.totalTimeNanos; + maxTimeNanos = Math.max(maxTimeNanos, operation.maxTimeNanos); + minTimeNanos = Math.min(minTimeNanos, operation.minTimeNanos); } + public abstract String getLogMessage(); + + @JsonProperty public String name() { if (name == null) @@ -344,15 +422,96 @@ public String name() return name; } - void add(Operation operation) + @JsonProperty + public String keyspace() { - numTimesReported++; - totalTimeNanos += operation.totalTimeNanos; - maxTime = Math.max(maxTime, operation.maxTime); - minTime = Math.min(minTime, operation.minTime); + if (operation != null) + { + String monitored = operation.monitoredOnKeyspace(); + if (monitored != null) + return monitored; + } + return keyspace; } - public abstract String getLogMessage(); + public void setKeyspace(String keyspace) + { + this.keyspace = keyspace; + } + + public void setTable(String table) + { + this.table = table; + } + + @JsonProperty + public String table() + { + if (operation != null) + { + String monitored = operation.monitoredOnTable(); + if (monitored != null) + return monitored; + } + return table; + } + + @JsonProperty + public boolean isCrossNode() + { + if (operation != null) + return operation.isCrossNode(); + + return crossNode; + } + + @JsonProperty + public int numTimesReported() + { + return numTimesReported; + } + + @JsonProperty + public long totalTimeNanos() + { + return totalTimeNanos; + } + + @JsonProperty + public long maxTimeNanos() + { + return maxTimeNanos; + } + + @JsonProperty + public long minTimeNanos() + { + return minTimeNanos; + } + + @JsonIgnore + public long averageTime() + { + return totalTimeNanos / numTimesReported; + } + + @JsonProperty + public long timestampMs() + { + return timestampMs; + } + + public static String serialize(Collection operations) + { + return JsonUtils.writeAsJsonString(operations); + } + + private static final TypeReference> TYPE_REFERENCE = new TypeReference<>() {}; + + public static List deserialize(String message) throws Throwable + { + return JsonUtils.JSON_OBJECT_MAPPER.readValue(message, TYPE_REFERENCE); + } } /** @@ -378,8 +537,8 @@ public String getLogMessage() name(), numTimesReported, NANOSECONDS.toMillis(totalTimeNanos / numTimesReported), - NANOSECONDS.toMillis(minTime), - NANOSECONDS.toMillis(maxTime), + NANOSECONDS.toMillis(minTimeNanos), + NANOSECONDS.toMillis(maxTimeNanos), NANOSECONDS.toMillis(operation.timeoutNanos()), operation.isCrossNode() ? "msec/cross-node" : "msec"); } @@ -388,13 +547,21 @@ public String getLogMessage() /** * An operation (query) that was reported as slow. */ - private final static class SlowOperation extends Operation + @VisibleForTesting + public final static class SlowOperation extends Operation { - SlowOperation(Monitorable operation, long failedAt) + // purely for deserialization purposes + public SlowOperation() + { + this(Monitorable.NO_OP, 0); + } + + public SlowOperation(Monitorable operation, long failedAt) { super(operation, failedAt); } + @JsonIgnore public String getLogMessage() { if (numTimesReported == 1) @@ -408,8 +575,8 @@ public String getLogMessage() name(), numTimesReported, NANOSECONDS.toMillis(totalTimeNanos/ numTimesReported), - NANOSECONDS.toMillis(minTime), - NANOSECONDS.toMillis(maxTime), + NANOSECONDS.toMillis(minTimeNanos), + NANOSECONDS.toMillis(maxTimeNanos), NANOSECONDS.toMillis(operation.slowTimeoutNanos()), operation.isCrossNode() ? "msec/cross-node" : "msec"); } diff --git a/src/java/org/apache/cassandra/db/partitions/AbstractBTreePartition.java b/src/java/org/apache/cassandra/db/partitions/AbstractBTreePartition.java index 33272375733f..923ef54ab621 100644 --- a/src/java/org/apache/cassandra/db/partitions/AbstractBTreePartition.java +++ b/src/java/org/apache/cassandra/db/partitions/AbstractBTreePartition.java @@ -24,12 +24,32 @@ import com.google.common.collect.Iterators; -import org.apache.cassandra.schema.TableMetadata; -import org.apache.cassandra.db.*; +import org.apache.cassandra.db.Clustering; +import org.apache.cassandra.db.ClusteringBound; +import org.apache.cassandra.db.DecoratedKey; +import org.apache.cassandra.db.DeletionInfo; +import org.apache.cassandra.db.DeletionTime; +import org.apache.cassandra.db.MutableDeletionInfo; +import org.apache.cassandra.db.RangeTombstone; +import org.apache.cassandra.db.RegularAndStaticColumns; +import org.apache.cassandra.db.Slice; +import org.apache.cassandra.db.Slices; import org.apache.cassandra.db.filter.ColumnFilter; -import org.apache.cassandra.db.rows.*; +import org.apache.cassandra.db.rows.AbstractUnfilteredRowIterator; +import org.apache.cassandra.db.rows.BTreeRow; +import org.apache.cassandra.db.rows.EncodingStats; +import org.apache.cassandra.db.rows.RangeTombstoneMarker; +import org.apache.cassandra.db.rows.Row; +import org.apache.cassandra.db.rows.RowAndDeletionMergeIterator; +import org.apache.cassandra.db.rows.RowIterator; +import org.apache.cassandra.db.rows.Rows; +import org.apache.cassandra.db.rows.Unfiltered; +import org.apache.cassandra.db.rows.UnfilteredRowIterator; +import org.apache.cassandra.db.rows.UnfilteredRowIterators; +import org.apache.cassandra.schema.TableMetadata; import org.apache.cassandra.utils.SearchIterator; import org.apache.cassandra.utils.btree.BTree; +import org.apache.cassandra.utils.btree.BTree.Dir; import static org.apache.cassandra.utils.btree.BTree.Dir.desc; @@ -386,10 +406,10 @@ public String toString(boolean includeFullDetails) @Override public boolean equals(Object obj) { - if (!(obj instanceof PartitionUpdate)) + if (!(obj instanceof AbstractBTreePartition)) return false; - PartitionUpdate that = (PartitionUpdate) obj; + AbstractBTreePartition that = (AbstractBTreePartition) obj; BTreePartitionData a = this.holder(), b = that.holder(); return partitionKey.equals(that.partitionKey) && metadata().id.equals(that.metadata().id) @@ -403,9 +423,15 @@ public int rowCount() return BTree.size(holder().tree); } + @Override public Iterator iterator() { - return BTree.iterator(holder().tree); + return iterator(false); + } + + public Iterator iterator(boolean reverse) + { + return BTree.iterator(holder().tree, reverse ? Dir.DESC : Dir.ASC); } public Row lastRow() diff --git a/src/java/org/apache/cassandra/db/partitions/AtomicBTreePartition.java b/src/java/org/apache/cassandra/db/partitions/AtomicBTreePartition.java index c9035befbde5..994ef1ac7b90 100644 --- a/src/java/org/apache/cassandra/db/partitions/AtomicBTreePartition.java +++ b/src/java/org/apache/cassandra/db/partitions/AtomicBTreePartition.java @@ -25,12 +25,17 @@ import com.google.common.annotations.VisibleForTesting; import org.apache.cassandra.index.transactions.UpdateTransaction; -import org.apache.cassandra.schema.TableMetadata; -import org.apache.cassandra.schema.TableMetadataRef; + import org.apache.cassandra.config.DatabaseDescriptor; -import org.apache.cassandra.db.*; +import org.apache.cassandra.db.Clustering; +import org.apache.cassandra.db.DecoratedKey; +import org.apache.cassandra.db.DeletionInfo; +import org.apache.cassandra.db.Slices; import org.apache.cassandra.db.filter.ColumnFilter; -import org.apache.cassandra.db.rows.*; +import org.apache.cassandra.db.rows.Row; +import org.apache.cassandra.db.rows.UnfilteredRowIterator; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.schema.TableMetadataRef; import org.apache.cassandra.utils.ObjectSizes; import org.apache.cassandra.utils.concurrent.OpOrder; import org.apache.cassandra.utils.memory.Cloner; @@ -223,9 +228,9 @@ public UnfilteredRowIterator unfilteredIterator(BTreePartitionData current, Colu } @Override - public Iterator iterator() + public Iterator iterator(boolean reverse) { - return allocator.ensureOnHeap().applyToPartition(super.iterator()); + return allocator.ensureOnHeap().applyToPartition(super.iterator(reverse)); } private boolean shouldLock(OpOrder.Group writeOp) diff --git a/src/java/org/apache/cassandra/db/partitions/FilteredPartition.java b/src/java/org/apache/cassandra/db/partitions/FilteredPartition.java index d7a0171d9a20..a66781dd7ef0 100644 --- a/src/java/org/apache/cassandra/db/partitions/FilteredPartition.java +++ b/src/java/org/apache/cassandra/db/partitions/FilteredPartition.java @@ -19,11 +19,13 @@ import java.util.Iterator; -import org.apache.cassandra.schema.TableMetadata; import org.apache.cassandra.db.DecoratedKey; import org.apache.cassandra.db.DeletionInfo; import org.apache.cassandra.db.RegularAndStaticColumns; -import org.apache.cassandra.db.rows.*; +import org.apache.cassandra.db.rows.Row; +import org.apache.cassandra.db.rows.RowIterator; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.utils.btree.BTree; public class FilteredPartition extends ImmutableBTreePartition { @@ -43,9 +45,14 @@ public static FilteredPartition create(RowIterator iterator) return new FilteredPartition(iterator); } - public RowIterator rowIterator() + public Row getAtIdx(int idx) + { + return BTree.findByIndex(holder.tree, idx); + } + + public RowIterator rowIterator(boolean reverse) { - final Iterator iter = iterator(); + final Iterator iter = iterator(reverse); return new RowIterator() { public TableMetadata metadata() @@ -55,7 +62,7 @@ public TableMetadata metadata() public boolean isReverseOrder() { - return false; + return reverse; } public RegularAndStaticColumns columns() diff --git a/src/java/org/apache/cassandra/db/partitions/Partition.java b/src/java/org/apache/cassandra/db/partitions/Partition.java index 8888104d95fe..601934a8e714 100644 --- a/src/java/org/apache/cassandra/db/partitions/Partition.java +++ b/src/java/org/apache/cassandra/db/partitions/Partition.java @@ -37,6 +37,7 @@ public interface Partition { public TableMetadata metadata(); + public DecoratedKey partitionKey(); public DeletionTime partitionLevelDeletion(); diff --git a/src/java/org/apache/cassandra/db/partitions/PartitionIterators.java b/src/java/org/apache/cassandra/db/partitions/PartitionIterators.java index b8a86d5a1aa2..bf65b3392a23 100644 --- a/src/java/org/apache/cassandra/db/partitions/PartitionIterators.java +++ b/src/java/org/apache/cassandra/db/partitions/PartitionIterators.java @@ -17,16 +17,16 @@ */ package org.apache.cassandra.db.partitions; -import java.util.*; +import java.util.List; import org.apache.cassandra.db.EmptyIterators; +import org.apache.cassandra.db.SinglePartitionReadQuery; +import org.apache.cassandra.db.rows.RowIterator; +import org.apache.cassandra.db.rows.RowIterators; import org.apache.cassandra.db.transform.MorePartitions; import org.apache.cassandra.db.transform.Transformation; import org.apache.cassandra.utils.AbstractIterator; -import org.apache.cassandra.db.SinglePartitionReadQuery; -import org.apache.cassandra.db.rows.*; - public abstract class PartitionIterators { private PartitionIterators() {} @@ -57,7 +57,7 @@ public void onPartitionClose() return Transformation.apply(toReturn, new Close()); } - public static PartitionIterator concat(final List iterators) + public static PartitionIterator concat(final List iterators) { if (iterators.size() == 1) return iterators.get(0); @@ -93,21 +93,6 @@ public static void consume(PartitionIterator iterator) } } - /** - * Consumes all rows in the next partition of the provided partition iterator. - */ - public static void consumeNext(PartitionIterator iterator) - { - if (iterator.hasNext()) - { - try (RowIterator partition = iterator.next()) - { - while (partition.hasNext()) - partition.next(); - } - } - } - /** * Wraps the provided iterator so it logs the returned rows for debugging purposes. *

diff --git a/src/java/org/apache/cassandra/db/partitions/PartitionUpdate.java b/src/java/org/apache/cassandra/db/partitions/PartitionUpdate.java index 00f26451c1a3..2b776e31c098 100644 --- a/src/java/org/apache/cassandra/db/partitions/PartitionUpdate.java +++ b/src/java/org/apache/cassandra/db/partitions/PartitionUpdate.java @@ -24,17 +24,46 @@ import java.util.HashSet; import java.util.List; import java.util.Set; +import javax.annotation.Nonnull; import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Function; import com.google.common.collect.Iterables; import com.google.common.collect.Lists; import com.google.common.primitives.Ints; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.cassandra.db.*; +import org.apache.cassandra.db.Clustering; +import org.apache.cassandra.db.Columns; +import org.apache.cassandra.db.CounterMutation; +import org.apache.cassandra.db.DecoratedKey; +import org.apache.cassandra.db.DeletionInfo; +import org.apache.cassandra.db.DeletionTime; +import org.apache.cassandra.db.MutableDeletionInfo; +import org.apache.cassandra.db.Mutation; +import org.apache.cassandra.db.RangeTombstone; +import org.apache.cassandra.db.RegularAndStaticColumns; +import org.apache.cassandra.db.SerializationHeader; +import org.apache.cassandra.db.SimpleBuilders; +import org.apache.cassandra.db.Slices; import org.apache.cassandra.db.filter.ColumnFilter; -import org.apache.cassandra.db.rows.*; +import org.apache.cassandra.db.rows.BTreeRow; +import org.apache.cassandra.db.rows.Cell; +import org.apache.cassandra.db.rows.CellPath; +import org.apache.cassandra.db.rows.ColumnData; +import org.apache.cassandra.db.rows.ComplexColumnData; +import org.apache.cassandra.db.rows.DeserializationHelper; +import org.apache.cassandra.db.rows.EncodingStats; +import org.apache.cassandra.db.rows.RangeTombstoneMarker; +import org.apache.cassandra.db.rows.Row; +import org.apache.cassandra.db.rows.RowIterator; +import org.apache.cassandra.db.rows.RowIterators; +import org.apache.cassandra.db.rows.Rows; +import org.apache.cassandra.db.rows.Unfiltered; +import org.apache.cassandra.db.rows.UnfilteredRowIterator; +import org.apache.cassandra.db.rows.UnfilteredRowIteratorSerializer; +import org.apache.cassandra.db.rows.UnfilteredRowIterators; import org.apache.cassandra.exceptions.CoordinatorBehindException; import org.apache.cassandra.exceptions.UnknownTableException; import org.apache.cassandra.index.IndexRegistry; @@ -43,18 +72,21 @@ import org.apache.cassandra.io.util.DataOutputBuffer; import org.apache.cassandra.io.util.DataOutputPlus; import org.apache.cassandra.metrics.TCMMetrics; -import org.apache.cassandra.tcm.ClusterMetadata; -import org.apache.cassandra.tcm.Epoch; import org.apache.cassandra.net.MessagingService; import org.apache.cassandra.schema.ColumnMetadata; import org.apache.cassandra.schema.Schema; import org.apache.cassandra.schema.TableId; import org.apache.cassandra.schema.TableMetadata; import org.apache.cassandra.service.ClientState; +import org.apache.cassandra.service.accord.api.PartitionKey; +import org.apache.cassandra.service.accord.serializers.TableMetadatas; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.tcm.Epoch; import org.apache.cassandra.utils.btree.BTree; import org.apache.cassandra.utils.btree.UpdateFunction; import org.apache.cassandra.utils.vint.VIntCoding; +import static org.apache.cassandra.db.SerializationHeader.StableHeaderSerializer.STABLE; import static org.apache.cassandra.db.rows.UnfilteredRowIteratorSerializer.IS_EMPTY; /** @@ -171,6 +203,30 @@ public static PartitionUpdate singleRowUpdate(TableMetadata metadata, DecoratedK return singleRowUpdate(metadata, key, row.isStatic() ? null : row, row.isStatic() ? row : null); } + /** + * Creates an immutable partition update that contains a single row update. + * + * @param metadata the metadata for the created update. + * @param key the partition key for the partition to update. + * @param rows the rows for the update (may not be static). + * + * @return the newly created partition update containing only {@code row}. + */ + public static PartitionUpdate multiRowUpdate(TableMetadata metadata, DecoratedKey key, List rows) + { + if (rows.isEmpty()) + return emptyUpdate(metadata, key); + MutableDeletionInfo deletionInfo = MutableDeletionInfo.live(); + Columns columns = Columns.NONE; + for (Row row : rows) + columns = columns.mergeTo(Columns.from(row)); + + BTreePartitionData holder = new BTreePartitionData(new RegularAndStaticColumns(Columns.NONE, columns), + BTree.build(rows), deletionInfo, Rows.EMPTY_STATIC_ROW, + EncodingStats.NO_STATS); + return new PartitionUpdate(metadata, metadata.epoch, key, holder, deletionInfo, false); + } + /** * Creates an immutable partition update that contains a single row update. * @@ -249,7 +305,8 @@ public PartitionUpdate withOnlyPresentColumns() } - protected boolean canHaveShadowedData() + @Override + public boolean canHaveShadowedData() { return canHaveShadowedData; } @@ -586,6 +643,15 @@ public static PartitionUpdate unsafeConstruct(TableMetadata metadata, return new PartitionUpdate(metadata, metadata.epoch, key, holder, deletionInfo, canHaveShadowedData); } + @Override + public boolean equals(Object obj) + { + if (!(obj instanceof PartitionUpdate)) + return false; + + return super.equals(obj); + } + /** * Interface for building partition updates geared towards human. *

@@ -748,6 +814,17 @@ public void serialize(PartitionUpdate update, DataOutputPlus out, int version) t } } + public void serializeWithoutKey(PartitionUpdate update, TableMetadatas tables, DataOutputPlus out, int version) throws IOException + { + try (UnfilteredRowIterator iter = update.unfilteredIterator()) + { + tables.serialize(update.metadata, out); + Epoch.serializer.serialize(update.metadata.epoch, out); + SerializationHeader header = new SerializationHeader(false, update.metadata, iter.columns(), iter.stats()); + UnfilteredRowIteratorSerializer.serializer.serializeWithoutKey(iter, header, out, version, update.rowCount(), STABLE, null); + } + } + public PartitionUpdate deserialize(DataInputPlus in, int version, DeserializationHelper.Flag flag) throws IOException { TableId tableId = TableId.deserialize(in); @@ -771,6 +848,19 @@ public PartitionUpdate deserialize(DataInputPlus in, int version, Deserializatio throw e; } UnfilteredRowIteratorSerializer.Header header = UnfilteredRowIteratorSerializer.serializer.deserializeHeader(tableMetadata, null, in, version, flag); + return deserialize(header, remoteVersion, tableMetadata, in, version, flag); + } + + public PartitionUpdate deserialize(PartitionKey key, TableMetadatas tables, DataInputPlus in, int version, DeserializationHelper.Flag flag) throws IOException + { + TableMetadata tableMetadata = tables.deserialize(in); + Epoch remoteVersion = Epoch.serializer.deserialize(in); + UnfilteredRowIteratorSerializer.Header header = UnfilteredRowIteratorSerializer.serializer.deserializeHeaderWithoutKey(tableMetadata, key.partitionKey(), in, version, flag, STABLE, null); + return deserialize(header, remoteVersion, tableMetadata, in, version, flag); + } + + private PartitionUpdate deserialize(UnfilteredRowIteratorSerializer.Header header, Epoch remoteVersion, TableMetadata tableMetadata, DataInputPlus in, int version, DeserializationHelper.Flag flag) throws IOException + { if (header.isEmpty) return emptyUpdate(tableMetadata, header.key); @@ -833,6 +923,18 @@ public long serializedSize(PartitionUpdate update, int version) + UnfilteredRowIteratorSerializer.serializer.serializedSize(iter, null, version, update.rowCount()); } } + + public long serializedSizeWithoutKey(PartitionUpdate update, TableMetadatas tables, int version) + { + try (UnfilteredRowIterator iter = update.unfilteredIterator()) + { + long size = tables.serializedSize(update.metadata); + size += Epoch.serializer.serializedSize(update.metadata.epoch); + + SerializationHeader header = new SerializationHeader(false, update.metadata, iter.columns(), iter.stats()); + return size + UnfilteredRowIteratorSerializer.serializer.serializedSizeWithoutKey(iter, header, version, update.rowCount(), STABLE, null); + } + } } /** @@ -914,6 +1016,15 @@ public Builder(TableMetadata metadata, this(metadata, key, columns, initialRowCapacity, canHaveShadowedData, Rows.EMPTY_STATIC_ROW, MutableDeletionInfo.live(), BTree.empty()); } + public Builder(TableMetadata metadata, + DecoratedKey key, + RegularAndStaticColumns columns, + Row staticRow, + int initialRowCapacity) + { + this(metadata, key, columns, initialRowCapacity, true, staticRow, MutableDeletionInfo.live(), BTree.empty()); + } + private Builder(TableMetadata metadata, DecoratedKey key, RegularAndStaticColumns columns, @@ -1090,6 +1201,14 @@ public Builder updateAllTimestamp(long newTimestamp) return this; } + public Builder updateTimesAndPathsForAccord(@Nonnull Function cellToMaybeNewListPath, long newTimestamp, long newLocalDeletionTime) + { + deletionInfo.updateAllTimestampAndLocalDeletionTime(newTimestamp - 1, newLocalDeletionTime); + tree = BTree.transformAndFilter(tree, (x) -> x.updateTimesAndPathsForAccord(cellToMaybeNewListPath, newTimestamp, newLocalDeletionTime)); + staticRow = this.staticRow.updateTimesAndPathsForAccord(cellToMaybeNewListPath, newTimestamp, newLocalDeletionTime); + return this; + } + @Override public String toString() { @@ -1103,6 +1222,5 @@ public String toString() ", isBuilt=" + isBuilt + '}'; } - } } diff --git a/src/java/org/apache/cassandra/db/partitions/UnfilteredPartitionIterators.java b/src/java/org/apache/cassandra/db/partitions/UnfilteredPartitionIterators.java index e68603c9f3dd..fd7880e367e4 100644 --- a/src/java/org/apache/cassandra/db/partitions/UnfilteredPartitionIterators.java +++ b/src/java/org/apache/cassandra/db/partitions/UnfilteredPartitionIterators.java @@ -19,11 +19,21 @@ import java.io.IOError; import java.io.IOException; -import java.util.*; - -import org.apache.cassandra.db.*; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.List; +import java.util.NoSuchElementException; + +import org.apache.cassandra.db.DecoratedKey; +import org.apache.cassandra.db.Digest; +import org.apache.cassandra.db.EmptyIterators; +import org.apache.cassandra.db.SinglePartitionReadCommand; import org.apache.cassandra.db.filter.ColumnFilter; -import org.apache.cassandra.db.rows.*; +import org.apache.cassandra.db.rows.DeserializationHelper; +import org.apache.cassandra.db.rows.LazilyInitializedUnfilteredRowIterator; +import org.apache.cassandra.db.rows.UnfilteredRowIterator; +import org.apache.cassandra.db.rows.UnfilteredRowIteratorSerializer; +import org.apache.cassandra.db.rows.UnfilteredRowIterators; import org.apache.cassandra.db.transform.FilteredPartitions; import org.apache.cassandra.db.transform.MorePartitions; import org.apache.cassandra.db.transform.Transformation; diff --git a/src/java/org/apache/cassandra/db/repair/CassandraTableRepairManager.java b/src/java/org/apache/cassandra/db/repair/CassandraTableRepairManager.java index 3b6535752e6f..bf5ab5a0e1dc 100644 --- a/src/java/org/apache/cassandra/db/repair/CassandraTableRepairManager.java +++ b/src/java/org/apache/cassandra/db/repair/CassandraTableRepairManager.java @@ -87,7 +87,9 @@ public synchronized void snapshot(String name, Collection> ranges, !sstable.metadata().isIndex() && // exclude SSTables from 2i new Bounds<>(sstable.getFirst().getToken(), sstable.getLast().getToken()).intersects(ranges); - SnapshotOptions options = SnapshotOptions.systemSnapshot(name, SnapshotType.REPAIR, predicate, cfs.getKeyspaceTableName()).ephemeral().build(); + SnapshotOptions options = SnapshotOptions.systemSnapshot(name, SnapshotType.REPAIR, predicate, cfs.getKeyspaceTableName()) + .ephemeral() + .build(); SnapshotManager.instance.takeSnapshot(options); } }).get(); diff --git a/src/java/org/apache/cassandra/db/repair/CassandraValidationIterator.java b/src/java/org/apache/cassandra/db/repair/CassandraValidationIterator.java index 5d4d88ed9efc..a46cbe58ca92 100644 --- a/src/java/org/apache/cassandra/db/repair/CassandraValidationIterator.java +++ b/src/java/org/apache/cassandra/db/repair/CassandraValidationIterator.java @@ -52,11 +52,11 @@ import org.apache.cassandra.io.sstable.ISSTableScanner; import org.apache.cassandra.io.sstable.format.SSTableReader; import org.apache.cassandra.metrics.TopPartitionTracker; +import org.apache.cassandra.repair.NoSuchRepairSessionException; import org.apache.cassandra.repair.SharedContext; import org.apache.cassandra.repair.ValidationPartitionIterator; import org.apache.cassandra.schema.TableMetadata; import org.apache.cassandra.service.ActiveRepairService; -import org.apache.cassandra.repair.NoSuchRepairSessionException; import org.apache.cassandra.service.snapshot.SnapshotManager; import org.apache.cassandra.service.snapshot.TableSnapshot; import org.apache.cassandra.utils.TimeUUID; @@ -161,7 +161,6 @@ else if (isIncremental) } private final ColumnFamilyStore cfs; - private final SharedContext ctx; private final Refs sstables; private final String snapshotName; private final boolean isGlobalSnapshotValidation; @@ -179,7 +178,6 @@ else if (isIncremental) public CassandraValidationIterator(ColumnFamilyStore cfs, SharedContext ctx, Collection> ranges, TimeUUID parentId, TimeUUID sessionID, boolean isIncremental, long nowInSec, boolean dontPurgeTombstones, TopPartitionTracker.Collector topPartitionCollector) throws IOException, NoSuchRepairSessionException { this.cfs = cfs; - this.ctx = ctx; isGlobalSnapshotValidation = SnapshotManager.instance.exists(cfs.getKeyspaceName(), cfs.getTableName(), parentId.toString()); if (isGlobalSnapshotValidation) diff --git a/src/java/org/apache/cassandra/db/rows/AbstractCell.java b/src/java/org/apache/cassandra/db/rows/AbstractCell.java index 69ca0b1c315d..0dbcfc44205a 100644 --- a/src/java/org/apache/cassandra/db/rows/AbstractCell.java +++ b/src/java/org/apache/cassandra/db/rows/AbstractCell.java @@ -19,9 +19,12 @@ import java.nio.ByteBuffer; import java.util.Objects; +import javax.annotation.Nonnull; + +import com.google.common.base.Function; -import org.apache.cassandra.db.Digest; import org.apache.cassandra.db.DeletionPurger; +import org.apache.cassandra.db.Digest; import org.apache.cassandra.db.TypeSizes; import org.apache.cassandra.db.context.CounterContext; import org.apache.cassandra.db.marshal.AbstractType; @@ -117,6 +120,20 @@ public Cell updateAllTimestamp(long newTimestamp) return new BufferCell(column, isTombstone() ? newTimestamp - 1 : newTimestamp, ttl(), localDeletionTime(), buffer(), path()); } + @Override + public ColumnData updateTimesAndPathsForAccord(@Nonnull Function cellToMaybeNewListPath, long newTimestamp, long newLocalDeletionTime) + { + long localDeletionTime = localDeletionTime() != NO_DELETION_TIME ? newLocalDeletionTime : NO_DELETION_TIME; + return new BufferCell(column, isTombstone() ? newTimestamp - 1 : newTimestamp, ttl(), localDeletionTime, buffer(), path()); + } + + @Override + public Cell updateAllTimesWithNewCellPathForComplexColumnData(@Nonnull CellPath maybeNewPath, long newTimestamp, long newLocalDeletionTime) + { + long localDeletionTime = localDeletionTime() != NO_DELETION_TIME ? newLocalDeletionTime : NO_DELETION_TIME; + return new BufferCell(column, isTombstone() ? newTimestamp - 1 : newTimestamp, ttl(), localDeletionTime, buffer(), maybeNewPath); + } + public int dataSize() { CellPath path = path(); diff --git a/src/java/org/apache/cassandra/db/rows/ArrayCell.java b/src/java/org/apache/cassandra/db/rows/ArrayCell.java index 07823d2be515..8201b6849f6d 100644 --- a/src/java/org/apache/cassandra/db/rows/ArrayCell.java +++ b/src/java/org/apache/cassandra/db/rows/ArrayCell.java @@ -32,7 +32,7 @@ public class ArrayCell extends AbstractCell { - private static final long EMPTY_SIZE = ObjectSizes.measure(new ArrayCell(ColumnMetadata.regularColumn("", "", "", ByteType.instance), 0L, 0, 0, EMPTY_BYTE_ARRAY, null)); + private static final long EMPTY_SIZE = ObjectSizes.measure(new ArrayCell(ColumnMetadata.regularColumn("", "", "", ByteType.instance, ColumnMetadata.NO_UNIQUE_ID), 0L, 0, 0, EMPTY_BYTE_ARRAY, null)); // Careful: Adding vars here has an impact on memtable size private final long timestamp; @@ -94,6 +94,12 @@ public Cell withUpdatedValue(ByteBuffer newValue) return new ArrayCell(column, timestamp, ttl, localDeletionTimeUnsignedInteger, ByteBufferUtil.getArray(newValue), path); } + @Override + public Cell withUpdatedTimestamp(long newTimestamp) + { + return new ArrayCell(column, newTimestamp, ttl, localDeletionTimeUnsignedInteger, value, path); + } + public Cell withUpdatedTimestampAndLocalDeletionTime(long newTimestamp, long newLocalDeletionTime) { return new ArrayCell(column, newTimestamp, ttl, newLocalDeletionTime, value, path); diff --git a/src/java/org/apache/cassandra/db/rows/BTreeRow.java b/src/java/org/apache/cassandra/db/rows/BTreeRow.java index 4acb20878e75..84ebdc00be5c 100644 --- a/src/java/org/apache/cassandra/db/rows/BTreeRow.java +++ b/src/java/org/apache/cassandra/db/rows/BTreeRow.java @@ -18,7 +18,6 @@ package org.apache.cassandra.db.rows; import java.nio.ByteBuffer; - import java.util.AbstractCollection; import java.util.Arrays; import java.util.Collection; @@ -28,9 +27,10 @@ import java.util.Map; import java.util.function.BiConsumer; import java.util.function.Consumer; -import java.util.function.Function; import java.util.function.Predicate; +import javax.annotation.Nonnull; +import com.google.common.base.Function; import com.google.common.collect.Collections2; import com.google.common.collect.Iterators; import com.google.common.primitives.Ints; @@ -40,15 +40,13 @@ import org.apache.cassandra.db.DeletionPurger; import org.apache.cassandra.db.DeletionTime; import org.apache.cassandra.db.LivenessInfo; +import org.apache.cassandra.db.filter.ColumnFilter; import org.apache.cassandra.db.marshal.AbstractType; import org.apache.cassandra.db.marshal.UTF8Type; -import org.apache.cassandra.schema.ColumnMetadata; -import org.apache.cassandra.schema.TableMetadata; - -import org.apache.cassandra.db.filter.ColumnFilter; import org.apache.cassandra.db.partitions.PartitionUpdate; +import org.apache.cassandra.schema.ColumnMetadata; import org.apache.cassandra.schema.DroppedColumn; - +import org.apache.cassandra.schema.TableMetadata; import org.apache.cassandra.utils.AbstractIterator; import org.apache.cassandra.utils.BiLongAccumulator; import org.apache.cassandra.utils.BulkIterator; @@ -267,7 +265,7 @@ public Deletion deletion() public Cell getCell(ColumnMetadata c) { - assert !c.isComplex(); + assert !c.isComplex(): String.format("Column %s.%s#%s", c.ksName, c.cfName, c.name); return (Cell) BTree.find(btree, ColumnMetadata.asymmetricColumnDataComparator, c); } @@ -431,7 +429,7 @@ public boolean hasInvalidDeletions() * Returns a copy of the row where all timestamps for live data have replaced by {@code newTimestamp} and * all deletion timestamp by {@code newTimestamp - 1}. * - * This exists for the Paxos path, see {@link PartitionUpdate#updateAllTimestamp} for additional details. + * This exists for the Paxos path, see {@link PartitionUpdate.Builder#updateAllTimestamp} for additional details. */ public Row updateAllTimestamp(long newTimestamp) { @@ -445,6 +443,18 @@ public Row updateAllTimestamp(long newTimestamp) return transformAndFilter(newInfo, newDeletion, (cd) -> cd.updateAllTimestamp(newTimestamp)); } + @Override + public Row updateTimesAndPathsForAccord(@Nonnull Function cellToMaybeNewListPath, long newTimestamp, long newLocalDeletionTime) + { + LivenessInfo newInfo = primaryKeyLivenessInfo.isEmpty() ? primaryKeyLivenessInfo : primaryKeyLivenessInfo.withUpdatedTimestampAndLocalDeletionTime(newTimestamp, newLocalDeletionTime); + // If the deletion is shadowable and the row has a timestamp, we'll forced the deletion timestamp to be less than the row one, so we + // should get rid of said deletion. + Deletion newDeletion = deletion.isLive() || (deletion.isShadowable() && !primaryKeyLivenessInfo.isEmpty()) + ? Deletion.LIVE + : new Deletion(DeletionTime.build(newTimestamp - 1, newLocalDeletionTime), deletion.isShadowable()); + return transformAndFilter(newInfo, newDeletion, (cd) -> cd.updateTimesAndPathsForAccord(cellToMaybeNewListPath, newTimestamp, newLocalDeletionTime)); + } + public Row withRowDeletion(DeletionTime newDeletion) { // Note that: diff --git a/src/java/org/apache/cassandra/db/rows/BufferCell.java b/src/java/org/apache/cassandra/db/rows/BufferCell.java index d6918533e868..b03fcb1a1210 100644 --- a/src/java/org/apache/cassandra/db/rows/BufferCell.java +++ b/src/java/org/apache/cassandra/db/rows/BufferCell.java @@ -32,7 +32,7 @@ public class BufferCell extends AbstractCell { - private static final long EMPTY_SIZE = ObjectSizes.measure(new BufferCell(ColumnMetadata.regularColumn("", "", "", ByteType.instance), 0L, 0, 0, ByteBufferUtil.EMPTY_BYTE_BUFFER, null)); + private static final long EMPTY_SIZE = ObjectSizes.measure(new BufferCell(ColumnMetadata.regularColumn("", "", "", ByteType.instance, ColumnMetadata.NO_UNIQUE_ID), 0L, 0, 0, ByteBufferUtil.EMPTY_BYTE_BUFFER, null)); // Careful: Adding vars here has an impact on memtable size private final long timestamp; @@ -127,6 +127,12 @@ public Cell withUpdatedValue(ByteBuffer newValue) return new BufferCell(column, timestamp, ttl, localDeletionTimeUnsignedInteger, newValue, path); } + @Override + public Cell withUpdatedTimestamp(long newTimestamp) + { + return new BufferCell(column, newTimestamp, ttl, localDeletionTimeUnsignedInteger, value, path); + } + public Cell withUpdatedTimestampAndLocalDeletionTime(long newTimestamp, long newLocalDeletionTime) { return new BufferCell(column, newTimestamp, ttl, newLocalDeletionTime, value, path); diff --git a/src/java/org/apache/cassandra/db/rows/Cell.java b/src/java/org/apache/cassandra/db/rows/Cell.java index d60fdda5a012..3ddfeae39a1f 100644 --- a/src/java/org/apache/cassandra/db/rows/Cell.java +++ b/src/java/org/apache/cassandra/db/rows/Cell.java @@ -184,6 +184,8 @@ public long localDeletionTime() public abstract Cell withUpdatedValue(ByteBuffer newValue); + public abstract Cell withUpdatedTimestamp(long newTimestamp); + public abstract Cell withUpdatedTimestampAndLocalDeletionTime(long newTimestamp, long newLocalDeletionTime); /** diff --git a/src/java/org/apache/cassandra/db/rows/Cells.java b/src/java/org/apache/cassandra/db/rows/Cells.java index 39690d50b7b8..48331a73a655 100644 --- a/src/java/org/apache/cassandra/db/rows/Cells.java +++ b/src/java/org/apache/cassandra/db/rows/Cells.java @@ -113,6 +113,16 @@ private static Cell resolveRegular(Cell left, Cell right) // would otherwise always win (unless it had an empty value), until it expired and was translated to a tombstone if (leftLocalDeletionTime != rightLocalDeletionTime) return leftLocalDeletionTime > rightLocalDeletionTime ? left : right; + + // Both cells are either tombstones or expiring at the same timestamp. If expiring and the + // TTLs differ, write the lower one -- the write is probably from a more recent + // UPDATE USING TTL AND TIMESTAMP, so select the most recent one to be deterministic and be + // closest to client intent. + if (!leftIsTombstone && left.ttl() != right.ttl()) + { + assert !rightIsTombstone; + return left.ttl() < right.ttl() ? left : right; + } } return compareValues(left, right) >= 0 ? left : right; diff --git a/src/java/org/apache/cassandra/db/rows/ColumnData.java b/src/java/org/apache/cassandra/db/rows/ColumnData.java index b9f19dc07fce..c27d9bdabcd3 100644 --- a/src/java/org/apache/cassandra/db/rows/ColumnData.java +++ b/src/java/org/apache/cassandra/db/rows/ColumnData.java @@ -18,13 +18,16 @@ package org.apache.cassandra.db.rows; import java.util.Comparator; +import javax.annotation.Nonnull; + +import com.google.common.base.Function; import org.apache.cassandra.cache.IMeasurableMemory; -import org.apache.cassandra.db.Digest; -import org.apache.cassandra.schema.ColumnMetadata; import org.apache.cassandra.db.DeletionPurger; import org.apache.cassandra.db.DeletionTime; +import org.apache.cassandra.db.Digest; import org.apache.cassandra.db.partitions.PartitionUpdate; +import org.apache.cassandra.schema.ColumnMetadata; import org.apache.cassandra.serializers.MarshalException; import org.apache.cassandra.utils.btree.BTree; import org.apache.cassandra.utils.btree.UpdateFunction; @@ -285,6 +288,19 @@ public static void digest(Digest digest, ColumnData cd) */ public abstract ColumnData updateAllTimestamp(long newTimestamp); + /** + * @param cellToMaybeNewListPath If the cell is a list append cell a new cell path is returned generated based on the Accord executeAt timestamp + */ + public abstract ColumnData updateTimesAndPathsForAccord(@Nonnull Function cellToMaybeNewListPath, long newTimestamp, long newLocalDeletionTime); + + /** + * List paths are time UUIDs that increment for each item in the list and for Accord and Paxos + * should be based on the transaction's ballot/timestamp. + * + * @param maybeNewPath If this cell is a list append for a non-frozen list (multi-cell) then it will be new path generated using the executeAt timestamp, otherwise it will be the existing path + */ + public abstract ColumnData updateAllTimesWithNewCellPathForComplexColumnData(@Nonnull CellPath maybeNewPath, long newTimestamp, long newLocalDeletionTime); + public abstract ColumnData markCounterLocalToBeCleared(); public abstract ColumnData purge(DeletionPurger purger, long nowInSec); diff --git a/src/java/org/apache/cassandra/db/rows/ComplexColumnData.java b/src/java/org/apache/cassandra/db/rows/ComplexColumnData.java index dea77413c09d..129ad69a4f7f 100644 --- a/src/java/org/apache/cassandra/db/rows/ComplexColumnData.java +++ b/src/java/org/apache/cassandra/db/rows/ComplexColumnData.java @@ -20,6 +20,7 @@ import java.nio.ByteBuffer; import java.util.Iterator; import java.util.Objects; +import javax.annotation.Nonnull; import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Function; @@ -30,6 +31,7 @@ import org.apache.cassandra.db.LivenessInfo; import org.apache.cassandra.db.filter.ColumnFilter; import org.apache.cassandra.db.marshal.ByteType; +import org.apache.cassandra.db.marshal.ListType; import org.apache.cassandra.db.marshal.SetType; import org.apache.cassandra.schema.ColumnMetadata; import org.apache.cassandra.schema.DroppedColumn; @@ -51,8 +53,8 @@ public class ComplexColumnData extends ColumnData implements Iterable> private static final long EMPTY_SIZE = ObjectSizes.measure(new ComplexColumnData(ColumnMetadata.regularColumn("", "", "", - SetType.getInstance(ByteType.instance, - true)), + SetType.getInstance(ByteType.instance, true), + ColumnMetadata.NO_UNIQUE_ID), NO_CELLS, DeletionTime.build(0, 0))); @@ -264,6 +266,24 @@ public ComplexColumnData updateAllTimestamp(long newTimestamp) return transformAndFilter(newDeletion, (cell) -> (Cell) cell.updateAllTimestamp(newTimestamp)); } + @Override + public ColumnData updateTimesAndPathsForAccord(@Nonnull Function cellToMaybeNewListPath, long newTimestamp, long newLocalDeletionTime) + { + DeletionTime newDeletion = complexDeletion.isLive() ? complexDeletion : DeletionTime.build(newTimestamp - 1, newLocalDeletionTime); + Function maybeNewListPath; + if (column.type instanceof ListType && column.type.isMultiCell()) + maybeNewListPath = cellToMaybeNewListPath; + else + maybeNewListPath = cell -> cell.path(); + return transformAndFilter(newDeletion, (cell) -> (Cell) cell.updateAllTimesWithNewCellPathForComplexColumnData(maybeNewListPath.apply(cell), newTimestamp, newLocalDeletionTime)); + } + + @Override + public ColumnData updateAllTimesWithNewCellPathForComplexColumnData(@Nonnull CellPath maybeNewPath, long newTimestamp, long newLocalDeletionTime) + { + throw new UnsupportedOperationException(); + } + public long maxTimestamp() { long timestamp = complexDeletion.markedForDeleteAt(); diff --git a/src/java/org/apache/cassandra/db/rows/NativeCell.java b/src/java/org/apache/cassandra/db/rows/NativeCell.java index b0613f33f6da..b774cb2ce989 100644 --- a/src/java/org/apache/cassandra/db/rows/NativeCell.java +++ b/src/java/org/apache/cassandra/db/rows/NativeCell.java @@ -20,7 +20,9 @@ import java.nio.ByteBuffer; import java.nio.ByteOrder; -import org.apache.cassandra.db.marshal.ByteBufferAccessor; +import org.apache.cassandra.db.marshal.AddressBasedNativeData; +import org.apache.cassandra.db.marshal.NativeAccessor; +import org.apache.cassandra.db.marshal.NativeData; import org.apache.cassandra.db.marshal.ValueAccessor; import org.apache.cassandra.schema.ColumnMetadata; import org.apache.cassandra.utils.ByteBufferUtil; @@ -28,8 +30,9 @@ import org.apache.cassandra.utils.concurrent.OpOrder; import org.apache.cassandra.utils.memory.MemoryUtil; import org.apache.cassandra.utils.memory.NativeAllocator; +import org.apache.cassandra.utils.memory.NativeEndianMemoryUtil; -public class NativeCell extends AbstractCell +public class NativeCell extends AbstractCell implements NativeData { private static final long EMPTY_SIZE = ObjectSizes.measure(new NativeCell()); @@ -101,11 +104,11 @@ public NativeCell(NativeAllocator allocator, // cellpath? : timestamp : ttl : localDeletionTime : length : : [cell path length] : [] peer = allocator.allocate((int) size, writeOp); - MemoryUtil.setByte(peer + HAS_CELLPATH, (byte)(path == null ? 0 : 1)); - MemoryUtil.setLong(peer + TIMESTAMP, timestamp); - MemoryUtil.setInt(peer + TTL, ttl); - MemoryUtil.setInt(peer + DELETION, localDeletionTimeUnsignedInteger); - MemoryUtil.setInt(peer + LENGTH, value.remaining()); + NativeEndianMemoryUtil.setByte(peer + HAS_CELLPATH, (byte)(path == null ? 0 : 1)); + NativeEndianMemoryUtil.setLong(peer + TIMESTAMP, timestamp); + NativeEndianMemoryUtil.setInt(peer + TTL, ttl); + NativeEndianMemoryUtil.setInt(peer + DELETION, localDeletionTimeUnsignedInteger); + NativeEndianMemoryUtil.setInt(peer + LENGTH, value.remaining()); MemoryUtil.setBytes(peer + VALUE, value); if (path != null) @@ -114,7 +117,7 @@ public NativeCell(NativeAllocator allocator, assert pathbuffer.order() == ByteOrder.BIG_ENDIAN; long offset = peer + VALUE + value.remaining(); - MemoryUtil.setInt(offset, pathbuffer.remaining()); + NativeEndianMemoryUtil.setInt(offset, pathbuffer.remaining()); MemoryUtil.setBytes(offset + 4, pathbuffer); } } @@ -126,28 +129,33 @@ private static long offHeapSizeWithoutPath(int length) public long timestamp() { - return MemoryUtil.getLong(peer + TIMESTAMP); + return NativeEndianMemoryUtil.getLong(peer + TIMESTAMP); } public int ttl() { - return MemoryUtil.getInt(peer + TTL); + return NativeEndianMemoryUtil.getInt(peer + TTL); } - public ByteBuffer value()// FIXME: add native accessor + public NativeData value() { - int length = MemoryUtil.getInt(peer + LENGTH); - return MemoryUtil.getByteBuffer(peer + VALUE, length, ByteOrder.BIG_ENDIAN); + return this; } - public ValueAccessor accessor() + public ByteBuffer byteBufferValue() { - return ByteBufferAccessor.instance; // FIXME: add native accessor + int length = valueSize(); + return MemoryUtil.getByteBuffer(getAddress(), length, ByteOrder.BIG_ENDIAN); + } + + public ValueAccessor accessor() + { + return NativeAccessor.instance; } public int valueSize() { - return MemoryUtil.getInt(peer + LENGTH); + return NativeEndianMemoryUtil.getInt(peer + LENGTH); } public CellPath path() @@ -155,8 +163,8 @@ public CellPath path() if (!hasPath()) return null; - long offset = peer + VALUE + MemoryUtil.getInt(peer + LENGTH); - int size = MemoryUtil.getInt(offset); + long offset = getAddress() + valueSize(); + int size = NativeEndianMemoryUtil.getInt(offset); return CellPath.create(MemoryUtil.getByteBuffer(offset + 4, size, ByteOrder.BIG_ENDIAN)); } @@ -165,17 +173,23 @@ public Cell withUpdatedValue(ByteBuffer newValue) throw new UnsupportedOperationException(); } + @Override + public Cell withUpdatedTimestamp(long newTimestamp) + { + return new BufferCell(column, newTimestamp, ttl(), localDeletionTime(), byteBufferValue(), path()); + } + public Cell withUpdatedTimestampAndLocalDeletionTime(long newTimestamp, long newLocalDeletionTime) { - return new BufferCell(column, newTimestamp, ttl(), newLocalDeletionTime, value(), path()); + return new BufferCell(column, newTimestamp, ttl(), newLocalDeletionTime, byteBufferValue(), path()); } public Cell withUpdatedColumn(ColumnMetadata column) { - return new BufferCell(column, timestamp(), ttl(), localDeletionTimeAsUnsignedInt(), value(), path()); + return new BufferCell(column, timestamp(), ttl(), localDeletionTimeAsUnsignedInt(), byteBufferValue(), path()); } - public Cell withSkippedValue() + public Cell withSkippedValue() { return new BufferCell(column, timestamp(), ttl(), localDeletionTimeAsUnsignedInt(), ByteBufferUtil.EMPTY_BYTE_BUFFER, path()); } @@ -194,20 +208,44 @@ public long unsharedHeapSizeExcludingData() public long offHeapSize() { - long size = offHeapSizeWithoutPath(MemoryUtil.getInt(peer + LENGTH)); + long size = offHeapSizeWithoutPath(NativeEndianMemoryUtil.getInt(peer + LENGTH)); if (hasPath()) - size += 4 + MemoryUtil.getInt(peer + size); + size += 4 + NativeEndianMemoryUtil.getInt(peer + size); return size; } private boolean hasPath() { - return MemoryUtil.getByte(peer+ HAS_CELLPATH) != 0; + return NativeEndianMemoryUtil.getByte(peer + HAS_CELLPATH) != 0; } @Override protected int localDeletionTimeAsUnsignedInt() { - return MemoryUtil.getInt(peer + DELETION); + return NativeEndianMemoryUtil.getInt(peer + DELETION); + } + + + @Override + public int nativeDataSize() + { + return valueSize(); + } + + @Override + public ByteBuffer asByteBuffer() + { + return byteBufferValue(); + } + @Override + public NativeData slice(int offset, int length) + { + return new AddressBasedNativeData(getAddress() + offset, length); + } + + @Override + public long getAddress() + { + return peer + VALUE; } } diff --git a/src/java/org/apache/cassandra/db/rows/Row.java b/src/java/org/apache/cassandra/db/rows/Row.java index 5e0bbaf6edf7..cb48c4ce6fe6 100644 --- a/src/java/org/apache/cassandra/db/rows/Row.java +++ b/src/java/org/apache/cassandra/db/rows/Row.java @@ -17,13 +17,26 @@ */ package org.apache.cassandra.db.rows; -import java.util.*; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.Comparator; +import java.util.Iterator; +import java.util.List; +import java.util.Objects; import java.util.function.BiConsumer; import java.util.function.Consumer; -import java.util.function.Function; +import javax.annotation.Nonnull; + +import com.google.common.base.Function; import org.apache.cassandra.cache.IMeasurableMemory; -import org.apache.cassandra.db.*; +import org.apache.cassandra.db.Clustering; +import org.apache.cassandra.db.DeletionPurger; +import org.apache.cassandra.db.DeletionTime; +import org.apache.cassandra.db.Digest; +import org.apache.cassandra.db.LivenessInfo; import org.apache.cassandra.db.filter.ColumnFilter; import org.apache.cassandra.schema.ColumnMetadata; import org.apache.cassandra.schema.TableMetadata; @@ -299,6 +312,8 @@ public interface Row extends Unfiltered, Iterable, IMeasurableMemory */ public Row updateAllTimestamp(long newTimestamp); + public Row updateTimesAndPathsForAccord(@Nonnull Function cellToMaybeNewListPath, long newTimestamp, long newLocalDeletionTime); + /** * Returns a copy of this row with the new deletion as row deletion if it is more recent * than the current row deletion. diff --git a/src/java/org/apache/cassandra/db/rows/UnfilteredRowIteratorSerializer.java b/src/java/org/apache/cassandra/db/rows/UnfilteredRowIteratorSerializer.java index 1be3d54558e2..46ca484834fb 100644 --- a/src/java/org/apache/cassandra/db/rows/UnfilteredRowIteratorSerializer.java +++ b/src/java/org/apache/cassandra/db/rows/UnfilteredRowIteratorSerializer.java @@ -29,6 +29,7 @@ import org.apache.cassandra.db.EmptyIterators; import org.apache.cassandra.db.RegularAndStaticColumns; import org.apache.cassandra.db.SerializationHeader; +import org.apache.cassandra.db.SerializationHeader.ParameterizedSerializer; import org.apache.cassandra.db.TypeSizes; import org.apache.cassandra.db.filter.ColumnFilter; import org.apache.cassandra.io.sstable.format.big.BigFormatPartitionWriter; @@ -37,6 +38,8 @@ import org.apache.cassandra.schema.TableMetadata; import org.apache.cassandra.utils.ByteBufferUtil; +import static org.apache.cassandra.db.SerializationHeader.MessagingHeaderSerializer.MESSAGING; + /** * Serialize/Deserialize an unfiltered row iterator. * @@ -90,7 +93,11 @@ public void serialize(UnfilteredRowIterator iterator, ColumnFilter selection, Da public void serialize(UnfilteredRowIterator iterator, ColumnFilter selection, DataOutputPlus out, int version, int rowEstimate) throws IOException { + serialize(iterator, out, version, rowEstimate, MESSAGING, selection); + } + public

void serialize(UnfilteredRowIterator iterator, DataOutputPlus out, int version, int rowEstimate, ParameterizedSerializer

serializer, P param) throws IOException + { SerializationHeader header = new SerializationHeader(false, iterator.metadata(), iterator.columns(), @@ -98,7 +105,7 @@ public void serialize(UnfilteredRowIterator iterator, ColumnFilter selection, Da try { - serialize(iterator, header, selection, out, version, rowEstimate); + serialize(iterator, header, out, version, rowEstimate, serializer, param); } catch (BufferOverflowException boe) { @@ -106,13 +113,22 @@ public void serialize(UnfilteredRowIterator iterator, ColumnFilter selection, Da } } - // Should only be used for the on-wire format. private void serialize(UnfilteredRowIterator iterator, SerializationHeader header, ColumnFilter selection, DataOutputPlus out, int version, int rowEstimate) throws IOException + { + serialize(iterator, header, out, version, rowEstimate, MESSAGING, selection); + } + + // Should only be used for the on-wire format. + private

void serialize(UnfilteredRowIterator iterator, SerializationHeader header, DataOutputPlus out, int version, int rowEstimate, ParameterizedSerializer

serializer, P param) throws IOException { assert !header.isForSSTable(); ByteBufferUtil.writeWithVIntLength(iterator.partitionKey().getKey(), out); + serializeWithoutKey(iterator, header, out, version, rowEstimate, serializer, param); + } + public

void serializeWithoutKey(UnfilteredRowIterator iterator, SerializationHeader header, DataOutputPlus out, int version, int rowEstimate, ParameterizedSerializer

serializer, P param) throws IOException + { int flags = 0; if (iterator.isReverseOrder()) flags |= IS_REVERSED; @@ -136,7 +152,7 @@ private void serialize(UnfilteredRowIterator iterator, SerializationHeader heade out.writeByte((byte)flags); - SerializationHeader.serializer.serializeForMessaging(header, selection, out, hasStatic); + serializer.serialize(out, header, hasStatic, param); SerializationHelper helper = new SerializationHelper(header); if (!partitionDeletion.isLive()) @@ -153,30 +169,40 @@ private void serialize(UnfilteredRowIterator iterator, SerializationHeader heade UnfilteredSerializer.serializer.writeEndOfPartition(out); } + public long serializedSize(UnfilteredRowIterator iterator, ColumnFilter selection, int version, int rowEstimate) + { + return serializedSize(iterator, version, rowEstimate, MESSAGING, selection); + } + // Please note that this consume the iterator, and as such should not be called unless we have a simple way to // recreate an iterator for both serialize and serializedSize, which is mostly only PartitionUpdate/ArrayBackedCachedPartition. - public long serializedSize(UnfilteredRowIterator iterator, ColumnFilter selection, int version, int rowEstimate) + public

long serializedSize(UnfilteredRowIterator iterator, int version, int rowEstimate, ParameterizedSerializer

serializer, P param) { SerializationHeader header = new SerializationHeader(false, iterator.metadata(), iterator.columns(), iterator.stats()); - SerializationHelper helper = new SerializationHelper(header); - assert rowEstimate >= 0; - long size = ByteBufferUtil.serializedSizeWithVIntLength(iterator.partitionKey().getKey()) - + 1; // flags + long size = ByteBufferUtil.serializedSizeWithVIntLength(iterator.partitionKey().getKey()); + return size + serializedSizeWithoutKey(iterator, header, version, rowEstimate, serializer, param); + } + // Please note that this consume the iterator, and as such should not be called unless we have a simple way to + // recreate an iterator for both serialize and serializedSize, which is mostly only PartitionUpdate/ArrayBackedCachedPartition. + public

long serializedSizeWithoutKey(UnfilteredRowIterator iterator, SerializationHeader header, int version, int rowEstimate, ParameterizedSerializer

serializer, P param) + { + long size = 1; // flags if (iterator.isEmpty()) return size; + SerializationHelper helper = new SerializationHelper(header); DeletionTime partitionDeletion = iterator.partitionLevelDeletion(); Row staticRow = iterator.staticRow(); boolean hasStatic = staticRow != Rows.EMPTY_STATIC_ROW; - size += SerializationHeader.serializer.serializedSizeForMessaging(header, selection, hasStatic); + size += serializer.serializedSize(header, hasStatic, param); if (!partitionDeletion.isLive()) size += header.deletionTimeSerializedSize(partitionDeletion); @@ -195,8 +221,18 @@ public long serializedSize(UnfilteredRowIterator iterator, ColumnFilter selectio } public Header deserializeHeader(TableMetadata metadata, ColumnFilter selection, DataInputPlus in, int version, DeserializationHelper.Flag flag) throws IOException + { + return deserializeHeader(metadata, in, version, flag, MESSAGING, selection); + } + + public

Header deserializeHeader(TableMetadata metadata, DataInputPlus in, int version, DeserializationHelper.Flag flag, ParameterizedSerializer

serializer, P param) throws IOException { DecoratedKey key = metadata.partitioner.decorateKey(ByteBufferUtil.readWithVIntLength(in)); + return deserializeHeaderWithoutKey(metadata, key, in, version, flag, serializer, param); + } + + public

Header deserializeHeaderWithoutKey(TableMetadata metadata, DecoratedKey key, DataInputPlus in, int version, DeserializationHelper.Flag flag, ParameterizedSerializer

serializer, P param) throws IOException + { int flags = in.readUnsignedByte(); boolean isReversed = (flags & IS_REVERSED) != 0; if ((flags & IS_EMPTY) != 0) @@ -209,7 +245,7 @@ public Header deserializeHeader(TableMetadata metadata, ColumnFilter selection, boolean hasStatic = (flags & HAS_STATIC_ROW) != 0; boolean hasRowEstimate = (flags & HAS_ROW_ESTIMATE) != 0; - SerializationHeader header = SerializationHeader.serializer.deserializeForMessaging(in, metadata, selection, hasStatic); + SerializationHeader header = serializer.deserialize(in, metadata, hasStatic, param); DeletionTime partitionDeletion = hasPartitionDeletion ? header.readDeletionTime(in) : DeletionTime.LIVE; diff --git a/src/java/org/apache/cassandra/db/streaming/CassandraOutgoingFile.java b/src/java/org/apache/cassandra/db/streaming/CassandraOutgoingFile.java index 7572749d37e2..dc24a3bc0fea 100644 --- a/src/java/org/apache/cassandra/db/streaming/CassandraOutgoingFile.java +++ b/src/java/org/apache/cassandra/db/streaming/CassandraOutgoingFile.java @@ -49,6 +49,7 @@ public class CassandraOutgoingFile implements OutgoingStream private final boolean shouldStreamEntireSSTable; private final StreamOperation operation; private final CassandraStreamHeader header; + private final List> ranges; public CassandraOutgoingFile(StreamOperation operation, Ref ref, List sections, List> normalizedRanges, @@ -60,6 +61,7 @@ public CassandraOutgoingFile(StreamOperation operation, Ref ref, this.ref = ref; this.estimatedKeys = estimatedKeys; this.sections = sections; + this.ranges = normalizedRanges; SSTableReader sstable = ref.get(); @@ -131,6 +133,12 @@ public int getNumFiles() return shouldStreamEntireSSTable ? header.componentManifest.components().size() : 1; } + @Override + public List> ranges() + { + return ranges; + } + @Override public long getRepairedAt() { diff --git a/src/java/org/apache/cassandra/db/streaming/CassandraStreamManager.java b/src/java/org/apache/cassandra/db/streaming/CassandraStreamManager.java index 6940f11b57fc..505bd6b9287e 100644 --- a/src/java/org/apache/cassandra/db/streaming/CassandraStreamManager.java +++ b/src/java/org/apache/cassandra/db/streaming/CassandraStreamManager.java @@ -81,9 +81,9 @@ public IncomingStream prepareIncomingStream(StreamSession session, StreamMessage } @Override - public StreamReceiver createStreamReceiver(StreamSession session, int totalStreams) + public StreamReceiver createStreamReceiver(StreamSession session, List> ranges, int totalStreams) { - return new CassandraStreamReceiver(cfs, session, totalStreams); + return new CassandraStreamReceiver(cfs, session, ranges, totalStreams); } @Override diff --git a/src/java/org/apache/cassandra/db/streaming/CassandraStreamReceiver.java b/src/java/org/apache/cassandra/db/streaming/CassandraStreamReceiver.java index 50f87c799ece..61f64ce3d0af 100644 --- a/src/java/org/apache/cassandra/db/streaming/CassandraStreamReceiver.java +++ b/src/java/org/apache/cassandra/db/streaming/CassandraStreamReceiver.java @@ -23,11 +23,14 @@ import java.util.List; import java.util.Set; +import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Preconditions; import com.google.common.collect.Iterables; + import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import accord.primitives.Ranges; import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.db.ColumnFamilyStore; import org.apache.cassandra.db.Keyspace; @@ -41,19 +44,30 @@ import org.apache.cassandra.db.rows.UnfilteredRowIterator; import org.apache.cassandra.db.view.View; import org.apache.cassandra.dht.Bounds; +import org.apache.cassandra.dht.Range; import org.apache.cassandra.dht.Token; import org.apache.cassandra.io.sstable.ISSTableScanner; import org.apache.cassandra.io.sstable.SSTable; import org.apache.cassandra.io.sstable.SSTableMultiWriter; import org.apache.cassandra.io.sstable.format.SSTableReader; +import org.apache.cassandra.service.accord.AccordService; +import org.apache.cassandra.service.accord.AccordTopology; +import org.apache.cassandra.service.accord.IAccordService; +import org.apache.cassandra.service.accord.TimeOnlyRequestBookkeeping.LatencyRequestBookkeeping; import org.apache.cassandra.streaming.IncomingStream; import org.apache.cassandra.streaming.StreamReceiver; import org.apache.cassandra.streaming.StreamSession; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.utils.CassandraVersion; import org.apache.cassandra.utils.CloseableIterator; import org.apache.cassandra.utils.Throwables; import org.apache.cassandra.utils.concurrent.Refs; +import static accord.local.durability.DurabilityService.SyncLocal.Self; +import static accord.local.durability.DurabilityService.SyncRemote.NoRemote; +import static com.google.common.base.Preconditions.checkNotNull; import static org.apache.cassandra.config.CassandraRelevantProperties.REPAIR_MUTATION_REPAIR_ROWS_PER_BATCH; +import static org.apache.cassandra.utils.Clock.Global.nanoTime; public class CassandraStreamReceiver implements StreamReceiver { @@ -74,14 +88,17 @@ public class CassandraStreamReceiver implements StreamReceiver private final boolean requiresWritePath; + private final List> ranges; + - public CassandraStreamReceiver(ColumnFamilyStore cfs, StreamSession session, int totalFiles) + public CassandraStreamReceiver(ColumnFamilyStore cfs, StreamSession session, List> ranges, int totalFiles) { this.cfs = cfs; this.session = session; // this is an "offline" transaction, as we currently manually expose the sstables once done; // this should be revisited at a later date, so that LifecycleTransaction manages all sstable state changes this.txn = LifecycleTransaction.offline(OperationType.STREAM); + this.ranges = ranges; this.sstables = new ArrayList<>(totalFiles); this.requiresWritePath = requiresWritePath(cfs); } @@ -175,7 +192,7 @@ private boolean hasCDC(ColumnFamilyStore cfs) return cfs.metadata().params.cdc; } - // returns true iif it is a cdc table and cdc on repair is enabled. + // returns true if it is a cdc table and cdc on repair is enabled. private boolean cdcRequiresWriteCommitLog(ColumnFamilyStore cfs) { return DatabaseDescriptor.isCDCOnRepairEnabled() && hasCDC(cfs); @@ -190,11 +207,12 @@ private boolean cdcRequiresWriteCommitLog(ColumnFamilyStore cfs) * For CDC-enabled tables and write path for CDC is enabled, we want to ensure that the mutations are * run through the CommitLog, so they can be archived by the CDC process on discard. */ - private boolean requiresWritePath(ColumnFamilyStore cfs) + @VisibleForTesting + boolean requiresWritePath(ColumnFamilyStore cfs) { return cdcRequiresWriteCommitLog(cfs) || cfs.streamToMemtable() - || (session.streamOperation().requiresViewBuild() && hasViews(cfs)); + || (session.streamOperation().requiresViewBuild() && hasViews(cfs) && DatabaseDescriptor.isMaterializedViewsOnRepairEnabled()); } private void sendThroughWritePath(ColumnFamilyStore cfs, Collection readers) @@ -233,6 +251,22 @@ public synchronized void finishTransaction() @Override public void finished() { + CassandraVersion minVersion = ClusterMetadata.current().directory.clusterMinVersion.cassandraVersion; + checkNotNull(minVersion, "Unable to determine minimum cluster version"); + IAccordService accordService = AccordService.instance(); + if (session.streamOperation().requiresBarrierTransaction() + && cfs.metadata().requiresAccordSupport() + && CassandraVersion.CASSANDRA_5_0.compareTo(minVersion) >= 0) + { + Ranges accordRanges = AccordTopology.toAccordRanges(cfs.getTableId(), ranges); + long startedAtNanos = nanoTime(); + long deadlineNanos = startedAtNanos + DatabaseDescriptor.getAccordRangeSyncPointTimeoutNanos(); + // TODO (expected): use the source bounds for the streams to avoid waiting unnecessarily long + AccordService.getBlocking(accordService.maxConflict(accordRanges) + .flatMap(min -> accordService.sync("[Stream #" + session.planId() + ']', min, accordRanges, null, Self, NoRemote)) + , accordRanges, new LatencyRequestBookkeeping(cfs.metric.accordPostStreamRepair), startedAtNanos, deadlineNanos); + } + boolean requiresWritePath = requiresWritePath(cfs); Collection readers = sstables; diff --git a/src/java/org/apache/cassandra/db/transform/FilteredPartitions.java b/src/java/org/apache/cassandra/db/transform/FilteredPartitions.java index 0e6934f76ea5..3486d275e813 100644 --- a/src/java/org/apache/cassandra/db/transform/FilteredPartitions.java +++ b/src/java/org/apache/cassandra/db/transform/FilteredPartitions.java @@ -25,7 +25,7 @@ import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator; import org.apache.cassandra.db.rows.RowIterator; -public final class FilteredPartitions extends BasePartitions> implements PartitionIterator +public class FilteredPartitions extends BasePartitions> implements PartitionIterator { // wrap basic iterator for transformation FilteredPartitions(PartitionIterator input) diff --git a/src/java/org/apache/cassandra/db/view/View.java b/src/java/org/apache/cassandra/db/view/View.java index e926edb3a970..36894127ae45 100644 --- a/src/java/org/apache/cassandra/db/view/View.java +++ b/src/java/org/apache/cassandra/db/view/View.java @@ -17,19 +17,26 @@ */ package org.apache.cassandra.db.view; -import java.util.*; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; import java.util.stream.Collectors; - import javax.annotation.Nullable; import com.google.common.collect.Iterables; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; -import org.apache.cassandra.cql3.*; +import org.apache.cassandra.cql3.QualifiedName; +import org.apache.cassandra.cql3.QueryOptions; +import org.apache.cassandra.cql3.StatementSource; import org.apache.cassandra.cql3.selection.RawSelector; import org.apache.cassandra.cql3.selection.Selectable; import org.apache.cassandra.cql3.statements.SelectStatement; -import org.apache.cassandra.db.*; -import org.apache.cassandra.db.rows.*; +import org.apache.cassandra.db.ColumnFamilyStore; +import org.apache.cassandra.db.DecoratedKey; +import org.apache.cassandra.db.ReadQuery; +import org.apache.cassandra.db.rows.Row; import org.apache.cassandra.schema.ColumnMetadata; import org.apache.cassandra.schema.KeyspaceMetadata; import org.apache.cassandra.schema.Schema; @@ -37,8 +44,6 @@ import org.apache.cassandra.schema.ViewMetadata; import org.apache.cassandra.service.ClientState; import org.apache.cassandra.utils.FBUtilities; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; /** * A View copies data from a base table into a view table which can be queried independently from the @@ -174,7 +179,8 @@ SelectStatement getSelectStatement() selectClause(), definition.whereClause, null, - null); + null, + StatementSource.INTERNAL); rawSelect.setBindVariables(Collections.emptyList()); diff --git a/src/java/org/apache/cassandra/db/virtual/AbstractLoggerVirtualTable.java b/src/java/org/apache/cassandra/db/virtual/AbstractLoggerVirtualTable.java new file mode 100644 index 000000000000..008d5d432a2b --- /dev/null +++ b/src/java/org/apache/cassandra/db/virtual/AbstractLoggerVirtualTable.java @@ -0,0 +1,111 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.db.virtual; + +import java.util.Collections; +import java.util.LinkedList; +import java.util.List; + +import com.google.common.annotations.VisibleForTesting; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import ch.qos.logback.classic.spi.LoggingEvent; +import org.apache.cassandra.schema.TableMetadata; + +/** + * This table is inherently limited on number of rows it can hold. + * + * @param type parameter saying what object is stored in internal bounded list for query purposes + */ +public abstract class AbstractLoggerVirtualTable extends AbstractMutableVirtualTable +{ + private static final Logger logger = LoggerFactory.getLogger(AbstractLoggerVirtualTable.class); + + // please be sure operations on this structure are thread-safe + protected final List buffer; + + @VisibleForTesting + protected static int resolveBufferSize(int wantedSize, int max, int defaultSize) + { + return (wantedSize < 1 || wantedSize > max) ? defaultSize : wantedSize; + } + + protected AbstractLoggerVirtualTable(TableMetadata metadata, int maxSize) + { + super(metadata); + this.buffer = BoundedLinkedList.create(maxSize); + logger.debug("capacity of virtual table {} is set to be at most {} rows", metadata().toString(), maxSize); + } + + public void add(LoggingEvent event) + { + List messages = getMessages(event); + if (messages != null) + { + // specifically calling buffer.add to reach BoundedLinkedList's add + // instead of linked list's addAll + for (U message : messages) + buffer.add(message); + } + } + + public abstract List getMessages(LoggingEvent event); + + @Override + public void truncate() + { + synchronized (buffer) + { + buffer.clear(); + } + } + + @Override + public boolean allowFilteringImplicitly() + { + return false; + } + + private static final class BoundedLinkedList extends LinkedList + { + private final int maxSize; + + public static List create(int size) + { + return Collections.synchronizedList(new BoundedLinkedList<>(size)); + } + + private BoundedLinkedList(int maxSize) + { + this.maxSize = maxSize; + } + + @Override + public synchronized boolean add(T t) + { + if (size() == maxSize) + removeLast(); + + addFirst(t); + + return true; + } + } +} diff --git a/src/java/org/apache/cassandra/db/virtual/AccordDebugKeyspace.java b/src/java/org/apache/cassandra/db/virtual/AccordDebugKeyspace.java new file mode 100644 index 000000000000..6e359a74e408 --- /dev/null +++ b/src/java/org/apache/cassandra/db/virtual/AccordDebugKeyspace.java @@ -0,0 +1,720 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.db.virtual; + +import java.nio.ByteBuffer; +import java.util.Collection; +import java.util.Date; +import java.util.HashSet; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Set; +import java.util.stream.Collectors; + +import com.google.common.collect.Sets; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import accord.api.RoutingKey; +import accord.impl.progresslog.DefaultProgressLog; +import accord.impl.progresslog.TxnStateKind; +import accord.local.CommandStore; +import accord.local.CommandStores; +import accord.local.DurableBefore; +import accord.local.MaxConflicts; +import accord.local.RejectBefore; +import accord.local.durability.ShardDurability; +import accord.primitives.Status; +import accord.primitives.TxnId; +import accord.utils.Invariants; +import org.apache.cassandra.cql3.statements.schema.CreateTableStatement; +import org.apache.cassandra.db.ColumnFamilyStore; +import org.apache.cassandra.db.DecoratedKey; +import org.apache.cassandra.db.Keyspace; +import org.apache.cassandra.db.marshal.AbstractType; +import org.apache.cassandra.db.marshal.Int32Type; +import org.apache.cassandra.db.marshal.UTF8Type; +import org.apache.cassandra.dht.IPartitioner; +import org.apache.cassandra.dht.LocalPartitioner; +import org.apache.cassandra.dht.NormalizedRanges; +import org.apache.cassandra.dht.Token; +import org.apache.cassandra.exceptions.InvalidRequestException; +import org.apache.cassandra.schema.Schema; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.service.accord.AccordCache; +import org.apache.cassandra.service.accord.AccordCommandStore; +import org.apache.cassandra.service.accord.AccordCommandStores; +import org.apache.cassandra.service.accord.AccordExecutor; +import org.apache.cassandra.service.accord.AccordKeyspace; +import org.apache.cassandra.service.accord.AccordService; +import org.apache.cassandra.service.accord.CommandStoreTxnBlockedGraph; +import org.apache.cassandra.service.accord.api.TokenKey; +import org.apache.cassandra.service.consensus.migration.ConsensusMigrationState; +import org.apache.cassandra.service.consensus.migration.TableMigrationState; +import org.apache.cassandra.tcm.ClusterMetadata; + +import static accord.local.RedundantStatus.Property.GC_BEFORE; +import static accord.local.RedundantStatus.Property.LOCALLY_APPLIED; +import static accord.local.RedundantStatus.Property.LOCALLY_REDUNDANT; +import static accord.local.RedundantStatus.Property.LOCALLY_SYNCED; +import static accord.local.RedundantStatus.Property.LOCALLY_WITNESSED; +import static accord.local.RedundantStatus.Property.MAJORITY_APPLIED; +import static accord.local.RedundantStatus.Property.PRE_BOOTSTRAP; +import static accord.local.RedundantStatus.Property.SHARD_APPLIED; +import static java.lang.String.format; +import static com.google.common.collect.ImmutableList.toImmutableList; +import static org.apache.cassandra.schema.SchemaConstants.VIRTUAL_ACCORD_DEBUG; +import static org.apache.cassandra.utils.MonotonicClock.Global.approxTime; + +public class AccordDebugKeyspace extends VirtualKeyspace +{ + public static final String DURABILITY_SERVICE = "durability_service"; + public static final String DURABLE_BEFORE = "durable_before"; + public static final String EXECUTOR_CACHE = "executor_cache"; + public static final String MAX_CONFLICTS = "max_conflicts"; + public static final String MIGRATION_STATE = "migration_state"; + public static final String PROGRESS_LOG = "progress_log"; + public static final String REDUNDANT_BEFORE = "redundant_before"; + public static final String REJECT_BEFORE = "reject_before"; + public static final String TXN_BLOCKED_BY = "txn_blocked_by"; + + public static final AccordDebugKeyspace instance = new AccordDebugKeyspace(); + + private AccordDebugKeyspace() + { + super(VIRTUAL_ACCORD_DEBUG, List.of( + new DurabilityServiceTable(), + new DurableBeforeTable(), + new ExecutorCacheTable(), + new MaxConflictsTable(), + new MigrationStateTable(), + new ProgressLogTable(), + new RedundantBeforeTable(), + new RejectBeforeTable(), + new TxnBlockedByTable() + )); + } + + // TODO (consider): use a different type for the three timestamps in micros + public static final class DurabilityServiceTable extends AbstractVirtualTable + { + private DurabilityServiceTable() + { + super(parse(VIRTUAL_ACCORD_DEBUG, DURABILITY_SERVICE, + "Accord per-Range Durability Service State", + "CREATE TABLE %s (\n" + + " keyspace_name text,\n" + + " table_name text,\n" + + " token_sort blob,\n" + + " token_start text,\n" + + " token_end text,\n" + + " last_started_at_micros bigint,\n" + + " cycle_started_at_micros bigint,\n" + + " retries int,\n" + + " min text,\n" + + " active text,\n" + + " waiting text,\n" + + " node_offset int,\n" + + " cycle_offset int,\n" + + " activeIndex int,\n" + + " nextIndex int,\n" + + " nextToIndex int,\n" + + " endIndex int,\n" + + " current_splits int,\n" + + " stopping boolean,\n" + + " stopped boolean,\n" + + " PRIMARY KEY (keyspace_name, table_name, token_start)" + + ')', UTF8Type.instance)); + } + + @Override + public DataSet data() + { + ShardDurability.ImmutableView view = ((AccordService) AccordService.instance()).shardDurability(); + + SimpleDataSet ds = new SimpleDataSet(metadata()); + while (view.advance()) + { + TableId tableId = (TableId) view.shard().range.start().prefix(); + TableMetadata tableMetadata = tableMetadata(tableId); + ds.row(keyspace(tableMetadata), table(tableId, tableMetadata), sortToken(view.shard().range.start())) + .column("start_token", printToken(view.shard().range.start())) + .column("end_token", printToken(view.shard().range.end())) + .column("last_started_at", approxTime.translate().toMillisSinceEpoch(view.lastStartedAtMicros() * 1000)) + .column("cycle_started_at", approxTime.translate().toMillisSinceEpoch(view.cycleStartedAtMicros() * 1000)) + .column("active", Objects.toString(view.active())) + .column("waiting", Objects.toString(view.waiting())) + .column("node_offset", view.nodeOffset()) + .column("cycle_offset", view.cycleOffset()) + .column("activeIndex", view.activeIndex()) + .column("nextIndex", view.nextIndex()) + .column("nextToIndex", view.toIndex()) + .column("endIndex", view.cycleLength()) + .column("current_splits", view.currentSplits()) + .column("stopping", view.stopping()) + .column("stopping", view.stopping()) + ; + } + return ds; + } + } + + public static final class DurableBeforeTable extends AbstractVirtualTable + { + private DurableBeforeTable() + { + super(parse(VIRTUAL_ACCORD_DEBUG, DURABLE_BEFORE, + "Accord Node's DurableBefore State", + "CREATE TABLE %s (\n" + + " keyspace_name text,\n" + + " table_name text,\n" + + " token_sort blob,\n" + + " token_start text,\n" + + " token_end text,\n" + + " majority_before text,\n" + + " universal_before text,\n" + + " PRIMARY KEY (keyspace_name, table_name, token_sort)" + + ')', UTF8Type.instance)); + } + + @Override + public DataSet data() + { + DurableBefore durableBefore = AccordService.instance().node().durableBefore(); + return durableBefore.foldlWithBounds( + (entry, ds, start, end) -> { + TableId tableId = (TableId) start.prefix(); + TableMetadata tableMetadata = tableMetadata(tableId); + ds.row(keyspace(tableMetadata), table(tableId, tableMetadata), sortToken(start)) + .column("start_token", printToken(start)) + .column("end_token", printToken(end)) + .column("majority_before", entry.majorityBefore.toString()) + .column("universal_before", entry.universalBefore.toString()); + return ds; + }, + new SimpleDataSet(metadata()), + ignore -> false + ); + } + } + + public static final class ExecutorCacheTable extends AbstractVirtualTable + { + private ExecutorCacheTable() + { + super(parse(VIRTUAL_ACCORD_DEBUG, EXECUTOR_CACHE, + "Accord Executor Cache Metrics", + "CREATE TABLE %s (\n" + + " executor_id int,\n" + + " scope text,\n" + + " queries bigint,\n" + + " hits bigint,\n" + + " misses bigint,\n" + + " PRIMARY KEY (executor_id, scope)" + + ')', Int32Type.instance)); + } + + @Override + public DataSet data() + { + AccordCommandStores stores = (AccordCommandStores) AccordService.instance().node().commandStores(); + SimpleDataSet ds = new SimpleDataSet(metadata()); + for (AccordExecutor executor : stores.executors()) + { + try (AccordExecutor.ExclusiveGlobalCaches cache = executor.lockCaches()) + { + addRow(ds, executor.executorId(), "commands", cache.commands.statsSnapshot()); + addRow(ds, executor.executorId(), AccordKeyspace.COMMANDS_FOR_KEY, cache.commandsForKey.statsSnapshot()); + } + } + return ds; + } + + private static void addRow(SimpleDataSet ds, int executorId, String scope, AccordCache.ImmutableStats stats) + { + ds.row(executorId, scope) + .column("queries", stats.queries) + .column("hits", stats.hits) + .column("misses", stats.misses); + } + } + + + public static final class MaxConflictsTable extends AbstractVirtualTable + { + private MaxConflictsTable() + { + super(parse(VIRTUAL_ACCORD_DEBUG, MAX_CONFLICTS, + "Accord per-CommandStore MaxConflicts State", + "CREATE TABLE %s (\n" + + " keyspace_name text,\n" + + " table_name text,\n" + + " token_sort blob,\n" + + " token_start text,\n" + + " token_end text,\n" + + " command_store_id bigint,\n" + + " timestamp text,\n" + + " PRIMARY KEY (keyspace_name, table_name, token_sort, command_store_id)" + + ')', UTF8Type.instance)); + } + + @Override + public DataSet data() + { + CommandStores commandStores = AccordService.instance().node().commandStores(); + + SimpleDataSet dataSet = new SimpleDataSet(metadata()); + for (CommandStore commandStore : commandStores.all()) + { + int commandStoreId = commandStore.id(); + MaxConflicts maxConflicts = commandStore.unsafeGetMaxConflicts(); + TableId tableId = ((AccordCommandStore) commandStore).tableId(); + TableMetadata tableMetadata = tableMetadata(tableId); + + maxConflicts.foldlWithBounds( + (timestamp, ds, start, end) -> { + return ds.row(keyspace(tableMetadata), table(tableId, tableMetadata), sortToken(start), commandStoreId) + .column("start_token", printToken(start)) + .column("end_token", printToken(end)) + .column("timestamp", timestamp.toString()) + ; + }, + dataSet, + ignore -> false + ); + } + return dataSet; + } + } + + public static final class MigrationStateTable extends AbstractVirtualTable + { + private static final Logger logger = LoggerFactory.getLogger(MigrationStateTable.class); + + private MigrationStateTable() + { + super(parse(VIRTUAL_ACCORD_DEBUG, MIGRATION_STATE, + "Accord Consensus Migration State", + "CREATE TABLE %s (\n" + + " keyspace_name text,\n" + + " table_name text,\n" + + " table_id uuid,\n" + + " target_protocol text,\n" + + " transactional_mode text,\n" + + " transactional_migration_from text,\n" + + " migrated_ranges frozen>,\n" + + " repair_pending_ranges frozen>,\n" + + " migrating_ranges_by_epoch frozen>>,\n" + + " PRIMARY KEY (keyspace_name, table_name)" + + ')', UTF8Type.instance)); + } + + @Override + public DataSet data() + { + ConsensusMigrationState snapshot = ClusterMetadata.current().consensusMigrationState; + Collection tableStates = snapshot.tableStates(); + return data(tableStates); + } + + @Override + public DataSet data(DecoratedKey key) + { + String keyspaceName = UTF8Type.instance.compose(key.getKey()); + Keyspace keyspace = Schema.instance.getKeyspaceInstance(keyspaceName); + + if (keyspace == null) + throw new InvalidRequestException("Unknown keyspace: '" + keyspaceName + '\''); + + List tableIDs = keyspace.getColumnFamilyStores() + .stream() + .map(ColumnFamilyStore::getTableId) + .collect(Collectors.toList()); + + ConsensusMigrationState snapshot = ClusterMetadata.current().consensusMigrationState; + Collection tableStates = snapshot.tableStatesFor(tableIDs); + + return data(tableStates); + } + + private SimpleDataSet data(Collection tableStates) + { + SimpleDataSet result = new SimpleDataSet(metadata()); + + for (TableMigrationState state : tableStates) + { + TableMetadata table = Schema.instance.getTableMetadata(state.tableId); + + if (table == null) + { + logger.warn("Table {}.{} (id: {}) no longer exists. It may have been dropped.", + state.keyspaceName, state.tableName, state.tableId); + continue; + } + + result.row(state.keyspaceName, state.tableName); + result.column("table_id", state.tableId.asUUID()); + result.column("target_protocol", state.targetProtocol.toString()); + result.column("transactional_mode", table.params.transactionalMode.toString()); + result.column("transactional_migration_from", table.params.transactionalMode.toString()); + + List primitiveMigratedRanges = state.migratedRanges.stream().map(Objects::toString).collect(toImmutableList()); + result.column("migrated_ranges", primitiveMigratedRanges); + + List primitiveRepairPendingRanges = state.repairPendingRanges.stream().map(Objects::toString).collect(toImmutableList()); + result.column("repair_pending_ranges", primitiveRepairPendingRanges); + + Map> primitiveRangesByEpoch = new LinkedHashMap<>(); + for (Map.Entry> entry : state.migratingRangesByEpoch.entrySet()) + primitiveRangesByEpoch.put(entry.getKey().getEpoch(), entry.getValue().stream().map(Objects::toString).collect(toImmutableList())); + + result.column("migrating_ranges_by_epoch", primitiveRangesByEpoch); + } + + return result; + } + } + + // TODO (desired): human readable packed key tracker (but requires loading Txn, so might be preferable to only do conditionally) + public static final class ProgressLogTable extends AbstractVirtualTable + { + private ProgressLogTable() + { + super(parse(VIRTUAL_ACCORD_DEBUG, PROGRESS_LOG, + "Accord per-CommandStore ProgressLog State", + "CREATE TABLE %s (\n" + + " keyspace_name text,\n" + + " table_name text,\n" + + " command_store_id int,\n" + + " txn_id text,\n" + + // Timer + BaseTxnState + " contact_everyone boolean,\n" + + // WaitingState + " waiting_is_uninitialised boolean,\n" + + " waiting_blocked_until text,\n" + + " waiting_home_satisfies text,\n" + + " waiting_progress text,\n" + + " waiting_retry_counter int,\n" + + " waiting_packed_key_tracker_bits text,\n" + + " waiting_scheduled_at timestamp,\n" + + // HomeState/TxnState + " home_phase text,\n" + + " home_progress text,\n" + + " home_retry_counter int,\n" + + " home_scheduled_at timestamp,\n" + + " PRIMARY KEY (keyspace_name, table_name, command_store_id, txn_id)" + + ')', UTF8Type.instance)); + } + + @Override + public DataSet data() + { + CommandStores commandStores = AccordService.instance().node().commandStores(); + SimpleDataSet ds = new SimpleDataSet(metadata()); + for (CommandStore commandStore : commandStores.all()) + { + DefaultProgressLog.ImmutableView view = (DefaultProgressLog.ImmutableView) commandStore.unsafeProgressLog(); + TableId tableId = ((AccordCommandStore)commandStore).tableId(); + TableMetadata tableMetadata = tableMetadata(tableId); + while (view.advance()) + { + ds.row(keyspace(tableMetadata), table(tableId, tableMetadata), view.commandStoreId(), view.txnId().toString()) + .column("contact_everyone", view.contactEveryone()) + .column("waiting_is_uninitialised", view.isWaitingUninitialised()) + .column("waiting_blocked_until", view.waitingIsBlockedUntil().name()) + .column("waiting_home_satisfies", view.waitingHomeSatisfies().name()) + .column("waiting_progress", view.waitingProgress().name()) + .column("waiting_retry_counter", view.waitingRetryCounter()) + .column("waiting_packed_key_tracker_bits", Long.toBinaryString(view.waitingPackedKeyTrackerBits())) + .column("waiting_scheduled_at", toTimestamp(view.timerScheduledAt(TxnStateKind.Waiting))) + .column("home_phase", view.homePhase().name()) + .column("home_progress", view.homeProgress().name()) + .column("home_retry_counter", view.homeRetryCounter()) + .column("home_scheduled_at", toTimestamp(view.timerScheduledAt(TxnStateKind.Home))) + ; + } + } + return ds; + } + + private Date toTimestamp(Long deadline) + { + if (deadline == null) + return null; + + long millisSinceEpoch = approxTime.translate().toMillisSinceEpoch(deadline * 1000L); + return new Date(millisSinceEpoch); + } + } + + public static final class RedundantBeforeTable extends AbstractVirtualTable + { + private RedundantBeforeTable() + { + super(parse(VIRTUAL_ACCORD_DEBUG, REDUNDANT_BEFORE, + "Accord per-CommandStore RedundantBefore State", + "CREATE TABLE %s (\n" + + " keyspace_name text,\n" + + " table_name text,\n" + + " token_sort blob,\n" + + " token_start text,\n" + + " token_end text,\n" + + " command_store_id bigint,\n" + + " start_epoch bigint,\n" + + " end_epoch bigint,\n" + + " gc_before text,\n" + + " shard_only_applied text,\n" + + " locally_applied text,\n" + + " locally_synced text,\n" + + " locally_redundant text,\n" + + " locally_witnessed text,\n" + + " pre_bootstrap text,\n" + + " stale_until_at_least text,\n" + + " PRIMARY KEY (keyspace_name, table_name, token_sort, command_store_id)" + + ')', UTF8Type.instance)); + } + + @Override + public DataSet data() + { + CommandStores commandStores = AccordService.instance().node().commandStores(); + + SimpleDataSet dataSet = new SimpleDataSet(metadata()); + for (CommandStore commandStore : commandStores.all()) + { + int commandStoreId = commandStore.id(); + TableId tableId = ((AccordCommandStore)commandStore).tableId(); + TableMetadata tableMetadata = tableMetadata(tableId); + String keyspace = keyspace(tableMetadata); + String table = table(tableId, tableMetadata); + commandStore.unsafeGetRedundantBefore().foldl( + (entry, ds) -> { + ds.row(keyspace, table, sortToken(entry.range.start()), commandStoreId) + .column("start_token", printToken(entry.range.start())) + .column("end_token", printToken(entry.range.end())) + .column("start_epoch", entry.startEpoch) + .column("end_epoch", entry.endEpoch) + .column("gc_before", entry.maxBound(GC_BEFORE).toString()) + .column("shard_applied", entry.maxBound(SHARD_APPLIED).toString()) + .column("majority_applied", entry.maxBound(MAJORITY_APPLIED).toString()) + .column("locally_applied", entry.maxBound(LOCALLY_APPLIED).toString()) + .column("locally_synced", entry.maxBound(LOCALLY_SYNCED).toString()) + .column("locally_redundant", entry.maxBound(LOCALLY_REDUNDANT).toString()) + .column("locally_witnessed", entry.maxBound(LOCALLY_WITNESSED).toString()) + .column("pre_bootstrap", entry.maxBound(PRE_BOOTSTRAP).toString()) + .column("stale_until_at_least", entry.staleUntilAtLeast != null ? entry.staleUntilAtLeast.toString() : null); + return ds; + }, + dataSet, + ignore -> false + ); + } + return dataSet; + } + } + + public static final class RejectBeforeTable extends AbstractVirtualTable + { + private RejectBeforeTable() + { + super(parse(VIRTUAL_ACCORD_DEBUG, REJECT_BEFORE, + "Accord per-CommandStore RejectBefore State", + "CREATE TABLE %s (\n" + + " keyspace_name text,\n" + + " table_name text,\n" + + " token_sort blob,\n" + + " token_start text,\n" + + " token_end text,\n" + + " command_store_id int,\n" + + " txn_id text,\n" + + " PRIMARY KEY (keyspace_name, table_name, token_sort, command_store_id)" + + ')', UTF8Type.instance)); + } + + @Override + public DataSet data() + { + CommandStores commandStores = AccordService.instance().node().commandStores(); + SimpleDataSet dataSet = new SimpleDataSet(metadata()); + for (CommandStore commandStore : commandStores.all()) + { + RejectBefore rejectBefore = commandStore.unsafeGetRejectBefore(); + if (rejectBefore == null) + continue; + + TableId tableId = ((AccordCommandStore)commandStore).tableId(); + TableMetadata tableMetadata = tableMetadata(tableId); + String keyspace = keyspace(tableMetadata); + String table = table(tableId, tableMetadata); + rejectBefore.foldlWithBounds( + (txnId, ds, start, end) -> ds.row(keyspace, table, sortToken(start), commandStore.id()) + .column("token_start", printToken(start)) + .column("token_end", printToken(end)) + .column("txn_id", txnId.toString()) + , + dataSet, + ignore -> false + ); + } + return dataSet; + } + } + + public static class TxnBlockedByTable extends AbstractVirtualTable + { + enum Reason { Self, Txn, Key } + + protected TxnBlockedByTable() + { + super(parse(VIRTUAL_ACCORD_DEBUG, TXN_BLOCKED_BY, + "Accord Transactions Blocked By Table" , + "CREATE TABLE %s (\n" + + " txn_id text,\n" + + " keyspace_name text,\n" + + " table_name text,\n" + + " command_store_id int,\n" + + " depth int,\n" + + " blocked_by text,\n" + + " reason text,\n" + + " save_status text,\n" + + " execute_at text,\n" + + " key text,\n" + + " PRIMARY KEY (txn_id, keyspace_name, table_name, command_store_id, depth, blocked_by, reason)" + + ')', UTF8Type.instance)); + } + + @Override + public DataSet data(DecoratedKey partitionKey) + { + TxnId id = TxnId.parse(UTF8Type.instance.compose(partitionKey.getKey())); + List shards = AccordService.instance().debugTxnBlockedGraph(id); + + SimpleDataSet ds = new SimpleDataSet(metadata()); + CommandStores commandStores = AccordService.instance().node().commandStores(); + for (CommandStoreTxnBlockedGraph shard : shards) + { + Set processed = new HashSet<>(); + process(ds, commandStores, shard, processed, id, 0, id, Reason.Self, null); + // everything was processed right? + if (!shard.txns.isEmpty() && !shard.txns.keySet().containsAll(processed)) + throw new IllegalStateException("Skipped txns: " + Sets.difference(shard.txns.keySet(), processed)); + } + + return ds; + } + + private void process(SimpleDataSet ds, CommandStores commandStores, CommandStoreTxnBlockedGraph shard, Set processed, TxnId userTxn, int depth, TxnId txnId, Reason reason, Runnable onDone) + { + if (!processed.add(txnId)) + throw new IllegalStateException("Double processed " + txnId); + CommandStoreTxnBlockedGraph.TxnState txn = shard.txns.get(txnId); + if (txn == null) + { + Invariants.require(reason == Reason.Self, "Txn %s unknown for reason %s", txnId, reason); + return; + } + // was it applied? If so ignore it + if (reason != Reason.Self && txn.saveStatus.hasBeen(Status.Applied)) + return; + TableId tableId = tableId(shard.commandStoreId, commandStores); + TableMetadata tableMetadata = tableMetadata(tableId); + ds.row(userTxn.toString(), keyspace(tableMetadata), table(tableId, tableMetadata), + shard.commandStoreId, depth, reason == Reason.Self ? "" : txn.txnId.toString(), reason.name()); + ds.column("save_status", txn.saveStatus.name()); + if (txn.executeAt != null) + ds.column("execute_at", txn.executeAt.toString()); + if (onDone != null) + onDone.run(); + if (txn.isBlocked()) + { + for (TxnId blockedBy : txn.blockedBy) + { + if (!processed.contains(blockedBy)) + process(ds, commandStores, shard, processed, userTxn, depth + 1, blockedBy, Reason.Txn, null); + } + + for (TokenKey blockedBy : txn.blockedByKey) + { + TxnId blocking = shard.keys.get(blockedBy); + if (!processed.contains(blocking)) + process(ds, commandStores, shard, processed, userTxn, depth + 1, blocking, Reason.Key, () -> ds.column("key", printToken(blockedBy))); + } + } + } + + @Override + public DataSet data() + { + throw new InvalidRequestException("Must select a single txn_id"); + } + } + + private static TableId tableId(int commandStoreId, CommandStores commandStores) + { + AccordCommandStore commandStore = (AccordCommandStore) commandStores.forId(commandStoreId); + if (commandStore == null) + return null; + return commandStore.tableId(); + } + + private static TableMetadata tableMetadata(TableId tableId) + { + if (tableId == null) + return null; + return Schema.instance.getTableMetadata(tableId); + } + + private static String keyspace(TableMetadata metadata) + { + return metadata == null ? "Unknown" : metadata.keyspace; + } + + private static String table(TableId tableId, TableMetadata metadata) + { + return metadata == null ? tableId.toString() : metadata.name; + } + + private static String printToken(RoutingKey routingKey) + { + TokenKey key = (TokenKey) routingKey; + return key.token().getPartitioner().getTokenFactory().toString(key.token()); + } + + private static ByteBuffer sortToken(RoutingKey routingKey) + { + TokenKey key = (TokenKey) routingKey; + Token token = key.token(); + IPartitioner partitioner = token.getPartitioner(); + ByteBuffer out = ByteBuffer.allocate(partitioner.accordSerializedSize(token)); + partitioner.accordSerialize(token, out); + out.flip(); + return out; + } + + private static TableMetadata parse(String keyspace, String table, String comment, String schema, AbstractType partitionKeyType) + { + return CreateTableStatement.parse(format(schema, table), keyspace) + .comment(comment) + .kind(TableMetadata.Kind.VIRTUAL) + .partitioner(new LocalPartitioner(partitionKeyType)) + .build(); + } +} diff --git a/src/java/org/apache/cassandra/db/virtual/AccordVirtualTables.java b/src/java/org/apache/cassandra/db/virtual/AccordVirtualTables.java new file mode 100644 index 000000000000..0ca2caf8383e --- /dev/null +++ b/src/java/org/apache/cassandra/db/virtual/AccordVirtualTables.java @@ -0,0 +1,203 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.db.virtual; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import javax.annotation.Nullable; + +import com.google.common.base.Preconditions; +import com.google.common.collect.Sets; + +import accord.primitives.Range; +import accord.primitives.Ranges; +import accord.topology.TopologyManager.EpochsSnapshot; +import accord.topology.TopologyManager.EpochsSnapshot.Epoch; +import accord.topology.TopologyManager.EpochsSnapshot.EpochReady; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.cql3.statements.schema.CreateTableStatement; +import org.apache.cassandra.db.marshal.LongType; +import org.apache.cassandra.db.marshal.ReversedType; +import org.apache.cassandra.dht.LocalPartitioner; +import org.apache.cassandra.schema.Schema; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.service.accord.AccordService; +import org.apache.cassandra.service.accord.TokenRange; + +import static accord.topology.TopologyManager.EpochsSnapshot.ResultStatus.SUCCESS; + +public class AccordVirtualTables +{ + public static final String EPOCHS = "accord_epochs"; + public static final String TABLE_EPOCHS = "accord_table_epochs"; + + private AccordVirtualTables() + { + } + + public static Collection getAll(String keyspace) + { + if (!DatabaseDescriptor.getAccordTransactionsEnabled()) + return Collections.emptyList(); + + return List.of(new EpochReadyTable(keyspace), + new EpochSyncRanges(keyspace) + ); + } + + private static TableMetadata.Builder parse(String keyspace, String query) + { + return CreateTableStatement.parse(query, keyspace) + .kind(TableMetadata.Kind.VIRTUAL); + } + + public static class EpochReadyTable extends AbstractVirtualTable + { + public EpochReadyTable(String keyspace) + { + super(parse(keyspace, "CREATE TABLE " + EPOCHS + " (\n" + + " epoch bigint PRIMARY KEY,\n" + + " ready_metadata text,\n" + + " ready_coordinate text,\n" + + " ready_data text,\n" + + " ready_reads text,\n" + + " ready boolean,\n" + + ")") + .partitioner(new LocalPartitioner(ReversedType.getInstance(LongType.instance))) + .comment("Exposes the epoch ready state for recieved epochs in Accord") + .build()); + } + + @Override + public DataSet data() + { + SimpleDataSet ds = new SimpleDataSet(metadata()); + EpochsSnapshot snapshot = epochsSnapshot(); + for (Epoch epoch : snapshot) + { + ds.row(epoch.epoch); + EpochReady ready = epoch.ready; + ds.column("ready_metadata", ready.metadata.value); + ds.column("ready_coordinate", ready.coordinate.value); + ds.column("ready_data", ready.data.value); + ds.column("ready_reads", ready.reads.value); + ds.column("ready", ready.reads == SUCCESS); + } + return ds; + } + } + + public static class EpochSyncRanges extends AbstractVirtualTable + { + protected EpochSyncRanges(String keyspace) + { + super(parse(keyspace, "CREATE TABLE " + TABLE_EPOCHS + " (\n" + + " epoch bigint,\n" + + " keyspace_name text,\n" + + " table_name text,\n" + + " added frozen>,\n" + + " removed frozen>,\n" + + " synced frozen>,\n" + + " closed frozen>,\n" + + " retired frozen>,\n" + + " PRIMARY KEY (epoch, keyspace_name, table_name)\n" + + ")") + .partitioner(new LocalPartitioner(ReversedType.getInstance(LongType.instance))) + .comment("Shows details on a per-table basis about what ranges are synced per epoch") + .build()); + } + + @Override + public DataSet data() + { + SimpleDataSet ds = new SimpleDataSet(metadata()); + EpochsSnapshot snapshot = epochsSnapshot(); + for (Epoch state : snapshot) + { + Map> addedRanges = groupByTable(state.addedRanges); + Map> removedRanges = groupByTable(state.removedRanges); + Map> synced = groupByTable(state.synced); + Map> closed = groupByTable(state.closed); + Map> retired = groupByTable(state.retired); + + Set allTables = union(addedRanges.keySet(), removedRanges.keySet(), synced.keySet(), closed.keySet(), retired.keySet()); + for (TableId table : allTables) + { + TableMetadata metadata = Schema.instance.getTableMetadata(table); + if (metadata == null) continue; // table dropped, ignore + ds.row(state.epoch, metadata.keyspace, metadata.name); + + ds.column("added", format(addedRanges.get(table))); + ds.column("removed", format(removedRanges.get(table))); + ds.column("synced", format(synced.get(table))); + ds.column("closed", format(closed.get(table))); + ds.column("retired", format(retired.get(table))); + } + } + return ds; + } + + private static Set union(Set... sets) + { + Preconditions.checkArgument(sets.length > 0); + if (sets.length == 1) return sets[0]; + Sets.SetView accum = Sets.union(sets[0], sets[1]); + for (int i = 2; i < sets.length; i++) + accum = Sets.union(accum, sets[i]); + return accum; + } + + private static List format(@Nullable List list) + { + if (list == null || list.isEmpty()) return Collections.emptyList(); + List result = new ArrayList<>(list.size()); + for (TokenRange tr : list) + result.add(toStringNoTable(tr)); + return result; + } + } + + private static EpochsSnapshot epochsSnapshot() + { + return AccordService.instance().topology().epochsSnapshot(); + } + + private static String toStringNoTable(TokenRange tr) + { + // TokenRange extends Range.EndInclusive + return "(" + tr.start().printableSuffix() + ", " + tr.end().printableSuffix() + "]"; + } + + private static Map> groupByTable(Ranges ranges) + { + Map> map = new HashMap<>(); + for (Range range : ranges) + { + TokenRange tr = (TokenRange) range; + map.computeIfAbsent(tr.table(), i -> new ArrayList<>()).add(tr); + } + return map; + } +} diff --git a/src/java/org/apache/cassandra/db/virtual/ClusterMetadataDirectoryTable.java b/src/java/org/apache/cassandra/db/virtual/ClusterMetadataDirectoryTable.java index 0d026ce65d42..e7fba1519b40 100644 --- a/src/java/org/apache/cassandra/db/virtual/ClusterMetadataDirectoryTable.java +++ b/src/java/org/apache/cassandra/db/virtual/ClusterMetadataDirectoryTable.java @@ -17,16 +17,26 @@ */ package org.apache.cassandra.db.virtual; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.List; import java.util.Map; +import java.util.stream.Collectors; + +import com.google.common.collect.ImmutableMap; import org.apache.cassandra.db.marshal.InetAddressType; import org.apache.cassandra.db.marshal.Int32Type; +import org.apache.cassandra.db.marshal.ListType; import org.apache.cassandra.db.marshal.LongType; +import org.apache.cassandra.db.marshal.MapType; import org.apache.cassandra.db.marshal.UTF8Type; import org.apache.cassandra.db.marshal.UUIDType; import org.apache.cassandra.dht.LocalPartitioner; +import org.apache.cassandra.dht.Token; import org.apache.cassandra.schema.TableMetadata; import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.tcm.MultiStepOperation; import org.apache.cassandra.tcm.membership.Directory; import org.apache.cassandra.tcm.membership.Location; import org.apache.cassandra.tcm.membership.NodeAddresses; @@ -35,7 +45,7 @@ import org.apache.cassandra.tcm.membership.NodeVersion; -final class ClusterMetadataDirectoryTable extends AbstractVirtualTable +public final class ClusterMetadataDirectoryTable extends AbstractVirtualTable { private static final String NODE_ID = "node_id"; private static final String HOST_ID = "host_id"; @@ -50,6 +60,8 @@ final class ClusterMetadataDirectoryTable extends AbstractVirtualTable private static final String LOCAL_PORT = "local_port"; private static final String NATIVE_ADDRESS = "native_address"; private static final String NATIVE_PORT = "native_port"; + private static final String TOKENS = "tokens"; + private static final String MULTI_STEP_OPERATION = "multi_step_operation"; ClusterMetadataDirectoryTable(String keyspace) @@ -71,15 +83,31 @@ final class ClusterMetadataDirectoryTable extends AbstractVirtualTable .addRegularColumn(LOCAL_PORT, Int32Type.instance) .addRegularColumn(NATIVE_ADDRESS, InetAddressType.instance) .addRegularColumn(NATIVE_PORT, Int32Type.instance) + .addRegularColumn(TOKENS, ListType.getInstance(UTF8Type.instance, false)) + .addRegularColumn(MULTI_STEP_OPERATION, MapType.getInstance(UTF8Type.instance, UTF8Type.instance, false)) .build()); } @Override public DataSet data() + { + SimpleDataSet result = new SimpleDataSet(metadata()); + + for (Map.Entry> entry : directory(true).entrySet()) + { + result = result.row(entry.getKey().intValue()); + for (Map.Entry row : entry.getValue().entrySet()) + result = result.column(row.getKey(), row.getValue()); + } + return result; + } + + public static Map> directory(boolean tokens) { ClusterMetadata metadata = ClusterMetadata.current(); Directory directory = metadata.directory; - SimpleDataSet result = new SimpleDataSet(metadata()); + Map> result = new LinkedHashMap<>(); + for (Map.Entry entry : directory.states.entrySet()) { NodeId nodeId = entry.getKey(); @@ -87,20 +115,33 @@ public DataSet data() NodeAddresses address = directory.getNodeAddresses(nodeId); Location location = directory.location(nodeId); NodeVersion version = directory.version(nodeId); - result.row(nodeId.id()) - .column(HOST_ID, nodeId.toUUID()) - .column(STATE, nodeState.toString()) - .column(CASSANDRA_VERSION, version != null ? version.cassandraVersion.toString() : null) - .column(SERIALIZATION_VERSION, version != null ? version.serializationVersion : null) - .column(RACK, location != null ? location.rack : null) - .column(DC, location != null ? location.datacenter : null) - .column(BROADCAST_ADDRESS, address != null ? address.broadcastAddress.getAddress() : null) - .column(BROADCAST_PORT, address != null ? address.broadcastAddress.getPort() : null) - .column(LOCAL_ADDRESS, address != null ? address.localAddress.getAddress() : null) - .column(LOCAL_PORT, address != null ? address.localAddress.getPort() : null) - .column(NATIVE_ADDRESS, address != null ? address.nativeAddress.getAddress() : null) - .column(NATIVE_PORT, address != null ? address.nativeAddress.getPort() : null); + Map row = new HashMap<>(); + row.put(HOST_ID, nodeId.toUUID()); + row.put(STATE, nodeState.toString()); + row.put(CASSANDRA_VERSION, version != null ? version.cassandraVersion.toString() : null); + row.put(SERIALIZATION_VERSION, version != null ? version.serializationVersion : null); + row.put(RACK, location != null ? location.rack : null); + row.put(DC, location != null ? location.datacenter : null); + row.put(BROADCAST_ADDRESS, address != null ? address.broadcastAddress.getAddress() : null); + row.put(BROADCAST_PORT, address != null ? address.broadcastAddress.getPort() : null); + row.put(LOCAL_ADDRESS, address != null ? address.localAddress.getAddress() : null); + row.put(LOCAL_PORT, address != null ? address.localAddress.getPort() : null); + row.put(NATIVE_ADDRESS, address != null ? address.nativeAddress.getAddress() : null); + row.put(NATIVE_PORT, address != null ? address.nativeAddress.getPort() : null); + if (tokens) + row.put(TOKENS, tokensToString(metadata.tokenMap.tokens(nodeId))); + MultiStepOperation mso = metadata.inProgressSequences.get(nodeId); + if (mso != null) + row.put(MULTI_STEP_OPERATION, ImmutableMap.of("kind", mso.kind().name(), + "status", mso.status(), + "nextStep", mso.nextStep().name())); + result.put((long)nodeId.id(), row); } return result; } + + private static List tokensToString(List tokens) + { + return tokens.stream().map(Object::toString).collect(Collectors.toList()); + } } diff --git a/src/java/org/apache/cassandra/db/virtual/ClusterMetadataLogTable.java b/src/java/org/apache/cassandra/db/virtual/ClusterMetadataLogTable.java index 152b4769a728..cd755115ec76 100644 --- a/src/java/org/apache/cassandra/db/virtual/ClusterMetadataLogTable.java +++ b/src/java/org/apache/cassandra/db/virtual/ClusterMetadataLogTable.java @@ -19,6 +19,9 @@ import java.io.IOException; import java.util.Date; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.Map; import org.apache.cassandra.cql3.UntypedResultSet; import org.apache.cassandra.db.ConsistencyLevel; @@ -27,6 +30,7 @@ import org.apache.cassandra.db.marshal.UTF8Type; import org.apache.cassandra.locator.MetaStrategy; import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.tcm.Epoch; import org.apache.cassandra.tcm.Transformation; import static java.lang.String.format; @@ -34,7 +38,7 @@ import static org.apache.cassandra.schema.DistributedMetadataLogKeyspace.TABLE_NAME; import static org.apache.cassandra.schema.SchemaConstants.METADATA_KEYSPACE_NAME; -final class ClusterMetadataLogTable extends AbstractVirtualTable +public final class ClusterMetadataLogTable extends AbstractVirtualTable { private static final String EPOCH = "epoch"; private static final String KIND = "kind"; @@ -58,22 +62,34 @@ final class ClusterMetadataLogTable extends AbstractVirtualTable @Override public DataSet data() + { + SimpleDataSet result = new SimpleDataSet(metadata()); + for (Map.Entry> entry : log(Epoch.FIRST.getEpoch(), Long.MAX_VALUE).entrySet()) + { + SimpleDataSet data = result.row(entry.getKey()); + for (Map.Entry rowEntry : entry.getValue().entrySet()) + data = data.column(rowEntry.getKey(), rowEntry.getValue()); + } + return result; + } + + public static Map> log(long startEpoch, long endEpoch) { try { - SimpleDataSet result = new SimpleDataSet(metadata()); + Map> result = new LinkedHashMap<>(); UntypedResultSet res = execute(format("SELECT epoch, kind, transformation, entry_id, writetime(kind) as wt " + - "FROM %s.%s", METADATA_KEYSPACE_NAME, TABLE_NAME), ConsistencyLevel.QUORUM); + "FROM %s.%s WHERE token(epoch) >= token(?) AND token(epoch) <= token(?)", METADATA_KEYSPACE_NAME, TABLE_NAME), ConsistencyLevel.QUORUM, endEpoch, startEpoch); for (UntypedResultSet.Row r : res) { Transformation.Kind kind = Transformation.Kind.fromId(r.getInt("kind")); Transformation transformation = kind.fromVersionedBytes(r.getBlob("transformation")); - - result.row(r.getLong("epoch")) - .column(KIND, kind.toString()) - .column(TRANSFORMATION, transformation.toString()) - .column(ENTRY_ID, r.getLong("entry_id")) - .column(ENTRY_TIME, new Date(r.getLong("wt") / 1000)); + Map row = new HashMap<>(); + row.put(KIND, kind.toString()); + row.put(TRANSFORMATION, transformation.toString()); + row.put(ENTRY_ID, r.getLong("entry_id")); + row.put(ENTRY_TIME, new Date(r.getLong("wt") / 1000)); + result.put(r.getLong("epoch"), row); } return result; } diff --git a/src/java/org/apache/cassandra/db/virtual/CollectionVirtualTableAdapter.java b/src/java/org/apache/cassandra/db/virtual/CollectionVirtualTableAdapter.java index 47aa3bd5c43c..cc311e1a2653 100644 --- a/src/java/org/apache/cassandra/db/virtual/CollectionVirtualTableAdapter.java +++ b/src/java/org/apache/cassandra/db/virtual/CollectionVirtualTableAdapter.java @@ -31,7 +31,6 @@ import java.util.TreeMap; import java.util.UUID; import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.ConcurrentSkipListMap; import java.util.concurrent.atomic.AtomicReference; import java.util.function.BiFunction; import java.util.function.Function; @@ -366,14 +365,16 @@ protected UnfilteredRowIterator computeNext() private Iterator buildDataRangeIterator(DataRange dataRange, ColumnFilter columnFilter) { - NavigableMap, Row>> partitionMap = new ConcurrentSkipListMap<>(DecoratedKey.comparator); - StreamSupport.stream(data.spliterator(), true) - .map(row -> makeRow(row, columnFilter)) - .filter(cr -> dataRange.keyRange().contains(cr.key.get())) - .forEach(cr -> partitionMap.computeIfAbsent(cr.key.get(), - key -> new TreeMap<>(metadata.comparator)) - .put(cr.clustering, cr.rowSup.get())); - + NavigableMap, Row>> partitionMap = new TreeMap<>(DecoratedKey.comparator); + for (R row : data) + { + CollectionRow cr = makeRow(row, columnFilter); + if (dataRange.keyRange().contains(cr.key.get())) + { + partitionMap.computeIfAbsent(cr.key.get(), + key -> new TreeMap<>(metadata.comparator)).put(cr.clustering, cr.rowSup.get()); + } + } return partitionMap.entrySet().stream().map( e -> new DataRowUnfilteredIterator(e.getKey(), dataRange.clusteringIndexFilter(e.getKey()), columnFilter, e.getValue())).iterator(); diff --git a/src/java/org/apache/cassandra/db/virtual/LocalRepairTables.java b/src/java/org/apache/cassandra/db/virtual/LocalRepairTables.java index 1c8c24b1716a..34aeccf128c2 100644 --- a/src/java/org/apache/cassandra/db/virtual/LocalRepairTables.java +++ b/src/java/org/apache/cassandra/db/virtual/LocalRepairTables.java @@ -136,7 +136,7 @@ private void updateDataset(SimpleDataSet result, CoordinatorState state) { result.row(state.id); addState(result, state); - result.column("type", getType(state)); + result.column("type", state.getType()); result.column("keyspace_name", state.keyspace); result.column("command_id", state.cmd); @@ -144,7 +144,7 @@ private void updateDataset(SimpleDataSet result, CoordinatorState state) result.column("options_primary_range", state.options.isPrimaryRange()); result.column("options_trace", state.options.isTraced()); result.column("options_job_threads", state.options.getJobThreads()); - result.column("options_subrange_repair", state.options.isSubrangeRepair()); + result.column("options_subrange_repair", false); result.column("options_pull_repair", state.options.isPullRepair()); result.column("options_force_repair", state.options.isForcedRepair()); result.column("options_preview_kind", state.options.getPreviewKind().name()); @@ -169,26 +169,6 @@ private void updateDataset(SimpleDataSet result, CoordinatorState state) ranges = state.getCommonRanges(); result.column("unfiltered_ranges", ranges == null ? null : ranges.stream().map(c -> c.ranges).map(LocalRepairTables::toStringList).collect(Collectors.toList())); } - - private String getType(CoordinatorState state) - { - if (state.options.isPreview()) - { - switch (state.options.getPreviewKind()) - { - case ALL: return "preview full"; - case REPAIRED: return "preview repaired"; - case UNREPAIRED: return "preview unrepaired"; - case NONE: throw new AssertionError("NONE preview kind not expected when preview repair is set"); - default: throw new AssertionError("Unknown preview kind: " + state.options.getPreviewKind()); - } - } - else if (state.options.isIncremental()) - { - return "incremental"; - } - return "full"; - } } private static final class SessionTable extends AbstractVirtualTable diff --git a/src/java/org/apache/cassandra/db/virtual/LogMessagesTable.java b/src/java/org/apache/cassandra/db/virtual/LogMessagesTable.java index 5903ac2ab5f3..87978e3fd966 100644 --- a/src/java/org/apache/cassandra/db/virtual/LogMessagesTable.java +++ b/src/java/org/apache/cassandra/db/virtual/LogMessagesTable.java @@ -18,15 +18,11 @@ package org.apache.cassandra.db.virtual; -import java.util.Collections; import java.util.Date; import java.util.Iterator; -import java.util.LinkedList; import java.util.List; import com.google.common.annotations.VisibleForTesting; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import ch.qos.logback.classic.spi.LoggingEvent; import org.apache.cassandra.config.CassandraRelevantProperties; @@ -50,11 +46,8 @@ * @see CASSANDRA-18238 * @see org.apache.cassandra.utils.logging.VirtualTableAppender */ -public final class LogMessagesTable extends AbstractMutableVirtualTable +public final class LogMessagesTable extends AbstractLoggerVirtualTable { - private static final Logger logger = LoggerFactory.getLogger(LogMessagesTable.class); - - public static final int LOGS_VIRTUAL_TABLE_MIN_ROWS = 1000; public static final int LOGS_VIRTUAL_TABLE_DEFAULT_ROWS = 50_000; public static final int LOGS_VIRTUAL_TABLE_MAX_ROWS = 100_000; @@ -67,11 +60,11 @@ public final class LogMessagesTable extends AbstractMutableVirtualTable public static final String LEVEL_COLUMN_NAME = "level"; public static final String MESSAGE_COLUMN_NAME = "message"; - private final List buffer; - LogMessagesTable(String keyspace) { - this(keyspace, resolveBufferSize()); + this(keyspace, resolveBufferSize(CassandraRelevantProperties.LOGS_VIRTUAL_TABLE_MAX_ROWS.getInt(), + LOGS_VIRTUAL_TABLE_MAX_ROWS, + LOGS_VIRTUAL_TABLE_DEFAULT_ROWS)); } @VisibleForTesting @@ -85,10 +78,14 @@ public final class LogMessagesTable extends AbstractMutableVirtualTable .addClusteringColumn(ORDER_IN_MILLISECOND_COLUMN_NAME, Int32Type.instance) .addRegularColumn(LOGGER_COLUMN_NAME, UTF8Type.instance) .addRegularColumn(LEVEL_COLUMN_NAME, UTF8Type.instance) - .addRegularColumn(MESSAGE_COLUMN_NAME, UTF8Type.instance).build()); + .addRegularColumn(MESSAGE_COLUMN_NAME, UTF8Type.instance).build(), + size); + } - logger.debug("capacity of virtual table {} is set to be at most {} rows", metadata().toString(), size); - buffer = BoundedLinkedList.create(size); + @Override + public List getMessages(LoggingEvent event) + { + return List.of(event); } @Override @@ -103,12 +100,12 @@ public DataSet data() int index = 0; - Iterator iterator = buffer.listIterator(); + Iterator iterator = buffer.listIterator(); while (iterator.hasNext()) { - LogMessage log = iterator.next(); + LoggingEvent log = iterator.next(); - milliSecondsOfCurrentLog = log.timestamp; + milliSecondsOfCurrentLog = log.getTimeStamp(); if (milliSecondsOfPreviousLog == milliSecondsOfCurrentLog) ++index; else @@ -116,86 +113,13 @@ public DataSet data() milliSecondsOfPreviousLog = milliSecondsOfCurrentLog; - result.row(new Date(log.timestamp), index) - .column(LOGGER_COLUMN_NAME, log.logger) - .column(LEVEL_COLUMN_NAME, log.level) - .column(MESSAGE_COLUMN_NAME, log.message); + result.row(new Date(milliSecondsOfCurrentLog), index) + .column(LOGGER_COLUMN_NAME, log.getLoggerName()) + .column(LEVEL_COLUMN_NAME, log.getLevel().toString()) + .column(MESSAGE_COLUMN_NAME, log.getFormattedMessage()); } } return result; } - - public void add(LoggingEvent event) - { - buffer.add(new LogMessage(event)); - } - - @Override - public void truncate() - { - buffer.clear(); - } - - @Override - public boolean allowFilteringImplicitly() - { - return false; - } - - @VisibleForTesting - static int resolveBufferSize() - { - int size = CassandraRelevantProperties.LOGS_VIRTUAL_TABLE_MAX_ROWS.getInt(); - return (size < LOGS_VIRTUAL_TABLE_MIN_ROWS || size > LOGS_VIRTUAL_TABLE_MAX_ROWS) - ? LOGS_VIRTUAL_TABLE_DEFAULT_ROWS : size; - } - - @VisibleForTesting - public static class LogMessage - { - public final long timestamp; - public final String logger; - public final String level; - public final String message; - - public LogMessage(LoggingEvent event) - { - this(event.getTimeStamp(), event.getLoggerName(), event.getLevel().toString(), event.getFormattedMessage()); - } - - public LogMessage(long timestamp, String logger, String level, String message) - { - this.timestamp = timestamp; - this.logger = logger; - this.level = level; - this.message = message; - } - } - - private static final class BoundedLinkedList extends LinkedList - { - private final int maxSize; - - public static List create(int size) - { - return Collections.synchronizedList(new BoundedLinkedList<>(size)); - } - - private BoundedLinkedList(int maxSize) - { - this.maxSize = maxSize; - } - - @Override - public boolean add(T t) - { - if (size() == maxSize) - removeLast(); - - addFirst(t); - - return true; - } - } } diff --git a/src/java/org/apache/cassandra/db/virtual/PartitionKeyStatsTable.java b/src/java/org/apache/cassandra/db/virtual/PartitionKeyStatsTable.java index d114e5faa763..550743c6a734 100644 --- a/src/java/org/apache/cassandra/db/virtual/PartitionKeyStatsTable.java +++ b/src/java/org/apache/cassandra/db/virtual/PartitionKeyStatsTable.java @@ -284,7 +284,7 @@ private AbstractBounds getBounds(TableMetadata target, Cluste { Slices s = clusteringIndexFilter.getSlices(target); Token startToken = target.partitioner.getMinimumToken(); - Token endToken = target.partitioner.getMaximumToken(); + Token endToken = target.partitioner.getMaximumTokenForSplitting(); BigInteger startTokenValue = new BigInteger(endToken.getTokenValue().toString(), 10); BigInteger endTokenValue = new BigInteger(startToken.getTokenValue().toString(), 10); diff --git a/src/java/org/apache/cassandra/db/virtual/SlowQueriesTable.java b/src/java/org/apache/cassandra/db/virtual/SlowQueriesTable.java new file mode 100644 index 000000000000..0d392d0ce27e --- /dev/null +++ b/src/java/org/apache/cassandra/db/virtual/SlowQueriesTable.java @@ -0,0 +1,195 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.db.virtual; + +import java.util.ArrayList; +import java.util.Date; +import java.util.Iterator; +import java.util.List; + +import com.google.common.annotations.VisibleForTesting; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import ch.qos.logback.classic.spi.LoggingEvent; +import org.apache.cassandra.config.CassandraRelevantProperties; +import org.apache.cassandra.db.ColumnFamilyStore; +import org.apache.cassandra.db.DecoratedKey; +import org.apache.cassandra.db.Keyspace; +import org.apache.cassandra.db.marshal.BooleanType; +import org.apache.cassandra.db.marshal.Int32Type; +import org.apache.cassandra.db.marshal.LongType; +import org.apache.cassandra.db.marshal.TimestampType; +import org.apache.cassandra.db.marshal.UTF8Type; +import org.apache.cassandra.db.monitoring.MonitoringTask.Operation; +import org.apache.cassandra.dht.LocalPartitioner; +import org.apache.cassandra.schema.TableMetadata; + +import static java.util.concurrent.TimeUnit.NANOSECONDS; + +public class SlowQueriesTable extends AbstractLoggerVirtualTable +{ + private static final Logger logger = LoggerFactory.getLogger(SlowQueriesTable.class); + + public static final int LOGS_VIRTUAL_TABLE_DEFAULT_ROWS = 10_000; + public static final int LOGS_VIRTUAL_TABLE_MAX_ROWS = 100_000; + + public static final String TABLE_NAME = "slow_queries"; + private static final String TABLE_COMMENT = "Slow queries"; + + public static final String KEYSPACE_COLUMN_NAME = "keyspace_name"; + public static final String TABLE_COLUMN_NAME = "table_name"; + public static final String TIMESTAMP_COLUMN_NAME = "timestamp"; + public static final String QUERY_COLUMN_NAME = "query"; + public static final String MINIMUM_TIME_COLUMN_NAME = "min_ms"; + public static final String MAXIMUM_TIME_COLUMN_NAME = "max_ms"; + public static final String AVERAGE_TIME_COLUMN_NAME = "avg_ms"; + public static final String TIMES_REPORTED_COLUMN_NAME = "times_reported"; + public static final String CROSS_NODE_COLUMN_NAME = "cross_node"; + + SlowQueriesTable(String keyspace) + { + this(keyspace, resolveBufferSize(CassandraRelevantProperties.LOGS_SLOW_QUERIES_VIRTUAL_TABLE_MAX_ROWS.getInt(), + LOGS_VIRTUAL_TABLE_MAX_ROWS, + LOGS_VIRTUAL_TABLE_DEFAULT_ROWS)); + } + + @VisibleForTesting + SlowQueriesTable(String keyspace, int size) + { + super(TableMetadata.builder(keyspace, TABLE_NAME) + .comment(TABLE_COMMENT) + .kind(TableMetadata.Kind.VIRTUAL) + .partitioner(new LocalPartitioner(UTF8Type.instance)) + .addPartitionKeyColumn(KEYSPACE_COLUMN_NAME, UTF8Type.instance) + .addClusteringColumn(TABLE_COLUMN_NAME, UTF8Type.instance) + .addClusteringColumn(TIMESTAMP_COLUMN_NAME, TimestampType.instance) + // We are adding query as a clustering column for uniqueness, + // In theory, it might happen that two monitoring operations + // would be emitted for same keyspace, same table at the exact same time + // (in milliseconds). That means that one operation would "shadow" + // another one because primary key would be same for both. + // To make it truly unique, we include query among clustering keys + // as well. If queries were same, then they would be also reported so + // (it would be reflected in "times_reported" column) + .addClusteringColumn(QUERY_COLUMN_NAME, UTF8Type.instance) + .addRegularColumn(MINIMUM_TIME_COLUMN_NAME, LongType.instance) + .addRegularColumn(MAXIMUM_TIME_COLUMN_NAME, LongType.instance) + .addRegularColumn(AVERAGE_TIME_COLUMN_NAME, LongType.instance) + .addRegularColumn(TIMES_REPORTED_COLUMN_NAME, Int32Type.instance) + .addRegularColumn(CROSS_NODE_COLUMN_NAME, BooleanType.instance) + .build(), + size); + } + + @Override + protected void applyPartitionDeletion(ColumnValues partitionKey) + { + String keyspace = partitionKey.value(0); + + synchronized (buffer) + { + buffer.removeIf(o -> o.keyspace().equals(keyspace)); + } + } + + @Override + public DataSet data() + { + SimpleDataSet result = new SimpleDataSet(metadata(), DecoratedKey.comparator.reversed()); + + synchronized (buffer) + { + Iterator iterator = buffer.listIterator(); + while (iterator.hasNext()) + { + Operation operation = iterator.next(); + + result.row(operation.keyspace(), operation.table(), new Date(operation.timestampMs()), operation.name()) + .column(MINIMUM_TIME_COLUMN_NAME, NANOSECONDS.toMillis(operation.minTimeNanos())) + .column(MAXIMUM_TIME_COLUMN_NAME, NANOSECONDS.toMillis(operation.maxTimeNanos())) + .column(AVERAGE_TIME_COLUMN_NAME, NANOSECONDS.toMillis(operation.averageTime())) + .column(TIMES_REPORTED_COLUMN_NAME, operation.numTimesReported()) + .column(CROSS_NODE_COLUMN_NAME, operation.isCrossNode()); + } + } + + return result; + } + + @Override + public List getMessages(LoggingEvent event) + { + try + { + List qualified = new ArrayList<>(); + for (Operation operation : Operation.deserialize(event.getMessage())) + { + + // in (improbable) case there is an operation which does not have + // keyspace / table on it, we just skip this from processing + // as we would have nothing to show for partition key and clustering column + if (operation.keyspace() == null || operation.table() == null) + continue; + + // if cf of an operation is present, take keyspace and table name from it + // instead of having new string instances per operation which might + // take relatively a lot of additional space unnecessarily + Keyspace keyspace = Keyspace.openIfExists(operation.keyspace()); + String keyspaceName; + String tableName; + if (keyspace != null) + { + keyspaceName = keyspace.getName(); + try + { + ColumnFamilyStore table = keyspace.getColumnFamilyStore(operation.table()); + tableName = table.getTableName(); + } + catch (IllegalArgumentException ex) + { + tableName = operation.table(); + } + } + else + { + keyspaceName = operation.keyspace(); + tableName = operation.table(); + } + + operation.setKeyspace(keyspaceName); + operation.setTable(tableName); + qualified.add(operation); + } + + return qualified; + } + catch (Throwable t) + { + logger.trace("Unable to generate list of slow queries", t); + return null; + } + } + + @Override + public boolean allowFilteringImplicitly() + { + return true; + } +} diff --git a/src/java/org/apache/cassandra/db/virtual/SystemViewsKeyspace.java b/src/java/org/apache/cassandra/db/virtual/SystemViewsKeyspace.java index dacf9f643a8c..28c6dc8fef40 100644 --- a/src/java/org/apache/cassandra/db/virtual/SystemViewsKeyspace.java +++ b/src/java/org/apache/cassandra/db/virtual/SystemViewsKeyspace.java @@ -61,6 +61,7 @@ private SystemViewsKeyspace() .add(new GossipInfoTable(VIRTUAL_VIEWS)) .add(new QueriesTable(VIRTUAL_VIEWS)) .add(new LogMessagesTable(VIRTUAL_VIEWS)) + .add(new SlowQueriesTable(VIRTUAL_VIEWS)) .add(new SnapshotsTable(VIRTUAL_VIEWS)) .add(new PeersTable(VIRTUAL_VIEWS)) .add(new LocalTable(VIRTUAL_VIEWS)) @@ -69,6 +70,7 @@ private SystemViewsKeyspace() .addAll(LocalRepairTables.getAll(VIRTUAL_VIEWS)) .addAll(CIDRFilteringMetricsTable.getAll(VIRTUAL_VIEWS)) .addAll(StorageAttachedIndexTables.getAll(VIRTUAL_VIEWS)) + .addAll(AccordVirtualTables.getAll(VIRTUAL_VIEWS)) .build()); } } diff --git a/src/java/org/apache/cassandra/db/virtual/VirtualKeyspace.java b/src/java/org/apache/cassandra/db/virtual/VirtualKeyspace.java index 044c11476bb1..a0585a2694b5 100644 --- a/src/java/org/apache/cassandra/db/virtual/VirtualKeyspace.java +++ b/src/java/org/apache/cassandra/db/virtual/VirtualKeyspace.java @@ -50,7 +50,7 @@ public VirtualKeyspace(String name, Collection tables) if (!duplicates.isEmpty()) throw new IllegalArgumentException(String.format("Duplicate table names in virtual keyspace %s: %s", name, duplicates)); - metadata = KeyspaceMetadata.virtual(name, Tables.of(Iterables.transform(tables, VirtualTable::metadata))); + this.metadata = KeyspaceMetadata.virtual(name, Tables.of(Iterables.transform(tables, VirtualTable::metadata))); } public String name() @@ -58,13 +58,13 @@ public String name() return name; } - public KeyspaceMetadata metadata() + public ImmutableCollection tables() { - return metadata; + return tables; } - public ImmutableCollection tables() + public KeyspaceMetadata metadata() { - return tables; + return metadata; } } diff --git a/src/java/org/apache/cassandra/db/virtual/VirtualMutation.java b/src/java/org/apache/cassandra/db/virtual/VirtualMutation.java index 8c3b5b4afda6..ee98da29c115 100644 --- a/src/java/org/apache/cassandra/db/virtual/VirtualMutation.java +++ b/src/java/org/apache/cassandra/db/virtual/VirtualMutation.java @@ -19,7 +19,9 @@ import java.util.Collection; import java.util.concurrent.TimeUnit; +import java.util.function.Predicate; import java.util.function.Supplier; +import javax.annotation.Nullable; import com.google.common.base.MoreObjects; import com.google.common.collect.ImmutableMap; @@ -28,6 +30,7 @@ import org.apache.cassandra.db.DecoratedKey; import org.apache.cassandra.db.IMutation; import org.apache.cassandra.db.Mutation; +import org.apache.cassandra.db.ReadCommand.PotentialTxnConflicts; import org.apache.cassandra.db.partitions.PartitionUpdate; import org.apache.cassandra.schema.TableId; import org.apache.cassandra.service.ClientState; @@ -107,6 +110,12 @@ public Collection getPartitionUpdates() return modifications.values(); } + @Override + public boolean hasUpdateForTable(TableId tableId) + { + return modifications.containsKey(tableId); + } + @Override public Supplier hintOnFailure() { @@ -123,4 +132,19 @@ public void validateSize(int version, int overhead) { // no-op } + + @Override + public @Nullable VirtualMutation filter(Predicate test) + { + throw new UnsupportedOperationException(); + } + + /* + * Accord doesn't support reading/writing virtual tables yet so updating them non-transactionally is always safe + */ + @Override + public PotentialTxnConflicts potentialTxnConflicts() + { + return PotentialTxnConflicts.ALLOW; + } } diff --git a/src/java/org/apache/cassandra/dht/AccordBytesSplitter.java b/src/java/org/apache/cassandra/dht/AccordBytesSplitter.java new file mode 100644 index 000000000000..9af419274882 --- /dev/null +++ b/src/java/org/apache/cassandra/dht/AccordBytesSplitter.java @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.dht; + +import java.math.BigInteger; + +import accord.api.RoutingKey; +import accord.primitives.Ranges; +import accord.utils.Invariants; +import org.apache.cassandra.service.accord.api.TokenKey; + +import static accord.utils.Invariants.requireArgument; +import static java.math.BigInteger.ONE; +import static java.math.BigInteger.ZERO; + +public class AccordBytesSplitter extends AccordSplitter +{ + final int byteLength; + + protected AccordBytesSplitter(Ranges ranges) + { + int bytesLength = 0; + for (accord.primitives.Range range : ranges) + { + bytesLength = Integer.max(bytesLength, byteLength(range.start())); + bytesLength = Integer.max(bytesLength, byteLength(range.end())); + } + // In the single node single token case the ranges in TCM are merged to +/-inf which have no token + // and no byte length. This isn't really a problem because byte length isn't really that important it just means + // the shard boundaries will be arbitrary. You won't notice a problem until you go to add nodes and more tokens + // and suddenly the splitter might use a different length and now your shards are laid out slightly differently at + // each node which would result in a small amount of metadata moving between command stores. + // Since BOP is already not working/supported I think it's fine to punt on this. + if (bytesLength == 0) + { + requireArgument(ranges.size() <= 1); + requireArgument(ranges.isEmpty() || ((TokenKey)ranges.get(0).start()).isMin()); + requireArgument(ranges.isEmpty() || ((TokenKey)ranges.get(0).end()).isMax()); + // Intentionally does not match 16 that is used by ServerTestUtils.getRandomToken to elicit breakage + bytesLength = 8; + } + this.byteLength = bytesLength; + } + + @Override + BigInteger minimumValue() + { + return ZERO; + } + + @Override + BigInteger maximumValue() + { + return ONE.shiftLeft(8 * byteLength).subtract(ONE); + } + + @Override + BigInteger valueForToken(Token token) + { + byte[] bytes = ((ByteOrderedPartitioner.BytesToken) token).token; + requireArgument(bytes.length <= byteLength); + BigInteger value = ZERO; + for (int i = 0 ; i < bytes.length ; ++i) + value = value.add(BigInteger.valueOf(bytes[i] & 0xffL).shiftLeft((byteLength - 1 - i) * 8)); + return value; + } + + @Override + Token tokenForValue(BigInteger value) + { + Invariants.requireArgument(value.compareTo(ZERO) >= 0); + byte[] bytes = new byte[byteLength]; + for (int i = 0 ; i < bytes.length ; ++i) + bytes[i] = value.shiftRight((byteLength - 1 - i) * 8).byteValue(); + return new ByteOrderedPartitioner.BytesToken(bytes); + } + + private static int byteLength(RoutingKey routingKey) + { + TokenKey accordKey = (TokenKey) routingKey; + return byteLength(accordKey.token()); + } + + private static int byteLength(Token token) + { + return ((ByteOrderedPartitioner.BytesToken) token).token.length; + } +} diff --git a/src/java/org/apache/cassandra/dht/AccordSplitter.java b/src/java/org/apache/cassandra/dht/AccordSplitter.java new file mode 100644 index 000000000000..6fc25c1feacc --- /dev/null +++ b/src/java/org/apache/cassandra/dht/AccordSplitter.java @@ -0,0 +1,125 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.dht; + +import java.math.BigInteger; + +import accord.local.ShardDistributor; +import accord.primitives.Range; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.service.accord.TokenRange; +import org.apache.cassandra.service.accord.api.TokenKey; + +import static java.math.BigInteger.ZERO; + +public abstract class AccordSplitter implements ShardDistributor.EvenSplit.Splitter +{ + abstract BigInteger valueForToken(Token token); + abstract Token tokenForValue(BigInteger value); + abstract BigInteger minimumValue(); + abstract BigInteger maximumValue(); + + @Override + public BigInteger sizeOf(accord.primitives.Range range) + { + // note: minimum value + TokenKey startBound = (TokenKey)range.start(); + TokenKey endBound = (TokenKey)range.end(); + BigInteger start = startBound.isMin() ? minimumValue() : valueForToken(((TokenKey)range.start()).token()); + BigInteger end = endBound.isMax() ? maximumValue() : valueForToken(((TokenKey)range.end()).token()); + return end.subtract(start); + } + + @Override + public TokenRange subRange(accord.primitives.Range range, BigInteger startOffset, BigInteger endOffset) + { + TokenKey startBound = (TokenKey)range.start(); + TokenKey endBound = (TokenKey)range.end(); + + BigInteger start = startBound.isMin() ? minimumValue() : valueForToken(startBound.token()); + BigInteger end = endBound.isMax() ? maximumValue() : valueForToken(endBound.token()); + BigInteger sizeOfRange = end.subtract(start); + + TableId tableId = startBound.table(); + return TokenRange.create(startOffset.equals(ZERO) ? startBound : new TokenKey(tableId, tokenForValue(start.add(startOffset))), + endOffset.compareTo(sizeOfRange) >= 0 ? endBound : new TokenKey(tableId, tokenForValue(start.add(endOffset)))); + } + + @Override + public boolean splittable(Range range, int numSplits) + { + return sizeOf(range).compareTo(BigInteger.valueOf(numSplits)) >= 0; + } + + @Override + public BigInteger zero() + { + return ZERO; + } + + @Override + public BigInteger valueOf(int v) + { + return BigInteger.valueOf(v); + } + + + @Override + public BigInteger add(BigInteger a, BigInteger b) + { + return a.add(b); + } + + @Override + public BigInteger subtract(BigInteger a, BigInteger b) + { + return a.subtract(b); + } + + @Override + public BigInteger divide(BigInteger a, int i) + { + return a.divide(BigInteger.valueOf(i)); + } + + @Override + public BigInteger divide(BigInteger a, BigInteger i) + { + return a.divide(i); + } + + + @Override + public BigInteger multiply(BigInteger a, int i) + { + return a.multiply(BigInteger.valueOf(i)); + } + + @Override + public int min(BigInteger v, int i) + { + return v.min(BigInteger.valueOf(i)).intValue(); + } + + @Override + public int compare(BigInteger a, BigInteger b) + { + return a.compareTo(b); + } +} diff --git a/src/java/org/apache/cassandra/dht/BootStrapper.java b/src/java/org/apache/cassandra/dht/BootStrapper.java index ebf04dddf89f..609b4b89e783 100644 --- a/src/java/org/apache/cassandra/dht/BootStrapper.java +++ b/src/java/org/apache/cassandra/dht/BootStrapper.java @@ -126,7 +126,8 @@ public Future bootstrap(StreamStateStore stateStore, boolean useStr true, DatabaseDescriptor.getStreamingConnectionsPerHost(), movements, - strictMovements); + strictMovements, + true); if (beingReplaced != null) streamer.addSourceFilter(new RangeStreamer.ExcludedSourcesFilter(Collections.singleton(beingReplaced))); diff --git a/src/java/org/apache/cassandra/dht/ByteOrderedPartitioner.java b/src/java/org/apache/cassandra/dht/ByteOrderedPartitioner.java index 88f2a3b6f004..527b5113ba44 100644 --- a/src/java/org/apache/cassandra/dht/ByteOrderedPartitioner.java +++ b/src/java/org/apache/cassandra/dht/ByteOrderedPartitioner.java @@ -17,35 +17,41 @@ */ package org.apache.cassandra.dht; -import org.apache.cassandra.schema.TableMetadata; -import org.apache.cassandra.schema.Schema; +import java.io.IOException; +import java.math.BigInteger; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.Random; +import java.util.concurrent.ThreadLocalRandom; +import java.util.function.Function; + +import com.google.common.collect.Maps; +import org.apache.commons.lang3.ArrayUtils; + +import accord.primitives.Ranges; import org.apache.cassandra.db.BufferDecoratedKey; import org.apache.cassandra.db.DecoratedKey; import org.apache.cassandra.db.marshal.AbstractType; import org.apache.cassandra.db.marshal.BytesType; +import org.apache.cassandra.db.marshal.ValueAccessor; import org.apache.cassandra.exceptions.ConfigurationException; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.schema.Schema; +import org.apache.cassandra.schema.TableMetadata; import org.apache.cassandra.service.StorageService; +import org.apache.cassandra.service.accord.api.TokenKey.Serializer; import org.apache.cassandra.utils.ByteBufferUtil; -import org.apache.cassandra.utils.bytecomparable.ByteComparable; -import org.apache.cassandra.utils.bytecomparable.ByteSource; -import org.apache.cassandra.utils.bytecomparable.ByteSourceInverse; import org.apache.cassandra.utils.FBUtilities; import org.apache.cassandra.utils.Hex; import org.apache.cassandra.utils.ObjectSizes; import org.apache.cassandra.utils.Pair; - -import org.apache.commons.lang3.ArrayUtils; - -import java.math.BigInteger; -import java.nio.ByteBuffer; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.Map; -import java.util.Random; -import java.util.concurrent.ThreadLocalRandom; - -import com.google.common.collect.Maps; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.bytecomparable.ByteSource; +import org.apache.cassandra.utils.bytecomparable.ByteSourceInverse; public class ByteOrderedPartitioner implements IPartitioner { @@ -128,6 +134,18 @@ public Object getTokenValue() return token; } + @Override + public int tokenHash() + { + return hashCode(); + } + + @Override + public TokenFactory tokenFactory() + { + return tokenFactory; + } + @Override public double size(Token next) { @@ -141,8 +159,57 @@ public Token nextValidToken() throw new UnsupportedOperationException(String.format("Token type %s does not support token allocation.", getClass().getSimpleName())); } + + public Token increaseSlightly() + { + // find first byte we can increment + int i = token.length - 1; + while (i >= 0) + { + if (token[i] != -1) + break; + --i; + } + if (i == -1) + return new BytesToken(Arrays.copyOf(token, token.length + 1)); + + // increment and fill remainder with zeros + byte[] newToken = token.clone(); + ++newToken[i]; + Arrays.fill(newToken, i + 1, newToken.length, (byte)0); + return new BytesToken(newToken); + } + + @Override + public Token decreaseSlightly() + { + if (token.length == 0) + throw new IndexOutOfBoundsException("Cannot create a smaller token the MINIMUM"); + + // find first byte we can decrement + int i = token.length - 1; + while (i >= 0) + { + if (token[i] != 0) + break; + --i; + } + if (i == -1) + { + byte[] newToken = Arrays.copyOf(token, token.length - 1); + return new BytesToken(newToken); + } + + // decrement and fill remainder with -1 + byte[] newToken = token.clone(); + --newToken[i]; + Arrays.fill(newToken, i + 1, newToken.length, (byte)-1); + return new BytesToken(newToken); + } } + private ByteOrderedPartitioner() {} + public BytesToken getToken(ByteBuffer key) { if (key.remaining() == 0) @@ -229,7 +296,7 @@ public BytesToken getRandomToken(Random random) return new BytesToken(buffer); } - private final Token.TokenFactory tokenFactory = new Token.TokenFactory() + private static final Token.TokenFactory tokenFactory = new Token.TokenFactory() { public Token fromComparableBytes(ByteSource.Peekable comparableBytes, ByteComparable.Version version) { @@ -286,6 +353,58 @@ public Token.TokenFactory getTokenFactory() return tokenFactory; } + @Override + public boolean accordSupported() + { + return true; + } + + @Override + public final void accordSerialize(Token token, DataOutputPlus out) throws IOException + { + Serializer.serializeWithEscapes(((BytesToken)token).token, out); + } + + @Override + public final void accordSerialize(Token token, ByteBuffer out) + { + Serializer.serializeWithEscapes(((BytesToken)token).token, out); + } + + @Override + public final Token accordDeserialize(DataInputPlus in, int length) throws IOException + { + byte[] bytes = Serializer.deserializeWithEscapes(in, length); + return new BytesToken(bytes); + } + + @Override + public final Token accordDeserialize(ByteBuffer in, int length) + { + byte[] bytes = Serializer.deserializeWithEscapes(in, length); + return new BytesToken(bytes); + } + + @Override + public final Token accordDeserialize(V src, ValueAccessor accessor, int offset, int length) + { + byte[] bytes = Serializer.deserializeWithEscapes(src, accessor, offset, length); + return new BytesToken(bytes); + } + + @Override + public final int accordSerializedSize(Token token) + { + byte[] bytes = ((BytesToken)token).token; + return Serializer.serializedSize(bytes); + } + + @Override + public final int accordFixedLength() + { + return -1; + } + public boolean preservesOrder() { return true; @@ -339,4 +458,10 @@ public AbstractType partitionOrdering() { return BytesType.instance; } + + @Override + public Function accordSplitter() + { + return AccordBytesSplitter::new; + } } diff --git a/src/java/org/apache/cassandra/dht/ComparableObjectToken.java b/src/java/org/apache/cassandra/dht/ComparableObjectToken.java index 4a6aa8d5a879..8aada75663ff 100644 --- a/src/java/org/apache/cassandra/dht/ComparableObjectToken.java +++ b/src/java/org/apache/cassandra/dht/ComparableObjectToken.java @@ -80,4 +80,11 @@ public Token nextValidToken() throw new UnsupportedOperationException(String.format("Token type %s does not support token allocation.", getClass().getSimpleName())); } + + @Override + public Token decreaseSlightly() + { + throw new UnsupportedOperationException(String.format("Token type %s does not support token allocation.", + getClass().getSimpleName())); + } } diff --git a/src/java/org/apache/cassandra/dht/IPartitioner.java b/src/java/org/apache/cassandra/dht/IPartitioner.java index 341ebc47f1d4..af2d9051b69e 100644 --- a/src/java/org/apache/cassandra/dht/IPartitioner.java +++ b/src/java/org/apache/cassandra/dht/IPartitioner.java @@ -17,15 +17,20 @@ */ package org.apache.cassandra.dht; +import java.io.IOException; import java.nio.ByteBuffer; import java.util.List; import java.util.Map; import java.util.Optional; import java.util.Random; +import java.util.function.Function; import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.db.DecoratedKey; import org.apache.cassandra.db.marshal.AbstractType; +import org.apache.cassandra.db.marshal.ValueAccessor; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; import javax.annotation.Nullable; @@ -67,9 +72,9 @@ static IPartitioner global() * The biggest token for this partitioner, unlike getMinimumToken, this token is actually used and users wanting to * include all tokens need to do getMaximumToken().maxKeyBound() * - * Not implemented for the ordered partitioners + * THIS IS NOT SAFE FOR PURPOSES BESIDES SPLITTING/BALANCING */ - default Token getMaximumToken() + default Token getMaximumTokenForSplitting() { throw new UnsupportedOperationException("If you are using a splitting partitioner, getMaximumToken has to be implemented"); } @@ -90,6 +95,16 @@ default boolean supportsSplitting() */ public Token getToken(ByteBuffer key); + /** + * @return a Token that can be used to route a given key + * (This is NOT a method to create a Token from its string representation; + * for that, use TokenFactory.fromString.) + */ + default int compareToken(ByteBuffer key, Token token) + { + return getToken(key).compareTo(token); + } + /** * @return a randomly generated token */ @@ -144,8 +159,24 @@ default Optional splitter() return Optional.empty(); } + Function accordSplitter(); + + default boolean isFixedLength() + { + return false; + } + default public int getMaxTokenSize() { return Integer.MIN_VALUE; } + + default boolean accordSupported() { return false; } + default void accordSerialize(Token token, DataOutputPlus out) throws IOException { throw new UnsupportedOperationException(); } + default void accordSerialize(Token token, ByteBuffer out) { throw new UnsupportedOperationException(); } + default Token accordDeserialize(DataInputPlus in, int length) throws IOException { throw new UnsupportedOperationException(); } + default Token accordDeserialize(ByteBuffer in, int length) { throw new UnsupportedOperationException(); } + default Token accordDeserialize(V src, ValueAccessor accessor, int offset, int length) { throw new UnsupportedOperationException(); } + default int accordSerializedSize(Token token) { throw new UnsupportedOperationException(); } + default int accordFixedLength() { throw new UnsupportedOperationException(); } } diff --git a/src/java/org/apache/cassandra/dht/IPartitionerDependentSerializer.java b/src/java/org/apache/cassandra/dht/IPartitionerDependentSerializer.java index 5c75788c0b97..8c612fb23b24 100644 --- a/src/java/org/apache/cassandra/dht/IPartitionerDependentSerializer.java +++ b/src/java/org/apache/cassandra/dht/IPartitionerDependentSerializer.java @@ -19,8 +19,9 @@ import java.io.IOException; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.io.IVersionedSerializer; import org.apache.cassandra.io.util.DataInputPlus; -import org.apache.cassandra.io.util.DataOutputPlus; /** * Versioned serializer where the serialization depends on partitioner. @@ -28,18 +29,8 @@ * On serialization the partitioner is given by the entity being serialized. To deserialize the partitioner used must * be known to the calling method. */ -public interface IPartitionerDependentSerializer +public interface IPartitionerDependentSerializer extends IVersionedSerializer { - /** - * Serialize the specified type into the specified DataOutputStream instance. - * - * @param t type that needs to be serialized - * @param out DataOutput into which serialization needs to happen. - * @param version protocol version - * @throws java.io.IOException if serialization fails - */ - public void serialize(T t, DataOutputPlus out, int version) throws IOException; - /** * Deserialize into the specified DataInputStream instance. * @param in DataInput from which deserialization needs to happen. @@ -51,11 +42,8 @@ public interface IPartitionerDependentSerializer */ public T deserialize(DataInputPlus in, IPartitioner p, int version) throws IOException; - /** - * Calculate serialized size of object without actually serializing. - * @param t object to calculate serialized size - * @param version protocol version - * @return serialized size of object t - */ - public long serializedSize(T t, int version); + default T deserialize(DataInputPlus in, int version) throws IOException + { + return deserialize(in, DatabaseDescriptor.getPartitioner(), version); + } } diff --git a/src/java/org/apache/cassandra/dht/LocalPartitioner.java b/src/java/org/apache/cassandra/dht/LocalPartitioner.java index 185871d9a27a..4c45887dc490 100644 --- a/src/java/org/apache/cassandra/dht/LocalPartitioner.java +++ b/src/java/org/apache/cassandra/dht/LocalPartitioner.java @@ -21,10 +21,13 @@ import java.util.Collections; import java.util.List; import java.util.Map; +import java.util.Objects; import java.util.Random; +import java.util.function.Function; -import org.apache.cassandra.db.DecoratedKey; +import accord.primitives.Ranges; import org.apache.cassandra.db.CachedHashDecoratedKey; +import org.apache.cassandra.db.DecoratedKey; import org.apache.cassandra.db.marshal.AbstractType; import org.apache.cassandra.db.marshal.ByteBufferAccessor; import org.apache.cassandra.utils.ByteBufferUtil; @@ -37,7 +40,7 @@ public class LocalPartitioner implements IPartitioner { private static final long EMPTY_SIZE = ObjectSizes.measure(new LocalPartitioner(null).new LocalToken()); - final AbstractType comparator; // package-private to avoid access workarounds in embedded LocalToken. + protected final AbstractType comparator; public LocalPartitioner(AbstractType comparator) { @@ -69,6 +72,11 @@ public LocalToken getToken(ByteBuffer key) return new LocalToken(key); } + public int compareToken(ByteBuffer key, Token token) + { + return comparator.compare(key, ((LocalToken)token).token); + } + public LocalToken getRandomToken() { throw new UnsupportedOperationException(); @@ -138,6 +146,21 @@ public AbstractType partitionOrdering() return comparator; } + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + LocalPartitioner that = (LocalPartitioner) o; + return comparator.equals(that.comparator) && tokenFactory.equals(that.tokenFactory); + } + + @Override + public int hashCode() + { + return Objects.hash(comparator, tokenFactory); + } + public class LocalToken extends ComparableObjectToken { static final long serialVersionUID = 8437543776403014875L; @@ -162,8 +185,7 @@ public String toString() public int compareTo(Token o) { // todo (tcm); seems partitioner got mutated on alter type (for example) before tcm, now we create a new one - not sure its enough just making sure that its the same type of partitioner - assert o.getPartitioner().getClass().equals(getPartitioner().getClass()); -// assert getPartitioner() == o.getPartitioner() : String.format("partitioners do not match; %s != %s", getPartitioner(), o.getPartitioner()); + assert o.getPartitioner().getClass().equals(getPartitioner().getClass()) : String.format("partitioners do not match; %s != %s", getPartitioner(), o.getPartitioner()); return comparator.compare(token, ((LocalToken) o).token); } @@ -174,6 +196,12 @@ public int hashCode() return prime + token.hashCode(); } + @Override + public int tokenHash() + { + return hashCode(); + } + @Override public boolean equals(Object obj) { @@ -203,4 +231,10 @@ public long getHeapSize() return EMPTY_SIZE + ObjectSizes.sizeOnHeapOf(token); } } + + @Override + public Function accordSplitter() + { + return AccordBytesSplitter::new; + } } diff --git a/src/java/org/apache/cassandra/dht/Murmur3Partitioner.java b/src/java/org/apache/cassandra/dht/Murmur3Partitioner.java index dfe0971f7a46..35122bc24996 100644 --- a/src/java/org/apache/cassandra/dht/Murmur3Partitioner.java +++ b/src/java/org/apache/cassandra/dht/Murmur3Partitioner.java @@ -21,26 +21,35 @@ import java.math.BigDecimal; import java.math.BigInteger; import java.nio.ByteBuffer; -import java.util.*; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Random; import java.util.concurrent.ThreadLocalRandom; +import java.util.function.Function; +import com.google.common.annotations.VisibleForTesting; +import com.google.common.primitives.Longs; + +import accord.primitives.Ranges; import org.apache.cassandra.db.DecoratedKey; import org.apache.cassandra.db.PreHashedDecoratedKey; import org.apache.cassandra.db.TypeSizes; import org.apache.cassandra.db.marshal.AbstractType; -import org.apache.cassandra.db.marshal.PartitionerDefinedOrder; import org.apache.cassandra.db.marshal.LongType; +import org.apache.cassandra.db.marshal.PartitionerDefinedOrder; +import org.apache.cassandra.db.marshal.ValueAccessor; import org.apache.cassandra.exceptions.ConfigurationException; +import org.apache.cassandra.io.util.DataInputPlus; import org.apache.cassandra.io.util.DataOutputPlus; import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.MurmurHash; +import org.apache.cassandra.utils.ObjectSizes; import org.apache.cassandra.utils.bytecomparable.ByteComparable; import org.apache.cassandra.utils.bytecomparable.ByteSource; import org.apache.cassandra.utils.bytecomparable.ByteSourceInverse; -import org.apache.cassandra.utils.MurmurHash; -import org.apache.cassandra.utils.ObjectSizes; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.primitives.Longs; /** * This class generates a BigIntegerToken using a Murmur3 hash. @@ -58,6 +67,8 @@ public class Murmur3Partitioner implements IPartitioner private final Splitter splitter = new Splitter(this) { + final BigInteger MAX = BigInteger.valueOf(Long.MAX_VALUE), MIN = BigInteger.valueOf(Long.MIN_VALUE); + public Token tokenForValue(BigInteger value) { return new LongToken(value.longValue()); @@ -67,8 +78,22 @@ public BigInteger valueForToken(Token token) { return BigInteger.valueOf(((LongToken) token).token); } + + @Override + BigInteger minimumValue() + { + return MIN; + } + + @Override + BigInteger maximumValue() + { + return MAX; + } }; + protected Murmur3Partitioner() {} + public DecoratedKey decorateKey(ByteBuffer key) { long[] hash = getHash(key); @@ -214,6 +239,18 @@ public long getLongValue() return token; } + @Override + public int tokenHash() + { + return Long.hashCode(token); + } + + @Override + public TokenFactory tokenFactory() + { + return tokenFactory; + } + @Override public double size(Token next) { @@ -226,11 +263,23 @@ public double size(Token next) @Override public LongToken nextValidToken() { + // CASSANDRA-17109 Added the below checks, but paxos tests were not updated, rather than fix + // the paxos tests, disabling the checks for now. The current paxos tests bias twards MIN but + // not for MAX, which makes the test very flaky as when MAX is generated the test fails... + // TODO (required): this check breaks a bunch of tests, but should be re-enabled +// if (token == MAXIMUM) +// throw new IllegalArgumentException("Cannot increase above MAXIMUM"); + return new LongToken(token + 1); } public LongToken decreaseSlightly() { + // CASSANDRA-17109 Added the below checks, but paxos tests were not updated, rather than fix + // the paxos tests, disabling the checks for now +// if (equals(MINIMUM)) +// throw new IllegalArgumentException("Cannot decrease below MINIMUM"); + return new LongToken(token - 1); } @@ -271,11 +320,69 @@ private LongToken getToken(ByteBuffer key, long[] hash) return new LongToken(normalize(hash[0])); } + @Override + public boolean isFixedLength() + { + return true; + } + public int getMaxTokenSize() { return MAXIMUM_TOKEN_SIZE; } + public final boolean accordSupported() + { + return true; + } + + @Override + public final void accordSerialize(Token token, DataOutputPlus out) throws IOException + { + out.writeLong(flip(((LongToken)token).token)); + } + + @Override + public final void accordSerialize(Token token, ByteBuffer out) + { + out.putLong(flip(((LongToken)token).token)); + } + + @Override + public final Token accordDeserialize(DataInputPlus in, int length) throws IOException + { + return new LongToken(flip(in.readLong())); + } + + @Override + public final Token accordDeserialize(ByteBuffer in, int length) + { + return new LongToken(flip(in.getLong())); + } + + @Override + public final Token accordDeserialize(V src, ValueAccessor accessor, int offset, int length) + { + return new LongToken(flip(accessor.getLong(src, offset))); + } + + @Override + public final int accordSerializedSize(Token token) + { + return 8; + } + + @Override + public final int accordFixedLength() + { + return 8; + } + + private static long flip(long value) + { + return value ^ 0x8000000000000000L; + } + private long[] getHash(ByteBuffer key) { long[] hash = new long[2]; @@ -344,7 +451,7 @@ public Token.TokenFactory getTokenFactory() return tokenFactory; } - private final Token.TokenFactory tokenFactory = new Token.TokenFactory() + private static final Token.TokenFactory tokenFactory = new Token.TokenFactory() { public Token fromComparableBytes(ByteSource.Peekable comparableBytes, ByteComparable.Version version) { @@ -364,6 +471,12 @@ public void serialize(Token token, DataOutputPlus out) throws IOException out.writeLong(((LongToken) token).token); } + @Override + public Token deserialize(DataInputPlus in, IPartitioner p) throws IOException + { + return new LongToken(in.readLong()); + } + @Override public void serialize(Token token, ByteBuffer out) { @@ -422,7 +535,7 @@ public AbstractType getTokenValidator() return LongType.instance; } - public Token getMaximumToken() + public Token getMaximumTokenForSplitting() { return new LongToken(Long.MAX_VALUE); } @@ -441,4 +554,10 @@ public Optional splitter() { return Optional.of(splitter); } + + @Override + public Function accordSplitter() + { + return ignore -> splitter; + } } diff --git a/src/java/org/apache/cassandra/dht/NormalizedRanges.java b/src/java/org/apache/cassandra/dht/NormalizedRanges.java new file mode 100644 index 000000000000..c9a040a6585b --- /dev/null +++ b/src/java/org/apache/cassandra/dht/NormalizedRanges.java @@ -0,0 +1,297 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.dht; + +import java.util.AbstractList; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.Comparator; +import java.util.Iterator; +import java.util.List; +import java.util.Objects; +import java.util.RandomAccess; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.Iterators; +import com.google.common.collect.PeekingIterator; + +import static com.google.common.base.Preconditions.checkState; + +/* + * Immutable list of ranges that are statically known to be normalized + */ +public class NormalizedRanges> extends AbstractList> implements List>, RandomAccess +{ + private static final NormalizedRanges EMPTY_NORMALIZED_RANGES = new NormalizedRanges(Collections.emptyList()); + + private static final Comparator NORMALIZED_TOKEN_RANGE_COMPARATOR = (o1, o2) -> { + Range range = (Range) o1; + RingPosition key = (RingPosition) o2; + boolean rangeRightIsMin = range.right.isMinimum(); + boolean keyIsMinimum = key.isMinimum(); + + if (keyIsMinimum & rangeRightIsMin) + return 0; + + int lc = key.compareTo(range.left); + int rc = key.compareTo(range.right); + if ((lc < 0 & !keyIsMinimum) | lc == 0) return 1; + if (rc > 0 & !rangeRightIsMin) return -1; + return 0; + }; + + public static > NormalizedRanges empty() + { + return (NormalizedRanges) EMPTY_NORMALIZED_RANGES; + } + + /* + * Compares ranges by right token. Used for intersecting normalized ranges. + * + * Assumes no wrap around ranges except for RHS = minValue which is essentialy synonymous with the maximal value. + * This shows up coming out of unwrap because Range is not left inclusive so the only way to include minValue + * in the range is by wrapping from maxValue. + */ + private static > int compareNormalized(Range lhs, Range rhs) + { + // otherwise compare by right. + int cmp = lhs.right.compareTo(rhs.right); + // minValue on the RHS is maxValue, but doesn't work with compare so check for it explicitly + boolean rhsRMin = rhs.right.isMinimum(); + boolean lhsRMin = lhs.right.isMinimum(); + + if (rhsRMin && lhsRMin) + return 0; + + if (cmp < 0) + { + if (lhsRMin) + { + return 1; + } + return -1; + } + else if (cmp > 0) + { + if (rhsRMin) + { + return -1; + } + return 1; + } + return 0; + } + + private final Object[] ranges; + + private NormalizedRanges(Collection> ranges) + { + this.ranges = new Object[ranges.size()]; + int index = 0; + for (Range range : ranges) + this.ranges[index++] = range; + } + + public static > NormalizedRanges normalizedRanges(Collection> ranges) + { + if (ranges instanceof NormalizedRanges) + return (NormalizedRanges) ranges; + return new NormalizedRanges<>(Range.normalize(ranges)); + } + + @Override + public Range get(int index) + { + Objects.checkIndex(index, ranges.length); + return (Range) ranges[index]; + } + + public boolean intersects(T token) + { + if (this.size() == 1 && this.get(0).isFull()) + return true; + boolean isIn = Collections.binarySearch((List) this, token, NORMALIZED_TOKEN_RANGE_COMPARATOR) >= 0; + if (Range.EXPENSIVE_CHECKS) + checkState(Range.isInRanges(token, this) == isIn); + return isIn; + } + + public NormalizedRanges subtract(NormalizedRanges b) + { + if (b.isEmpty()) + return this; + + if (b.size() == 1 && b.get(0).isFull()) + return NormalizedRanges.empty(); + + if (this.size() == 1 && this.get(0).isFull()) + return b.invert(); + + List> remaining = new ArrayList<>(); + Iterator> aIter = this.iterator(); + Iterator> bIter = b.iterator(); + Range aRange = aIter.hasNext() ? aIter.next() : null; + Range bRange = bIter.hasNext() ? bIter.next() : null; + while (aRange != null && bRange != null) + { + boolean aRMin = aRange.right.isMinimum(); + boolean bRMin = bRange.right.isMinimum(); + + if (aRMin && bRMin) + { + if (aRange.left.compareTo(bRange.left) < 0) + remaining.add(new Range<>(aRange.left, bRange.left)); + checkState(!aIter.hasNext() && !bIter.hasNext()); + aRange = null; + break; + } + + if (!aRMin && aRange.right.compareTo(bRange.left) <= 0) + { + remaining.add(aRange); + aRange = aIter.hasNext() ? aIter.next() : null; + } + else if (!bRMin && aRange.left.compareTo(bRange.right) >= 0) + { + bRange = bIter.hasNext() ? bIter.next() : null; + } + else + { + // Handle what remains to the left of the intersection + if (aRange.left.compareTo(bRange.left) < 0) + { + remaining.add(new Range(aRange.left, bRange.left)); + } + + // Handle what remains to the right of the intersection + if (!aRMin && (aRange.right.compareTo(bRange.right) <= 0 | bRMin)) + aRange = aIter.hasNext() ? aIter.next() : null; + else + aRange = new Range(bRange.right, aRange.right); + } + } + + while (aRange != null) + { + remaining.add(aRange); + aRange = aIter.hasNext() ? aIter.next() : null; + } + + NormalizedRanges result = normalizedRanges(remaining); + if (Range.EXPENSIVE_CHECKS) + checkState(result.equals(Range.normalize(Range.subtract(this, b)))); + return result; + } + + @VisibleForTesting + NormalizedRanges invert() + { + checkState(!isEmpty()); + + List> result = new ArrayList<>(size() + 2); + T minValue = get(0).left.minValue(); + T left = minValue; + for (Range r : this) + { + if (!r.left.equals(left)) + { + result.add(new Range<>(left, r.left)); + } + left = r.right; + } + + // Loop doesn't add the range to the right of the last one + Range last = get(size() - 1); + if (!last.right.isMinimum()) + result.add(new Range<>(last.right, minValue)); + + result = Range.normalize(result); + if (Range.EXPENSIVE_CHECKS) + checkState(result.equals(Range.normalize(Range.subtract(ImmutableList.of(new Range<>(minValue, minValue)), this)))); + return new NormalizedRanges<>(result); + } + + public NormalizedRanges intersection(NormalizedRanges b) + { + if (this.size() == 1 && this.get(0).isFull()) + return b; + if (b.size() == 1 && b.get(0).isFull()) + return this; + + List> merged = new ArrayList<>(); + PeekingIterator> aIter = Iterators.peekingIterator(this.iterator()); + PeekingIterator> bIter = Iterators.peekingIterator(b.iterator()); + while (aIter.hasNext() && bIter.hasNext()) + { + Range aRange = aIter.peek(); + Range bRange = bIter.peek(); + + int cmp = compareNormalized(aRange, bRange); + if (aRange.intersects(bRange)) + { + merged.addAll(aRange.intersectionWith(bRange)); + if (cmp == 0) + { + aIter.next(); + bIter.next(); + } + else if (cmp < 0) + { + aIter.next(); + } + else + { + bIter.next(); + } + } + else + { + if (cmp <= 0) + aIter.next(); + if (cmp >= 0) + bIter.next(); + } + } + + NormalizedRanges result = normalizedRanges(merged); + + if (Range.EXPENSIVE_CHECKS) + { + List> expensiveResult = new ArrayList<>(); + for (Range r1 : this) + { + for (Range r2 : b) + { + expensiveResult.addAll(r1.intersectionWith(r2)); + } + } + checkState(result.equals(Range.normalize(expensiveResult))); + } + + return result; + } + + @Override + public int size() + { + return ranges.length; + } +} diff --git a/src/java/org/apache/cassandra/dht/OrderPreservingPartitioner.java b/src/java/org/apache/cassandra/dht/OrderPreservingPartitioner.java index eb2e01e3bcb3..fe76e0eace8d 100644 --- a/src/java/org/apache/cassandra/dht/OrderPreservingPartitioner.java +++ b/src/java/org/apache/cassandra/dht/OrderPreservingPartitioner.java @@ -20,11 +20,18 @@ import java.math.BigInteger; import java.nio.ByteBuffer; import java.nio.charset.CharacterCodingException; -import java.util.*; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Random; import java.util.concurrent.ThreadLocalRandom; +import java.util.function.Function; -import org.apache.cassandra.db.DecoratedKey; +import accord.api.RoutingKey; +import accord.primitives.Ranges; import org.apache.cassandra.db.CachedHashDecoratedKey; +import org.apache.cassandra.db.DecoratedKey; import org.apache.cassandra.db.marshal.AbstractType; import org.apache.cassandra.db.marshal.UTF8Type; import org.apache.cassandra.exceptions.ConfigurationException; @@ -32,6 +39,7 @@ import org.apache.cassandra.schema.Schema; import org.apache.cassandra.schema.TableMetadata; import org.apache.cassandra.service.StorageService; +import org.apache.cassandra.service.accord.api.TokenKey; import org.apache.cassandra.utils.ByteBufferUtil; import org.apache.cassandra.utils.bytecomparable.ByteComparable; import org.apache.cassandra.utils.bytecomparable.ByteSource; @@ -40,6 +48,11 @@ import org.apache.cassandra.utils.ObjectSizes; import org.apache.cassandra.utils.Pair; +import static accord.utils.Invariants.requireArgument; +import static java.lang.Integer.max; +import static java.math.BigInteger.ONE; +import static java.math.BigInteger.ZERO; + public class OrderPreservingPartitioner implements IPartitioner { private static final String rndchars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; @@ -61,6 +74,8 @@ public int compareTo(Token o) public static final OrderPreservingPartitioner instance = new OrderPreservingPartitioner(); + private OrderPreservingPartitioner() {} + public DecoratedKey decorateKey(ByteBuffer key) { return new CachedHashDecoratedKey(getToken(key), key); @@ -90,7 +105,7 @@ private static BigInteger bigForString(String str, int sigchars) { assert str.length() <= sigchars; - BigInteger big = BigInteger.ZERO; + BigInteger big = ZERO; for (int i = 0; i < str.length(); i++) { int charpos = 16 * (sigchars - (i + 1)); @@ -125,7 +140,7 @@ public StringToken getMinimumToken() return MINIMUM; } - public StringToken getMaximumToken() + public StringToken getMaximumTokenForSplitting() { return MAXIMUM; } @@ -232,6 +247,12 @@ public int compareTo(Token o) return super.compareTo(o); } + + @Override + public int tokenHash() + { + return token.hashCode(); + } } public StringToken getToken(ByteBuffer key) @@ -252,7 +273,7 @@ public Map describeOwnership(List sortedTokens) { // allTokens will contain the count and be returned, sorted_ranges is shorthand for token<->token math. Map allTokens = new HashMap(); - List> sortedRanges = new ArrayList>(sortedTokens.size()); + List> sortedRanges = new ArrayList<>(sortedTokens.size()); // this initializes the counts to 0 and calcs the ranges in order. Token lastToken = sortedTokens.get(sortedTokens.size() - 1); @@ -296,4 +317,59 @@ public AbstractType partitionOrdering() { return UTF8Type.instance; } + + @Override + public Function accordSplitter() + { + return ranges -> new AccordSplitter() + { + final int charLength = ranges.stream().mapToInt(range -> max(charLength(range.start()), charLength(range.end()))) + .max().orElse(0); + + @Override + BigInteger valueForToken(Token token) + { + String chars = ((StringToken) token).token; + requireArgument(chars.length() <= charLength); + BigInteger value = ZERO; + for (int i = 0 ; i < chars.length() ; ++i) + value = value.add(BigInteger.valueOf(chars.charAt(i) & 0xffffL).shiftLeft((charLength - 1 - i) * 16)); + return value; + } + + @Override + Token tokenForValue(BigInteger value) + { + // TODO (required): test + requireArgument(value.compareTo(ZERO) >= 0); + char[] chars = new char[charLength]; + for (int i = 0 ; i < chars.length ; ++i) + chars[i] = (char) value.shiftRight((charLength - 1 - i) * 16).shortValue(); + return new StringToken(new String(chars)); + } + + @Override + BigInteger minimumValue() + { + return ZERO; + } + + @Override + BigInteger maximumValue() + { + return ONE.shiftLeft(charLength * 16).subtract(ONE); + } + }; + } + + private static int charLength(RoutingKey routingKey) + { + TokenKey accordKey = (TokenKey) routingKey; + return charLength(accordKey.token()); + } + + private static int charLength(Token token) + { + return ((StringToken) token).token.length(); + } } diff --git a/src/java/org/apache/cassandra/dht/RandomPartitioner.java b/src/java/org/apache/cassandra/dht/RandomPartitioner.java index 9b833e3868d8..45aff607c476 100644 --- a/src/java/org/apache/cassandra/dht/RandomPartitioner.java +++ b/src/java/org/apache/cassandra/dht/RandomPartitioner.java @@ -22,26 +22,36 @@ import java.math.BigInteger; import java.nio.ByteBuffer; import java.security.MessageDigest; -import java.util.*; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Random; +import java.util.function.Function; import com.google.common.annotations.VisibleForTesting; +import accord.primitives.Ranges; +import accord.utils.Invariants; import org.apache.cassandra.db.CachedHashDecoratedKey; -import org.apache.cassandra.db.marshal.ByteArrayAccessor; -import org.apache.cassandra.db.marshal.ByteBufferAccessor; -import org.apache.cassandra.exceptions.ConfigurationException; import org.apache.cassandra.db.DecoratedKey; import org.apache.cassandra.db.marshal.AbstractType; +import org.apache.cassandra.db.marshal.ByteArrayAccessor; +import org.apache.cassandra.db.marshal.ByteBufferAccessor; import org.apache.cassandra.db.marshal.IntegerType; import org.apache.cassandra.db.marshal.PartitionerDefinedOrder; +import org.apache.cassandra.db.marshal.ValueAccessor; +import org.apache.cassandra.exceptions.ConfigurationException; +import org.apache.cassandra.io.util.DataInputPlus; import org.apache.cassandra.io.util.DataOutputPlus; import org.apache.cassandra.utils.ByteBufferUtil; -import org.apache.cassandra.utils.bytecomparable.ByteComparable; -import org.apache.cassandra.utils.bytecomparable.ByteSource; import org.apache.cassandra.utils.FBUtilities; import org.apache.cassandra.utils.GuidGenerator; import org.apache.cassandra.utils.ObjectSizes; import org.apache.cassandra.utils.Pair; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.bytecomparable.ByteSource; /** * This class generates a BigIntegerToken using MD5 hash. @@ -92,8 +102,22 @@ public BigInteger valueForToken(Token token) { return ((BigIntegerToken)token).getTokenValue(); } + + @Override + BigInteger minimumValue() + { + return MINIMUM.getTokenValue(); + } + + @Override + BigInteger maximumValue() + { + return MAXIMUM; + } }; + private RandomPartitioner() {} + public DecoratedKey decorateKey(ByteBuffer key) { return new CachedHashDecoratedKey(getToken(key), key); @@ -278,9 +302,25 @@ public long getHeapSize() public Token nextValidToken() { + if (token.equals(MAXIMUM)) + throw new IllegalArgumentException("Cannot increase above MAXIMUM"); return new BigIntegerToken(token.add(BigInteger.ONE)); } + @Override + public Token decreaseSlightly() + { + if (token.equals(MINIMUM.token)) + throw new IllegalArgumentException("Cannot decrease below MINIMUM"); + return new BigIntegerToken(token.subtract(BigInteger.ONE)); + } + + @Override + public int tokenHash() + { + return token.hashCode(); + } + public double size(Token next) { BigIntegerToken n = (BigIntegerToken) next; @@ -337,7 +377,7 @@ public Map describeOwnership(List sortedTokens) return ownerships; } - public Token getMaximumToken() + public Token getMaximumTokenForSplitting() { return new BigIntegerToken(MAXIMUM); } @@ -362,6 +402,92 @@ public Optional splitter() return Optional.of(splitter); } + @Override + public Function accordSplitter() + { + return ignore -> splitter; + } + + public final boolean accordSupported() + { + return true; + } + + private static final byte[] ZERO_BYTES = new byte[16]; + + @Override + public final void accordSerialize(Token token, DataOutputPlus out) throws IOException + { + byte[] bytes = increment(((BigIntegerToken)token).token.toByteArray()); + Invariants.require(bytes.length <= 16); + if (bytes.length < 16) + out.write(ZERO_BYTES, 0, 16 - bytes.length); + out.write(bytes); + } + + @Override + public final void accordSerialize(Token token, ByteBuffer out) + { + byte[] bytes = increment(((BigIntegerToken)token).token.toByteArray()); + Invariants.require(bytes.length <= 16); + if (bytes.length < 16) + out.put(ZERO_BYTES, 0, 16 - bytes.length); + out.put(bytes); + } + + @Override + public final Token accordDeserialize(DataInputPlus in, int length) throws IOException + { + Invariants.require(length == 16); + byte[] bytes = new byte[16]; + in.readFully(bytes); + decrement(bytes); + return new BigIntegerToken(new BigInteger(bytes)); + } + + @Override + public final Token accordDeserialize(ByteBuffer in, int length) + { + byte[] bytes = new byte[16]; + in.get(bytes); + decrement(bytes); + return new BigIntegerToken(new BigInteger(bytes)); + } + + @Override + public final Token accordDeserialize(V src, ValueAccessor accessor, int offset, int length) + { + byte[] bytes = accessor.toArray(src, offset, 16); + decrement(bytes); + return new BigIntegerToken(new BigInteger(bytes)); + } + + public static byte[] increment(byte[] bytes) + { + int i = bytes.length; + while (--i >= 0 && ++bytes[i] == 0); + if (i == -1) + return ZERO_BYTES; + return bytes; + } + + public static void decrement(byte[] bytes) + { + for (int i = bytes.length - 1 ; i >= 0 && bytes[i]-- == 0 ; --i); + } + + @Override + public final int accordSerializedSize(Token token) + { + return 16; + } + + @Override + public final int accordFixedLength() + { + return 16; + } + private static BigInteger hashToBigInteger(ByteBuffer data) { MessageDigest messageDigest = localMD5Digest.get(); diff --git a/src/java/org/apache/cassandra/dht/Range.java b/src/java/org/apache/cassandra/dht/Range.java index b5d06967ac01..00204c66b489 100644 --- a/src/java/org/apache/cassandra/dht/Range.java +++ b/src/java/org/apache/cassandra/dht/Range.java @@ -19,13 +19,24 @@ import java.io.IOException; import java.io.Serializable; -import java.util.*; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Set; import java.util.function.Predicate; import com.google.common.collect.Iterables; import org.apache.commons.lang3.ObjectUtils; +import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.db.PartitionPosition; +import org.apache.cassandra.dht.Token.KeyBound; +import org.apache.cassandra.dht.Token.TokenFactory; import org.apache.cassandra.io.util.DataInputPlus; import org.apache.cassandra.io.util.DataOutputPlus; import org.apache.cassandra.net.MessagingService; @@ -34,6 +45,11 @@ import org.apache.cassandra.tcm.serialization.Version; import org.apache.cassandra.utils.Pair; +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkState; +import static java.util.Collections.emptyList; +import static org.apache.cassandra.config.CassandraRelevantProperties.TEST_RANGE_EXPENSIVE_CHECKS; + /** * A representation of the range that a node is responsible for on the DHT ring. * @@ -48,6 +64,34 @@ public class Range> extends AbstractBounds implemen public static final Serializer serializer = new Serializer(); public static final long serialVersionUID = 1L; + public static final boolean EXPENSIVE_CHECKS = TEST_RANGE_EXPENSIVE_CHECKS.getBoolean(); + + public static final IPartitionerDependentSerializer rangeSerializer = new RangeSerializer(); + + public static class RangeSerializer> implements IPartitionerDependentSerializer> + { + @Override + public void serialize(Range range, DataOutputPlus out, int version) throws IOException + { + Token.compactSerializer.serialize(range.left.getToken(), out, version); + Token.compactSerializer.serialize(range.right.getToken(), out, version); + } + + @Override + public Range deserialize(DataInputPlus in, IPartitioner p, int version) throws IOException + { + return new Range(Token.compactSerializer.deserialize(in, p, version), + Token.compactSerializer.deserialize(in, p, version)); + } + + @Override + public long serializedSize(Range range, int version) + { + return Token.compactSerializer.serializedSize(range.left.getToken(), version) + + Token.compactSerializer.serializedSize(range.right.getToken(), version); + } + } + public Range(T left, T right) { super(left, right); @@ -361,7 +405,7 @@ private List> subtractContained(Range contained) // both ranges cover the entire ring, their difference is an empty set if(isFull(left, right) && isFull(contained.left, contained.right)) { - return Collections.emptyList(); + return emptyList(); } // a range is subtracted from another range that covers the entire ring @@ -472,6 +516,11 @@ public static > boolean isInRanges(T token, Iterable> boolean equals(Collection> a, Collection> b) + { + return normalize(a).equals(normalize(b)); + } + + // Helper to convert a range string to POJO so you can copy toString from a debugger + public static Range fromString(String value) + { + return fromString(value, DatabaseDescriptor.getPartitioner()); + } + + public static Range fromString(String value, IPartitioner partitioner) + { + TokenFactory tokenFactory = partitioner.getTokenFactory(); + String[] parts = value.split(","); + Token left = tokenFactory.fromString(parts[0].substring(1)); + Token right = tokenFactory.fromString(parts[1].substring(0, parts[1].length() -1)); + return new Range<>(left, right); + } + public static > void assertNormalized(List> ranges) { Range lastRange = null; @@ -710,4 +779,88 @@ public long serializedSize(Range t, Version version) return tokenSerializer.serializedSize(t, SERDE_VERSION); } } + + /** + * Returns a Pair containing the intersection (or null) and the remainder of the bounds that is to the right of the + * range, the remainder to the left is discarded since it is assumed if you are checking for intersection of multiple ranges + * the ranges are being checked in order. + */ + public static Pair, AbstractBounds> intersectionAndRemainder(AbstractBounds bounds, org.apache.cassandra.dht.Range range) + { + checkArgument((bounds.inclusiveRight() && bounds.inclusiveLeft()) || (bounds.left.compareTo(bounds.right) < 0 || bounds.right.isMinimum()), "Wrap around not handled"); + boolean boundsInclusiveLeft = bounds.inclusiveLeft() || (bounds.left.getClass() == KeyBound.class && ((KeyBound)bounds.left).isMinimumBound); + boolean boundsInclusiveRight = bounds.inclusiveRight() || (bounds.right.getClass() == KeyBound.class && !((KeyBound)bounds.right).isMinimumBound); + Token boundsLeft = bounds.left.getToken(); + Token boundsRight = bounds.right.getToken(); + Token rangeLeft = range.left; + Token rangeRight = range.right; + checkState(rangeLeft.compareTo(rangeRight) < 0 || rangeRight.isMinimum(), "Wrap around is not handled"); + + // Completely before + int rightLeftCmp = boundsRight.compareTo(rangeLeft); + // Nothing is > min on the right + if (boundsRight.isMinimum()) + rightLeftCmp = 1; + // Range left is not inclusive, doesn't matter whether the bound is inclusive/exclusive left + rightLeftCmp = rightLeftCmp == 0 ? -1 : rightLeftCmp; + if (rightLeftCmp < 0) + return Pair.create(null, null); + + // Completely after + int leftRightCmp = boundsLeft.compareTo(rangeRight); + // Nothing is > min on the right + if (rangeRight.isMinimum()) + leftRightCmp = -1; + // Fixed mismatched inclusivity + leftRightCmp = leftRightCmp == 0 && !boundsInclusiveLeft ? 1 : leftRightCmp; + if (leftRightCmp > 0) + return Pair.create(null, bounds); + + int rightRightCmp = boundsRight.compareTo(rangeRight); + // min on the right is > than everything + if (rangeRight.isMinimum() && boundsRight.isMinimum()) + rightRightCmp = 0; + else if (boundsRight.isMinimum()) + rightRightCmp = 1; + else if (rangeRight.isMinimum()) + rightRightCmp = -1; + // Fixed mismatched inclusivity + rightRightCmp = rightRightCmp == 0 && !boundsInclusiveRight ? -1 : rightRightCmp; + + int leftLeftCmp = boundsLeft.compareTo(rangeLeft); + // Range left is not inclusive, doesn't matter whether the bound is inclusive/exclusive left + leftLeftCmp = leftLeftCmp == 0 ? -1 : leftLeftCmp; + + // Fully contained + if (leftLeftCmp > 0 && rightRightCmp <= 0) + return Pair.create(bounds, null); + // Split by the right bound of the range (rightRightCmp is implicitly > 0 given the preceding condition) + else if (leftLeftCmp >= 0) + return bounds.split(rangeRight.maxKeyBound()); + // Intersects but has some portion that needs to be discarded first + else + { + // Remove everything before the intersection + Pair, AbstractBounds> split = bounds.split(rangeLeft.maxKeyBound()); + AbstractBounds intersectionAndRemainder = bounds; + if (split != null) + intersectionAndRemainder = split.right; + // There is a remainder + if (rightRightCmp > 0) + return intersectionAndRemainder.split(rangeRight.maxKeyBound()); + // There is no remainder everything that + return Pair.create(intersectionAndRemainder, null); + } + } + + public static int compareRightToken(Token a, Token b) + { + if (a.isMinimum() && b.isMinimum()) + return 0; + if (a.isMinimum()) + return 1; + if (b.isMinimum()) + return 0; + return a.compareTo(b); + } } diff --git a/src/java/org/apache/cassandra/dht/RangeStreamer.java b/src/java/org/apache/cassandra/dht/RangeStreamer.java index 7d57b709565c..f21dd3186d40 100644 --- a/src/java/org/apache/cassandra/dht/RangeStreamer.java +++ b/src/java/org/apache/cassandra/dht/RangeStreamer.java @@ -39,6 +39,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.cassandra.config.CassandraRelevantProperties; import org.apache.cassandra.db.Keyspace; import org.apache.cassandra.db.SystemKeyspace; import org.apache.cassandra.gms.FailureDetector; @@ -58,7 +59,9 @@ import org.apache.cassandra.locator.ReplicaCollection.Builder.Conflict; import org.apache.cassandra.locator.Replicas; import org.apache.cassandra.locator.NodeProximity; +import org.apache.cassandra.schema.KeyspaceMetadata; import org.apache.cassandra.schema.ReplicationParams; +import org.apache.cassandra.schema.Schema; import org.apache.cassandra.streaming.PreviewKind; import org.apache.cassandra.streaming.StreamOperation; import org.apache.cassandra.streaming.StreamPlan; @@ -98,6 +101,7 @@ public class RangeStreamer private final StreamStateStore stateStore; private final MovementMap movements; private final MovementMap strictMovements; + private final boolean excludeAccordTables; public static class FetchReplica { @@ -298,10 +302,11 @@ public RangeStreamer(ClusterMetadata metadata, boolean connectSequentially, int connectionsPerHost, MovementMap movements, - MovementMap strictMovements) + MovementMap strictMovements, + boolean excludeAccordTables) { this(metadata, streamOperation, useStrictConsistency, proximity, stateStore, - FailureDetector.instance, connectSequentially, connectionsPerHost, movements, strictMovements); + FailureDetector.instance, connectSequentially, connectionsPerHost, movements, strictMovements, excludeAccordTables); } RangeStreamer(ClusterMetadata metadata, @@ -313,8 +318,10 @@ public RangeStreamer(ClusterMetadata metadata, boolean connectSequentially, int connectionsPerHost, MovementMap movements, - MovementMap strictMovements) + MovementMap strictMovements, + boolean excludeAccordTables) { + this.excludeAccordTables = excludeAccordTables; Preconditions.checkArgument(streamOperation == StreamOperation.BOOTSTRAP || streamOperation == StreamOperation.REBUILD, streamOperation); this.metadata = metadata; this.description = streamOperation.getDescription(); @@ -383,8 +390,12 @@ public void addKeyspaceToFetch(String keyspaceName) Multimap workMap; //Only use the optimized strategy if we don't care about strict sources, have a replication factor > 1, and no - //transient replicas. - if (useStrictSource || strat == null || strat.getReplicationFactor().allReplicas == 1 || strat.getReplicationFactor().hasTransientReplicas()) + //transient replicas or it is intentionally skipped. + if (CassandraRelevantProperties.SKIP_OPTIMAL_STREAMING_CANDIDATES_CALCULATION.getBoolean() || + useStrictSource || + strat == null || + strat.getReplicationFactor().allReplicas == 1 || + strat.getReplicationFactor().hasTransientReplicas()) { workMap = convertPreferredEndpointsToWorkMap(fetchMap); } @@ -755,8 +766,17 @@ public StreamResultFuture fetchAsync() logger.debug("Source and our replicas {}", fetchReplicas); logger.debug("Source {} Keyspace {} streaming full {} transient {}", source, keyspace, full, transientReplicas); - /* Send messages to respective folks to stream data over to me */ - streamPlan.requestRanges(source, keyspace, full, transientReplicas); + KeyspaceMetadata ksm = Schema.instance.getKeyspaceMetadata(keyspace); + if (excludeAccordTables && StreamPlan.hasAccordTables(ksm)) + { + String[] cfNames = StreamPlan.nonAccordTablesForKeyspace(ksm); + if (cfNames != null) + streamPlan.requestRanges(source, keyspace, full, transientReplicas, cfNames); + } + else + { + streamPlan.requestRanges(source, keyspace, full, transientReplicas); + } }); }); diff --git a/src/java/org/apache/cassandra/dht/ReversedLongLocalPartitioner.java b/src/java/org/apache/cassandra/dht/ReversedLongLocalPartitioner.java index 43f9ab832d18..f95f1e776400 100644 --- a/src/java/org/apache/cassandra/dht/ReversedLongLocalPartitioner.java +++ b/src/java/org/apache/cassandra/dht/ReversedLongLocalPartitioner.java @@ -23,10 +23,12 @@ import java.util.List; import java.util.Map; import java.util.Random; +import java.util.function.Function; import com.google.common.annotations.VisibleForTesting; import com.google.common.primitives.Longs; +import accord.primitives.Ranges; import org.apache.cassandra.db.CachedHashDecoratedKey; import org.apache.cassandra.db.DecoratedKey; import org.apache.cassandra.db.marshal.AbstractType; @@ -156,6 +158,12 @@ public AbstractType partitionOrdering() return LongType.instance; } + @Override + public Function accordSplitter() + { + throw new UnsupportedOperationException("Accord is not supported by " + getClass().getName()); + } + @VisibleForTesting public static class ReversedLongLocalToken extends Token { @@ -184,6 +192,12 @@ public Object getTokenValue() return token; } + @Override + public int tokenHash() + { + return Long.hashCode(token); + } + @Override public long getLongValue() { @@ -210,6 +224,13 @@ public Token nextValidToken() getClass().getSimpleName())); } + @Override + public Token decreaseSlightly() + { + throw new UnsupportedOperationException(String.format("Token type %s does not support token allocation.", + getClass().getSimpleName())); + } + @Override public int compareTo(Token o) { diff --git a/src/java/org/apache/cassandra/dht/Splitter.java b/src/java/org/apache/cassandra/dht/Splitter.java index 53b4462221cd..165a2ecf9bd6 100644 --- a/src/java/org/apache/cassandra/dht/Splitter.java +++ b/src/java/org/apache/cassandra/dht/Splitter.java @@ -36,7 +36,7 @@ /** * Partition splitter. */ -public abstract class Splitter +public abstract class Splitter extends AccordSplitter { private final IPartitioner partitioner; @@ -45,18 +45,12 @@ protected Splitter(IPartitioner partitioner) this.partitioner = partitioner; } - @VisibleForTesting - protected abstract Token tokenForValue(BigInteger value); - - @VisibleForTesting - protected abstract BigInteger valueForToken(Token token); - @VisibleForTesting protected BigInteger tokensInRange(Range range) { //full range case if (range.left.equals(range.right)) - return tokensInRange(new Range(partitioner.getMinimumToken(), partitioner.getMaximumToken())); + return tokensInRange(new Range(partitioner.getMinimumToken(), partitioner.getMaximumTokenForSplitting())); BigInteger totalTokens = BigInteger.ZERO; for (Range unwrapped : range.unwrap()) @@ -101,7 +95,7 @@ public double positionInRange(Token token, Range range) { //full range case if (range.left.equals(range.right)) - return positionInRange(token, new Range(partitioner.getMinimumToken(), partitioner.getMaximumToken())); + return positionInRange(token, new Range(partitioner.getMinimumToken(), partitioner.getMaximumTokenForSplitting())); // leftmost token means we are on position 0.0 if (token.equals(range.left)) @@ -121,7 +115,7 @@ public double positionInRange(Token token, Range range) public List splitOwnedRanges(int parts, List weightedRanges, boolean dontSplitRanges) { if (weightedRanges.isEmpty() || parts == 1) - return Collections.singletonList(partitioner.getMaximumToken()); + return Collections.singletonList(partitioner.getMaximumTokenForSplitting()); BigInteger totalTokens = BigInteger.ZERO; for (WeightedRange weightedRange : weightedRanges) @@ -132,7 +126,7 @@ public List splitOwnedRanges(int parts, List weightedRange BigInteger perPart = totalTokens.divide(BigInteger.valueOf(parts)); // the range owned is so tiny we can't split it: if (perPart.equals(BigInteger.ZERO)) - return Collections.singletonList(partitioner.getMaximumToken()); + return Collections.singletonList(partitioner.getMaximumTokenForSplitting()); if (dontSplitRanges) return splitOwnedRangesNoPartialRanges(weightedRanges, perPart, parts); @@ -161,7 +155,7 @@ else if (partsLeft == 1) } sum = sum.add(currentRangeWidth); } - boundaries.set(boundaries.size() - 1, partitioner.getMaximumToken()); + boundaries.set(boundaries.size() - 1, partitioner.getMaximumTokenForSplitting()); assert boundaries.size() == parts : boundaries.size() + "!=" + parts + " " + boundaries + ":" + weightedRanges; return boundaries; @@ -198,7 +192,7 @@ private List splitOwnedRangesNoPartialRanges(List weighted } i++; } - boundaries.add(partitioner.getMaximumToken()); + boundaries.add(partitioner.getMaximumTokenForSplitting()); return boundaries; } @@ -208,7 +202,7 @@ private List splitOwnedRangesNoPartialRanges(List weighted */ private Token token(Token t) { - return t.equals(partitioner.getMinimumToken()) ? partitioner.getMaximumToken() : t; + return t.equals(partitioner.getMinimumToken()) ? partitioner.getMaximumTokenForSplitting() : t; } /** diff --git a/src/java/org/apache/cassandra/dht/Token.java b/src/java/org/apache/cassandra/dht/Token.java index 3b78d7b84dc0..edee034aea87 100644 --- a/src/java/org/apache/cassandra/dht/Token.java +++ b/src/java/org/apache/cassandra/dht/Token.java @@ -20,24 +20,34 @@ import java.io.IOException; import java.io.Serializable; import java.nio.ByteBuffer; +import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; + +import com.google.common.collect.Sets; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import org.apache.cassandra.db.PartitionPosition; import org.apache.cassandra.db.TypeSizes; import org.apache.cassandra.exceptions.ConfigurationException; import org.apache.cassandra.io.util.DataInputPlus; import org.apache.cassandra.io.util.DataOutputPlus; -import org.apache.cassandra.utils.bytecomparable.ByteComparable; -import org.apache.cassandra.utils.bytecomparable.ByteSource; +import org.apache.cassandra.net.MessagingService; import org.apache.cassandra.tcm.serialization.PartitionerAwareMetadataSerializer; import org.apache.cassandra.tcm.serialization.Version; -import org.apache.cassandra.net.MessagingService; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.bytecomparable.ByteSource; +import org.apache.cassandra.utils.vint.VIntCoding; public abstract class Token implements RingPosition, Serializable { + private static final Logger logger = LoggerFactory.getLogger(Token.class); + private static final long serialVersionUID = 1L; public static final TokenSerializer serializer = new TokenSerializer(); public static final MetadataSerializer metadataSerializer = new MetadataSerializer(); + public static final CompactTokenSerializer compactSerializer = new CompactTokenSerializer(); public static abstract class TokenFactory { @@ -75,11 +85,25 @@ public void serialize(Token token, DataOutputPlus out) throws IOException out.write(toByteArray(token)); } - public void serialize(Token token, ByteBuffer out) throws IOException + public void serialize(Token token, ByteBuffer out) { out.put(toByteArray(token)); } + public Token deserialize(DataInputPlus in, IPartitioner p) throws IOException + { + int size = p.isFixedLength() ? p.getMaxTokenSize() : in.readUnsignedVInt32(); + byte[] bytes = new byte[size]; + in.readFully(bytes); + return p.getTokenFactory().fromByteArray(ByteBuffer.wrap(bytes)); + } + + public void skip(DataInputPlus in, IPartitioner p) throws IOException + { + int size = p.isFixedLength() ? p.getMaxTokenSize() : in.readUnsignedVInt32(); + in.skipBytesFully(size); + } + public Token fromByteBuffer(ByteBuffer bytes, int position, int length) { bytes = bytes.duplicate(); @@ -158,9 +182,80 @@ public long serializedSize(Token object, int version) } } + public static boolean logPartitioner = false; + public static final Set> serializePartitioners = Sets.newSetFromMap(new ConcurrentHashMap<>()); + public static final Set> deserializePartitioners = Sets.newSetFromMap(new ConcurrentHashMap<>()); + + public static class CompactTokenSerializer implements IPartitionerDependentSerializer + { + public void serialize(Token token, DataOutputPlus out, int version) throws IOException + { + IPartitioner p = token.getPartitioner(); + if (logPartitioner && serializePartitioners.add(p.getClass())) + logger.debug("Serializing token with partitioner " + p); + if (!p.isFixedLength()) + out.writeUnsignedVInt32(p.getTokenFactory().byteSize(token)); + p.getTokenFactory().serialize(token, out); + } + + public void serialize(Token token, ByteBuffer out) + { + IPartitioner p = token.getPartitioner(); + if (logPartitioner && serializePartitioners.add(p.getClass())) + logger.debug("Serializing token with partitioner " + p); + if (!p.isFixedLength()) + VIntCoding.writeUnsignedVInt32(p.getTokenFactory().byteSize(token), out); + p.getTokenFactory().serialize(token, out); + } + + public Token deserialize(ByteBuffer in, IPartitioner p) + { + int size = p.isFixedLength() ? p.getMaxTokenSize() : VIntCoding.readUnsignedVInt32(in); + if (logPartitioner && deserializePartitioners.add(p.getClass())) + logger.debug("Deserializing token with partitioner " + p); + byte[] bytes = new byte[size]; + in.get(bytes); + return p.getTokenFactory().fromByteArray(ByteBuffer.wrap(bytes)); + } + + public void skip(DataInputPlus in, IPartitioner p, int version) throws IOException + { + int size = p.isFixedLength() ? p.getMaxTokenSize() : in.readUnsignedVInt32(); + if (logPartitioner && deserializePartitioners.add(p.getClass())) + logger.debug("Deserializing token with partitioner " + p); + in.skipBytesFully(size); + } + + public Token deserialize(DataInputPlus in, IPartitioner p, int version) throws IOException + { + int size = p.isFixedLength() ? p.getMaxTokenSize() : in.readUnsignedVInt32(); + if (logPartitioner && deserializePartitioners.add(p.getClass())) + logger.debug("Deserializing token with partitioner " + p); + byte[] bytes = new byte[size]; + in.readFully(bytes); + return p.getTokenFactory().fromByteArray(ByteBuffer.wrap(bytes)); + } + + public long serializedSize(Token object, int version) + { + return serializedSize(object); + } + + public long serializedSize(Token object) + { + IPartitioner p = object.getPartitioner(); + int byteSize = p.getTokenFactory().byteSize(object); + if (p.isFixedLength()) + return byteSize; + return TypeSizes.sizeofUnsignedVInt(byteSize) + byteSize; + } + } + abstract public IPartitioner getPartitioner(); abstract public long getHeapSize(); abstract public Object getTokenValue(); + abstract public int tokenHash(); + public TokenFactory tokenFactory() { return getPartitioner().getTokenFactory(); } /** * This method exists so that callers can access the primitive {@code long} value for this {@link Token}, if @@ -196,6 +291,7 @@ public long getLongValue() * Used by the token allocation algorithm (see CASSANDRA-7032). */ abstract public double size(Token next); + /** * Returns the next possible token in the token space, one that compares * greater than this and such that there is no other token that sits @@ -209,6 +305,15 @@ public long getLongValue() * constructing token ranges for sstables. */ abstract public Token nextValidToken(); + /** + * Returns a token that is slightly more than this. This is NOT guaranteed to be the directly following token. + */ + public Token increaseSlightly() { return nextValidToken(); } + + /** + * Returns a token that is slightly less than this. This is NOT guaranteed to be the directly preceding token. + */ + abstract public Token decreaseSlightly(); public Token getToken() { diff --git a/src/java/org/apache/cassandra/exceptions/ExceptionSerializer.java b/src/java/org/apache/cassandra/exceptions/ExceptionSerializer.java new file mode 100644 index 000000000000..de379739a383 --- /dev/null +++ b/src/java/org/apache/cassandra/exceptions/ExceptionSerializer.java @@ -0,0 +1,222 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.exceptions; + +import java.io.IOException; +import java.util.HashMap; +import java.util.IdentityHashMap; +import java.util.Map; +import java.util.Set; + +import org.apache.cassandra.io.IVersionedSerializer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.utils.ArraySerializers; +import org.apache.cassandra.utils.FBUtilities; +import org.apache.cassandra.utils.NullableSerializer; + +import static java.util.Collections.newSetFromMap; +import static org.apache.cassandra.db.TypeSizes.sizeof; +import static org.apache.cassandra.db.TypeSizes.sizeofUnsignedVInt; + +/** + * Support for serializing exceptions without a dependency on being able to instantiate the exception class + * on the other side to eliminate dependencies across versions. + * + * This is still slightly more flexible than sending a string representation of the exception because it's still an exception so using it + * as a cause or suppressed exception works and it is formatted nicely as if it were another local exception. + */ +public class ExceptionSerializer +{ + public static class RemoteException extends RuntimeException + { + private final String originalClass; + + public RemoteException(String originalClass, String originalMessage, StackTraceElement[] stackTrace) + { + super(originalMessage); + this.originalClass = originalClass; + setStackTrace(stackTrace); + } + + private void initSuppressedAndCause(RemoteException cause, RemoteException[] suppressed) + { + initCause(cause); + for (RemoteException e : suppressed) + addSuppressed(e); + } + + @Override + public String toString() + { + String message = getMessage(); + return message != null ? originalClass + ": " + message : originalClass; + } + } + + static String getMessageWithOriginatingHost(Throwable t, boolean isFirstException) + { + if (isFirstException) + return "Remote exception from host " + FBUtilities.getBroadcastAddressAndPort().toString() + (t.getLocalizedMessage() != null ? " - " + t.getLocalizedMessage() : ""); + else + return t.getLocalizedMessage(); + } + + private static final IVersionedSerializer stackTraceElementSerializer = new IVersionedSerializer() + { + @Override + public void serialize(StackTraceElement t, DataOutputPlus out, int version) throws IOException + { + out.writeUTF(t.getClassName()); + out.writeUTF(t.getMethodName()); + out.writeBoolean(t.getFileName() != null); + if (t.getFileName() != null) + out.writeUTF(t.getFileName()); + out.writeUnsignedVInt32(t.getLineNumber()); + } + + @Override + public StackTraceElement deserialize(DataInputPlus in, int version) throws IOException + { + String className = in.readUTF(); + String methodName = in.readUTF(); + String fileName = null; + if (in.readBoolean()) + fileName = in.readUTF(); + int lineNumber = in.readUnsignedVInt32(); + return new StackTraceElement(className, methodName, fileName, lineNumber); + } + + @Override + public long serializedSize(StackTraceElement t, int version) + { + long size = sizeof(t.getClassName()) + + sizeof(t.getMethodName()) + + sizeof(t.getFileName() != null) + + sizeofUnsignedVInt(t.getLineNumber()); + if (t.getFileName() != null) + size += sizeof(t.getFileName()); + return size; + } + }; + + public static final IVersionedSerializer remoteExceptionSerializer = new IVersionedSerializer() + { + @Override + public void serialize(Throwable t, DataOutputPlus out, int version) throws IOException + { + Map alreadySerialized = new IdentityHashMap<>(); + serializeNextException(t, out, true, version, 0, alreadySerialized); + } + + private int serializeNextException(Throwable t, DataOutputPlus out, boolean isFirstException, int version, int nextExceptionId, Map alreadySerialized) throws IOException + { + if (alreadySerialized.containsKey(t)) + { + out.writeInt(alreadySerialized.get(t)); + return nextExceptionId; + } + else + { + alreadySerialized.put(t, nextExceptionId); + out.writeInt(nextExceptionId); + nextExceptionId++; + } + + out.writeUTF(t.getClass().getName()); + String message = getMessageWithOriginatingHost(t, isFirstException); + out.writeBoolean(message != null); + if (message != null) + out.writeUTF(message); + ArraySerializers.serializeArray(t.getStackTrace(), out, version, stackTraceElementSerializer); + + // Do cause and suppressed last so they can reference back to previously partially deserialized exceptions + out.writeBoolean(t.getCause() != null); + if (t.getCause() != null) + nextExceptionId = serializeNextException(t.getCause(), out, false, version, nextExceptionId, alreadySerialized); + out.writeUnsignedVInt32(t.getSuppressed().length); + for (Throwable suppressed : t.getSuppressed()) + nextExceptionId = serializeNextException(suppressed, out, false, version, nextExceptionId, alreadySerialized); + + return nextExceptionId; + } + + @Override + public Throwable deserialize(DataInputPlus in, int version) throws IOException + { + Map alreadyDeserialized = new HashMap<>(); + return deserializeNextException(in, version, alreadyDeserialized); + } + + private Throwable deserializeNextException(DataInputPlus in, int version, Map alreadyDeserialized) throws IOException + { + int nextExceptionId = in.readInt(); + Throwable alreadyDeserializedThrowable = alreadyDeserialized.get(nextExceptionId); + if (alreadyDeserializedThrowable != null) + return alreadyDeserializedThrowable; + + String originalClass = in.readUTF(); + String originalMessage = null; + if (in.readBoolean()) + originalMessage = in.readUTF(); + + StackTraceElement[] stackTrace = ArraySerializers.deserializeArray(in, version, stackTraceElementSerializer, size -> new StackTraceElement[size]); + RemoteException deserializedException = new RemoteException(originalClass, originalMessage, stackTrace); + deserializedException.setStackTrace(stackTrace); + alreadyDeserialized.put(nextExceptionId, deserializedException); + + // Do cause and suppressed last after alreadyDeserialized contains the exception we just processsed + RemoteException cause = in.readBoolean() ? (RemoteException)deserializeNextException(in, version, alreadyDeserialized) : null; + RemoteException[] suppressed = new RemoteException[in.readUnsignedVInt32()]; + for (int i = 0; i < suppressed.length; i++) + suppressed[i] = (RemoteException)deserializeNextException(in, version, alreadyDeserialized); + deserializedException.initSuppressedAndCause(cause, suppressed); + + return deserializedException; + } + + @Override + public long serializedSize(Throwable t, int version) + { + Set alreadySeen = newSetFromMap(new IdentityHashMap<>()); + return nextExceptionSerializedSize(t, version, true, alreadySeen); + } + + private long nextExceptionSerializedSize(Throwable t, int version, boolean isFirstException, Set alreadySeen) + { + if (!alreadySeen.add(t)) + return sizeof(42); // Exception ID from the last time it was serialized + + String message = getMessageWithOriginatingHost(t, isFirstException); + long size = sizeof(42) + // Exception ID generated during serialization + sizeof(t.getClass().getName()) + + sizeof(message != null) + + (message != null ? sizeof(message) : 0) + + sizeof(t.getCause() != null) + + (t.getCause() != null ? nextExceptionSerializedSize(t.getCause(), version, false, alreadySeen) : 0) + + sizeofUnsignedVInt(t.getSuppressed().length); + size += ArraySerializers.serializedArraySize(t.getStackTrace(), version, stackTraceElementSerializer); + for (Throwable suppressed : t.getSuppressed()) + size += nextExceptionSerializedSize(suppressed, version, false, alreadySeen); + return size; + } + }; + + public static final IVersionedSerializer nullableRemoteExceptionSerializer = NullableSerializer.wrap(remoteExceptionSerializer); +} diff --git a/src/java/org/apache/cassandra/exceptions/ReadTimeoutException.java b/src/java/org/apache/cassandra/exceptions/ReadTimeoutException.java index 05f3510e7b39..809f0a1780fd 100644 --- a/src/java/org/apache/cassandra/exceptions/ReadTimeoutException.java +++ b/src/java/org/apache/cassandra/exceptions/ReadTimeoutException.java @@ -28,4 +28,10 @@ public ReadTimeoutException(ConsistencyLevel consistency, int received, int bloc super(ExceptionCode.READ_TIMEOUT, consistency, received, blockFor); this.dataPresent = dataPresent; } + + public ReadTimeoutException(ConsistencyLevel consistency, int received, int blockFor, boolean dataPresent, String msg) + { + super(ExceptionCode.READ_TIMEOUT, consistency, received, blockFor, msg); + this.dataPresent = dataPresent; + } } diff --git a/src/java/org/apache/cassandra/exceptions/RequestFailure.java b/src/java/org/apache/cassandra/exceptions/RequestFailure.java new file mode 100644 index 000000000000..b9bba7fc7061 --- /dev/null +++ b/src/java/org/apache/cassandra/exceptions/RequestFailure.java @@ -0,0 +1,177 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.exceptions; + +import java.io.IOException; +import javax.annotation.Nonnull; +import javax.annotation.Nullable; + +import org.apache.cassandra.db.filter.TombstoneOverwhelmingException; +import org.apache.cassandra.io.IVersionedSerializer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.net.MessagingService; +import org.apache.cassandra.tcm.NotCMSException; + +import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.cassandra.exceptions.ExceptionSerializer.nullableRemoteExceptionSerializer; + +/** + * Allow inclusion of a serialized exception in failure response messages + * This continues to use the same verb as the old failure response (whether a message payload or parameter) + * and has a nullable failure field that may contain a serialized exception in later versions. + * + * It's important to note RequestFailure is not a singleton for each type, unlike RequestFailureReason, + * since it might include a stack trace so don't compare using identity. + */ +public class RequestFailure +{ + public static final RequestFailure UNKNOWN = new RequestFailure(RequestFailureReason.UNKNOWN); + public static final RequestFailure READ_TOO_MANY_TOMBSTONES = new RequestFailure(RequestFailureReason.READ_TOO_MANY_TOMBSTONES); + public static final RequestFailure TIMEOUT = new RequestFailure(RequestFailureReason.TIMEOUT); + public static final RequestFailure INCOMPATIBLE_SCHEMA = new RequestFailure(RequestFailureReason.INCOMPATIBLE_SCHEMA); + public static final RequestFailure READ_SIZE = new RequestFailure(RequestFailureReason.READ_SIZE); + public static final RequestFailure NODE_DOWN = new RequestFailure(RequestFailureReason.NODE_DOWN); + public static final RequestFailure NOT_CMS = new RequestFailure(RequestFailureReason.NOT_CMS); + public static final RequestFailure INVALID_ROUTING = new RequestFailure(RequestFailureReason.INVALID_ROUTING); + public static final RequestFailure INDEX_NOT_AVAILABLE = new RequestFailure(RequestFailureReason.INDEX_NOT_AVAILABLE); + public static final RequestFailure COORDINATOR_BEHIND = new RequestFailure(RequestFailureReason.COORDINATOR_BEHIND); + public static final RequestFailure READ_TOO_MANY_INDEXES = new RequestFailure(RequestFailureReason.READ_TOO_MANY_INDEXES); + public static final RequestFailure RETRY_ON_DIFFERENT_TRANSACTION_SYSTEM = new RequestFailure(RequestFailureReason.RETRY_ON_DIFFERENT_TRANSACTION_SYSTEM); + public static final RequestFailure INDEX_BUILD_IN_PROGRESS = new RequestFailure(RequestFailureReason.INDEX_BUILD_IN_PROGRESS); + + static + { + // Validate all reasons are handled + for (RequestFailureReason reason : RequestFailureReason.values()) + forReason(reason); + } + + // Allow RequestFailureReason to force class load to check failure reasons are handled + public static void init() {} + + public static final IVersionedSerializer serializer = new IVersionedSerializer() + { + @Override + public void serialize(RequestFailure t, DataOutputPlus out, int version) throws IOException + { + RequestFailureReason.serializer.serialize(t.reason, out, version); + if (version >= MessagingService.VERSION_51) + nullableRemoteExceptionSerializer.serialize(t.failure, out, version); + } + + @Override + public RequestFailure deserialize(DataInputPlus in, int version) throws IOException + { + RequestFailureReason reason = RequestFailureReason.serializer.deserialize(in, version); + Throwable failure = null; + if (version >= MessagingService.VERSION_51) + failure = nullableRemoteExceptionSerializer.deserialize(in, version); + if (failure == null) + return forReason(reason); + else + return new RequestFailure(reason, failure); + } + + @Override + public long serializedSize(RequestFailure t, int version) + { + long size = RequestFailureReason.serializer.serializedSize(t.reason, version); + if (version >= MessagingService.VERSION_51) + size += nullableRemoteExceptionSerializer.serializedSize(t.failure, version); + return size; + } + }; + + @Nonnull + public final RequestFailureReason reason; + + @Nullable + public final Throwable failure; + + public static RequestFailure forException(Throwable t) + { + if (t instanceof TombstoneOverwhelmingException) + return READ_TOO_MANY_TOMBSTONES; + + if (t instanceof IncompatibleSchemaException) + return INCOMPATIBLE_SCHEMA; + + if (t instanceof NotCMSException) + return NOT_CMS; + + if (t instanceof InvalidRoutingException) + return INVALID_ROUTING; + + if (t instanceof RetryOnDifferentSystemException) + return RETRY_ON_DIFFERENT_TRANSACTION_SYSTEM; + + if (t instanceof CoordinatorBehindException) + return COORDINATOR_BEHIND; + + return UNKNOWN; + } + + public static RequestFailure forReason(RequestFailureReason reason) + { + switch (reason) + { + default: throw new IllegalStateException("Unhandled request failure reason " + reason); + case UNKNOWN: return UNKNOWN; + case READ_TOO_MANY_TOMBSTONES: return READ_TOO_MANY_TOMBSTONES; + case TIMEOUT: return TIMEOUT; + case INCOMPATIBLE_SCHEMA: return INCOMPATIBLE_SCHEMA; + case READ_SIZE: return READ_SIZE; + case NODE_DOWN: return NODE_DOWN; + case NOT_CMS: return NOT_CMS; + case INVALID_ROUTING: return INVALID_ROUTING; + case INDEX_NOT_AVAILABLE: return INDEX_NOT_AVAILABLE; + case COORDINATOR_BEHIND: return COORDINATOR_BEHIND; + case READ_TOO_MANY_INDEXES: return READ_TOO_MANY_INDEXES; + case INDEX_BUILD_IN_PROGRESS: return INDEX_BUILD_IN_PROGRESS; + case RETRY_ON_DIFFERENT_TRANSACTION_SYSTEM: return RETRY_ON_DIFFERENT_TRANSACTION_SYSTEM; + } + } + + private RequestFailure(RequestFailureReason reason) + { + this(reason, null); + } + + public RequestFailure(@Nonnull Throwable failure) + { + this(RequestFailureReason.UNKNOWN, failure); + } + + public RequestFailure(@Nonnull RequestFailureReason reason, @Nullable Throwable failure) + { + checkNotNull(reason); + this.reason = reason; + this.failure = failure; + } + + @Override + public String toString() + { + return "RequestFailure{" + + "reason=" + reason + + ", failure='" + failure + '\'' + + '}'; + } +} diff --git a/src/java/org/apache/cassandra/exceptions/RequestFailureReason.java b/src/java/org/apache/cassandra/exceptions/RequestFailureReason.java index 1bc86ff061ab..bafab71752c3 100644 --- a/src/java/org/apache/cassandra/exceptions/RequestFailureReason.java +++ b/src/java/org/apache/cassandra/exceptions/RequestFailureReason.java @@ -18,32 +18,49 @@ package org.apache.cassandra.exceptions; import java.io.IOException; +import java.util.EnumSet; +import java.util.HashMap; +import java.util.Map; + +import com.google.common.collect.Sets; import org.apache.cassandra.db.filter.TombstoneOverwhelmingException; +import org.apache.cassandra.index.IndexBuildInProgressException; +import org.apache.cassandra.index.IndexNotAvailableException; import org.apache.cassandra.io.IVersionedSerializer; import org.apache.cassandra.io.util.DataInputPlus; import org.apache.cassandra.io.util.DataOutputPlus; import org.apache.cassandra.tcm.NotCMSException; import org.apache.cassandra.utils.vint.VIntCoding; -import static java.lang.Math.max; import static org.apache.cassandra.net.MessagingService.VERSION_40; public enum RequestFailureReason { - UNKNOWN (0), - READ_TOO_MANY_TOMBSTONES (1), - TIMEOUT (2), - INCOMPATIBLE_SCHEMA (3), - READ_SIZE (4), - NODE_DOWN (5), - INDEX_NOT_AVAILABLE (6), - READ_TOO_MANY_INDEXES (7), - NOT_CMS (8), - INVALID_ROUTING (9), - COORDINATOR_BEHIND (10), + UNKNOWN (0), + READ_TOO_MANY_TOMBSTONES (1), + TIMEOUT (2), + INCOMPATIBLE_SCHEMA (3), + READ_SIZE (4), + // below reason is only logged, but it does not have associated exception + NODE_DOWN (5), + INDEX_NOT_AVAILABLE (6), + // below reason does not have an associated exception + READ_TOO_MANY_INDEXES (7), + NOT_CMS (8), + INVALID_ROUTING (9), + COORDINATOR_BEHIND (10), + RETRY_ON_DIFFERENT_TRANSACTION_SYSTEM (11), + // The following codes have been ported from an external fork, where they were offset explicitly to avoid conflicts. + INDEX_BUILD_IN_PROGRESS (503), ; + static + { + // Load RequestFailure class to check that all request failure reasons are handled + RequestFailure.init(); + } + public static final Serializer serializer = new Serializer(); public final int code; @@ -53,26 +70,44 @@ public enum RequestFailureReason this.code = code; } - private static final RequestFailureReason[] codeToReasonMap; + private static final Map codeToReasonMap = new HashMap<>(); + private static final Map, RequestFailureReason> exceptionToReasonMap = new HashMap<>(); static { + EnumSet withoutExceptions = EnumSet.of(UNKNOWN, NODE_DOWN, READ_TOO_MANY_INDEXES); + Sets.SetView withExceptions = Sets.difference(EnumSet.allOf(RequestFailureReason.class), withoutExceptions); RequestFailureReason[] reasons = values(); - int max = -1; - for (RequestFailureReason r : reasons) - max = max(r.code, max); - - RequestFailureReason[] codeMap = new RequestFailureReason[max + 1]; - for (RequestFailureReason reason : reasons) { - if (codeMap[reason.code] != null) + if (codeToReasonMap.put(reason.code, reason) != null) throw new RuntimeException("Two RequestFailureReason-s that map to the same code: " + reason.code); - codeMap[reason.code] = reason; } - codeToReasonMap = codeMap; + exceptionToReasonMap.put(TombstoneOverwhelmingException.class, READ_TOO_MANY_TOMBSTONES); + exceptionToReasonMap.put(WriteTimeoutException.class, TIMEOUT); + exceptionToReasonMap.put(IncompatibleSchemaException.class, INCOMPATIBLE_SCHEMA); + exceptionToReasonMap.put(ReadSizeAbortException.class, READ_SIZE); + exceptionToReasonMap.put(IndexNotAvailableException.class, INDEX_NOT_AVAILABLE); + exceptionToReasonMap.put(NotCMSException.class, NOT_CMS); + exceptionToReasonMap.put(InvalidRoutingException.class, INVALID_ROUTING); + exceptionToReasonMap.put(CoordinatorBehindException.class, COORDINATOR_BEHIND); + exceptionToReasonMap.put(IndexBuildInProgressException.class, INDEX_BUILD_IN_PROGRESS); + exceptionToReasonMap.put(RetryOnDifferentSystemException.class, RETRY_ON_DIFFERENT_TRANSACTION_SYSTEM); + + if (exceptionToReasonMap.size() != reasons.length - withoutExceptions.size()) + { + EnumSet actual = EnumSet.copyOf(exceptionToReasonMap.values()); + Sets.SetView missing = Sets.difference(withExceptions, actual); + Sets.SetView added = Sets.difference(actual, withExceptions); + StringBuilder sb = new StringBuilder(); + if (!missing.isEmpty()) + sb.append("Expected the following RequestFailureReason, but were missing: ").append(missing).append('\n'); + if (!added.isEmpty()) + sb.append("Unexpected RequestFailureReason found: ").append(added); + throw new AssertionError(sb.toString()); + } } public static RequestFailureReason fromCode(int code) @@ -81,25 +116,18 @@ public static RequestFailureReason fromCode(int code) throw new IllegalArgumentException("RequestFailureReason code must be non-negative (got " + code + ')'); // be forgiving and return UNKNOWN if we aren't aware of the code - for forward compatibility - return code < codeToReasonMap.length ? codeToReasonMap[code] : UNKNOWN; + return codeToReasonMap.getOrDefault(code, UNKNOWN); } public static RequestFailureReason forException(Throwable t) { - if (t instanceof TombstoneOverwhelmingException) - return READ_TOO_MANY_TOMBSTONES; - - if (t instanceof IncompatibleSchemaException) - return INCOMPATIBLE_SCHEMA; - - if (t instanceof NotCMSException) - return NOT_CMS; - - if (t instanceof InvalidRoutingException) - return INVALID_ROUTING; + RequestFailureReason r = exceptionToReasonMap.get(t.getClass()); + if (r != null) + return r; - if (t instanceof CoordinatorBehindException) - return COORDINATOR_BEHIND; + for (Map.Entry, RequestFailureReason> entry : exceptionToReasonMap.entrySet()) + if (entry.getKey().isInstance(t)) + return entry.getValue(); return UNKNOWN; } diff --git a/src/java/org/apache/cassandra/exceptions/RetryOnDifferentSystemException.java b/src/java/org/apache/cassandra/exceptions/RetryOnDifferentSystemException.java new file mode 100644 index 000000000000..e0ca033a74e4 --- /dev/null +++ b/src/java/org/apache/cassandra/exceptions/RetryOnDifferentSystemException.java @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.exceptions; + +/** + * Thrown when a non-transactional operation is attempted when the operation needs to be done transactionally (or vice versa) + * and it could interfere with operations performed transactionally or can't be applied by the chosen transaction system. + * + * The correct way to handle this is to forward the error the originator of the operation who can then retry it on + * the correct system. + */ +public class RetryOnDifferentSystemException extends RuntimeException +{ +} diff --git a/src/java/org/apache/cassandra/gms/EndpointState.java b/src/java/org/apache/cassandra/gms/EndpointState.java index c96f99da2e6f..26fe33ec698c 100644 --- a/src/java/org/apache/cassandra/gms/EndpointState.java +++ b/src/java/org/apache/cassandra/gms/EndpointState.java @@ -26,6 +26,7 @@ import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Function; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -53,8 +54,19 @@ public class EndpointState public final static IVersionedSerializer serializer = new EndpointStateSerializer(); public final static IVersionedSerializer nullableSerializer = NullableSerializer.wrap(serializer); - private volatile HeartBeatState hbState; - private final AtomicReference> applicationState; + private static class View + { + final HeartBeatState hbState; + final Map applicationState; + + private View(HeartBeatState hbState, Map applicationState) + { + this.hbState = hbState; + this.applicationState = applicationState; + } + } + + private final AtomicReference ref; /* fields below do not get serialized */ private volatile long updateTimestamp; @@ -62,19 +74,20 @@ public class EndpointState public EndpointState(HeartBeatState initialHbState) { - this(initialHbState, new EnumMap(ApplicationState.class)); + this(initialHbState, new EnumMap<>(ApplicationState.class)); } public EndpointState(EndpointState other) { - this(new HeartBeatState(other.hbState), new EnumMap<>(other.applicationState.get())); + ref = new AtomicReference<>(other.ref.get()); + updateTimestamp = nanoTime(); + isAlive = true; } @VisibleForTesting public EndpointState(HeartBeatState initialHbState, Map states) { - hbState = initialHbState; - applicationState = new AtomicReference<>(new EnumMap<>(states)); + ref = new AtomicReference<>(new View(initialHbState, new EnumMap<>(states))); updateTimestamp = nanoTime(); isAlive = true; } @@ -82,28 +95,53 @@ public EndpointState(HeartBeatState initialHbState, Map fn) + { + HeartBeatState previous = null; + HeartBeatState update = null; + while (true) + { + View view = ref.get(); + if (previous == null || view.hbState != previous) // if this races with updating states then can avoid bumping versions + update = fn.apply(view.hbState); + if (ref.compareAndSet(view, new View(update, view.applicationState))) + return; + previous = view.hbState; + } } public VersionedValue getApplicationState(ApplicationState key) { - return applicationState.get().get(key); + return ref.get().applicationState.get(key); } public boolean containsApplicationState(ApplicationState key) { - return applicationState.get().containsKey(key); + return ref.get().applicationState.containsKey(key); } public Set> states() { - return applicationState.get().entrySet(); + return ref.get().applicationState.entrySet(); } public void addApplicationState(ApplicationState key, VersionedValue value) @@ -117,17 +155,27 @@ public void addApplicationStates(Map values) } public void addApplicationStates(Set> values) + { + addApplicationStates(values, null); + } + + public void addApplicationStates(Set> values, @Nullable HeartBeatState hbState) { while (true) { - Map orig = applicationState.get(); + View view = this.ref.get(); + Map orig = view.applicationState; Map copy = new EnumMap<>(orig); for (Map.Entry value : values) copy.put(value.getKey(), value.getValue()); - if (applicationState.compareAndSet(orig, copy)) + if (this.ref.compareAndSet(view, new View(hbState == null ? view.hbState : hbState, copy))) + { + if (hbState != null) + updateTimestamp(); return; + } } } @@ -135,18 +183,19 @@ void removeMajorVersion3LegacyApplicationStates() { while (hasLegacyFields()) { - Map orig = applicationState.get(); + View view = ref.get(); + Map orig = view.applicationState; Map updatedStates = filterMajorVersion3LegacyApplicationStates(orig); // avoid updating if no state is removed if (orig.size() == updatedStates.size() - || applicationState.compareAndSet(orig, updatedStates)) + || ref.compareAndSet(view, new View(view.hbState, updatedStates))) return; } } private boolean hasLegacyFields() { - Set statesPresent = applicationState.get().keySet(); + Set statesPresent = ref.get().applicationState.keySet(); if (statesPresent.isEmpty()) return false; return (statesPresent.contains(ApplicationState.STATUS) && statesPresent.contains(ApplicationState.STATUS_WITH_PORT)) @@ -211,7 +260,7 @@ public void markDead() public boolean isStateEmpty() { - return applicationState.get().isEmpty(); + return ref.get().applicationState.isEmpty(); } /** @@ -219,14 +268,15 @@ public boolean isStateEmpty() */ public boolean isEmptyWithoutStatus() { - Map state = applicationState.get(); + View view = ref.get(); + Map state = view.applicationState; boolean hasStatus = state.containsKey(ApplicationState.STATUS_WITH_PORT) || state.containsKey(ApplicationState.STATUS); - return hbState.isEmpty() && !hasStatus + return view.hbState.isEmpty() && !hasStatus // In the very specific case where hbState.isEmpty and STATUS is missing, this is known to be safe to "fake" // the data, as this happens when the gossip state isn't coming from the node but instead from a peer who // restarted and is missing the node's state. // - // When hbState is not empty, then the node gossiped an empty STATUS; this happens during bootstrap and it's not + // When hbState is not empty, then the node gossiped an empty STATUS; this happens during bootstrap, and it's not // possible to tell if this is ok or not (we can't really tell if the node is dead or having networking issues). // For these cases allow an external actor to verify and inform Cassandra that it is safe - this is done by // updating the LOOSE_DEF_OF_EMPTY_ENABLED field. @@ -273,7 +323,8 @@ public CassandraVersion getReleaseVersion() public String toString() { - return "EndpointState: HeartBeatState = " + hbState + ", AppStateMap = " + applicationState.get(); + View view = ref.get(); + return "EndpointState: HeartBeatState = " + view.hbState + ", AppStateMap = " + view.applicationState; } public boolean isSupersededBy(EndpointState that) diff --git a/src/java/org/apache/cassandra/gms/FailureDetector.java b/src/java/org/apache/cassandra/gms/FailureDetector.java index 49b208929748..da90efc3326f 100644 --- a/src/java/org/apache/cassandra/gms/FailureDetector.java +++ b/src/java/org/apache/cassandra/gms/FailureDetector.java @@ -326,7 +326,7 @@ public boolean isAlive(InetAddressAndPort ep) // an error in that case. ClusterMetadata metadata = ClusterMetadata.current(); if (!metadata.directory.allJoinedEndpoints().contains(ep) && !metadata.fullCMSMembers().contains(ep)) - logger.error("Unknown endpoint: " + ep, new IllegalArgumentException("Unknown endpoint: " + ep)); + logger.error("Unknown endpoint: " + ep, new UnknownEndpointException(ep)); } return epState != null && epState.isAlive(); } @@ -437,6 +437,14 @@ public String toString() sb.append("-----------------------------------------------------------------------"); return sb.toString(); } + + public static class UnknownEndpointException extends IllegalArgumentException + { + public UnknownEndpointException(InetAddressAndPort ep) + { + super("Unknown endpoint: " + ep); + } + } } /* diff --git a/src/java/org/apache/cassandra/gms/Gossiper.java b/src/java/org/apache/cassandra/gms/Gossiper.java index 14cc5f5adaaf..d6aebf1ba80f 100644 --- a/src/java/org/apache/cassandra/gms/Gossiper.java +++ b/src/java/org/apache/cassandra/gms/Gossiper.java @@ -249,7 +249,7 @@ public void run() taskLock.lock(); /* Update the local heartbeat counter. */ - endpointStateMap.get(getBroadcastAddressAndPort()).getHeartBeatState().updateHeartBeat(); + endpointStateMap.get(getBroadcastAddressAndPort()).updateHeartBeat(); if (logger.isTraceEnabled()) logger.trace("My heartbeat is now {}", endpointStateMap.get(FBUtilities.getBroadcastAddressAndPort()).getHeartBeatState().getHeartBeatVersion()); final List gDigests = new ArrayList<>(); @@ -560,7 +560,7 @@ protected void markAsShutdown(InetAddressAndPort endpoint) epState.addApplicationState(ApplicationState.STATUS_WITH_PORT, shutdown); epState.addApplicationState(ApplicationState.STATUS, StorageService.instance.valueFactory.shutdown(true)); epState.addApplicationState(ApplicationState.RPC_READY, StorageService.instance.valueFactory.rpcReady(false)); - epState.getHeartBeatState().forceHighestPossibleVersionUnsafe(); + epState.forceHighestPossibleVersionUnsafe(); markDead(endpoint, epState); FailureDetector.instance.forceConviction(endpoint); GossiperDiagnostics.markedAsShutdown(this, endpoint); @@ -586,7 +586,7 @@ protected void markAsShutdown(InetAddressAndPort endpoint, EndpointState remoteS VersionedValue shutdown = remoteState.getApplicationState(ApplicationState.STATUS_WITH_PORT); if (shutdown == null) throw new AssertionError("Remote shutdown sent but missing STATUS_WITH_PORT; " + remoteState); - remoteState.getHeartBeatState().forceHighestPossibleVersionUnsafe(); + remoteState.forceHighestPossibleVersionUnsafe(); endpointStateMap.put(endpoint, remoteState); markDead(endpoint, remoteState); FailureDetector.instance.forceConviction(endpoint); @@ -1440,14 +1440,7 @@ private void applyNewStates(InetAddressAndPort addr, EndpointState localState, E // don't assert here, since if the node restarts the version will go back to zero int oldVersion = localState.getHeartBeatState().getHeartBeatVersion(); - localState.setHeartBeatState(remoteState.getHeartBeatState()); - if (logger.isTraceEnabled()) - logger.trace("Updating heartbeat state version to {} from {} for {} ...", localState.getHeartBeatState().getHeartBeatVersion(), oldVersion, addr); - - Set> remoteStates = remoteState.states(); - assert remoteState.getHeartBeatState().getGeneration() == localState.getHeartBeatState().getGeneration(); - - Set> updatedStates = remoteStates.stream().filter(entry -> { + Set> updatedStates = remoteState.states().stream().filter(entry -> { // filter out the states that are already up to date (has the same or higher version) VersionedValue local = localState.getApplicationState(entry.getKey()); return (local == null || local.version < entry.getValue().version); @@ -1460,7 +1453,9 @@ private void applyNewStates(InetAddressAndPort addr, EndpointState localState, E logger.trace("Updating {} state version to {} for {}", entry.getKey().toString(), entry.getValue().version, addr); } } - localState.addApplicationStates(updatedStates); + localState.addApplicationStates(updatedStates, remoteState.getHeartBeatState()); + if (logger.isTraceEnabled()) + logger.trace("Updating heartbeat state version to {} from {} for {} ...", localState.getHeartBeatState().getHeartBeatVersion(), oldVersion, addr); localState.removeMajorVersion3LegacyApplicationStates(); // need to run STATUS or STATUS_WITH_PORT first to handle BOOT_REPLACE correctly (else won't be a member, so TOKENS won't be processed) @@ -1765,7 +1760,7 @@ public void maybeInitializeLocalState(int generationNbr) public void forceNewerGeneration() { EndpointState epstate = endpointStateMap.get(getBroadcastAddressAndPort()); - epstate.getHeartBeatState().forceNewerGenerationUnsafe(); + epstate.forceNewerGenerationUnsafe(); } private void addLocalApplicationStateInternal(ApplicationState state, VersionedValue value) @@ -2046,6 +2041,11 @@ public void stopShutdownAndWait(long timeout, TimeUnit unit) throws InterruptedE ExecutorUtils.shutdownAndWait(timeout, unit, executor); } + public void shutdownAndWait(long timeout, TimeUnit unit) throws InterruptedException, TimeoutException + { + ExecutorUtils.shutdownAndWait(timeout, unit, executor); + } + @Nullable private String getReleaseVersionString(InetAddressAndPort ep) { diff --git a/src/java/org/apache/cassandra/gms/HeartBeatState.java b/src/java/org/apache/cassandra/gms/HeartBeatState.java index 3f633cb0fabd..374d346a0a8e 100644 --- a/src/java/org/apache/cassandra/gms/HeartBeatState.java +++ b/src/java/org/apache/cassandra/gms/HeartBeatState.java @@ -19,8 +19,6 @@ import java.io.*; -import com.google.common.annotations.VisibleForTesting; - import org.apache.cassandra.db.TypeSizes; import org.apache.cassandra.io.IVersionedSerializer; import org.apache.cassandra.io.util.DataInputPlus; @@ -29,15 +27,14 @@ /** * HeartBeat State associated with any given endpoint. */ - public class HeartBeatState { public static final int EMPTY_VERSION = -1; public static final IVersionedSerializer serializer = new HeartBeatStateSerializer(); - private volatile int generation; - private volatile int version; + private final int generation; + private final int version; HeartBeatState(int gen) { @@ -75,9 +72,9 @@ public int getGeneration() return generation; } - void updateHeartBeat() + HeartBeatState updateHeartBeat() { - version = VersionGenerator.getNextVersion(); + return new HeartBeatState(generation, VersionGenerator.getNextVersion()); } public int getHeartBeatVersion() @@ -85,15 +82,14 @@ public int getHeartBeatVersion() return version; } - void forceNewerGenerationUnsafe() + HeartBeatState forceNewerGenerationUnsafe() { - generation += 1; + return new HeartBeatState(generation + 1, version); } - @VisibleForTesting - public void forceHighestPossibleVersionUnsafe() + HeartBeatState forceHighestPossibleVersionUnsafe() { - version = Integer.MAX_VALUE; + return new HeartBeatState(generation, Integer.MAX_VALUE); } public String toString() diff --git a/src/java/org/apache/cassandra/hints/HintDiagnostics.java b/src/java/org/apache/cassandra/hints/HintDiagnostics.java index 285193b8a878..67e1f7380991 100644 --- a/src/java/org/apache/cassandra/hints/HintDiagnostics.java +++ b/src/java/org/apache/cassandra/hints/HintDiagnostics.java @@ -37,44 +37,44 @@ static void dispatcherCreated(HintsDispatcher dispatcher) { if (isEnabled(HintEventType.DISPATCHER_CREATED)) service.publish(new HintEvent(HintEventType.DISPATCHER_CREATED, dispatcher, - dispatcher.hostId, dispatcher.address, null, null, null, null)); + dispatcher.hostId, dispatcher.address, null, null, null, null, null)); } static void dispatcherClosed(HintsDispatcher dispatcher) { if (isEnabled(HintEventType.DISPATCHER_CLOSED)) service.publish(new HintEvent(HintEventType.DISPATCHER_CLOSED, dispatcher, - dispatcher.hostId, dispatcher.address, null, null, null, null)); + dispatcher.hostId, dispatcher.address, null, null, null, null, null)); } static void dispatchPage(HintsDispatcher dispatcher) { if (isEnabled(HintEventType.DISPATCHER_PAGE)) service.publish(new HintEvent(HintEventType.DISPATCHER_PAGE, dispatcher, - dispatcher.hostId, dispatcher.address, null, null, null, null)); + dispatcher.hostId, dispatcher.address, null, null, null, null, null)); } static void abortRequested(HintsDispatcher dispatcher) { if (isEnabled(HintEventType.ABORT_REQUESTED)) service.publish(new HintEvent(HintEventType.ABORT_REQUESTED, dispatcher, - dispatcher.hostId, dispatcher.address, null, null, null, null)); + dispatcher.hostId, dispatcher.address, null, null, null, null, null)); } - static void pageSuccessResult(HintsDispatcher dispatcher, long success, long failures, long timeouts) + static void pageSuccessResult(HintsDispatcher dispatcher, long success, long failures, long timeouts, long retryDifferentSystem) { if (isEnabled(HintEventType.DISPATCHER_HINT_RESULT)) service.publish(new HintEvent(HintEventType.DISPATCHER_HINT_RESULT, dispatcher, dispatcher.hostId, dispatcher.address, HintResult.PAGE_SUCCESS, - success, failures, timeouts)); + success, failures, timeouts, retryDifferentSystem)); } - static void pageFailureResult(HintsDispatcher dispatcher, long success, long failures, long timeouts) + static void pageFailureResult(HintsDispatcher dispatcher, long success, long failures, long timeouts, long retryDifferentSystem) { if (isEnabled(HintEventType.DISPATCHER_HINT_RESULT)) service.publish(new HintEvent(HintEventType.DISPATCHER_HINT_RESULT, dispatcher, dispatcher.hostId, dispatcher.address, HintResult.PAGE_FAILURE, - success, failures, timeouts)); + success, failures, timeouts, retryDifferentSystem)); } private static boolean isEnabled(HintEventType type) diff --git a/src/java/org/apache/cassandra/hints/HintEvent.java b/src/java/org/apache/cassandra/hints/HintEvent.java index 695357e9b63c..d709fbcd8288 100644 --- a/src/java/org/apache/cassandra/hints/HintEvent.java +++ b/src/java/org/apache/cassandra/hints/HintEvent.java @@ -64,10 +64,13 @@ enum HintResult private final Long pageHintsFailed; @Nullable private final Long pageHintsTimeout; + @Nullable + private final Long pageHintsRetryDifferentSystem; HintEvent(HintEventType type, HintsDispatcher dispatcher, UUID targetHostId, InetAddressAndPort targetAddress, @Nullable HintResult dispatchResult, @Nullable Long pageHintsSuccessful, - @Nullable Long pageHintsFailed, @Nullable Long pageHintsTimeout) + @Nullable Long pageHintsFailed, @Nullable Long pageHintsTimeout, + @Nullable Long pageHintsRetryDifferentSystem) { this.type = type; this.dispatcher = dispatcher; @@ -77,6 +80,7 @@ enum HintResult this.pageHintsSuccessful = pageHintsSuccessful; this.pageHintsFailed = pageHintsFailed; this.pageHintsTimeout = pageHintsTimeout; + this.pageHintsRetryDifferentSystem = pageHintsRetryDifferentSystem; } public Enum getType() @@ -96,6 +100,7 @@ public HashMap toMap() ret.put("hint.page.hints_succeeded", pageHintsSuccessful); ret.put("hint.page.hints_failed", pageHintsFailed); ret.put("hint.page.hints_timed_out", pageHintsTimeout); + ret.put("hint.page.hints_retry_different_system", pageHintsRetryDifferentSystem); } return ret; } diff --git a/src/java/org/apache/cassandra/hints/HintVerbHandler.java b/src/java/org/apache/cassandra/hints/HintVerbHandler.java index 73e6967e398e..be164b9be1ca 100644 --- a/src/java/org/apache/cassandra/hints/HintVerbHandler.java +++ b/src/java/org/apache/cassandra/hints/HintVerbHandler.java @@ -24,7 +24,9 @@ import org.slf4j.LoggerFactory; import org.apache.cassandra.db.partitions.PartitionUpdate; +import org.apache.cassandra.exceptions.RetryOnDifferentSystemException; import org.apache.cassandra.locator.InetAddressAndPort; +import org.apache.cassandra.metrics.HintsServiceMetrics; import org.apache.cassandra.net.IVerbHandler; import org.apache.cassandra.net.Message; import org.apache.cassandra.net.MessagingService; @@ -34,6 +36,8 @@ import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.tcm.membership.NodeId; +import static org.apache.cassandra.exceptions.RequestFailureReason.RETRY_ON_DIFFERENT_TRANSACTION_SYSTEM; + /** * Verb handler used both for hint dispatch and streaming. * @@ -100,8 +104,24 @@ else if (!StorageProxy.instance.appliesLocally(hint.mutation)) } else { - // the common path - the node is both the destination and a valid replica for the hint. - hint.applyFuture().addCallback(o -> respond(message), e -> logger.debug("Failed to apply hint", e)); + try + { + // the common path - the node is both the destination and a valid replica for the hint. + hint.applyFuture().addCallback( + o -> { + HintsServiceMetrics.hintsApplySucceeded.mark(); + respond(message); + }, + e -> { + HintsServiceMetrics.hintsApplyFailed.mark(); + logger.debug("Failed to apply hint", e); + } + ); + } + catch (RetryOnDifferentSystemException e) + { + MessagingService.instance().respondWithFailure(RETRY_ON_DIFFERENT_TRANSACTION_SYSTEM, message); + } } } diff --git a/src/java/org/apache/cassandra/hints/HintsBuffer.java b/src/java/org/apache/cassandra/hints/HintsBuffer.java index 646dd72febef..41c9cdf234f5 100644 --- a/src/java/org/apache/cassandra/hints/HintsBuffer.java +++ b/src/java/org/apache/cassandra/hints/HintsBuffer.java @@ -179,7 +179,19 @@ private Allocation allocate(int totalSize, OpOrder.Group opGroup) return new Allocation(offset, totalSize, opGroup); } - // allocate bytes in the slab, or return negative if not enough space + /** + * Allocate bytes in the segment, or return -1 if not enough space. Method ensures that marker bytes + * for each allocation (i.e. offset of its end) is written as a 32 bit integer at its beginning, and + * that these marker bytes are always written sequentially. In other words, if allocation A has a lower + * starting offset than allocation B, A's marker will always be written before the offset for B is returned. + * + * `allocateOffset` consists of two integers: + * 64 32 0 + * | (i32) inProgress | (i32) writtenTo | + * + * If inProgress bytes are not zeroes, they contain an unwritten offset. Before allocating any bytes, + * inProgresss bytes need to be written at the writtenTo location in the target buffer. + */ private int allocateBytes(int totalSize) { long prev = position.getAndAdd(totalSize); diff --git a/src/java/org/apache/cassandra/hints/HintsBufferPool.java b/src/java/org/apache/cassandra/hints/HintsBufferPool.java index 275dbc37e624..f6a0c2b4606b 100644 --- a/src/java/org/apache/cassandra/hints/HintsBufferPool.java +++ b/src/java/org/apache/cassandra/hints/HintsBufferPool.java @@ -21,6 +21,8 @@ import java.util.UUID; import java.util.concurrent.BlockingQueue; +import com.google.common.annotations.VisibleForTesting; + import org.apache.cassandra.net.MessagingService; import org.apache.cassandra.utils.concurrent.UncheckedInterruptedException; @@ -133,6 +135,13 @@ private HintsBuffer createBuffer() return HintsBuffer.create(bufferSize); } + @VisibleForTesting + public void clearUnsafe() + { + if (currentBuffer != null) + currentBuffer = currentBuffer.recycle(); + } + public void close() { currentBuffer.free(); diff --git a/src/java/org/apache/cassandra/hints/HintsCatalog.java b/src/java/org/apache/cassandra/hints/HintsCatalog.java index e989850dff98..902b239dec87 100644 --- a/src/java/org/apache/cassandra/hints/HintsCatalog.java +++ b/src/java/org/apache/cassandra/hints/HintsCatalog.java @@ -20,7 +20,11 @@ import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; -import java.util.*; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.UUID; import java.util.concurrent.ConcurrentHashMap; import java.util.stream.Stream; import javax.annotation.Nullable; @@ -30,10 +34,10 @@ import org.slf4j.LoggerFactory; import org.apache.cassandra.config.DatabaseDescriptor; -import org.apache.cassandra.io.util.File; import org.apache.cassandra.io.FSError; import org.apache.cassandra.io.FSReadError; import org.apache.cassandra.io.FSWriteError; +import org.apache.cassandra.io.util.File; import org.apache.cassandra.io.util.FileUtils; import org.apache.cassandra.utils.NativeLibrary; import org.apache.cassandra.utils.SyncUtil; @@ -110,6 +114,11 @@ HintsStore getNullable(UUID hostId) return stores.get(hostId); } + void deleteAllHintsUnsafe() + { + stores.values().forEach(HintsStore::deleteAllHintsUnsafe); + } + /** * Delete all hints for all host ids. * diff --git a/src/java/org/apache/cassandra/hints/HintsDispatchExecutor.java b/src/java/org/apache/cassandra/hints/HintsDispatchExecutor.java index 5a566e363980..5777062db97d 100644 --- a/src/java/org/apache/cassandra/hints/HintsDispatchExecutor.java +++ b/src/java/org/apache/cassandra/hints/HintsDispatchExecutor.java @@ -26,9 +26,9 @@ import java.util.function.BooleanSupplier; import java.util.function.Predicate; import java.util.function.Supplier; +import javax.annotation.Nullable; import com.google.common.util.concurrent.RateLimiter; - import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -39,10 +39,11 @@ import org.apache.cassandra.locator.InetAddressAndPort; import org.apache.cassandra.service.StorageService; import org.apache.cassandra.tcm.ClusterMetadata; -import org.apache.cassandra.utils.concurrent.UncheckedInterruptedException; import org.apache.cassandra.utils.concurrent.Future; +import org.apache.cassandra.utils.concurrent.UncheckedInterruptedException; import static org.apache.cassandra.concurrent.ExecutorFactory.Global.executorFactory; +import static org.apache.cassandra.hints.HintsService.RETRY_ON_DIFFERENT_SYSTEM_UUID; /** * A multi-threaded (by default) executor for dispatching hints. @@ -273,7 +274,7 @@ private boolean dispatch(HintsDescriptor descriptor) logger.trace("Dispatching hints file {}", descriptor.hintsFileName); InetAddressAndPort address = StorageService.instance.getEndpointForHostId(hostId); - if (address != null) + if (address != null || hostId == RETRY_ON_DIFFERENT_SYSTEM_UUID) return deliver(descriptor, address); // address == null means the target no longer exist; find new home for each hint entry. @@ -281,12 +282,12 @@ private boolean dispatch(HintsDescriptor descriptor) return true; } - private boolean deliver(HintsDescriptor descriptor, InetAddressAndPort address) + private boolean deliver(HintsDescriptor descriptor, @Nullable InetAddressAndPort address) { File file = descriptor.file(hintsDirectory); InputPosition offset = store.getDispatchOffset(descriptor); - BooleanSupplier shouldAbort = () -> !isAlive.test(address) || isPaused.get(); + BooleanSupplier shouldAbort = () -> (!hostId.equals(RETRY_ON_DIFFERENT_SYSTEM_UUID) && (address == null || !isAlive.test(address)) || isPaused.get()); try (HintsDispatcher dispatcher = HintsDispatcher.create(file, rateLimiter, address, descriptor.hostId, shouldAbort)) { if (offset != null) @@ -298,7 +299,7 @@ private boolean deliver(HintsDescriptor descriptor, InetAddressAndPort address) { store.delete(descriptor); store.cleanUp(descriptor); - logger.info("Finished hinted handoff of file {} to endpoint {}: {}", descriptor.fileName(), address, hostId); + logger.info("Finished hinted handoff of file {} to destination {}: {}", descriptor.fileName(), dispatcher.destination(), hostId); return true; } else @@ -322,7 +323,7 @@ private void handleDispatchFailure(HintsDispatcher dispatcher, HintsDescriptor d { store.markDispatchOffset(descriptor, dispatcher.dispatchPosition()); store.offerFirst(descriptor); - logger.info("Finished hinted handoff of file {} to endpoint {}: {}, partially", descriptor.fileName(), address, hostId); + logger.info("Finished hinted handoff of file {} to destination {}: {}, partially", descriptor.fileName(), dispatcher.destination(), hostId); } // for each hint in the hints file for a node that isn't part of the ring anymore, write RF hints for each replica diff --git a/src/java/org/apache/cassandra/hints/HintsDispatcher.java b/src/java/org/apache/cassandra/hints/HintsDispatcher.java index b6273385435b..350b7445e19b 100644 --- a/src/java/org/apache/cassandra/hints/HintsDispatcher.java +++ b/src/java/org/apache/cassandra/hints/HintsDispatcher.java @@ -18,27 +18,64 @@ package org.apache.cassandra.hints; import java.nio.ByteBuffer; -import java.util.*; +import java.util.ArrayList; +import java.util.BitSet; +import java.util.Collection; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; +import java.util.Queue; +import java.util.UUID; +import java.util.concurrent.TimeUnit; +import java.util.function.BiConsumer; import java.util.function.BooleanSupplier; import java.util.function.Function; +import javax.annotation.Nonnull; +import javax.annotation.Nullable; import com.google.common.util.concurrent.RateLimiter; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.cassandra.net.RequestCallback; +import org.apache.cassandra.concurrent.DebuggableTask.RunnableDebuggableTask; +import org.apache.cassandra.concurrent.Stage; +import org.apache.cassandra.db.Mutation; +import org.apache.cassandra.exceptions.RequestExecutionException; +import org.apache.cassandra.exceptions.RequestFailure; import org.apache.cassandra.exceptions.RequestFailureReason; +import org.apache.cassandra.exceptions.RetryOnDifferentSystemException; import org.apache.cassandra.io.util.File; import org.apache.cassandra.locator.InetAddressAndPort; +import org.apache.cassandra.locator.Replica; import org.apache.cassandra.metrics.HintsServiceMetrics; import org.apache.cassandra.net.Message; import org.apache.cassandra.net.MessagingService; +import org.apache.cassandra.net.RequestCallback; +import org.apache.cassandra.service.accord.IAccordService.IAccordResult; +import org.apache.cassandra.service.accord.txn.TxnResult; +import org.apache.cassandra.service.consensus.migration.ConsensusMigrationMutationHelper; +import org.apache.cassandra.service.consensus.migration.ConsensusMigrationMutationHelper.SplitMutation; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.tcm.ownership.DataPlacement; +import org.apache.cassandra.tcm.ownership.VersionedEndpoints; +import org.apache.cassandra.transport.Dispatcher; +import org.apache.cassandra.utils.Clock; +import org.apache.cassandra.utils.MonotonicClock; +import org.apache.cassandra.utils.NoSpamLogger; import org.apache.cassandra.utils.concurrent.Condition; - -import static org.apache.cassandra.hints.HintsDispatcher.Callback.Outcome.*; +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkState; +import static org.apache.cassandra.hints.HintsDispatcher.Callback.Outcome.FAILURE; +import static org.apache.cassandra.hints.HintsDispatcher.Callback.Outcome.INTERRUPTED; +import static org.apache.cassandra.hints.HintsDispatcher.Callback.Outcome.RETRY_DIFFERENT_SYSTEM; +import static org.apache.cassandra.hints.HintsDispatcher.Callback.Outcome.SUCCESS; +import static org.apache.cassandra.hints.HintsDispatcher.Callback.Outcome.TIMEOUT; +import static org.apache.cassandra.hints.HintsService.RETRY_ON_DIFFERENT_SYSTEM_UUID; +import static org.apache.cassandra.metrics.HintsServiceMetrics.ACCORD_HINT_ENDPOINT; import static org.apache.cassandra.metrics.HintsServiceMetrics.updateDelayMetrics; import static org.apache.cassandra.net.Verb.HINT_REQ; +import static org.apache.cassandra.service.accord.txn.TxnResult.Kind.retry_new_protocol; import static org.apache.cassandra.utils.MonotonicClock.Global.approxTime; import static org.apache.cassandra.utils.concurrent.Condition.newOneTimeCondition; @@ -51,19 +88,28 @@ final class HintsDispatcher implements AutoCloseable { private static final Logger logger = LoggerFactory.getLogger(HintsDispatcher.class); + private static final NoSpamLogger noSpamLogger = NoSpamLogger.getLogger(logger, 1, TimeUnit.MINUTES); private enum Action { CONTINUE, ABORT } private final HintsReader reader; final UUID hostId; + + @Nullable final InetAddressAndPort address; private final int messagingVersion; private final BooleanSupplier abortRequested; private InputPosition currentPagePosition; - private HintsDispatcher(HintsReader reader, UUID hostId, InetAddressAndPort address, int messagingVersion, BooleanSupplier abortRequested) + // Hints from the batch log that were attempted on Accord don't have a list of hosts that need hinting + // since Accord doesn't expose that on failure. If Accord no longer manages the range for this hint then we need + // to send the hint to all replicas after the page succeeds + private final Queue hintsNeedingRehinting = new LinkedList<>(); + + private HintsDispatcher(HintsReader reader, UUID hostId, @Nullable InetAddressAndPort address, int messagingVersion, BooleanSupplier abortRequested) { + checkArgument(address != null ^ hostId.equals(RETRY_ON_DIFFERENT_SYSTEM_UUID), "address must be nonnull or hostId must be " + RETRY_ON_DIFFERENT_SYSTEM_UUID); currentPagePosition = null; this.reader = reader; @@ -73,9 +119,9 @@ private HintsDispatcher(HintsReader reader, UUID hostId, InetAddressAndPort addr this.abortRequested = abortRequested; } - static HintsDispatcher create(File file, RateLimiter rateLimiter, InetAddressAndPort address, UUID hostId, BooleanSupplier abortRequested) + static HintsDispatcher create(File file, RateLimiter rateLimiter, @Nullable InetAddressAndPort address, UUID hostId, BooleanSupplier abortRequested) { - int messagingVersion = MessagingService.instance().versions.get(address); + int messagingVersion = address == null ? MessagingService.current_version : MessagingService.instance().versions.get(address); HintsDispatcher dispatcher = new HintsDispatcher(HintsReader.open(file, rateLimiter), hostId, address, messagingVersion, abortRequested); HintDiagnostics.dispatcherCreated(dispatcher); return dispatcher; @@ -87,6 +133,11 @@ public void close() reader.close(); } + String destination() + { + return address == null ? "RETRY_ON_DIFFERENT_SYSTEM" : address.toString(); + } + void seek(InputPosition position) { reader.seek(position); @@ -125,7 +176,19 @@ private Action dispatch(HintsReader.Page page) private Action sendHintsAndAwait(HintsReader.Page page) { - Collection callbacks = new ArrayList<>(); + try + { + return doSendHintsAndAwait(page, null); + } + finally + { + hintsNeedingRehinting.clear(); + } + } + + private Action doSendHintsAndAwait(HintsReader.Page page, @Nullable BitSet hintsFilter) + { + List callbacks = new ArrayList<>(); /* * If hints file messaging version matches the version of the target host, we'll use the optimised path - @@ -133,50 +196,137 @@ private Action sendHintsAndAwait(HintsReader.Page page) * * If that is not the case, we'll need to perform conversion to a newer (or an older) format, and decoding the hint * is an unavoidable intermediate step. + * + * If these hints are from the batchlog and were originally attempted on Accord then + * we also need to decode so we can route the Hint contents appropriately. + * + * If filtering of hints is requested, because this is retrying a page that had some retry on different system + * errors, then also don't go down the sendEncodedHints path since it won't re-route the mutation and will trigger + * the same retry on different system error. */ - Action action = reader.descriptor().messagingVersion() == messagingVersion - ? sendHints(page.buffersIterator(), callbacks, this::sendEncodedHint) - : sendHints(page.hintsIterator(), callbacks, this::sendHint); + boolean isBatchLogHints = hostId.equals(RETRY_ON_DIFFERENT_SYSTEM_UUID); + boolean sendEncodedHints = reader.descriptor().messagingVersion() == messagingVersion && !isBatchLogHints && hintsFilter == null; + // If the hints filter is set then splitting the hints is needed and encoded hints can't do that + checkState(!sendEncodedHints || hintsFilter == null, "Should not send encoded hints if hints filter is set"); + Action action = sendEncodedHints + ? sendHints(page.buffersIterator(), null, callbacks, this::sendEncodedHint) + : sendHints(page.hintsIterator(), hintsFilter, callbacks, this::sendHint); if (action == Action.ABORT) return action; - long success = 0, failures = 0, timeouts = 0; - for (Callback cb : callbacks) + BitSet retryDifferentSystemHints = new BitSet(callbacks.size()); + long success = 0, failures = 0, timeouts = 0, retryDifferentSystem = 0; + for (int i = 0; i < callbacks.size(); i++) { + Callback cb = callbacks.get(i); Callback.Outcome outcome = cb.await(); if (outcome == Callback.Outcome.SUCCESS) success++; else if (outcome == Callback.Outcome.FAILURE) failures++; else if (outcome == Callback.Outcome.TIMEOUT) timeouts++; + else if (outcome == RETRY_DIFFERENT_SYSTEM) + { + retryDifferentSystemHints.set(i); + retryDifferentSystem++; + } + else throw new IllegalStateException("Unhandled outcome: " + outcome); } - updateMetrics(success, failures, timeouts); + updateMetrics(success, failures, timeouts, retryDifferentSystem); + + // If the only errors were retryDifferentSystem and we aren't already filtering the hints then retry + // immediately otherwise we will repeat the page later including any successful hints we may have already delivered + // Hints for the batch log can hit RETRY_DIFFERENT_SYSTEM but don't need to be retried here and it could result + // in the same hint ending up in hintsNeedingRehinting twice + boolean failedRetryDifferentSystem = false; + if (retryDifferentSystem > 0 && failures < 1 && timeouts < 1 && hintsFilter == null && !isBatchLogHints) + { + reader.seek(currentPagePosition); + Action retryResult = doSendHintsAndAwait(page, retryDifferentSystemHints); + if (retryResult != Action.CONTINUE) + failedRetryDifferentSystem = true; + } - if (failures > 0 || timeouts > 0) + // The batchlog Accord hints need to return abort if any hint needs to be retried and retry the whole page + // since we don't want hints to ping pong back and forth via hintsNeedingRehinting + if (failures > 0 || timeouts > 0 || failedRetryDifferentSystem || (isBatchLogHints && retryDifferentSystem > 0)) { - HintDiagnostics.pageFailureResult(this, success, failures, timeouts); + HintDiagnostics.pageFailureResult(this, success, failures, timeouts, retryDifferentSystem); return Action.ABORT; } else { - HintDiagnostics.pageSuccessResult(this, success, failures, timeouts); + HintDiagnostics.pageSuccessResult(this, success, failures, timeouts, retryDifferentSystem); + rehintHintsNeedingRehinting(); return Action.CONTINUE; } } - private void updateMetrics(long success, long failures, long timeouts) + private void rehintHintsNeedingRehinting() + { + ClusterMetadata cm = ClusterMetadata.current(); + Hint hint; + while ((hint = hintsNeedingRehinting.poll()) != null) + { + HintsService.instance.writeForAllReplicas(hint); + Mutation mutation = hint.mutation; + // Also may need to apply locally because it's possible this is from the batchlog + // and we never applied it locally + // TODO (review): Additional error handling necessary? Hints are lossy + DataPlacement dataPlacement = cm.placements.get(cm.schema.getKeyspace(mutation.getKeyspaceName()).getMetadata().params.replication); + VersionedEndpoints.ForToken forToken = dataPlacement.writes.forToken(mutation.key().getToken()); + Replica self = forToken.get().selfIfPresent(); + if (self != null) + { + Stage.MUTATION.maybeExecuteImmediately(new RunnableDebuggableTask() + { + private final long approxCreationTimeNanos = MonotonicClock.Global.approxTime.now(); + private volatile long approxStartTimeNanos; + + @Override + public void run() + { + approxStartTimeNanos = MonotonicClock.Global.approxTime.now(); + mutation.apply(); + } + + @Override + public long creationTimeNanos() + { + return approxCreationTimeNanos; + } + + @Override + public long startTimeNanos() + { + return approxStartTimeNanos; + } + + @Override + public String description() + { + return "HintsService rehinting Accord txn"; + } + }); + } + } + + } + + private void updateMetrics(long success, long failures, long timeouts, long retryDifferentSystem) { HintsServiceMetrics.hintsSucceeded.mark(success); HintsServiceMetrics.hintsFailed.mark(failures); HintsServiceMetrics.hintsTimedOut.mark(timeouts); + HintsServiceMetrics.hintsRetryDifferentSystem.mark(retryDifferentSystem); } /* * Sending hints in compatibility mode. */ - - private Action sendHints(Iterator hints, Collection callbacks, Function sendFunction) + private Action sendHints(Iterator hints, @Nullable BitSet hintsFilter, Collection callbacks, Function sendFunction) { + int hintIndex = -1; while (hints.hasNext()) { if (abortRequested.getAsBoolean()) @@ -184,19 +334,95 @@ private Action sendHints(Iterator hints, Collection callbacks, HintDiagnostics.abortRequested(this); return Action.ABORT; } - callbacks.add(sendFunction.apply(hints.next())); + + T hint = hints.next(); + hintIndex++; + if (hintsFilter != null && !hintsFilter.get(hintIndex)) + continue; + + callbacks.add(sendFunction.apply(hint)); } return Action.CONTINUE; } private Callback sendHint(Hint hint) { - Callback callback = new Callback(hint.creationTime); - Message message = Message.out(HINT_REQ, new HintMessage(hostId, hint)); - MessagingService.instance().sendWithCallback(message, address, callback); + ClusterMetadata cm = ClusterMetadata.current(); + SplitHint splitHint = splitHintIntoAccordAndNormal(cm, hint); + Mutation accordHintMutation = splitHint.accordMutation; + Dispatcher.RequestTime requestTime; + IAccordResult accordTxnResult = null; + if (accordHintMutation != null) + { + requestTime = Dispatcher.RequestTime.forImmediateExecution(); + accordTxnResult = accordHintMutation != null ? ConsensusMigrationMutationHelper.instance().mutateWithAccordAsync(cm, accordHintMutation, null, requestTime) : null; + } + + Hint normalHint = splitHint.normalHint; + Callback callback = new Callback(address, hint.creationTime, accordTxnResult); + if (normalHint != null) + { + // We had a hint that was supposed to be done on Accord for the batch log (otherwise address would be non-null), + // but Accord no longer manages that table/range and now we don't know which nodes (if any) are missing the Mutation. + // Convert them to per replica hints *after* all the hints in this page have been applied so we can be reasonably sure + // this page isn't going to be played again thus avoiding any futher amplification from the same hint being + // replayed and repeatedly converted to per replica hints + if (address == null) + { + checkState(hostId.equals(RETRY_ON_DIFFERENT_SYSTEM_UUID), "If there is no address to send the hint to then the host ID should be BATCHLOG_ACCORD_HINT_UUID"); + callback.onResponse(null); + hintsNeedingRehinting.add(normalHint); + } + else + { + Message message = Message.out(HINT_REQ, new HintMessage(hostId, normalHint)); + MessagingService.instance().sendWithCallback(message, address, callback); + } + } + else + { + // Don't wait for a normal response that will never come since no hints were sent + callback.onResponse(null); + } + return callback; } + /** + * Result of splitting a hint across Accord and non-transactional boundaries + */ + private class SplitHint + { + private final Mutation accordMutation; + private final Hint normalHint; + + public SplitHint(Mutation accordMutation, Hint normalHint) + { + this.accordMutation = accordMutation; + this.normalHint = normalHint; + } + + @Override + public String toString() + { + return "SplitHint{" + + "accordMutation=" + accordMutation + + ", normalHint=" + normalHint + + '}'; + } + } + + private SplitHint splitHintIntoAccordAndNormal(ClusterMetadata cm, Hint hint) + { + SplitMutation splitMutation = ConsensusMigrationMutationHelper.instance().splitMutationIntoAccordAndNormal(hint.mutation, cm); + if (splitMutation.accordMutation == null) + return new SplitHint(null, hint); + if (splitMutation.normalMutation == null) + return new SplitHint(splitMutation.accordMutation, null); + Hint normalHint = Hint.create(splitMutation.normalMutation, hint.creationTime, splitMutation.normalMutation.smallestGCGS()); + return new SplitHint(splitMutation.accordMutation, normalHint); + } + /* * Sending hints in raw mode. */ @@ -204,23 +430,36 @@ private Callback sendHint(Hint hint) private Callback sendEncodedHint(ByteBuffer hint) { HintMessage.Encoded message = new HintMessage.Encoded(hostId, hint, messagingVersion); - Callback callback = new Callback(message.getHintCreationTime()); + Callback callback = new Callback(address, message.getHintCreationTime()); MessagingService.instance().sendWithCallback(Message.out(HINT_REQ, message), address, callback); return callback; } - static final class Callback implements RequestCallback + static final class Callback implements RequestCallback, BiConsumer { - enum Outcome { SUCCESS, TIMEOUT, FAILURE, INTERRUPTED } + enum Outcome { SUCCESS, TIMEOUT, FAILURE, INTERRUPTED, RETRY_DIFFERENT_SYSTEM } private final long start = approxTime.now(); private final Condition condition = newOneTimeCondition(); - private volatile Outcome outcome; + private Outcome normalOutcome; + private Outcome accordOutcome; + @Nullable + private final InetAddressAndPort to; private final long hintCreationNanoTime; - private Callback(long hintCreationTimeMillisSinceEpoch) + private Callback(@Nonnull InetAddressAndPort to, long hintCreationTimeMillisSinceEpoch) { + this(to, hintCreationTimeMillisSinceEpoch, null); + } + + private Callback(@Nullable InetAddressAndPort to, long hintCreationTimeMillisSinceEpoch, @Nullable IAccordResult accordTxnResult) + { + this.to = to != null ? to : ACCORD_HINT_ENDPOINT; this.hintCreationNanoTime = approxTime.translate().fromMillisSinceEpoch(hintCreationTimeMillisSinceEpoch); + if (accordTxnResult != null) + accordTxnResult.addCallback(this); + else + accordOutcome = SUCCESS; } Outcome await() @@ -235,8 +474,31 @@ Outcome await() logger.warn("Hint dispatch was interrupted", e); return INTERRUPTED; } + normalOutcome = timedOut ? TIMEOUT : normalOutcome; + + return outcome(); + } - return timedOut ? TIMEOUT : outcome; + private Outcome outcome() + { + checkState((normalOutcome != null && accordOutcome != null) || (normalOutcome != SUCCESS || accordOutcome != SUCCESS), "Outcome for both normal and accord hint delivery should be known"); + if (normalOutcome == RETRY_DIFFERENT_SYSTEM || accordOutcome == RETRY_DIFFERENT_SYSTEM) + return RETRY_DIFFERENT_SYSTEM; + if (normalOutcome == TIMEOUT || accordOutcome == TIMEOUT) + return TIMEOUT; + if (normalOutcome == FAILURE || accordOutcome == FAILURE) + return FAILURE; + checkState(normalOutcome == SUCCESS && accordOutcome == SUCCESS, "Hint delivery should have been successful"); + return SUCCESS; + } + + private synchronized void maybeSignal() + { + if ((normalOutcome != null && accordOutcome != null) || normalOutcome == FAILURE || accordOutcome == FAILURE) + { + updateDelayMetrics(to, approxTime.now() - this.hintCreationNanoTime); + condition.signalAll(); + } } @Override @@ -246,18 +508,53 @@ public boolean invokeOnFailure() } @Override - public void onFailure(InetAddressAndPort from, RequestFailureReason failureReason) + public void onFailure(InetAddressAndPort from, RequestFailure failureMessage) { - outcome = FAILURE; - condition.signalAll(); + if (failureMessage.reason == RequestFailureReason.RETRY_ON_DIFFERENT_TRANSACTION_SYSTEM) + normalOutcome = RETRY_DIFFERENT_SYSTEM; + else + normalOutcome = FAILURE; + maybeSignal(); } @Override public void onResponse(Message msg) { - updateDelayMetrics(msg.from(), approxTime.now() - this.hintCreationNanoTime); - outcome = SUCCESS; - condition.signalAll(); + normalOutcome = SUCCESS; + maybeSignal(); + } + + @Override + public void accept(TxnResult success, Throwable fail) + { + if (fail != null) + { + if (fail instanceof RequestExecutionException || fail instanceof RetryOnDifferentSystemException) + { + if (fail instanceof RetryOnDifferentSystemException) + accordOutcome = RETRY_DIFFERENT_SYSTEM; + else + accordOutcome = TIMEOUT; + String msg = "Accord hint delivery transaction failed retriably"; + if (noSpamLogger.getStatement(msg).shouldLog(Clock.Global.nanoTime())) + logger.error(msg, fail); + } + else + { + accordOutcome = FAILURE; + String msg = "Accord hint delivery transaction failed permanently"; + if (noSpamLogger.getStatement(msg).shouldLog(Clock.Global.nanoTime())) + logger.error(msg, fail); + } + } + else + { + if (success.kind() == retry_new_protocol) + accordOutcome = RETRY_DIFFERENT_SYSTEM; + else + accordOutcome = SUCCESS; + } + maybeSignal(); } } } diff --git a/src/java/org/apache/cassandra/hints/HintsReader.java b/src/java/org/apache/cassandra/hints/HintsReader.java index 2738f023f734..117e1ccdd805 100644 --- a/src/java/org/apache/cassandra/hints/HintsReader.java +++ b/src/java/org/apache/cassandra/hints/HintsReader.java @@ -34,6 +34,7 @@ import org.apache.cassandra.exceptions.UnknownTableException; import org.apache.cassandra.io.FSReadError; +import org.apache.cassandra.metrics.HintsServiceMetrics; import org.apache.cassandra.schema.TableId; import org.apache.cassandra.service.StorageService; import org.apache.cassandra.utils.AbstractIterator; @@ -231,8 +232,7 @@ private Hint computeNextInternal() throws IOException private Hint readHint(int size) throws IOException { - if (rateLimiter != null) - rateLimiter.acquire(size); + applyThrottleRateLimit(size); input.limit(size); Hint hint; @@ -338,8 +338,7 @@ private ByteBuffer computeNextInternal() throws IOException private ByteBuffer readBuffer(int size) throws IOException { - if (rateLimiter != null) - rateLimiter.acquire(size); + applyThrottleRateLimit(size); input.limit(size); ByteBuffer buffer = Hint.serializer.readBufferIfLive(input, now, size, descriptor.messagingVersion()); @@ -364,4 +363,13 @@ private static boolean verifyAllZeros(ChecksummedDataInput input) throws IOExcep } return true; } + + private void applyThrottleRateLimit(int size) + { + if (rateLimiter != null) + { + rateLimiter.acquire(size); + HintsServiceMetrics.hintsThrottle.inc(size); + } + } } diff --git a/src/java/org/apache/cassandra/hints/HintsService.java b/src/java/org/apache/cassandra/hints/HintsService.java index a1877802d2af..0372c40e6418 100644 --- a/src/java/org/apache/cassandra/hints/HintsService.java +++ b/src/java/org/apache/cassandra/hints/HintsService.java @@ -33,30 +33,32 @@ import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.ImmutableMap; -import org.apache.cassandra.db.Keyspace; -import org.apache.cassandra.io.util.File; -import org.apache.cassandra.locator.ReplicaLayout; -import org.apache.cassandra.utils.concurrent.Future; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.cassandra.concurrent.ScheduledExecutors; import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.config.ParameterizedClass; +import org.apache.cassandra.db.Keyspace; +import org.apache.cassandra.dht.Token; import org.apache.cassandra.gms.FailureDetector; import org.apache.cassandra.gms.IFailureDetector; +import org.apache.cassandra.io.util.File; import org.apache.cassandra.locator.EndpointsForToken; import org.apache.cassandra.locator.InetAddressAndPort; +import org.apache.cassandra.locator.ReplicaLayout; import org.apache.cassandra.metrics.HintedHandoffMetrics; import org.apache.cassandra.metrics.StorageMetrics; -import org.apache.cassandra.dht.Token; import org.apache.cassandra.service.StorageProxy; import org.apache.cassandra.service.StorageService; import org.apache.cassandra.utils.MBeanWrapper; +import org.apache.cassandra.utils.TimeUUID; +import org.apache.cassandra.utils.concurrent.Future; import org.apache.cassandra.utils.concurrent.UncheckedInterruptedException; import static com.google.common.collect.Iterables.filter; import static com.google.common.collect.Iterables.transform; +import static org.apache.cassandra.config.CassandraRelevantProperties.HINT_DISPATCH_INTERVAL_MS; /** * A singleton-ish wrapper over various hints components: @@ -70,12 +72,35 @@ */ public final class HintsService implements HintsServiceMBean { - private static final Logger logger = LoggerFactory.getLogger(HintsService.class); + // Dummy address to use for storing metrics for hints that will be retried on a different transaction system + // and aren't being sent to a specific node + public static final InetAddressAndPort RETRY_ON_DIFFERENT_SYSTEM_ADDRESS; + + static + { + try + { + RETRY_ON_DIFFERENT_SYSTEM_ADDRESS = InetAddressAndPort.getByNameOverrideDefaults("0.0.0.0", 65535); + } + catch (UnknownHostException e) + { + throw new RuntimeException(e); + } + } + + // Batch log replay may need to route some mutations to Accord which may fail and Hints are used for retry by the batch log. + // Write them to this endpoint which indicates that on replay hints will need to calculate the endpoints + // to deliver to since it's not really a per node hint, but part of a batch that needs replaying. + // This can also occur with regular mutations as well when some replicas return a retry error but quorum + // is reached so hinting is used to bring the other replicas up to date + public static final UUID RETRY_ON_DIFFERENT_SYSTEM_UUID = TimeUUID.atUnixMicrosWithLsbAsUUID(-1, -1); public static HintsService instance = new HintsService(); public static final String MBEAN_NAME = "org.apache.cassandra.hints:type=HintsService"; + private static final Logger logger = LoggerFactory.getLogger(HintsService.class); + private static final int MIN_BUFFER_SIZE = 32 << 20; static final ImmutableMap EMPTY_PARAMS = ImmutableMap.of(); @@ -214,6 +239,14 @@ public void flushAndFsyncBlockingly(Iterable hostIds) writeExecutor.fsyncWritersBlockingly(stores); } + @VisibleForTesting + public void flushAndFsyncBlockingly() + { + List stores = catalog.stores().collect(Collectors.toList()); + writeExecutor.flushBufferPool(bufferPool, stores); + writeExecutor.fsyncWritersBlockingly(stores); + } + public synchronized void startDispatch() { if (isShutDown) @@ -226,7 +259,8 @@ public synchronized void startDispatch() HintsDispatchTrigger trigger = new HintsDispatchTrigger(catalog, writeExecutor, dispatchExecutor, isDispatchPaused); // triggering hint dispatch is now very cheap, so we can do it more often - every 10 seconds vs. every 10 minutes, // previously; this reduces mean time to delivery, and positively affects batchlog delivery latencies, too - triggerDispatchFuture = ScheduledExecutors.scheduledTasks.scheduleWithFixedDelay(trigger, 10, 10, TimeUnit.SECONDS); + long hintDispatchIntervalMs = HINT_DISPATCH_INTERVAL_MS.getLong(); + triggerDispatchFuture = ScheduledExecutors.scheduledTasks.scheduleWithFixedDelay(trigger, hintDispatchIntervalMs, hintDispatchIntervalMs, TimeUnit.MILLISECONDS); } public void pauseDispatch() @@ -258,6 +292,17 @@ public long getTotalHintsSize(UUID hostId) return store.getTotalFileSize(); } + /** + * Get the total hints file size of current node + */ + public long getTotalHintsSizeOfNode() + { + return catalog.stores() + .filter(Objects::nonNull) + .mapToLong(HintsStore::getTotalFileSize) + .sum(); + } + /** * Gracefully and blockingly shut down the service. * @@ -322,6 +367,13 @@ public void deleteAllHints() catalog.deleteAllHints(); } + @VisibleForTesting + public void deleteAllHintsUnsafe() + { + catalog.deleteAllHintsUnsafe(); + bufferPool.clearUnsafe(); + } + /** * Deletes all hints for the provided destination. Doesn't make snapshots - should be used with care. * diff --git a/src/java/org/apache/cassandra/hints/HintsStore.java b/src/java/org/apache/cassandra/hints/HintsStore.java index cb3d67b8afda..795c1479e115 100644 --- a/src/java/org/apache/cassandra/hints/HintsStore.java +++ b/src/java/org/apache/cassandra/hints/HintsStore.java @@ -18,6 +18,7 @@ package org.apache.cassandra.hints; import java.io.IOException; +import java.nio.channels.ClosedChannelException; import java.util.Deque; import java.util.HashSet; import java.util.Iterator; @@ -174,6 +175,8 @@ public long findOldestHintTimestamp() boolean isLive() { + if (hostId.equals(HintsService.RETRY_ON_DIFFERENT_SYSTEM_UUID)) + return true; InetAddressAndPort address = address(); return address != null && FailureDetector.instance.isAlive(address); } @@ -193,6 +196,20 @@ void offerLast(HintsDescriptor descriptor) dispatchDequeue.offerLast(descriptor); } + void deleteAllHintsUnsafe() + { + try + { + closeWriter(); + } + catch (FSWriteError e) + { + if (!(e.getCause() instanceof ClosedChannelException)) + throw e; + } + deleteAllHints(); + } + void deleteAllHints() { HintsDescriptor descriptor; diff --git a/src/java/org/apache/cassandra/index/IndexBuildInProgressException.java b/src/java/org/apache/cassandra/index/IndexBuildInProgressException.java new file mode 100644 index 000000000000..1807ff36e0b4 --- /dev/null +++ b/src/java/org/apache/cassandra/index/IndexBuildInProgressException.java @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.index; + +/** + * Thrown if a secondary index is not currently available because it is building. + */ +public final class IndexBuildInProgressException extends RuntimeException +{ + public static final String INDEX_BUILD_IN_PROGRESS_ERROR = "The secondary index '%s' is not yet available as it is building"; + + /** + * Creates a new IndexIsBuildingException for the specified index. + * @param index the index + */ + public IndexBuildInProgressException(Index index) + { + super(String.format(INDEX_BUILD_IN_PROGRESS_ERROR, index.getIndexMetadata().name)); + } +} diff --git a/src/java/org/apache/cassandra/index/IndexStatusManager.java b/src/java/org/apache/cassandra/index/IndexStatusManager.java index cc98def63e9a..0f50a26276b4 100644 --- a/src/java/org/apache/cassandra/index/IndexStatusManager.java +++ b/src/java/org/apache/cassandra/index/IndexStatusManager.java @@ -24,12 +24,13 @@ import java.util.HashSet; import java.util.Map; import java.util.Set; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; import javax.annotation.Nonnull; import javax.annotation.Nullable; import com.google.common.annotations.VisibleForTesting; -import org.apache.cassandra.tcm.ClusterMetadata; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -46,7 +47,9 @@ import org.apache.cassandra.locator.Replica; import org.apache.cassandra.serializers.MarshalException; import org.apache.cassandra.service.StorageService; +import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.utils.CassandraVersion; +import org.apache.cassandra.utils.ExecutorUtils; import org.apache.cassandra.utils.FBUtilities; import org.apache.cassandra.utils.JsonUtils; @@ -89,6 +92,7 @@ public > E filterForQuery(E liveEndpoints, Keyspace keysp { // UNKNOWN states are transient/rare; only a few replicas should have this state at any time. See CASSANDRA-19400 Set queryableNonSucceeded = new HashSet<>(4); + Map indexStatusMap = new HashMap<>(); E queryableEndpoints = liveEndpoints.filter(replica -> { @@ -97,7 +101,10 @@ public > E filterForQuery(E liveEndpoints, Keyspace keysp { Index.Status status = getIndexStatus(replica.endpoint(), keyspace.getName(), index.getIndexMetadata().name); if (!index.isQueryable(status)) + { + indexStatusMap.put(replica.endpoint(), status); return false; + } if (status != Index.Status.BUILD_SUCCEEDED) allBuilt = false; @@ -125,7 +132,13 @@ public > E filterForQuery(E liveEndpoints, Keyspace keysp { Map failureReasons = new HashMap<>(); liveEndpoints.without(queryableEndpoints.endpoints()) - .forEach(replica -> failureReasons.put(replica.endpoint(), RequestFailureReason.INDEX_NOT_AVAILABLE)); + .forEach(replica -> { + Index.Status status = indexStatusMap.get(replica.endpoint()); + if (status == Index.Status.FULL_REBUILD_STARTED) + failureReasons.put(replica.endpoint(), RequestFailureReason.INDEX_BUILD_IN_PROGRESS); + else + failureReasons.put(replica.endpoint(), RequestFailureReason.INDEX_NOT_AVAILABLE); + }); throw new ReadFailureException(level, filtered, required, false, failureReasons); } @@ -325,4 +338,9 @@ private String identifier(String keyspace, String index) { return keyspace + '.' + index; } + + public void shutdownAndWait(long interval, TimeUnit unit) throws InterruptedException, TimeoutException + { + ExecutorUtils.shutdownAndWait(interval, unit, statusPropagationExecutor); + } } diff --git a/src/java/org/apache/cassandra/index/SecondaryIndexManager.java b/src/java/org/apache/cassandra/index/SecondaryIndexManager.java index 791293fbb951..5f1c6e3d52fb 100644 --- a/src/java/org/apache/cassandra/index/SecondaryIndexManager.java +++ b/src/java/org/apache/cassandra/index/SecondaryIndexManager.java @@ -41,6 +41,7 @@ import com.google.common.collect.Maps; import com.google.common.collect.Sets; import com.google.common.util.concurrent.FutureCallback; +import org.apache.cassandra.locator.InetAddressAndPort; import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -306,17 +307,29 @@ public boolean isIndexQueryable(Index index) /** * Throws an {@link IndexNotAvailableException} if any of the indexes in the specified {@link Index.QueryPlan} is - * not queryable, as it's defined by {@link #isIndexQueryable(Index)}. + * not queryable, as it's defined by {@link #isIndexQueryable(Index)}. If the reason for the index to be not available + * is that it's building, it will throw an {@link IndexBuildInProgressException}. * * @param queryPlan a query plan * @throws IndexNotAvailableException if the query plan has any index that is not queryable */ public void checkQueryability(Index.QueryPlan queryPlan) { + InetAddressAndPort endpoint = FBUtilities.getBroadcastAddressAndPort(); + for (Index index : queryPlan.getIndexes()) { + String indexName = index.getIndexMetadata().name; + Index.Status indexStatus = IndexStatusManager.instance.getIndexStatus(endpoint, keyspace.getName(), indexName); + if (!isIndexQueryable(index)) + { + // isQueryable is always true for non-SAI index implementations, thus we need to check both not queryable and building + if (indexStatus == Index.Status.FULL_REBUILD_STARTED) + throw new IndexBuildInProgressException(index); + throw new IndexNotAvailableException(index); + } } } diff --git a/src/java/org/apache/cassandra/index/accord/CheckpointIntervalArrayIndex.java b/src/java/org/apache/cassandra/index/accord/CheckpointIntervalArrayIndex.java new file mode 100644 index 000000000000..9fbb3634b99d --- /dev/null +++ b/src/java/org/apache/cassandra/index/accord/CheckpointIntervalArrayIndex.java @@ -0,0 +1,809 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.index.accord; + +import java.io.Closeable; +import java.io.IOException; +import java.io.UncheckedIOException; +import java.util.Arrays; +import java.util.Comparator; +import java.util.EnumMap; +import java.util.concurrent.TimeUnit; +import java.util.function.Consumer; +import java.util.function.Supplier; +import java.util.zip.CRC32C; +import java.util.zip.Checksum; + +import accord.utils.AsymmetricComparator; +import accord.utils.CheckpointIntervalArray; +import accord.utils.CheckpointIntervalArrayBuilder; +import accord.utils.CheckpointIntervalArrayBuilder.Accessor; +import accord.utils.SortedArrays; +import org.apache.cassandra.index.accord.IndexDescriptor.IndexComponent; +import org.apache.cassandra.io.util.ChecksumedRandomAccessReader; +import org.apache.cassandra.io.util.ChecksumedSequentialWriter; +import org.apache.cassandra.io.util.FileHandle; +import org.apache.cassandra.io.util.FileUtils; +import org.apache.cassandra.io.util.RandomAccessReader; +import org.apache.cassandra.utils.ByteArrayUtil; +import org.apache.cassandra.utils.Clock; +import org.apache.cassandra.utils.Throwables; + +import static accord.utils.CheckpointIntervalArrayBuilder.Links.LINKS; +import static accord.utils.CheckpointIntervalArrayBuilder.Strategy.ACCURATE; + +//TODO (now): Add support for variable length tokens; this is needed for Ordered partitioner (which we plan to support) +public class CheckpointIntervalArrayIndex +{ + private static final Accessor LIST_INTERVAL_ACCESSOR = new Accessor<>() + { + @Override + public int size(Interval[] intervals) + { + return intervals.length; + } + + @Override + public Interval get(Interval[] intervals, int index) + { + return intervals[index]; + } + + @Override + public byte[] start(Interval[] intervals, int index) + { + return intervals[index].start; + } + + @Override + public byte[] start(Interval interval) + { + return interval.start; + } + + @Override + public byte[] end(Interval[] intervals, int index) + { + return intervals[index].end; + } + + @Override + public byte[] end(Interval interval) + { + return interval.end; + } + + @Override + public Comparator keyComparator() + { + return (a, b) -> ByteArrayUtil.compareUnsigned(a, 0, b, 0, a.length); + } + + @Override + public int compareEndTo(Interval interval, byte[] key) + { + return ByteArrayUtil.compareUnsigned(interval.end, key); + } + + @Override + public int compareStartTo(Interval interval, byte[] key) + { + int c = ByteArrayUtil.compareUnsigned(interval.start, key); + if (c == 0) c = -1; + return c; + } + + @Override + public boolean endInclusive(Interval[] checksumedRandomAccessReader) + { + return true; + } + + @Override + public int binarySearch(Interval[] intervals, int from, int to, byte[] find, AsymmetricComparator comparator, SortedArrays.Search op) + { + return SortedArrays.binarySearch(intervals, from, to, find, comparator, op); + } + }; + public static final Supplier CHECKSUM_SUPPLIER = CRC32C::new; + + //TODO (performance): rather than row structure, would column structure be better? Sorted tokens tend to have prefix relationships + // so could compress the data more. The negative here is the binary search might cost more and the scan after the random access needs end + value... + // Its also possible to do a hybrid structure where either start/end are column and the other one is row based... + //TODO (performance): store min/max values so could filter based off metadata without having to walk the tree first? This means that the metadata + // doesn't need to be stored in-memory 100% of the time and only when a file "could" match. The perf here would trade read costs for less memory. + //TODO (fault tolerence): right now there is no checksumming outside of the header, so a corruption in the middle + // could lead to weird behavior... since this structure is fixed lenght it "should" only lead to mismatches or binary + // search going the wrong direction... + //TODO (fault tolerence): maybe replace readStart/End with readRecord and extrat the value from there, this makes it so it would be trivial to add a checksum per-record. + // Given the migration from SAI work, we can now remove the TableId from the data (16 bytes) so a 4 byte footer wouldn't be a big cost. We also compute the checksum on read/write + // right now, just ignore the value... the performance is currently better than with SAI (less overhead as we are not generic), so the checksumming costs are effectivally 0. + public static class SortedListWriter + { + private final int bytesPerKey, bytesPerValue; + + public SortedListWriter(int bytesPerKey, int bytesPerValue) + { + this.bytesPerKey = bytesPerKey; + this.bytesPerValue = bytesPerValue; + } + + public long write(ChecksumedSequentialWriter out, Interval[] sortedIntervals, Callback callback) throws IOException + { + long treeFilePointer = out.getFilePointer(); + // write header + out.resetChecksum(); // reset checksum so the header is isolated + out.writeUnsignedVInt32(bytesPerKey); + out.writeUnsignedVInt32(bytesPerValue); + out.writeUnsignedVInt32(sortedIntervals.length); + out.writeInt(out.getValue32AndResetChecksum()); + + // write values + callback.preWalk(sortedIntervals); + int count = 0; + for (Interval it : sortedIntervals) + { + validate(count, it); + + out.resetChecksum(); + out.write(it.start, 0, it.start.length); + out.write(it.end, 0, it.end.length); + out.write(it.value, 0, it.value.length); + out.writeInt(out.getValue32()); + callback.onWrite(count, it); + count++; + } + //TODO (now): don't need as this was here only for SAI. Offset/position are the same now + return count == 0 ? -1 : treeFilePointer; + } + + private void validate(int numIntervals, Interval it) + { + if (it.start.length != bytesPerKey) + throw new IllegalArgumentException("Interval " + numIntervals + "'s start value is size " + it.start.length + ", but expected " + bytesPerKey); + if (it.end.length != bytesPerKey) + throw new IllegalArgumentException("Interval " + numIntervals + "'s end value is size " + it.end.length + ", but expected " + bytesPerKey); + if (it.value.length != bytesPerValue) + throw new IllegalArgumentException("Interval " + numIntervals + "'s value is size " + it.value.length + ", but expected " + bytesPerValue); + } + + public interface Callback + { + default void preWalk(Interval[] sortedIntervals) throws IOException + { + } + + default void onWrite(int index, Interval interval) throws IOException + { + } + } + } + + public static class SortedListReader implements Closeable + { + private final FileHandle fh; + private final long firstRecordOffset; + private final int bytesPerKey, bytesPerValue, recordSize; + private final int count; + + public SortedListReader(FileHandle fh, long pos) + { + this.fh = fh; + + try (RandomAccessReader reader = fh.createReader(); + ChecksumedRandomAccessReader in = new ChecksumedRandomAccessReader(reader, CHECKSUM_SUPPLIER)) + { + if (pos != -1) + in.seek(pos); + + bytesPerKey = in.readUnsignedVInt32(); + bytesPerValue = in.readUnsignedVInt32(); + recordSize = bytesPerKey * 2 + bytesPerValue + Integer.BYTES; + count = in.readUnsignedVInt32(); + int actualChecksum = in.getValue32AndResetChecksum(); + int expectedChecksum = in.readInt(); + assert actualChecksum == expectedChecksum; + firstRecordOffset = reader.getFilePointer(); + } + catch (Throwable t) + { + FileUtils.closeQuietly(fh); + throw Throwables.unchecked(t); + } + } + + @Override + public void close() throws IOException + { + FileUtils.closeQuietly(fh); + } + + public enum SeekReason + {BINARY_SEARCH, GET, SCAN} + + private boolean maybeSeek(ChecksumedRandomAccessReader indexInput, Stats stats, SeekReason reason, long target) throws IOException + { + if (indexInput.getFilePointer() != target) + { + indexInput.seek(target); + switch (reason) + { + case SCAN: + stats.seekForScan++; + break; + case GET: + stats.seekForGet++; + break; + case BINARY_SEARCH: + stats.seekForBinarySearch++; + break; + default: + throw new IllegalArgumentException("Unknown reason: " + reason); + } + return true; + } + return false; + } + + public byte[] getRecord(ChecksumedRandomAccessReader indexInput, Stats stats, SeekReason reason, byte[] recordBuffer, int pos) throws IOException + { + maybeSeek(indexInput, stats, reason, fileOffsetStart(pos)); + return getCurrentRecord(indexInput, stats, recordBuffer); + } + + public byte[] getCurrentRecord(ChecksumedRandomAccessReader indexInput, Stats stats, byte[] recordBuffer) throws IOException + { + stats.bytesRead += recordBuffer.length + Integer.BYTES; + indexInput.resetChecksum(); + indexInput.readFully(recordBuffer, 0, recordBuffer.length); + int actualChecksum = indexInput.getValue32(); + int expectedChecksum = indexInput.readInt(); + assert actualChecksum == expectedChecksum; + return recordBuffer; + } + + public byte[] readStart(ChecksumedRandomAccessReader indexInput, Stats stats, byte[] recordBuffer, byte[] keyBuffer, int pos) throws IOException + { + getRecord(indexInput, stats, SeekReason.GET, recordBuffer, pos); + copyStart(recordBuffer, keyBuffer); + return keyBuffer; + } + + public byte[] readEnd(ChecksumedRandomAccessReader indexInput, Stats stats, byte[] recordBuffer, byte[] keyBuffer, int pos) throws IOException + { + getRecord(indexInput, stats, SeekReason.GET, recordBuffer, pos); + copyEnd(recordBuffer, keyBuffer); + return keyBuffer; + } + + public byte[] copyStart(byte[] recordBuffer, byte[] keyBuffer) + { + System.arraycopy(recordBuffer, 0, keyBuffer, 0, keyBuffer.length); + return keyBuffer; + } + + public byte[] copyEnd(byte[] recordBuffer, byte[] keyBuffer) + { + System.arraycopy(recordBuffer, bytesPerKey, keyBuffer, 0, bytesPerKey); + return keyBuffer; + } + + public int binarySearch(ChecksumedRandomAccessReader indexInput, Stats stats, byte[] recordBuffer, int from, int to, byte[] find, AsymmetricComparator comparator, SortedArrays.Search op) throws IOException + { + int found = -1; + while (from < to) + { + int i = (from + to) >>> 1; + int c = comparator.compare(find, getRecord(indexInput, stats, SeekReason.BINARY_SEARCH, recordBuffer, i)); + if (c < 0) + { + to = i; + } + else if (c > 0) + { + from = i + 1; + } + else + { + switch (op) + { + default: + throw new IllegalStateException("Unknown search operation: " + op); + case FAST: + return i; + + case CEIL: + to = found = i; + break; + + case FLOOR: + found = i; + from = i + 1; + } + } + } + // return -(low + 1); // key not found. + return found >= 0 ? found : -1 - to; + } + + public Interval copyTo(byte[] record, Interval buffer) + { + buffer.start = Arrays.copyOfRange(record, 0, bytesPerKey); + buffer.end = Arrays.copyOfRange(record, bytesPerKey, bytesPerKey * 2); + buffer.value = Arrays.copyOfRange(record, bytesPerKey * 2, record.length); + return buffer; + } + + private long fileOffsetStart(int offset) + { + if (offset >= count) + throw new IndexOutOfBoundsException("Start is from (0, " + count + "]; attempted to access " + offset); + return firstRecordOffset + (offset * recordSize); + } + + private long fileOffsetEnd(int offset) + { + if (offset >= count) + throw new IndexOutOfBoundsException("Start is from (0, " + count + "]; attempted to access " + offset); + return firstRecordOffset + (offset * recordSize) + bytesPerKey; + } + + private long fileOffsetValue(int offset) + { + if (offset >= count) + throw new IndexOutOfBoundsException("Start is from (0, " + count + "]; attempted to access " + offset); + return firstRecordOffset + (offset * recordSize) + bytesPerKey * 2; + } + } + + public static class CheckpointWriter implements SortedListWriter.Callback, Closeable + { + private final ChecksumedSequentialWriter out; + private final long offset; + private final long position; //TODO (now): don't need as this was here only for SAI. Offset/position are the same now + private long length = -1; + + public CheckpointWriter(ChecksumedSequentialWriter out) + { + this.out = out; + this.offset = position = out.getFilePointer(); + } + + @Override + public void preWalk(Interval[] sortedIntervals) throws IOException + { + class Checkpoints + { + final int[] bounds, headers, lists; + final int maxScanAndCheckpointMatches; + + Checkpoints(int[] bounds, int[] headers, int[] lists, int maxScanAndCheckpointMatches) + { + this.bounds = bounds; + this.headers = headers; + this.lists = lists; + this.maxScanAndCheckpointMatches = maxScanAndCheckpointMatches; + } + } + Checkpoints c = new CheckpointIntervalArrayBuilder<>(LIST_INTERVAL_ACCESSOR, sortedIntervals, ACCURATE, LINKS).build((ignore, bounds, headers, lists, max) -> new Checkpoints(bounds, headers, lists, max)); + out.resetChecksum(); // reset checksum so it only covers this metadata + out.writeUnsignedVInt32(c.maxScanAndCheckpointMatches); + write(c.bounds); + write(c.headers); + write(c.lists); + out.writeInt(out.getValue32AndResetChecksum()); + } + + private void write(int[] array) throws IOException + { + out.writeUnsignedVInt32(array.length); + for (int i = 0; i < array.length; i++) + out.writeVInt32(array[i]); + } + + @Override + public void close() throws IOException + { + length = out.getFilePointer() - offset; + out.close(); + } + } + + //TODO (performance): the current format assumes random list access is cheap, which isn't true for a disk index. + // This format was chosen as a place holder for now so we don't drift from the in-memory logic; in the original paper + // a new sorted list is used for each checkpoint, which then makes the access a sequential scan rather than random access. + public static class CheckpointReader implements Closeable + { + private final FileHandle fh; + private final int[] bounds, headers, lists; + private final int maxScanAndCheckpointMatches; + + public CheckpointReader(FileHandle fh, long pos) + { + this.fh = fh; + try (RandomAccessReader reader = fh.createReader(); + ChecksumedRandomAccessReader input = new ChecksumedRandomAccessReader(reader, CHECKSUM_SUPPLIER)) + { + if (pos != -1) + input.seek(pos); + + input.resetChecksum(); // reset checksum so it only covers this metadata + maxScanAndCheckpointMatches = input.readUnsignedVInt32(); + bounds = readArray(input); + headers = readArray(input); + lists = readArray(input); + int actualChecksum = input.getValue32AndResetChecksum(); + int expectedChecksum = input.readInt(); + assert actualChecksum == expectedChecksum; + } + catch (Throwable t) + { + FileUtils.closeQuietly(fh); + throw Throwables.unchecked(t); + } + } + + private static int[] readArray(ChecksumedRandomAccessReader input) throws IOException + { + int size = input.readUnsignedVInt32(); + int[] array = new int[size]; + for (int i = 0; i < size; i++) + array[i] = input.readVInt32(); + return array; + } + + @Override + public void close() throws IOException + { + FileUtils.closeQuietly(fh); + } + } + + public static class SegmentWriter + { + private final IndexDescriptor id; + private final SortedListWriter writer; + + public SegmentWriter(IndexDescriptor id, int bytesPerKey, int bytesPerValue) + { + this.id = id; + this.writer = new SortedListWriter(bytesPerKey, bytesPerValue); + } + + public EnumMap write(Interval[] sortedIntervals) throws IOException + { + EnumMap metas = new EnumMap<>(IndexComponent.class); + try (ChecksumedSequentialWriter treeOutput = ChecksumedSequentialWriter.open(id.fileFor(IndexComponent.CINTIA_SORTED_LIST), true, CHECKSUM_SUPPLIER); + CheckpointWriter checkpointWriter = new CheckpointWriter(ChecksumedSequentialWriter.open(id.fileFor(IndexComponent.CINTIA_CHECKPOINTS), true, CHECKSUM_SUPPLIER))) + { + // The SSTable component file is opened in append mode, so our offset is the current file pointer. + long sortedOffset = treeOutput.getFilePointer(); + long sortedPosition = writer.write(treeOutput, sortedIntervals, checkpointWriter); + + // If the treePosition is less than 0 then we didn't write any values out and the index is empty + if (sortedPosition < 0) + return metas; + //TODO (now): currently does SAI header so offset isn't correct here and need position + metas.put(IndexComponent.CINTIA_SORTED_LIST, new Segment.ComponentMetadata(sortedPosition, treeOutput.getFilePointer())); + metas.put(IndexComponent.CINTIA_CHECKPOINTS, new Segment.ComponentMetadata(checkpointWriter.position, checkpointWriter.out.getFilePointer())); + } + return metas; + } + } + + public static class Stats + { + int seekForGet, seekForBinarySearch, seekForScan; + long durationNs, bytesRead, matches; + + @Override + public String toString() + { + return "Stats{" + + "seeks={Get=" + seekForGet + + ", BinarySearch=" + seekForBinarySearch + + ", Scan=" + seekForScan + + "}, bytesRead=" + bytesRead + + ", matches=" + matches + + ", duration_micro=" + TimeUnit.NANOSECONDS.toMicros(durationNs) + + '}'; + } + } + + public static class SegmentSearcher implements Closeable + { + private final SortedListReader reader; + private final CheckpointReader checkpoints; + + public SegmentSearcher(FileHandle sortedListFile, + long sortedListPosition, + FileHandle checkpointFile, + long checkpointPosition) + { + this.reader = new SortedListReader(sortedListFile, sortedListPosition); + this.checkpoints = new CheckpointReader(checkpointFile, checkpointPosition); + } + + // contains + public Stats contains(byte[] key, Consumer callback) throws IOException + { + return run(ctx -> { + ctx.searcher.forEachKey(key, (i1, i2, i3, i4, index) -> { + ctx.stats.matches++; + callback.accept(reader.copyTo(ctx.accessor.get(ctx.indexInput, index), ctx.buffer)); + }, (i1, i2, i3, i4, startIdx, endIdx) -> { + try + { + if (startIdx == endIdx) + return; + reader.maybeSeek(ctx.indexInput, ctx.stats, SortedListReader.SeekReason.SCAN, reader.fileOffsetStart(startIdx)); + for (int i = startIdx; i < endIdx; i++) + { + ctx.stats.matches++; + reader.getCurrentRecord(ctx.indexInput, ctx.stats, ctx.recordBuffer); + callback.accept(reader.copyTo(ctx.recordBuffer, ctx.buffer)); + } + } + catch (IOException e) + { + throw new UncheckedIOException(e); + } + }, 0, 0, 0, 0, 0); + }); + } + + public Stats intersects(byte[] start, byte[] end, Consumer callback) throws IOException + { + return run(ctx -> { + ctx.searcher.forEachRange(start, end, (i1, i2, i3, i4, index) -> { + ctx.stats.matches++; + callback.accept(reader.copyTo(ctx.accessor.get(ctx.indexInput, index), ctx.buffer)); + }, (i1, i2, i3, i4, startIdx, endIdx) -> { + try + { + if (startIdx == endIdx) + return; + reader.maybeSeek(ctx.indexInput, ctx.stats, SortedListReader.SeekReason.SCAN, reader.fileOffsetStart(startIdx)); + for (int i = startIdx; i < endIdx; i++) + { + ctx.stats.matches++; + reader.getCurrentRecord(ctx.indexInput, ctx.stats, ctx.recordBuffer); + callback.accept(reader.copyTo(ctx.recordBuffer, ctx.buffer)); + } + } + catch (IOException e) + { + throw new UncheckedIOException(e); + } + }, 0, 0, 0, 0, 0); + }); + } + + private Stats run(Consumer fn) throws IOException + { + long startNanos = Clock.Global.nanoTime(); + Context ctx = new Context(); + try (ctx) + { + fn.accept(ctx); + } + finally + { + ctx.stats.durationNs = Clock.Global.nanoTime() - startNanos; + } + return ctx.stats; + } + + @Override + public void close() + { + FileUtils.closeQuietly(checkpoints); + FileUtils.closeQuietly(reader); + } + + private class Context implements Closeable + { + final byte[] keyBuffer = new byte[reader.bytesPerKey]; + final byte[] recordBuffer = new byte[reader.recordSize - Integer.BYTES]; + final Stats stats = new Stats(); + final ChecksumedRandomAccessReader indexInput = new ChecksumedRandomAccessReader(reader.fh.createReader(), CHECKSUM_SUPPLIER); + final Interval buffer = new Interval(); + final Accessor accessor = new Accessor<>() + { + @Override + public int size(ChecksumedRandomAccessReader indexInput) + { + return reader.count; + } + + @Override + public byte[] get(ChecksumedRandomAccessReader indexInput, int index) + { + try + { + return reader.getRecord(indexInput, stats, SortedListReader.SeekReason.GET, recordBuffer, index); + } + catch (IOException e) + { + throw new UncheckedIOException(e); + } + } + + @Override + public byte[] start(ChecksumedRandomAccessReader indexInput, int index) + { + try + { + return reader.readStart(indexInput, stats, recordBuffer, keyBuffer, index); + } + catch (IOException e) + { + throw new UncheckedIOException(e); + } + } + + @Override + public byte[] start(byte[] bytes) + { + return reader.copyStart(bytes, keyBuffer); + } + + @Override + public byte[] end(ChecksumedRandomAccessReader indexInput, int index) + { + try + { + return reader.readEnd(indexInput, stats, recordBuffer, keyBuffer, index); + } + catch (IOException e) + { + throw new UncheckedIOException(e); + } + } + + @Override + public byte[] end(byte[] bytes) + { + return reader.copyEnd(bytes, keyBuffer); + } + + @Override + public Comparator keyComparator() + { + return (a, b) -> ByteArrayUtil.compareUnsigned(a, 0, b, 0, a.length); + } + + @Override + public int compareEndTo(byte[] range, byte[] key) + { + return ByteArrayUtil.compareUnsigned(end(range), key); + } + + @Override + public int compareStartTo(byte[] range, byte[] key) + { + int c = ByteArrayUtil.compareUnsigned(start(range), key); + if (c == 0) c = 1; + return c; + } + + @Override + public boolean endInclusive(ChecksumedRandomAccessReader checksumedRandomAccessReader) + { + return true; + } + + @Override + public int binarySearch(ChecksumedRandomAccessReader indexInput, int from, int to, byte[] find, AsymmetricComparator comparator, SortedArrays.Search op) + { + try + { + return reader.binarySearch(indexInput, stats, recordBuffer, from, to, find, comparator, op); + } + catch (IOException e) + { + throw new UncheckedIOException(e); + } + } + }; + final CheckpointIntervalArray searcher = new CheckpointIntervalArray<>(accessor, indexInput, checkpoints.bounds, checkpoints.headers, checkpoints.lists, checkpoints.maxScanAndCheckpointMatches); + + @Override + public void close() throws IOException + { + indexInput.close(); + } + } + } + + public static class Interval implements Comparable + { + public byte[] start, end, value; // mutable to avoid allocating Interval for every element + + public Interval() + { + } + + public Interval(byte[] start, byte[] end, byte[] value) + { + this.start = start; + this.end = end; + this.value = value; + } + + public Interval(Interval other) + { + this.start = other.start; + this.end = other.end; + this.value = other.value; + } + + @Override + public int compareTo(Interval b) + { + int rc = compareStart(b); + if (rc == 0) + rc = ByteArrayUtil.compareUnsigned(value, 0, b.value, 0, value.length); + return rc; + } + + public int compareStart(Interval b) + { + int rc = ByteArrayUtil.compareUnsigned(start, 0, b.start, 0, start.length); + if (rc == 0) + rc = ByteArrayUtil.compareUnsigned(end, 0, b.end, 0, end.length); + return rc; + } + + public int compareEnd(Interval b) + { + int rc = ByteArrayUtil.compareUnsigned(end, 0, b.end, 0, end.length); + if (rc == 0) + rc = ByteArrayUtil.compareUnsigned(start, 0, b.start, 0, start.length); + return rc; + } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + Interval interval = (Interval) o; + return Arrays.equals(start, interval.start) && Arrays.equals(end, interval.end) && Arrays.equals(value, interval.value); + } + + @Override + public int hashCode() + { + int result = Arrays.hashCode(start); + result = 31 * result + Arrays.hashCode(end); + result = 31 * result + Arrays.hashCode(value); + return result; + } + + public boolean intersects(byte[] start, byte[] end) + { + if (ByteArrayUtil.compareUnsigned(this.start, end) >= 0) + return false; + if (ByteArrayUtil.compareUnsigned(this.end, start) <= 0) + return false; + return true; + } + } +} diff --git a/src/java/org/apache/cassandra/index/accord/Group.java b/src/java/org/apache/cassandra/index/accord/Group.java new file mode 100644 index 000000000000..e8e4d332949a --- /dev/null +++ b/src/java/org/apache/cassandra/index/accord/Group.java @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.index.accord; + +import java.util.Objects; + +import org.apache.cassandra.schema.TableId; + +public class Group implements Comparable +{ + public final int storeId; + public final TableId tableId; + + public Group(int storeId, TableId tableId) + { + this.storeId = storeId; + this.tableId = tableId; + } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + Group group = (Group) o; + return storeId == group.storeId && Objects.equals(tableId, group.tableId); + } + + @Override + public int hashCode() + { + return Objects.hash(storeId, tableId); + } + + @Override + public String toString() + { + return "Group{" + + "storeId=" + storeId + + ", tableId=" + tableId + + '}'; + } + + @Override + public int compareTo(Group o) + { + int rc = Integer.compare(storeId, o.storeId); + if (rc == 0) + rc = tableId.compareTo(o.tableId); + return rc; + } +} diff --git a/src/java/org/apache/cassandra/index/accord/IndexDescriptor.java b/src/java/org/apache/cassandra/index/accord/IndexDescriptor.java new file mode 100644 index 000000000000..b549751a3fb7 --- /dev/null +++ b/src/java/org/apache/cassandra/index/accord/IndexDescriptor.java @@ -0,0 +1,171 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.index.accord; + +import java.util.Collection; +import java.util.function.Function; +import java.util.regex.Pattern; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import org.apache.cassandra.db.ClusteringComparator; +import org.apache.cassandra.dht.IPartitioner; +import org.apache.cassandra.io.sstable.Component; +import org.apache.cassandra.io.sstable.Descriptor; +import org.apache.cassandra.io.sstable.format.SSTableReader; +import org.apache.cassandra.io.util.File; + +public class IndexDescriptor +{ + public enum Version + { + v1("aa", c -> defaultFileNameFormat(c, "aa")); + + public static final Version CURRENT = v1; + + public final String versionString; + public final Function fileNameFormatter; + + Version(String versionString, Function fileNameFormatter) + { + this.versionString = versionString; + this.fileNameFormatter = fileNameFormatter; + } + + + private static String defaultFileNameFormat(IndexComponent indexComponent, String version) + { + StringBuilder sb = new StringBuilder(); + + sb.append(IndexComponent.DESCRIPTOR).append(IndexComponent.SEPARATOR) + .append(version).append(IndexComponent.SEPARATOR) + .append(indexComponent.name).append(Descriptor.EXTENSION); + + return sb.toString(); + } + } + + public enum IndexComponent + { + CINTIA_SORTED_LIST("CintiaSortedList", (byte) 1), + CINTIA_CHECKPOINTS("CintiaCheckpoints", (byte) 2), + SEGMENT("Segement", (byte) 3), + METADATA("Metadata", (byte) 4); + + public static final String DESCRIPTOR = "ACCORD"; + public static final String SEPARATOR = "+"; + + public final String name; + public final Component.Type type; + public final byte value; + + IndexComponent(String name, byte value) + { + this.name = name; + this.type = componentType(name); + this.value = value; + } + + private static Component.Type componentType(String name) + { + String componentName = DESCRIPTOR + SEPARATOR + name; + String repr = Pattern.quote(DESCRIPTOR + SEPARATOR) + + ".*" + + Pattern.quote(SEPARATOR + name + ".db"); + return Component.Type.create(componentName, repr, true, null); + } + + public static IndexComponent fromByte(byte b) + { + switch (b) + { + case 1: return CINTIA_SORTED_LIST; + case 2: return CINTIA_CHECKPOINTS; + case 3: return SEGMENT; + case 4: return METADATA; + default:throw new IllegalArgumentException("Unknow byte: " + b); + } + } + } + + public final Version version; + public final Descriptor sstableDescriptor; + public final IPartitioner partitioner; + public final ClusteringComparator clusteringComparator; + + public IndexDescriptor(Version version, Descriptor sstableDescriptor, IPartitioner partitioner, ClusteringComparator clusteringComparator) + { + this.version = version; + this.sstableDescriptor = sstableDescriptor; + this.partitioner = partitioner; + this.clusteringComparator = clusteringComparator; + } + + public static IndexDescriptor create(SSTableReader sstable) + { + for (Version version : Version.values()) + { + IndexDescriptor id = new IndexDescriptor(version, sstable.descriptor, sstable.getPartitioner(), sstable.metadata().comparator); + if (id.isIndexBuildComplete()) + return id; + } + return new IndexDescriptor(Version.CURRENT, sstable.descriptor, sstable.getPartitioner(), sstable.metadata().comparator); + } + + public static IndexDescriptor create(Descriptor descriptor, IPartitioner partitioner, ClusteringComparator comparator) + { + return new IndexDescriptor(Version.CURRENT, descriptor, partitioner, comparator); + } + + public boolean isIndexBuildComplete() + { + return hasComponent(IndexComponent.METADATA); + } + + public boolean hasComponent(IndexComponent indexComponent) + { + return fileFor(indexComponent).exists(); + } + + public File fileFor(IndexComponent indexComponent) + { + Component c = indexComponent.type.createComponent(version.fileNameFormatter.apply(indexComponent)); + return sstableDescriptor.fileFor(c); + } + + public void deleteIndex() + { + Stream.of(IndexComponent.values()).map(this::fileFor).forEach(File::deleteIfExists); + } + + public Collection getLiveSSTableComponents() + { + return Stream.of(IndexComponent.values()) + .map(c -> c.type.createComponent(version.fileNameFormatter.apply(c))) + .filter(c -> sstableDescriptor.fileFor(c).exists()) + .collect(Collectors.toList()); + } + + public Collection getLiveComponents() + { + return Stream.of(IndexComponent.values()) + .filter(c -> fileFor(c).exists()) + .collect(Collectors.toList()); + } +} diff --git a/src/java/org/apache/cassandra/index/accord/IndexMetrics.java b/src/java/org/apache/cassandra/index/accord/IndexMetrics.java new file mode 100644 index 000000000000..cb12afe147e7 --- /dev/null +++ b/src/java/org/apache/cassandra/index/accord/IndexMetrics.java @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.index.accord; + +import java.util.ArrayList; +import java.util.List; + +import com.codahale.metrics.Timer; +import org.apache.cassandra.metrics.CassandraMetricsRegistry; +import org.apache.cassandra.metrics.DefaultNameFactory; + +import static org.apache.cassandra.metrics.CassandraMetricsRegistry.Metrics; + +// Stolen from org.apache.cassandra.index.sai.metrics.AbstractMetrics +public class IndexMetrics +{ + public static final String TYPE = "RouteIndex"; + private static final String SCOPE = "IndexMetrics"; + + private final List tracked = new ArrayList<>(); + + private final String ks; + private final String table; + private final String indexName; + public final Timer memtableIndexWriteLatency; + + public IndexMetrics(RouteJournalIndex index) + { + this.ks = index.baseCfs().getKeyspaceName(); + this.table = index.baseCfs().name; + this.indexName = index.getIndexMetadata().name; + memtableIndexWriteLatency = Metrics.timer(createMetricName("MemtableIndexWriteLatency")); + } + + public void release() + { + tracked.forEach(Metrics::remove); + tracked.clear(); + } + + private CassandraMetricsRegistry.MetricName createMetricName(String name) + { + String metricScope = ks + '.' + table; + if (indexName != null) + { + metricScope += '.' + indexName; + } + metricScope += '.' + SCOPE; + + CassandraMetricsRegistry.MetricName metricName = new CassandraMetricsRegistry.MetricName(DefaultNameFactory.GROUP_NAME, + TYPE, name, metricScope, createMBeanName(name, SCOPE)); + tracked.add(metricName); + return metricName; + } + + private String createMBeanName(String name, String scope) + { + StringBuilder builder = new StringBuilder(); + builder.append(DefaultNameFactory.GROUP_NAME); + builder.append(":type=").append(TYPE); + builder.append(',').append("keyspace=").append(ks); + builder.append(',').append("table=").append(table); + if (indexName != null) + builder.append(',').append("index=").append(indexName); + builder.append(',').append("scope=").append(scope); + builder.append(',').append("name=").append(name); + return builder.toString(); + } +} diff --git a/src/java/org/apache/cassandra/index/accord/MemtableIndex.java b/src/java/org/apache/cassandra/index/accord/MemtableIndex.java new file mode 100644 index 000000000000..8249ab3921a2 --- /dev/null +++ b/src/java/org/apache/cassandra/index/accord/MemtableIndex.java @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.index.accord; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Collection; +import java.util.concurrent.atomic.LongAdder; + +import org.apache.cassandra.db.Clustering; +import org.apache.cassandra.db.DecoratedKey; +import org.apache.cassandra.schema.TableId; + +public class MemtableIndex +{ + private final RangeMemoryIndex memoryIndex = new RangeMemoryIndex(); + private final LongAdder writeCount = new LongAdder(); + private final LongAdder estimatedMemoryUsed = new LongAdder(); + + public long writeCount() + { + return writeCount.sum(); + } + + public long estimatedMemoryUsed() + { + return estimatedMemoryUsed.sum(); + } + + public boolean isEmpty() + { + return memoryIndex.isEmpty(); + } + + public long index(DecoratedKey key, Clustering clustering, ByteBuffer value) + { + if (value == null || value.remaining() == 0) + return 0; + long size = memoryIndex.add(key, clustering, value); + writeCount.increment(); + estimatedMemoryUsed.add(size); + return size; + } + + public Segment write(IndexDescriptor id) throws IOException + { + return memoryIndex.write(id); + } + + public Collection search(int storeId, TableId tableId, byte[] start, boolean startInclusive, byte[] end, boolean endInclusive) + { + return memoryIndex.search(storeId, tableId, start, startInclusive, end, endInclusive); + } + + public Collection search(int storeId, TableId tableId, byte[] key) + { + return memoryIndex.search(storeId, tableId, key); + } +} diff --git a/src/java/org/apache/cassandra/index/accord/MemtableIndexManager.java b/src/java/org/apache/cassandra/index/accord/MemtableIndexManager.java new file mode 100644 index 000000000000..bdaebc2a94a7 --- /dev/null +++ b/src/java/org/apache/cassandra/index/accord/MemtableIndexManager.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.index.accord; + +import java.nio.ByteBuffer; +import java.util.NavigableSet; + +import org.apache.cassandra.db.DecoratedKey; +import org.apache.cassandra.db.lifecycle.LifecycleNewTracker; +import org.apache.cassandra.db.memtable.Memtable; +import org.apache.cassandra.db.rows.Row; +import org.apache.cassandra.schema.TableId; + +public interface MemtableIndexManager +{ + long index(DecoratedKey key, Row row, Memtable mt); + + MemtableIndex getPendingMemtableIndex(LifecycleNewTracker tracker); + + void discardMemtable(Memtable memtable); + + void renewMemtable(Memtable renewed); + + NavigableSet search(int storeId, TableId tableId, byte[] start, boolean startInclusive, byte[] end, boolean endInclusive); + NavigableSet search(int storeId, TableId tableId, byte[] key); +} diff --git a/src/java/org/apache/cassandra/index/accord/OrderedRouteSerializer.java b/src/java/org/apache/cassandra/index/accord/OrderedRouteSerializer.java new file mode 100644 index 000000000000..2e20bc63b61b --- /dev/null +++ b/src/java/org/apache/cassandra/index/accord/OrderedRouteSerializer.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.index.accord; + +import java.nio.ByteBuffer; + +import org.apache.cassandra.service.accord.api.TokenKey; +import org.apache.cassandra.utils.ByteBufferUtil; + +public class OrderedRouteSerializer +{ + public static ByteBuffer serialize(TokenKey key) + { + return TokenKey.serializer.serialize(key); + } + + public static byte[] serializeTokenOnly(TokenKey key) + { + return ByteBufferUtil.getArrayUnsafe(TokenKey.serializer.serializeWithoutPrefixOrLength(key)); + } + + public static TokenKey deserialize(ByteBuffer bb) + { + return TokenKey.serializer.deserialize(bb); + } +} diff --git a/src/java/org/apache/cassandra/index/accord/RangeMemoryIndex.java b/src/java/org/apache/cassandra/index/accord/RangeMemoryIndex.java new file mode 100644 index 000000000000..faefc0a51ba3 --- /dev/null +++ b/src/java/org/apache/cassandra/index/accord/RangeMemoryIndex.java @@ -0,0 +1,262 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.index.accord; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.EnumMap; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.NavigableSet; +import java.util.Set; +import java.util.TreeMap; +import java.util.TreeSet; +import java.util.stream.Collectors; +import javax.annotation.concurrent.GuardedBy; + +import accord.primitives.Participants; +import accord.primitives.Routable; +import accord.primitives.Unseekable; +import org.apache.cassandra.cache.IMeasurableMemory; +import org.apache.cassandra.db.Clustering; +import org.apache.cassandra.db.DecoratedKey; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.service.accord.AccordKeyspace; +import org.apache.cassandra.service.accord.TokenRange; +import org.apache.cassandra.utils.ByteArrayUtil; +import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.FastByteOperations; +import org.apache.cassandra.utils.ObjectSizes; +import org.apache.cassandra.utils.RTree; +import org.apache.cassandra.utils.RangeTree; + +import static org.apache.cassandra.index.accord.RouteIndexFormat.deserializeTouches; + +public class RangeMemoryIndex +{ + + @GuardedBy("this") + private final Map> map = new HashMap<>(); + @GuardedBy("this") + private final Map groupMetadata = new HashMap<>(); + + private static class Metadata + { + public byte[] minTerm, maxTerm; + } + + private static RangeTree createRangeTree() + { + return new RTree<>((a, b) -> ByteArrayUtil.compareUnsigned(a, 0, b, 0, a.length), new RangeTree.Accessor<>() + { + @Override + public byte[] start(Range range) + { + return range.start; + } + + @Override + public byte[] end(Range range) + { + return range.end; + } + + @Override + public boolean contains(byte[] start, byte[] end, byte[] bytes) + { + // bytes are ordered, start is exclusive, end is inclusive + return FastByteOperations.compareUnsigned(start, bytes) < 0 + && FastByteOperations.compareUnsigned(end, bytes) >= 0; + } + + @Override + public boolean intersects(Range range, byte[] start, byte[] end) + { + return range.intersects(start, end); + } + + @Override + public boolean intersects(Range left, Range right) + { + return left.intersects(right.start, right.end); + } + }); + } + + public synchronized long add(DecoratedKey key, Clustering clustering, ByteBuffer value) + { + Participants route; + try + { + route = deserializeTouches(value); + } + catch (IOException e) + { + throw new UncheckedIOException(e); + } + + return add(key, route); + } + + public synchronized long add(DecoratedKey key, Participants route) + { + if (route == null || route.domain() != Routable.Domain.Range) + return 0; + long sum = 0; + for (Unseekable keyOrRange : route) + sum += add(key, keyOrRange); + return sum; + } + + protected long add(DecoratedKey key, Unseekable keyOrRange) + { + if (keyOrRange.domain() != Routable.Domain.Range) + throw new IllegalArgumentException("Unexpected domain: " + keyOrRange.domain()); + TokenRange ts = (TokenRange) keyOrRange; + + int storeId = AccordKeyspace.JournalColumns.getStoreId(key); + TableId tableId = ts.table(); + Group group = new Group(storeId, tableId); + byte[] start = OrderedRouteSerializer.serializeTokenOnly(ts.start()); + byte[] end = OrderedRouteSerializer.serializeTokenOnly(ts.end()); + Range range = new Range(start, end); + map.computeIfAbsent(group, ignore -> createRangeTree()).add(range, key); + Metadata metadata = groupMetadata.computeIfAbsent(group, ignore -> new Metadata()); + + metadata.minTerm = metadata.minTerm == null ? start : ByteArrayUtil.compareUnsigned(metadata.minTerm, 0, start, 0, metadata.minTerm.length) > 0 ? start : metadata.minTerm; + metadata.maxTerm = metadata.maxTerm == null ? end : ByteArrayUtil.compareUnsigned(metadata.maxTerm, 0, end, 0, metadata.maxTerm.length) < 0 ? end : metadata.maxTerm; + return TableId.EMPTY_SIZE + range.unsharedHeapSize(); + } + + public synchronized NavigableSet search(int storeId, TableId tableId, byte[] start, boolean startInclusive, byte[] end, boolean endInclusive) + { + RangeTree rangesToPks = map.get(new Group(storeId, tableId)); + if (rangesToPks == null || rangesToPks.isEmpty()) + return Collections.emptyNavigableSet(); + TreeMap> matches = search(rangesToPks, start, end); + if (matches.isEmpty()) + return Collections.emptyNavigableSet(); + TreeSet pks = new TreeSet<>(); + matches.values().forEach(s -> s.forEach(d -> pks.add(d.getKey()))); + return pks; + } + + public TreeMap> search(RangeTree tokensToPks, byte[] start, byte[] end) + { + + TreeMap> matches = new TreeMap<>(); + tokensToPks.search(new Range(start, end), e -> matches.computeIfAbsent(e.getKey(), ignore -> new HashSet<>()).add(e.getValue())); + return matches; + } + + public synchronized NavigableSet search(int storeId, TableId tableId, byte[] key) + { + RangeTree rangesToPks = map.get(new Group(storeId, tableId)); + if (rangesToPks == null || rangesToPks.isEmpty()) + return Collections.emptyNavigableSet(); + + TreeMap> matches = new TreeMap<>(); + rangesToPks.searchToken(key, e -> matches.computeIfAbsent(e.getKey(), ignore -> new HashSet<>()).add(e.getValue())); + + TreeSet pks = new TreeSet<>(); + matches.values().forEach(s -> s.forEach(d -> pks.add(d.getKey()))); + return pks; + } + + public synchronized boolean isEmpty() + { + return map.isEmpty(); + } + + public synchronized Segment write(IndexDescriptor id) throws IOException + { + if (map.isEmpty()) + throw new AssertionError("Unable to write empty index"); + Map output = new HashMap<>(); + + List groups = new ArrayList<>(map.keySet()); + groups.sort(Comparator.naturalOrder()); + + for (Group group : groups) + { + RangeTree submap = map.get(group); + if (submap.isEmpty()) // is this possible? put here for safty so list is never empty + continue; + Metadata metadata = groupMetadata.get(group); + + //TODO (performance): if the RangeTree can return the data in sorted order, then this local can become faster + // Right now the code is based off RTree, which is undefined order, so we must iterate then sort; in testing this is a good chunk of the time of this method + List list = submap.stream() + .map(e -> new CheckpointIntervalArrayIndex.Interval(e.getKey().start, e.getKey().end, ByteBufferUtil.getArray(e.getValue().getKey()))) + .sorted(Comparator.naturalOrder()) + .collect(Collectors.toList()); + + CheckpointIntervalArrayIndex.SegmentWriter writer = new CheckpointIntervalArrayIndex.SegmentWriter(id, list.get(0).start.length, list.get(0).value.length); + EnumMap meta = writer.write(list.toArray(CheckpointIntervalArrayIndex.Interval[]::new)); + if (meta.isEmpty()) // don't include empty segments + continue; + output.put(group, new Segment.Metadata(meta, metadata.minTerm, metadata.maxTerm)); + } + + return new Segment(output); + } + + private static class Range implements Comparable, IMeasurableMemory + { + private static final long EMPTY_SIZE = ObjectSizes.measure(new Range(null, null)); + + private final byte[] start, end; + + private Range(byte[] start, byte[] end) + { + this.start = start; + this.end = end; + } + + @Override + public int compareTo(Range other) + { + int rc = ByteArrayUtil.compareUnsigned(start, 0, other.start, 0, start.length); + if (rc == 0) + rc = ByteArrayUtil.compareUnsigned(end, 0, other.end, 0, end.length); + return rc; + } + + @Override + public long unsharedHeapSize() + { + return EMPTY_SIZE + ObjectSizes.sizeOfArray(start) * 2; + } + + public boolean intersects(byte[] start, byte[] end) + { + if (ByteArrayUtil.compareUnsigned(this.start, 0, end, 0, end.length) >= 0) + return false; + if (ByteArrayUtil.compareUnsigned(this.end, 0, start, 0, start.length) <= 0) + return false; + return true; + } + } +} diff --git a/src/java/org/apache/cassandra/index/accord/RouteIndexFormat.java b/src/java/org/apache/cassandra/index/accord/RouteIndexFormat.java new file mode 100644 index 000000000000..56be6b9eed9d --- /dev/null +++ b/src/java/org/apache/cassandra/index/accord/RouteIndexFormat.java @@ -0,0 +1,338 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.index.accord; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.EnumMap; +import java.util.List; +import java.util.Map; +import java.util.UUID; +import java.util.function.Supplier; +import java.util.zip.CRC32C; +import java.util.zip.Checksum; + +import com.google.common.collect.Maps; + +import accord.local.StoreParticipants; +import accord.primitives.Participants; +import accord.primitives.TxnId; +import org.apache.cassandra.db.DecoratedKey; +import org.apache.cassandra.db.marshal.ByteBufferAccessor; +import org.apache.cassandra.db.marshal.Int32Type; +import org.apache.cassandra.db.rows.Cell; +import org.apache.cassandra.db.rows.Row; +import org.apache.cassandra.db.rows.Unfiltered; +import org.apache.cassandra.index.accord.IndexDescriptor.IndexComponent; +import org.apache.cassandra.io.AsymmetricVersionedSerializer; +import org.apache.cassandra.io.EmbeddedAsymmetricVersionedSerializer; +import org.apache.cassandra.io.UnversionedSerializer; +import org.apache.cassandra.io.sstable.SSTableFlushObserver; +import org.apache.cassandra.io.util.ChecksumedRandomAccessReader; +import org.apache.cassandra.io.util.ChecksumedSequentialWriter; +import org.apache.cassandra.io.util.FileHandle; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.serializers.UUIDSerializer; +import org.apache.cassandra.service.accord.AccordJournal; +import org.apache.cassandra.service.accord.AccordJournalTable; +import org.apache.cassandra.service.accord.AccordKeyspace; +import org.apache.cassandra.service.accord.JournalKey; +import org.apache.cassandra.service.accord.serializers.KeySerializers; +import org.apache.cassandra.service.accord.serializers.Version; +import org.apache.cassandra.utils.ByteArrayUtil; +import org.apache.cassandra.utils.Throwables; + +import static org.apache.cassandra.utils.Clock.Global.nowInSeconds; + +// A route index consists of a few files: cintia_sorted_list, cintia_checkpoints, and metadata +// metadata stores the segement mappings and stats needed for search selection +public class RouteIndexFormat +{ + public static final Supplier CHECKSUM_SUPPLIER = CRC32C::new; + + static final EmbeddedAsymmetricVersionedSerializer, Participants, Version> touches = localSerializer(KeySerializers.participants); + private static EmbeddedAsymmetricVersionedSerializer localSerializer(UnversionedSerializer serializer) + { + return new EmbeddedAsymmetricVersionedSerializer<>(Version.DOWNGRADE_SAFE_VERSION, Version.Serializer.instance, AsymmetricVersionedSerializer.from(serializer)); + } + + public static ByteBuffer serialize(Participants value) throws IOException + { + return touches.serialize(value); + } + + static Participants deserializeTouches(ByteBuffer bytes) throws IOException + { + if (bytes == null || ByteBufferAccessor.instance.isEmpty(bytes)) + return null; + + return touches.deserialize(bytes); + } + + public interface Writer extends SSTableFlushObserver + { + + } + + public static class SSTableIndexWriter extends MemtableRouteIndexWriter + { + private final RouteJournalIndex index; + private DecoratedKey current; + private JournalKey journalKey; + + public SSTableIndexWriter(RouteJournalIndex index, IndexDescriptor id) + { + super(id, new MemtableIndex()); + this.index = index; + } + + @Override + public void startPartition(DecoratedKey key, long keyPosition, long keyPositionForSASI) + { + this.current = key; + this.journalKey = AccordKeyspace.JournalColumns.getJournalKey(key); + } + + @Override + public void nextUnfilteredCluster(Unfiltered unfiltered) + { + // there is some duplication from org.apache.cassandra.index.accord.RouteMemtableIndexManager.index + // should this be cleaned up? + if (!unfiltered.isRow() || !RouteJournalIndex.allowed(journalKey)) + return; + Row row = (Row) unfiltered; + ByteBuffer value = extractParticipants(index, journalKey.id, row); + indexer.index(current, row.clustering(), value); + } + } + + public static ByteBuffer extractParticipants(RouteJournalIndex index, TxnId txnId, Row row) + { + boolean recordNull = row.getCell(AccordKeyspace.JournalColumns.record) == null; + boolean userVersionNull = row.getCell(AccordKeyspace.JournalColumns.user_version) == null; + if (recordNull != userVersionNull) + throw new IllegalStateException(String.format("Record is %s, but user_version is %s", + (recordNull ? "null" : "defined"), + (userVersionNull ? "null" : "defined"))); + if (recordNull) + return null; + Cell recordCell = row.getCell(AccordKeyspace.JournalColumns.record); + Cell user_versionCell = row.getCell(AccordKeyspace.JournalColumns.user_version); + long nowInSec = nowInSeconds(); + boolean recordLive = recordCell.isLive(nowInSec); + boolean user_versionLive = user_versionCell.isLive(nowInSec); + if (recordLive != user_versionLive) + throw new IllegalStateException(String.format("Record is %s, but user_version is %s", + (recordLive ? "live" : "dead"), + (user_versionLive ? "live" : "dead"))); + if (!recordLive) + return null; + ByteBuffer record = recordCell.buffer(); + Version user_version = Version.fromVersion(Int32Type.instance.compose(user_versionCell.buffer())); + AccordJournal.Builder builder = extract(txnId, record, user_version); + StoreParticipants participants = builder.participants(); + if (participants == null) + return null; + + Participants touches = participants.touches(); + if (touches == null) + return null; + + try + { + return serialize(touches); + } + catch (IOException e) + { + throw new UncheckedIOException(e); + } + } + + public static AccordJournal.Builder extract(TxnId txnId, ByteBuffer record, Version userVersion) + { + AccordJournal.Builder builder = new AccordJournal.Builder(txnId, AccordJournal.Load.ALL); + AccordJournalTable.readBuffer(record, builder::deserializeNext, userVersion); + return builder; + } + + public static class MemtableRouteIndexWriter implements Writer + { + private final IndexDescriptor id; + protected final MemtableIndex indexer; + + public MemtableRouteIndexWriter(IndexDescriptor id, MemtableIndex indexer) + { + this.id = id; + this.indexer = indexer; + } + + + @Override + public void begin() + { + // no-op + } + + @Override + public void startPartition(DecoratedKey key, long keyPosition, long keyPositionForSASI) + { + // no-op + } + + @Override + public void staticRow(Row staticRow) + { + // no-op + } + + @Override + public void nextUnfilteredCluster(Unfiltered unfiltered) + { + // no-op + } + + @Override + public void complete() + { + try + { + if (!indexer.isEmpty()) + { + Segment segment = indexer.write(id); + appendSegment(id, segment); + } + else + { + // nothing to see here... need to still mark the SSTable as indexed, so need an empty segment + appendSegment(id, Segment.EMPTY); + } + } + catch (IOException e) + { + abort(e); + throw Throwables.unchecked(e); + } + } + + @Override + public void abort(Throwable accumulator) + { + id.deleteIndex(); + } + + public void abort(Throwable accumulator, boolean fromIndex) + { + abort(accumulator); + // If the abort was from an index error, propagate the error upstream so index builds, compactions, and + // flushes can handle it correctly. + if (fromIndex) + throw Throwables.unchecked(accumulator); + } + } + + static List readSegments(Map index) throws IOException + { + List segments = new ArrayList<>(); + + try (ChecksumedRandomAccessReader metaReader = new ChecksumedRandomAccessReader(index.get(IndexComponent.METADATA).createReader(), CHECKSUM_SUPPLIER); + ChecksumedRandomAccessReader segmentReader = new ChecksumedRandomAccessReader(index.get(IndexComponent.SEGMENT).createReader(), CHECKSUM_SUPPLIER)) + { + while (metaReader.getFilePointer() < metaReader.length()) + { + metaReader.resetChecksum(); + long startPointer = metaReader.readUnsignedVInt(); + long endPointer = metaReader.readUnsignedVInt(); + int groupSize = metaReader.readUnsignedVInt32(); + int segmentChecksum = metaReader.readInt(); + int metadataChecksum = metaReader.getValue32AndResetChecksum(); + int actualChecksum = metaReader.readInt(); + assert actualChecksum == metadataChecksum; + + segmentReader.resetChecksum(); + segmentReader.seek(startPointer); + Map groups = Maps.newHashMapWithExpectedSize(groupSize); + for (int i = 0; i < groupSize; i++) + { + int storeId = segmentReader.readVInt32(); + TableId tableId = TableId.fromUUID(new UUID(segmentReader.readLong(), segmentReader.readLong())); + Group group = new Group(storeId, tableId); + int metaSize = segmentReader.readUnsignedVInt32(); + EnumMap metas = new EnumMap<>(IndexComponent.class); + for (int j = 0; j < metaSize; j++) + { + IndexComponent c = IndexComponent.fromByte(segmentReader.readByte()); + metas.put(c, new Segment.ComponentMetadata(segmentReader.readUnsignedVInt(), segmentReader.readUnsignedVInt())); + } + byte[] minTerm = ByteArrayUtil.readWithVIntLength(segmentReader); + byte[] maxTerm = ByteArrayUtil.readWithVIntLength(segmentReader); + Segment.Metadata existing = groups.put(group, new Segment.Metadata(metas, minTerm, maxTerm)); + assert existing == null; + } + int actualSegmentChecksum = segmentReader.getValue32AndResetChecksum(); + assert actualSegmentChecksum == segmentChecksum; + assert segmentReader.getFilePointer() == endPointer; + segments.add(new Segment(groups)); + } + } + return segments; + } + + static void appendSegment(IndexDescriptor id, Segment segment) throws IOException + { + List groups = new ArrayList<>(segment.groups.keySet()); + groups.sort(Comparator.naturalOrder()); + + try (ChecksumedSequentialWriter segmentWriter = ChecksumedSequentialWriter.open(id.fileFor(IndexComponent.SEGMENT), true, CHECKSUM_SUPPLIER); + ChecksumedSequentialWriter metadataWriter = ChecksumedSequentialWriter.open(id.fileFor(IndexComponent.METADATA), true, CHECKSUM_SUPPLIER)) + { + long startPointer = segmentWriter.getFilePointer(); + for (Group group : groups) + { + Segment.Metadata metadata = segment.groups.get(group); + writeGroup(segmentWriter, group, metadata); + } + long endPointer = segmentWriter.getFilePointer(); + + int checksum = segmentWriter.getValue32AndResetChecksum(); + metadataWriter.writeUnsignedVInt(startPointer); + metadataWriter.writeUnsignedVInt(endPointer); + metadataWriter.writeUnsignedVInt32(segment.groups.size()); + metadataWriter.writeInt(checksum); + metadataWriter.writeInt(metadataWriter.getValue32AndResetChecksum()); + } + } + + private static void writeGroup(ChecksumedSequentialWriter seq, Group group, Segment.Metadata metadata) throws IOException + { + seq.writeVInt32(group.storeId); + seq.write(UUIDSerializer.instance.serialize(group.tableId.asUUID())); + seq.writeUnsignedVInt32(metadata.metas.size()); + for (Map.Entry e : metadata.metas.entrySet()) + { + seq.writeByte(e.getKey().value); + seq.writeUnsignedVInt(e.getValue().offset); + seq.writeUnsignedVInt(e.getValue().endOffset); + } + ByteArrayUtil.writeWithVIntLength(metadata.minTerm, seq); + ByteArrayUtil.writeWithVIntLength(metadata.maxTerm, seq); + } +} diff --git a/src/java/org/apache/cassandra/index/accord/RouteJournalIndex.java b/src/java/org/apache/cassandra/index/accord/RouteJournalIndex.java new file mode 100644 index 000000000000..aa97dd981e8d --- /dev/null +++ b/src/java/org/apache/cassandra/index/accord/RouteJournalIndex.java @@ -0,0 +1,644 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.index.accord; + +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; +import java.util.NavigableSet; +import java.util.Optional; +import java.util.Set; +import java.util.concurrent.Callable; +import java.util.stream.Collectors; + +import com.google.common.base.Splitter; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import accord.primitives.TxnId; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.cql3.Operator; +import org.apache.cassandra.db.CassandraWriteContext; +import org.apache.cassandra.db.Clustering; +import org.apache.cassandra.db.ColumnFamilyStore; +import org.apache.cassandra.db.DecoratedKey; +import org.apache.cassandra.db.DeletionTime; +import org.apache.cassandra.db.ReadCommand; +import org.apache.cassandra.db.ReadExecutionController; +import org.apache.cassandra.db.RegularAndStaticColumns; +import org.apache.cassandra.db.WriteContext; +import org.apache.cassandra.db.compaction.CompactionManager; +import org.apache.cassandra.db.compaction.OperationType; +import org.apache.cassandra.db.filter.RowFilter; +import org.apache.cassandra.db.lifecycle.LifecycleNewTracker; +import org.apache.cassandra.db.lifecycle.Tracker; +import org.apache.cassandra.db.marshal.AbstractType; +import org.apache.cassandra.db.marshal.Int32Type; +import org.apache.cassandra.db.memtable.Memtable; +import org.apache.cassandra.db.partitions.PartitionUpdate; +import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator; +import org.apache.cassandra.db.rows.BTreeRow; +import org.apache.cassandra.db.rows.EncodingStats; +import org.apache.cassandra.db.rows.Row; +import org.apache.cassandra.db.rows.Unfiltered; +import org.apache.cassandra.db.rows.UnfilteredRowIterator; +import org.apache.cassandra.exceptions.InvalidRequestException; +import org.apache.cassandra.index.Index; +import org.apache.cassandra.index.IndexRegistry; +import org.apache.cassandra.index.sai.StorageAttachedIndex; +import org.apache.cassandra.index.transactions.IndexTransaction; +import org.apache.cassandra.io.sstable.Component; +import org.apache.cassandra.io.sstable.Descriptor; +import org.apache.cassandra.io.sstable.SSTableFlushObserver; +import org.apache.cassandra.io.sstable.format.SSTableReader; +import org.apache.cassandra.notifications.INotification; +import org.apache.cassandra.notifications.INotificationConsumer; +import org.apache.cassandra.notifications.MemtableDiscardedNotification; +import org.apache.cassandra.notifications.MemtableRenewedNotification; +import org.apache.cassandra.notifications.SSTableAddedNotification; +import org.apache.cassandra.notifications.SSTableListChangedNotification; +import org.apache.cassandra.schema.ColumnMetadata; +import org.apache.cassandra.schema.IndexMetadata; +import org.apache.cassandra.schema.SchemaConstants; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.service.ClientState; +import org.apache.cassandra.service.accord.AccordJournalTable; +import org.apache.cassandra.service.accord.AccordKeyspace; +import org.apache.cassandra.service.accord.JournalKey; +import org.apache.cassandra.service.accord.api.TokenKey; +import org.apache.cassandra.utils.AbstractIterator; +import org.apache.cassandra.utils.concurrent.Future; +import org.apache.cassandra.utils.concurrent.FutureCombiner; + +import static accord.primitives.Routable.Domain.Range; + +public class RouteJournalIndex implements Index, INotificationConsumer +{ + public enum RegisterStatus { PENDING, REGISTERED, UNREGISTERED } + + private static final Logger logger = LoggerFactory.getLogger(RouteJournalIndex.class); + + private static final Component.Type type = Component.Type.createSingleton("AccordRoute", "AccordRoute.*.db", true, null); + + private final ColumnFamilyStore baseCfs; + private final IndexMetadata indexMetadata; + private final IndexMetrics indexMetrics; + private final MemtableIndexManager memtableIndexManager; + private final SSTableManager sstableManager; + // Tracks whether we've started the index build on initialization. + private volatile boolean initBuildStarted = false; + private volatile RegisterStatus registerStatus = RegisterStatus.PENDING; + + public RouteJournalIndex(ColumnFamilyStore baseCfs, IndexMetadata indexMetadata) + { + validateTargets(baseCfs, indexMetadata); + + this.baseCfs = baseCfs; + // type is only IndexTarget.Type.VALUES + this.indexMetadata = indexMetadata; + + this.memtableIndexManager = new RouteMemtableIndexManager(this); + this.sstableManager = new RouteSSTableManager(); + this.indexMetrics = new IndexMetrics(this); + + Tracker tracker = baseCfs.getTracker(); + tracker.subscribe(this); + } + + public static boolean allowed(JournalKey id) + { + return id.type == JournalKey.Type.COMMAND_DIFF && allowed(id.id); + } + + public static boolean allowed(TxnId id) + { + return id.is(Range); + } + + private static void validateTargets(ColumnFamilyStore baseCfs, IndexMetadata indexMetadata) + { + // this contains 2 columns.... + if (!SchemaConstants.ACCORD_KEYSPACE_NAME.equals(baseCfs.getKeyspaceName())) + throw new IllegalArgumentException("Route index is only allowed for accord journal table; given " + baseCfs.metadata()); + if (!AccordKeyspace.JOURNAL.equals(baseCfs.name)) + throw new IllegalArgumentException("Route index is only allowed for accord journal table; given " + baseCfs.metadata()); + Set columns = Splitter.on(',').trimResults().omitEmptyStrings().splitToStream(indexMetadata.options.get("target")).collect(Collectors.toSet()); + Set expected = Set.of("record", "user_version"); + if (!expected.equals(columns)) + throw new IllegalArgumentException("Route index is only allowed for accord journal table, and on the record/user_value columns; given " + baseCfs.metadata() + " and columns " + columns); + } + + public IndexMetrics indexMetrics() + { + return indexMetrics; + } + + public RegisterStatus registerStatus() + { + return registerStatus; + } + + public ColumnFamilyStore baseCfs() + { + return baseCfs; + } + + @Override + public IndexMetadata getIndexMetadata() + { + return indexMetadata; + } + + @Override + public boolean shouldBuildBlocking() + { + return true; + } + + @Override + public boolean isSSTableAttached() + { + return true; + } + + @Override + public Optional getBackingTable() + { + return Optional.empty(); + } + + @Override + public Set getComponents() + { + return Collections.singleton(type.getSingleton()); + } + + @Override + public Callable getInitializationTask() + { + return () -> { + if (baseCfs.indexManager.isIndexQueryable(this)) + { + initBuildStarted = true; + return null; + } + + // stop in-progress compaction tasks to prevent compacted sstable not being indexed. + CompactionManager.instance.interruptCompactionFor(Collections.singleton(baseCfs.metadata()), + ssTableReader -> true, + true); + // Force another flush to make sure on disk index is generated for memtable data before marking it queryable. + // In the case of offline scrub, there are no live memtables. + if (!baseCfs.getTracker().getView().liveMemtables.isEmpty()) + baseCfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.INDEX_BUILD_STARTED); + + // It is now safe to flush indexes directly from flushing Memtables. + initBuildStarted = true; + + List nonIndexed = findNonIndexedSSTables(baseCfs, sstableManager); + + if (nonIndexed.isEmpty()) + return null; + + // split sorted sstables into groups with similar size and build each group in separate compaction thread + List> groups = StorageAttachedIndex.groupBySize(nonIndexed, DatabaseDescriptor.getConcurrentIndexBuilders()); + List> futures = new ArrayList<>(); + + for (List group : groups) + { + futures.add(CompactionManager.instance.submitIndexBuild(new RouteSecondaryIndexBuilder(this, sstableManager, group, false, true))); + } + + return FutureCombiner.allOf(futures).get(); + }; + } + + private List findNonIndexedSSTables(ColumnFamilyStore baseCfs, SSTableManager manager) + { + Set sstables = baseCfs.getLiveSSTables(); + + // Initialize the SSTable indexes w/ valid existing components... + manager.onSSTableChanged(Collections.emptyList(), sstables); + + // ...then identify and rebuild the SSTable indexes that are missing. + List nonIndexed = new ArrayList<>(); + + for (SSTableReader sstable : sstables) + { + if (!sstable.isMarkedCompacted() && !manager.isIndexComplete(sstable)) + { + nonIndexed.add(sstable); + } + } + + return nonIndexed; + } + + + @Override + public boolean isQueryable(Status status) + { + // consider unknown status as queryable, because gossip may not be up-to-date for newly joining nodes. + return status == Status.BUILD_SUCCEEDED || status == Status.UNKNOWN; + } + + @Override + public synchronized void register(IndexRegistry registry) + { + registry.registerIndex(this); + registerStatus = RegisterStatus.REGISTERED; + } + + @Override + public synchronized void unregister(IndexRegistry registry) + { + Index.super.unregister(registry); + registerStatus = RegisterStatus.UNREGISTERED; + } + + @Override + public Callable getTruncateTask(long truncatedAt) + { + /* + * index files will be removed as part of base sstable lifecycle in {@link LogTransaction#delete(java.io.File)} + * asynchronously, but we need to mark the index queryable because if the truncation is during the initial + * build of the index it won't get marked queryable by the build. + */ + return () -> { + logger.info("Making index queryable during table truncation"); + baseCfs.indexManager.makeIndexQueryable(this, Status.BUILD_SUCCEEDED); + return null; + }; + } + + @Override + public Callable getBlockingFlushTask() + { + return null; // storage-attached indexes are flushed alongside memtable + } + + @Override + public Callable getMetadataReloadTask(IndexMetadata indexMetadata) + { + return null; + } + + @Override + public Callable getInvalidateTask() + { + return () -> null; + } + + @Override + public void validate(PartitionUpdate update, ClientState state) throws InvalidRequestException + { + // only internal can write... so it must be valid no? + } + + @Override + public SSTableFlushObserver getFlushObserver(Descriptor descriptor, + LifecycleNewTracker tracker) + { + // mimics org.apache.cassandra.index.sai.disk.v1.V1OnDiskFormat.newPerColumnIndexWriter + IndexDescriptor id = IndexDescriptor.create(descriptor, baseCfs.getPartitioner(), baseCfs.metadata().comparator); + if (tracker.opType() != OperationType.FLUSH || !initBuildStarted) + { + return new RouteIndexFormat.SSTableIndexWriter(this, id); + } + else + { + return new RouteIndexFormat.MemtableRouteIndexWriter(id, memtableIndexManager.getPendingMemtableIndex(tracker)); + } + } + + @Override + public Indexer indexerFor(DecoratedKey key, + RegularAndStaticColumns columns, + long nowInSec, + WriteContext ctx, + IndexTransaction.Type transactionType, + Memtable memtable) + { + // since we are attached we only care about update + if (transactionType != IndexTransaction.Type.UPDATE) + return null; + return new Indexer() + { + @Override + public void insertRow(Row row) + { + long size = memtableIndexManager.index(key, row, memtable); + if (size > 0) + memtable.markExtraOnHeapUsed(size, CassandraWriteContext.fromContext(ctx).getGroup()); + } + + @Override + public void updateRow(Row oldRowData, Row newRowData) + { + insertRow(newRowData); + } + }; + } + + @Override + public boolean supportsExpression(ColumnMetadata column, Operator operator) + { + // disallow all queries, in order to interact with this index you must bypass CQL + return false; + } + + @Override + public RowFilter getPostIndexQueryFilter(RowFilter filter) + { + return RowFilter.none(); + } + + @Override + public Searcher searcherFor(ReadCommand command) + { + List expressions = command.rowFilter().getExpressions().stream().collect(Collectors.toList()); + if (expressions.isEmpty()) + return null; + ByteBuffer start = null; + boolean startInclusive = true; + ByteBuffer end = null; + boolean endInclusive = true; + Integer storeId = null; + for (RowFilter.Expression e : expressions) + { + if (e.column() == AccordJournalTable.SyntheticColumn.participants.metadata) + { + switch (e.operator()) + { + case GT: + start = e.getIndexValue(); + startInclusive = false; + break; + case GTE: + start = e.getIndexValue(); + startInclusive = true; + break; + case LT: + end = e.getIndexValue(); + endInclusive = false; + break; + case LTE: + end = e.getIndexValue(); + endInclusive = true; + break; + default: + return null; + } + } + else if (e.column() == AccordJournalTable.SyntheticColumn.store_id.metadata && e.operator() == Operator.EQ) + { + storeId = Int32Type.instance.compose(e.getIndexValue()); + } + else + { + throw new IllegalArgumentException("Unexpected expression: " + e.toCQLString()); + } + } + if (start == null || end == null || storeId == null) + return null; + if (start.equals(end)) + return keySearcher(command, storeId, start); + return rangeSearcher(command, storeId, start, startInclusive, end, endInclusive); + } + + private Searcher keySearcher(ReadCommand command, Integer storeId, ByteBuffer key) + { + return new Searcher() + { + @Override + public ReadCommand command() + { + return command; + } + + @Override + public UnfilteredPartitionIterator search(ReadExecutionController executionController) + { + // find all partitions from memtable / sstable + NavigableSet partitions = search(storeId, key); + // do SinglePartitionReadCommand per partition + return new SearchIterator(command, partitions); + } + + NavigableSet search(int storeId, ByteBuffer key) + { + TableId tableId; + byte[] start; + { + TokenKey route = OrderedRouteSerializer.deserialize(key); + tableId = route.table(); + start = OrderedRouteSerializer.serializeTokenOnly(route); + } + NavigableSet matches = sstableManager.search(storeId, tableId, start); + matches.addAll(memtableIndexManager.search(storeId, tableId, start)); + return matches; + } + }; + } + + private Searcher rangeSearcher(ReadCommand command, int storeId, ByteBuffer start, boolean startInclusive, ByteBuffer end, boolean endInclusive) + { + return new Searcher() + { + @Override + public ReadCommand command() + { + return command; + } + + @Override + public UnfilteredPartitionIterator search(ReadExecutionController executionController) + { + // find all partitions from memtable / sstable + NavigableSet partitions = search(storeId, start, startInclusive, end, endInclusive); + // do SinglePartitionReadCommand per partition + return new SearchIterator(command, partitions); + } + + NavigableSet search(int storeId, + ByteBuffer startTableWithToken, boolean startInclusive, + ByteBuffer endTableWithToken, boolean endInclusive) + { + TableId tableId; + byte[] start; + { + + TokenKey route = OrderedRouteSerializer.deserialize(startTableWithToken); + tableId = route.table(); + start = OrderedRouteSerializer.serializeTokenOnly(route); + } + byte[] end = OrderedRouteSerializer.serializeTokenOnly(OrderedRouteSerializer.deserialize(endTableWithToken)); + NavigableSet matches = sstableManager.search(storeId, tableId, start, startInclusive, end, endInclusive); + matches.addAll(memtableIndexManager.search(storeId, tableId, start, startInclusive, end, endInclusive)); + return matches; + } + }; + } + + @Override + public void handleNotification(INotification notification, Object sender) + { + // unfortunately, we can only check the type of notification via instanceof :( + if (notification instanceof SSTableAddedNotification) + { + SSTableAddedNotification notice = (SSTableAddedNotification) notification; + sstableManager.onSSTableChanged(Collections.emptySet(), notice.added); + } + else if (notification instanceof SSTableListChangedNotification) + { + SSTableListChangedNotification notice = (SSTableListChangedNotification) notification; + sstableManager.onSSTableChanged(notice.removed, notice.added); + } + else if (notification instanceof MemtableRenewedNotification) + { + memtableIndexManager.renewMemtable(((MemtableRenewedNotification) notification).renewed); + } + else if (notification instanceof MemtableDiscardedNotification) + { + memtableIndexManager.discardMemtable(((MemtableDiscardedNotification) notification).memtable); + } + } + + //TODO (coverage): everything below here never triggered... + + @Override + public boolean dependsOn(ColumnMetadata column) + { + throw new UnsupportedOperationException(); + } + + @Override + public AbstractType customExpressionValueType() + { + throw new UnsupportedOperationException(); + } + + @Override + public long getEstimatedResultRows() + { + throw new UnsupportedOperationException(); + } + + private class SearchIterator extends AbstractIterator implements UnfilteredPartitionIterator + { + private final TableMetadata metadata; + private final Iterator partitions; + + private SearchIterator(ReadCommand command, NavigableSet partitions) + { + this.metadata = command.metadata(); + this.partitions = partitions.iterator(); + } + + @Override + public TableMetadata metadata() + { + return metadata; + } + + @Override + protected UnfilteredRowIterator computeNext() + { + if (!partitions.hasNext()) + return endOfData(); + DecoratedKey pk = metadata.partitioner.decorateKey(partitions.next()); + return new UnfilteredRowIterator() + { + @Override + public DeletionTime partitionLevelDeletion() + { + return DeletionTime.LIVE; + } + + @Override + public EncodingStats stats() + { + return EncodingStats.NO_STATS; + } + + @Override + public TableMetadata metadata() + { + return metadata; + } + + @Override + public boolean isReverseOrder() + { + return false; + } + + @Override + public RegularAndStaticColumns columns() + { + return RegularAndStaticColumns.NONE; + } + + @Override + public DecoratedKey partitionKey() + { + return pk; + } + + @Override + public Row staticRow() + { + return null; + } + + @Override + public void close() + { + + } + + private Row row = BTreeRow.emptyRow(Clustering.EMPTY); + + @Override + public boolean hasNext() + { + return row != null; + } + + @Override + public Unfiltered next() + { + Row row = this.row; + this.row = null; + return row; + } + }; + } + + @Override + public void close() + { + + } + } +} diff --git a/src/java/org/apache/cassandra/index/accord/RouteMemtableIndexManager.java b/src/java/org/apache/cassandra/index/accord/RouteMemtableIndexManager.java new file mode 100644 index 000000000000..f979ea5cfe26 --- /dev/null +++ b/src/java/org/apache/cassandra/index/accord/RouteMemtableIndexManager.java @@ -0,0 +1,119 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.index.accord; + +import java.nio.ByteBuffer; +import java.util.NavigableSet; +import java.util.TreeSet; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentMap; +import java.util.concurrent.TimeUnit; + +import org.apache.cassandra.db.DecoratedKey; +import org.apache.cassandra.db.lifecycle.LifecycleNewTracker; +import org.apache.cassandra.db.memtable.Memtable; +import org.apache.cassandra.db.rows.Row; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.service.accord.AccordKeyspace; +import org.apache.cassandra.service.accord.JournalKey; + +import static org.apache.cassandra.utils.Clock.Global.nanoTime; + +public class RouteMemtableIndexManager implements MemtableIndexManager +{ + private final ConcurrentMap liveMemtableIndexMap = new ConcurrentHashMap<>(); + private final RouteJournalIndex index; + + public RouteMemtableIndexManager(RouteJournalIndex index) + { + this.index = index; + } + + @Override + public long index(DecoratedKey key, Row row, Memtable mt) + { + if (row.isStatic()) + return 0; + JournalKey journalKey = AccordKeyspace.JournalColumns.getJournalKey(key); + if (!RouteJournalIndex.allowed(journalKey)) + return 0; + //TODO (performance): we dropped jdk8 and this was fixed in jdk8... so do we need to do this still? + MemtableIndex current = liveMemtableIndexMap.get(mt); + + // We expect the relevant IndexMemtable to be present most of the time, so only make the + // call to computeIfAbsent() if it's not. (see https://bugs.openjdk.java.net/browse/JDK-8161372) + MemtableIndex target = (current != null) + ? current + : liveMemtableIndexMap.computeIfAbsent(mt, memtable -> new MemtableIndex()); + + long start = nanoTime(); + + long bytes = 0; + + ByteBuffer value = RouteIndexFormat.extractParticipants(index, journalKey.id, row); + bytes += target.index(key, row.clustering(), value); + index.indexMetrics().memtableIndexWriteLatency.update(nanoTime() - start, TimeUnit.NANOSECONDS); + return bytes; + } + + @Override + public MemtableIndex getPendingMemtableIndex(LifecycleNewTracker tracker) + { + return liveMemtableIndexMap.keySet().stream() + .filter(m -> tracker.equals(m.getFlushTransaction())) + .findFirst() + .map(liveMemtableIndexMap::get) + .orElse(null); + } + + @Override + public void discardMemtable(Memtable memtable) + { + liveMemtableIndexMap.remove(memtable); + } + + @Override + public void renewMemtable(Memtable renewed) + { + for (Memtable memtable : liveMemtableIndexMap.keySet()) + { + // remove every index but the one that corresponds to the post-truncate Memtable + if (renewed != memtable) + { + liveMemtableIndexMap.remove(memtable); + } + } + } + + @Override + public NavigableSet search(int storeId, TableId tableId, byte[] start, boolean startInclusive, byte[] end, boolean endInclusive) + { + TreeSet matches = new TreeSet<>(); + liveMemtableIndexMap.values().forEach(m -> matches.addAll(m.search(storeId, tableId, start, startInclusive, end, endInclusive))); + return matches; + } + + @Override + public NavigableSet search(int storeId, TableId tableId, byte[] key) + { + TreeSet matches = new TreeSet<>(); + liveMemtableIndexMap.values().forEach(m -> matches.addAll(m.search(storeId, tableId, key))); + return matches; + } +} diff --git a/src/java/org/apache/cassandra/index/accord/RouteSSTableManager.java b/src/java/org/apache/cassandra/index/accord/RouteSSTableManager.java new file mode 100644 index 000000000000..3fdb88449479 --- /dev/null +++ b/src/java/org/apache/cassandra/index/accord/RouteSSTableManager.java @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.index.accord; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.NavigableSet; +import java.util.TreeSet; + +import org.apache.cassandra.io.sstable.format.SSTableReader; +import org.apache.cassandra.schema.TableId; + +public class RouteSSTableManager implements SSTableManager +{ + private final Map sstables = new HashMap<>(); + + @Override + public synchronized void onSSTableChanged(Collection removed, Iterable added) + { + //TODO (performance): most added tables will have 0 segmenets, so exclude those from search + removed.forEach(s -> { + SSTableIndex index = sstables.remove(s); + if (index != null) + { + index.close(); + index.id.deleteIndex(); + } + }); + + List notComplete = null; + for (SSTableReader sstable : added) + { + IndexDescriptor id = IndexDescriptor.create(sstable); + if (!id.isIndexBuildComplete()) + { + if (notComplete == null) notComplete = new ArrayList<>(); + notComplete.add(sstable); + continue; + } + try + { + sstables.put(sstable, SSTableIndex.create(id)); + } + catch (IOException e) + { + notComplete.add(sstable); + } + } + if (notComplete != null) + throw new IllegalArgumentException("SStables were added without an index... " + notComplete); + } + + @Override + public synchronized boolean isIndexComplete(SSTableReader reader) + { + return sstables.containsKey(reader); + } + + @Override + public synchronized NavigableSet search(int storeId, TableId tableId, + byte[] start, boolean startInclusive, + byte[] end, boolean endInclusive) + { + Group group = new Group(storeId, tableId); + TreeSet matches = new TreeSet<>(); + for (SSTableIndex index : sstables.values()) + matches.addAll(index.search(group, start, startInclusive, end, endInclusive)); + return matches; + } + + @Override + public synchronized NavigableSet search(int storeId, TableId tableId, byte[] key) + { + Group group = new Group(storeId, tableId); + TreeSet matches = new TreeSet<>(); + for (SSTableIndex index : sstables.values()) + matches.addAll(index.search(group, key)); + return matches; + } +} diff --git a/src/java/org/apache/cassandra/index/accord/RouteSecondaryIndexBuilder.java b/src/java/org/apache/cassandra/index/accord/RouteSecondaryIndexBuilder.java new file mode 100644 index 000000000000..428ed774c04e --- /dev/null +++ b/src/java/org/apache/cassandra/index/accord/RouteSecondaryIndexBuilder.java @@ -0,0 +1,239 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.index.accord; + +import java.util.Collections; +import java.util.List; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.cassandra.db.DecoratedKey; +import org.apache.cassandra.db.compaction.CompactionInfo; +import org.apache.cassandra.db.compaction.CompactionInterruptedException; +import org.apache.cassandra.db.compaction.OperationType; +import org.apache.cassandra.db.lifecycle.LifecycleTransaction; +import org.apache.cassandra.db.lifecycle.Tracker; +import org.apache.cassandra.index.SecondaryIndexBuilder; +import org.apache.cassandra.io.sstable.KeyIterator; +import org.apache.cassandra.io.sstable.SSTableFlushObserver; +import org.apache.cassandra.io.sstable.SSTableIdentityIterator; +import org.apache.cassandra.io.sstable.format.SSTableReader; +import org.apache.cassandra.io.util.RandomAccessReader; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.Throwables; +import org.apache.cassandra.utils.TimeUUID; +import org.apache.cassandra.utils.concurrent.Ref; + +import static org.apache.cassandra.utils.TimeUUID.Generator.nextTimeUUID; + +public class RouteSecondaryIndexBuilder extends SecondaryIndexBuilder +{ + private static final Logger logger = LoggerFactory.getLogger(RouteSecondaryIndexBuilder.class); + + private final TimeUUID compactionId = nextTimeUUID(); + private final RouteJournalIndex index; + private final TableMetadata metadata; + private final Tracker tracker; + private final SSTableManager sstableManager; + private final List sstables; + private final boolean isFullRebuild; + private final boolean isInitialBuild; + private final long totalSizeInBytes; + private long bytesProcessed = 0; + + public RouteSecondaryIndexBuilder(RouteJournalIndex index, + SSTableManager sstableManager, + List sstables, + boolean isFullRebuild, + boolean isInitialBuild) + { + this.index = index; + this.metadata = index.baseCfs().metadata(); + this.tracker = index.baseCfs().getTracker(); + this.sstableManager = sstableManager; + this.sstables = sstables; + this.isFullRebuild = isFullRebuild; + this.isInitialBuild = isInitialBuild; + this.totalSizeInBytes = sstables.stream().mapToLong(SSTableReader::uncompressedLength).sum(); + } + + @Override + public CompactionInfo getCompactionInfo() + { + return new CompactionInfo(metadata, + OperationType.INDEX_BUILD, + bytesProcessed, + totalSizeInBytes, + compactionId, + sstables); + } + + @Override + public void build() + { + if (!validateIndexes()) + return; + for (SSTableReader sstable : sstables) + { + if (indexSSTable(sstable)) + return; + } + } + + /** + * @return true if index build should be stopped + */ + private boolean indexSSTable(SSTableReader sstable) + { + logger.debug("Starting index build on {}", sstable.descriptor); + + RouteIndexFormat.SSTableIndexWriter indexWriter = null; + + Ref ref = sstable.tryRef(); + if (ref == null) + { + logger.warn("Couldn't acquire reference to the SSTable {}. It may have been removed.", sstable.descriptor); + return false; + } + + try (RandomAccessReader dataFile = sstable.openDataReader(); + LifecycleTransaction txn = LifecycleTransaction.offline(OperationType.INDEX_BUILD, sstable)) + { + // remove existing per column index files instead of overwriting + IndexDescriptor indexDescriptor = IndexDescriptor.create(sstable); + indexDescriptor.deleteIndex(); + + indexWriter = new RouteIndexFormat.SSTableIndexWriter(index, indexDescriptor); + indexWriter.begin(); + + long previousBytesRead = 0; + + try (KeyIterator keys = sstable.keyIterator()) + { + while (keys.hasNext()) + { + if (isStopRequested()) + { + logger.debug("Index build has been stopped"); + throw new CompactionInterruptedException(getCompactionInfo()); + } + + DecoratedKey key = keys.next(); + + indexWriter.startPartition(key, -1, -1); + + long position = sstable.getPosition(key, SSTableReader.Operator.EQ); + dataFile.seek(position); + ByteBufferUtil.readWithShortLength(dataFile); // key + + try (SSTableIdentityIterator partition = SSTableIdentityIterator.create(sstable, dataFile, key)) + { + // if the row has statics attached, it has to be indexed separately + if (metadata.hasStaticColumns()) + indexWriter.nextUnfilteredCluster(partition.staticRow()); + + while (partition.hasNext()) + indexWriter.nextUnfilteredCluster(partition.next()); + } + long bytesRead = keys.getBytesRead(); + bytesProcessed += bytesRead - previousBytesRead; + previousBytesRead = bytesRead; + } + + completeSSTable(indexDescriptor, indexWriter, sstable); + } + + return false; + } + catch (Throwable t) + { + if (indexWriter != null) + { + indexWriter.abort(t, true); + } + + if (t instanceof InterruptedException) + { + logger.warn("Interrupted while building indexes on SSTable {}", sstable.descriptor); + Thread.currentThread().interrupt(); + return true; + } + else if (t instanceof CompactionInterruptedException) + { + //TODO Shouldn't do this if the stop was interrupted by a truncate + if (isInitialBuild) + { + logger.error("Stop requested while building initial indexes on SSTable {}.", sstable.descriptor); + throw Throwables.unchecked(t); + } + else + { + logger.info("Stop requested while building indexes on SSTable {}.", sstable.descriptor); + return true; + } + } + else + { + logger.error("Unable to build indexes on SSTable {}. Cause: {}.", sstable, t.getMessage()); + throw Throwables.unchecked(t); + } + } + finally + { + ref.release(); + } + } + + private void completeSSTable(IndexDescriptor indexDescriptor, SSTableFlushObserver indexWriter, SSTableReader sstable) + { + indexWriter.complete(); + + if (!validateIndexes()) + { + logger.debug("dropped during index build"); + return; + } + + // register custom index components into existing sstables + sstable.registerComponents(indexDescriptor.getLiveSSTableComponents(), tracker); + sstableManager.onSSTableChanged(Collections.emptyList(), Collections.singleton(sstable)); + } + + /** + * In case of full rebuild, stop the index build if any index is dropped. + * Otherwise, skip dropped indexes to avoid exception during repair/streaming. + */ + private boolean validateIndexes() + { + switch (index.registerStatus()) + { + case PENDING: throw new IllegalStateException("Unable to build indexes if the index has not been registered"); + case REGISTERED: return true; + case UNREGISTERED: break; + default: throw new AssertionError("Unknown status: " + index.registerStatus()); + } + + if (isFullRebuild) + throw new RuntimeException(String.format("%s are dropped, will stop index build.", index.getIndexMetadata().name)); + + return false; + } +} diff --git a/src/java/org/apache/cassandra/index/accord/SSTableIndex.java b/src/java/org/apache/cassandra/index/accord/SSTableIndex.java new file mode 100644 index 000000000000..03c669f4bd38 --- /dev/null +++ b/src/java/org/apache/cassandra/index/accord/SSTableIndex.java @@ -0,0 +1,186 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.index.accord; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Collection; +import java.util.Collections; +import java.util.EnumMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Set; +import java.util.stream.Collectors; + +import org.apache.cassandra.index.accord.CheckpointIntervalArrayIndex.SegmentSearcher; +import org.apache.cassandra.index.accord.IndexDescriptor.IndexComponent; +import org.apache.cassandra.io.FSReadError; +import org.apache.cassandra.io.util.FileHandle; +import org.apache.cassandra.utils.ByteArrayUtil; +import org.apache.cassandra.utils.concurrent.RefCounted; +import org.apache.cassandra.utils.concurrent.SharedCloseableImpl; + +public class SSTableIndex extends SharedCloseableImpl +{ + public final IndexDescriptor id; + private final Map files; + private final List segments; + + private SSTableIndex(IndexDescriptor id, + Map files, + List segments, + Cleanup cleanup) + { + super(cleanup); + this.id = id; + this.files = files; + this.segments = segments; + } + + public SSTableIndex(SSTableIndex copy) + { + super(copy); + this.id = copy.id; + this.files = copy.files; + this.segments = copy.segments; + } + + @Override + public SSTableIndex sharedCopy() + { + return new SSTableIndex(this); + } + + public static SSTableIndex create(IndexDescriptor id) throws IOException + { + Map files = new EnumMap<>(IndexComponent.class); + for (IndexComponent c : id.getLiveComponents()) + files.put(c, new FileHandle.Builder(id.fileFor(c)).mmapped(true).complete()); + List segments = RouteIndexFormat.readSegments(files); + files.remove(IndexComponent.SEGMENT).close(); + files.remove(IndexComponent.METADATA).close(); + Cleanup cleanup = new Cleanup(files); + return new SSTableIndex(id, files, segments, cleanup); + } + + public Collection search(Group group, byte[] key) + { + List matches = segments.stream().filter(s -> { + Segment.Metadata metadata = s.groups.get(group); + if (metadata == null) return false; + return ByteArrayUtil.compareUnsigned(metadata.minTerm, key) < 0 + && ByteArrayUtil.compareUnsigned(metadata.maxTerm, key) >= 0; + }) + .collect(Collectors.toList()); + if (matches.isEmpty()) return Collections.emptyList(); + if (matches.size() == 1) return search(matches.get(0), group, key); + Set found = new HashSet<>(); + for (Segment s : matches) + found.addAll(search(s, group, key)); + return found; + } + + private Collection search(Segment segment, Group group, byte[] key) + { + Set matches = new HashSet<>(); + Segment.Metadata metadata = Objects.requireNonNull(segment.groups.get(group), () -> "Unknown group: " + group); + try + { + SegmentSearcher searcher = new SegmentSearcher(fileFor(IndexComponent.CINTIA_SORTED_LIST), metadata.metas.get(IndexComponent.CINTIA_SORTED_LIST).offset, + fileFor(IndexComponent.CINTIA_CHECKPOINTS), metadata.metas.get(IndexComponent.CINTIA_CHECKPOINTS).offset); + searcher.contains(key, interval -> matches.add(ByteBuffer.wrap(interval.value))); + } + catch (IOException e) + { + throw new FSReadError(e, id.fileFor(IndexComponent.CINTIA_SORTED_LIST)); + } + return matches; + } + + public Collection search(Group group, byte[] start, boolean startInclusive, byte[] end, boolean endInclusive) + { + List matches = segments.stream().filter(s -> { + Segment.Metadata metadata = s.groups.get(group); + if (metadata == null) return false; + if (ByteArrayUtil.compareUnsigned(metadata.minTerm, end) >= 0) + return false; + if (ByteArrayUtil.compareUnsigned(metadata.maxTerm, start) <= 0) + return false; + return true; + }) + .collect(Collectors.toList()); + if (matches.isEmpty()) return Collections.emptyList(); + if (matches.size() == 1) return search(matches.get(0), group, start, startInclusive, end, endInclusive); + Set found = new HashSet<>(); + for (Segment s : matches) + found.addAll(search(s, group, start, startInclusive, end, endInclusive)); + return found; + } + + private Collection search(Segment segment, Group group, byte[] start, boolean startInclusive, byte[] end, boolean endInclusive) + { + Set matches = new HashSet<>(); + Segment.Metadata metadata = Objects.requireNonNull(segment.groups.get(group), () -> "Unknown group: " + group); + try + { + SegmentSearcher searcher = new SegmentSearcher(fileFor(IndexComponent.CINTIA_SORTED_LIST), metadata.metas.get(IndexComponent.CINTIA_SORTED_LIST).offset, + fileFor(IndexComponent.CINTIA_CHECKPOINTS), metadata.metas.get(IndexComponent.CINTIA_CHECKPOINTS).offset); + searcher.intersects(start, end, interval -> matches.add(ByteBuffer.wrap(interval.value))); + } + catch (IOException e) + { + throw new FSReadError(e, id.fileFor(IndexComponent.CINTIA_SORTED_LIST)); + } + return matches; + } + + private FileHandle fileFor(IndexComponent c) + { + return Objects.requireNonNull(files.get(c), () -> "Unknown component: " + c); + } + + private static class Cleanup implements RefCounted.Tidy + { + private final Map files; + + private Cleanup(Map files) + { + this.files = files; + } + + @Override + public void tidy() throws Exception + { + for (IndexComponent c : IndexComponent.values()) + { + FileHandle fh = files.remove(c); + if (fh == null) continue; + fh.close(); + } + } + + @Override + public String name() + { + return "SSTableIndex Cleanup"; + } + } +} diff --git a/src/java/org/apache/cassandra/index/accord/SSTableManager.java b/src/java/org/apache/cassandra/index/accord/SSTableManager.java new file mode 100644 index 000000000000..5513466ee752 --- /dev/null +++ b/src/java/org/apache/cassandra/index/accord/SSTableManager.java @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.index.accord; + +import java.nio.ByteBuffer; +import java.util.Collection; +import java.util.NavigableSet; + +import org.apache.cassandra.io.sstable.format.SSTableReader; +import org.apache.cassandra.schema.TableId; + +public interface SSTableManager +{ + void onSSTableChanged(Collection removed, Iterable added); + boolean isIndexComplete(SSTableReader reader); + + NavigableSet search(int storeId, TableId tableId, byte[] start, boolean startInclusive, byte[] end, boolean endInclusive); + NavigableSet search(int storeId, TableId tableId, byte[] key); +} diff --git a/src/java/org/apache/cassandra/index/accord/Segment.java b/src/java/org/apache/cassandra/index/accord/Segment.java new file mode 100644 index 000000000000..b6d0ffa08529 --- /dev/null +++ b/src/java/org/apache/cassandra/index/accord/Segment.java @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.index.accord; + +import java.util.Collections; +import java.util.EnumMap; +import java.util.Map; + +public class Segment +{ + public static final Segment EMPTY = new Segment(Collections.emptyMap()); + + public final Map groups; + + public Segment(Map groups) + { + this.groups = groups; + } + + public static class ComponentMetadata + { + public final long offset, endOffset; + + public ComponentMetadata(long offset, long endOffset) + { + this.offset = offset; + this.endOffset = endOffset; + } + } + + public static class Metadata + { + public final EnumMap metas; + public final byte[] minTerm, maxTerm; + + public Metadata(EnumMap metas, byte[] minTerm, byte[] maxTerm) + { + this.metas = metas; + this.minTerm = minTerm; + this.maxTerm = maxTerm; + } + } +} diff --git a/src/java/org/apache/cassandra/index/internal/composites/CompositesSearcher.java b/src/java/org/apache/cassandra/index/internal/composites/CompositesSearcher.java index 8f357027e957..6629c0625640 100644 --- a/src/java/org/apache/cassandra/index/internal/composites/CompositesSearcher.java +++ b/src/java/org/apache/cassandra/index/internal/composites/CompositesSearcher.java @@ -21,17 +21,29 @@ import java.util.ArrayList; import java.util.List; -import org.apache.cassandra.schema.TableMetadata; -import org.apache.cassandra.db.*; +import org.apache.cassandra.db.Clustering; +import org.apache.cassandra.db.ClusteringComparator; +import org.apache.cassandra.db.DecoratedKey; +import org.apache.cassandra.db.DeletionTime; +import org.apache.cassandra.db.ReadCommand; +import org.apache.cassandra.db.ReadExecutionController; +import org.apache.cassandra.db.SinglePartitionReadCommand; +import org.apache.cassandra.db.WriteContext; import org.apache.cassandra.db.filter.ClusteringIndexNamesFilter; import org.apache.cassandra.db.filter.DataLimits; import org.apache.cassandra.db.filter.RowFilter; import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator; -import org.apache.cassandra.db.rows.*; +import org.apache.cassandra.db.rows.Row; +import org.apache.cassandra.db.rows.RowIterator; +import org.apache.cassandra.db.rows.Rows; +import org.apache.cassandra.db.rows.UnfilteredRowIterator; +import org.apache.cassandra.db.rows.UnfilteredRowIterators; import org.apache.cassandra.db.transform.Transformation; +import org.apache.cassandra.index.Index; import org.apache.cassandra.index.internal.CassandraIndex; import org.apache.cassandra.index.internal.CassandraIndexSearcher; import org.apache.cassandra.index.internal.IndexEntry; +import org.apache.cassandra.schema.TableMetadata; import org.apache.cassandra.utils.btree.BTreeSet; @@ -159,7 +171,7 @@ private boolean prepareNext() DataLimits.NONE, partitionKey, filter, - null); + (Index.QueryPlan) null); } // by the next caller of next, or through closing this iterator is this come before. diff --git a/src/java/org/apache/cassandra/index/internal/keys/KeysSearcher.java b/src/java/org/apache/cassandra/index/internal/keys/KeysSearcher.java index 9baa6f6f47c5..98e4e0216040 100644 --- a/src/java/org/apache/cassandra/index/internal/keys/KeysSearcher.java +++ b/src/java/org/apache/cassandra/index/internal/keys/KeysSearcher.java @@ -22,12 +22,22 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.cassandra.db.*; +import org.apache.cassandra.db.Clustering; +import org.apache.cassandra.db.DecoratedKey; +import org.apache.cassandra.db.DeletionTime; +import org.apache.cassandra.db.ReadCommand; +import org.apache.cassandra.db.ReadExecutionController; +import org.apache.cassandra.db.SinglePartitionReadCommand; +import org.apache.cassandra.db.WriteContext; import org.apache.cassandra.db.filter.ColumnFilter; import org.apache.cassandra.db.filter.DataLimits; import org.apache.cassandra.db.filter.RowFilter; import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator; -import org.apache.cassandra.db.rows.*; +import org.apache.cassandra.db.rows.Row; +import org.apache.cassandra.db.rows.RowIterator; +import org.apache.cassandra.db.rows.Rows; +import org.apache.cassandra.db.rows.UnfilteredRowIterator; +import org.apache.cassandra.index.Index; import org.apache.cassandra.index.internal.CassandraIndex; import org.apache.cassandra.index.internal.CassandraIndexSearcher; import org.apache.cassandra.schema.TableMetadata; @@ -91,7 +101,7 @@ private boolean prepareNext() DataLimits.NONE, key, command.clusteringIndexFilter(key), - null); + (Index.QueryPlan) null); // Otherwise, we close right away if empty, and if it's assigned to next it will be called either // by the next caller of next, or through closing this iterator is this come before. diff --git a/src/java/org/apache/cassandra/index/sai/StorageAttachedIndexGroup.java b/src/java/org/apache/cassandra/index/sai/StorageAttachedIndexGroup.java index bddeacecc7cd..02e7971814bf 100644 --- a/src/java/org/apache/cassandra/index/sai/StorageAttachedIndexGroup.java +++ b/src/java/org/apache/cassandra/index/sai/StorageAttachedIndexGroup.java @@ -177,7 +177,7 @@ public Index.Indexer indexerFor(Predicate indexSelector, public void insertRow(Row row) { // SAI does not index deletions, as these are resolved during post-filtering. - if (row.deletion().isLive()) + if (row.hasLiveData(nowInSec, false)) for (Index.Indexer indexer : indexers) indexer.insertRow(row); } @@ -185,8 +185,10 @@ public void insertRow(Row row) @Override public void updateRow(Row oldRow, Row newRow) { - for (Index.Indexer indexer : indexers) - indexer.updateRow(oldRow, newRow); + // SAI does not index deletions, as these are resolved during post-filtering. + if (newRow.hasLiveData(nowInSec, false)) + for (Index.Indexer indexer : indexers) + indexer.updateRow(oldRow, newRow); } }; } diff --git a/src/java/org/apache/cassandra/index/sai/disk/StorageAttachedIndexWriter.java b/src/java/org/apache/cassandra/index/sai/disk/StorageAttachedIndexWriter.java index f35ae67f93c2..341bcaac5e37 100644 --- a/src/java/org/apache/cassandra/index/sai/disk/StorageAttachedIndexWriter.java +++ b/src/java/org/apache/cassandra/index/sai/disk/StorageAttachedIndexWriter.java @@ -37,6 +37,7 @@ import org.apache.cassandra.index.sai.disk.format.IndexDescriptor; import org.apache.cassandra.index.sai.utils.PrimaryKey; import org.apache.cassandra.io.sstable.SSTableFlushObserver; +import org.apache.cassandra.utils.FBUtilities; import org.apache.cassandra.utils.Throwables; /** @@ -52,6 +53,8 @@ public class StorageAttachedIndexWriter implements SSTableFlushObserver private final PerSSTableIndexWriter perSSTableWriter; private final Stopwatch stopwatch = Stopwatch.createUnstarted(); private final RowMapping rowMapping; + private final long nowInSeconds = FBUtilities.nowInSeconds(); + private DecoratedKey currentKey; private boolean tokenOffsetWriterCompleted = false; private boolean aborted = false; @@ -126,9 +129,14 @@ public void nextUnfilteredCluster(Unfiltered unfiltered) if (!unfiltered.isRow()) return; + // Ignore rows with no live data... + Row row = (Row) unfiltered; + if (!row.hasLiveData(nowInSeconds, false)) + return; + try { - addRow((Row)unfiltered); + addRow(row); } catch (Throwable t) { diff --git a/src/java/org/apache/cassandra/index/sai/disk/v1/MemtableIndexWriter.java b/src/java/org/apache/cassandra/index/sai/disk/v1/MemtableIndexWriter.java index 0650e9b9d6dc..c5f833f4ac16 100644 --- a/src/java/org/apache/cassandra/index/sai/disk/v1/MemtableIndexWriter.java +++ b/src/java/org/apache/cassandra/index/sai/disk/v1/MemtableIndexWriter.java @@ -91,14 +91,16 @@ public void addRow(PrimaryKey key, Row row, long sstableRowId) // keys and row IDs in the flushing SSTable. This writer, therefore, does nothing in // response to the flushing of individual rows except for keeping index-specific statistics. boolean isStatic = indexTermType.columnMetadata().isStatic(); + boolean isPartitionKey = indexTermType.columnMetadata().isPartitionKey(); // Indexes on static columns should only track static rows, and indexes on non-static columns // should only track non-static rows. (Within a partition, the row ID for a static row will always - // come before any non-static row.) - if (key.kind() == PrimaryKey.Kind.STATIC && isStatic || key.kind() != PrimaryKey.Kind.STATIC && !isStatic) + // come before any non-static row.) The only exception to this is indexes on partition key elements. + if ((key.kind() == PrimaryKey.Kind.STATIC && (isStatic || isPartitionKey)) || key.kind() != PrimaryKey.Kind.STATIC && !isStatic) { if (minKey == null) minKey = key; + maxKey = key; rowCount++; maxSSTableRowId = Math.max(maxSSTableRowId, sstableRowId); @@ -144,12 +146,15 @@ public void complete(Stopwatch stopwatch) throws IOException { final Iterator> iterator = rowMapping.merge(memtable); - try (MemtableTermsIterator terms = new MemtableTermsIterator(memtable.getMinTerm(), memtable.getMaxTerm(), iterator)) + long cellCount = 0; + if (iterator.hasNext()) { - long cellCount = flush(terms); - - completeIndexFlush(cellCount, start, stopwatch); + try (MemtableTermsIterator terms = new MemtableTermsIterator(memtable.getMinTerm(), memtable.getMaxTerm(), iterator)) + { + cellCount = flush(terms); + } } + completeIndexFlush(cellCount, start, stopwatch); } } catch (Throwable t) @@ -215,8 +220,8 @@ private void flushVectorIndex(long startTime, Stopwatch stopwatch) throws IOExce private void completeIndexFlush(long cellCount, long startTime, Stopwatch stopwatch) throws IOException { - // create a completion marker indicating that the index is complete and not-empty - ColumnCompletionMarkerUtil.create(indexDescriptor, indexIdentifier, false); + // create a completion marker indicating that the index is complete + ColumnCompletionMarkerUtil.create(indexDescriptor, indexIdentifier, cellCount == 0); indexMetrics.memtableIndexFlushCount.inc(); diff --git a/src/java/org/apache/cassandra/index/sai/iterators/KeyRangeIntersectionIterator.java b/src/java/org/apache/cassandra/index/sai/iterators/KeyRangeIntersectionIterator.java index 6237aa013180..4909033bf86e 100644 --- a/src/java/org/apache/cassandra/index/sai/iterators/KeyRangeIntersectionIterator.java +++ b/src/java/org/apache/cassandra/index/sai/iterators/KeyRangeIntersectionIterator.java @@ -375,6 +375,10 @@ public void update(KeyRangeIterator range) min = nullSafeMax(min, range.getMinimum()); // maximum of the intersection is the smallest maximum of individual iterators max = nullSafeMin(max, range.getMaximum()); + + // With STATIC keys, it is possible for the min to overtake the max, which must be corrected. + min = nullSafeMin(min, max); + if (empty) { empty = false; diff --git a/src/java/org/apache/cassandra/index/sai/memory/TrieMemoryIndex.java b/src/java/org/apache/cassandra/index/sai/memory/TrieMemoryIndex.java index 4798393fd799..c531ba6696c3 100644 --- a/src/java/org/apache/cassandra/index/sai/memory/TrieMemoryIndex.java +++ b/src/java/org/apache/cassandra/index/sai/memory/TrieMemoryIndex.java @@ -23,13 +23,13 @@ import java.util.Map; import java.util.PriorityQueue; import java.util.SortedSet; +import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.LongAdder; import java.util.function.Function; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import io.netty.util.concurrent.FastThreadLocal; import org.apache.cassandra.db.Clustering; import org.apache.cassandra.db.DecoratedKey; import org.apache.cassandra.db.PartitionPosition; @@ -59,6 +59,7 @@ public class TrieMemoryIndex extends MemoryIndex { private static final Logger logger = LoggerFactory.getLogger(TrieMemoryIndex.class); private static final int MAX_RECURSIVE_KEY_LENGTH = 128; + private static final int MINIMUM_PRIORITY_QUEUE_SIZE = 128; private final InMemoryTrie data; private final PrimaryKeysReducer primaryKeysReducer; @@ -66,6 +67,11 @@ public class TrieMemoryIndex extends MemoryIndex private ByteBuffer minTerm; private ByteBuffer maxTerm; + // Maintain the last queue size used on this index to use for the next range match. + // This allows for receiving a stream of wide range queries where the queue size + // is larger than we would want to default the size to. + private final AtomicInteger lastPriorityQueueSize = new AtomicInteger(MINIMUM_PRIORITY_QUEUE_SIZE); + public TrieMemoryIndex(StorageAttachedIndex index) { super(index); @@ -142,7 +148,11 @@ public KeyRangeIterator search(QueryContext queryContext, Expression expression, case CONTAINS_VALUE: return exactMatch(expression, keyRange); case RANGE: - return rangeMatch(expression, keyRange); + KeyRangeIterator keyIterator = rangeMatch(expression, keyRange); + int keyCount = (int) keyIterator.getMaxKeys(); + if (keyCount > MINIMUM_PRIORITY_QUEUE_SIZE) + lastPriorityQueueSize.set(keyCount); + return keyIterator; default: throw new IllegalArgumentException("Unsupported expression: " + expression); } @@ -251,29 +261,15 @@ private KeyRangeIterator exactMatch(Expression expression, AbstractBounds lastQueueSize = new FastThreadLocal<>() - { - protected Integer initialValue() - { - return MINIMUM_QUEUE_SIZE; - } - }; + final PriorityQueue mergedKeys; + final AbstractBounds keyRange; - PrimaryKey minimumKey = null; PrimaryKey maximumKey = null; - final PriorityQueue mergedKeys = new PriorityQueue<>(lastQueueSize.get()); - final AbstractBounds keyRange; - - public Collector(AbstractBounds keyRange) + public Collector(AbstractBounds keyRange, int expectedKeys) { this.keyRange = keyRange; + this.mergedKeys = new PriorityQueue<>(expectedKeys); } public void processContent(PrimaryKeys keys) @@ -295,12 +291,8 @@ public void processContent(PrimaryKeys keys) || primaryKeys.last().partitionKey().compareTo(keyRange.left) < 0) return; - primaryKeys.forEach(this::processKey); - } - - public void updateLastQueueSize() - { - lastQueueSize.set(Math.max(MINIMUM_QUEUE_SIZE, mergedKeys.size())); + for (PrimaryKey primaryKey : primaryKeys) + processKey(primaryKey); } private void processKey(PrimaryKey key) @@ -309,7 +301,7 @@ private void processKey(PrimaryKey key) { mergedKeys.add(key); - minimumKey = minimumKey == null ? key : key.compareTo(minimumKey) < 0 ? key : minimumKey; + // We only track the maximum key, as the minimum can be peeked in constant time on the PQ itself. maximumKey = maximumKey == null ? key : key.compareTo(maximumKey) > 0 ? key : maximumKey; } } @@ -341,20 +333,16 @@ private KeyRangeIterator rangeMatch(Expression expression, AbstractBounds values = data.subtrie(lowerBound, lowerInclusive, upperBound, upperInclusive).valueIterator(); - data.subtrie(lowerBound, lowerInclusive, upperBound, upperInclusive) - .values() - .forEach(cd::processContent); + while (values.hasNext()) + cd.processContent(values.next()); if (cd.mergedKeys.isEmpty()) - { return KeyRangeIterator.empty(); - } - - cd.updateLastQueueSize(); - return new InMemoryKeyRangeIterator(cd.minimumKey, cd.maximumKey, cd.mergedKeys); + return new InMemoryKeyRangeIterator(cd.mergedKeys.peek(), cd.maximumKey, cd.mergedKeys); } private static class PrimaryKeysReducer implements InMemoryTrie.UpsertTransformer diff --git a/src/java/org/apache/cassandra/index/sai/plan/QueryController.java b/src/java/org/apache/cassandra/index/sai/plan/QueryController.java index 49f5a76628f4..53cf23c0d9c5 100644 --- a/src/java/org/apache/cassandra/index/sai/plan/QueryController.java +++ b/src/java/org/apache/cassandra/index/sai/plan/QueryController.java @@ -455,7 +455,12 @@ private ClusteringIndexFilter makeFilter(List keys) { nextClusterings.clear(); for (PrimaryKey key : keys) - nextClusterings.add(key.clustering()); + { + // primary keys privided by SAI may contain NativeCustering + // filter logic may use ValueAccessor.factory() for slicing, which is not supported for NativeCustering + Clustering clustering = key.clustering().ensureAccessorFactorySupport(); + nextClusterings.add(clustering); + } return new ClusteringIndexNamesFilter(nextClusterings, clusteringIndexFilter.isReversed()); } } diff --git a/src/java/org/apache/cassandra/index/sai/plan/StorageAttachedIndexSearcher.java b/src/java/org/apache/cassandra/index/sai/plan/StorageAttachedIndexSearcher.java index 858242006aa2..20a9cad58c45 100644 --- a/src/java/org/apache/cassandra/index/sai/plan/StorageAttachedIndexSearcher.java +++ b/src/java/org/apache/cassandra/index/sai/plan/StorageAttachedIndexSearcher.java @@ -18,6 +18,7 @@ package org.apache.cassandra.index.sai.plan; +import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.Collections; import java.util.HashSet; @@ -32,6 +33,8 @@ import io.netty.util.concurrent.FastThreadLocal; import org.apache.cassandra.db.Clustering; +import org.apache.cassandra.db.ClusteringBound; +import org.apache.cassandra.db.ClusteringComparator; import org.apache.cassandra.db.ColumnFamilyStore; import org.apache.cassandra.db.DataRange; import org.apache.cassandra.db.DecoratedKey; @@ -39,6 +42,10 @@ import org.apache.cassandra.db.ReadCommand; import org.apache.cassandra.db.ReadExecutionController; import org.apache.cassandra.db.RegularAndStaticColumns; +import org.apache.cassandra.db.Slices; +import org.apache.cassandra.db.filter.ClusteringIndexFilter; +import org.apache.cassandra.db.filter.ClusteringIndexNamesFilter; +import org.apache.cassandra.db.filter.ClusteringIndexSliceFilter; import org.apache.cassandra.db.filter.RowFilter; import org.apache.cassandra.db.partitions.PartitionIterator; import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator; @@ -138,6 +145,7 @@ private class ResultRetriever extends AbstractIterator im private final PrimaryKey firstPrimaryKey; private final PrimaryKey lastPrimaryKey; private final Iterator keyRanges; + private final DataRange firstDataRange; private AbstractBounds currentKeyRange; private final KeyRangeIterator resultKeyIterator; @@ -152,7 +160,8 @@ private class ResultRetriever extends AbstractIterator im private ResultRetriever(ReadExecutionController executionController, boolean topK) { this.keyRanges = queryController.dataRanges().iterator(); - this.currentKeyRange = keyRanges.next().keyRange(); + this.firstDataRange = keyRanges.next(); + this.currentKeyRange = firstDataRange.keyRange(); this.resultKeyIterator = Operation.buildIterator(queryController); this.filterTree = Operation.buildFilter(queryController, queryController.usesStrictFiltering()); this.executionController = executionController; @@ -175,7 +184,52 @@ public UnfilteredRowIterator computeNext() // We can't put this code in the constructor because it may throw and the caller // may not be prepared for that. if (lastKey == null) - resultKeyIterator.skipTo(firstPrimaryKey); + { + PrimaryKey skipTarget = firstPrimaryKey; + ClusteringComparator comparator = command.metadata().comparator; + + // If there are no clusterings, the first data range selects an entire partitions, or we have static + // expressions, don't bother trying to skip forward within the partition. + if (comparator.size() > 0 && !firstDataRange.selectsAllPartition() && !command.rowFilter().hasStaticExpression()) + { + // Only attempt to skip if the first data range covers a single partition. + if (currentKeyRange.left.equals(currentKeyRange.right) && currentKeyRange.left instanceof DecoratedKey) + { + DecoratedKey decoratedKey = (DecoratedKey) currentKeyRange.left; + ClusteringIndexFilter filter = firstDataRange.clusteringIndexFilter(decoratedKey); + + if (filter instanceof ClusteringIndexSliceFilter) + { + Slices slices = ((ClusteringIndexSliceFilter) filter).requestedSlices(); + + if (!slices.isEmpty()) + { + ClusteringBound startBound = slices.get(0).start(); + + if (!startBound.isEmpty()) + { + ByteBuffer[] rawValues = startBound.getBufferArray(); + + if (rawValues.length == comparator.size()) + skipTarget = keyFactory.create(decoratedKey, Clustering.make(rawValues)); + } + } + } + else if (filter instanceof ClusteringIndexNamesFilter) + { + ClusteringIndexNamesFilter namesFilter = (ClusteringIndexNamesFilter) filter; + + if (!namesFilter.requestedRows().isEmpty()) + { + Clustering skipClustering = namesFilter.requestedRows().iterator().next(); + skipTarget = keyFactory.create(decoratedKey, skipClustering); + } + } + } + } + + resultKeyIterator.skipTo(skipTarget); + } // Theoretically we wouldn't need this if the caller of computeNext always ran the // returned iterators to the completion. Unfortunately, we have no control over the caller behavior here. @@ -229,7 +283,7 @@ private List nextSelectedKeysInRange() if (firstKey == null) return Collections.emptyList(); } - while (queryController.doesNotSelect(firstKey) || firstKey.equals(lastKey)); + while (queryController.doesNotSelect(firstKey) || firstKey.equals(lastKey, false)); lastKey = firstKey; threadLocalNextKeys.add(firstKey); @@ -291,7 +345,7 @@ private void fillNextSelectedKeysInPartition(DecoratedKey partitionKey, List partit AbstractType baseType = indexType.unwrap(); - if (baseType.subTypes().isEmpty()) + // We only need to inspect subtypes when it is possible for them to be queried individually. + if (baseType.subTypes().isEmpty() || indexTargetType == IndexTarget.Type.SIMPLE || indexTargetType == IndexTarget.Type.FULL) { this.subTypes = Collections.emptyList(); } diff --git a/src/java/org/apache/cassandra/index/sai/utils/PrimaryKey.java b/src/java/org/apache/cassandra/index/sai/utils/PrimaryKey.java index 3cc503304450..6de7a6c88462 100644 --- a/src/java/org/apache/cassandra/index/sai/utils/PrimaryKey.java +++ b/src/java/org/apache/cassandra/index/sai/utils/PrimaryKey.java @@ -218,10 +218,20 @@ public int hashCode() } @Override - public boolean equals(Object obj) + public boolean equals(Object o) { - if (obj instanceof PrimaryKey) - return compareTo((PrimaryKey) obj) == 0; + if (o instanceof PrimaryKey) + return compareTo((PrimaryKey) o) == 0; + return false; + } + + @Override + public boolean equals(Object o, boolean strict) + { + if (o == null) + return false; + if (o instanceof PrimaryKey) + return compareTo((PrimaryKey) o, strict) == 0; return false; } @@ -494,4 +504,6 @@ default int compareTo(PrimaryKey key, boolean strict) { return compareTo(key); } + + boolean equals(Object obj, boolean strict); } diff --git a/src/java/org/apache/cassandra/index/sasi/plan/Expression.java b/src/java/org/apache/cassandra/index/sasi/plan/Expression.java index 6c3b9b88f30f..b17a50d621bb 100644 --- a/src/java/org/apache/cassandra/index/sasi/plan/Expression.java +++ b/src/java/org/apache/cassandra/index/sasi/plan/Expression.java @@ -118,7 +118,7 @@ public Expression(QueryController controller, ColumnIndex columnIndex) @VisibleForTesting public Expression(String name, AbstractType validator) { - this(null, new ColumnIndex(UTF8Type.instance, ColumnMetadata.regularColumn("sasi", "internal", name, validator), null)); + this(null, new ColumnIndex(UTF8Type.instance, ColumnMetadata.regularColumn("sasi", "internal", name, validator, ColumnMetadata.NO_UNIQUE_ID), null)); } public Expression setLower(Bound newLower) diff --git a/src/java/org/apache/cassandra/io/AsymmetricParameterisedUnversionedSerializer.java b/src/java/org/apache/cassandra/io/AsymmetricParameterisedUnversionedSerializer.java new file mode 100644 index 000000000000..69b1bd2e0cb8 --- /dev/null +++ b/src/java/org/apache/cassandra/io/AsymmetricParameterisedUnversionedSerializer.java @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.io; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.nio.ByteBuffer; + +import org.apache.cassandra.io.util.DataInputBuffer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputBuffer; +import org.apache.cassandra.io.util.DataOutputPlus; + +public interface AsymmetricParameterisedUnversionedSerializer +{ + void serialize(In t, P p, DataOutputPlus out) throws IOException; + default ByteBuffer serialize(In t, P p) throws IOException + { + int size = Math.toIntExact(serializedSize(t, p)); + try (DataOutputBuffer buffer = new DataOutputBuffer(size)) + { + serialize(t, p, buffer); + ByteBuffer bb = buffer.buffer(); + assert size == bb.remaining() : String.format("Expected to write %d but wrote %d", size, bb.remaining()); + return bb; + } + } + + default ByteBuffer serializeUnchecked(In t, P p) + { + try + { + return serialize(t, p); + } + catch (IOException e) + { + throw new UncheckedIOException(e); + } + } + Out deserialize(P p, DataInputPlus in) throws IOException; + default Out deserialize(P p, ByteBuffer buffer) throws IOException + { + try (DataInputBuffer in = new DataInputBuffer(buffer, true)) + { + return deserialize(p, in); + } + } + + default Out deserializeUnchecked(P p, ByteBuffer buffer) + { + try + { + return deserialize(p, buffer); + } + catch (IOException e) + { + throw new UncheckedIOException(e); + } + } + + long serializedSize(In t, P p); +} diff --git a/src/java/org/apache/cassandra/io/AsymmetricParameterisedVersionedSerializer.java b/src/java/org/apache/cassandra/io/AsymmetricParameterisedVersionedSerializer.java new file mode 100644 index 000000000000..a1a295f83d89 --- /dev/null +++ b/src/java/org/apache/cassandra/io/AsymmetricParameterisedVersionedSerializer.java @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.io; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.nio.ByteBuffer; + +import org.apache.cassandra.io.util.DataInputBuffer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputBuffer; +import org.apache.cassandra.io.util.DataOutputPlus; + +public interface AsymmetricParameterisedVersionedSerializer +{ + void serialize(In t, P p, DataOutputPlus out, Version version) throws IOException; + + default ByteBuffer serialize(In t, P p, Version version) throws IOException + { + int size = Math.toIntExact(serializedSize(t, p, version)); + try (DataOutputBuffer buffer = new DataOutputBuffer(size)) + { + serialize(t, p, buffer, version); + ByteBuffer bb = buffer.buffer(); + assert size == bb.remaining() : String.format("Expected to write %d but wrote %d", size, bb.remaining()); + return bb; + } + } + + default ByteBuffer serializeUnchecked(In t, P p, Version version) + { + try + { + return serialize(t, p, version); + } + catch (IOException e) + { + throw new UncheckedIOException(e); + } + } + + Out deserialize(P p, DataInputPlus in, Version version) throws IOException; + + default Out deserialize(P p, ByteBuffer buffer, Version version) throws IOException + { + try (DataInputBuffer in = new DataInputBuffer(buffer, true)) + { + return deserialize(p, in, version); + } + } + + default Out deserializeUnchecked(P p, ByteBuffer buffer, Version version) + { + try + { + return deserialize(p, buffer, version); + } + catch (IOException e) + { + throw new UncheckedIOException(e); + } + } + long serializedSize(In t, P p, Version version); + + static AsymmetricParameterisedVersionedSerializer from(AsymmetricParameterisedUnversionedSerializer delegate) + { + return new AsymmetricParameterisedVersionedSerializer<>() + { + @Override + public void serialize(In t, P p, DataOutputPlus out, Version version) throws IOException + { + delegate.serialize(t, p, out); + } + + @Override + public Out deserialize(P p, DataInputPlus in, Version version) throws IOException + { + return delegate.deserialize(p, in); + } + + @Override + public long serializedSize(In t, P p, Version version) + { + return delegate.serializedSize(t, p); + } + }; + } +} diff --git a/src/java/org/apache/cassandra/io/AsymmetricUnversionedSerializer.java b/src/java/org/apache/cassandra/io/AsymmetricUnversionedSerializer.java new file mode 100644 index 000000000000..570741903f38 --- /dev/null +++ b/src/java/org/apache/cassandra/io/AsymmetricUnversionedSerializer.java @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.io; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.nio.ByteBuffer; + +import org.apache.cassandra.io.util.DataInputBuffer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputBuffer; +import org.apache.cassandra.io.util.DataOutputPlus; + +public interface AsymmetricUnversionedSerializer +{ + void serialize(In t, DataOutputPlus out) throws IOException; + default ByteBuffer serialize(In t) throws IOException + { + int size = Math.toIntExact(serializedSize(t)); + try (DataOutputBuffer buffer = new DataOutputBuffer(size)) + { + serialize(t, buffer); + ByteBuffer bb = buffer.buffer(); + assert size == bb.remaining() : String.format("Expected to write %d but wrote %d", size, bb.remaining()); + return bb; + } + } + + default void skip(DataInputPlus in) throws IOException + { + deserialize(in); + } + + default ByteBuffer serializeUnchecked(In t) + { + try + { + return serialize(t); + } + catch (IOException e) + { + throw new UncheckedIOException(e); + } + } + Out deserialize(DataInputPlus in) throws IOException; + default Out deserialize(ByteBuffer buffer) throws IOException + { + try (DataInputBuffer in = new DataInputBuffer(buffer, true)) + { + return deserialize(in); + } + } + + default Out deserializeUnchecked(ByteBuffer buffer) + { + try + { + return deserialize(buffer); + } + catch (IOException e) + { + throw new UncheckedIOException(e); + } + } + long serializedSize(In t); +} diff --git a/src/java/org/apache/cassandra/io/AsymmetricVersionedSerializer.java b/src/java/org/apache/cassandra/io/AsymmetricVersionedSerializer.java new file mode 100644 index 000000000000..2dac3a577787 --- /dev/null +++ b/src/java/org/apache/cassandra/io/AsymmetricVersionedSerializer.java @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.io; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.nio.ByteBuffer; + +import org.apache.cassandra.io.util.DataInputBuffer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputBuffer; +import org.apache.cassandra.io.util.DataOutputPlus; + +public interface AsymmetricVersionedSerializer +{ + void serialize(In t, DataOutputPlus out, Version version) throws IOException; + default ByteBuffer serialize(In t, Version version) throws IOException + { + int size = Math.toIntExact(serializedSize(t, version)); + try (DataOutputBuffer buffer = new DataOutputBuffer(size)) + { + serialize(t, buffer, version); + ByteBuffer bb = buffer.buffer(); + assert size == bb.remaining() : String.format("Expected to write %d but wrote %d", size, bb.remaining()); + return bb; + } + } + + default ByteBuffer serializeUnchecked(In t, Version version) + { + try + { + return serialize(t, version); + } + catch (IOException e) + { + throw new UncheckedIOException(e); + } + } + Out deserialize(DataInputPlus in, Version version) throws IOException; + default Out deserialize(ByteBuffer buffer, Version version) throws IOException + { + try (DataInputBuffer in = new DataInputBuffer(buffer, true)) + { + return deserialize(in, version); + } + } + + default Out deserializeUnchecked(ByteBuffer buffer, Version version) + { + try + { + return deserialize(buffer, version); + } + catch (IOException e) + { + throw new UncheckedIOException(e); + } + } + long serializedSize(In t, Version version); + + static AsymmetricVersionedSerializer from(AsymmetricUnversionedSerializer delegate) + { + return new AsymmetricVersionedSerializer<>() + { + @Override + public void serialize(In t, DataOutputPlus out, Version version) throws IOException + { + delegate.serialize(t, out); + } + + @Override + public Out deserialize(DataInputPlus in, Version version) throws IOException + { + return delegate.deserialize(in); + } + + @Override + public long serializedSize(In t, Version version) + { + return delegate.serializedSize(t); + } + }; + } +} diff --git a/src/java/org/apache/cassandra/io/EmbeddedAsymmetricVersionedSerializer.java b/src/java/org/apache/cassandra/io/EmbeddedAsymmetricVersionedSerializer.java new file mode 100644 index 000000000000..b3baec39acb3 --- /dev/null +++ b/src/java/org/apache/cassandra/io/EmbeddedAsymmetricVersionedSerializer.java @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.io; + +import java.io.IOException; + +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; + +public class EmbeddedAsymmetricVersionedSerializer implements IVersionedAsymmetricSerializer, AsymmetricUnversionedSerializer +{ + private final Version version; + private final UnversionedSerializer versionSerializer; + private final AsymmetricVersionedSerializer delegate; + + public EmbeddedAsymmetricVersionedSerializer(Version version, + UnversionedSerializer versionSerializer, + AsymmetricVersionedSerializer delegate) + { + this.version = version; + this.versionSerializer = versionSerializer; + this.delegate = delegate; + } + + @Override + public void serialize(In t, DataOutputPlus out, int msgVersion) throws IOException + { + serialize(t, out); + } + + @Override + public void serialize(In t, DataOutputPlus out) throws IOException + { + versionSerializer.serialize(version, out); + delegate.serialize(t, out, version); + } + + @Override + public Out deserialize(DataInputPlus in, int msgVersion) throws IOException + { + return deserialize(in); + } + + @Override + public Out deserialize(DataInputPlus in) throws IOException + { + Version version = versionSerializer.deserialize(in); + return delegate.deserialize(in, version); + } + + public Version deserializeVersion(DataInputPlus in) throws IOException + { + return versionSerializer.deserialize(in); + } + + @Override + public long serializedSize(In t, int msgVersion) + { + return serializedSize(t); + } + + @Override + public long serializedSize(In t) + { + return versionSerializer.serializedSize(version) + + delegate.serializedSize(t, version); + } +} diff --git a/src/java/org/apache/cassandra/io/IVersionedAsymmetricSerializer.java b/src/java/org/apache/cassandra/io/IVersionedAsymmetricSerializer.java index 8ad2c285c326..06469fa1290c 100644 --- a/src/java/org/apache/cassandra/io/IVersionedAsymmetricSerializer.java +++ b/src/java/org/apache/cassandra/io/IVersionedAsymmetricSerializer.java @@ -18,8 +18,12 @@ package org.apache.cassandra.io; import java.io.IOException; +import java.io.UncheckedIOException; +import java.nio.ByteBuffer; +import org.apache.cassandra.io.util.DataInputBuffer; import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputBuffer; import org.apache.cassandra.io.util.DataOutputPlus; public interface IVersionedAsymmetricSerializer @@ -32,7 +36,31 @@ public interface IVersionedAsymmetricSerializer * @param version protocol version * @throws IOException if serialization fails */ - public void serialize(In t, DataOutputPlus out, int version) throws IOException; + void serialize(In t, DataOutputPlus out, int version) throws IOException; + + default ByteBuffer serialize(In t, int version) throws IOException + { + int size = Math.toIntExact(serializedSize(t, version)); + try (DataOutputBuffer buffer = new DataOutputBuffer(size)) + { + serialize(t, buffer, version); + ByteBuffer bb = buffer.buffer(); + assert size == bb.remaining() : String.format("Expected to write %d but wrote %d", size, bb.remaining()); + return bb; + } + } + + default ByteBuffer serializeUnchecked(In t, int version) + { + try + { + return serialize(t, version); + } + catch (IOException e) + { + throw new UncheckedIOException(e); + } + } /** * Deserialize into the specified DataInputStream instance. @@ -41,7 +69,27 @@ public interface IVersionedAsymmetricSerializer * @return the type that was deserialized * @throws IOException if deserialization fails */ - public Out deserialize(DataInputPlus in, int version) throws IOException; + Out deserialize(DataInputPlus in, int version) throws IOException; + + default Out deserialize(ByteBuffer buffer, int version) throws IOException + { + try (DataInputBuffer in = new DataInputBuffer(buffer, true)) + { + return deserialize(in, version); + } + } + + default Out deserializeUnchecked(ByteBuffer buffer, int version) + { + try + { + return deserialize(buffer, version); + } + catch (IOException e) + { + throw new UncheckedIOException(e); + } + } /** * Calculate serialized size of object without actually serializing. @@ -49,5 +97,5 @@ public interface IVersionedAsymmetricSerializer * @param version protocol version * @return serialized size of object t */ - public long serializedSize(In t, int version); + long serializedSize(In t, int version); } diff --git a/src/java/org/apache/cassandra/io/IVersionedSerializer.java b/src/java/org/apache/cassandra/io/IVersionedSerializer.java index 6730ec08249e..0e5a400ff1c8 100644 --- a/src/java/org/apache/cassandra/io/IVersionedSerializer.java +++ b/src/java/org/apache/cassandra/io/IVersionedSerializer.java @@ -17,6 +17,34 @@ */ package org.apache.cassandra.io; +import java.io.IOException; + +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; + public interface IVersionedSerializer extends IVersionedAsymmetricSerializer { + static IVersionedSerializer from(UnversionedSerializer delegate) + { + return new IVersionedSerializer() + { + @Override + public void serialize(T t, DataOutputPlus out, int version) throws IOException + { + delegate.serialize(t, out); + } + + @Override + public T deserialize(DataInputPlus in, int version) throws IOException + { + return delegate.deserialize(in); + } + + @Override + public long serializedSize(T t, int version) + { + return delegate.serializedSize(t); + } + }; + } } diff --git a/src/java/org/apache/cassandra/io/MessageVersionProvider.java b/src/java/org/apache/cassandra/io/MessageVersionProvider.java new file mode 100644 index 000000000000..a6ad468281c0 --- /dev/null +++ b/src/java/org/apache/cassandra/io/MessageVersionProvider.java @@ -0,0 +1,24 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.io; + +public interface MessageVersionProvider +{ + int messageVersion(); +} diff --git a/src/java/org/apache/cassandra/io/ParameterisedUnversionedSerializer.java b/src/java/org/apache/cassandra/io/ParameterisedUnversionedSerializer.java new file mode 100644 index 000000000000..b93b8aacfc8b --- /dev/null +++ b/src/java/org/apache/cassandra/io/ParameterisedUnversionedSerializer.java @@ -0,0 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.io; + +public interface ParameterisedUnversionedSerializer extends AsymmetricParameterisedUnversionedSerializer +{ +} diff --git a/src/java/org/apache/cassandra/io/ParameterisedVersionedSerializer.java b/src/java/org/apache/cassandra/io/ParameterisedVersionedSerializer.java new file mode 100644 index 000000000000..3393c8141c7c --- /dev/null +++ b/src/java/org/apache/cassandra/io/ParameterisedVersionedSerializer.java @@ -0,0 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.io; + +public interface ParameterisedVersionedSerializer extends AsymmetricParameterisedVersionedSerializer +{ +} diff --git a/src/java/org/apache/cassandra/io/UnversionedSerializer.java b/src/java/org/apache/cassandra/io/UnversionedSerializer.java new file mode 100644 index 000000000000..2bf1f2013a95 --- /dev/null +++ b/src/java/org/apache/cassandra/io/UnversionedSerializer.java @@ -0,0 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.io; + +public interface UnversionedSerializer extends AsymmetricUnversionedSerializer +{ +} diff --git a/src/java/org/apache/cassandra/io/VersionedSerializer.java b/src/java/org/apache/cassandra/io/VersionedSerializer.java new file mode 100644 index 000000000000..e8497530455d --- /dev/null +++ b/src/java/org/apache/cassandra/io/VersionedSerializer.java @@ -0,0 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.io; + +public interface VersionedSerializer extends AsymmetricVersionedSerializer +{ +} diff --git a/src/java/org/apache/cassandra/io/sstable/CQLSSTableWriter.java b/src/java/org/apache/cassandra/io/sstable/CQLSSTableWriter.java index dbe91cee106c..0676eb1f568c 100644 --- a/src/java/org/apache/cassandra/io/sstable/CQLSSTableWriter.java +++ b/src/java/org/apache/cassandra/io/sstable/CQLSSTableWriter.java @@ -26,9 +26,11 @@ import java.util.Arrays; import java.util.Collection; import java.util.Collections; +import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.NavigableSet; +import java.util.Set; import java.util.concurrent.TimeUnit; import java.util.function.Consumer; import java.util.stream.Collectors; @@ -280,7 +282,6 @@ public CQLSSTableWriter rawAddRow(List values) // Note that we asks indexes to not validate values (the last 'false' arg below) because that triggers a 'Keyspace.open' // and that forces a lot of initialization that we don't want. UpdateParameters params = new UpdateParameters(modificationStatement.metadata, - modificationStatement.updatedColumns(), ClientState.forInternalCalls(), options, modificationStatement.getTimestamp(TimeUnit.MILLISECONDS.toMicros(now), options), @@ -652,6 +653,18 @@ public Builder openSSTableOnProduced() return this; } + /** + * Specifies the SSTable format this CQLSSTableWriter instance should use for writing. + * + * @param format The format to use + * @return this builder + */ + public Builder withFormat(SSTableFormat format) + { + this.format = format; + return this; + } + public CQLSSTableWriter build() { if (directory == null) @@ -661,7 +674,12 @@ public CQLSSTableWriter build() if (modificationStatement == null) throw new IllegalStateException("No modification (INSERT/UPDATE/DELETE) statement specified, you should provide a modification statement through using()"); - Preconditions.checkState(Sets.difference(SchemaConstants.LOCAL_SYSTEM_KEYSPACE_NAMES, Schema.instance.getKeyspaces()).isEmpty(), + Set activeKeyspaces = new HashSet<>(SchemaConstants.LOCAL_SYSTEM_KEYSPACE_NAMES); + + if (!DatabaseDescriptor.getAccordTransactionsEnabled()) + activeKeyspaces.remove(SchemaConstants.ACCORD_KEYSPACE_NAME); + + Preconditions.checkState(Sets.difference(activeKeyspaces, Schema.instance.getKeyspaces()).isEmpty(), "Local keyspaces were not loaded. If this is running as a client, please make sure to add %s=true system property.", CassandraRelevantProperties.FORCE_LOAD_LOCAL_KEYSPACES.getKey()); diff --git a/src/java/org/apache/cassandra/io/sstable/Descriptor.java b/src/java/org/apache/cassandra/io/sstable/Descriptor.java index bf58fff71761..b9e149804278 100644 --- a/src/java/org/apache/cassandra/io/sstable/Descriptor.java +++ b/src/java/org/apache/cassandra/io/sstable/Descriptor.java @@ -64,14 +64,14 @@ public class Descriptor // to the SSTable naming. static final Pattern SSTABLE_DIR_PATTERN = Pattern.compile(".*/(?\\w+)/" + "(?\\w+)-(?[0-9a-f]{32})/" + - "(backups/|snapshots/(?[\\w-]+)/)?" + + "(backups/|snapshots/(?[^/]+)/)?" + "(\\.(?[\\w-]+)/)?" + "(?[\\w-\\+]+)\\.(?[\\w]+)$"); // Pre 2.1 SSTable directory format is {keyspace}/{tableName}-{tableId}[/backups|/snapshots/{tag}][/.{indexName}]/{component}.db static final Pattern LEGACY_SSTABLE_DIR_PATTERN = Pattern.compile(".*/(?\\w+)/" + "(?\\w+)/" + - "(backups/|snapshots/(?[\\w-]+)/)?" + + "(backups/|snapshots/(?[^/]+)/)?" + "(\\.(?[\\w-]+)/)?" + "(?[\\w-]+)\\.(?[\\w]+)$"); diff --git a/src/java/org/apache/cassandra/io/sstable/KeyIterator.java b/src/java/org/apache/cassandra/io/sstable/KeyIterator.java index dbe501f36e7e..c8c1709503d9 100644 --- a/src/java/org/apache/cassandra/io/sstable/KeyIterator.java +++ b/src/java/org/apache/cassandra/io/sstable/KeyIterator.java @@ -21,12 +21,15 @@ import java.util.concurrent.locks.ReadWriteLock; import org.apache.cassandra.db.DecoratedKey; +import org.apache.cassandra.db.PartitionPosition; +import org.apache.cassandra.dht.AbstractBounds; import org.apache.cassandra.dht.IPartitioner; import org.apache.cassandra.utils.AbstractIterator; import org.apache.cassandra.utils.CloseableIterator; public class KeyIterator extends AbstractIterator implements CloseableIterator { + private final AbstractBounds bounds; private final IPartitioner partitioner; private final KeyReader it; private final ReadWriteLock fileAccessLock; @@ -34,8 +37,9 @@ public class KeyIterator extends AbstractIterator implements Close private boolean initialized = false; - public KeyIterator(KeyReader it, IPartitioner partitioner, long totalBytes, ReadWriteLock fileAccessLock) + public KeyIterator(AbstractBounds bounds, KeyReader it, IPartitioner partitioner, long totalBytes, ReadWriteLock fileAccessLock) { + this.bounds = bounds; this.it = it; this.partitioner = partitioner; this.totalBytes = totalBytes; @@ -48,19 +52,26 @@ protected DecoratedKey computeNext() fileAccessLock.readLock().lock(); try { - if (!initialized) + while (true) { - initialized = true; - return it.isExhausted() - ? endOfData() - : partitioner.decorateKey(it.key()); - } - else - { - return it.advance() - ? partitioner.decorateKey(it.key()) - : endOfData(); + if (!initialized) + { + initialized = true; + if (it.isExhausted()) + break; + } + else if (!it.advance()) + break; + + DecoratedKey key = partitioner.decorateKey(it.key()); + if (bounds == null || bounds.contains(key)) + return key; + + if (key.compareTo(bounds.right) >= 0) + break; } + + return endOfData(); } catch (IOException e) { diff --git a/src/java/org/apache/cassandra/io/sstable/SSTableZeroCopyWriter.java b/src/java/org/apache/cassandra/io/sstable/SSTableZeroCopyWriter.java index 3bf21f1155ec..46a490974e3e 100644 --- a/src/java/org/apache/cassandra/io/sstable/SSTableZeroCopyWriter.java +++ b/src/java/org/apache/cassandra/io/sstable/SSTableZeroCopyWriter.java @@ -38,9 +38,12 @@ import org.apache.cassandra.io.FSWriteError; import org.apache.cassandra.io.sstable.format.SSTableReader; import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.File; import org.apache.cassandra.io.util.SequentialWriter; +import org.apache.cassandra.io.util.SequentialWriterOption; import org.apache.cassandra.net.AsyncStreamingInputPlus; import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.utils.ByteBufferUtil; import static java.lang.String.format; import static org.apache.cassandra.utils.FBUtilities.prettyPrintMemory; @@ -50,7 +53,7 @@ public class SSTableZeroCopyWriter extends SSTable implements SSTableMultiWriter private static final Logger logger = LoggerFactory.getLogger(SSTableZeroCopyWriter.class); private volatile SSTableReader finalReader; - private final Map componentWriters; // indexed by component name + private final Map componentWriters; // indexed by component name public SSTableZeroCopyWriter(Builder builder, LifecycleNewTracker lifecycleNewTracker, @@ -89,12 +92,12 @@ public AbstractBounds getBounds() throw new UnsupportedOperationException(); } - private SequentialWriter makeWriter(Descriptor descriptor, Component component) + private ZeroCopySequentialWriter makeWriter(Descriptor descriptor, Component component) { - return new SequentialWriter(descriptor.fileFor(component), ioOptions.writerOptions, false); + return new ZeroCopySequentialWriter(descriptor.fileFor(component), ioOptions.writerOptions, false); } - private void write(DataInputPlus in, long size, SequentialWriter out) throws FSWriteError + private void write(DataInputPlus in, long size, ZeroCopySequentialWriter out) throws FSWriteError { final int BUFFER_SIZE = 1 << 20; long bytesRead = 0; @@ -128,7 +131,7 @@ public Collection finish(boolean openResult) { setOpenResult(openResult); - for (SequentialWriter writer : componentWriters.values()) + for (ZeroCopySequentialWriter writer : componentWriters.values()) writer.finish(); return finished(); @@ -170,7 +173,7 @@ public TableId getTableId() @Override public Throwable commit(Throwable accumulate) { - for (SequentialWriter writer : componentWriters.values()) + for (ZeroCopySequentialWriter writer : componentWriters.values()) accumulate = writer.commit(accumulate); return accumulate; } @@ -178,7 +181,7 @@ public Throwable commit(Throwable accumulate) @Override public Throwable abort(Throwable accumulate) { - for (SequentialWriter writer : componentWriters.values()) + for (ZeroCopySequentialWriter writer : componentWriters.values()) accumulate = writer.abort(accumulate); return accumulate; } @@ -186,29 +189,30 @@ public Throwable abort(Throwable accumulate) @Override public void prepareToCommit() { - for (SequentialWriter writer : componentWriters.values()) + for (ZeroCopySequentialWriter writer : componentWriters.values()) writer.prepareToCommit(); } @Override public void close() { - for (SequentialWriter writer : componentWriters.values()) + for (ZeroCopySequentialWriter writer : componentWriters.values()) writer.close(); } public void writeComponent(Component component, DataInputPlus in, long size) throws ClosedChannelException { - SequentialWriter writer = componentWriters.get(component.name); + ZeroCopySequentialWriter writer = componentWriters.get(component.name); logger.info("Writing component {} to {} length {}", component, writer.getPath(), prettyPrintMemory(size)); if (in instanceof AsyncStreamingInputPlus) write((AsyncStreamingInputPlus) in, size, writer); else + // this code path is not valid for production and only exists to simplify unit tests write(in, size, writer); } - private void write(AsyncStreamingInputPlus in, long size, SequentialWriter writer) throws ClosedChannelException + private void write(AsyncStreamingInputPlus in, long size, ZeroCopySequentialWriter writer) throws ClosedChannelException { logger.info("Block Writing component to {} length {}", writer.getPath(), prettyPrintMemory(size)); @@ -233,4 +237,25 @@ private void write(AsyncStreamingInputPlus in, long size, SequentialWriter write throw new FSWriteError(e, writer.getPath()); } } + + private static class ZeroCopySequentialWriter extends SequentialWriter + { + private ZeroCopySequentialWriter(File file, SequentialWriterOption option, boolean strictFlushing) + { + super(file, ByteBufferUtil.EMPTY_BYTE_BUFFER, option, strictFlushing); + } + + /** + * In production, we do not expect this method to be called, as only writeDirectlyToChannel should be invoked for zero-copy. + *

+ * This method only exists for tests. + */ + @Override + public void write(byte[] b, int off, int len) throws IOException + { + if (this.buffer == ByteBufferUtil.EMPTY_BYTE_BUFFER) + this.buffer = option.allocateBuffer(); + super.write(b, off, len); + } + } } diff --git a/src/java/org/apache/cassandra/io/sstable/format/SSTableReader.java b/src/java/org/apache/cassandra/io/sstable/format/SSTableReader.java index ff488694dc82..52986d73dc26 100644 --- a/src/java/org/apache/cassandra/io/sstable/format/SSTableReader.java +++ b/src/java/org/apache/cassandra/io/sstable/format/SSTableReader.java @@ -92,6 +92,7 @@ import org.apache.cassandra.io.util.FileDataInput; import org.apache.cassandra.io.util.FileHandle; import org.apache.cassandra.io.util.FileUtils; +import org.apache.cassandra.io.util.FileUtils.DuplicateHardlinkException; import org.apache.cassandra.io.util.RandomAccessReader; import org.apache.cassandra.metrics.RestorableMeter; import org.apache.cassandra.schema.SchemaConstants; @@ -940,9 +941,11 @@ public UnfilteredRowIterator simpleIterator(FileDataInput file, DecoratedKey key */ public KeyIterator keyIterator() throws IOException { - return new KeyIterator(keyReader(), getPartitioner(), uncompressedLength(), new ReentrantReadWriteLock()); + return new KeyIterator(null, keyReader(), getPartitioner(), uncompressedLength(), new ReentrantReadWriteLock()); } + public abstract KeyIterator keyIterator(AbstractBounds range) throws IOException; + /** * Finds and returns the first key beyond a given token in this SSTable or null if no such key exists. */ @@ -1125,15 +1128,29 @@ public void createLinks(String snapshotDirectoryPath) public void createLinks(String snapshotDirectoryPath, RateLimiter rateLimiter) { - createLinks(descriptor, components, snapshotDirectoryPath, rateLimiter); + createLinks(snapshotDirectoryPath, rateLimiter, false); + } + + public void createLinks(String snapshotDirectoryPath, RateLimiter rateLimiter, boolean ephemeralSnapshot) + { + createLinks(descriptor, components, snapshotDirectoryPath, rateLimiter, ephemeralSnapshot); } public static void createLinks(Descriptor descriptor, Set components, String snapshotDirectoryPath) { - createLinks(descriptor, components, snapshotDirectoryPath, null); + createLinks(descriptor, components, snapshotDirectoryPath, null, false); } - public static void createLinks(Descriptor descriptor, Set components, String snapshotDirectoryPath, RateLimiter limiter) + /** + * Create hardlinks for given set of components + * + * @param descriptor descriptor to use + * @param components components to create links for + * @param snapshotDirectoryPath directory path for snapshot + * @param limiter rate limiter to use + * @param force if true, if target link file exists, do not fail, otherwise throw RTE + */ + public static void createLinks(Descriptor descriptor, Set components, String snapshotDirectoryPath, RateLimiter limiter, boolean force) { for (Component component : components) { @@ -1143,7 +1160,15 @@ public static void createLinks(Descriptor descriptor, Set components, if (null != limiter) limiter.acquire(); File targetLink = new File(snapshotDirectoryPath, sourceFile.name()); - FileUtils.createHardLink(sourceFile, targetLink); + try + { + FileUtils.createHardLink(sourceFile, targetLink); + } + catch (DuplicateHardlinkException ex) + { + if (!force) + throw new RuntimeException(ex.getMessage()); + } } } @@ -1882,7 +1907,7 @@ public UniqueIdentifier instanceId() @Override public int compareTo(SSTableReader other) { - // Used in IntervalTree with the expecation that compareTo uniquely identifies an SSTableReader + // Used in IntervalTree with the expectation that compareTo uniquely identifies an SSTableReader // Use accessor for instanceId for mocks return instanceId().compareTo(other.instanceId()); } diff --git a/src/java/org/apache/cassandra/io/sstable/format/SSTableSimpleScanner.java b/src/java/org/apache/cassandra/io/sstable/format/SSTableSimpleScanner.java index 190ec42fa939..a649fbea4c33 100644 --- a/src/java/org/apache/cassandra/io/sstable/format/SSTableSimpleScanner.java +++ b/src/java/org/apache/cassandra/io/sstable/format/SSTableSimpleScanner.java @@ -73,7 +73,7 @@ public SSTableSimpleScanner(SSTableReader sstable, { assert sstable != null; - this.dfile = sstable.openDataReader(); + this.dfile = sstable.openDataReaderForScan(); this.sstable = sstable; this.sizeInBytes = boundsList.stream().mapToLong(ppb -> ppb.upperPosition - ppb.lowerPosition).sum(); this.compressedSizeInBytes = sstable.compression ? sstable.onDiskSizeForPartitionPositions(boundsList) : sizeInBytes; diff --git a/src/java/org/apache/cassandra/io/sstable/format/big/BigTableReader.java b/src/java/org/apache/cassandra/io/sstable/format/big/BigTableReader.java index 0864a64cee4c..d44bd7f71429 100644 --- a/src/java/org/apache/cassandra/io/sstable/format/big/BigTableReader.java +++ b/src/java/org/apache/cassandra/io/sstable/format/big/BigTableReader.java @@ -25,11 +25,15 @@ import java.util.List; import java.util.Map; import java.util.Objects; +import java.util.concurrent.locks.ReentrantReadWriteLock; import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Preconditions; import com.google.common.collect.Iterables; import com.google.common.collect.Lists; +import org.apache.cassandra.dht.AbstractBounds; +import org.apache.cassandra.io.sstable.*; +import org.apache.cassandra.utils.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -47,16 +51,6 @@ import org.apache.cassandra.db.rows.UnfilteredRowIterators; import org.apache.cassandra.dht.Range; import org.apache.cassandra.dht.Token; -import org.apache.cassandra.io.sstable.AbstractRowIndexEntry; -import org.apache.cassandra.io.sstable.CorruptSSTableException; -import org.apache.cassandra.io.sstable.Descriptor; -import org.apache.cassandra.io.sstable.Downsampling; -import org.apache.cassandra.io.sstable.ISSTableScanner; -import org.apache.cassandra.io.sstable.IVerifier; -import org.apache.cassandra.io.sstable.IndexInfo; -import org.apache.cassandra.io.sstable.KeyReader; -import org.apache.cassandra.io.sstable.SSTable; -import org.apache.cassandra.io.sstable.SSTableReadsListener; import org.apache.cassandra.io.sstable.SSTableReadsListener.SelectionReason; import org.apache.cassandra.io.sstable.SSTableReadsListener.SkippingReason; import org.apache.cassandra.io.sstable.format.SSTableReader; @@ -72,9 +66,6 @@ import org.apache.cassandra.io.util.FileHandle; import org.apache.cassandra.io.util.FileUtils; import org.apache.cassandra.io.util.RandomAccessReader; -import org.apache.cassandra.utils.ByteBufferUtil; -import org.apache.cassandra.utils.IFilter; -import org.apache.cassandra.utils.OutputHandler; import static org.apache.cassandra.utils.concurrent.SharedCloseable.sharedCopyOrNull; @@ -155,6 +146,16 @@ public KeyReader keyReader() throws IOException return BigTableKeyReader.create(ifile, rowIndexEntrySerializer); } + @Override + public KeyIterator keyIterator(AbstractBounds range) throws IOException + { + + RandomAccessReader ifileReader = ifile.createReader(); + ifileReader.seek(getIndexScanPosition(range.left)); + BigTableKeyReader keyReader = BigTableKeyReader.create(ifileReader, rowIndexEntrySerializer); + return new KeyIterator(range, keyReader, getPartitioner(), uncompressedLength(), new ReentrantReadWriteLock()); + } + @Override public KeyReader keyReader(PartitionPosition key) throws IOException { @@ -310,6 +311,7 @@ public RowIndexEntry getRowIndexEntry(PartitionPosition key, // of the next interval). int i = 0; String path = null; + ByteBuffer indexKey = null; try (FileDataInput in = ifile.createReader(sampledPosition)) { path = in.getPath(); @@ -317,7 +319,13 @@ public RowIndexEntry getRowIndexEntry(PartitionPosition key, { i++; - ByteBuffer indexKey = ByteBufferUtil.readWithShortLength(in); + int length = in.readUnsignedShort(); + if (indexKey == null || indexKey.capacity() < length) + indexKey = ByteBuffer.allocate(length); + + in.readFully(indexKey.array(), 0, length); + indexKey.position(0); + indexKey.limit(length); boolean opSatisfied; // did we find an appropriate position for the op requested boolean exactMatch; // is the current position an exact match for the key, suitable for caching diff --git a/src/java/org/apache/cassandra/io/sstable/format/bti/BtiTableReader.java b/src/java/org/apache/cassandra/io/sstable/format/bti/BtiTableReader.java index 39160639584a..791bcb9d18ae 100644 --- a/src/java/org/apache/cassandra/io/sstable/format/bti/BtiTableReader.java +++ b/src/java/org/apache/cassandra/io/sstable/format/bti/BtiTableReader.java @@ -23,6 +23,7 @@ import java.util.Collection; import java.util.Collections; import java.util.List; +import java.util.concurrent.locks.ReentrantReadWriteLock; import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Preconditions; @@ -45,6 +46,7 @@ import org.apache.cassandra.io.sstable.CorruptSSTableException; import org.apache.cassandra.io.sstable.Descriptor; import org.apache.cassandra.io.sstable.IVerifier; +import org.apache.cassandra.io.sstable.KeyIterator; import org.apache.cassandra.io.sstable.KeyReader; import org.apache.cassandra.io.sstable.SSTable; import org.apache.cassandra.io.sstable.SSTableReadsListener; @@ -130,7 +132,7 @@ public KeyReader keyReader(PartitionPosition key) throws IOException { return PartitionIterator.create(partitionIndex, metadata().partitioner, rowIndexFile, dfile, key, -1, - metadata().partitioner.getMaximumToken().maxKeyBound(), 0, + metadata().partitioner.getMaximumTokenForSplitting().maxKeyBound(), 0, descriptor.version); } @@ -474,6 +476,24 @@ public UnfilteredPartitionIterator partitionIterator(ColumnFilter columnFilter, return BtiTableScanner.getScanner(this, columnFilter, dataRange, listener); } + @Override + public KeyIterator keyIterator(AbstractBounds range) + { + PartitionIterator iter; + try + { + iter = PartitionIterator.create(partitionIndex, metadata().partitioner, rowIndexFile, dfile, + range.left, bounds.inclusiveLeft() ? -1 : 0, + null, 0, descriptor.version); + } + catch (IOException e) + { + throw new RuntimeException(e); + } + + return new KeyIterator(range, iter, metadata().partitioner, uncompressedLength(), new ReentrantReadWriteLock()); + } + @Override public IVerifier getVerifier(ColumnFamilyStore cfs, OutputHandler outputHandler, boolean isOffline, IVerifier.Options options) { diff --git a/src/java/org/apache/cassandra/io/sstable/indexsummary/IndexSummary.java b/src/java/org/apache/cassandra/io/sstable/indexsummary/IndexSummary.java index ec5248c70f02..39377cc01fe2 100644 --- a/src/java/org/apache/cassandra/io/sstable/indexsummary/IndexSummary.java +++ b/src/java/org/apache/cassandra/io/sstable/indexsummary/IndexSummary.java @@ -456,6 +456,18 @@ public IndexSummary deserialize(T in, IP entries.free(); throw ioe; } + + // Before 5.0 offsets were written using Native Endian, now they are stored as Little Endian, + // so we apply a heuristic here to detect + // if the loading index summary was created on a Big Endian machine using Native Endian format + if (offsets.size() > 0) + { + int offset = offsets.getInt(0); + int offsetReversed = Integer.reverseBytes(offset); + if (offsetReversed > 0 && offset > offsetReversed || offset - offsets.size() < 0) + throw new IOException(String.format("Rebuilding index summary because offset value (%d) at position: %d " + + "is Big Endian while Little Endian is expected", offset, 0)); + } // our on-disk representation treats the offsets and the summary data as one contiguous structure, // in which the offsets are based from the start of the structure. i.e., if the offsets occupy // X bytes, the value of the first offset will be X. In memory we split the two regions up, so that diff --git a/src/java/org/apache/cassandra/io/util/BufferedDataOutputStreamPlus.java b/src/java/org/apache/cassandra/io/util/BufferedDataOutputStreamPlus.java index a712ba6e4c18..201eb619e8a2 100644 --- a/src/java/org/apache/cassandra/io/util/BufferedDataOutputStreamPlus.java +++ b/src/java/org/apache/cassandra/io/util/BufferedDataOutputStreamPlus.java @@ -27,6 +27,7 @@ import net.nicoulaj.compilecommand.annotations.DontInline; import org.apache.cassandra.utils.FastByteOperations; +import org.apache.cassandra.utils.memory.MemoryUtil; import static org.apache.cassandra.config.CassandraRelevantProperties.NIO_DATA_OUTPUT_STREAM_PLUS_BUFFER_SIZE; @@ -138,6 +139,25 @@ public void write(ByteBuffer src) throws IOException buffer.position(buffer.position() + srcCount); } + @Override + public void writeMemory(long address, int length) throws IOException + { + assert buffer != null : "Attempt to use a closed data output"; + long srcPos = address; + int srcCount = length; + int trgAvailable; + while (srcCount > (trgAvailable = buffer.remaining())) + { + MemoryUtil.getBytes(srcPos, buffer, trgAvailable); + buffer.position(buffer.position() + trgAvailable); + srcPos += trgAvailable; + srcCount -= trgAvailable; + doFlush(srcCount); + } + MemoryUtil.getBytes(srcPos, buffer, srcCount); + buffer.position(buffer.position() + srcCount); + } + @Override public void write(int b) throws IOException { @@ -178,6 +198,22 @@ public void writeMostSignificantBytes(long register, int bytes) throws IOExcepti } } + @Override + public void writeLeastSignificantBytes(long register, int bytes) throws IOException + { + assert buffer != null : "Attempt to use a closed data output"; + if (buffer.remaining() < Long.BYTES) + { + super.writeLeastSignificantBytes(register, bytes); + } + else + { + int pos = buffer.position(); + buffer.putLong(pos, register << (64 - (bytes * 8))); + buffer.position(pos + bytes); + } + } + @Override public void writeShort(int v) throws IOException { diff --git a/src/java/org/apache/cassandra/io/util/Checksumed.java b/src/java/org/apache/cassandra/io/util/Checksumed.java new file mode 100644 index 000000000000..61120ed25e13 --- /dev/null +++ b/src/java/org/apache/cassandra/io/util/Checksumed.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.io.util; + +import java.util.zip.Checksum; + +public interface Checksumed +{ + Checksum checksum(); + + default long getAndResetChecksum() + { + Checksum c = checksum(); + long v = c.getValue(); + c.reset(); + return v; + } + + default int getValue32() + { + long v = checksum().getValue(); + if (Long.numberOfLeadingZeros(v) < 32) + throw new IllegalStateException("Checksum is larger than 32 bytes!"); + return (int) v; + } + + default int getValue32AndResetChecksum() + { + Checksum c = checksum(); + long v = c.getValue(); + if (Long.numberOfLeadingZeros(v) < 32) + throw new IllegalStateException("Checksum is larger than 32 bytes!"); + c.reset(); + return (int) v; + } + + default void resetChecksum() + { + checksum().reset(); + } +} diff --git a/src/java/org/apache/cassandra/io/util/ChecksumedDataInputPlus.java b/src/java/org/apache/cassandra/io/util/ChecksumedDataInputPlus.java new file mode 100644 index 000000000000..8b1d701d4af3 --- /dev/null +++ b/src/java/org/apache/cassandra/io/util/ChecksumedDataInputPlus.java @@ -0,0 +1,105 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.io.util; + +import java.io.IOException; +import java.util.function.Supplier; +import java.util.zip.Checksum; + +public class ChecksumedDataInputPlus implements DataInputPlus, Checksumed +{ + private final DataInputPlus delegate; + private final Checksum checksum; + + public ChecksumedDataInputPlus(DataInputPlus delegate, Checksum checksum) + { + this.delegate = delegate; + this.checksum = checksum; + } + + public ChecksumedDataInputPlus(DataInputPlus delegate, Supplier fn) + { + this(delegate, fn.get()); + } + + public DataInputPlus delegate() + { + return delegate; + } + + @Override + public Checksum checksum() + { + return checksum; + } + + @Override + public void readFully(byte[] b) throws IOException + { + delegate().readFully(b); + checksum.update(b); + } + + @Override + public void readFully(byte[] b, int off, int len) throws IOException + { + delegate().readFully(b, off, len); + checksum.update(b, off, len); + } + + @Override + public int skipBytes(int n) throws IOException + { + int skipped = delegate().skipBytes(n); + checksum.reset(); + return skipped; + } + + @Override + public int readUnsignedByte() throws IOException + { + int value = delegate().readUnsignedByte(); + checksum.update(value); + return value; + } + + private byte writeBuffer[] = new byte[8]; + + @Override + public long readLong() throws IOException + { + long v = delegate().readLong(); + writeBuffer[0] = (byte)(v >>> 56); + writeBuffer[1] = (byte)(v >>> 48); + writeBuffer[2] = (byte)(v >>> 40); + writeBuffer[3] = (byte)(v >>> 32); + writeBuffer[4] = (byte)(v >>> 24); + writeBuffer[5] = (byte)(v >>> 16); + writeBuffer[6] = (byte)(v >>> 8); + writeBuffer[7] = (byte)(v >>> 0); + checksum.update(writeBuffer); + return v; + } + + @Override + public String readLine() throws IOException + { + throw new UnsupportedOperationException(); + } +} diff --git a/src/java/org/apache/cassandra/io/util/ChecksumedDataOutputPlus.java b/src/java/org/apache/cassandra/io/util/ChecksumedDataOutputPlus.java new file mode 100644 index 000000000000..83f571e4761a --- /dev/null +++ b/src/java/org/apache/cassandra/io/util/ChecksumedDataOutputPlus.java @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.io.util; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.function.Supplier; +import java.util.zip.Checksum; + +public class ChecksumedDataOutputPlus implements DataOutputPlus, Checksumed +{ + private final DataOutputPlus delegate; + private final Checksum checksum; + + public ChecksumedDataOutputPlus(DataOutputPlus delegate, Checksum checksum) + { + this.delegate = delegate; + this.checksum = checksum; + } + + public ChecksumedDataOutputPlus(DataOutputPlus delegate, Supplier fn) + { + this(delegate, fn.get()); + } + + public DataOutputPlus delegate() + { + return delegate; + } + + @Override + public Checksum checksum() + { + return checksum; + } + + @Override + public void write(ByteBuffer buffer) throws IOException + { + checksum.update(buffer.duplicate()); + delegate().write(buffer); + } + + @Override + public void write(int b) throws IOException + { + checksum.update(b); + delegate().write(b); + } + + @Override + public void write(byte[] b) throws IOException + { + checksum.update(b); + delegate().write(b); + } + + @Override + public void write(byte[] b, int off, int len) throws IOException + { + checksum.update(b, off, len); + delegate().write(b, off, len); + } + + @Override + public void writeByte(int v) throws IOException + { + checksum.update(v); + delegate().writeByte(v); + } + + private byte writeBuffer[] = new byte[8]; + + @Override + public void writeLong(long v) throws IOException + { + writeBuffer[0] = (byte)(v >>> 56); + writeBuffer[1] = (byte)(v >>> 48); + writeBuffer[2] = (byte)(v >>> 40); + writeBuffer[3] = (byte)(v >>> 32); + writeBuffer[4] = (byte)(v >>> 24); + writeBuffer[5] = (byte)(v >>> 16); + writeBuffer[6] = (byte)(v >>> 8); + writeBuffer[7] = (byte)(v >>> 0); + checksum.update(writeBuffer); + delegate().writeLong(v); + } + + @Override + public void writeUTF(String s) throws IOException + { + throw new UnsupportedOperationException("TODO"); + } +} diff --git a/src/java/org/apache/cassandra/io/util/ChecksumedFileDataInput.java b/src/java/org/apache/cassandra/io/util/ChecksumedFileDataInput.java new file mode 100644 index 000000000000..5bdb0b564461 --- /dev/null +++ b/src/java/org/apache/cassandra/io/util/ChecksumedFileDataInput.java @@ -0,0 +1,100 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.io.util; + +import java.io.IOException; +import java.util.function.Supplier; +import java.util.zip.Checksum; + +public class ChecksumedFileDataInput extends ChecksumedDataInputPlus implements FileDataInput +{ + public ChecksumedFileDataInput(FileDataInput delegate, Checksum checksum) + { + super(delegate, checksum); + } + + public ChecksumedFileDataInput(FileDataInput delegate, Supplier fn) + { + super(delegate, fn); + } + + @Override + public FileDataInput delegate() + { + return (FileDataInput) super.delegate(); + } + + @Override + public String getPath() + { + return delegate().getPath(); + } + + @Override + public boolean isEOF() throws IOException + { + return delegate().isEOF(); + } + + @Override + public long bytesRemaining() throws IOException + { + return delegate().bytesRemaining(); + } + + @Override + public void seek(long pos) throws IOException + { + resetChecksum(); + delegate().seek(pos); + } + + @Override + public long getFilePointer() + { + return delegate().getFilePointer(); + } + + @Override + public void close() throws IOException + { + resetChecksum(); + delegate().close(); + } + + @Override + public DataPosition mark() + { + resetChecksum(); + return delegate().mark(); + } + + @Override + public void reset(DataPosition mark) throws IOException + { + resetChecksum(); + delegate().reset(mark); + } + + @Override + public long bytesPastMark(DataPosition mark) + { + return delegate().bytesPastMark(mark); + } +} diff --git a/src/java/org/apache/cassandra/io/util/ChecksumedRandomAccessReader.java b/src/java/org/apache/cassandra/io/util/ChecksumedRandomAccessReader.java new file mode 100644 index 000000000000..5b3b820740f8 --- /dev/null +++ b/src/java/org/apache/cassandra/io/util/ChecksumedRandomAccessReader.java @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.io.util; + +import java.util.function.Supplier; +import java.util.zip.Checksum; + +public class ChecksumedRandomAccessReader extends ChecksumedFileDataInput +{ + public ChecksumedRandomAccessReader(RandomAccessReader delegate, Checksum checksum) + { + super(delegate, checksum); + } + + public ChecksumedRandomAccessReader(RandomAccessReader delegate, Supplier fn) + { + super(delegate, fn); + } + + @Override + public RandomAccessReader delegate() + { + return (RandomAccessReader) super.delegate(); + } + + public long length() + { + return delegate().length(); + } +} diff --git a/src/java/org/apache/cassandra/io/util/ChecksumedSequentialWriter.java b/src/java/org/apache/cassandra/io/util/ChecksumedSequentialWriter.java new file mode 100644 index 000000000000..e14d99c45f2b --- /dev/null +++ b/src/java/org/apache/cassandra/io/util/ChecksumedSequentialWriter.java @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.io.util; + +import java.io.IOException; +import java.util.function.Supplier; +import java.util.zip.Checksum; + +import org.apache.cassandra.utils.concurrent.Transactional; + +public class ChecksumedSequentialWriter extends ChecksumedDataOutputPlus implements Transactional +{ + private static final SequentialWriterOption DEFAULT_WRITER_OPTIONS = SequentialWriterOption.newBuilder().finishOnClose(true).build(); + + public ChecksumedSequentialWriter(SequentialWriter delegate, Checksum checksum) + { + super(delegate, checksum); + } + + public ChecksumedSequentialWriter(SequentialWriter delegate, Supplier fn) + { + super(delegate, fn); + } + + public static ChecksumedSequentialWriter open(File file, boolean append, Supplier fn) throws IOException + { + SequentialWriter writer = new SequentialWriter(file, DEFAULT_WRITER_OPTIONS); + if (append) + writer.skipBytes(file.length()); + return new ChecksumedSequentialWriter(writer, fn); + } + + @Override + public SequentialWriter delegate() + { + return (SequentialWriter) super.delegate(); + } + + public long getFilePointer() + { + return delegate().position(); + } + + @Override + public Throwable commit(Throwable accumulate) + { + return delegate().commit(accumulate); + } + + @Override + public Throwable abort(Throwable accumulate) + { + return delegate().abort(accumulate); + } + + @Override + public void prepareToCommit() + { + delegate().prepareToCommit(); + } + + @Override + public void close() + { + delegate().close(); + } +} diff --git a/src/java/org/apache/cassandra/io/util/DataInputPlus.java b/src/java/org/apache/cassandra/io/util/DataInputPlus.java index d117c7fe894c..cc44b3c6c66c 100644 --- a/src/java/org/apache/cassandra/io/util/DataInputPlus.java +++ b/src/java/org/apache/cassandra/io/util/DataInputPlus.java @@ -18,6 +18,7 @@ package org.apache.cassandra.io.util; import java.io.DataInput; +import java.io.DataInputStream; import java.io.EOFException; import java.io.IOException; import java.io.InputStream; @@ -83,6 +84,33 @@ default int readUnsignedVInt32() throws IOException return VIntCoding.readUnsignedVInt32(this); } + default long readLeastSignificantBytes(int bytes) throws IOException + { + switch (bytes) + { + case 0: return 0; + case 1: return readByte() & 0xffL; + case 2: return readShort() & 0xffffL; + case 3: + return ((readShort() & 0xffffL) << 8) + | (readByte() & 0xffL); + case 4: + return (readInt() & 0xffffffffL); + case 5: + return ((readInt() & 0xffffffffL) << 8) + | (readByte() & 0xffL); + case 6: + return ((readInt() & 0xffffffffL) << 16) + | (readShort() & 0xffffL); + case 7: + return ((readInt() & 0xffffffffL) << 24) + | ((readShort() & 0xffffL) << 8) + | (readByte() & 0xffL); + case 8: return readLong(); + default: throw new IllegalArgumentException(); + } + } + /** * Always skips the requested number of bytes, unless EOF is reached * @@ -98,6 +126,81 @@ public default void skipBytesFully(int n) throws IOException throw new EOFException("EOF after " + skipped + " bytes out of " + n); } + @Override + default byte readByte() throws IOException + { + return (byte) readUnsignedByte(); + } + + @Override + default boolean readBoolean() throws IOException + { + return readUnsignedByte() != 0; + } + + @Override + default short readShort() throws IOException + { + int ch1 = readUnsignedByte(); + int ch2 = readUnsignedByte(); + if ((ch1 | ch2) < 0) + throw new EOFException(); + return (short)((ch1 << 8) + (ch2 << 0)); + } + + @Override + default int readUnsignedShort() throws IOException + { + int ch1 = readUnsignedByte(); + int ch2 = readUnsignedByte(); + if ((ch1 | ch2) < 0) + throw new EOFException(); + return (ch1 << 8) + (ch2 << 0); + } + + @Override + default char readChar() throws IOException + { + int ch1 = readUnsignedByte(); + int ch2 = readUnsignedByte(); + if ((ch1 | ch2) < 0) + throw new EOFException(); + return (char)((ch1 << 8) + (ch2 << 0)); + } + + @Override + default int readInt() throws IOException + { + int ch1 = readUnsignedByte(); + int ch2 = readUnsignedByte(); + int ch3 = readUnsignedByte(); + int ch4 = readUnsignedByte(); + if ((ch1 | ch2 | ch3 | ch4) < 0) + throw new EOFException(); + return ((ch1 << 24) + (ch2 << 16) + (ch3 << 8) + (ch4 << 0)); + } + + @Override + long readLong() throws IOException; + + @Override + default float readFloat() throws IOException + { + return Float.intBitsToFloat(readInt()); + } + + @Override + default double readDouble() throws IOException + { + return Double.longBitsToDouble(readLong()); + } + + @Override + default String readUTF() throws IOException + { + return DataInputStream.readUTF(this); + } + /** * Wrapper around an InputStream that provides no buffering but can decode varints */ diff --git a/src/java/org/apache/cassandra/io/util/DataOutputBuffer.java b/src/java/org/apache/cassandra/io/util/DataOutputBuffer.java index 3cb5db0f00c6..837e10c1b6c9 100644 --- a/src/java/org/apache/cassandra/io/util/DataOutputBuffer.java +++ b/src/java/org/apache/cassandra/io/util/DataOutputBuffer.java @@ -60,7 +60,7 @@ private enum AllocationType { DIRECT, ONHEAP } * Scratch buffers used mostly for serializing in memory. It's important to call #close() when finished * to keep the memory overhead from being too large in the system. */ - public static final FastThreadLocal scratchBuffer = new FastThreadLocal() + public static final FastThreadLocal scratchBuffer = new FastThreadLocal<>() { @Override protected DataOutputBuffer initialValue() diff --git a/src/java/org/apache/cassandra/io/util/DataOutputPlus.java b/src/java/org/apache/cassandra/io/util/DataOutputPlus.java index f8bc95953164..3c66424065ac 100644 --- a/src/java/org/apache/cassandra/io/util/DataOutputPlus.java +++ b/src/java/org/apache/cassandra/io/util/DataOutputPlus.java @@ -18,6 +18,7 @@ package org.apache.cassandra.io.util; import org.apache.cassandra.utils.Shared; +import org.apache.cassandra.utils.memory.MemoryUtil; import org.apache.cassandra.utils.vint.VIntCoding; import java.io.DataOutput; @@ -44,6 +45,11 @@ default void write(ReadableMemory memory, long offset, long length) throws IOExc write(buffer); } + default void writeMemory(long address, int length) throws IOException + { + write(MemoryUtil.getByteBuffer(address, length)); + } + default void writeVInt(long i) throws IOException { VIntCoding.writeVInt(i, this); @@ -130,7 +136,69 @@ default void writeMostSignificantBytes(long register, int bytes) throws IOExcept default: throw new IllegalArgumentException(); } + } + private static int numberOfBytes(long value) + { + return (64 + 7 - Long.numberOfLeadingZeros(value)) / 8; + } + + /** + * An efficient way to write the type {@code bytes} of a long + * + * @param register - the long value to be written + * @throws IOException + */ + default void writeLeastSignificantBytes(long register) throws IOException + { + writeLeastSignificantBytes(register, numberOfBytes(register)); + } + + /** + * An efficient way to write the type {@code bytes} of a long + * + * @param register - the long value to be written + * @param bytes - the number of bytes the register occupies. Valid values are between 0 and 8 inclusive. + * @throws IOException + */ + default void writeLeastSignificantBytes(long register, int bytes) throws IOException + { + switch (bytes) + { + case 0: + break; + case 1: + writeByte((int)register); + break; + case 2: + writeShort((int)register); + break; + case 3: + writeShort((int)(register >> 8)); + writeByte((int)register); + break; + case 4: + writeInt((int)register); + break; + case 5: + writeInt((int)(register >> 8)); + writeByte((int)register); + break; + case 6: + writeInt((int)(register >> 16)); + writeShort((int)register); + break; + case 7: + writeInt((int)(register >> 24)); + writeShort((int)(register >> 8)); + writeByte((int)register); + break; + case 8: + writeLong(register); + break; + default: + throw new IllegalArgumentException(); + } } /** @@ -194,4 +262,65 @@ default long paddedPosition() { throw new UnsupportedOperationException(); } + + @Override + default void writeBoolean(boolean v) throws IOException + { + write(v ? 1 : 0); + } + + @Override + default void writeShort(int v) throws IOException + { + write((v >>> 8) & 0xFF); + write((v >>> 0) & 0xFF); + } + + @Override + default void writeChar(int v) throws IOException + { + write((v >>> 8) & 0xFF); + write((v >>> 0) & 0xFF); + } + + @Override + default void writeInt(int v) throws IOException + { + write((v >>> 24) & 0xFF); + write((v >>> 16) & 0xFF); + write((v >>> 8) & 0xFF); + write((v >>> 0) & 0xFF); + } + + @Override + default void writeFloat(float v) throws IOException + { + writeInt(Float.floatToIntBits(v)); + } + + @Override + default void writeDouble(double v) throws IOException + { + writeLong(Double.doubleToLongBits(v)); + } + + @Override + default void writeBytes(String s) throws IOException + { + int len = s.length(); + for (int i = 0 ; i < len ; i++) { + write((byte)s.charAt(i)); + } + } + + @Override + default void writeChars(String s) throws IOException + { + int len = s.length(); + for (int i = 0 ; i < len ; i++) { + int v = s.charAt(i); + write((v >>> 8) & 0xFF); + write((v >>> 0) & 0xFF); + } + } } \ No newline at end of file diff --git a/src/java/org/apache/cassandra/io/util/File.java b/src/java/org/apache/cassandra/io/util/File.java index de415388ed9e..e814e37b741a 100644 --- a/src/java/org/apache/cassandra/io/util/File.java +++ b/src/java/org/apache/cassandra/io/util/File.java @@ -225,6 +225,15 @@ public void deleteRecursive() PathUtils.deleteRecursive(toPathForWrite()); } + /** + * Deletes all files and subdirectories under "dir". + * @return false if the root cannot be deleted + */ + public boolean tryDeleteRecursive() + { + return PathUtils.tryDeleteRecursive(toPathForWrite()); + } + /** * Try to delete the file on process exit. */ diff --git a/src/java/org/apache/cassandra/io/util/FileHandle.java b/src/java/org/apache/cassandra/io/util/FileHandle.java index 67bfd239d61e..7e4b214128ac 100644 --- a/src/java/org/apache/cassandra/io/util/FileHandle.java +++ b/src/java/org/apache/cassandra/io/util/FileHandle.java @@ -406,14 +406,14 @@ else if (mmapped) { if (compressionMetadata != null) { - regions = mmappedRegionsCache != null ? mmappedRegionsCache.getOrCreate(channel, compressionMetadata) + regions = mmappedRegionsCache != null ? mmappedRegionsCache.getOrCreate(channel, compressionMetadata, bufferSize) : MmappedRegions.map(channel, compressionMetadata); rebuffererFactory = maybeCached(new CompressedChunkReader.Mmap(channel, compressionMetadata, regions, crcCheckChanceSupplier)); } else { - regions = mmappedRegionsCache != null ? mmappedRegionsCache.getOrCreate(channel, length) - : MmappedRegions.map(channel, length); + regions = mmappedRegionsCache != null ? mmappedRegionsCache.getOrCreate(channel, length, bufferSize) + : MmappedRegions.map(channel, length, bufferSize); rebuffererFactory = new MmapRebufferer(channel, length, regions); } } diff --git a/src/java/org/apache/cassandra/io/util/FileUtils.java b/src/java/org/apache/cassandra/io/util/FileUtils.java index d5ea8dcde1be..c8c605d7da4b 100644 --- a/src/java/org/apache/cassandra/io/util/FileUtils.java +++ b/src/java/org/apache/cassandra/io/util/FileUtils.java @@ -166,7 +166,7 @@ public static void createHardLink(String from, String to) public static void createHardLink(File from, File to) { if (to.exists()) - throw new RuntimeException("Tried to create duplicate hard link to " + to); + throw new DuplicateHardlinkException("Tried to create duplicate hard link from " + from + " to " + to); if (!from.exists()) throw new RuntimeException("Tried to hard link to file that does not exist " + from); @@ -195,6 +195,10 @@ public static void createHardLinkWithConfirm(File from, File to) { throw ex; } + catch (DuplicateHardlinkException ex) + { + throw new RuntimeException(ex.getMessage()); + } catch (Throwable t) { throw new RuntimeException(String.format("Unable to hardlink from %s to %s", from, to), t); @@ -818,4 +822,12 @@ public static int getBlockSize(File directory) f.tryDelete(); } } + + public static class DuplicateHardlinkException extends RuntimeException + { + public DuplicateHardlinkException(String message) + { + super(message); + } + } } \ No newline at end of file diff --git a/src/java/org/apache/cassandra/io/util/Memory.java b/src/java/org/apache/cassandra/io/util/Memory.java index 1d1fca2edf96..512acd59d7a7 100644 --- a/src/java/org/apache/cassandra/io/util/Memory.java +++ b/src/java/org/apache/cassandra/io/util/Memory.java @@ -22,14 +22,15 @@ import net.nicoulaj.compilecommand.annotations.Inline; -import org.apache.cassandra.utils.Architecture; import org.apache.cassandra.utils.FastByteOperations; import org.apache.cassandra.utils.concurrent.Ref; +import org.apache.cassandra.utils.memory.LittleEndianMemoryUtil; import org.apache.cassandra.utils.memory.MemoryUtil; import sun.misc.Unsafe; /** * An off-heap region of memory that must be manually free'd when no longer needed. + * It uses Little Endian (LE). */ public class Memory implements AutoCloseable, ReadableMemory { @@ -81,7 +82,7 @@ public static Memory allocate(long bytes) if (bytes < 0) throw new IllegalArgumentException(); - if (Ref.DEBUG_ENABLED) + if (Ref.TRACE_ENABLED) return new SafeMemory(bytes); return new Memory(bytes); @@ -90,7 +91,7 @@ public static Memory allocate(long bytes) public void setByte(long offset, byte b) { checkBounds(offset, offset + 1); - unsafe.putByte(peer + offset, b); + LittleEndianMemoryUtil.setByte(peer + offset, b); } public void setMemory(long offset, long bytes, byte b) @@ -103,86 +104,13 @@ public void setMemory(long offset, long bytes, byte b) public void setLong(long offset, long l) { checkBounds(offset, offset + 8); - if (Architecture.IS_UNALIGNED) - unsafe.putLong(peer + offset, Architecture.BIG_ENDIAN ? Long.reverseBytes(l) : l); - else - putLongByByte(peer + offset, l); - } - - private void putLongByByte(long address, long value) - { - if (Architecture.BIG_ENDIAN) - { - unsafe.putByte(address, (byte) (value >> 56)); - unsafe.putByte(address + 1, (byte) (value >> 48)); - unsafe.putByte(address + 2, (byte) (value >> 40)); - unsafe.putByte(address + 3, (byte) (value >> 32)); - unsafe.putByte(address + 4, (byte) (value >> 24)); - unsafe.putByte(address + 5, (byte) (value >> 16)); - unsafe.putByte(address + 6, (byte) (value >> 8)); - unsafe.putByte(address + 7, (byte) (value)); - } - else - { - unsafe.putByte(address + 7, (byte) (value >> 56)); - unsafe.putByte(address + 6, (byte) (value >> 48)); - unsafe.putByte(address + 5, (byte) (value >> 40)); - unsafe.putByte(address + 4, (byte) (value >> 32)); - unsafe.putByte(address + 3, (byte) (value >> 24)); - unsafe.putByte(address + 2, (byte) (value >> 16)); - unsafe.putByte(address + 1, (byte) (value >> 8)); - unsafe.putByte(address, (byte) (value)); - } + LittleEndianMemoryUtil.setLong(peer + offset, l); } public void setInt(long offset, int l) { checkBounds(offset, offset + 4); - if (Architecture.IS_UNALIGNED) - unsafe.putInt(peer + offset, Architecture.BIG_ENDIAN ? Integer.reverseBytes(l) : l); - else - putIntByByte(peer + offset, l); - } - - private void putIntByByte(long address, int value) - { - if (Architecture.BIG_ENDIAN) - { - unsafe.putByte(address, (byte) (value >> 24)); - unsafe.putByte(address + 1, (byte) (value >> 16)); - unsafe.putByte(address + 2, (byte) (value >> 8)); - unsafe.putByte(address + 3, (byte) (value)); - } - else - { - unsafe.putByte(address + 3, (byte) (value >> 24)); - unsafe.putByte(address + 2, (byte) (value >> 16)); - unsafe.putByte(address + 1, (byte) (value >> 8)); - unsafe.putByte(address, (byte) (value)); - } - } - - public void setShort(long offset, short l) - { - checkBounds(offset, offset + 2); - if (Architecture.IS_UNALIGNED) - unsafe.putShort(peer + offset, Architecture.BIG_ENDIAN ? Short.reverseBytes(l) : l); - else - putShortByByte(peer + offset, l); - } - - private void putShortByByte(long address, short value) - { - if (Architecture.BIG_ENDIAN) - { - unsafe.putByte(address, (byte) (value >> 8)); - unsafe.putByte(address + 1, (byte) (value)); - } - else - { - unsafe.putByte(address + 1, (byte) (value >> 8)); - unsafe.putByte(address, (byte) (value)); - } + LittleEndianMemoryUtil.setInt(peer + offset, l); } public void setBytes(long memoryOffset, ByteBuffer buffer) @@ -230,69 +158,19 @@ else if (count == 0) public byte getByte(long offset) { checkBounds(offset, offset + 1); - return unsafe.getByte(peer + offset); + return LittleEndianMemoryUtil.getByte(peer + offset); } public long getLong(long offset) { checkBounds(offset, offset + 8); - if (Architecture.IS_UNALIGNED) - return Architecture.BIG_ENDIAN ? Long.reverseBytes(unsafe.getLong(peer+offset)) : unsafe.getLong(peer+offset); - else - return getLongByByte(peer + offset); - } - - private long getLongByByte(long address) - { - if (Architecture.BIG_ENDIAN) - { - return (((long) unsafe.getByte(address ) ) << 56) | - (((long) unsafe.getByte(address + 1) & 0xff) << 48) | - (((long) unsafe.getByte(address + 2) & 0xff) << 40) | - (((long) unsafe.getByte(address + 3) & 0xff) << 32) | - (((long) unsafe.getByte(address + 4) & 0xff) << 24) | - (((long) unsafe.getByte(address + 5) & 0xff) << 16) | - (((long) unsafe.getByte(address + 6) & 0xff) << 8) | - (((long) unsafe.getByte(address + 7) & 0xff) ); - } - else - { - return (((long) unsafe.getByte(address + 7) ) << 56) | - (((long) unsafe.getByte(address + 6) & 0xff) << 48) | - (((long) unsafe.getByte(address + 5) & 0xff) << 40) | - (((long) unsafe.getByte(address + 4) & 0xff) << 32) | - (((long) unsafe.getByte(address + 3) & 0xff) << 24) | - (((long) unsafe.getByte(address + 2) & 0xff) << 16) | - (((long) unsafe.getByte(address + 1) & 0xff) << 8) | - (((long) unsafe.getByte(address ) & 0xff) ); - } + return LittleEndianMemoryUtil.getLong(peer + offset); } public int getInt(long offset) { checkBounds(offset, offset + 4); - if (Architecture.IS_UNALIGNED) - return Architecture.BIG_ENDIAN ? Integer.reverseBytes(unsafe.getInt(peer+offset)) : unsafe.getInt(peer+offset); - else - return getIntByByte(peer + offset); - } - - private int getIntByByte(long address) - { - if (Architecture.BIG_ENDIAN) - { - return ((unsafe.getByte(address ) ) << 24) | - ((unsafe.getByte(address + 1) & 0xff) << 16) | - ((unsafe.getByte(address + 2) & 0xff) << 8 ) | - ((unsafe.getByte(address + 3) & 0xff) ); - } - else - { - return ((unsafe.getByte(address + 3) ) << 24) | - ((unsafe.getByte(address + 2) & 0xff) << 16) | - ((unsafe.getByte(address + 1) & 0xff) << 8) | - ((unsafe.getByte(address ) & 0xff) ); - } + return LittleEndianMemoryUtil.getInt(peer + offset); } /** @@ -378,18 +256,18 @@ public ByteBuffer[] asByteBuffers(long offset, long length) int size = (int) (size() / result.length); for (int i = 0 ; i < result.length - 1 ; i++) { - result[i] = MemoryUtil.getByteBuffer(peer + offset, size); + result[i] = LittleEndianMemoryUtil.getByteBuffer(peer + offset, size); offset += size; length -= size; } - result[result.length - 1] = MemoryUtil.getByteBuffer(peer + offset, (int) length); + result[result.length - 1] = LittleEndianMemoryUtil.getByteBuffer(peer + offset, (int) length); return result; } public ByteBuffer asByteBuffer(long offset, int length) { checkBounds(offset, offset + length); - return MemoryUtil.getByteBuffer(peer + offset, length); + return LittleEndianMemoryUtil.getByteBuffer(peer + offset, length); } // MUST provide a buffer created via MemoryUtil.getHollowDirectByteBuffer() diff --git a/src/java/org/apache/cassandra/io/util/MmappedRegions.java b/src/java/org/apache/cassandra/io/util/MmappedRegions.java index 0ab07b8d0f74..578217e279a4 100644 --- a/src/java/org/apache/cassandra/io/util/MmappedRegions.java +++ b/src/java/org/apache/cassandra/io/util/MmappedRegions.java @@ -65,27 +65,22 @@ public class MmappedRegions extends SharedCloseableImpl */ private volatile State copy; - private MmappedRegions(ChannelProxy channel, CompressionMetadata metadata, long length) - { - this(new State(channel), metadata, length); - } - - private MmappedRegions(State state, CompressionMetadata metadata, long length) + private MmappedRegions(State state, long length, int chunkSize) { super(new Tidier(state)); - this.state = state; - - if (metadata != null) - { - assert length == 0 : "expected no length with metadata"; - updateState(metadata); - } - else if (length > 0) + if (length > 0) { - updateState(length); + updateState(length, chunkSize); } + this.copy = new State(state); + } + private MmappedRegions(State state, CompressionMetadata metadata) + { + super(new Tidier(state)); + this.state = state; + updateState(metadata); this.copy = new State(state); } @@ -97,7 +92,7 @@ private MmappedRegions(MmappedRegions original) public static MmappedRegions empty(ChannelProxy channel) { - return new MmappedRegions(channel, null, 0); + return new MmappedRegions(new State(channel), 0, 0); } /** @@ -109,16 +104,16 @@ public static MmappedRegions map(ChannelProxy channel, CompressionMetadata metad { if (metadata == null) throw new IllegalArgumentException("metadata cannot be null"); - - return new MmappedRegions(channel, metadata, 0); + State state = new State(channel); + return new MmappedRegions(state, metadata); } - public static MmappedRegions map(ChannelProxy channel, long length) + public static MmappedRegions map(ChannelProxy channel, long length, int chunkSize) { if (length <= 0) throw new IllegalArgumentException("Length must be positive"); - - return new MmappedRegions(channel, null, length); + State state = new State(channel); + return new MmappedRegions(state, length, chunkSize); } /** @@ -140,8 +135,10 @@ private boolean isCopy() * * @return {@code true} if new regions have been created */ - public boolean extend(long length) + public boolean extend(long length, int chunkSize) { + // We cannot enforce length to be a multiple of chunkSize (at the very least the last extend on a file + // will not satisfy this), so we hope the caller knows what they are doing. if (length < 0) throw new IllegalArgumentException("Length must not be negative"); @@ -151,7 +148,7 @@ public boolean extend(long length) return false; int initialRegions = state.last; - updateState(length); + updateState(length, chunkSize); copy = new State(state); return state.last > initialRegions; } @@ -162,7 +159,7 @@ public boolean extend(long length) * * @return {@code true} if new regions have been created */ - public boolean extend(CompressionMetadata compressionMetadata) + public boolean extend(CompressionMetadata compressionMetadata, int chunkSize) { assert !isCopy() : "Copies cannot be extended"; @@ -171,7 +168,7 @@ public boolean extend(CompressionMetadata compressionMetadata) int initialRegions = state.last; if (compressionMetadata.compressedFileLength - state.length <= MAX_SEGMENT_SIZE) - updateState(compressionMetadata.compressedFileLength); + updateState(compressionMetadata.compressedFileLength, chunkSize); else updateState(compressionMetadata); @@ -183,13 +180,15 @@ public boolean extend(CompressionMetadata compressionMetadata) * Updates state by adding the remaining segments. It starts with the current state last segment end position and * subsequently add new segments until all data up to the provided length are mapped. */ - private void updateState(long length) + private void updateState(long length, int chunkSize) { + // make sure the regions span whole chunks + long maxSize = (long) (MAX_SEGMENT_SIZE / chunkSize) * chunkSize; state.length = length; long pos = state.getPosition(); while (pos < length) { - long size = Math.min(MAX_SEGMENT_SIZE, length - pos); + long size = Math.min(maxSize, length - pos); state.add(pos, size); pos += size; } diff --git a/src/java/org/apache/cassandra/io/util/MmappedRegionsCache.java b/src/java/org/apache/cassandra/io/util/MmappedRegionsCache.java index dff9561f4f7d..e3ebc34609d1 100644 --- a/src/java/org/apache/cassandra/io/util/MmappedRegionsCache.java +++ b/src/java/org/apache/cassandra/io/util/MmappedRegionsCache.java @@ -45,12 +45,12 @@ public class MmappedRegionsCache implements AutoCloseable * @param length length of the file * @return a shared copy of the cached mmapped regions */ - public MmappedRegions getOrCreate(ChannelProxy channel, long length) + public MmappedRegions getOrCreate(ChannelProxy channel, long length, int bufferSize) { Preconditions.checkState(!closed); - MmappedRegions regions = cache.computeIfAbsent(channel.file(), ignored -> MmappedRegions.map(channel, length)); + MmappedRegions regions = cache.computeIfAbsent(channel.file(), ignored -> MmappedRegions.map(channel, length, bufferSize)); Preconditions.checkArgument(regions.isValid(channel)); - regions.extend(length); + regions.extend(length, bufferSize); return regions.sharedCopy(); } @@ -62,12 +62,12 @@ public MmappedRegions getOrCreate(ChannelProxy channel, long length) * @param metadata compression metadata of the file * @return a shared copy of the cached mmapped regions */ - public MmappedRegions getOrCreate(ChannelProxy channel, CompressionMetadata metadata) + public MmappedRegions getOrCreate(ChannelProxy channel, CompressionMetadata metadata, int bufferSize) { Preconditions.checkState(!closed); MmappedRegions regions = cache.computeIfAbsent(channel.file(), ignored -> MmappedRegions.map(channel, metadata)); Preconditions.checkArgument(regions.isValid(channel)); - regions.extend(metadata); + regions.extend(metadata, bufferSize); return regions.sharedCopy(); } diff --git a/src/java/org/apache/cassandra/io/util/PathUtils.java b/src/java/org/apache/cassandra/io/util/PathUtils.java index 8ddd939b4c09..fa0a91543e46 100644 --- a/src/java/org/apache/cassandra/io/util/PathUtils.java +++ b/src/java/org/apache/cassandra/io/util/PathUtils.java @@ -346,11 +346,22 @@ public static Throwable delete(Path file, Throwable accumulate, @Nullable RateLi private static void deleteRecursiveUsingNixCommand(Path path, boolean quietly) { String [] cmd = new String[]{ "rm", quietly ? "-rdf" : "-rd", path.toAbsolutePath().toString() }; + IOException failure = null; + if (!quietly && !Files.exists(path)) + failure = new NoSuchFileException(path.toString()); + + if (failure == null) + failure = tryDeleteRecursiveUsingNixCommand(path, quietly); + + if (failure != null) + throw propagateUnchecked(failure, path, true); + } + + private static IOException tryDeleteRecursiveUsingNixCommand(Path path, boolean quietly) + { + String[] cmd = new String[]{ "rm", quietly ? "-rdf" : "-rd", path.toAbsolutePath().toString() }; try { - if (!quietly && !Files.exists(path)) - throw new NoSuchFileException(path.toString()); - Process p = Runtime.getRuntime().exec(cmd); int result = p.waitFor(); @@ -363,24 +374,39 @@ private static void deleteRecursiveUsingNixCommand(Path path, boolean quietly) } if (result != 0 && Files.exists(path)) - { - logger.error("{} returned:\nstdout:\n{}\n\nstderr:\n{}", Arrays.toString(cmd), out, err); - throw new IOException(String.format("%s returned non-zero exit code: %d%nstdout:%n%s%n%nstderr:%n%s", Arrays.toString(cmd), result, out, err)); - } + return new IOException(String.format("%s returned non-zero exit code: %d%nstdout:%n%s%n%nstderr:%n%s", Arrays.toString(cmd), result, out, err)); onDeletion.accept(path); + return null; } catch (IOException e) { - throw propagateUnchecked(e, path, true); + return e; } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - throw new FSWriteError(e, path); + return new IOException("Interrupted while executing command " + Arrays.toString(cmd), e); } } + + /** + * Deletes all files and subdirectories under "path". + * @param path file to be deleted + * @return false if the root cannot be deleted + */ + public static boolean tryDeleteRecursive(Path path) + { + if (USE_NIX_RECURSIVE_DELETE.getBoolean() && path.getFileSystem() == java.nio.file.FileSystems.getDefault()) + return null == tryDeleteRecursiveUsingNixCommand(path, true); + + if (isDirectory(path)) + forEach(path, PathUtils::tryDeleteRecursive); + + // The directory should now be empty, so now it can be smoked + return tryDelete(path); + } + /** * Deletes all files and subdirectories under "path". * @param path file to be deleted diff --git a/src/java/org/apache/cassandra/io/util/RebufferingInputStream.java b/src/java/org/apache/cassandra/io/util/RebufferingInputStream.java index b7ae205e9661..9cc9032029b9 100644 --- a/src/java/org/apache/cassandra/io/util/RebufferingInputStream.java +++ b/src/java/org/apache/cassandra/io/util/RebufferingInputStream.java @@ -275,6 +275,18 @@ public long readUnsignedVInt() throws IOException return retval; } + @Override + public long readLeastSignificantBytes(int bytes) throws IOException + { + if (buffer.remaining() < 8) + return super.readLeastSignificantBytes(bytes); + + long retval = buffer.getLong(buffer.position()); + retval >>>= 64 - (bytes * 8); + buffer.position(buffer.position() + bytes); + return retval; + } + @Override public int readUnsignedVInt32() throws IOException { diff --git a/src/java/org/apache/cassandra/io/util/SequentialWriter.java b/src/java/org/apache/cassandra/io/util/SequentialWriter.java index 69643be98730..c3a90732eead 100644 --- a/src/java/org/apache/cassandra/io/util/SequentialWriter.java +++ b/src/java/org/apache/cassandra/io/util/SequentialWriter.java @@ -56,7 +56,7 @@ public class SequentialWriter extends BufferedDataOutputStreamPlus implements Tr // whether to do trickling fsync() to avoid sudden bursts of dirty buffer flushing by kernel causing read // latency spikes - private final SequentialWriterOption option; + protected final SequentialWriterOption option; private int bytesSinceTrickleFsync = 0; protected long lastFlushOffset; @@ -163,7 +163,12 @@ public SequentialWriter(File file, SequentialWriterOption option) */ public SequentialWriter(File file, SequentialWriterOption option, boolean strictFlushing) { - super(openChannel(file), option.allocateBuffer()); + this(file, option.allocateBuffer(), option, strictFlushing); + } + + protected SequentialWriter(File file, ByteBuffer buffer, SequentialWriterOption option, boolean strictFlushing) + { + super(openChannel(file), buffer); this.strictFlushing = strictFlushing; this.fchannel = (FileChannel)channel; diff --git a/src/java/org/apache/cassandra/io/util/ThreadLocalReadAheadBuffer.java b/src/java/org/apache/cassandra/io/util/ThreadLocalReadAheadBuffer.java index 824acaa8d88f..bc92407befaf 100644 --- a/src/java/org/apache/cassandra/io/util/ThreadLocalReadAheadBuffer.java +++ b/src/java/org/apache/cassandra/io/util/ThreadLocalReadAheadBuffer.java @@ -131,18 +131,21 @@ public int read(ByteBuffer dest, int length) public void clear(boolean deallocate) { - Block block = getBlock(); + // avoid calling block() here to reduce unintended allocations + Block block = blockMap.get().get(channel.filePath()); + if (block == null) + return; + block.index = -1; + if (block.buffer == null) + return; ByteBuffer blockBuffer = block.buffer; - if (blockBuffer != null) + blockBuffer.clear(); + if (deallocate) { - blockBuffer.clear(); - if (deallocate) - { - FileUtils.clean(blockBuffer); - block.buffer = null; - } + FileUtils.clean(blockBuffer); + block.buffer = null; } } diff --git a/src/java/org/apache/cassandra/io/util/TrackedDataOutputPlus.java b/src/java/org/apache/cassandra/io/util/TrackedDataOutputPlus.java new file mode 100644 index 000000000000..9613e11767c6 --- /dev/null +++ b/src/java/org/apache/cassandra/io/util/TrackedDataOutputPlus.java @@ -0,0 +1,188 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.io.util; + +import java.io.IOException; +import java.nio.ByteBuffer; + +import org.apache.cassandra.utils.vint.VIntCoding; + +public class TrackedDataOutputPlus implements DataOutputPlus +{ + private final DataOutputPlus out; + private int position = 0; + + private TrackedDataOutputPlus(DataOutputPlus out) + { + this.out = out; + } + + public static TrackedDataOutputPlus wrap(DataOutputPlus out) + { + return new TrackedDataOutputPlus(out); + } + + @Override + public void write(int b) throws IOException + { + out.write(b); + position += 1; + } + + @Override + public void write(byte[] b) throws IOException + { + out.write(b); + position += b.length; + } + + @Override + public void write(byte[] b, int off, int len) throws IOException + { + out.write(b, off, len); + position += len; + } + + @Override + public void writeBoolean(boolean v) throws IOException + { + out.writeBoolean(v); + position += 1; + } + + @Override + public void writeByte(int v) throws IOException + { + out.writeByte(v); + position += 1; + } + + @Override + public void writeShort(int v) throws IOException + { + out.writeShort(v); + position += 2; + } + + @Override + public void writeChar(int v) throws IOException + { + out.writeChar(v); + position += 2; + } + + @Override + public void writeInt(int v) throws IOException + { + out.writeInt(v); + position += 4; + } + + @Override + public void writeLong(long v) throws IOException + { + out.writeLong(v); + position += 8; + } + + @Override + public void writeFloat(float v) throws IOException + { + out.writeFloat(v); + position += 4; + } + + @Override + public void writeDouble(double v) throws IOException + { + out.writeDouble(v); + position += 8; + } + + @Override + public void writeBytes(String s) throws IOException + { + out.writeBytes(s); + position += s.length(); + } + + @Override + public void writeChars(String s) throws IOException + { + out.writeChars(s); + position += s.length() * 2; + } + + @Override + public void writeUTF(String s) throws IOException + { + UnbufferedDataOutputStreamPlus.writeUTF(s, this); + } + + @Override + public void write(ByteBuffer buffer) throws IOException + { + out.write(buffer); + position += buffer.remaining(); + } + + @Override + public void write(ReadableMemory memory, long offset, long length) throws IOException + { + out.write(memory, offset, length); + position += length; + } + + @Override + public void writeVInt(long i) throws IOException + { + VIntCoding.writeVInt(i, this); + } + + @Override + public void writeUnsignedVInt(long i) throws IOException + { + VIntCoding.writeUnsignedVInt(i, this); + } + + @Override + public void writeMostSignificantBytes(long register, int bytes) throws IOException + { + out.writeMostSignificantBytes(register, bytes); + position += bytes; + } + + @Override + public void writeLeastSignificantBytes(long register, int bytes) throws IOException + { + out.writeLeastSignificantBytes(register, bytes); + position += bytes; + } + + @Override + public long position() + { + return position; + } + + @Override + public boolean hasPosition() + { + return true; + } +} diff --git a/src/java/org/apache/cassandra/journal/ActiveSegment.java b/src/java/org/apache/cassandra/journal/ActiveSegment.java new file mode 100644 index 000000000000..fb5245de9b14 --- /dev/null +++ b/src/java/org/apache/cassandra/journal/ActiveSegment.java @@ -0,0 +1,522 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.journal; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.MappedByteBuffer; +import java.nio.channels.FileChannel; +import java.nio.file.StandardOpenOption; +import java.util.concurrent.atomic.AtomicIntegerFieldUpdater; +import java.util.concurrent.atomic.AtomicLongFieldUpdater; +import java.util.concurrent.locks.LockSupport; + +import com.codahale.metrics.Timer; +import org.apache.cassandra.db.TypeSizes; +import org.apache.cassandra.io.util.*; +import org.apache.cassandra.utils.*; +import org.apache.cassandra.utils.concurrent.OpOrder; +import org.apache.cassandra.utils.concurrent.Ref; +import org.apache.cassandra.utils.concurrent.WaitQueue; + +import static org.apache.cassandra.utils.Simulate.With.MONITORS; + +@Simulate(with=MONITORS) +public final class ActiveSegment extends Segment +{ + final FileChannel channel; + + // OpOrder used to order appends wrt flush + private final OpOrder appendOrder = new OpOrder(); + + // position in the buffer we are allocating from + private volatile long allocateOffset = 0; + private static final AtomicLongFieldUpdater allocateOffsetUpdater = AtomicLongFieldUpdater.newUpdater(ActiveSegment.class, "allocateOffset"); + + /* + * Everything before this offset has been written and flushed. + */ + private volatile int writtenTo = 0; + private volatile int fsyncedTo = 0; + @SuppressWarnings("rawtypes") + private static final AtomicIntegerFieldUpdater writtenToUpdater = AtomicIntegerFieldUpdater.newUpdater(ActiveSegment.class, "writtenTo"); + @SuppressWarnings("rawtypes") + private static final AtomicIntegerFieldUpdater fsyncedToUpdater = AtomicIntegerFieldUpdater.newUpdater(ActiveSegment.class, "fsyncedTo"); + + /* + * End position of the buffer; initially set to its capacity and + * updated to point to the last written position as the segment is being closed + * no need to be volatile as writes are protected by appendOrder barrier. + */ + private int endOfBuffer; + + // a signal that writers can wait on to be notified of a completed flush in BATCH and GROUP FlushMode + private final WaitQueue flushComplete = WaitQueue.newWaitQueue(); + + private final Ref> selfRef; + + final InMemoryIndex index; + + private ActiveSegment( + Descriptor descriptor, Params params, InMemoryIndex index, Metadata metadata, KeySupport keySupport) + { + super(descriptor, metadata, keySupport); + this.index = index; + try + { + channel = FileChannel.open(file.toPath(), StandardOpenOption.WRITE, StandardOpenOption.READ, StandardOpenOption.CREATE); + buffer = channel.map(FileChannel.MapMode.READ_WRITE, 0, params.segmentSize()); + endOfBuffer = buffer.capacity(); + selfRef = new Ref<>(this, new Tidier(descriptor, channel, buffer)); + } + catch (IOException e) + { + throw new JournalWriteError(descriptor, file, e); + } + } + + static ActiveSegment create(Descriptor descriptor, Params params, KeySupport keySupport) + { + InMemoryIndex index = InMemoryIndex.create(keySupport); + Metadata metadata = Metadata.create(); + return new ActiveSegment<>(descriptor, params, index, metadata, keySupport); + } + + @Override + public InMemoryIndex index() + { + return index; + } + + boolean isEmpty() + { + return allocateOffset == 0; + } + + @Override + boolean isActive() + { + return true; + } + + @Override + ActiveSegment asActive() + { + return this; + } + + @Override + StaticSegment asStatic() + { + throw new UnsupportedOperationException(); + } + + /** + * Read the entry and specified offset into the entry holder. + * Expects the caller to acquire the ref to the segment and the record to exist. + */ + @Override + boolean read(int offset, int size, EntrySerializer.EntryHolder into) + { + ByteBuffer duplicate = buffer.duplicate().position(offset).limit(offset + size); + try + { + EntrySerializer.read(into, keySupport, duplicate, descriptor.userVersion); + } + catch (IOException e) + { + throw new JournalReadError(descriptor, file, e); + } + return true; + } + + /** + * Stop writing to this file, flush and close it. Does nothing if the file is already closed. + */ + public synchronized void close(Journal journal) + { + close(journal, true); + } + + /** + * @return true if the closed segment was definitely empty, false otherwise + */ + private synchronized boolean close(Journal journal, boolean persistComponents) + { + boolean isEmpty = discardUnusedTail(); + if (!isEmpty) + { + updateWrittenTo(); + fsync(); + if (persistComponents) persistComponents(); + } + release(journal); + return isEmpty; + } + + /** + * Close and discard a pre-allocated, available segment, that's never been exposed + */ + void closeAndDiscard(Journal journal) + { + boolean isEmpty = close(journal, false); + if (!isEmpty) throw new IllegalStateException(); + discard(); + } + + void closeAndIfEmptyDiscard(Journal journal) + { + boolean isEmpty = close(journal, true); + if (isEmpty) discard(); + } + + void persistComponents() + { + index.persist(descriptor); + metadata.persist(descriptor); + SyncUtil.trySyncDir(descriptor.directory); + } + + private void discard() + { + selfRef.ensureReleased(); + + descriptor.fileFor(Component.DATA).deleteIfExists(); + descriptor.fileFor(Component.INDEX).deleteIfExists(); + descriptor.fileFor(Component.METADATA).deleteIfExists(); + } + + @Override + public Ref> tryRef() + { + return selfRef.tryRef(); + } + + @Override + public Ref> ref() + { + return selfRef.ref(); + } + + @Override + public Ref> selfRef() + { + return selfRef; + } + + private static final class Tidier extends Segment.Tidier implements Tidy + { + private final Descriptor descriptor; + private final FileChannel channel; + private final ByteBuffer buffer; + + Tidier(Descriptor descriptor, FileChannel channel, ByteBuffer buffer) + { + this.descriptor = descriptor; + this.channel = channel; + this.buffer = buffer; + } + + @Override + void onUnreferenced() + { + FileUtils.clean(buffer); + try + { + channel.close(); + } + catch (IOException e) + { + throw new JournalWriteError(descriptor, Component.DATA, e); + } + } + + @Override + public String name() + { + return descriptor.toString(); + } + } + + public boolean isFlushed(long position) + { + return writtenTo >= position; + } + + public int writtenToAtLeast() + { + return writtenTo; + } + + public int fsyncedTo() + { + return fsyncedTo; + } + + public int updateWrittenTo() + { + int allocatedTo = (int)allocateOffset; + if (writtenTo >= allocatedTo) + return writtenTo; + + waitForModifications(); + return writtenToUpdater.accumulateAndGet(this, allocatedTo, Math::max); + } + + // provides no ordering guarantees + void fsync() + { + int writtenTo = this.writtenTo; + if (fsyncedTo >= writtenTo) + return; + + fsyncInternal(); + fsyncedToUpdater.accumulateAndGet(this, writtenTo, Math::max); + flushComplete.signalAll(); + } + + private void waitForFlush(int position) + { + while (fsyncedTo < position) + { + WaitQueue.Signal signal = flushComplete.register(); + if (fsyncedTo < position) + signal.awaitThrowUncheckedOnInterrupt(); + else + signal.cancel(); + } + } + + /** + * Wait for any appends or discardUnusedTail() operations started before this method was called + */ + private void waitForModifications() + { + // issue a barrier and wait for it + appendOrder.awaitNewBarrier(); + } + + private void fsyncInternal() + { + try + { + SyncUtil.force((MappedByteBuffer) buffer); + } + catch (Exception e) // MappedByteBuffer.force() does not declare IOException but can actually throw it + { + throw new JournalWriteError(descriptor, file, e); + } + } + + /** + * Ensures no more of this segment is writeable, by allocating any unused section at the end + * and marking it discarded void discartUnusedTail() + * + * @return true if the segment was empty, false otherwise + */ + boolean discardUnusedTail() + { + try (OpOrder.Group ignored = appendOrder.start()) + { + while (true) + { + long prev = completeInProgress(); + int next = endOfBuffer + 1; + + if ((int)prev >= next) + { + // already stopped allocating, might also be closed + assert buffer == null || prev == buffer.capacity() + 1; + return false; + } + + if (allocateOffsetUpdater.compareAndSet(this, prev, next)) + { + // stopped allocating now; can only succeed once, no further allocation or discardUnusedTail can succeed + endOfBuffer = (int)prev; + assert buffer != null && next == buffer.capacity() + 1; + return prev == 0; + } + LockSupport.parkNanos(1); + } + } + } + + /* + * Entry/bytes allocation logic + */ + + @SuppressWarnings({ "resource", "RedundantSuppression" }) // op group will be closed by Allocation#write() + Allocation allocate(int entrySize) + { + int totalSize = totalEntrySize(entrySize); + OpOrder.Group opGroup = appendOrder.start(); + try + { + int position = allocateBytes(totalSize); + if (position < 0) + { + opGroup.close(); + return null; + } + return new Allocation(opGroup, buffer.duplicate().position(position).limit(position + totalSize), totalSize); + } + catch (Throwable t) + { + opGroup.close(); + throw t; + } + } + + private int totalEntrySize(int recordSize) + { + return EntrySerializer.headerSize(keySupport, descriptor.userVersion) + + recordSize + + TypeSizes.INT_SIZE // CRC + ; + } + + // allocate bytes in the segment, or return -1 if not enough space + private int allocateBytes(int size) + { + while (true) + { + long prev = maybeCompleteInProgress(); + if (prev < 0) + { + LockSupport.parkNanos(1); // ConstantBackoffCAS Algorithm from https://arxiv.org/pdf/1305.5800.pdf + continue; + } + + long next = prev + size; + if (next >= endOfBuffer) + return -1; + + // TODO (expected): if we write a "safe shutdown" marker we don't need this, + // but this provides safe restart in the event the process terminates abruptly but the host remains stable + long inProgress = prev | (next << 32); + if (!allocateOffsetUpdater.compareAndSet(this, prev, inProgress)) + { + LockSupport.parkNanos(1); // ConstantBackoffCAS Algorithm from https://arxiv.org/pdf/1305.5800.pdf + continue; + } + + assert buffer != null; + buffer.putInt((int)prev, (int)next); + allocateOffsetUpdater.compareAndSet(this, inProgress, next); + return (int) prev; + } + } + + final class Allocation + { + private final OpOrder.Group appendOp; + private final ByteBuffer buffer; + private final int start; + private final int length; + + Allocation(OpOrder.Group appendOp, ByteBuffer buffer, int length) + { + this.appendOp = appendOp; + this.buffer = buffer; + this.start = buffer.position(); + this.length = length; + } + + void write(K id, ByteBuffer record) + { + try + { + EntrySerializer.write(id, record, keySupport, buffer, descriptor.userVersion); + metadata.update(); + index.update(id, start, length); + } + catch (IOException e) + { + throw new JournalWriteError(descriptor, file, e); + } + finally + { + appendOp.close(); + } + } + + // Variant of write that does not allocate/return a record pointer + void writeInternal(K id, ByteBuffer record) + { + try + { + EntrySerializer.write(id, record, keySupport, buffer, descriptor.userVersion); + index.update(id, start, length); + metadata.update(); + } + catch (IOException e) + { + throw new JournalWriteError(descriptor, file, e); + } + finally + { + appendOp.close(); + } + } + + void awaitDurable(Timer waitingOnFlush) + { + try (Timer.Context ignored = waitingOnFlush.time()) + { + waitForFlush(start); + } + } + + boolean isFsynced() + { + return fsyncedTo >= start + length; + } + + Descriptor descriptor() + { + return descriptor; + } + + int start() + { + return start; + } + } + + private int maybeCompleteInProgress() + { + long cur = allocateOffset; + int inProgress = (int) (cur >>> 32); + if (inProgress == 0) return (int) cur; + // finish up the in-progress allocation + buffer.putInt((int)cur, inProgress); + if (!allocateOffsetUpdater.compareAndSet(this, cur, inProgress)) + return -1; + + return inProgress; + } + + private int completeInProgress() + { + int result = maybeCompleteInProgress(); + while (result < 0) + result = maybeCompleteInProgress(); + return result; + } +} diff --git a/src/java/org/apache/cassandra/journal/Compactor.java b/src/java/org/apache/cassandra/journal/Compactor.java new file mode 100644 index 000000000000..6525df5d05ea --- /dev/null +++ b/src/java/org/apache/cassandra/journal/Compactor.java @@ -0,0 +1,116 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.journal; + +import java.io.IOException; +import java.util.Collection; +import java.util.HashSet; +import java.util.Set; +import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; + +import org.apache.cassandra.concurrent.ScheduledExecutorPlus; +import org.apache.cassandra.concurrent.Shutdownable; + +import static org.apache.cassandra.concurrent.ExecutorFactory.Global.executorFactory; + +public final class Compactor implements Runnable, Shutdownable +{ + private final Journal journal; + private final SegmentCompactor segmentCompactor; + private final ScheduledExecutorPlus executor; + private Future scheduled; + + Compactor(Journal journal, SegmentCompactor segmentCompactor) + { + this.executor = executorFactory().scheduled(false, journal.name + "-compactor"); + this.journal = journal; + this.segmentCompactor = segmentCompactor; + } + + synchronized void start() + { + if (journal.params.enableCompaction()) + schedule(journal.params.compactionPeriod(TimeUnit.MILLISECONDS), TimeUnit.MILLISECONDS); + } + + private synchronized void schedule(long period, TimeUnit units) + { + scheduled = executor.scheduleWithFixedDelay(this, period, period, units); + } + + public synchronized void updateCompactionPeriod(int period, TimeUnit units) + { + if (!journal.params.enableCompaction()) + return; + + if (scheduled != null) + scheduled.cancel(false); + + schedule(period, units); + } + + @Override + public void run() + { + Set> toCompact = new HashSet<>(); + journal.segments().selectStatic(toCompact); + if (toCompact.size() < 2) + return; + + try + { + Collection> newSegments = segmentCompactor.compact(toCompact); + + for (StaticSegment segment : newSegments) + toCompact.remove(segment); + + journal.replaceCompactedSegments(toCompact, newSegments); + for (StaticSegment segment : toCompact) + segment.discard(journal); + } + catch (IOException e) + { + throw new RuntimeException("Could not compact segments: " + toCompact); + } + } + + @Override + public boolean isTerminated() + { + return executor.isTerminated(); + } + + @Override + public void shutdown() + { + executor.shutdown(); + } + + @Override + public Object shutdownNow() + { + return executor.shutdownNow(); + } + + @Override + public boolean awaitTermination(long timeout, TimeUnit units) throws InterruptedException + { + return executor.awaitTermination(timeout, units); + } +} diff --git a/src/java/org/apache/cassandra/journal/Component.java b/src/java/org/apache/cassandra/journal/Component.java new file mode 100644 index 000000000000..07da71536a6d --- /dev/null +++ b/src/java/org/apache/cassandra/journal/Component.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.journal; + +import java.util.List; + +import accord.utils.Invariants; +import org.apache.cassandra.io.util.File; + +import static accord.utils.SortedArrays.SortedArrayList.ofSorted; + +enum Component +{ + DATA ("data"), + INDEX ("indx"), + METADATA ("meta"); + //OFFSET_MAP (".offs"), + //INVLALIDATIONS (".invl"); + + public static final List VALUES = ofSorted(values()); + final String extension; + + Component(String extension) + { + this.extension = extension; + } + + /** + * @return if this component for the provided descrtiptor exists on disk + */ + boolean existsFor(Descriptor descriptor) + { + return descriptor.fileFor(this).exists(); + } + + void markCorrupted(Descriptor descriptor) + { + File file = descriptor.fileFor(this); + Invariants.require(file.exists()); + file.move(file.withSuffix(".corrupted")); + } +} diff --git a/src/java/org/apache/cassandra/journal/Descriptor.java b/src/java/org/apache/cassandra/journal/Descriptor.java new file mode 100644 index 000000000000..cea68c353e14 --- /dev/null +++ b/src/java/org/apache/cassandra/journal/Descriptor.java @@ -0,0 +1,205 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.journal; + +import java.io.IOException; +import java.util.Arrays; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.cassandra.io.util.File; + +import static java.lang.String.format; +import static java.util.stream.Collectors.toList; + +/** + * Timestamp and version encoded in the file name, e.g. + * log-1637159888484-2-1-1.data + * log-1637159888484-2-1-1.indx + * log-1637159888484-2-1-1.meta + * log-1637159888484-2-1-1.sync + */ +public final class Descriptor implements Comparable +{ + private static final String SEPARATOR = "-"; + private static final String PREFIX = "log" + SEPARATOR; + private static final String TMP_SUFFIX = "tmp"; + + private static final Pattern DATA_FILE_PATTERN = + Pattern.compile( PREFIX + "(\\d+)" // timestamp + + SEPARATOR + "(\\d+)" // generation + + SEPARATOR + "(\\d+)" // journal version + + SEPARATOR + "(\\d+)" // user version + + "\\." + Component.DATA.extension); + + private static final Pattern TMP_FILE_PATTERN = + Pattern.compile( PREFIX + "\\d+" // timestamp + + SEPARATOR + "\\d+" // generation + + SEPARATOR + "\\d+" // journal version + + SEPARATOR + "\\d+" // user version + + "\\." + "[a-z]+" // component extension + + "\\." + TMP_SUFFIX); + + + /* + * NOTE: If and when another journal version is introduced, have implementations + * expose the version used via yaml. This way operators can force previous journal + * version on upgrade, temporarily, to allow easier downgrades if something goes wrong. + */ + static final int JOURNAL_VERSION_1 = 1; + static final int CURRENT_JOURNAL_VERSION = JOURNAL_VERSION_1; + + final File directory; + public final long timestamp; + public final int generation; + + /** + * Serialization version for journal components; bumped as journal + * implementation evolves over time. + */ + public final int journalVersion; + + /** + * Serialization version for user content - specifically journal keys + * and journal values; bumped when user logic evolves. + */ + public final int userVersion; + + Descriptor(File directory, long timestamp, int generation, int journalVersion, int userVersion) + { + this.directory = directory; + this.timestamp = timestamp; + this.generation = generation; + this.journalVersion = journalVersion; + this.userVersion = userVersion; + } + + static Descriptor create(File directory, long timestamp, int userVersion) + { + return new Descriptor(directory, timestamp, 1, CURRENT_JOURNAL_VERSION, userVersion); + } + + static Descriptor fromName(File directory, String name) + { + Matcher matcher = DATA_FILE_PATTERN.matcher(name); + if (!matcher.matches()) + throw new IllegalArgumentException("Provided filename " + new File(directory, name) + " is not valid for a data segment file"); + + long timestamp = Long.parseLong(matcher.group(1)); + int generation = Integer.parseInt(matcher.group(2)); + int journalVersion = Integer.parseInt(matcher.group(3)); + int userVersion = Integer.parseInt(matcher.group(4)); + + return new Descriptor(directory, timestamp, generation, journalVersion, userVersion); + } + + static Descriptor fromFile(File file) + { + return fromName(file.parent(), file.name()); + } + + File fileFor(Component component) + { + return new File(directory, formatFileName(component)); + } + + File tmpFileFor(Component component) + { + return new File(directory, formatFileName(component) + '.' + TMP_SUFFIX); + } + + static boolean isTmpFile(File file) + { + return TMP_FILE_PATTERN.matcher(file.name()).matches(); + } + + private String formatFileName(Component component) + { + return format("%s%d%s%d%s%d%s%d.%s", + PREFIX, timestamp, + SEPARATOR, generation, + SEPARATOR, journalVersion, + SEPARATOR, userVersion, + component.extension); + } + + static List list(File directory) + { + try + { + return Arrays.stream(directory.listNames((file, name) -> DATA_FILE_PATTERN.matcher(name).matches())) + .map(name -> fromName(directory, name)) + .collect(toList()); + } + catch (IOException e) + { + throw new RuntimeException(e); + } + } + + @Override + public int compareTo(Descriptor other) + { + assert this.directory.equals(other.directory) + : format("Descriptors have mismatching directories: %s and %s", this.directory, other.directory); + + int cmp = Long.compare(this.timestamp, other.timestamp); + if (cmp == 0) cmp = Integer.compare(this.generation, other.generation); + if (cmp == 0) cmp = Integer.compare(this.journalVersion, other.journalVersion); + if (cmp == 0) cmp = Integer.compare(this.userVersion, other.userVersion); + return cmp; + } + + @Override + public boolean equals(Object other) + { + if (this == other) + return true; + return (other instanceof Descriptor) && equals((Descriptor) other); + } + + boolean equals(Descriptor other) + { + assert this.directory.equals(other.directory) + : format("Descriptors have mismatching directories: %s and %s", this.directory, other.directory); + + return this.timestamp == other.timestamp + && this.generation == other.generation + && this.journalVersion == other.journalVersion + && this.userVersion == other.userVersion; + } + + @Override + public int hashCode() + { + int result = directory.hashCode(); + result = 31 * result + Long.hashCode(timestamp); + result = 31 * result + generation; + result = 31 * result + journalVersion; + result = 31 * result + userVersion; + return result; + } + + @Override + public String toString() + { + return format("dir: %s, ts: %d, gen: %d, journal ver: %d, user ver: %d", + directory, timestamp, generation, journalVersion, userVersion); + } +} diff --git a/src/java/org/apache/cassandra/journal/EntrySerializer.java b/src/java/org/apache/cassandra/journal/EntrySerializer.java new file mode 100644 index 000000000000..bf6464bb68dc --- /dev/null +++ b/src/java/org/apache/cassandra/journal/EntrySerializer.java @@ -0,0 +1,200 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.journal; + +import java.io.EOFException; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.zip.CRC32; + +import accord.utils.Invariants; +import org.apache.cassandra.db.TypeSizes; +import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.Crc; + +import static org.apache.cassandra.journal.Journal.validateCRC; + +public final class EntrySerializer +{ + static void write(K key, + ByteBuffer record, + KeySupport keySupport, + ByteBuffer out, + int userVersion) + throws IOException + { + int start = out.position(); + int totalSize = out.getInt() - start; + Invariants.require(totalSize == TypeSizes.INT_SIZE + out.remaining()); + Invariants.require(totalSize == headerSize(keySupport, userVersion) + record.remaining() + TypeSizes.INT_SIZE); + + keySupport.serialize(key, out, userVersion); + + int headerCrcPosition = out.position(); + out.position(headerCrcPosition + TypeSizes.INT_SIZE); + + int recordSize = record.remaining(); + int recordEnd = out.position() + recordSize; + Invariants.require(out.limit() == recordEnd + TypeSizes.INT_SIZE); + ByteBufferUtil.copyBytes(record, record.position(), out, out.position(), recordSize); + + // update and write crcs + CRC32 crc = Crc.crc32(); + out.position(start); + out.limit(headerCrcPosition); + crc.update(out); + out.limit(recordEnd); + out.putInt((int) crc.getValue()); + crc.update(out); + out.limit(recordEnd + 4); + out.putInt((int) crc.getValue()); + } + + // we reuse record as the value we return + static void read(EntryHolder into, + KeySupport keySupport, + ByteBuffer from, + int userVersion) + throws IOException + { + into.clear(); + + int start = from.position(); + { + int totalSize = from.getInt(start) - start; + Invariants.require(totalSize == from.remaining()); + + CRC32 crc = Crc.crc32(); + int headerSize = EntrySerializer.headerSize(keySupport, userVersion); + int headerCrc = readAndUpdateHeaderCrc(crc, from, headerSize); + validateCRC(crc, headerCrc); + + int recordCrc = readAndUpdateRecordCrc(crc, from, start + totalSize); + validateCRC(crc, recordCrc); + } + + readValidated(into, from, start, keySupport, userVersion); + } + + // slices the provided buffer to assign to into.value + static int tryRead(EntryHolder into, + KeySupport keySupport, + ByteBuffer from, + int syncedOffset, + int userVersion) + throws IOException + { + CRC32 crc = Crc.crc32(); + into.clear(); + + int start = from.position(); + if (from.remaining() < TypeSizes.INT_SIZE) + return -1; + + int totalSize = from.getInt(start) - start; + if (totalSize == 0) + return -1; + + if (from.remaining() < totalSize) + return handleReadException(new EOFException(), from.limit(), syncedOffset); + + { + int headerSize = EntrySerializer.headerSize(keySupport, userVersion); + int headerCrc = readAndUpdateHeaderCrc(crc, from, headerSize); + try + { + validateCRC(crc, headerCrc); + } + catch (IOException e) + { + return handleReadException(e, from.position() + headerSize, syncedOffset); + } + + int recordCrc = readAndUpdateRecordCrc(crc, from, start + totalSize); + try + { + validateCRC(crc, recordCrc); + } + catch (IOException e) + { + return handleReadException(e, from.position(), syncedOffset); + } + } + + readValidated(into, from, start, keySupport, userVersion); + return totalSize; + } + + private static void readValidated(EntryHolder into, ByteBuffer from, int start, KeySupport keySupport, int userVersion) + { + from.position(start + TypeSizes.INT_SIZE); + into.key = keySupport.deserialize(from, userVersion); + from.position(from.position() + 4); + into.value = from; + into.userVersion = userVersion; + } + + private static int readAndUpdateHeaderCrc(CRC32 crc, ByteBuffer from, int headerSize) + { + int headerEnd = from.position() + headerSize - TypeSizes.INT_SIZE; + int headerCrc = from.getInt(headerEnd); + from.limit(headerEnd); + crc.update(from); + return headerCrc; + } + + private static int readAndUpdateRecordCrc(CRC32 crc, ByteBuffer from, int limit) + { + int recordEnd = limit - TypeSizes.INT_SIZE; + from.limit(limit); + int recordCrc = from.getInt(recordEnd); + from.position(from.position() + 4); + from.limit(recordEnd); + crc.update(from); + return recordCrc; + } + + private static int handleReadException(IOException e, int bufferPosition, int fsyncedLimit) throws IOException + { + if (bufferPosition <= fsyncedLimit) + throw e; + else + return -1; + } + + static int headerSize(KeySupport keySupport, int userVersion) + { + return TypeSizes.INT_SIZE // pointer to next entry + + keySupport.serializedSize(userVersion) // key/id + + TypeSizes.INT_SIZE; // CRC + } + + public static final class EntryHolder + { + public K key; + public ByteBuffer value; + + public int userVersion; + + public void clear() + { + key = null; + value = null; + } + } +} diff --git a/src/java/org/apache/cassandra/journal/Flusher.java b/src/java/org/apache/cassandra/journal/Flusher.java new file mode 100644 index 000000000000..02f85df0cbfc --- /dev/null +++ b/src/java/org/apache/cassandra/journal/Flusher.java @@ -0,0 +1,576 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.journal; + +import java.util.concurrent.atomic.AtomicLong; +import java.util.concurrent.locks.LockSupport; + +import javax.annotation.Nullable; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import accord.utils.Invariants; +import com.codahale.metrics.Timer; +import org.apache.cassandra.concurrent.Interruptible; +import org.apache.cassandra.concurrent.Interruptible.TerminateException; +import org.apache.cassandra.utils.MonotonicClock; +import org.apache.cassandra.utils.NoSpamLogger; +import org.apache.cassandra.utils.Simulate; +import org.apache.cassandra.utils.concurrent.Semaphore; +import org.apache.cassandra.utils.concurrent.WaitQueue; + +import static java.lang.String.format; +import static java.util.concurrent.TimeUnit.MINUTES; +import static java.util.concurrent.TimeUnit.NANOSECONDS; +import static org.apache.cassandra.concurrent.ExecutorFactory.Global.executorFactory; +import static org.apache.cassandra.concurrent.ExecutorFactory.SystemThreadTag.NON_DAEMON; +import static org.apache.cassandra.concurrent.InfiniteLoopExecutor.Interrupts.SYNCHRONIZED; +import static org.apache.cassandra.concurrent.InfiniteLoopExecutor.SimulatorSafe.SAFE; +import static org.apache.cassandra.concurrent.Interruptible.State.NORMAL; +import static org.apache.cassandra.concurrent.Interruptible.State.SHUTTING_DOWN; +import static org.apache.cassandra.journal.Params.FlushMode.PERIODIC; +import static org.apache.cassandra.utils.LocalizeString.toLowerCaseLocalized; +import static org.apache.cassandra.utils.MonotonicClock.Global.preciseTime; +import static org.apache.cassandra.utils.Simulate.With.GLOBAL_CLOCK; +import static org.apache.cassandra.utils.Simulate.With.LOCK_SUPPORT; +import static org.apache.cassandra.utils.Simulate.With.MONITORS; +import static org.apache.cassandra.utils.concurrent.Semaphore.newSemaphore; +import static org.apache.cassandra.utils.concurrent.WaitQueue.newWaitQueue; + +final class Flusher +{ + private static final Logger logger = LoggerFactory.getLogger(Flusher.class); + + private final Journal journal; + private final Params params; + + private volatile Interruptible flushExecutor; + private volatile Interruptible fsyncExecutor; + + // counts of total pending write and written entries + private final AtomicLong pending = new AtomicLong(0); + private final AtomicLong written = new AtomicLong(0); + + // the time of the last initiated flush + volatile long flushStartedAt; + // the time of the earliest flush that has completed an fsync; all Allocations written before this time are durable + volatile long fsyncFinishedFor = flushStartedAt; + volatile RecordPointer fsyncFinishedForPosition = new RecordPointer(0, 0); + + // a signal that writers can wait on to be notified of a completed flush in PERIODIC FlushMode + private final WaitQueue fsyncComplete = newWaitQueue(); // TODO (expected): this is only used for testing, can we remove this? + private final MonotonicClock clock = preciseTime; + + // a signal and flag that callers outside the flusher thread can use + // to signal they want the journal segments to be flushed to disk + private final Semaphore haveWork = newSemaphore(1); + private volatile boolean flushRequested; + + private final Mode mode; + private final Callbacks callbacks; + + Flusher(Journal journal, Callbacks callbacks) + { + this.journal = journal; + this.params = journal.params; + this.mode = mode(params); + this.callbacks = callbacks; + } + + void start() + { + String flushExecutorName = journal.name + "-disk-flusher-" + toLowerCaseLocalized(params.flushMode().toString()); + flushStartedAt = clock.now(); + flushExecutor = executorFactory().infiniteLoop(flushExecutorName, new FlushRunnable(), SAFE, NON_DAEMON, SYNCHRONIZED); + } + + void shutdown() throws InterruptedException + { + flushExecutor.shutdown(); + flushExecutor.awaitTermination(1, MINUTES); + if (fsyncExecutor != null) + { + fsyncExecutor.shutdownNow(); // `now` to interrupt potentially parked runnable + fsyncExecutor.awaitTermination(1, MINUTES); + } + } + + @Simulate(with={MONITORS,GLOBAL_CLOCK,LOCK_SUPPORT}) + // waits for writes to complete before triggering an fsync + private class FlushRunnable implements Interruptible.Task + { + @Simulate(with={MONITORS,GLOBAL_CLOCK,LOCK_SUPPORT}) + private class FSyncRunnable implements Interruptible.Task + { + // this is written only by the Flusher thread, and read only by the Fsync thread + ActiveSegment fsyncUpTo; + ActiveSegment fsyncing; + + private volatile Thread awaitingWork; + + // all Allocations written before this time will be written to at least the OS page cache; + volatile long fsyncWaitingSince = 0; + // the time of the earliest flush that has begun participating in an fsync + volatile long fsyncStartedFor = 0; + + @Override + public void run(Interruptible.State state) throws InterruptedException + { + try + { + doRun(state); + } + catch (Throwable t) + { + if (!journal.handleError("Failed to flush segments to disk", t)) + throw new TerminateException(); + } + } + + private boolean hasWork() + { + return hasWork(fsyncStartedFor); + } + + private boolean hasWork(long lastStartedAt) + { + return fsyncWaitingSince != lastStartedAt; + } + + private void awaitWork() throws InterruptedException + { + long lastStartedAt = fsyncStartedFor; + if (hasWork(lastStartedAt)) + return; + + awaitingWork = Thread.currentThread(); + while (true) + { + if (Thread.interrupted()) + { + awaitingWork = null; + throw new InterruptedException(); + } + + if (hasWork(lastStartedAt)) + break; + + LockSupport.park(); + } + + awaitingWork = null; + } + + void notify(Thread notify) + { + if (notify != null) + LockSupport.unpark(notify); + } + + public void doRun(Interruptible.State state) throws InterruptedException + { + if (state == NORMAL) awaitWork(); + else if (!hasWork()) return; + + if (fsyncing == null) + fsyncing = journal.oldestActiveSegment(); + + // invert order of access; we might see a future fsyncTo, but at worst this means redundantly invoking fsync before updating fsyncStartedFor + long startedAt = fsyncWaitingSince; + ActiveSegment fsyncTo = this.fsyncUpTo; + fsyncStartedFor = startedAt; + // synchronized to prevent thread interrupts while performing IO operations and also + // clear interrupted status to prevent ClosedByInterruptException in ActiveSegment::flush + int fsyncedTo; + synchronized (this) + { + boolean ignore = Thread.interrupted(); + while (fsyncing != fsyncTo) + { + fsyncing.fsync(); + journal.closeActiveSegmentAndOpenAsStatic(fsyncing); + fsyncing = journal.getActiveSegment(fsyncing.descriptor.timestamp + 1); + } + fsyncedTo = fsyncTo.writtenToAtLeast(); + fsyncTo.fsync(); + } + fsyncFinishedForPosition = new RecordPointer(fsyncTo.descriptor.timestamp, fsyncedTo, startedAt); + fsyncFinishedFor = startedAt; + fsyncComplete.signalAll(); + long finishedAt = clock.now(); + processDuration(startedAt, finishedAt); + } + + void afterFlush(long startedAt, ActiveSegment segment) + { + long requireFsyncTo = startedAt - periodicBlockNanos(); + + fsyncUpTo = segment; + fsyncWaitingSince = startedAt; + + notify(awaitingWork); + + if (requireFsyncTo > fsyncFinishedFor) + awaitFsyncAt(requireFsyncTo, journal.metrics.waitingOnFlush.time()); + callbacks.onFsync(); + } + } + + private final NoSpamLogger noSpamLogger; + private final @Nullable FSyncRunnable fSyncRunnable; + + private ActiveSegment current = null; + + private long firstLaggedAt = Long.MIN_VALUE; // first lag ever or since last logged warning + private int fsyncCount = 0; // flush count since firstLaggedAt + private int lagCount = 0; // lag count since firstLaggedAt + private long duration = 0; // time spent flushing since firstLaggedAt + private long lagDuration = 0; // cumulative lag since firstLaggedAt + + FlushRunnable() + { + this.noSpamLogger = NoSpamLogger.wrap(logger, 5, MINUTES); + this.fSyncRunnable = params.flushMode() == PERIODIC ? newFsyncRunnable() : null; + } + + @Override + public void run(Interruptible.State state) throws InterruptedException + { + try + { + doRun(state); + } + catch (Throwable t) + { + if (!journal.handleError("Failed to flush segments to disk", t)) + throw new TerminateException(); + else // sleep for full poll-interval after an error, so we don't spam the log file + haveWork.tryAcquire(1, flushPeriodNanos(), NANOSECONDS); + } + } + + public void doRun(Interruptible.State state) throws InterruptedException + { + long startedAt = clock.now(); + long flushPeriodNanos = flushPeriodNanos(); + boolean flushToDisk = flushStartedAt + flushPeriodNanos <= startedAt || state != NORMAL || flushRequested; + + // synchronized to prevent thread interrupts while performing IO operations and also + // clear interrupted status to prevent ClosedByInterruptException in ActiveSegment::flush + synchronized (this) + { + boolean ignore = Thread.interrupted(); + if (flushToDisk) + { + flushRequested = false; + flushStartedAt = startedAt; + doFlush(startedAt); + } + } + + if (state == SHUTTING_DOWN) + return; + + if (flushPeriodNanos <= 0) + { + Invariants.require(params.flushMode() != PERIODIC); + haveWork.acquire(1); + } + else + { + long wakeUpAt = startedAt + flushPeriodNanos; + haveWork.tryAcquireUntil(1, wakeUpAt); + } + } + + private void doFlush(long startedAt) + { + boolean synchronousFsync = fSyncRunnable == null; + + if (current == null) + current = journal.oldestActiveSegment(); + ActiveSegment newCurrent = journal.currentActiveSegment(); + + if (newCurrent == null) + return; + + try + { + while (current != newCurrent) + { + current.discardUnusedTail(); + current.updateWrittenTo(); + if (synchronousFsync) + { + current.fsync(); + journal.closeActiveSegmentAndOpenAsStatic(current); + } + current = journal.getActiveSegment(current.descriptor.timestamp + 1); + } + + int writtenTo = current.updateWrittenTo(); + if (synchronousFsync) + { + current.fsync(); + afterFSync(startedAt, current.descriptor.timestamp, writtenTo); + } + else + { + fSyncRunnable.afterFlush(startedAt, current); + } + } + catch (Throwable t) + { + callbacks.onFlushFailed(t); + throw t; + } + } + + private void processDuration(long startedFlushAt, long finishedFsyncAt) + { + fsyncCount++; + duration += (finishedFsyncAt - startedFlushAt); + + long flushPeriodNanos = flushPeriodNanos(); + long lag = finishedFsyncAt - (startedFlushAt + flushPeriodNanos); + if (flushPeriodNanos <= 0 || lag <= 0) + return; + + lagCount++; + lagDuration += lag; + + if (firstLaggedAt == Long.MIN_VALUE) + firstLaggedAt = finishedFsyncAt; + + boolean logged = + noSpamLogger.warn(finishedFsyncAt, + "Out of {} {} journal flushes over the past {}s with average duration of {}ms, " + + "{} have exceeded the configured flush period by an average of {}ms", + fsyncCount, + journal.name, + format("%.2f", (finishedFsyncAt - firstLaggedAt) * 1e-9d), + format("%.2f", duration * 1e-6d / fsyncCount), + lagCount, + format("%.2f", lagDuration * 1e-6d / lagCount)); + + if (logged) // reset metrics for next log statement + { + firstLaggedAt = Long.MIN_VALUE; + fsyncCount = lagCount = 0; + duration = lagDuration = 0; + } + } + + private void afterFSync(long startedAt, long segment, int position) + { + fsyncFinishedForPosition = new RecordPointer(segment, position, startedAt); + fsyncFinishedFor = startedAt; + callbacks.onFsync(); + fsyncComplete.signalAll(); + long finishedAt = clock.now(); + processDuration(startedAt, finishedAt); + } + + private FSyncRunnable newFsyncRunnable() + { + final FSyncRunnable fSyncRunnable = new FSyncRunnable(); + fsyncExecutor = executorFactory().infiniteLoop(journal.name + "-fsync", fSyncRunnable, SAFE, NON_DAEMON, SYNCHRONIZED); + return fSyncRunnable; + } + } + + private interface Mode + { + void flushAndAwaitDurable(ActiveSegment.Allocation alloc); + RecordPointer flushAsync(ActiveSegment.Allocation alloc); + boolean isDurable(RecordPointer recordPointer); + } + + private class BatchMode implements Mode + { + @Override + public void flushAndAwaitDurable(ActiveSegment.Allocation alloc) + { + pending.incrementAndGet(); + requestExtraFlush(); + alloc.awaitDurable(journal.metrics.waitingOnFlush); + pending.decrementAndGet(); + written.incrementAndGet(); + } + + @Override + public RecordPointer flushAsync(ActiveSegment.Allocation alloc) + { + requestExtraFlush(); + written.incrementAndGet(); + return new RecordPointer(alloc.descriptor().timestamp, alloc.start()); + } + + @Override + public boolean isDurable(RecordPointer pointer) + { + return pointer.compareTo(fsyncFinishedForPosition) <= 0; + } + } + + private class GroupMode implements Mode + { + @Override + public void flushAndAwaitDurable(ActiveSegment.Allocation alloc) + { + pending.incrementAndGet(); + alloc.awaitDurable(journal.metrics.waitingOnFlush); + pending.decrementAndGet(); + written.incrementAndGet(); + } + + @Override + public RecordPointer flushAsync(ActiveSegment.Allocation alloc) + { + written.incrementAndGet(); + return new RecordPointer(alloc.descriptor().timestamp, alloc.start()); + } + + @Override + public boolean isDurable(RecordPointer pointer) + { + return pointer.compareTo(fsyncFinishedForPosition) <= 0; + } + } + + private class PeriodicMode implements Mode + { + @Override + public void flushAndAwaitDurable(ActiveSegment.Allocation alloc) + { + RecordPointer pointer = flushAsync(alloc); + + long expectedFsyncTime = pointer.writtenAt - periodicBlockNanos(); + if (expectedFsyncTime > fsyncFinishedFor) + { + pending.incrementAndGet(); + awaitFsyncAt(expectedFsyncTime, journal.metrics.waitingOnFlush.time()); + pending.decrementAndGet(); + } + } + + @Override + public RecordPointer flushAsync(ActiveSegment.Allocation alloc) + { + written.incrementAndGet(); + return new RecordPointer(alloc.descriptor().timestamp, alloc.start(), clock.now()); + } + + @Override + public boolean isDurable(RecordPointer alloc) + { + long expectedFsyncTime = alloc.writtenAt - periodicBlockNanos(); + return expectedFsyncTime <= fsyncFinishedFor; + } + } + + Mode mode(Params params) + { + switch (params.flushMode()) + { + default: throw new AssertionError("Unexpected FlushMode: " + params.flushMode()); + case BATCH: return new BatchMode(); + case GROUP: return new GroupMode(); + case PERIODIC: return new PeriodicMode(); + } + } + + RecordPointer flush(ActiveSegment.Allocation alloc) + { + return mode.flushAsync(alloc); + } + + void flushAndAwaitDurable(ActiveSegment.Allocation alloc) + { + mode.flushAndAwaitDurable(alloc); + } + + boolean isDurable(RecordPointer pointer) + { + return mode.isDurable(pointer); + } + + /** + * Request an additional flush cycle without blocking + */ + void requestExtraFlush() + { + // note: cannot simply invoke executor.interrupt() as some filesystems don't like it (jimfs, at least) + flushRequested = true; + haveWork.release(1); + } + + private void awaitFsyncAt(long flushTime, Timer.Context context) + { + do + { + WaitQueue.Signal signal = fsyncComplete.register(context, Timer.Context::stop); + if (fsyncFinishedFor < flushTime) + { + signal.awaitThrowUncheckedOnInterrupt(); + + Journal.State state = journal.state.get(); + Invariants.require(state == Journal.State.NORMAL, + "Thread %s outlived journal, which is in %s state", Thread.currentThread(), state); + } + else + signal.cancel(); + } + while (fsyncFinishedFor < flushTime); + } + + private long flushPeriodNanos() + { + return params.flushPeriod(NANOSECONDS); + } + + private long periodicBlockNanos() + { + return params.periodicBlockPeriod(NANOSECONDS); + } + + long pendingEntries() + { + return pending.get(); + } + + long writtenEntries() + { + return written.get(); + } + + public interface Callbacks + { + /** + * Invoked after {@link Flusher} successfully flushes a segment or multiple segments to disk. + * Invocation of this callback implies that any segments older than {@code segment} have been + * completed and also flushed. + * callbacks for all entries earlier than (segment, position) have finished execution. + */ + void onFsync(); + + // TODO (required): tie this to specific allocations.. + void onFlushFailed(Throwable cause); + } +} diff --git a/src/java/org/apache/cassandra/journal/InMemoryIndex.java b/src/java/org/apache/cassandra/journal/InMemoryIndex.java new file mode 100644 index 000000000000..49fe4d136714 --- /dev/null +++ b/src/java/org/apache/cassandra/journal/InMemoryIndex.java @@ -0,0 +1,162 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.journal; + +import java.io.IOException; +import java.util.NavigableMap; +import java.util.TreeMap; +import java.util.concurrent.ConcurrentSkipListMap; +import java.util.concurrent.atomic.AtomicReference; + +import javax.annotation.Nullable; + +import org.apache.cassandra.io.util.File; +import org.apache.cassandra.io.util.FileOutputStreamPlus; +import org.apache.cassandra.journal.StaticSegment.SequentialReader; + +/** + * An index for a segment that's still being updated by journal writers concurrently. + */ +public final class InMemoryIndex extends Index +{ + private static final long[] EMPTY = new long[0]; + + private final NavigableMap index; + + // CSLM#lastKey() can be costly, so track lastId separately; + // TODO: this could easily be premature and misguided; + // benchmark to ensure it's not acitevly harmful + private final AtomicReference lastId; + + static InMemoryIndex create(KeySupport keySupport) + { + return new InMemoryIndex<>(keySupport, new ConcurrentSkipListMap<>(keySupport)); + } + + private InMemoryIndex(KeySupport keySupport, NavigableMap index) + { + super(keySupport); + this.index = index; + this.lastId = new AtomicReference<>(); + } + + public int size() + { + return index.size(); + } + + public void update(K id, int offset, int size) + { + long currentOffsetAndSize = composeOffsetAndSize(offset, size); + index.merge(id, new long[] { currentOffsetAndSize }, + (current, value) -> + { + long inserting = value[0]; + int idx = 0; + while (idx < current.length) + { + long cur = current[idx]; + if (cur <= inserting) + { + if (cur == inserting) + return current; // TODO (expected): throw exception? + break; + } + ++idx; + } + + long[] merged = new long[current.length + 1]; + System.arraycopy(current, 0, merged, 0, idx); + merged[idx] = inserting; + System.arraycopy(current, idx, merged, idx + 1, current.length - idx); + return merged; + }); + + lastId.accumulateAndGet(id, (current, update) -> (null == current || keySupport.compare(current, update) < 0) ? update : current); + } + + @Override + @Nullable + public K firstId() + { + return index.isEmpty() ? null : index.firstKey(); + } + + @Override + @Nullable + public K lastId() + { + return lastId.get(); + } + + @Override + public long[] lookUp(K id) + { + K lastId = lastId(); + if (lastId == null || keySupport.compare(id, lastId) <= 0) + return index.getOrDefault(id, EMPTY); + return EMPTY; + } + + @Override + public long lookUpLast(K id) + { + long[] offsets = lookUp(id); + return offsets.length == 0 ? -1 : offsets[0]; + } + + @Override + long[] lookUpAll(K id) + { + return lookUp(id); + } + + public void persist(Descriptor descriptor) + { + File tmpFile = descriptor.tmpFileFor(Component.INDEX); + try (FileOutputStreamPlus out = new FileOutputStreamPlus(tmpFile)) + { + OnDiskIndex.write(index, keySupport, out, descriptor.userVersion); + + out.flush(); + out.sync(); + } + catch (IOException e) + { + throw new JournalWriteError(descriptor, tmpFile, e); + } + tmpFile.move(descriptor.fileFor(Component.INDEX)); + } + + static InMemoryIndex rebuild(Descriptor descriptor, KeySupport keySupport, int fsyncedLimit) + { + InMemoryIndex index = new InMemoryIndex<>(keySupport, new TreeMap<>(keySupport)); + + try (SequentialReader reader = StaticSegment.sequentialReader(descriptor, keySupport, fsyncedLimit)) + { + while (reader.advance()) + index.update(reader.key(), reader.offset, reader.buffer.position() - reader.offset); + } + return index; + } + + @Override + public void close() + { + } +} diff --git a/src/java/org/apache/cassandra/journal/Index.java b/src/java/org/apache/cassandra/journal/Index.java new file mode 100644 index 000000000000..fba9b99f86b9 --- /dev/null +++ b/src/java/org/apache/cassandra/journal/Index.java @@ -0,0 +1,111 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.journal; + +import javax.annotation.Nullable; + +import org.apache.cassandra.utils.Closeable; + +/** + * Mapping of client supplied ids to in-segment offsets + */ +abstract class Index implements Closeable +{ + final KeySupport keySupport; + + Index(KeySupport keySupport) + { + this.keySupport = keySupport; + } + + /** + * Look up offsets by id. It's possible, due to retries, for a segment + * to contain the same record with the same id more than once, at + * different offsets. + * + * @return the found offsets into the segment, if any; can be empty + */ + abstract long[] lookUp(K id); + + /** + * Look up offsets by id. It's possible, due to retries, for a segment + * to contain the same record with the same id more than once, at + * different offsets. Return the first offset for provided record id, or -1 if none. + * + * @return the first offset into the segment, or -1 is none were found + */ + abstract long lookUpLast(K id); + + abstract long[] lookUpAll(K id); + + /** + * @return the first (smallest) id in the index + */ + @Nullable + abstract K firstId(); + + /** + * @return the last (largest) id in the index + */ + @Nullable + abstract K lastId(); + + /** + * @return whether the id falls within lower/upper bounds of the index + */ + boolean mayContainId(K id) + { + K firstId = firstId(); + K lastId = lastId(); + + return null != firstId && keySupport.compare(id, firstId) >= 0 && (null == lastId || keySupport.compare(id, lastId) <= 0); + } + + /** + * Helper methods + */ + + public static int readOffset(long record) + { + return (int) (0xffffffffL & (record >> 32)); + } + + public static long writeOffset(long record, int offset) + { + record &= 0x00000000ffffffffL; //unset all higher bits + record |= ((long) offset) << 32; + return record; + } + + public static int readSize(long record) + { + return (int) (0xffffffffL & record); + } + + public static long writeSize(long record, int size) + { + record &= 0xffffffff00000000L; // unset all lower bits + record |= size; + return record; + } + + public static long composeOffsetAndSize(int offset, int size) + { + return writeSize(writeOffset(0, offset), size); + } +} diff --git a/src/java/org/apache/cassandra/journal/Journal.java b/src/java/org/apache/cassandra/journal/Journal.java new file mode 100644 index 000000000000..e70e93b9c6e7 --- /dev/null +++ b/src/java/org/apache/cassandra/journal/Journal.java @@ -0,0 +1,1090 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.journal; + +import java.io.IOException; +import java.nio.channels.ClosedByInterruptException; +import java.nio.file.FileStore; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.Iterator; +import java.util.List; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicLong; +import java.util.concurrent.atomic.AtomicReference; +import java.util.concurrent.locks.LockSupport; +import java.util.function.BooleanSupplier; +import java.util.function.Function; +import java.util.function.LongConsumer; +import java.util.function.Predicate; +import java.util.zip.CRC32; + +import com.google.common.annotations.VisibleForTesting; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import accord.utils.Invariants; +import com.codahale.metrics.Timer.Context; +import org.apache.cassandra.concurrent.Interruptible; +import org.apache.cassandra.concurrent.Interruptible.TerminateException; +import org.apache.cassandra.concurrent.SequentialExecutorPlus; +import org.apache.cassandra.concurrent.Shutdownable; +import org.apache.cassandra.io.util.DataInputBuffer; +import org.apache.cassandra.io.util.DataOutputBuffer; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.io.util.File; +import org.apache.cassandra.io.util.PathUtils; +import org.apache.cassandra.journal.Segments.ReferencedSegments; +import org.apache.cassandra.service.StorageService; +import org.apache.cassandra.utils.CloseableIterator; +import org.apache.cassandra.utils.Crc; +import org.apache.cassandra.utils.JVMStabilityInspector; +import org.apache.cassandra.utils.LazyToString; +import org.apache.cassandra.utils.MergeIterator; +import org.apache.cassandra.utils.Simulate; +import org.apache.cassandra.utils.concurrent.OpOrder; +import org.apache.cassandra.utils.concurrent.WaitQueue; +import org.jctools.queues.MpscUnboundedArrayQueue; + +import static java.lang.String.format; +import static org.apache.cassandra.concurrent.ExecutorFactory.Global.executorFactory; +import static org.apache.cassandra.concurrent.ExecutorFactory.SystemThreadTag.NON_DAEMON; +import static org.apache.cassandra.concurrent.InfiniteLoopExecutor.Interrupts.SYNCHRONIZED; +import static org.apache.cassandra.concurrent.InfiniteLoopExecutor.SimulatorSafe.SAFE; +import static org.apache.cassandra.concurrent.Interruptible.State.NORMAL; +import static org.apache.cassandra.concurrent.Interruptible.State.SHUTTING_DOWN; +import static org.apache.cassandra.utils.Clock.Global.currentTimeMillis; +import static org.apache.cassandra.utils.Simulate.With.MONITORS; +import static org.apache.cassandra.utils.concurrent.WaitQueue.newWaitQueue; + +/** + * A generic append-only journal with some special features: + *

    + *
  • Records can be looked up by key + *
  • Invalidated records get purged during segment compaction + *

+ * + * Type parameters: + * @param the type of records stored in the journal + * @param the type of keys used to address the records; + must be fixed-size and byte-order comparable + */ +@Simulate(with=MONITORS) +public class Journal implements Shutdownable +{ + private static final Logger logger = LoggerFactory.getLogger(Journal.class); + + final String name; + final File directory; + final Params params; + + final KeySupport keySupport; + final ValueSerializer valueSerializer; + + final Metrics metrics; + + final Flusher flusher; + final Compactor compactor; + Interruptible allocator; + SequentialExecutorPlus closer, releaser; + + volatile long replayLimit; + final AtomicLong nextSegmentId = new AtomicLong(); + + private volatile ActiveSegment currentSegment = null; + + // segment that is ready to be used; allocator thread fills this and blocks until consumed + private volatile ActiveSegment availableSegment = null; + + private final AtomicReference> segments = new AtomicReference<>(); + + final AtomicReference state = new AtomicReference<>(State.UNINITIALIZED); + + // TODO (required): we do not need wait queues here, we can just wait on a signal on a segment while its byte buffer is being allocated + private final WaitQueue segmentPrepared = newWaitQueue(); + private final WaitQueue allocatorThreadWaitQueue = newWaitQueue(); + private final BooleanSupplier allocatorThreadWaitCondition = () -> (availableSegment == null); + + private final FlusherCallbacks flusherCallbacks; + + final OpOrder readOrder = new OpOrder(); + + private class FlusherCallbacks implements Flusher.Callbacks + { + private final MpscUnboundedArrayQueue waitingFor = new MpscUnboundedArrayQueue<>(256); + private List drained = new ArrayList<>(); + + @Override + public void onFsync() + { + waitingFor.drain(drained::add); + List remaining = new ArrayList<>(); + for (WaitingFor wait : drained) + { + if (flusher.isDurable(wait)) wait.run(); + else remaining.add(wait); + } + drained = remaining; + } + + @Override + public void onFlushFailed(Throwable cause) + { + // TODO (required): panic + } + + private void submit(RecordPointer pointer, Runnable runnable) + { + if (flusher.isDurable(pointer)) + runnable.run(); + else + waitingFor.add(new WaitingFor(pointer, runnable)); + } + } + + private static class WaitingFor extends RecordPointer implements Runnable + { + private final Runnable onFlush; + + public WaitingFor(RecordPointer pointer, Runnable onFlush) + { + super(pointer); + this.onFlush = onFlush; + } + + public void run() + { + onFlush.run(); + } + } + + public Journal(String name, + File directory, + Params params, + KeySupport keySupport, + ValueSerializer valueSerializer, + SegmentCompactor segmentCompactor) + { + this.name = name; + this.directory = directory; + this.params = params; + + this.keySupport = keySupport; + this.valueSerializer = valueSerializer; + + this.metrics = new Metrics<>(name); + this.flusherCallbacks = new FlusherCallbacks(); + this.flusher = new Flusher<>(this, flusherCallbacks); + this.compactor = new Compactor<>(this, segmentCompactor); + } + + public void onDurable(RecordPointer recordPointer, Runnable runnable) + { + flusherCallbacks.submit(recordPointer, runnable); + } + + public void start() + { + Invariants.require(state.compareAndSet(State.UNINITIALIZED, State.INITIALIZING), + "Unexpected journal state during initialization", state); + metrics.register(flusher); + + deleteTmpFiles(); + + List descriptors = Descriptor.list(directory); + // find the largest existing timestamp + descriptors.sort(null); + long maxTimestamp = descriptors.isEmpty() + ? Long.MIN_VALUE + : descriptors.get(descriptors.size() - 1).timestamp; + nextSegmentId.set(replayLimit = Math.max(currentTimeMillis(), maxTimestamp + 1)); + + segments.set(Segments.of(StaticSegment.open(descriptors, keySupport))); + closer = executorFactory().sequential(name + "-closer"); + releaser = executorFactory().sequential(name + "-releaser"); + allocator = executorFactory().infiniteLoop(name + "-allocator", new AllocateRunnable(), SAFE, NON_DAEMON, SYNCHRONIZED); + advanceSegment(null); + Invariants.require(state.compareAndSet(State.INITIALIZING, State.NORMAL), + "Unexpected journal state after initialization", state); + flusher.start(); + compactor.start(); + } + + @VisibleForTesting + public void runCompactorForTesting() + { + compactor.run(); + } + + public Compactor compactor() + { + return compactor; + } + + /** + * Cleans up unfinished component files from previous run (metadata and index) + */ + private void deleteTmpFiles() + { + for (File tmpFile : directory.listUnchecked(Descriptor::isTmpFile)) + tmpFile.delete(); + } + + @Override + public boolean isTerminated() + { + return state.get() == State.TERMINATED; + } + + public void shutdown() + { + try + { + Invariants.require(state.compareAndSet(State.NORMAL, State.SHUTDOWN), + "Unexpected journal state while trying to shut down", state); + allocator.shutdown(); + wakeAllocator(); // Wake allocator to force it into shutdown + // TODO (expected): why are we awaitingTermination here when we have a separate method for it? + allocator.awaitTermination(1, TimeUnit.MINUTES); + segmentPrepared.signalAll(); // Wake up all threads waiting on the new segment + compactor.shutdown(); + compactor.awaitTermination(1, TimeUnit.MINUTES); + flusher.shutdown(); + closeAllSegments(); + releaser.shutdown(); + closer.shutdown(); + closer.awaitTermination(1, TimeUnit.MINUTES); + releaser.awaitTermination(1, TimeUnit.MINUTES); + metrics.deregister(); + Invariants.require(state.compareAndSet(State.SHUTDOWN, State.TERMINATED), + "Unexpected journal state while trying to shut down", state); + } + catch (InterruptedException e) + { + logger.error("Could not shutdown journal", e); + } + } + + @Override + public Object shutdownNow() + { + shutdown(); + return null; + } + + @Override + public boolean awaitTermination(long timeout, TimeUnit units) throws InterruptedException + { + boolean r = true; + r &= allocator.awaitTermination(timeout, units); + r &= closer.awaitTermination(timeout, units); + r &= releaser.awaitTermination(timeout, units); + return r; + } + + /** + * Looks up a record by the provided id. + *

+ * Looking up an invalidated record may or may not return a record, depending on + * compaction progress. + *

+ * In case multiple copies of the record exist in the log (e.g. because of user retries), + * the first one found will be returned. + * + * @param id user-provided record id, expected to roughly correlate with time and go up + * @return deserialized record if found, null otherwise + */ + @SuppressWarnings("unused") + public V readLast(K id) + { + EntrySerializer.EntryHolder holder = new EntrySerializer.EntryHolder<>(); + + try (OpOrder.Group group = readOrder.start()) + { + for (Segment segment : segments.get().allSorted(true)) + { + if (segment.readLast(id, holder)) + { + try (DataInputBuffer in = new DataInputBuffer(holder.value, false)) + { + return valueSerializer.deserialize(holder.key, in, holder.userVersion); + } + catch (IOException e) + { + // can only throw if serializer is buggy + throw new RuntimeException(e); + } + } + } + } + return null; + } + + public void readAll(K id, RecordConsumer consumer) + { + EntrySerializer.EntryHolder holder = new EntrySerializer.EntryHolder<>(); + try (OpOrder.Group group = readOrder.start()) + { + for (Segment segment : segments.get().allSorted(false)) + { + segment.readAll(id, holder, consumer); + } + } + } + + @SuppressWarnings("unused") + public List readAll(K id) + { + List res = new ArrayList<>(2); + readAll(id, (segment, position, key, buffer, userVersion) -> { + try (DataInputBuffer in = new DataInputBuffer(buffer, false)) + { + res.add(valueSerializer.deserialize(key, in, userVersion)); + } + catch (IOException e) + { + // can only throw if serializer is buggy + throw new RuntimeException(e); + } + }); + return res; + } + + /** + * Looks up a record by the provided id, if the value satisfies the provided condition. + *

+ * Looking up an invalidated record may or may not return a record, depending on + * compaction progress. + *

+ * In case multiple copies of the record exist in the log (e.g. because of user retries), + * and more than one of them satisfy the provided condition, the first one found will be returned. + * + * @param id user-provided record id, expected to roughly correlate with time and go up + * @param condition predicate to test the record against + * @return deserialized record if found, null otherwise + */ + @SuppressWarnings("unused") + public V readFirstMatching(K id, Predicate condition) + { + EntrySerializer.EntryHolder holder = new EntrySerializer.EntryHolder<>(); + + try (OpOrder.Group group = readOrder.start()) + { + for (Segment segment : segments.get().all()) + { + long[] offsets = segment.index().lookUp(id); + for (long offsetAndSize : offsets) + { + int offset = Index.readOffset(offsetAndSize); + int size = Index.readSize(offsetAndSize); + holder.clear(); + if (segment.read(offset, size, holder)) + { + try (DataInputBuffer in = new DataInputBuffer(holder.value, false)) + { + V record = valueSerializer.deserialize(holder.key, in, segment.descriptor.userVersion); + if (condition.test(record)) + return record; + } + catch (IOException e) + { + // can only throw if serializer is buggy + throw new RuntimeException(e); + } + } + } + } + } + return null; + } + + /** + * Looks up a record by the provided id. + *

+ * Looking up an invalidated record may or may not return a record, depending on + * compaction progress. + *

+ * In case multiple copies of the record exist in the log (e.g. because of user retries), + * only the first found record will be consumed. + * + * @param id user-provided record id, expected to roughly correlate with time and go up + * @param consumer function to consume the raw record (bytes and invalidation set) if found + * @return true if the record was found, false otherwise + */ + @SuppressWarnings("unused") + public boolean readLast(K id, RecordConsumer consumer) + { + try (OpOrder.Group group = readOrder.start()) + { + for (Segment segment : segments.get().allSorted(false)) + { + if (!segment.index().mayContainId(id)) + continue; + + if (segment.readLast(id, consumer)) + return true; + } + } + return false; + } + + /** + * Synchronously write a record to the journal. + *

+ * Blocks until the record has been deemed durable according to the journal flush mode. + * + * @param id user-provided record id, expected to roughly correlate with time and go up + * @param record the record to store + */ + public void blockingWrite(K id, V record) + { + try (DataOutputBuffer dob = DataOutputBuffer.scratchBuffer.get()) + { + valueSerializer.serialize(id, record, dob, params.userVersion()); + ActiveSegment.Allocation alloc = allocate(dob.getLength()); + alloc.writeInternal(id, dob.unsafeGetBufferAndFlip()); + flusher.flushAndAwaitDurable(alloc); + } + catch (IOException e) + { + // exception during record serialization into the scratch buffer + throw new RuntimeException(e); + } + } + + /** + * Asynchronously write a record to the journal. Writes to the journal in the calling thread, + * but doesn't wait for flush. + *

+ * Executes the supplied callback on the executor provided once the record has been durably written to disk + * + * @param id user-provided record id, expected to roughly correlate with time and go up + * @param record the record to store + */ + public RecordPointer asyncWrite(K id, V record) + { + return asyncWrite(id, (out, userVersion) -> valueSerializer.serialize(id, record, out, userVersion)); + } + + public RecordPointer asyncWrite(K id, Writer writer) + { + try (DataOutputBuffer dob = DataOutputBuffer.scratchBuffer.get()) + { + writer.write(dob, params.userVersion()); + ActiveSegment.Allocation alloc = allocate(dob.getLength()); + alloc.write(id, dob.unsafeGetBufferAndFlip()); + return flusher.flush(alloc); + } + catch (IOException e) + { + // exception during record serialization into the scratch buffer + throw new RuntimeException(e); + } + } + + private ActiveSegment.Allocation allocate(int entrySize) + { + ActiveSegment segment = currentSegment; + + ActiveSegment.Allocation alloc; + while (null == (alloc = segment.allocate(entrySize))) + { + if (entrySize >= (params.segmentSize() * 3) / 4) + throw new IllegalStateException("entrySize " + entrySize + " too large for a segmentSize of " + params.segmentSize()); + // failed to allocate; move to a new segment with enough room + advanceSegment(segment); + segment = currentSegment; + } + return alloc; + } + + /* + * Segment allocation logic. + */ + + private void advanceSegment(ActiveSegment oldSegment) + { + while (true) + { + synchronized (this) + { + // do this in a critical section, so we can maintain the order of + // segment construction when moving to allocatingFrom/activeSegments + if (currentSegment != oldSegment) + return; + + // if a segment is ready, take it now, otherwise wait for the allocator thread to construct it + if (availableSegment != null) + { + // success - change allocatingFrom and activeSegments (which must be kept in order) before leaving the critical section + addNewActiveSegment(currentSegment = availableSegment); + availableSegment = null; + break; + } + } + + awaitAvailableSegment(oldSegment); + } + + // signal the allocator thread to prepare a new segment + wakeAllocator(); + + // request that the journal be flushed out-of-band, as we've finished a segment + flusher.requestExtraFlush(); + } + + private void awaitAvailableSegment(ActiveSegment currentActiveSegment) + { + do + { + WaitQueue.Signal prepared = segmentPrepared.register(metrics.waitingOnSegmentAllocation.time(), Context::stop); + if (availableSegment == null && currentSegment == currentActiveSegment) + { + prepared.awaitThrowUncheckedOnInterrupt(); + + // In case we woke up due to shutdown signal or interrupt, check mode + State state = this.state.get(); + if (state.ordinal() > State.NORMAL.ordinal()) + throw new IllegalStateException("Can not obtain allocated segment due to shutdown " + state); + } + else + prepared.cancel(); + } + while (availableSegment == null && currentSegment == currentActiveSegment); + } + + private void wakeAllocator() + { + allocatorThreadWaitQueue.signalAll(); + } + + private void discardAvailableSegment() + { + ActiveSegment next; + synchronized (this) + { + next = availableSegment; + availableSegment = null; + } + if (next != null) + next.closeAndDiscard(this); + } + + private class AllocateRunnable implements Interruptible.Task + { + @Override + public void run(Interruptible.State state) throws InterruptedException + { + if (state == NORMAL) + runNormal(); + else if (state == SHUTTING_DOWN) + shutDown(); + } + + private void runNormal() throws InterruptedException + { + boolean interrupted = false; + try + { + if (availableSegment != null) + throw new IllegalStateException("availableSegment is not null"); + + // synchronized to prevent thread interrupts while performing IO operations and also + // clear interrupted status to prevent ClosedByInterruptException in createSegment() + synchronized (this) + { + interrupted = Thread.interrupted(); + availableSegment = createSegment(); + + segmentPrepared.signalAll(); + Thread.yield(); + } + } + catch (JournalWriteError e) + { + if (!(e.getCause() instanceof ClosedByInterruptException)) + throw e; + } + catch (Throwable t) + { + if (!handleError("Failed allocating journal segments", t)) + { + discardAvailableSegment(); + throw new TerminateException(); + } + TimeUnit.SECONDS.sleep(1L); // sleep for a second to avoid log spam + } + + interrupted = interrupted || Thread.interrupted(); + if (!interrupted) + { + try + { + // If we offered a segment, wait for it to be taken before reentering the loop. + // There could be a new segment in next not offered, but only on failure to discard it while + // shutting down-- nothing more can or needs to be done in that case. + WaitQueue.waitOnCondition(allocatorThreadWaitCondition, allocatorThreadWaitQueue); + } + catch (InterruptedException e) + { + interrupted = true; + } + } + + if (interrupted) + { + discardAvailableSegment(); + throw new InterruptedException(); + } + } + + private void shutDown() throws InterruptedException + { + try + { + // if shutdown() started and finished during segment creation, we'll be left with a + // segment that no one will consume; discard it + discardAvailableSegment(); + } + catch (Throwable t) + { + handleError("Failed shutting down segment allocator", t); + throw new TerminateException(); + } + } + } + + private ActiveSegment createSegment() + { + Descriptor descriptor = Descriptor.create(directory, nextSegmentId.getAndIncrement(), params.userVersion()); + return ActiveSegment.create(descriptor, params, keySupport); + } + + private void closeAllSegments() + { + Segments segments = swapSegments(ignore -> Segments.none()); + + for (Segment segment : segments.all()) + { + if (segment.isActive()) + ((ActiveSegment) segment).closeAndIfEmptyDiscard(this); + else + segment.close(this); + } + } + + @SuppressWarnings("unused") + ReferencedSegments selectAndReference(Predicate> selector) + { + while (true) + { + ReferencedSegments referenced = segments().selectAndReference(selector); + if (null != referenced) + return referenced; + } + } + + Segments segments() + { + return segments.get(); + } + + private Segments swapSegments(Function, Segments> transformation) + { + Segments currentSegments, newSegments; + do + { + currentSegments = segments(); + newSegments = transformation.apply(currentSegments); + } + while (!segments.compareAndSet(currentSegments, newSegments)); + return currentSegments; + } + + private void addNewActiveSegment(ActiveSegment activeSegment) + { + swapSegments(current -> current.withNewActiveSegment(activeSegment)); + } + + private void removeEmptySegment(ActiveSegment activeSegment) + { + swapSegments(current -> current.withoutEmptySegment(activeSegment)); + } + + private void replaceCompletedSegment(ActiveSegment activeSegment, StaticSegment staticSegment) + { + swapSegments(current -> current.withCompletedSegment(activeSegment, staticSegment)); + } + + void replaceCompactedSegments(Collection> oldSegments, Collection> compactedSegments) + { + swapSegments(current -> current.withCompactedSegments(oldSegments, compactedSegments)); + } + + void selectSegmentToFlush(Collection> into) + { + segments().selectActive(currentSegment.descriptor.timestamp, into); + } + + ActiveSegment oldestActiveSegment() + { + ActiveSegment current = currentSegment; + if (current == null) + return null; + + ActiveSegment oldest = segments().oldestActive(); + if (oldest == null || oldest.descriptor.timestamp > current.descriptor.timestamp) + return current; + + return oldest; + } + + public ActiveSegment currentActiveSegment() + { + return currentSegment; + } + + ActiveSegment getActiveSegment(long timestamp) + { + // we can race with segment addition to the segments() collection, with a new segment appearing in currentSegment first + // since we are most likely to be requesting the currentSegment anyway, we resolve this case by checking currentSegment first + // and resort to the segments() collection only if we do not match + ActiveSegment currentSegment = this.currentSegment; + if (currentSegment == null) + throw new IllegalArgumentException("Requested an active segment with timestamp " + timestamp + " but there is no currently active segment"); + long currentSegmentTimestamp = currentSegment.descriptor.timestamp; + if (timestamp == currentSegmentTimestamp) + { + return currentSegment; + } + else if (timestamp > currentSegmentTimestamp) + { + throw new IllegalArgumentException("Requested a newer timestamp " + timestamp + " than the current active segment " + currentSegmentTimestamp); + } + else + { + Segment segment = segments().get(timestamp); + Invariants.require(segment != null, "Segment %d expected to be found, but neither current segment %d nor in active segments", timestamp, currentSegmentTimestamp); + if (segment == null) + throw new IllegalArgumentException("Request the active segment " + timestamp + " but this segment does not exist"); + if (!segment.isActive()) + throw new IllegalArgumentException(String.format("Request the active segment %d but this segment is not active: %s", timestamp, segment)); + return segment.asActive(); + } + } + + /** + * Take care of a finished active segment: + * 1. discard tail + * 2. flush to disk + * 3. persist index and metadata + * 4. open the segment as static + * 5. replace the finished active segment with the opened static one in Segments view + * 6. release the Ref so the active segment will be cleaned up by its Tidy instance + */ + private class CloseActiveSegmentRunnable implements Runnable + { + private final ActiveSegment activeSegment; + + CloseActiveSegmentRunnable(ActiveSegment activeSegment) + { + this.activeSegment = activeSegment; + } + + @Override + public void run() + { + activeSegment.discardUnusedTail(); + activeSegment.updateWrittenTo(); + activeSegment.fsync(); + activeSegment.persistComponents(); + replaceCompletedSegment(activeSegment, StaticSegment.open(activeSegment.descriptor, keySupport)); + activeSegment.release(Journal.this); + } + } + + void closeActiveSegmentAndOpenAsStatic(ActiveSegment activeSegment) + { + if (activeSegment.isEmpty()) + { + removeEmptySegment(activeSegment); + activeSegment.closeAndDiscard(this); + return; + } + + closer.execute(new CloseActiveSegmentRunnable(activeSegment)); + } + + @VisibleForTesting + public void closeCurrentSegmentForTestingIfNonEmpty() + { + ActiveSegment segment = currentSegment; + if (segment.isEmpty()) + return; + advanceSegment(segment); + while (!segments().isSwitched(segment)) + { + LockSupport.parkNanos(1000); + } + } + + /* + * Static helper methods used by journal components + */ + + static void validateCRC(CRC32 crc, int readCRC) throws Crc.InvalidCrc + { + if (readCRC != (int) crc.getValue()) + throw new Crc.InvalidCrc(readCRC, (int) crc.getValue()); + } + + /* + * Error handling + */ + + /** + * @return true if the invoking thread should continue, or false if it should terminate itself + */ + boolean handleError(String message, Throwable t) + { + Params.FailurePolicy policy = params.failurePolicy(); + JVMStabilityInspector.inspectJournalThrowable(t, name, policy); + + switch (policy) + { + default: + throw new AssertionError(policy); + case DIE: + case STOP: + StorageService.instance.stopTransports(); + //$FALL-THROUGH$ + case STOP_JOURNAL: + message = format("%s. Journal %s failure policy is %s; terminating thread.", message, name, policy); + logger.error(maybeAddDiskSpaceContext(message), t); + return false; + case IGNORE: + message = format("%s. Journal %s failure policy is %s; ignoring excepton.", message, name, policy); + logger.error(maybeAddDiskSpaceContext(message), t); + return true; + } + } + + /** + * Add additional information to the error message if the journal directory does not have enough free space. + * + * @param message the original error message + * @return the message with additional information if possible + */ + private String maybeAddDiskSpaceContext(String message) + { + long availableDiskSpace = PathUtils.tryGetSpace(directory.toPath(), FileStore::getTotalSpace); + int segmentSize = params.segmentSize(); + + if (availableDiskSpace >= segmentSize) + return message; + + return format("%s. %d bytes required for next journal segment but only %d bytes available. " + + "Check %s to see if not enough free space is the reason for this error.", + message, segmentSize, availableDiskSpace, directory); + } + + @VisibleForTesting + public void truncateForTesting() + { + ActiveSegment discarding = currentSegment; + if (!discarding.isEmpty()) // if there is no data in the segement then ignore it + { + closeCurrentSegmentForTestingIfNonEmpty(); + //TODO (desired): wait for the ActiveSegment to get released, else can see weird race conditions; + // this thread will see the static segmenet and will release it (which will delete the file), + // and the sync thread will then try to release and will fail as the file no longer exists... + while (discarding.selfRef().globalCount() > 0) {} + } + + Segments statics = swapSegments(s -> s.select(Segment::isActive)).select(Segment::isStatic); + for (Segment segment : statics.all()) + ((StaticSegment) segment).discard(this); + } + + public interface Writer + { + void write(DataOutputPlus out, int userVersion) throws IOException; + } + + /** + * Static segment iterator iterates all keys in _static_ segments in order. + */ + public StaticSegmentKeyIterator staticSegmentKeyIterator() + { + return new StaticSegmentKeyIterator(); + } + + /** + * List of key and a list of segment descriptors referencing this key + */ + public static class KeyRefs + { + long segments[]; + K key; + int size; + + public KeyRefs(K key) + { + this.key = key; + } + + private KeyRefs(int maxSize) + { + this.segments = new long[maxSize]; + } + + public void segments(LongConsumer consumer) + { + for (int i = 0; i < size; i++) + consumer.accept(segments[i]); + } + + public K key() + { + return key; + } + + private void add(K key, long segment) + { + this.key = key; + if (size == 0 || segments[size - 1] < segment) + segments[size++] = segment; + else + Invariants.require(segments[size - 1] == segment, + "Tried to add an out-of-order segment: %d, %s", segment, + LazyToString.lazy(() -> Arrays.toString(Arrays.copyOf(segments, size)))); + } + + private void reset() + { + key = null; + size = 0; + Arrays.fill(segments, 0); + } + } + + public class StaticSegmentKeyIterator implements CloseableIterator> + { + private final ReferencedSegments segments; + private final MergeIterator> iterator; + + public StaticSegmentKeyIterator() + { + this.segments = selectAndReference(Segment::isStatic); + List> iterators = new ArrayList<>(segments.count()); + + for (Segment segment : segments.allSorted(true)) + { + StaticSegment staticSegment = (StaticSegment) segment; + Iterator iter = staticSegment.index().reader(); + Head head = new Head(staticSegment.descriptor.timestamp); + iterators.add(new Iterator<>() + { + public boolean hasNext() + { + return iter.hasNext(); + } + + public Head next() + { + head.key = iter.next(); + return head; + } + }); + } + + this.iterator = MergeIterator.get(iterators, + (r1, r2) -> { + int keyCmp = keySupport.compare(r1.key, r2.key); + if (keyCmp != 0) + return keyCmp; + return Long.compare(r1.segment, r2.segment); + }, + new MergeIterator.Reducer>() + { + final KeyRefs ret = new KeyRefs<>(segments.count()); + + @Override + public void reduce(int idx, Head head) + { + ret.add(head.key, head.segment); + } + + @Override + protected KeyRefs getReduced() + { + return ret; + } + + @Override + protected void onKeyChange() + { + ret.reset(); + super.onKeyChange(); + } + }); + } + + @Override + public void close() + { + segments.close(); + } + + public KeyRefs peek() + { + if (iterator.hasNext()) + return iterator.peek(); + return null; + } + + @Override + public boolean hasNext() + { + return iterator.hasNext(); + } + + @Override + public KeyRefs next() + { + return iterator.next(); + } + + class Head + { + final long segment; + K key; + Head(long segment) { this.segment = segment; } + } + } + + enum State + { + UNINITIALIZED, + INITIALIZING, + NORMAL, + SHUTDOWN, + TERMINATED + } +} diff --git a/src/java/org/apache/cassandra/journal/JournalReadError.java b/src/java/org/apache/cassandra/journal/JournalReadError.java new file mode 100644 index 000000000000..87366c8d7c6b --- /dev/null +++ b/src/java/org/apache/cassandra/journal/JournalReadError.java @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.journal; + +import org.apache.cassandra.io.FSReadError; +import org.apache.cassandra.io.util.File; + +public class JournalReadError extends FSReadError +{ + public final Descriptor descriptor; + + JournalReadError(Descriptor descriptor, File file, Throwable throwable) + { + super(throwable, file); + this.descriptor = descriptor; + } + + JournalReadError(Descriptor descriptor, Component component, Throwable throwable) + { + super(throwable, descriptor.fileFor(component)); + this.descriptor = descriptor; + } +} diff --git a/src/java/org/apache/cassandra/journal/JournalWriteError.java b/src/java/org/apache/cassandra/journal/JournalWriteError.java new file mode 100644 index 000000000000..03193af5455a --- /dev/null +++ b/src/java/org/apache/cassandra/journal/JournalWriteError.java @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.journal; + +import org.apache.cassandra.io.FSWriteError; +import org.apache.cassandra.io.util.File; + +public class JournalWriteError extends FSWriteError +{ + public final Descriptor descriptor; + + JournalWriteError(Descriptor descriptor, File file, Throwable throwable) + { + super(throwable, file); + this.descriptor = descriptor; + } + + JournalWriteError(Descriptor descriptor, Component component, Throwable throwable) + { + super(throwable, descriptor.fileFor(component)); + this.descriptor = descriptor; + } +} diff --git a/src/java/org/apache/cassandra/journal/KeySupport.java b/src/java/org/apache/cassandra/journal/KeySupport.java new file mode 100644 index 000000000000..efc41aa6c88b --- /dev/null +++ b/src/java/org/apache/cassandra/journal/KeySupport.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.journal; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Comparator; +import java.util.zip.Checksum; + +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; + +/** + * Record keys must satisfy two properties: + *

+ * 1. Must have a fixed serialized size + * 2. Must be byte-order comparable + */ +public interface KeySupport extends Comparator +{ + int serializedSize(int userVersion); + + void serialize(K key, DataOutputPlus out, int userVersion) throws IOException; + void serialize(K key, ByteBuffer out, int userVersion) throws IOException; + + K deserialize(DataInputPlus in, int userVersion) throws IOException; + + K deserialize(ByteBuffer buffer, int position, int userVersion); + K deserialize(ByteBuffer buffer, int userVersion); + + void updateChecksum(Checksum crc, K key, int userVersion); + + int compareWithKeyAt(K key, ByteBuffer buffer, int position, int userVersion); +} diff --git a/src/java/org/apache/cassandra/journal/Metadata.java b/src/java/org/apache/cassandra/journal/Metadata.java new file mode 100644 index 000000000000..2742816316d6 --- /dev/null +++ b/src/java/org/apache/cassandra/journal/Metadata.java @@ -0,0 +1,146 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.journal; + +import java.io.EOFException; +import java.io.IOException; +import java.util.concurrent.atomic.AtomicIntegerFieldUpdater; +import java.util.zip.CRC32; + +import org.apache.cassandra.io.util.*; +import org.apache.cassandra.utils.Crc; + +import static org.apache.cassandra.journal.Journal.validateCRC; +import static org.apache.cassandra.utils.FBUtilities.updateChecksumInt; + +/** + * Tracks and serializes the following information: + * - total count of records in this segment file + * used for compaction prioritisation + */ +final class Metadata +{ + private int fsyncLimit; + + private volatile int recordsCount; + private static final AtomicIntegerFieldUpdater recordsCountUpdater = + AtomicIntegerFieldUpdater.newUpdater(Metadata.class, "recordsCount"); + + static Metadata create() + { + return new Metadata(0); + } + + private Metadata(int recordsCount) + { + this.recordsCount = recordsCount; + } + + void update() + { + incrementRecordsCount(); + } + + int fsyncLimit() + { + return fsyncLimit; + } + + private void incrementRecordsCount() + { + recordsCountUpdater.incrementAndGet(this); + } + + int totalCount() + { + return recordsCount; + } + + void write(DataOutputPlus out) throws IOException + { + CRC32 crc = Crc.crc32(); + out.writeInt(recordsCount); + updateChecksumInt(crc, recordsCount); + out.writeInt((int) crc.getValue()); + } + + static Metadata read(DataInputPlus in) throws IOException + { + CRC32 crc = Crc.crc32(); + int recordsCount = in.readInt(); + updateChecksumInt(crc, recordsCount); + validateCRC(crc, in.readInt()); + return new Metadata(recordsCount); + } + + void persist(Descriptor descriptor) + { + File tmpFile = descriptor.tmpFileFor(Component.METADATA); + try (FileOutputStreamPlus out = new FileOutputStreamPlus(tmpFile)) + { + write(out); + + out.flush(); + out.sync(); + } + catch (IOException e) + { + throw new JournalWriteError(descriptor, tmpFile, e); + } + tmpFile.move(descriptor.fileFor(Component.METADATA)); + } + + static Metadata load(Descriptor descriptor) + { + File file = descriptor.fileFor(Component.METADATA); + try (FileInputStreamPlus in = new FileInputStreamPlus(file)) + { + return read(in); + } + catch (IOException e) + { + throw new JournalReadError(descriptor, file, e); + } + } + + static Metadata rebuild(Descriptor descriptor, KeySupport keySupport) + { + int recordsCount = 0; + + try (StaticSegment.SequentialReader reader = StaticSegment.sequentialReader(descriptor, keySupport, Integer.MAX_VALUE)) + { + while (reader.advance()) + ++recordsCount; + } + catch (JournalReadError e) + { + // we expect EOF when rebuilding + if (!(e.getCause() instanceof EOFException)) + throw e; + } + + return new Metadata(recordsCount); + } + + static Metadata rebuildAndPersist(Descriptor descriptor, KeySupport keySupport) + { + Metadata metadata = rebuild(descriptor, keySupport); + metadata.persist(descriptor); + return metadata; + } +} diff --git a/src/java/org/apache/cassandra/journal/Metrics.java b/src/java/org/apache/cassandra/journal/Metrics.java new file mode 100644 index 000000000000..4bca57c77cae --- /dev/null +++ b/src/java/org/apache/cassandra/journal/Metrics.java @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.journal; + +import com.codahale.metrics.Gauge; +import com.codahale.metrics.Timer; +import org.apache.cassandra.metrics.CassandraMetricsRegistry; +import org.apache.cassandra.metrics.DefaultNameFactory; +import org.apache.cassandra.metrics.MetricNameFactory; + +public final class Metrics +{ + public static final String TYPE_NAME = "Journal"; + + private static final String WAITING_ON_FLUSH = "WaitingOnFlush"; + private static final String WAITING_ON_ALLOCATION = "WaitingOnSegmentAllocation"; + private static final String WRITTEN_ENTRIES = "WrittenEntries"; + private static final String PENDING_ENTRIES = "PendingEntries"; + + /** + * The time spent waiting on journal flush; for {@link org.apache.cassandra.journal.Params.FlushMode#PERIODIC} + * this is only occurs when the flush is lagging its flush interval. + */ + Timer waitingOnFlush; + + /** Time spent waiting for a segment to be allocated - under normal conditions this should be zero */ + Timer waitingOnSegmentAllocation; + + /** Number of pending (flush) entries */ + Gauge pendingEntries; + + /** Number of written (flushed) entries */ + Gauge writtenEntries; + + private final MetricNameFactory factory; + + Metrics(String name) + { + this.factory = new DefaultNameFactory(TYPE_NAME, name); + } + + void register(Flusher flusher) + { + waitingOnFlush = CassandraMetricsRegistry.Metrics.timer(createName(WAITING_ON_FLUSH)); + waitingOnSegmentAllocation = CassandraMetricsRegistry.Metrics.timer(createName(WAITING_ON_ALLOCATION)); + pendingEntries = CassandraMetricsRegistry.Metrics.register(createName(PENDING_ENTRIES), flusher::pendingEntries); + writtenEntries = CassandraMetricsRegistry.Metrics.register(createName(WRITTEN_ENTRIES), flusher::writtenEntries); + } + + void deregister() + { + CassandraMetricsRegistry.Metrics.remove(createName(WAITING_ON_FLUSH)); + CassandraMetricsRegistry.Metrics.remove(createName(WAITING_ON_ALLOCATION)); + CassandraMetricsRegistry.Metrics.remove(createName(PENDING_ENTRIES)); + CassandraMetricsRegistry.Metrics.remove(createName(WRITTEN_ENTRIES)); + } + + private CassandraMetricsRegistry.MetricName createName(String metricName) + { + return factory.createMetricName(metricName); + } +} diff --git a/src/java/org/apache/cassandra/journal/OnDiskIndex.java b/src/java/org/apache/cassandra/journal/OnDiskIndex.java new file mode 100644 index 000000000000..8c662d889a46 --- /dev/null +++ b/src/java/org/apache/cassandra/journal/OnDiskIndex.java @@ -0,0 +1,350 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.journal; + +import java.io.IOException; +import java.nio.MappedByteBuffer; +import java.nio.channels.FileChannel; +import java.nio.file.StandardOpenOption; +import java.util.Map; +import java.util.NavigableMap; +import java.util.zip.CRC32; +import javax.annotation.Nullable; + +import accord.utils.Invariants; +import org.apache.cassandra.io.util.DataInputBuffer; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.io.util.File; +import org.apache.cassandra.io.util.FileUtils; +import org.apache.cassandra.utils.AbstractIterator; +import org.apache.cassandra.utils.Crc; + +import static org.apache.cassandra.journal.Journal.validateCRC; +import static org.apache.cassandra.utils.FBUtilities.updateChecksumInt; +import static org.apache.cassandra.utils.FBUtilities.updateChecksumLong; + +/** + * An on-disk (memory-mapped) index for a completed flushed segment. + *

+ * TODO (expected): block-level CRC + */ +final class OnDiskIndex extends Index +{ + private static final long[] EMPTY = new long[0]; + + private static final int FILE_PREFIX_SIZE = 4 + 4; // count of entries, CRC + private static final int VALUE_SIZE = Long.BYTES; // int offset + int size + + private final int KEY_SIZE; + private final int ENTRY_SIZE; + + private final Descriptor descriptor; + + private final FileChannel channel; + private volatile MappedByteBuffer buffer; + private final int entryCount; + + private volatile K firstId, lastId; + + private OnDiskIndex( + Descriptor descriptor, KeySupport keySupport, FileChannel channel, MappedByteBuffer buffer, int entryCount) + { + super(keySupport); + + this.descriptor = descriptor; + this.channel = channel; + this.buffer = buffer; + this.entryCount = entryCount; + + KEY_SIZE = keySupport.serializedSize(descriptor.userVersion); + ENTRY_SIZE = KEY_SIZE + VALUE_SIZE; + } + + /** + * Open the index for reading, validate CRC + */ + @SuppressWarnings({ "resource", "RedundantSuppression" }) + static OnDiskIndex open(Descriptor descriptor, KeySupport keySupport) + { + File file = descriptor.fileFor(Component.INDEX); + FileChannel channel = null; + MappedByteBuffer buffer = null; + try + { + channel = FileChannel.open(file.toPath(), StandardOpenOption.READ); + buffer = channel.map(FileChannel.MapMode.READ_ONLY, 0, channel.size()); + + int entryCount = buffer.getInt(0); + OnDiskIndex index = new OnDiskIndex<>(descriptor, keySupport, channel, buffer, entryCount); + index.validate(); + index.init(); + return index; + } + catch (Throwable e) + { + FileUtils.clean(buffer); + FileUtils.closeQuietly(channel); + throw new JournalReadError(descriptor, file, e); + } + } + + private void init() + { + if (entryCount > 0) + { + firstId = keyAtIndex(0); + lastId = keyAtIndex(entryCount - 1); + } + } + + @Override + public void close() + { + try + { + FileUtils.clean(buffer); + buffer = null; + channel.close(); + } + catch (IOException e) + { + throw new JournalWriteError(descriptor, Component.INDEX, e); + } + } + + void validate() throws IOException + { + CRC32 crc = Crc.crc32(); + + try (DataInputBuffer in = new DataInputBuffer(buffer, true)) + { + int entryCount = in.readInt(); + updateChecksumInt(crc, entryCount); + validateCRC(crc, in.readInt()); + + Crc.updateCrc32(crc, buffer, FILE_PREFIX_SIZE, FILE_PREFIX_SIZE + entryCount * ENTRY_SIZE); + in.skipBytesFully(entryCount * ENTRY_SIZE); + validateCRC(crc, in.readInt()); + + if (in.available() != 0) + throw new IOException("Trailing data encountered in segment index " + descriptor.fileFor(Component.INDEX)); + } + } + + static void write( + NavigableMap entries, KeySupport keySupport, DataOutputPlus out, int userVersion) throws IOException + { + CRC32 crc = Crc.crc32(); + + int size = entries.values() + .stream() + .mapToInt(offsets -> offsets.length) + .sum(); + out.writeInt(size); + updateChecksumInt(crc, size); + out.writeInt((int) crc.getValue()); + + for (Map.Entry entry : entries.entrySet()) + { + long prev = -1; + for (long offsetAndSize : entry.getValue()) + { + K key = entry.getKey(); + keySupport.serialize(key, out, userVersion); + keySupport.updateChecksum(crc, key, userVersion); + + if (prev != -1) + { + long tmp = prev; + Invariants.require(readOffset(offsetAndSize) < readOffset(prev), + () -> String.format("Offsets should be strictly reverse monotonic, but found %d following %d", + readOffset(offsetAndSize), readOffset(tmp))); + } + out.writeLong(offsetAndSize); + updateChecksumLong(crc, offsetAndSize); + prev = offsetAndSize; + } + } + + out.writeInt((int) crc.getValue()); + } + + @Override + @Nullable + public K firstId() + { + return firstId; + } + + @Override + @Nullable + public K lastId() + { + return lastId; + } + + @Override + public long[] lookUp(K id) + { + return lookUpAll(id); + } + + @Override + public long lookUpLast(K id) + { + if (!mayContainId(id)) + return -1L; + + int keyIndex = binarySearch(id); + return keyIndex < 0 ? -1 : recordAtIndex(keyIndex); + } + + @Override + public long[] lookUpAll(K id) + { + if (!mayContainId(id)) + return EMPTY; + + int someIndex = binarySearch(id); + if (someIndex < 0) + return EMPTY; + + int firstKeyIndex = someIndex; + while (firstKeyIndex > 0 && id.equals(keyAtIndex(firstKeyIndex - 1))) + --firstKeyIndex; + + int lastKeyIndex = someIndex; + while (lastKeyIndex + 1 < entryCount && id.equals(keyAtIndex(lastKeyIndex + 1))) + ++lastKeyIndex; + + long[] all = new long[lastKeyIndex - firstKeyIndex + 1]; + int idx = firstKeyIndex; + for (int i = 0; i < all.length; i++) + all[i] = recordAtIndex(idx++); + return all; + } + + IndexReader reader() + { + return new IndexReader(); + } + + public class IndexReader extends AbstractIterator + { + int idx; + K key; + int offset; + int size; + + IndexReader() + { + idx = -1; + } + + protected K computeNext() + { + if (advance()) + return key; + else + return endOfData(); + } + + public int offset() + { + ensureAdvanced(); + return offset; + } + + public int recordSize() + { + ensureAdvanced(); + return size; + } + + public boolean advance() + { + if (idx >= entryCount - 1) + return false; + + idx++; + key = keyAtIndex(idx); + long record = recordAtIndex(idx); + offset = Index.readOffset(record); + size = Index.readSize(record); + return true; + } + + private void ensureAdvanced() + { + if (idx < 0) + throw new IllegalStateException("Must call advance() before accessing entry content"); + } + } + + private K keyAtIndex(int index) + { + return keySupport.deserialize(buffer, FILE_PREFIX_SIZE + index * ENTRY_SIZE, descriptor.userVersion); + } + + private long recordAtIndex(int index) + { + return buffer.getLong(FILE_PREFIX_SIZE + index * ENTRY_SIZE + KEY_SIZE); + } + + /* + * This has been lifted from {@see IndexSummary}'s implementation, + * which itself was lifted from Harmony's Collections implementation. + */ + private int binarySearch(K key) + { + int low = 0, mid = entryCount, high = mid - 1, result = -1; + while (low <= high) + { + mid = (low + high) >>> 1; + result = compareWithKeyAt(key, mid); + if (result > 0) + { + low = mid + 1; + } + else if (result == 0) + { + return mid; + } + else + { + high = mid - 1; + } + } + return -mid - (result < 0 ? 1 : 2); + } + + private int compareWithKeyAt(K key, int keyIndex) + { + int offset = FILE_PREFIX_SIZE + ENTRY_SIZE * keyIndex; + return keySupport.compareWithKeyAt(key, buffer, offset, descriptor.userVersion); + } + + static OnDiskIndex rebuildAndPersist(Descriptor descriptor, KeySupport keySupport, int fsyncedLimit) + { + try (InMemoryIndex index = InMemoryIndex.rebuild(descriptor, keySupport, fsyncedLimit)) + { + index.persist(descriptor); + } + return open(descriptor, keySupport); + } +} diff --git a/src/java/org/apache/cassandra/journal/Params.java b/src/java/org/apache/cassandra/journal/Params.java new file mode 100644 index 000000000000..452f024fbc64 --- /dev/null +++ b/src/java/org/apache/cassandra/journal/Params.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.journal; + +import java.util.concurrent.TimeUnit; + +public interface Params +{ + enum FlushMode { BATCH, GROUP, PERIODIC } + + enum FailurePolicy { STOP, STOP_JOURNAL, IGNORE, DIE } + + /** + * @return maximum segment size + */ + int segmentSize(); + + /** + * @return this journal's {@link FailurePolicy} + */ + FailurePolicy failurePolicy(); + + /** + * @return journal flush (sync) mode + */ + FlushMode flushMode(); + + boolean enableCompaction(); + + long compactionPeriod(TimeUnit units); + + /** + * @return milliseconds between journal flushes + */ + long flushPeriod(TimeUnit units); + + /** + * @return to block writes for while waiting for a slow fsync to complete + * when in {@link FlushMode#PERIODIC} mode + */ + long periodicBlockPeriod(TimeUnit units); + + /** + * @return user provided version to use for key and value serialization + */ + int userVersion(); +} diff --git a/src/java/org/apache/cassandra/journal/RecordConsumer.java b/src/java/org/apache/cassandra/journal/RecordConsumer.java new file mode 100644 index 000000000000..22d2bc4e9f8d --- /dev/null +++ b/src/java/org/apache/cassandra/journal/RecordConsumer.java @@ -0,0 +1,26 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.journal; + +import java.nio.ByteBuffer; + +@FunctionalInterface +public interface RecordConsumer +{ + void accept(long segment, int position, K key, ByteBuffer buffer, int userVersion); +} diff --git a/src/java/org/apache/cassandra/journal/RecordPointer.java b/src/java/org/apache/cassandra/journal/RecordPointer.java new file mode 100644 index 000000000000..1439e63c901f --- /dev/null +++ b/src/java/org/apache/cassandra/journal/RecordPointer.java @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.journal; + +import com.google.common.primitives.Ints; +import com.google.common.primitives.Longs; + +// TODO: make this available in the accord table as an ID +public class RecordPointer implements Comparable +{ + public final long segment; // unique segment id + public final int position; // record start position within the segment + public final long writtenAt; // only set for periodic mode + + public RecordPointer(long segment, int position) + { + this(segment, position, 0); + } + + public RecordPointer(long segment, int position, long writtenAt) + { + this.segment = segment; + this.position = position; + this.writtenAt = writtenAt; + } + + public RecordPointer(RecordPointer pointer) + { + this(pointer.segment, pointer.position, pointer.writtenAt); + } + + @Override + public boolean equals(Object other) + { + if (this == other) + return true; + if (!(other instanceof RecordPointer)) + return false; + RecordPointer that = (RecordPointer) other; + return this.segment == that.segment + && this.position == that.position; + } + + @Override + public int hashCode() + { + return Long.hashCode(segment) + position * 31; + } + + @Override + public String toString() + { + return "(" + segment + ", " + position + ')'; + } + + @Override + public int compareTo(RecordPointer that) + { + int cmp = Longs.compare(this.segment, that.segment); + return cmp != 0 ? cmp : Ints.compare(this.position, that.position); + } +} \ No newline at end of file diff --git a/src/java/org/apache/cassandra/journal/Segment.java b/src/java/org/apache/cassandra/journal/Segment.java new file mode 100644 index 000000000000..3854f0ee27b5 --- /dev/null +++ b/src/java/org/apache/cassandra/journal/Segment.java @@ -0,0 +1,148 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.journal; + +import java.nio.ByteBuffer; + +import accord.utils.Invariants; +import org.apache.cassandra.concurrent.ExecutorPlus; +import org.apache.cassandra.io.util.File; +import org.apache.cassandra.utils.concurrent.OpOrder; +import org.apache.cassandra.utils.concurrent.Ref; +import org.apache.cassandra.utils.concurrent.SelfRefCounted; + +public abstract class Segment implements SelfRefCounted>, Comparable> +{ + protected abstract static class Tidier implements Tidy, Runnable + { + OpOrder.Barrier await; + ExecutorPlus executor; + + abstract void onUnreferenced(); + + public final void run() + { + await.await(); + onUnreferenced(); + } + + public final void tidy() + { + executor.execute(this); + } + } + + final File file; + final Descriptor descriptor; + final Metadata metadata; + final KeySupport keySupport; + + ByteBuffer buffer; + + Segment(Descriptor descriptor, Metadata metadata, KeySupport keySupport) + { + this.file = descriptor.fileFor(Component.DATA); + this.descriptor = descriptor; + this.metadata = metadata; + this.keySupport = keySupport; + } + + abstract Index index(); + + abstract boolean isActive(); + abstract boolean isFlushed(long position); + boolean isStatic() { return !isActive(); } + + abstract ActiveSegment asActive(); + abstract StaticSegment asStatic(); + + public long id() + { + return descriptor.timestamp; + } + + /* + * Reading entries (by id, by offset, iterate) + */ + + boolean readLast(K id, RecordConsumer consumer) + { + long offsetAndSize = index().lookUpLast(id); + if (offsetAndSize == -1) + return false; + + EntrySerializer.EntryHolder into = new EntrySerializer.EntryHolder<>(); + int offset = Index.readOffset(offsetAndSize); + int size = Index.readSize(offsetAndSize); + if (read(offset, size, into)) + { + Invariants.require(id.equals(into.key), "Index for %s read incorrect key: expected %s but read %s", descriptor, id, into.key); + consumer.accept(descriptor.timestamp, offset, id, into.value, descriptor.userVersion); + return true; + } + return false; + } + + boolean readLast(K id, EntrySerializer.EntryHolder into) + { + long offsetAndSize = index().lookUpLast(id); + if (offsetAndSize == -1 || !read(Index.readOffset(offsetAndSize), Index.readSize(offsetAndSize), into)) + return false; + Invariants.require(id.equals(into.key), "Index for %s read incorrect key: expected %s but read %s", descriptor, id, into.key); + return true; + } + + void readAll(K id, EntrySerializer.EntryHolder into, RecordConsumer onEntry) + { + long[] all = index().lookUpAll(id); + int prevOffset = Integer.MAX_VALUE; + for (int i = 0; i < all.length; i++) + { + int offset = Index.readOffset(all[i]); + int size = Index.readSize(all[i]); + Invariants.require(offset < prevOffset); + Invariants.require(read(offset, size, into), "Read should always return true"); + Invariants.require(id.equals(into.key), "Index for %s read incorrect key: expected %s but read %s", descriptor, id, into.key); + onEntry.accept(descriptor.timestamp, offset, into.key, into.value, into.userVersion); + } + } + + @Override + public int compareTo(Segment that) + { + return this.descriptor.compareTo(that.descriptor); + } + + abstract boolean read(int offset, int size, EntrySerializer.EntryHolder into); + + abstract void close(Journal journal); + + void release(Journal journal) + { + Ref> selfRef = selfRef(); + Tidier tidier = (Tidier) selfRef.tidier(); + if (journal != null) + { + // permitted to be null ONLY for tests + tidier.await = journal.readOrder.newBarrier(); + tidier.await.issue(); + tidier.executor = journal.releaser; + } + selfRef.release(); + } +} diff --git a/src/java/org/apache/cassandra/journal/SegmentCompactor.java b/src/java/org/apache/cassandra/journal/SegmentCompactor.java new file mode 100644 index 000000000000..7b84ea82e12d --- /dev/null +++ b/src/java/org/apache/cassandra/journal/SegmentCompactor.java @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.journal; + +import java.io.IOException; +import java.util.Collection; + +public interface SegmentCompactor +{ + SegmentCompactor NOOP = (SegmentCompactor) (segments) -> segments; + + static SegmentCompactor noop() + { + //noinspection unchecked + return (SegmentCompactor) NOOP; + } + + Collection> compact(Collection> segments) throws IOException; +} diff --git a/src/java/org/apache/cassandra/journal/Segments.java b/src/java/org/apache/cassandra/journal/Segments.java new file mode 100644 index 000000000000..a2475557a604 --- /dev/null +++ b/src/java/org/apache/cassandra/journal/Segments.java @@ -0,0 +1,235 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.journal; + +import java.util.Collection; +import java.util.List; +import java.util.function.Predicate; + +import accord.utils.Invariants; +import accord.utils.SortedArrays.SortedArrayList; +import org.agrona.collections.Long2ObjectHashMap; +import org.apache.cassandra.utils.concurrent.Refs; + +/** + * Consistent, immutable view of active + static segments + *

+ * TODO (performance, expected): an interval/range structure for StaticSegment lookup based on min/max key bounds + */ +class Segments +{ + private final Long2ObjectHashMap> segments; + private SortedArrayList> sorted; + + Segments(Long2ObjectHashMap> segments) + { + this.segments = segments; + } + + static Segments of(Collection> segments) + { + Long2ObjectHashMap> newSegments = newMap(segments.size()); + for (Segment segment : segments) + newSegments.put(segment.descriptor.timestamp, segment); + return new Segments<>(newSegments); + } + + static Segments none() + { + return new Segments<>(emptyMap()); + } + + Segments withNewActiveSegment(ActiveSegment activeSegment) + { + Long2ObjectHashMap> newSegments = new Long2ObjectHashMap<>(segments); + Segment oldValue = newSegments.put(activeSegment.descriptor.timestamp, activeSegment); + Invariants.require(oldValue == null); + return new Segments<>(newSegments); + } + + Segments withoutEmptySegment(ActiveSegment activeSegment) + { + Long2ObjectHashMap> newSegments = new Long2ObjectHashMap<>(segments); + Segment oldValue = segments.remove(activeSegment.descriptor.timestamp); + Invariants.require(oldValue.asActive().isEmpty()); + return new Segments<>(newSegments); + } + + Segments withCompletedSegment(ActiveSegment activeSegment, StaticSegment staticSegment) + { + Invariants.requireArgument(activeSegment.descriptor.equals(staticSegment.descriptor)); + Long2ObjectHashMap> newSegments = new Long2ObjectHashMap<>(segments); + Segment oldValue = newSegments.put(staticSegment.descriptor.timestamp, staticSegment); + Invariants.require(oldValue == activeSegment, () -> String.format("old value %s != new %s", oldValue, activeSegment)); + return new Segments<>(newSegments); + } + + Segments withCompactedSegments(Collection> oldSegments, Collection> compactedSegments) + { + Long2ObjectHashMap> newSegments = new Long2ObjectHashMap<>(segments); + for (StaticSegment oldSegment : oldSegments) + { + Segment oldValue = newSegments.remove(oldSegment.descriptor.timestamp); + Invariants.require(oldValue == oldSegment); + } + + for (StaticSegment compactedSegment : compactedSegments) + { + Segment oldValue = newSegments.put(compactedSegment.descriptor.timestamp, compactedSegment); + Invariants.require(oldValue == null); + } + + return new Segments<>(newSegments); + } + + Iterable> all() + { + return this.segments.values(); + } + + /** + * Returns segments in timestamp order. Will allocate and sort the segment collection. + */ + List> allSorted(boolean asc) + { + if (sorted == null) + sorted = SortedArrayList.>copyUnsorted(segments.values(), Segment[]::new); + return asc ? sorted : sorted.reverse(); + } + + void selectActive(long maxTimestamp, Collection> into) + { + for (Segment segment : segments.values()) + if (segment.isActive() && segment.descriptor.timestamp <= maxTimestamp) + into.add(segment.asActive()); + } + + boolean isSwitched(ActiveSegment active) + { + for (Segment segment : segments.values()) + if (!segment.isActive() && active.descriptor.equals(segment.descriptor)) + return true; + + return false; + } + + ActiveSegment oldestActive() + { + List> sorted = allSorted(true); + for (int i = 0 ; i < sorted.size() ; ++i) + { + Segment segment = sorted.get(i); + if (segment.isActive()) + return segment.asActive(); + } + return null; + } + + Segment get(long timestamp) + { + return segments.get(timestamp); + } + + void selectStatic(Collection> into) + { + for (Segment segment : segments.values()) + if (segment.isStatic()) + into.add(segment.asStatic()); + } + + /** + * Select segments that could potentially have an entry with the specified ids and + * attempt to grab references to them all. + * + * @return a subset of segments with references to them, or {@code null} if failed to grab the refs + */ + ReferencedSegments selectAndReference(Predicate> test) + { + Long2ObjectHashMap> selectedSegments = select(test).segments; + Refs> refs = null; + if (!selectedSegments.isEmpty()) + { + refs = Refs.tryRef(selectedSegments.values()); + if (null == refs) + return null; + } + return new ReferencedSegments<>(selectedSegments, refs); + } + + /** + * Select segments that could potentially have an entry with the specified ids and + * attempt to grab references to them all. + * + * @return a subset of segments with references to them, or {@code null} if failed to grab the refs + */ + Segments select(Predicate> test) + { + Long2ObjectHashMap> selectedSegments = null; + for (Segment segment : segments.values()) + { + if (test.test(segment)) + { + if (null == selectedSegments) + selectedSegments = newMap(10); + selectedSegments.put(segment.descriptor.timestamp, segment); + } + } + + if (null == selectedSegments) + selectedSegments = emptyMap(); + + return new Segments<>(selectedSegments); + } + + static class ReferencedSegments extends Segments implements AutoCloseable + { + private final Refs> refs; + + ReferencedSegments(Long2ObjectHashMap> segments, Refs> refs) + { + super(segments); + this.refs = refs; + } + + public int count() + { + if (refs == null) return 0; + else return refs.size(); + } + + @Override + public void close() + { + if (null != refs) + refs.release(); + } + } + + private static final Long2ObjectHashMap EMPTY_MAP = new Long2ObjectHashMap<>(); + + @SuppressWarnings("unchecked") + private static Long2ObjectHashMap emptyMap() + { + return (Long2ObjectHashMap) EMPTY_MAP; + } + + private static Long2ObjectHashMap newMap(int expectedSize) + { + return new Long2ObjectHashMap<>(expectedSize, 0.65f, false); + } +} diff --git a/src/java/org/apache/cassandra/journal/StaticSegment.java b/src/java/org/apache/cassandra/journal/StaticSegment.java new file mode 100644 index 000000000000..92dd999c2d31 --- /dev/null +++ b/src/java/org/apache/cassandra/journal/StaticSegment.java @@ -0,0 +1,487 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.journal; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.MappedByteBuffer; +import java.nio.channels.FileChannel; +import java.nio.file.NoSuchFileException; +import java.nio.file.StandardOpenOption; +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.cassandra.io.util.File; +import org.apache.cassandra.io.util.FileUtils; +import org.apache.cassandra.utils.Closeable; +import org.apache.cassandra.utils.Throwables; +import org.apache.cassandra.utils.concurrent.Ref; + +/** + * An immutable data segment that is no longer written to. + *

+ * Can be compacted with input from {@code PersistedInvalidations} into a new smaller segment, + * with invalidated entries removed. + */ +public final class StaticSegment extends Segment +{ + public static final Logger logger = LoggerFactory.getLogger(StaticSegment.class); + final FileChannel channel; + final int fsyncLimit; + + private final Ref> selfRef; + + private final OnDiskIndex index; + + private StaticSegment(Descriptor descriptor, + FileChannel channel, + MappedByteBuffer buffer, + OnDiskIndex index, + Metadata metadata, + KeySupport keySupport) + { + super(descriptor, metadata, keySupport); + this.index = index; + + this.channel = channel; + this.fsyncLimit = metadata.fsyncLimit(); + this.buffer = buffer; + + selfRef = new Ref<>(this, new Tidier<>(descriptor, channel, buffer, index)); + } + + /** + * Loads all segments matching the supplied desctiptors + * + * @param descriptors descriptors of the segments to load + * @return list of the loaded segments + */ + static List> open(Collection descriptors, KeySupport keySupport) + { + List> segments = new ArrayList<>(descriptors.size()); + for (Descriptor descriptor : descriptors) + segments.add(open(descriptor, keySupport)); + return segments; + } + + /** + * Load the segment corresponding to the provided desrciptor + * + * @param descriptor descriptor of the segment to load + * @return the loaded segment + */ + @SuppressWarnings({ "resource", "RedundantSuppression" }) + static StaticSegment open(Descriptor descriptor, KeySupport keySupport) + { + if (!Component.DATA.existsFor(descriptor)) + throw new IllegalArgumentException("Data file for segment " + descriptor + " doesn't exist"); + + Metadata metadata = null; + if (Component.METADATA.existsFor(descriptor)) + { + try + { + metadata = Metadata.load(descriptor); + } + catch (Throwable t) + { + logger.error("Could not load metadata component for {}; rebuilding", descriptor, t); + Component.METADATA.markCorrupted(descriptor); + } + } + + if (metadata == null) + metadata = Metadata.rebuildAndPersist(descriptor, keySupport); + + OnDiskIndex index = null; + + if (Component.INDEX.existsFor(descriptor)) + { + try + { + index = OnDiskIndex.open(descriptor, keySupport); + } + catch (Throwable t) + { + logger.error("Could not load index component for {}; rebuilding", descriptor, t); + Component.INDEX.markCorrupted(descriptor); + } + } + + if (index == null) + index = OnDiskIndex.rebuildAndPersist(descriptor, keySupport, metadata.fsyncLimit()); + + try + { + return internalOpen(descriptor, index, metadata, keySupport); + } + catch (IOException e) + { + throw new JournalReadError(descriptor, Component.DATA, e); + } + } + + private static StaticSegment internalOpen( + Descriptor descriptor, OnDiskIndex index, Metadata metadata, KeySupport keySupport) + throws IOException + { + File file = descriptor.fileFor(Component.DATA); + FileChannel channel = FileChannel.open(file.toPath(), StandardOpenOption.READ); + MappedByteBuffer buffer = channel.map(FileChannel.MapMode.READ_ONLY, 0, channel.size()); + return new StaticSegment<>(descriptor, channel, buffer, index, metadata, keySupport); + } + + public void close(Journal journal) + { + release(journal); + } + + /** + * Waits until this segment is unreferenced, closes it, and deltes all files associated with it. + */ + void discard(Journal journal) + { + ((Tidier)selfRef.tidier()).discard = true; + close(journal); + } + + @Override + public Ref> tryRef() + { + return selfRef.tryRef(); + } + + @Override + public Ref> ref() + { + return selfRef.ref(); + } + + @Override + public String toString() + { + return "StaticSegment{" + descriptor + '}'; + } + + @Override + public Ref> selfRef() + { + return selfRef; + } + + private static final class Tidier extends Segment.Tidier implements Tidy + { + private final Descriptor descriptor; + private final FileChannel channel; + private final ByteBuffer buffer; + private final Index index; + boolean discard; + + Tidier(Descriptor descriptor, FileChannel channel, ByteBuffer buffer, Index index) + { + this.descriptor = descriptor; + this.channel = channel; + this.buffer = buffer; + this.index = index; + } + + @Override + void onUnreferenced() + { + FileUtils.clean(buffer); + FileUtils.closeQuietly(channel); + index.close(); + if (discard) + { + Throwable fail = null; + for (Component component : Component.VALUES) + { + try { descriptor.fileFor(component).deleteIfExists(); } + catch (Throwable t) { fail = Throwables.merge(fail, t); } + } + Throwables.maybeFail(fail); + } + } + + @Override + public String name() + { + return descriptor.toString(); + } + } + + @Override + OnDiskIndex index() + { + return index; + } + + @Override + boolean isActive() + { + return false; + } + + @Override + boolean isFlushed(long position) + { + return true; + } + + @Override + ActiveSegment asActive() + { + throw new UnsupportedOperationException(); + } + + @Override + StaticSegment asStatic() + { + return this; + } + + /** + * Read the entry and specified offset into the entry holder. + * Expects the record to have been written at this offset, but potentially not flushed and lost. + */ + @Override + boolean read(int offset, int size, EntrySerializer.EntryHolder into) + { + ByteBuffer duplicate = buffer.duplicate().position(offset).limit(offset + size); + try + { + return 0 <= EntrySerializer.tryRead(into, keySupport, duplicate, fsyncLimit, descriptor.userVersion); + } + catch (IOException e) + { + throw new JournalReadError(descriptor, file, e); + } + } + + /** + * Iterate over and invoke the supplied callback on every record. + */ + void forEachRecord(RecordConsumer consumer) + { + try (SequentialReader reader = sequentialReader(descriptor, keySupport, fsyncLimit)) + { + while (reader.advance()) + { + consumer.accept(descriptor.timestamp, reader.offset(), reader.key(), reader.record(), descriptor.userVersion); + } + } + } + + /* + * Sequential and in-key order reading (replay and components rebuild) + */ + + static abstract class Reader implements Closeable + { + enum State { RESET, ADVANCED, EOF } + + public final Descriptor descriptor; + protected final KeySupport keySupport; + + protected final File file; + protected final FileChannel channel; + protected final MappedByteBuffer buffer; + + protected final EntrySerializer.EntryHolder holder = new EntrySerializer.EntryHolder<>(); + protected int offset = -1; + protected State state = State.RESET; + + Reader(Descriptor descriptor, KeySupport keySupport) + { + this.descriptor = descriptor; + this.keySupport = keySupport; + + file = descriptor.fileFor(Component.DATA); + try + { + channel = file.newReadChannel(); + buffer = channel.map(FileChannel.MapMode.READ_ONLY, 0, channel.size()); + } + catch (NoSuchFileException e) + { + throw new IllegalArgumentException("Data file for segment " + descriptor + " doesn't exist"); + } + catch (IOException e) + { + throw new JournalReadError(descriptor, file, e); + } + } + + @Override + public void close() + { + FileUtils.closeQuietly(channel); + FileUtils.clean(buffer); + } + + public abstract boolean advance(); + + public int offset() + { + ensureHasAdvanced(); + return offset; + } + + public K key() + { + ensureHasAdvanced(); + return holder.key; + } + + public ByteBuffer record() + { + ensureHasAdvanced(); + return holder.value; + } + + protected void ensureHasAdvanced() + { + if (state != State.ADVANCED) + throw new IllegalStateException("Must call advance() before accessing entry content"); + } + + protected boolean eof() + { + state = State.EOF; + return false; + } + } + + static SequentialReader sequentialReader(Descriptor descriptor, KeySupport keySupport, int fsyncedLimit) + { + return new SequentialReader<>(descriptor, keySupport, fsyncedLimit); + } + + /** + * A sequential data segment reader to use for journal replay and rebuilding + * missing auxilirary components (index and metadata). + *

+ * Unexpected EOF and CRC mismatches in synced portions of segments are treated + * strictly, throwing {@link JournalReadError}. Errors encountered in unsynced portions + * of segments are treated as segment EOF. + */ + static final class SequentialReader extends Reader + { + private final int fsyncedLimit; // exclusive + + SequentialReader(Descriptor descriptor, KeySupport keySupport, int fsyncedLimit) + { + super(descriptor, keySupport); + this.fsyncedLimit = fsyncedLimit; + } + + @Override + public boolean advance() + { + if (state == State.EOF) + return false; + + reset(); + return buffer.hasRemaining() ? doAdvance() : eof(); + } + + private boolean doAdvance() + { + offset = buffer.position(); + try + { + int length = EntrySerializer.tryRead(holder, keySupport, buffer.duplicate(), fsyncedLimit, descriptor.userVersion); + if (length < 0) + return eof(); + buffer.position(offset + length); + } + catch (IOException e) + { + throw new JournalReadError(descriptor, file, e); + } + + state = State.ADVANCED; + return true; + } + + private void reset() + { + offset = -1; + holder.clear(); + state = State.RESET; + } + } + + public StaticSegment.KeyOrderReader keyOrderReader() + { + return new StaticSegment.KeyOrderReader<>(descriptor, keySupport, index.reader()); + } + + public static final class KeyOrderReader extends Reader implements Comparable> + { + private final OnDiskIndex.IndexReader indexReader; + + KeyOrderReader(Descriptor descriptor, KeySupport keySupport, OnDiskIndex.IndexReader indexReader) + { + super(descriptor, keySupport); + this.indexReader = indexReader; + } + + @Override + public boolean advance() + { + if (!indexReader.advance()) + return eof(); + + offset = indexReader.offset(); + + buffer.limit(offset + indexReader.recordSize()) + .position(offset); + try + { + EntrySerializer.read(holder, keySupport, buffer, descriptor.userVersion); + } + catch (IOException e) + { + throw new JournalReadError(descriptor, file, e); + } + + state = State.ADVANCED; + return true; + } + + @Override + public int compareTo(KeyOrderReader that) + { + this.ensureHasAdvanced(); + that.ensureHasAdvanced(); + + int cmp = keySupport.compare(this.key(), that.key()); + if (cmp != 0) + return cmp; + cmp = Long.compare(that.descriptor.timestamp, this.descriptor.timestamp); + if (cmp != 0) + return cmp; + return Integer.compare(that.offset, this.offset); + } + } +} \ No newline at end of file diff --git a/src/java/org/apache/cassandra/journal/ValueSerializer.java b/src/java/org/apache/cassandra/journal/ValueSerializer.java new file mode 100644 index 000000000000..69690d39b28a --- /dev/null +++ b/src/java/org/apache/cassandra/journal/ValueSerializer.java @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.journal; + +import java.io.IOException; + +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; + +public interface ValueSerializer +{ + void serialize(K key, V value, DataOutputPlus out, int userVersion) throws IOException; + + /** + * Deserialize the value given the key is known. Allows to avoid serializing + * redundant information in values, if it can be derived from keys. + */ + V deserialize(K key, DataInputPlus in, int userVersion) throws IOException; +} diff --git a/src/java/org/apache/cassandra/journal/package-info.java b/src/java/org/apache/cassandra/journal/package-info.java new file mode 100644 index 000000000000..5ae20b9274ad --- /dev/null +++ b/src/java/org/apache/cassandra/journal/package-info.java @@ -0,0 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * TODO + */ +package org.apache.cassandra.journal; diff --git a/src/java/org/apache/cassandra/locator/AbstractNetworkTopologySnitch.java b/src/java/org/apache/cassandra/locator/AbstractNetworkTopologySnitch.java index b6901e27660c..ff0842fdf59d 100644 --- a/src/java/org/apache/cassandra/locator/AbstractNetworkTopologySnitch.java +++ b/src/java/org/apache/cassandra/locator/AbstractNetworkTopologySnitch.java @@ -17,6 +17,10 @@ */ package org.apache.cassandra.locator; +import java.util.Comparator; + +import org.apache.cassandra.utils.Sortable; + /** * An endpoint snitch tells Cassandra information about network topology that it can use to route * requests more efficiently. @@ -31,4 +35,16 @@ public int compareEndpoints(InetAddressAndPort address, Replica r1, Replica r2) { return proximity.compareEndpoints(address, r1, r2); } + + @Override + public boolean supportCompareByEndpoint() + { + return proximity.supportCompareByEndpoint(); + } + + @Override + public > Comparator endpointComparator(InetAddressAndPort address, C addresses) + { + return proximity.endpointComparator(address, addresses); + } } diff --git a/src/java/org/apache/cassandra/locator/AbstractReplicationStrategy.java b/src/java/org/apache/cassandra/locator/AbstractReplicationStrategy.java index 0870902681c5..04cfac933c7c 100644 --- a/src/java/org/apache/cassandra/locator/AbstractReplicationStrategy.java +++ b/src/java/org/apache/cassandra/locator/AbstractReplicationStrategy.java @@ -20,9 +20,6 @@ import java.lang.reflect.Constructor; import java.lang.reflect.InvocationTargetException; import java.lang.reflect.Method; -import java.util.Collection; -import java.util.Collections; -import java.util.Map; import java.util.function.Supplier; import java.util.*; diff --git a/src/java/org/apache/cassandra/locator/CMSPlacementStrategy.java b/src/java/org/apache/cassandra/locator/CMSPlacementStrategy.java index 716b40a34ca8..e00289aef667 100644 --- a/src/java/org/apache/cassandra/locator/CMSPlacementStrategy.java +++ b/src/java/org/apache/cassandra/locator/CMSPlacementStrategy.java @@ -83,7 +83,7 @@ public Set reconfigure(ClusterMetadata metadata) { if (!filter.apply(metadata, peerId)) { - tmpDirectory = tmpDirectory.without(peerId); + tmpDirectory = tmpDirectory.without(metadata.nextEpoch(), peerId); tmpTokenMap = tmpTokenMap.unassignTokens(peerId); } } diff --git a/src/java/org/apache/cassandra/locator/DynamicEndpointSnitch.java b/src/java/org/apache/cassandra/locator/DynamicEndpointSnitch.java index 1d3810613f6d..003347ef29cd 100644 --- a/src/java/org/apache/cassandra/locator/DynamicEndpointSnitch.java +++ b/src/java/org/apache/cassandra/locator/DynamicEndpointSnitch.java @@ -42,6 +42,7 @@ import org.apache.cassandra.service.StorageService; import org.apache.cassandra.utils.FBUtilities; import org.apache.cassandra.utils.MBeanWrapper; +import org.apache.cassandra.utils.Sortable; import static org.apache.cassandra.config.CassandraRelevantProperties.IGNORE_DYNAMIC_SNITCH_SEVERITY; @@ -170,10 +171,13 @@ private > C sortedByProximityWithBadnes // TODO: avoid copy replicas = delegate.sortedByProximity(address, replicas); - HashMap scores = this.scores; // Make sure the score don't change in the middle of the loop below - // (which wouldn't really matter here but its cleaner that way). - ArrayList subsnitchOrderedScores = new ArrayList<>(replicas.size()); - for (Replica replica : replicas) + return shouldSortByScore(scores, replicas) ? sortedByProximityWithScore(address, replicas) : replicas; + } + + private > boolean shouldSortByScore(HashMap scores, C sortedReplicas) + { + ArrayList subsnitchOrderedScores = new ArrayList<>(sortedReplicas.size()); + for (Endpoint replica : sortedReplicas) { Double score = scores.get(replica.endpoint()); if (score == null) @@ -193,12 +197,10 @@ private > C sortedByProximityWithBadnes for (Double subsnitchScore : subsnitchOrderedScores) { if (subsnitchScore > (sortedScoreIterator.next() * badnessThreshold)) - { - return sortedByProximityWithScore(address, replicas); - } + return true; } - return replicas; + return false; } private static double defaultStore(InetAddressAndPort target) @@ -208,6 +210,11 @@ private static double defaultStore(InetAddressAndPort target) // Compare endpoints given an immutable snapshot of the scores private int compareEndpoints(InetAddressAndPort target, Replica a1, Replica a2, Map scores) + { + return compareEndpoints(a1, a2, scores, (a, b) -> delegate.compareEndpoints(target, a, b)); + } + + private int compareEndpoints(T a1, T a2, Map scores, Comparator subCompare) { Double scored1 = scores.get(a1.endpoint()); Double scored2 = scores.get(a2.endpoint()); @@ -223,7 +230,7 @@ private int compareEndpoints(InetAddressAndPort target, Replica a1, Replica a2, } if (scored1.equals(scored2)) - return delegate.compareEndpoints(target, a1, a2); + return subCompare.compare(a1, a2); if (scored1 < scored2) return -1; else @@ -409,4 +416,26 @@ private double maxScore(ReplicaCollection endpoints) } return maxScore; } + + @Override + public boolean supportCompareByEndpoint() + { + return delegate.supportCompareByEndpoint(); + } + + @Override + public > Comparator endpointComparator(InetAddressAndPort address, C addresses) + { + if (!delegate.supportCompareByEndpoint()) + throw new UnsupportedOperationException(); + assert address.equals(FBUtilities.getBroadcastAddressAndPort()); // we only know about ourself + Comparator compare = delegate.endpointComparator(address, addresses); + if (addresses.size() < 2) + return compare; + HashMap scores = this.scores; + Comparator compareWithScore = (r1, r2) -> compareEndpoints(r1, r2, scores, compare); + return dynamicBadnessThreshold == 0 || shouldSortByScore(scores, addresses.sorted(compare)) ? + compareWithScore : + compare; + } } diff --git a/src/java/org/apache/cassandra/locator/Endpoint.java b/src/java/org/apache/cassandra/locator/Endpoint.java new file mode 100644 index 000000000000..5a44bfd61c8c --- /dev/null +++ b/src/java/org/apache/cassandra/locator/Endpoint.java @@ -0,0 +1,24 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.locator; + +public interface Endpoint +{ + InetAddressAndPort endpoint(); +} diff --git a/src/java/org/apache/cassandra/locator/IEndpointSnitch.java b/src/java/org/apache/cassandra/locator/IEndpointSnitch.java index 4d5033681083..ef275ec12ff7 100644 --- a/src/java/org/apache/cassandra/locator/IEndpointSnitch.java +++ b/src/java/org/apache/cassandra/locator/IEndpointSnitch.java @@ -18,9 +18,11 @@ package org.apache.cassandra.locator; import java.net.InetSocketAddress; +import java.util.Comparator; import java.util.Set; import org.apache.cassandra.utils.FBUtilities; +import org.apache.cassandra.utils.Sortable; /** * This interface helps determine location of node in the datacenter relative to another node. @@ -101,5 +103,14 @@ default boolean preferLocalConnections() { return false; } -} + default boolean supportCompareByEndpoint() + { + return false; + } + + default > Comparator endpointComparator(InetAddressAndPort address, C addresses) + { + throw new UnsupportedOperationException(); + } +} diff --git a/src/java/org/apache/cassandra/locator/InetAddressAndPort.java b/src/java/org/apache/cassandra/locator/InetAddressAndPort.java index 50f3368b2001..e2520659b810 100644 --- a/src/java/org/apache/cassandra/locator/InetAddressAndPort.java +++ b/src/java/org/apache/cassandra/locator/InetAddressAndPort.java @@ -25,6 +25,9 @@ import java.net.InetSocketAddress; import java.net.UnknownHostException; import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.Set; import java.util.regex.Pattern; import java.util.List; import java.util.stream.Collectors; @@ -32,8 +35,12 @@ import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Preconditions; +import com.google.common.base.Splitter; import com.google.common.net.HostAndPort; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import org.apache.cassandra.io.IVersionedSerializer; import org.apache.cassandra.io.util.DataInputPlus; import org.apache.cassandra.io.util.DataOutputPlus; @@ -59,6 +66,7 @@ public final class InetAddressAndPort extends InetSocketAddress implements Comparable, Serializable { private static final long serialVersionUID = 0; + private static final Logger logger = LoggerFactory.getLogger(InetAddressAndPort.class); //Store these here to avoid requiring DatabaseDescriptor to be loaded. DatabaseDescriptor will set //these when it loads the config. A lot of unit tests won't end up loading DatabaseDescriptor. @@ -323,6 +331,52 @@ public static void initializeDefaultPort(int port) defaultPort = port; } + public static List stringify(Iterable endpoints) + { + return stringify(endpoints, true); + } + + public static List stringify(Iterable endpoints, boolean withPort) + { + List stringEndpoints = new ArrayList<>(); + for (InetAddressAndPort ep : endpoints) + { + stringEndpoints.add(ep.getHostAddress(withPort)); + } + return stringEndpoints; + } + + /** + * Parses a comma-separated list of hosts to a set of {@link InetAddressAndPort} + * + * @param value the comma-separated list of hosts to parse + * @param failOnError whether to fail when encountering an invalid hostname + * @return the set of parsed {@link InetAddressAndPort} + */ + public static Set parseHosts(String value, boolean failOnError) + { + Set hosts = new HashSet<>(); + for (String host : Splitter.on(',').split(value)) + { + try + { + hosts.add(InetAddressAndPort.getByName(host)); + } + catch (UnknownHostException e) + { + if (failOnError) + { + throw new IllegalArgumentException("Failed to parse host: " + host, e); + } + else + { + logger.warn("Invalid ip address {} from input={}", host, value); + } + } + } + return hosts; + } + static int getDefaultPort() { return defaultPort; diff --git a/src/java/org/apache/cassandra/locator/NetworkTopologyProximity.java b/src/java/org/apache/cassandra/locator/NetworkTopologyProximity.java index eddcb3630364..f7f30dafb7e4 100644 --- a/src/java/org/apache/cassandra/locator/NetworkTopologyProximity.java +++ b/src/java/org/apache/cassandra/locator/NetworkTopologyProximity.java @@ -20,10 +20,18 @@ import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.tcm.membership.Location; +import org.apache.cassandra.utils.Sortable; + +import java.util.Comparator; public class NetworkTopologyProximity extends BaseProximity { public int compareEndpoints(InetAddressAndPort address, Replica r1, Replica r2) + { + return compareByEndpoints(address, r1, r2); + } + + public int compareByEndpoints(InetAddressAndPort address, Endpoint r1, Endpoint r2) { InetAddressAndPort a1 = r1.endpoint(); InetAddressAndPort a2 = r2.endpoint(); @@ -48,4 +56,18 @@ public int compareEndpoints(InetAddressAndPort address, Replica r1, Replica r2) return 1; return 0; } + + @Override + public boolean supportCompareByEndpoint() + { + return true; + } + + @Override + public > Comparator endpointComparator(InetAddressAndPort address, C addresses) + { + if (!supportCompareByEndpoint()) + throw new UnsupportedOperationException(); + return (a, b) -> compareByEndpoints(address, a, b); + } } diff --git a/src/java/org/apache/cassandra/locator/NoOpProximity.java b/src/java/org/apache/cassandra/locator/NoOpProximity.java index 342f12e61958..69f5250d705d 100644 --- a/src/java/org/apache/cassandra/locator/NoOpProximity.java +++ b/src/java/org/apache/cassandra/locator/NoOpProximity.java @@ -18,6 +18,10 @@ package org.apache.cassandra.locator; +import org.apache.cassandra.utils.Sortable; + +import java.util.Comparator; + public class NoOpProximity extends BaseProximity { @Override @@ -34,4 +38,23 @@ public int compareEndpoints(InetAddressAndPort target, Replica r1, Replica r2) // Collections.sort is guaranteed to be stable) return 0; } + + @Override + public boolean supportCompareByEndpoint() + { + return true; + } + + @Override + public > Comparator endpointComparator(InetAddressAndPort address, C addresses) + { + return this::compareByEndpoint; + } + + private int compareByEndpoint(Endpoint a, Endpoint b) + { + // Making all endpoints equal ensures we won't change the original ordering (since + // Collections.sort is guaranteed to be stable) + return 0; + } } diff --git a/src/java/org/apache/cassandra/locator/NodeProximity.java b/src/java/org/apache/cassandra/locator/NodeProximity.java index cbb7158aafa8..af7c17eaacdf 100644 --- a/src/java/org/apache/cassandra/locator/NodeProximity.java +++ b/src/java/org/apache/cassandra/locator/NodeProximity.java @@ -18,6 +18,10 @@ package org.apache.cassandra.locator; +import org.apache.cassandra.utils.Sortable; + +import java.util.Comparator; + public interface NodeProximity { /** @@ -35,4 +39,14 @@ public interface NodeProximity * to be faster than 2 sequential queries, one against l1 followed by one against l2. */ public boolean isWorthMergingForRangeQuery(ReplicaCollection merged, ReplicaCollection l1, ReplicaCollection l2); + + default boolean supportCompareByEndpoint() + { + return false; + } + + default > Comparator endpointComparator(InetAddressAndPort address, C addresses) + { + throw new UnsupportedOperationException(); + } } diff --git a/src/java/org/apache/cassandra/locator/Replica.java b/src/java/org/apache/cassandra/locator/Replica.java index b1f68b2101e3..41a16a459be3 100644 --- a/src/java/org/apache/cassandra/locator/Replica.java +++ b/src/java/org/apache/cassandra/locator/Replica.java @@ -51,7 +51,7 @@ * and such and what the result is WRT to transientness. Definitely avoid creating fake Replicas with misinformation * about endpoints, ranges, or transientness. */ -public final class Replica implements Comparable +public final class Replica implements Comparable, Endpoint { public static final IPartitionerDependentSerializer serializer = new Serializer(); @@ -105,6 +105,7 @@ public String toString() return (full ? "Full" : "Transient") + '(' + endpoint() + ',' + range + ')'; } + @Override public final InetAddressAndPort endpoint() { return endpoint; diff --git a/src/java/org/apache/cassandra/locator/ReplicaCollection.java b/src/java/org/apache/cassandra/locator/ReplicaCollection.java index b679b506b01e..f1dac0042b78 100644 --- a/src/java/org/apache/cassandra/locator/ReplicaCollection.java +++ b/src/java/org/apache/cassandra/locator/ReplicaCollection.java @@ -24,11 +24,13 @@ import java.util.function.Predicate; import java.util.stream.Stream; +import org.apache.cassandra.utils.Sortable; + /** * A collection like class for Replica objects. Represents both a well defined order on the contained Replica objects, * and efficient methods for accessing the contained Replicas, directly and as a projection onto their endpoints and ranges. */ -public interface ReplicaCollection> extends Iterable +public interface ReplicaCollection> extends Sortable { /** * @return a Set of the endpoints of the contained Replicas. diff --git a/src/java/org/apache/cassandra/locator/ReplicaLayout.java b/src/java/org/apache/cassandra/locator/ReplicaLayout.java index f0069f2555cc..f961b4051d08 100644 --- a/src/java/org/apache/cassandra/locator/ReplicaLayout.java +++ b/src/java/org/apache/cassandra/locator/ReplicaLayout.java @@ -24,9 +24,12 @@ import org.apache.cassandra.db.Keyspace; import org.apache.cassandra.db.PartitionPosition; import org.apache.cassandra.dht.AbstractBounds; +import org.apache.cassandra.dht.LocalPartitioner; +import org.apache.cassandra.dht.Range; import org.apache.cassandra.dht.Token; -import org.apache.cassandra.gms.FailureDetector; import org.apache.cassandra.schema.KeyspaceMetadata; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.service.reads.ReadCoordinator; import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.utils.FBUtilities; @@ -354,32 +357,31 @@ static EndpointsForToken resolveWriteConflictsInPending(EndpointsForToken natura } /** - * @return the read layout for a token - this includes only live natural replicas, i.e. those that are not pending - * and not marked down by the failure detector. these are reverse sorted by the badness score of the configured snitch + * @return the read layout for a token - this includes natural replicas, i.e. those that are not pending. + * They are reverse sorted by the badness score of the configured snitch */ - static ReplicaLayout.ForTokenRead forTokenReadLiveSorted(ClusterMetadata metadata, Keyspace keyspace, AbstractReplicationStrategy replicationStrategy, Token token) + static ReplicaLayout.ForTokenRead forTokenReadSorted(ClusterMetadata metadata, Keyspace keyspace, AbstractReplicationStrategy replicationStrategy, TableId tableId, Token token, ReadCoordinator coordinator) { EndpointsForToken replicas = keyspace.getMetadata().params.replication.isLocal() ? forLocalStrategyToken(metadata, replicationStrategy, token) - : forNonLocalStrategyTokenRead(metadata, keyspace.getMetadata(), token); + : coordinator.forNonLocalStrategyTokenRead(metadata, keyspace.getMetadata(), tableId, token); + replicas = DatabaseDescriptor.getNodeProximity().sortedByProximity(FBUtilities.getBroadcastAddressAndPort(), replicas); - replicas = replicas.filter(FailureDetector.isReplicaAlive); + return new ReplicaLayout.ForTokenRead(replicationStrategy, replicas); } /** * TODO: we should really double check that the provided range does not overlap multiple token ring regions - * @return the read layout for a range - this includes only live natural replicas, i.e. those that are not pending - * and not marked down by the failure detector. these are reverse sorted by the badness score of the configured snitch + * @return the read layout for a range - these are reverse sorted by the badness score of the configured snitch */ - static ReplicaLayout.ForRangeRead forRangeReadLiveSorted(ClusterMetadata metadata, Keyspace keyspace, AbstractReplicationStrategy replicationStrategy, AbstractBounds range) + static ReplicaLayout.ForRangeRead forRangeReadSorted(ClusterMetadata metadata, Keyspace keyspace, AbstractReplicationStrategy replicationStrategy, AbstractBounds range) { EndpointsForRange replicas = keyspace.getMetadata().params.replication.isLocal() ? forLocalStrategyRange(metadata, replicationStrategy, range) : forNonLocalStategyRangeRead(metadata, keyspace.getMetadata(), range); replicas = DatabaseDescriptor.getNodeProximity().sortedByProximity(FBUtilities.getBroadcastAddressAndPort(), replicas); - replicas = replicas.filter(FailureDetector.isReplicaAlive); return new ReplicaLayout.ForRangeRead(replicationStrategy, range, replicas); } @@ -388,7 +390,7 @@ static EndpointsForRange forNonLocalStategyRangeRead(ClusterMetadata metadata, K return metadata.placements.get(keyspace.params.replication).reads.forRange(range.right.getToken()).get(); } - static EndpointsForToken forNonLocalStrategyTokenRead(ClusterMetadata metadata, KeyspaceMetadata keyspace, Token token) + public static EndpointsForToken forNonLocalStrategyTokenRead(ClusterMetadata metadata, KeyspaceMetadata keyspace, Token token) { return metadata.placements.get(keyspace.params.replication).reads.forToken(token).get(); } @@ -405,6 +407,10 @@ static EndpointsForRange forLocalStrategyRange(ClusterMetadata metadata, Abstrac static EndpointsForToken forLocalStrategyToken(ClusterMetadata metadata, AbstractReplicationStrategy replicationStrategy, Token t) { - return replicationStrategy.calculateNaturalReplicas(t, metadata).forToken(t); + if (!(t instanceof LocalPartitioner.LocalToken)) + return replicationStrategy.calculateNaturalReplicas(t, metadata).forToken(t); + + // local tokens use a different partitioner than the global one... so update the ranges + return EndpointsForToken.of(t, new Replica(FBUtilities.getBroadcastAddressAndPort(), new Range<>(t, t), true)); } } diff --git a/src/java/org/apache/cassandra/locator/ReplicaPlan.java b/src/java/org/apache/cassandra/locator/ReplicaPlan.java index 62db6f85f343..ee8198aac3f4 100644 --- a/src/java/org/apache/cassandra/locator/ReplicaPlan.java +++ b/src/java/org/apache/cassandra/locator/ReplicaPlan.java @@ -18,23 +18,23 @@ package org.apache.cassandra.locator; +import java.util.List; +import java.util.concurrent.CopyOnWriteArrayList; +import java.util.function.BiFunction; +import java.util.function.Function; +import java.util.function.Predicate; +import java.util.function.Supplier; + import com.google.common.collect.Iterables; + import org.apache.cassandra.db.ConsistencyLevel; import org.apache.cassandra.db.Keyspace; import org.apache.cassandra.db.PartitionPosition; import org.apache.cassandra.dht.AbstractBounds; import org.apache.cassandra.dht.Token; -import org.apache.cassandra.tcm.Epoch; -import org.apache.cassandra.exceptions.RequestFailureReason; +import org.apache.cassandra.exceptions.RequestFailure; import org.apache.cassandra.tcm.ClusterMetadata; -import org.apache.cassandra.utils.FBUtilities; - -import java.util.List; -import java.util.concurrent.CopyOnWriteArrayList; -import java.util.function.BiFunction; -import java.util.function.Function; -import java.util.function.Predicate; -import java.util.function.Supplier; +import org.apache.cassandra.tcm.Epoch; public interface ReplicaPlan, P extends ReplicaPlan> { @@ -44,12 +44,13 @@ public interface ReplicaPlan, P extends ReplicaPlan ConsistencyLevel consistencyLevel(); E contacts(); + E liveAndDown(); Replica lookup(InetAddressAndPort endpoint); P withContacts(E contacts); void collectSuccess(InetAddressAndPort inetAddressAndPort); - void collectFailure(InetAddressAndPort inetAddressAndPort, RequestFailureReason t); + void collectFailure(InetAddressAndPort inetAddressAndPort, RequestFailure t); boolean stillAppliesTo(ClusterMetadata newMetadata); interface ForRead, P extends ReplicaPlan.ForRead> extends ReplicaPlan @@ -82,29 +83,28 @@ abstract class AbstractReplicaPlan, P extends ReplicaPlan // - paxos, includes all live replicas (natural+pending), for this DC if SERIAL_LOCAL // ==> live.all() (if consistencyLevel.isDCLocal(), then .filter(consistencyLevel.isLocal)) protected final E contacts; + protected final E liveAndDown; protected final Function recompute; protected List contacted = new CopyOnWriteArrayList<>(); - AbstractReplicaPlan(Keyspace keyspace, AbstractReplicationStrategy replicationStrategy, ConsistencyLevel consistencyLevel, E contacts, Function recompute, Epoch epoch) + AbstractReplicaPlan(Keyspace keyspace, AbstractReplicationStrategy replicationStrategy, ConsistencyLevel consistencyLevel, E contacts, E liveAndDown, Function recompute, Epoch epoch) { assert contacts != null; this.keyspace = keyspace; this.replicationStrategy = replicationStrategy; this.consistencyLevel = consistencyLevel; this.contacts = contacts; + this.liveAndDown = liveAndDown; this.recompute = recompute; this.epoch = epoch; } public E contacts() { return contacts; } + public E liveAndDown() { return liveAndDown; } public Keyspace keyspace() { return keyspace; } public AbstractReplicationStrategy replicationStrategy() { return replicationStrategy; } public ConsistencyLevel consistencyLevel() { return consistencyLevel; } - public boolean canDoLocalRequest() - { - return contacts.contains(FBUtilities.getBroadcastAddressAndPort()); - } public Epoch epoch() { @@ -116,7 +116,7 @@ public void collectSuccess(InetAddressAndPort addr) contacted.add(addr); } - public void collectFailure(InetAddressAndPort inetAddressAndPort, RequestFailureReason t) {} + public void collectFailure(InetAddressAndPort inetAddressAndPort, RequestFailure t) {} } @@ -132,10 +132,11 @@ public static abstract class AbstractForRead, P extends F ConsistencyLevel consistencyLevel, E candidates, E contacts, + E liveAndDown, Function recompute, Epoch epoch) { - super(keyspace, replicationStrategy, consistencyLevel, contacts, recompute, epoch); + super(keyspace, replicationStrategy, consistencyLevel, contacts, liveAndDown, recompute, epoch); this.candidates = candidates; this.readQuorum = consistencyLevel.blockFor(replicationStrategy); } @@ -171,13 +172,13 @@ public boolean stillAppliesTo(ClusterMetadata newMetadata) ForRead newPlan = recompute.apply(newMetadata); - if (readCandidates().equals(newPlan.readCandidates())) + if (liveAndDown().equals(newPlan.liveAndDown())) return true; int readQuorum = newPlan.readQuorum(); for (InetAddressAndPort addr : contacted) { - if (newPlan.readCandidates().contains(addr)) + if (newPlan.liveAndDown().contains(addr)) readQuorum--; } @@ -204,17 +205,18 @@ public ForTokenRead(Keyspace keyspace, ConsistencyLevel consistencyLevel, EndpointsForToken candidates, EndpointsForToken contacts, + EndpointsForToken liveAndDown, Function recompute, Function, ReplicaPlan.ForWrite> repairPlan, Epoch epoch) { - super(keyspace, replicationStrategy, consistencyLevel, candidates, contacts, recompute, epoch); + super(keyspace, replicationStrategy, consistencyLevel, candidates, contacts, liveAndDown, recompute, epoch); this.repairPlan = repairPlan; } public ForTokenRead withContacts(EndpointsForToken newContacts) { - ForTokenRead res = new ForTokenRead(keyspace, replicationStrategy, consistencyLevel, candidates, newContacts, recompute, repairPlan, epoch); + ForTokenRead res = new ForTokenRead(keyspace, replicationStrategy, consistencyLevel, candidates, newContacts, liveAndDown, recompute, repairPlan, epoch); res.contacted.addAll(contacted); return res; } @@ -240,12 +242,13 @@ public ForRangeRead(Keyspace keyspace, AbstractBounds range, EndpointsForRange candidates, EndpointsForRange contact, + EndpointsForRange liveAndDown, int vnodeCount, Function recompute, BiFunction, Token, ReplicaPlan.ForWrite> repairPlan, Epoch epoch) { - super(keyspace, replicationStrategy, consistencyLevel, candidates, contact, recompute, epoch); + super(keyspace, replicationStrategy, consistencyLevel, candidates, contact, liveAndDown, recompute, epoch); this.range = range; this.vnodeCount = vnodeCount; this.repairPlan = repairPlan; @@ -260,7 +263,7 @@ public ForRangeRead(Keyspace keyspace, public ForRangeRead withContacts(EndpointsForRange newContact) { - ForRangeRead res = new ForRangeRead(keyspace, replicationStrategy, consistencyLevel, range, readCandidates(), newContact, vnodeCount, recompute, repairPlan, epoch); + ForRangeRead res = new ForRangeRead(keyspace, replicationStrategy, consistencyLevel, range, readCandidates(), newContact, liveAndDown, vnodeCount, recompute, repairPlan, epoch); res.contacted.addAll(contacted); return res; } @@ -284,6 +287,7 @@ public ForFullRangeRead(Keyspace keyspace, AbstractBounds range, EndpointsForRange candidates, EndpointsForRange contact, + EndpointsForRange liveAndDown, int vnodeCount, Epoch epoch) { @@ -291,7 +295,7 @@ public ForFullRangeRead(Keyspace keyspace, // the epoch change during the course of query execution so no recomputation function is supplied. Likewise, // no read repair is expected to be performed during this type of query so a null is also used in place of a // function for calculating the repair plan. - super(keyspace, replicationStrategy, consistencyLevel, range, candidates, contact, vnodeCount, null, null, epoch); + super(keyspace, replicationStrategy, consistencyLevel, range, candidates, contact, liveAndDown, vnodeCount, null, null, epoch); } @Override @@ -305,7 +309,6 @@ public static class ForWrite extends AbstractReplicaPlan recompute, Epoch epoch) { - super(keyspace, replicationStrategy, consistencyLevel, contact, recompute, epoch); + super(keyspace, replicationStrategy, consistencyLevel, contact, liveAndDown, recompute, epoch); this.pending = pending; - this.liveAndDown = liveAndDown; this.live = live; this.writeQuorum = consistencyLevel.blockForWrite(replicationStrategy, pending); } @@ -331,9 +333,6 @@ public ForWrite(Keyspace keyspace, /** Replicas that a region of the ring is moving to; not yet ready to serve reads, but should receive writes */ public EndpointsForToken pending() { return pending; } - /** Replicas that can participate in the write - this always includes all nodes (pending and natural) in all DCs, except for paxos LOCAL_QUORUM (which is local DC only) */ - public EndpointsForToken liveAndDown() { return liveAndDown; } - /** The live replicas present in liveAndDown, usually derived from FailureDetector.isReplicaAlive */ public EndpointsForToken live() { return live; } diff --git a/src/java/org/apache/cassandra/locator/ReplicaPlans.java b/src/java/org/apache/cassandra/locator/ReplicaPlans.java index b6a03b683bd1..d009ed33db8e 100644 --- a/src/java/org/apache/cassandra/locator/ReplicaPlans.java +++ b/src/java/org/apache/cassandra/locator/ReplicaPlans.java @@ -62,7 +62,9 @@ import org.apache.cassandra.index.Index; import org.apache.cassandra.index.IndexStatusManager; import org.apache.cassandra.schema.SchemaConstants; +import org.apache.cassandra.schema.TableId; import org.apache.cassandra.service.StorageService; +import org.apache.cassandra.service.reads.ReadCoordinator; import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.service.reads.AlwaysSpeculativeRetryPolicy; import org.apache.cassandra.service.reads.SpeculativeRetryPolicy; @@ -533,7 +535,7 @@ public static List sortByProximity(Collection forRead, ClusterMetadata metadata, Keyspace keyspace, ConsistencyLevel consistencyLevel, Token token, Predicate isAlive) throws UnavailableException + public static ReplicaPlan.ForWrite forReadRepair(ReplicaPlan forRead, ClusterMetadata metadata, Keyspace keyspace, TableId tableId, ConsistencyLevel consistencyLevel, Token token, Predicate isAlive, ReadCoordinator coordinator) throws UnavailableException { AbstractReplicationStrategy replicationStrategy = keyspace.getReplicationStrategy(); Selector selector = writeReadRepair(forRead); @@ -550,7 +552,7 @@ public static ReplicaPlan.ForWrite forReadRepair(ReplicaPlan forRead, Clus liveAndDown.all(), live.all(), contacts, - (newClusterMetadata) -> forReadRepair(forRead, newClusterMetadata, keyspace, consistencyLevel, token, isAlive), + (newClusterMetadata) -> forReadRepair(forRead, newClusterMetadata, keyspace, tableId, consistencyLevel, token, isAlive, coordinator), metadata.epoch); } @@ -839,13 +841,9 @@ public static ReplicaPlan.ForTokenRead forSingleReplicaRead(Keyspace keyspace, T private static ReplicaPlan.ForTokenRead forSingleReplicaRead(ClusterMetadata metadata, Keyspace keyspace, Token token, Replica replica) { - // todo; replica does not always contain token, figure out why -// if (!metadata.placements.get(keyspace.getMetadata().params.replication).reads.forToken(token).contains(replica)) -// throw UnavailableException.create(ConsistencyLevel.ONE, 1, 1, 0, 0); - EndpointsForToken one = EndpointsForToken.of(token, replica); - return new ReplicaPlan.ForTokenRead(keyspace, keyspace.getReplicationStrategy(), ConsistencyLevel.ONE, one, one, + return new ReplicaPlan.ForTokenRead(keyspace, keyspace.getReplicationStrategy(), ConsistencyLevel.ONE, one, one, one, (newClusterMetadata) -> forSingleReplicaRead(newClusterMetadata, keyspace, token, replica), (self) -> { throw new IllegalStateException("Read repair is not supported for short read/replica filtering protection."); @@ -866,7 +864,7 @@ private static ReplicaPlan.ForRangeRead forSingleReplicaRead(ClusterMetadata met // TODO: this is unsafe, as one.range() may be inconsistent with our supplied range; should refactor Range/AbstractBounds to single class EndpointsForRange one = EndpointsForRange.of(replica); - return new ReplicaPlan.ForRangeRead(keyspace, keyspace.getReplicationStrategy(), ConsistencyLevel.ONE, range, one, one, vnodeCount, + return new ReplicaPlan.ForRangeRead(keyspace, keyspace.getReplicationStrategy(), ConsistencyLevel.ONE, range, one, one, one, vnodeCount, (newClusterMetadata) -> forSingleReplicaRead(metadata, keyspace, range, replica, vnodeCount), (self, token) -> { throw new IllegalStateException("Read repair is not supported for short read/replica filtering protection."); @@ -883,37 +881,50 @@ private static ReplicaPlan.ForRangeRead forSingleReplicaRead(ClusterMetadata met * it would break EACH_QUORUM to do so without further filtering */ public static ReplicaPlan.ForTokenRead forRead(Keyspace keyspace, + TableId tableId, Token token, @Nullable Index.QueryPlan indexQueryPlan, ConsistencyLevel consistencyLevel, - SpeculativeRetryPolicy retry) + SpeculativeRetryPolicy retry, + ReadCoordinator coordinator) { - return forRead(ClusterMetadata.current(), keyspace, token, indexQueryPlan, consistencyLevel, retry, false); + return forRead(ClusterMetadata.current(), keyspace, tableId, token, indexQueryPlan, consistencyLevel, retry, coordinator, false); } public static ReplicaPlan.ForTokenRead forRead(ClusterMetadata metadata, Keyspace keyspace, + TableId tableId, Token token, @Nullable Index.QueryPlan indexQueryPlan, ConsistencyLevel consistencyLevel, - SpeculativeRetryPolicy retry) + SpeculativeRetryPolicy retry, + ReadCoordinator coordinator) { - return forRead(metadata, keyspace, token, indexQueryPlan, consistencyLevel, retry, true); + return forRead(metadata, keyspace, tableId, token, indexQueryPlan, consistencyLevel, retry, coordinator, true); } - private static ReplicaPlan.ForTokenRead forRead(ClusterMetadata metadata, Keyspace keyspace, Token token, @Nullable Index.QueryPlan indexQueryPlan, ConsistencyLevel consistencyLevel, SpeculativeRetryPolicy retry, boolean throwOnInsufficientLiveReplicas) + private static ReplicaPlan.ForTokenRead forRead(ClusterMetadata metadata, + Keyspace keyspace, + TableId tableId, + Token token, + @Nullable Index.QueryPlan indexQueryPlan, + ConsistencyLevel consistencyLevel, + SpeculativeRetryPolicy retry, + ReadCoordinator coordinator, + boolean throwOnInsufficientLiveReplicas) { AbstractReplicationStrategy replicationStrategy = keyspace.getReplicationStrategy(); - ReplicaLayout.ForTokenRead forTokenRead = ReplicaLayout.forTokenReadLiveSorted(metadata, keyspace, replicationStrategy, token); - EndpointsForToken candidates = candidatesForRead(keyspace, indexQueryPlan, consistencyLevel, forTokenRead.natural()); + ReplicaLayout.ForTokenRead forTokenReadLiveAndDown = ReplicaLayout.forTokenReadSorted(metadata, keyspace, replicationStrategy, tableId, token, coordinator); + ReplicaLayout.ForTokenRead forTokenReadLive = forTokenReadLiveAndDown.filter(FailureDetector.isReplicaAlive); + EndpointsForToken candidates = candidatesForRead(keyspace, indexQueryPlan, consistencyLevel, forTokenReadLive.all()); EndpointsForToken contacts = contactForRead(metadata.locator, replicationStrategy, consistencyLevel, retry.equals(AlwaysSpeculativeRetryPolicy.INSTANCE), candidates); if (throwOnInsufficientLiveReplicas) assureSufficientLiveReplicasForRead(metadata.locator, replicationStrategy, consistencyLevel, contacts); - return new ReplicaPlan.ForTokenRead(keyspace, replicationStrategy, consistencyLevel, candidates, contacts, - (newClusterMetadata) -> forRead(newClusterMetadata, keyspace, token, indexQueryPlan, consistencyLevel, retry, false), - (self) -> forReadRepair(self, metadata, keyspace, consistencyLevel, token, FailureDetector.isReplicaAlive), + return new ReplicaPlan.ForTokenRead(keyspace, replicationStrategy, consistencyLevel, candidates, contacts, forTokenReadLiveAndDown.all(), + (newClusterMetadata) -> forRead(newClusterMetadata, keyspace, tableId, token, indexQueryPlan, consistencyLevel, retry, coordinator, false), + (self) -> forReadRepair(self, metadata, keyspace, tableId, consistencyLevel, token, FailureDetector.isReplicaAlive, coordinator), metadata.epoch); } @@ -925,16 +936,18 @@ private static ReplicaPlan.ForTokenRead forRead(ClusterMetadata metadata, Keyspa * There is no speculation for range read queries at present, so we never 'always speculate' here, and a failed response fails the query. */ public static ReplicaPlan.ForRangeRead forRangeRead(Keyspace keyspace, + TableId tableId, @Nullable Index.QueryPlan indexQueryPlan, ConsistencyLevel consistencyLevel, AbstractBounds range, int vnodeCount) { - return forRangeRead(ClusterMetadata.current(), keyspace, indexQueryPlan, consistencyLevel, range, vnodeCount, true); + return forRangeRead(ClusterMetadata.current(), keyspace, tableId, indexQueryPlan, consistencyLevel, range, vnodeCount, true); } public static ReplicaPlan.ForRangeRead forRangeRead(ClusterMetadata metadata, Keyspace keyspace, + TableId tableId, @Nullable Index.QueryPlan indexQueryPlan, ConsistencyLevel consistencyLevel, AbstractBounds range, @@ -942,8 +955,9 @@ public static ReplicaPlan.ForRangeRead forRangeRead(ClusterMetadata metadata, boolean throwOnInsufficientLiveReplicas) { AbstractReplicationStrategy replicationStrategy = keyspace.getReplicationStrategy(); - ReplicaLayout.ForRangeRead forRangeRead = ReplicaLayout.forRangeReadLiveSorted(metadata, keyspace, replicationStrategy, range); - EndpointsForRange candidates = candidatesForRead(keyspace, indexQueryPlan, consistencyLevel, forRangeRead.natural()); + ReplicaLayout.ForRangeRead forRangeReadLiveAndDown = ReplicaLayout.forRangeReadSorted(metadata, keyspace, replicationStrategy, range); + ReplicaLayout.ForRangeRead forRangeReadLive = forRangeReadLiveAndDown.filter(FailureDetector.isReplicaAlive); + EndpointsForRange candidates = candidatesForRead(keyspace, indexQueryPlan, consistencyLevel, forRangeReadLive.natural()); EndpointsForRange contacts = contactForRead(metadata.locator, replicationStrategy, consistencyLevel, false, candidates); if (throwOnInsufficientLiveReplicas) @@ -955,9 +969,10 @@ public static ReplicaPlan.ForRangeRead forRangeRead(ClusterMetadata metadata, range, candidates, contacts, + forRangeReadLiveAndDown.all(), vnodeCount, - (newClusterMetadata) -> forRangeRead(newClusterMetadata, keyspace, indexQueryPlan, consistencyLevel, range, vnodeCount, false), - (self, token) -> forReadRepair(self, metadata, keyspace, consistencyLevel, token, FailureDetector.isReplicaAlive), + (newClusterMetadata) -> forRangeRead(newClusterMetadata, keyspace, tableId, indexQueryPlan, consistencyLevel, range, vnodeCount, false), + (self, token) -> forReadRepair(self, metadata, keyspace, tableId, consistencyLevel, token, FailureDetector.isReplicaAlive, ReadCoordinator.DEFAULT), metadata.epoch); } @@ -983,7 +998,7 @@ public static ReplicaPlan.ForRangeRead forFullRangeRead(Keyspace keyspace, EndpointsForRange contacts = builder.build(); ClusterMetadata metadata = ClusterMetadata.current(); - return new ReplicaPlan.ForFullRangeRead(keyspace, replicationStrategy, consistencyLevel, range, contacts, contacts, vnodeCount, metadata.epoch); + return new ReplicaPlan.ForFullRangeRead(keyspace, replicationStrategy, consistencyLevel, range, contacts, contacts, contacts, vnodeCount, metadata.epoch); } /** @@ -991,6 +1006,7 @@ public static ReplicaPlan.ForRangeRead forFullRangeRead(Keyspace keyspace, */ public static ReplicaPlan.ForRangeRead maybeMerge(ClusterMetadata metadata, Keyspace keyspace, + TableId tableId, ConsistencyLevel consistencyLevel, ReplicaPlan.ForRangeRead left, ReplicaPlan.ForRangeRead right) @@ -1000,6 +1016,7 @@ public static ReplicaPlan.ForRangeRead maybeMerge(ClusterMetadata metadata, if (!left.epoch.equals(right.epoch)) return null; + EndpointsForRange mergedLiveAndDown = left.liveAndDown().keep(right.liveAndDown().endpoints()); EndpointsForRange mergedCandidates = left.readCandidates().keep(right.readCandidates().endpoints()); AbstractReplicationStrategy replicationStrategy = keyspace.getReplicationStrategy(); EndpointsForRange contacts = contactForRead(metadata.locator, replicationStrategy, consistencyLevel, false, mergedCandidates); @@ -1023,9 +1040,11 @@ public static ReplicaPlan.ForRangeRead maybeMerge(ClusterMetadata metadata, newRange, mergedCandidates, contacts, + mergedLiveAndDown, newVnodeCount, (newClusterMetadata) -> forRangeRead(newClusterMetadata, keyspace, + tableId, null, // TODO (TCM) - we only use the recomputed ForRangeRead to check stillAppliesTo - make sure passing null here is ok consistencyLevel, newRange, @@ -1034,7 +1053,7 @@ public static ReplicaPlan.ForRangeRead maybeMerge(ClusterMetadata metadata, (self, token) -> { // It might happen that the ring has moved forward since the operation has started, but because we'll be recomputing a quorum // after the operation is complete, we will catch inconsistencies either way. - return forReadRepair(self, ClusterMetadata.current(), keyspace, consistencyLevel, token, FailureDetector.isReplicaAlive); + return forReadRepair(self, ClusterMetadata.current(), keyspace, tableId, consistencyLevel, token, FailureDetector.isReplicaAlive, ReadCoordinator.DEFAULT); }, left.epoch); } diff --git a/src/java/org/apache/cassandra/locator/SimpleSnitch.java b/src/java/org/apache/cassandra/locator/SimpleSnitch.java index e06316fa2666..953f2e5c897c 100644 --- a/src/java/org/apache/cassandra/locator/SimpleSnitch.java +++ b/src/java/org/apache/cassandra/locator/SimpleSnitch.java @@ -17,6 +17,10 @@ */ package org.apache.cassandra.locator; +import java.util.Comparator; + +import org.apache.cassandra.utils.Sortable; + /** * A simple endpoint snitch implementation that treats Strategy order as proximity, * allowing non-read-repaired reads to prefer a single endpoint, which improves @@ -58,4 +62,16 @@ public boolean isWorthMergingForRangeQuery(ReplicaCollection merged, ReplicaC { return sorter.isWorthMergingForRangeQuery(merged, l1, l2); } + + @Override + public boolean supportCompareByEndpoint() + { + return sorter.supportCompareByEndpoint(); + } + + @Override + public > Comparator endpointComparator(InetAddressAndPort address, C addresses) + { + return sorter.endpointComparator(address, addresses); + } } diff --git a/src/java/org/apache/cassandra/locator/SnitchAdapter.java b/src/java/org/apache/cassandra/locator/SnitchAdapter.java index 1a32dd64970c..b90e6fed1615 100644 --- a/src/java/org/apache/cassandra/locator/SnitchAdapter.java +++ b/src/java/org/apache/cassandra/locator/SnitchAdapter.java @@ -18,12 +18,14 @@ package org.apache.cassandra.locator; +import java.util.Comparator; import java.util.HashSet; import java.util.Set; import org.apache.cassandra.exceptions.ConfigurationException; import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.tcm.membership.Location; +import org.apache.cassandra.utils.Sortable; public class SnitchAdapter implements InitialLocationProvider, NodeProximity, NodeAddressConfig { @@ -81,4 +83,16 @@ public boolean preferLocalConnections() { return snitch.preferLocalConnections(); } + + @Override + public boolean supportCompareByEndpoint() + { + return snitch.supportCompareByEndpoint(); + } + + @Override + public > Comparator endpointComparator(InetAddressAndPort address, C addresses) + { + return snitch.endpointComparator(address, addresses); + } } diff --git a/src/java/org/apache/cassandra/metrics/AccordCacheMetrics.java b/src/java/org/apache/cassandra/metrics/AccordCacheMetrics.java new file mode 100644 index 000000000000..1250c34751df --- /dev/null +++ b/src/java/org/apache/cassandra/metrics/AccordCacheMetrics.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.metrics; + +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; + +import com.codahale.metrics.Histogram; + +import static org.apache.cassandra.metrics.CacheMetrics.TYPE_NAME; +import static org.apache.cassandra.metrics.CassandraMetricsRegistry.Metrics; + +public class AccordCacheMetrics extends CacheAccessMetrics +{ + public static final String OBJECT_SIZE = "ObjectSize"; + + public final Histogram objectSize; + + private final Map instanceMetrics = new ConcurrentHashMap<>(2); + + private final String scope; + + public AccordCacheMetrics(String scope) + { + super(new DefaultNameFactory(TYPE_NAME, scope)); + objectSize = Metrics.histogram(factory.createMetricName(OBJECT_SIZE), false); + this.scope = scope; + } + + public CacheAccessMetrics forInstance(Class klass) + { + // cannot make Class hashCode deterministic, as cannot rewrite - so cannot safely use as Map key if want deterministic simulation + // (or we need to create extra hoops to catch this specific case in method rewriting) + return instanceMetrics.computeIfAbsent(klass.getSimpleName(), k -> new CacheAccessMetrics(new DefaultNameFactory(TYPE_NAME, String.format("%s-%s", scope, k)))); + } +} diff --git a/src/java/org/apache/cassandra/metrics/AccordClientRequestMetrics.java b/src/java/org/apache/cassandra/metrics/AccordClientRequestMetrics.java new file mode 100644 index 000000000000..3dc3509dff90 --- /dev/null +++ b/src/java/org/apache/cassandra/metrics/AccordClientRequestMetrics.java @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.metrics; + +import javax.annotation.Nullable; + +import com.codahale.metrics.Histogram; +import com.codahale.metrics.Meter; + +import static org.apache.cassandra.metrics.CassandraMetricsRegistry.Metrics; + +public class AccordClientRequestMetrics extends ClientRequestMetrics +{ + public final @Nullable ClientRequestMetrics shared; + + public final Histogram keySize; + + // During migration back to Paxos it's possible a transaction runs + // in an Epoch where Accord is no longer accepting transactions + // and we still run it to completion, but we do skip the read from Cassandra + // although it would be harmless. This should only occur briefly when coordinators + // start transactions on the wrong protocol due to temporarily out of data cluster metadata. + public final Meter migrationSkippedReads; + + // Number of times a key had to be run through PaxosRepair for migration to Accord + public final Meter paxosKeyMigrations; + + // Number of times a query was rejected by Accord in TxnQuery due to a migration back to Paxos + public final Meter accordMigrationRejects; + public final Meter preempted; + public final Meter topologyMismatches; + public final boolean isWrite; + + public AccordClientRequestMetrics(String scope, ClientRequestMetrics shared, boolean isWrite) + { + super(scope); + this.shared = shared; + this.isWrite = isWrite; + + keySize = Metrics.histogram(factory.createMetricName("KeySizeHistogram"), false); + migrationSkippedReads = Metrics.meter(factory.createMetricName("MigrationSkippedReads")); + paxosKeyMigrations = Metrics.meter(factory.createMetricName("PaxosKeyMigrations")); + accordMigrationRejects = Metrics.meter(factory.createMetricName("AccordMigrationRejects")); + preempted = Metrics.meter(factory.createMetricName("Preempted")); + topologyMismatches = Metrics.meter(factory.createMetricName("TopologyMismatches")); + } + + @Override + public void release() + { + super.release(); + Metrics.remove(factory.createMetricName("KeySizeHistogram")); + Metrics.remove(factory.createMetricName("MigrationSkippedReads")); + Metrics.remove(factory.createMetricName("PaxosKeyMigrations")); + Metrics.remove(factory.createMetricName("AccordMigrationRejects")); + Metrics.remove(factory.createMetricName("Preempted")); + Metrics.remove(factory.createMetricName("TopologyMismatches")); + } + +} diff --git a/src/java/org/apache/cassandra/metrics/AccordMetrics.java b/src/java/org/apache/cassandra/metrics/AccordMetrics.java new file mode 100644 index 000000000000..6367fe7873e7 --- /dev/null +++ b/src/java/org/apache/cassandra/metrics/AccordMetrics.java @@ -0,0 +1,315 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.metrics; + +import java.lang.reflect.Field; +import java.util.concurrent.TimeUnit; + +import accord.api.EventListener; +import accord.local.Command; +import accord.primitives.Deps; +import accord.primitives.PartialDeps; +import accord.primitives.Timestamp; +import accord.primitives.TxnId; +import com.codahale.metrics.Counting; +import com.codahale.metrics.Histogram; +import com.codahale.metrics.Meter; +import com.codahale.metrics.Timer; +import org.apache.cassandra.service.accord.api.AccordTimeService; + +import static java.util.concurrent.TimeUnit.MICROSECONDS; +import static org.apache.cassandra.metrics.CassandraMetricsRegistry.Metrics; + +public class AccordMetrics +{ + public final static AccordMetrics readMetrics = new AccordMetrics("ro"); + public final static AccordMetrics writeMetrics = new AccordMetrics("rw"); + + public static final String STABLE_LATENCY = "StableLatency"; + public static final String EXECUTE_LATENCY = "ExecuteLatency"; + public static final String APPLY_LATENCY = "ApplyLatency"; + public static final String APPLY_DURATION = "ApplyDuration"; + public static final String PARTIAL_DEPENDENCIES = "PartialDependencies"; + public static final String PROGRESS_LOG_SIZE = "ProgressLogSize"; + + public static final String DEPENDENCIES = "Dependencies"; + public static final String EPHEMERAL = "Ephemeral"; + public static final String FAST_PATHS = "FastPaths"; + public static final String SLOW_PATHS = "SlowPaths"; + public static final String PREEMPTS = "Preempts"; + public static final String TIMEOUTS = "Timeouts"; + public static final String INVALIDATIONS = "Invalidations"; + public static final String RECOVERY_DELAY = "RecoveryDelay"; + public static final String RECOVERY_TIME = "RecoveryTime"; + public static final String FAST_PATH_TO_TOTAL = "FastPathToTotal"; + public static final String ACCORD_REPLICA = "AccordReplica"; + public static final String ACCORD_COORDINATOR = "AccordCoordinator"; + + /** + * The time between start on the coordinator and commit on this replica. + */ + public final Timer stableLatency; + + /** + * The time between start on the coordinator and execution on this replica. + */ + public final Timer executeLatency; + + /** + * The time between start on the coordinator and application on this replica. + */ + public final Timer applyLatency; + + /** + * Duration of applying changes. + */ + public final Timer applyDuration; + + /** + * A histogram of the number of dependencies per partial transaction at this replica. + */ + public final Histogram partialDependencies; + + public final Meter progressLogSize; + + /** + * A histogram of the number of dependencies per transaction at this coordinator. + */ + public final Histogram dependencies; + + /** + * The number of fast path transactions executed on this coordinator. + */ + public final Meter fastPaths; + + /** + * The number of slow path transactions executed on this coordinator. + */ + public final Meter slowPaths; + + /** + * The number of preempted transactions on this coordinator. + */ + public final Meter preempts; + + /** + * The number of timed out transactions on this coordinator. + */ + public final Meter timeouts; + + /** + * The number of invalidated transactions on this coordinator. + */ + public final Meter invalidations; + + /** + * The time between the start of the transaction and the start of the recovery, if the transaction is recovered. + */ + public final Timer recoveryDelay; + + /** + * The time between the start of the recovery and the execution of the transaction, if the transaction is recovered. + */ + public final Timer recoveryDuration; + + /** + * The ratio of the number of fast path transactions to the total number of transactions. + */ + public final RatioGaugeSet fastPathToTotal; + + private AccordMetrics(String scope) + { + DefaultNameFactory replica = new DefaultNameFactory(ACCORD_REPLICA, scope); + stableLatency = Metrics.timer(replica.createMetricName(STABLE_LATENCY)); + executeLatency = Metrics.timer(replica.createMetricName(EXECUTE_LATENCY)); + applyLatency = Metrics.timer(replica.createMetricName(APPLY_LATENCY)); + applyDuration = Metrics.timer(replica.createMetricName(APPLY_DURATION)); + partialDependencies = Metrics.histogram(replica.createMetricName(PARTIAL_DEPENDENCIES), true); + progressLogSize = Metrics.meter(replica.createMetricName(PROGRESS_LOG_SIZE)); + + DefaultNameFactory coordinator = new DefaultNameFactory(ACCORD_COORDINATOR, scope); + dependencies = Metrics.histogram(coordinator.createMetricName(DEPENDENCIES), true); + fastPaths = Metrics.meter(coordinator.createMetricName(FAST_PATHS)); + slowPaths = Metrics.meter(coordinator.createMetricName(SLOW_PATHS)); + preempts = Metrics.meter(coordinator.createMetricName(PREEMPTS)); + timeouts = Metrics.meter(coordinator.createMetricName(TIMEOUTS)); + invalidations = Metrics.meter(coordinator.createMetricName(INVALIDATIONS)); + recoveryDelay = Metrics.timer(coordinator.createMetricName(RECOVERY_DELAY)); + recoveryDuration = Metrics.timer(coordinator.createMetricName(RECOVERY_TIME)); + fastPathToTotal = new RatioGaugeSet(fastPaths, RatioGaugeSet.sum(fastPaths, slowPaths), coordinator, FAST_PATH_TO_TOTAL + ".%s"); + } + + @Override + public String toString() + { + StringBuilder builder = new StringBuilder(); + builder.append("AccordMetrics ["); + + try + { + for (Field f : getClass().getDeclaredFields()) + { + f.setAccessible(true); + if (Counting.class.isAssignableFrom(f.getType())) + { + Counting metric = (Counting) f.get(this); + builder.append(String.format("%s: count=%d, ", f.getName(), metric.getCount())); + } + } + } + catch (IllegalAccessException e) + { + throw new RuntimeException(e); + } + builder.append("]"); + return builder.toString(); + } + + public static class Listener implements EventListener + { + public final static Listener instance = new Listener(AccordMetrics.readMetrics, AccordMetrics.writeMetrics); + + private final AccordMetrics readMetrics; + private final AccordMetrics writeMetrics; + + public Listener(AccordMetrics readMetrics, AccordMetrics writeMetrics) + { + this.readMetrics = readMetrics; + this.writeMetrics = writeMetrics; + } + + private AccordMetrics forTransaction(TxnId txnId) + { + if (txnId.isWrite()) + return writeMetrics; + else if (txnId.isSomeRead()) + return readMetrics; + else + return null; + } + + @Override + public void onStable(Command cmd) + { + long now = AccordTimeService.nowMicros(); + AccordMetrics metrics = forTransaction(cmd.txnId()); + if (metrics != null) + { + long trxTimestamp = cmd.txnId().hlc(); + metrics.stableLatency.update(now - trxTimestamp, TimeUnit.MICROSECONDS); + } + } + + @Override + public void onExecuted(Command cmd) + { + long now = AccordTimeService.nowMicros(); + AccordMetrics metrics = forTransaction(cmd.txnId()); + if (metrics != null) + { + Timestamp trxTimestamp = cmd.txnId(); + metrics.executeLatency.update(now - trxTimestamp.hlc(), TimeUnit.MICROSECONDS); + PartialDeps deps = cmd.partialDeps(); + metrics.partialDependencies.update(deps != null ? deps.txnIdCount() : 0); + } + } + + @Override + public void onApplied(Command cmd, long applyStartTimestamp) + { + long now = AccordTimeService.nowMicros(); + AccordMetrics metrics = forTransaction(cmd.txnId()); + if (metrics != null) + { + Timestamp trxTimestamp = cmd.txnId(); + metrics.applyLatency.update(now - trxTimestamp.hlc(), TimeUnit.MICROSECONDS); + metrics.applyDuration.update(now - applyStartTimestamp, TimeUnit.MICROSECONDS); + } + } + + @Override + public void onFastPathTaken(TxnId txnId, Deps deps) + { + AccordMetrics metrics = forTransaction(txnId); + if (metrics != null) + { + metrics.fastPaths.mark(); + metrics.dependencies.update(deps.txnIdCount()); + } + } + + @Override + public void onSlowPathTaken(TxnId txnId, Deps deps) + { + AccordMetrics metrics = forTransaction(txnId); + if (metrics != null) + { + metrics.slowPaths.mark(); + metrics.dependencies.update(deps.txnIdCount()); + } + } + + @Override + public void onRecover(TxnId txnId, Timestamp recoveryTimestamp) + { + AccordMetrics metrics = forTransaction(txnId); + if (metrics != null) + { + long now = AccordTimeService.nowMicros(); + + metrics.recoveryDuration.update(now - recoveryTimestamp.hlc(), MICROSECONDS); + metrics.recoveryDelay.update(recoveryTimestamp.hlc() - txnId.hlc(), MICROSECONDS); + } + } + + @Override + public void onPreempted(TxnId txnId) + { + AccordMetrics metrics = forTransaction(txnId); + if (metrics != null) + metrics.preempts.mark(); + } + + @Override + public void onTimeout(TxnId txnId) + { + // TODO (required): we appear to be marking this twice, once in AccordResult and once here. + // why does AccordMetricsTest only see this one? remove duplication. + AccordMetrics metrics = forTransaction(txnId); + if (metrics != null) + metrics.timeouts.mark(); + } + + @Override + public void onInvalidated(TxnId txnId) + { + AccordMetrics metrics = forTransaction(txnId); + if (metrics != null) + metrics.invalidations.mark(); + } + + @Override + public void onProgressLogSizeChange(TxnId txnId, int delta) + { + AccordMetrics metrics = forTransaction(txnId); + if (metrics != null) + metrics.progressLogSize.mark(delta); + } + } +} diff --git a/src/java/org/apache/cassandra/metrics/AutoRepairMetrics.java b/src/java/org/apache/cassandra/metrics/AutoRepairMetrics.java new file mode 100644 index 000000000000..3ef24a9eec1b --- /dev/null +++ b/src/java/org/apache/cassandra/metrics/AutoRepairMetrics.java @@ -0,0 +1,223 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.metrics; + +import com.codahale.metrics.Counter; +import com.codahale.metrics.Gauge; +import com.google.common.annotations.VisibleForTesting; +import org.apache.cassandra.repair.autorepair.AutoRepairConfig.RepairType; +import org.apache.cassandra.repair.autorepair.AutoRepairUtils; +import org.apache.cassandra.repair.autorepair.AutoRepair; +import org.apache.cassandra.service.AutoRepairService; + +import static java.util.concurrent.TimeUnit.MILLISECONDS; +import static org.apache.cassandra.metrics.CassandraMetricsRegistry.Metrics; +import static org.apache.cassandra.utils.LocalizeString.toLowerCaseLocalized; + +/** + * Metrics related to AutoRepair. + */ +public class AutoRepairMetrics +{ + public static final String TYPE_NAME = "autorepair"; + public final Gauge repairsInProgress; + public final Gauge nodeRepairTimeInSec; + public final Gauge clusterRepairTimeInSec; + public final Gauge longestUnrepairedSec; + public final Gauge repairStartLagSec; + public final Gauge succeededTokenRangesCount; + public final Gauge failedTokenRangesCount; + public final Gauge skippedTokenRangesCount; + public final Gauge skippedTablesCount; + public final Gauge totalMVTablesConsideredForRepair; + public final Gauge totalDisabledRepairTables; + public Counter repairTurnMyTurn; + public Counter repairTurnMyTurnDueToPriority; + public Counter repairTurnMyTurnForceRepair; + public Counter repairDelayedByReplica; + public Counter repairDelayedBySchedule; + + private final RepairType repairType; + + private volatile int repairStartLagSecVal; + + public AutoRepairMetrics(RepairType repairType) + { + this.repairType = repairType; + AutoRepairMetricsFactory factory = new AutoRepairMetricsFactory(repairType); + + repairsInProgress = Metrics.register(factory.createMetricName("RepairsInProgress"), new Gauge() + { + public Integer getValue() + { + return AutoRepair.instance.getRepairState(repairType).isRepairInProgress() ? 1 : 0; + } + }); + + nodeRepairTimeInSec = Metrics.register(factory.createMetricName("NodeRepairTimeInSec"), new Gauge() + { + public Integer getValue() + { + return AutoRepair.instance.getRepairState(repairType).getNodeRepairTimeInSec(); + } + }); + + clusterRepairTimeInSec = Metrics.register(factory.createMetricName("ClusterRepairTimeInSec"), new Gauge() + { + public Integer getValue() + { + return AutoRepair.instance.getRepairState(repairType).getClusterRepairTimeInSec(); + } + }); + + skippedTokenRangesCount = Metrics.register(factory.createMetricName("SkippedTokenRangesCount"), new Gauge() + { + public Integer getValue() + { + return AutoRepair.instance.getRepairState(repairType).getSkippedTokenRangesCount(); + } + }); + + skippedTablesCount = Metrics.register(factory.createMetricName("SkippedTablesCount"), new Gauge() + { + public Integer getValue() + { + return AutoRepair.instance.getRepairState(repairType).getSkippedTablesCount(); + } + }); + + longestUnrepairedSec = Metrics.register(factory.createMetricName("LongestUnrepairedSec"), new Gauge() + { + public Integer getValue() + { + return AutoRepair.instance.getRepairState(repairType).getLongestUnrepairedSec(); + } + }); + + repairStartLagSec = Metrics.register(factory.createMetricName("RepairStartLagSec"), new Gauge() + { + public Integer getValue() + { + return repairStartLagSecVal; + } + }); + + succeededTokenRangesCount = Metrics.register(factory.createMetricName("SucceededTokenRangesCount"), new Gauge() + { + public Integer getValue() + { + return AutoRepair.instance.getRepairState(repairType).getSucceededTokenRangesCount(); + } + }); + + failedTokenRangesCount = Metrics.register(factory.createMetricName("FailedTokenRangesCount"), new Gauge() + { + public Integer getValue() + { + return AutoRepair.instance.getRepairState(repairType).getFailedTokenRangesCount(); + } + }); + + repairTurnMyTurn = Metrics.counter(factory.createMetricName("RepairTurnMyTurn")); + repairTurnMyTurnDueToPriority = Metrics.counter(factory.createMetricName("RepairTurnMyTurnDueToPriority")); + repairTurnMyTurnForceRepair = Metrics.counter(factory.createMetricName("RepairTurnMyTurnForceRepair")); + + repairDelayedByReplica = Metrics.counter(factory.createMetricName("RepairDelayedByReplica")); + repairDelayedBySchedule = Metrics.counter(factory.createMetricName("RepairDelayedBySchedule")); + + totalMVTablesConsideredForRepair = Metrics.register(factory.createMetricName("TotalMVTablesConsideredForRepair"), new Gauge() + { + public Integer getValue() + { + return AutoRepair.instance.getRepairState(repairType).getTotalMVTablesConsideredForRepair(); + } + }); + + totalDisabledRepairTables = Metrics.register(factory.createMetricName("TotalDisabledRepairTables"), new Gauge() + { + public Integer getValue() + { + return AutoRepair.instance.getRepairState(repairType).getTotalDisabledTablesRepairCount(); + } + }); + } + + public void recordTurn(AutoRepairUtils.RepairTurn turn) + { + switch (turn) + { + case MY_TURN: + repairTurnMyTurn.inc(); + break; + case MY_TURN_FORCE_REPAIR: + repairTurnMyTurnForceRepair.inc(); + break; + case MY_TURN_DUE_TO_PRIORITY: + repairTurnMyTurnDueToPriority.inc(); + break; + default: + throw new RuntimeException(String.format("Unrecoginized turn: %s", turn.name())); + } + this.repairStartLagSecVal = 0; + } + + /** + * Record perceived lag in scheduling repair. + *

+ * Takes the current time and subtracts it from the given last repair finish time. It then compares the difference + * with the min repair interval for this repair type, and if that value is greater than 0, records it. + */ + public void recordRepairStartLag(long lastFinishTimeInMs) + { + long now = AutoRepair.instance.currentTimeMs(); + long deltaFinish = now - lastFinishTimeInMs; + long deltaMinRepairInterval = deltaFinish - AutoRepairService.instance + .getAutoRepairConfig().getRepairMinInterval(repairType) + .toMilliseconds(); + this.repairStartLagSecVal = deltaMinRepairInterval > 0 ? (int) MILLISECONDS.toSeconds(deltaMinRepairInterval) : 0; + } + + @VisibleForTesting + protected static class AutoRepairMetricsFactory implements MetricNameFactory + { + private static final String TYPE = "AutoRepair"; + @VisibleForTesting + protected final String repairType; + + protected AutoRepairMetricsFactory(RepairType repairType) + { + this.repairType = toLowerCaseLocalized(repairType.toString()); + } + + @Override + public CassandraMetricsRegistry.MetricName createMetricName(String metricName) + { + StringBuilder mbeanName = new StringBuilder(); + mbeanName.append(DefaultNameFactory.GROUP_NAME).append(':'); + mbeanName.append("type=").append(TYPE); + mbeanName.append(",name=").append(metricName); + mbeanName.append(",repairType=").append(repairType); + + StringBuilder scope = new StringBuilder(); + scope.append("repairType=").append(repairType); + + return new CassandraMetricsRegistry.MetricName(DefaultNameFactory.GROUP_NAME, toLowerCaseLocalized(TYPE), + metricName, scope.toString(), mbeanName.toString()); + } + } +} diff --git a/src/java/org/apache/cassandra/metrics/AutoRepairMetricsManager.java b/src/java/org/apache/cassandra/metrics/AutoRepairMetricsManager.java new file mode 100644 index 000000000000..e97ce34e5a73 --- /dev/null +++ b/src/java/org/apache/cassandra/metrics/AutoRepairMetricsManager.java @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.metrics; + +import org.apache.cassandra.repair.autorepair.AutoRepairConfig.RepairType; + +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; + +/** + * AutoRepair metrics manager holding all the auto-repair related metrics. + */ +public class AutoRepairMetricsManager +{ + private static final Map metrics = new ConcurrentHashMap<>(); + + public static AutoRepairMetrics getMetrics(RepairType repairType) + { + return metrics.computeIfAbsent(repairType, k -> new AutoRepairMetrics(repairType)); + } +} diff --git a/src/java/org/apache/cassandra/metrics/CASClientRequestMetrics.java b/src/java/org/apache/cassandra/metrics/CASClientRequestMetrics.java index 654bb059d16e..f3ae6c89248f 100644 --- a/src/java/org/apache/cassandra/metrics/CASClientRequestMetrics.java +++ b/src/java/org/apache/cassandra/metrics/CASClientRequestMetrics.java @@ -29,6 +29,12 @@ public class CASClientRequestMetrics extends ClientRequestMetrics public final Histogram contention; public final Counter unfinishedCommit; public final Meter unknownResult; + // CAS request rejected after Prepare/Promise due to migration from Paxos to Accord + public final Meter beginMigrationRejects; + // Number of times a CAS request was rejected after Propose/Accept due to migration from Paxos to Accord + public final Meter acceptMigrationRejects; + // Number of times a key was migrated from Accord to Paxos + public final Meter accordKeyMigrations; public CASClientRequestMetrics(String scope) { @@ -36,6 +42,9 @@ public CASClientRequestMetrics(String scope) contention = Metrics.histogram(factory.createMetricName("ContentionHistogram"), false); unfinishedCommit = Metrics.counter(factory.createMetricName("UnfinishedCommit")); unknownResult = Metrics.meter(factory.createMetricName("UnknownResult")); + beginMigrationRejects = Metrics.meter(factory.createMetricName("PaxosBeginMigrationRejects")); + acceptMigrationRejects = Metrics.meter(factory.createMetricName("PaxosAcceptMigrationRejects")); + accordKeyMigrations = Metrics.meter(factory.createMetricName("AccordKeyMigrations")); } public void release() @@ -44,5 +53,8 @@ public void release() Metrics.remove(factory.createMetricName("ContentionHistogram")); Metrics.remove(factory.createMetricName("UnfinishedCommit")); Metrics.remove(factory.createMetricName("UnknownResult")); + Metrics.remove(factory.createMetricName("PaxosBeginMigrationRejects")); + Metrics.remove(factory.createMetricName("PaxosAcceptMigrationRejects")); + Metrics.remove(factory.createMetricName("AccordKeyMigrations")); } } diff --git a/src/java/org/apache/cassandra/metrics/CacheAccessMetrics.java b/src/java/org/apache/cassandra/metrics/CacheAccessMetrics.java new file mode 100644 index 000000000000..59d76a9f4904 --- /dev/null +++ b/src/java/org/apache/cassandra/metrics/CacheAccessMetrics.java @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.metrics; + +import com.google.common.annotations.VisibleForTesting; + +import com.codahale.metrics.Meter; + +import static org.apache.cassandra.metrics.CassandraMetricsRegistry.Metrics; + +public class CacheAccessMetrics +{ + /** + * Total number of cache hits + */ + public final Meter hits; + + /** + * Total number of cache misses + */ + public final Meter misses; + + /** + * Total number of cache requests + */ + public final Meter requests; + + public final RatioGaugeSet hitRate; + + public final RatioGaugeSet missRate; + + protected final MetricNameFactory factory; + + public CacheAccessMetrics(MetricNameFactory factory) + { + this.factory = factory; + + this.hits = Metrics.meter(factory.createMetricName("Hits")); + this.misses = Metrics.meter(factory.createMetricName("Misses")); + this.requests = Metrics.meter(factory.createMetricName("Requests")); + + this.hitRate = new RatioGaugeSet(hits, requests, factory, "%sHitRate"); + this.missRate = new RatioGaugeSet(misses, requests, factory, "%sMissRate"); + } + + @VisibleForTesting + public void reset() + { + // No actual reset happens. The Meter counter is put to zero but will not reset the moving averages + // It rather injects a weird value into them. + // This method is being only used by CacheMetricsTest and CachingBench so fixing this issue was acknowledged + // but not considered mandatory to be fixed now (CASSANDRA-16228) + hits.mark(-hits.getCount()); + misses.mark(-misses.getCount()); + requests.mark(-requests.getCount()); + } +} diff --git a/src/java/org/apache/cassandra/metrics/CacheMetrics.java b/src/java/org/apache/cassandra/metrics/CacheMetrics.java index 574b0f065c20..13a59f88e658 100644 --- a/src/java/org/apache/cassandra/metrics/CacheMetrics.java +++ b/src/java/org/apache/cassandra/metrics/CacheMetrics.java @@ -38,7 +38,7 @@ public class CacheMetrics extends AbstractCacheMetrics public final Gauge entries; /** - * Create metrics for given cache. + * Create metrics for the given cache supporting entity. * * @param type Type of Cache to identify metrics * @param cache Weighted Cache to measure metrics diff --git a/src/java/org/apache/cassandra/metrics/CacheSizeMetrics.java b/src/java/org/apache/cassandra/metrics/CacheSizeMetrics.java new file mode 100644 index 000000000000..fb34cfcc19f7 --- /dev/null +++ b/src/java/org/apache/cassandra/metrics/CacheSizeMetrics.java @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.metrics; + +import com.codahale.metrics.Gauge; +import org.apache.cassandra.cache.CacheSize; + +import static org.apache.cassandra.metrics.CacheMetrics.TYPE_NAME; +import static org.apache.cassandra.metrics.CassandraMetricsRegistry.Metrics; + +public class CacheSizeMetrics +{ + public static final String CAPACITY = "Capacity"; + public static final String SIZE = "Size"; + public static final String ENTRIES = "Entries"; + /** + * Cache capacity in bytes + */ + public final Gauge capacity; + + /** + * Total size of cache, in bytes + */ + public final Gauge size; + + /** + * Total number of cache entries + */ + public final Gauge entries; + + /** + * Create metrics for the given cache supporting entity. + * + * @param type Type of Cache to identify metrics. + * @param cache Cache to measure metrics + */ + public CacheSizeMetrics(String type, CacheSize cache) + { + this(new DefaultNameFactory(TYPE_NAME, type), cache); + } + + public CacheSizeMetrics(MetricNameFactory factory, CacheSize cache) + { + capacity = Metrics.register(factory.createMetricName(CAPACITY), cache::capacity); + size = Metrics.register(factory.createMetricName(SIZE), cache::weightedSize); + entries = Metrics.register(factory.createMetricName(ENTRIES), cache::size); + } +} diff --git a/src/java/org/apache/cassandra/metrics/CassandraMetricsRegistry.java b/src/java/org/apache/cassandra/metrics/CassandraMetricsRegistry.java index 8cf83f520870..44bcf2d6cf87 100644 --- a/src/java/org/apache/cassandra/metrics/CassandraMetricsRegistry.java +++ b/src/java/org/apache/cassandra/metrics/CassandraMetricsRegistry.java @@ -114,6 +114,8 @@ public class CassandraMetricsRegistry extends MetricRegistry // for virtual tables. metricGroups = ImmutableSet.builder() .add(AbstractMetrics.TYPE) + .add(AccordMetrics.ACCORD_COORDINATOR) + .add(AccordMetrics.ACCORD_REPLICA) .add(BatchMetrics.TYPE_NAME) .add(BufferPoolMetrics.TYPE_NAME) .add(CIDRAuthorizerMetrics.TYPE_NAME) @@ -130,8 +132,10 @@ public class CassandraMetricsRegistry extends MetricRegistry .add(DroppedMessageMetrics.TYPE) .add(HintedHandoffMetrics.TYPE_NAME) .add(HintsServiceMetrics.TYPE_NAME) + .add(org.apache.cassandra.index.accord.IndexMetrics.TYPE) .add(InternodeInboundMetrics.TYPE_NAME) .add(InternodeOutboundMetrics.TYPE_NAME) + .add(org.apache.cassandra.journal.Metrics.TYPE_NAME) .add(KeyspaceMetrics.TYPE_NAME) .add(MemtablePool.TYPE_NAME) .add(MessagingMetrics.TYPE_NAME) @@ -150,6 +154,7 @@ public class CassandraMetricsRegistry extends MetricRegistry .add(ThreadPoolMetrics.TYPE_NAME) .add(TrieMemtableMetricsView.TYPE_NAME) .add(UnweightedCacheMetrics.TYPE_NAME) + .add(AutoRepairMetrics.TYPE_NAME) .build(); } @@ -300,9 +305,14 @@ public Counter counter(MetricName... name) } public Meter meter(MetricName... name) + { + return meter(false, name); + } + + public Meter meter(boolean gaugeCompatible, MetricName... name) { Meter meter = super.meter(name[0].getMetricName()); - Stream.of(name).forEach(n -> register(n, meter)); + Stream.of(name).forEach(n -> register(gaugeCompatible, n, meter)); return meter; } @@ -318,6 +328,11 @@ public Histogram histogram(MetricName name, MetricName alias, boolean considerZe return histogram; } + public > T gauge(MetricName name, T gauge) + { + return register(name, gauge); + } + public > T gauge(MetricName name, MetricName alias, T gauge) { T gaugeLoc = register(name, gauge); @@ -369,6 +384,11 @@ public static SnapshottingReservoir createReservoir(TimeUnit durationUnit) } public T register(MetricName name, T metric) + { + return register(false, name, metric); + } + + public T register(boolean gaugeCompatible, MetricName name, T metric) { if (metric instanceof MetricSet) throw new IllegalArgumentException("MetricSet registration using MetricName is not supported"); @@ -376,7 +396,7 @@ public T register(MetricName name, T metric) try { verifyUnknownMetric(name); - registerMBean(metric, name.getMBeanName(), MBeanWrapper.instance); + registerMBean(metric, name.getMBeanName(), MBeanWrapper.instance, gaugeCompatible); return super.register(name.getMetricName(), metric); } catch (IllegalArgumentException e) @@ -490,7 +510,7 @@ public interface MetricNameResolver @Nullable String resolve(String fullName); } - private void registerMBean(Metric metric, ObjectName name, MBeanWrapper mBeanServer) + public void registerMBean(Metric metric, ObjectName name, MBeanWrapper mBeanServer, boolean gaugeCompatible) { AbstractBean mbean; @@ -503,7 +523,18 @@ else if (metric instanceof Histogram) else if (metric instanceof Timer) mbean = new JmxTimer((Timer) metric, name, TimeUnit.SECONDS, DEFAULT_TIMER_UNIT); else if (metric instanceof Metered) - mbean = new JmxMeter((Metered) metric, name, TimeUnit.SECONDS); + { + // If a gauge compatible meter is requested, create a special implementation which + // also yields a 'Value' attribute for backwards compatibility. + if (gaugeCompatible) + { + mbean = new JmxMeterGaugeCompatible((Metered) metric, name, TimeUnit.SECONDS); + } + else + { + mbean = new JmxMeter((Metered) metric, name, TimeUnit.SECONDS); + } + } else throw new IllegalArgumentException("Unknown metric type: " + metric.getClass()); @@ -815,6 +846,29 @@ private String calculateRateUnit(TimeUnit unit) } } + public interface JmxMeterGaugeCompatibleMBean extends JmxMeterMBean, JmxGaugeMBean {} + + /** + * An implementation of {@link JmxMeter} that is compatible with {@link JmxGaugeMBean} in that it also + * implements {@link JmxGaugeMBean}. This is useful for metrics that were migrated from {@link JmxGauge} + * to {@link JmxMeter} like {@link TableMetrics#bytesAnticompacted} and + * {@link TableMetrics#bytesMutatedAnticompaction}. + */ + private static class JmxMeterGaugeCompatible extends JmxMeter implements JmxMeterGaugeCompatibleMBean + { + + private JmxMeterGaugeCompatible(Metered metric, ObjectName objectName, TimeUnit rateUnit) + { + super(metric, objectName, rateUnit); + } + + @Override + public Object getValue() + { + return getCount(); + } + } + /** * Exports a timer as a JMX MBean, check corresponding {@link org.apache.cassandra.db.virtual.model.TimerMetricRow} * for the same functionality for virtual tables. diff --git a/src/java/org/apache/cassandra/metrics/ClientRequestMetrics.java b/src/java/org/apache/cassandra/metrics/ClientRequestMetrics.java index 61fcc34bf129..e22eec6f5143 100644 --- a/src/java/org/apache/cassandra/metrics/ClientRequestMetrics.java +++ b/src/java/org/apache/cassandra/metrics/ClientRequestMetrics.java @@ -40,6 +40,8 @@ public class ClientRequestMetrics extends LatencyMetrics public final Meter readSizeAborts; public final Meter localRequests; public final Meter remoteRequests; + public final Meter retryDifferentSystem; + public final Meter retryCoordinatorBehind; public ClientRequestMetrics(String scope) { @@ -53,6 +55,8 @@ public ClientRequestMetrics(String scope) readSizeAborts = Metrics.meter(factory.createMetricName("ReadSizeAborts")); localRequests = Metrics.meter(factory.createMetricName("LocalRequests")); remoteRequests = Metrics.meter(factory.createMetricName("RemoteRequests")); + retryDifferentSystem = Metrics.meter(factory.createMetricName("RetryDifferentSystem")); + retryCoordinatorBehind = Metrics.meter(factory.createMetricName("RetryCoordinatorBehind")); } public void markAbort(Throwable cause) @@ -81,5 +85,7 @@ public void release() Metrics.remove(factory.createMetricName("ReadSizeAborts")); Metrics.remove(factory.createMetricName("LocalRequests")); Metrics.remove(factory.createMetricName("RemoteRequests")); + Metrics.remove(factory.createMetricName("RetryDifferentSystem")); + Metrics.remove(factory.createMetricName("RetryCoordinatorBehind")); } } diff --git a/src/java/org/apache/cassandra/metrics/ClientRequestsMetricsHolder.java b/src/java/org/apache/cassandra/metrics/ClientRequestsMetricsHolder.java index 26f2913263e6..d996e0140bc1 100644 --- a/src/java/org/apache/cassandra/metrics/ClientRequestsMetricsHolder.java +++ b/src/java/org/apache/cassandra/metrics/ClientRequestsMetricsHolder.java @@ -21,6 +21,7 @@ import java.util.Map; import org.apache.cassandra.db.ConsistencyLevel; +import org.apache.cassandra.service.accord.ClientRequestBookkeeping; public final class ClientRequestsMetricsHolder { @@ -29,6 +30,10 @@ public final class ClientRequestsMetricsHolder public static final CASClientWriteRequestMetrics casWriteMetrics = new CASClientWriteRequestMetrics("CASWrite"); public static final CASClientRequestMetrics casReadMetrics = new CASClientRequestMetrics("CASRead"); public static final ViewWriteMetrics viewWriteMetrics = new ViewWriteMetrics("ViewWrite"); + public static final AccordClientRequestMetrics accordReadMetrics = new AccordClientRequestMetrics("AccordRead", readMetrics, false); + public static final AccordClientRequestMetrics accordWriteMetrics = new AccordClientRequestMetrics("AccordWrite", writeMetrics, true); + public static final ClientRequestBookkeeping accordReadBookkeeping = new ClientRequestBookkeeping(false, accordReadMetrics); + public static final ClientRequestBookkeeping accordWriteBookkeeping = new ClientRequestBookkeeping(true, accordWriteMetrics); public static final Map readMetricsMap = new EnumMap<>(ConsistencyLevel.class); public static final Map writeMetricsMap = new EnumMap<>(ConsistencyLevel.class); diff --git a/src/java/org/apache/cassandra/metrics/HintsServiceMetrics.java b/src/java/org/apache/cassandra/metrics/HintsServiceMetrics.java index 2a8ce92776d9..275aeccfa646 100644 --- a/src/java/org/apache/cassandra/metrics/HintsServiceMetrics.java +++ b/src/java/org/apache/cassandra/metrics/HintsServiceMetrics.java @@ -17,14 +17,20 @@ */ package org.apache.cassandra.metrics; +import java.io.Serializable; +import java.net.UnknownHostException; + import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.codahale.metrics.Counter; +import com.codahale.metrics.Gauge; import com.codahale.metrics.Histogram; import com.codahale.metrics.Meter; import com.github.benmanes.caffeine.cache.Caffeine; import com.github.benmanes.caffeine.cache.LoadingCache; import org.apache.cassandra.concurrent.ImmediateExecutor; +import org.apache.cassandra.hints.HintsService; import org.apache.cassandra.locator.InetAddressAndPort; import static org.apache.cassandra.metrics.CassandraMetricsRegistry.Metrics; @@ -35,6 +41,22 @@ public final class HintsServiceMetrics { public static final String TYPE_NAME = "HintsService"; + + // Hint metrics are by address and hints that are for Accord need an address + public static final InetAddressAndPort ACCORD_HINT_ENDPOINT; + + static + { + try + { + ACCORD_HINT_ENDPOINT = InetAddressAndPort.getByNameOverrideDefaults("0.0.0.0", 0); + } + catch (UnknownHostException e) + { + throw new RuntimeException(e); + } + } + private static final Logger logger = LoggerFactory.getLogger(HintsServiceMetrics.class); private static final MetricNameFactory factory = new DefaultNameFactory(TYPE_NAME); @@ -42,6 +64,14 @@ public final class HintsServiceMetrics public static final Meter hintsSucceeded = Metrics.meter(factory.createMetricName("HintsSucceeded")); public static final Meter hintsFailed = Metrics.meter(factory.createMetricName("HintsFailed")); public static final Meter hintsTimedOut = Metrics.meter(factory.createMetricName("HintsTimedOut")); + public static final Meter hintsRetryDifferentSystem = Metrics.meter(factory.createMetricName("HintsRetryDifferentSystem")); + + public static final Gauge hintsFileSize = Metrics.gauge(factory.createMetricName("HintsFileSize"), new TotalHintsSizeGauge()); + // Corresponding to the hinted_handoff_throttle_in_kb configuration + public static final Counter hintsThrottle = Metrics.counter(factory.createMetricName("HintsThrottle")); + + public static final Meter hintsApplySucceeded = Metrics.meter(factory.createMetricName("HintsApplySucceeded")); + public static final Meter hintsApplyFailed = Metrics.meter(factory.createMetricName("HintsApplyFailed")); /** Histogram of all hint delivery delays */ private static final Histogram globalDelayHistogram = Metrics.histogram(factory.createMetricName("Hint_delays"), false); @@ -51,6 +81,18 @@ public final class HintsServiceMetrics .executor(ImmediateExecutor.INSTANCE) .build(address -> Metrics.histogram(factory.createMetricName("Hint_delays-"+address.toString().replace(':', '.')), false)); + // because at the time of static hintsFileSize being initialized, + // HintsService.instance is null / is not initialized yet so usage of method reference is not possible, + // so this is the workaround. + private static class TotalHintsSizeGauge implements Gauge, Serializable + { + @Override + public Long getValue() + { + return HintsService.instance.getTotalHintsSizeOfNode(); + } + } + public static void updateDelayMetrics(InetAddressAndPort endpoint, long delay) { if (delay <= 0) @@ -62,4 +104,9 @@ public static void updateDelayMetrics(InetAddressAndPort endpoint, long delay) globalDelayHistogram.update(delay); delayByEndpoint.get(endpoint).update(delay); } + + public static long getDelayCount(InetAddressAndPort endpoint) + { + return delayByEndpoint.get(endpoint).getCount(); + } } diff --git a/src/java/org/apache/cassandra/metrics/KeyspaceMetrics.java b/src/java/org/apache/cassandra/metrics/KeyspaceMetrics.java index a1916bebd071..e205230bd2ec 100644 --- a/src/java/org/apache/cassandra/metrics/KeyspaceMetrics.java +++ b/src/java/org/apache/cassandra/metrics/KeyspaceMetrics.java @@ -101,6 +101,14 @@ public class KeyspaceMetrics public final LatencyMetrics casPropose; /** CAS Commit metrics */ public final LatencyMetrics casCommit; + /** Latency for locally run key migrations **/ + public final LatencyMetrics keyMigration; + /** Latency for range migrations run by locally coordinated Accord repairs **/ + public final LatencyMetrics accordRepair; + public final LatencyMetrics accordPostStreamRepair; + public final Meter rangeMigrationUnexpectedFailures; + public final Meter mutationsRejectedOnWrongSystem; + public final Meter readsRejectedOnWrongSystem; /** Writes failed ideal consistency **/ public final Counter writeFailedIdealCL; /** Ideal CL write latency metrics */ @@ -182,6 +190,13 @@ public class KeyspaceMetrics public final Meter tooManySSTableIndexesReadWarnings; public final Meter tooManySSTableIndexesReadAborts; + public final Meter bytesAnticompacted; + public final Meter bytesMutatedAnticompaction; + public final Meter bytesPreviewed; + public final Meter tokenRangesPreviewedDesynchronized; + public final Meter bytesPreviewedDesynchronized; + + public final LatencyMetrics viewSSTableIntervalTree; public final ImmutableMap, ImmutableMap>> formatSpecificGauges; @@ -245,6 +260,12 @@ public KeyspaceMetrics(final Keyspace ks) casPrepare = createLatencyMetrics("CasPrepare"); casPropose = createLatencyMetrics("CasPropose"); casCommit = createLatencyMetrics("CasCommit"); + keyMigration = createLatencyMetrics("KeyMigration"); + accordRepair = createLatencyMetrics("AccordRepair"); + accordPostStreamRepair = createLatencyMetrics("AccordPostStreamRepair"); + rangeMigrationUnexpectedFailures = createKeyspaceMeter("RangeMigrationUnexpectedFailures"); + mutationsRejectedOnWrongSystem = createKeyspaceMeter("MutationsRejectedOnWrongSystem"); + readsRejectedOnWrongSystem = createKeyspaceMeter("ReadsRejectedOnWrongSystem"); writeFailedIdealCL = createKeyspaceCounter("WriteFailedIdealCL"); idealCLWriteLatency = createLatencyMetrics("IdealCLWrite"); @@ -291,6 +312,13 @@ public KeyspaceMetrics(final Keyspace ks) outOfRangeTokenReads = createKeyspaceCounter("ReadOutOfRangeToken"); outOfRangeTokenWrites = createKeyspaceCounter("WriteOutOfRangeToken"); outOfRangeTokenPaxosRequests = createKeyspaceCounter("PaxosOutOfRangeToken"); + + viewSSTableIntervalTree = createLatencyMetrics("ViewSSTableIntervalTree"); + bytesAnticompacted = createKeyspaceMeter("BytesAnticompacted"); + bytesMutatedAnticompaction = createKeyspaceMeter("BytesMutatedAnticompaction"); + bytesPreviewed = createKeyspaceMeter("BytesPreviewed"); + tokenRangesPreviewedDesynchronized = createKeyspaceMeter("TokenRangesPreviewedDesynchronized"); + bytesPreviewedDesynchronized = createKeyspaceMeter("BytesPreviewedDesynchronized"); } /** diff --git a/src/java/org/apache/cassandra/metrics/RatioGaugeSet.java b/src/java/org/apache/cassandra/metrics/RatioGaugeSet.java new file mode 100644 index 000000000000..057a69011b2b --- /dev/null +++ b/src/java/org/apache/cassandra/metrics/RatioGaugeSet.java @@ -0,0 +1,121 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.metrics; + +import java.util.function.DoubleSupplier; +import java.util.function.ToDoubleFunction; + +import com.codahale.metrics.Metered; +import com.codahale.metrics.RatioGauge; +import org.apache.cassandra.metrics.CassandraMetricsRegistry.MetricName; + +import static org.apache.cassandra.metrics.CassandraMetricsRegistry.Metrics; + +public class RatioGaugeSet +{ + public static final String ONE_MINUTE = "OneMinute"; + public static final String FIVE_MINUTE = "FiveMinute"; + public static final String FIFTEEN_MINUTE = "FifteenMinute"; + public static final String MEAN_RATIO = ""; + + public final RatioGauge oneMinute; + public final RatioGauge fiveMinute; + public final RatioGauge fifteenMinute; + public final RatioGauge mean; + + public RatioGaugeSet(Metered numerator, Metered denominator, MetricNameFactory factory, String namePattern) + { + this.oneMinute = ratioGauge(factory.createMetricName(String.format(namePattern, ONE_MINUTE)), numerator::getOneMinuteRate, denominator::getOneMinuteRate); + this.fiveMinute = ratioGauge(factory.createMetricName(String.format(namePattern, FIVE_MINUTE)), numerator::getFiveMinuteRate, denominator::getFiveMinuteRate); + this.fifteenMinute = ratioGauge(factory.createMetricName(String.format(namePattern, FIFTEEN_MINUTE)), numerator::getFifteenMinuteRate, denominator::getFifteenMinuteRate); + this.mean = ratioGauge(factory.createMetricName(String.format(namePattern, MEAN_RATIO)), numerator::getCount, denominator::getCount); + } + + private static RatioGauge ratioGauge(DoubleSupplier numerator, DoubleSupplier denominator) + { + return new RatioGauge() + { + protected Ratio getRatio() + { + return Ratio.of(numerator.getAsDouble(), denominator.getAsDouble()); + } + }; + } + + private RatioGauge ratioGauge(MetricName name, DoubleSupplier numerator, DoubleSupplier denominator) + { + return Metrics.register(name, ratioGauge(numerator, denominator)); + } + + public static Metered sum(Metered... meters) + { + return new SummingMeter(meters); + } + + private static class SummingMeter implements Metered + { + private final Metered[] meters; + + public SummingMeter(Metered... meters) + { + this.meters = meters; + } + + @Override + public long getCount() + { + long count = 0; + for (Metered meter : meters) + count += meter.getCount(); + return count; + } + + private double getRate(ToDoubleFunction rateSupplier) + { + double rate = 0; + for (Metered meter : meters) + rate += rateSupplier.applyAsDouble(meter); + return rate; + } + + @Override + public double getMeanRate() + { + return getRate(Metered::getMeanRate); + } + + @Override + public double getFifteenMinuteRate() + { + return getRate(Metered::getFifteenMinuteRate); + } + + @Override + public double getFiveMinuteRate() + { + return getRate(Metered::getFiveMinuteRate); + } + + @Override + public double getOneMinuteRate() + { + return getRate(Metered::getOneMinuteRate); + } + } +} diff --git a/src/java/org/apache/cassandra/metrics/ReadRepairMetrics.java b/src/java/org/apache/cassandra/metrics/ReadRepairMetrics.java index 1adb5dcd7f30..67d3a56ca74f 100644 --- a/src/java/org/apache/cassandra/metrics/ReadRepairMetrics.java +++ b/src/java/org/apache/cassandra/metrics/ReadRepairMetrics.java @@ -30,6 +30,24 @@ public class ReadRepairMetrics private static final MetricNameFactory factory = new DefaultNameFactory(TYPE_NAME); public static final Meter repairedBlocking = Metrics.meter(factory.createMetricName("RepairedBlocking")); + + /** + * Non-transactional read did a blocking read repair via an Accord transaction. This is expected/normal if non-transactional + * reads are interoperating with Accord. + */ + public static final Meter repairedBlockingViaAccord = Metrics.meter(factory.createMetricName("RepairedBlockingViaAccord")); + + /** + * This should be zero if you are trying to run Accord in a 100% correct way and interoperating with non-transactional writes. + * + * An Accord transaction read at QUORUM and ended up having to do BRR to make something it read monotonic. While it + * will be monotonic this is not 100% deterministic for transaction recovery because different Accord coordinators could + * read different things when computing a transaction's writes. + * + * If Accord is operating in TransactionalMode.full and the range is migrated then this metric will be zero just + * because Accord is reading at ONE not QUORUM and there are should be no non-transactional writes anywyas. + */ + public static final Meter repairedBlockingFromAccord = Metrics.meter(factory.createMetricName("RepairedBlockingFromAccord")); public static final Meter reconcileRead = Metrics.meter(factory.createMetricName("ReconcileRead")); /** @deprecated See CASSANDRA-13910 */ diff --git a/src/java/org/apache/cassandra/metrics/RepairMetrics.java b/src/java/org/apache/cassandra/metrics/RepairMetrics.java index 27dbbd31181c..bcdb1b44f98b 100644 --- a/src/java/org/apache/cassandra/metrics/RepairMetrics.java +++ b/src/java/org/apache/cassandra/metrics/RepairMetrics.java @@ -87,8 +87,8 @@ private static void reset(Counter counter) public static void retry(Verb verb, int attempt) { - retries.update(attempt); - retriesByVerb.get(verb).update(attempt); + retries.update(attempt - 1); + retriesByVerb.get(verb).update(attempt - 1); } public static void retryTimeout(Verb verb) diff --git a/src/java/org/apache/cassandra/metrics/Sampler.java b/src/java/org/apache/cassandra/metrics/Sampler.java index 4c4739b32984..b32d8111e2e7 100644 --- a/src/java/org/apache/cassandra/metrics/Sampler.java +++ b/src/java/org/apache/cassandra/metrics/Sampler.java @@ -170,6 +170,7 @@ public void updateEndTime(long endTimeMillis) */ public static class Sample implements Serializable { + private static final long serialVersionUID = 0; // for simulator support public final S value; public final long count; public final long error; diff --git a/src/java/org/apache/cassandra/metrics/TCMMetrics.java b/src/java/org/apache/cassandra/metrics/TCMMetrics.java index 01061b725a1d..909280c5771e 100644 --- a/src/java/org/apache/cassandra/metrics/TCMMetrics.java +++ b/src/java/org/apache/cassandra/metrics/TCMMetrics.java @@ -66,6 +66,7 @@ public class TCMMetrics public final Meter coordinatorBehindSchema; public final Meter coordinatorBehindPlacements; public final Gauge epochAwareDebounceTrackerSize; + public final Meter reconstructLogStateCall; private TCMMetrics() { @@ -127,6 +128,7 @@ private TCMMetrics() coordinatorBehindSchema = Metrics.meter(factory.createMetricName("CoordinatorBehindSchema")); coordinatorBehindPlacements = Metrics.meter(factory.createMetricName("CoordinatorBehindPlacements")); + reconstructLogStateCall = Metrics.meter(factory.createMetricName("ReconstructLogStateCall")); } public void recordCommitFailureLatency(long latency, TimeUnit timeUnit, boolean isRejection) diff --git a/src/java/org/apache/cassandra/metrics/TableMetrics.java b/src/java/org/apache/cassandra/metrics/TableMetrics.java index fabb0814e49a..2622b76daa4a 100644 --- a/src/java/org/apache/cassandra/metrics/TableMetrics.java +++ b/src/java/org/apache/cassandra/metrics/TableMetrics.java @@ -87,6 +87,8 @@ public class TableMetrics public final static LatencyMetrics GLOBAL_READ_LATENCY = new LatencyMetrics(GLOBAL_FACTORY, GLOBAL_ALIAS_FACTORY, "Read"); public final static LatencyMetrics GLOBAL_WRITE_LATENCY = new LatencyMetrics(GLOBAL_FACTORY, GLOBAL_ALIAS_FACTORY, "Write"); public final static LatencyMetrics GLOBAL_RANGE_LATENCY = new LatencyMetrics(GLOBAL_FACTORY, GLOBAL_ALIAS_FACTORY, "Range"); + public final static LatencyMetrics GLOBAL_KEY_MIGRATION_LATENCY = new LatencyMetrics(GLOBAL_FACTORY, GLOBAL_ALIAS_FACTORY, "KeyMigration"); + public final static LatencyMetrics GLOBAL_RANGE_MIGRATION_LATENCY = new LatencyMetrics(GLOBAL_FACTORY, GLOBAL_ALIAS_FACTORY, "RangeMigration"); /** Total amount of data stored in the memtable that resides on-heap, including column related overhead and partitions overwritten. */ public final Gauge memtableOnHeapDataSize; @@ -188,6 +190,14 @@ public class TableMetrics public final LatencyMetrics casPropose; /** CAS Commit metrics */ public final LatencyMetrics casCommit; + /** Latency for locally run key migrations **/ + public final LatencyMetrics keyMigration; + /** Latency for range migrations run by locally coordinated Accord repairs **/ + public final LatencyMetrics accordRepair; + public final LatencyMetrics accordPostStreamRepair; + public final TableMeter accordRepairUnexpectedFailures; + public final TableMeter mutationsRejectedOnWrongSystem; + public final TableMeter readsRejectedOnWrongSystem; /** percent of the data that is repaired */ public final Gauge percentRepaired; /** Reports the size of sstables in repaired, unrepaired, and any ongoing repair buckets */ @@ -209,9 +219,15 @@ public class TableMetrics /** number of partitions read creating merkle trees */ public final TableHistogram partitionsValidated; /** number of bytes read while doing anticompaction */ - public final Counter bytesAnticompacted; + public final TableMeter bytesAnticompacted; /** number of bytes where the whole sstable was contained in a repairing range so that we only mutated the repair status */ - public final Counter bytesMutatedAnticompaction; + public final TableMeter bytesMutatedAnticompaction; + /** number of bytes that were scanned during preview repair */ + public final TableMeter bytesPreviewed; + /** number of desynchronized token ranges that were detected during preview repair */ + public final TableMeter tokenRangesPreviewedDesynchronized; + /** number of desynchronized bytes that were detected during preview repair */ + public final TableMeter bytesPreviewedDesynchronized; /** ratio of how much we anticompact vs how much we could mutate the repair status*/ public final Gauge mutatedAnticompactionGauge; @@ -289,6 +305,9 @@ public class TableMetrics public final ImmutableMap, ImmutableMap>> formatSpecificGauges; + // Time spent building SSTableIntervalTree when constructing a new View under the Tracker lock + public final LatencyMetrics viewSSTableIntervalTree; + private static Pair totalNonSystemTablesSize(Predicate predicate) { long total = 0; @@ -621,6 +640,7 @@ public Long getValue() readLatency = createLatencyMetrics("Read", cfs.keyspace.metric.readLatency, GLOBAL_READ_LATENCY); writeLatency = createLatencyMetrics("Write", cfs.keyspace.metric.writeLatency, GLOBAL_WRITE_LATENCY); rangeLatency = createLatencyMetrics("Range", cfs.keyspace.metric.rangeLatency, GLOBAL_RANGE_LATENCY); + pendingFlushes = createTableCounter("PendingFlushes"); bytesFlushed = createTableCounter("BytesFlushed"); flushSizeOnDisk = ExpMovingAverage.decayBy1000(); @@ -801,6 +821,12 @@ public Long getValue() casPrepare = createLatencyMetrics("CasPrepare", cfs.keyspace.metric.casPrepare); casPropose = createLatencyMetrics("CasPropose", cfs.keyspace.metric.casPropose); casCommit = createLatencyMetrics("CasCommit", cfs.keyspace.metric.casCommit); + keyMigration = createLatencyMetrics("KeyMigration", cfs.keyspace.metric.keyMigration, GLOBAL_KEY_MIGRATION_LATENCY); + accordRepair = createLatencyMetrics("AccordRepair", cfs.keyspace.metric.accordRepair, GLOBAL_RANGE_MIGRATION_LATENCY); + accordPostStreamRepair = createLatencyMetrics("AccordPostStreamRepair", cfs.keyspace.metric.accordPostStreamRepair); + accordRepairUnexpectedFailures = createTableMeter("AccordRepairUnexpectedFailures", cfs.keyspace.metric.rangeMigrationUnexpectedFailures); + mutationsRejectedOnWrongSystem = createTableMeter("MutationsRejectedOnWrongSystem", cfs.keyspace.metric.mutationsRejectedOnWrongSystem); + readsRejectedOnWrongSystem = createTableMeter("ReadsRejectedOnWrongSystem", cfs.keyspace.metric.readsRejectedOnWrongSystem); repairsStarted = createTableCounter("RepairJobsStarted"); repairsCompleted = createTableCounter("RepairJobsCompleted"); @@ -811,12 +837,15 @@ public Long getValue() bytesValidated = createTableHistogram("BytesValidated", cfs.keyspace.metric.bytesValidated, false); partitionsValidated = createTableHistogram("PartitionsValidated", cfs.keyspace.metric.partitionsValidated, false); - bytesAnticompacted = createTableCounter("BytesAnticompacted"); - bytesMutatedAnticompaction = createTableCounter("BytesMutatedAnticompaction"); + bytesAnticompacted = createTableMeter("BytesAnticompacted", cfs.keyspace.metric.bytesAnticompacted, true); + bytesMutatedAnticompaction = createTableMeter("BytesMutatedAnticompaction", cfs.keyspace.metric.bytesMutatedAnticompaction, true); + bytesPreviewed = createTableMeter("BytesPreviewed", cfs.keyspace.metric.bytesPreviewed); + tokenRangesPreviewedDesynchronized = createTableMeter("TokenRangesPreviewedDesynchronized", cfs.keyspace.metric.tokenRangesPreviewedDesynchronized); + bytesPreviewedDesynchronized = createTableMeter("BytesPreviewedDesynchronized", cfs.keyspace.metric.bytesPreviewedDesynchronized); mutatedAnticompactionGauge = createTableGauge("MutatedAnticompactionGauge", () -> { - double bytesMutated = bytesMutatedAnticompaction.getCount(); - double bytesAnticomp = bytesAnticompacted.getCount(); + double bytesMutated = bytesMutatedAnticompaction.table.getCount(); + double bytesAnticomp = bytesAnticompacted.table.getCount(); if (bytesAnticomp + bytesMutated > 0) return bytesMutated / (bytesAnticomp + bytesMutated); return 0.0; @@ -861,6 +890,8 @@ public Long getValue() tooManySSTableIndexesReadWarnings = createTableMeter("TooManySSTableIndexesReadWarnings", cfs.keyspace.metric.tooManySSTableIndexesReadWarnings); tooManySSTableIndexesReadAborts = createTableMeter("TooManySSTableIndexesReadAborts", cfs.keyspace.metric.tooManySSTableIndexesReadAborts); + viewSSTableIntervalTree = createLatencyMetrics("ViewSSTableIntervalTree", cfs.keyspace.metric.viewSSTableIntervalTree); + formatSpecificGauges = createFormatSpecificGauges(cfs); } @@ -1081,16 +1112,21 @@ protected SnapshottingTimer createTableTimer(String name) protected TableMeter createTableMeter(String name, Meter keyspaceMeter) { - return createTableMeter(name, name, keyspaceMeter); + return createTableMeter(name, keyspaceMeter, false); } - protected TableMeter createTableMeter(String name, String alias, Meter keyspaceMeter) + protected TableMeter createTableMeter(String name, Meter keyspaceMeter, boolean globalMeterGaugeCompatible) + { + return createTableMeter(name, name, keyspaceMeter, globalMeterGaugeCompatible); + } + + protected TableMeter createTableMeter(String name, String alias, Meter keyspaceMeter, boolean globalMeterGaugeCompatible) { Meter meter = Metrics.meter(factory.createMetricName(name), aliasFactory.createMetricName(alias)); register(name, alias, meter); return new TableMeter(meter, keyspaceMeter, - Metrics.meter(GLOBAL_FACTORY.createMetricName(name), + Metrics.meter(globalMeterGaugeCompatible, GLOBAL_FACTORY.createMetricName(name), GLOBAL_ALIAS_FACTORY.createMetricName(alias))); } @@ -1148,10 +1184,15 @@ private TableMeter(Meter table, Meter keyspace, Meter global) } public void mark() + { + mark(1L); + } + + public void mark(long val) { for (Meter meter : all) { - meter.mark(); + meter.mark(val); } } } diff --git a/src/java/org/apache/cassandra/net/AbstractMessageHandler.java b/src/java/org/apache/cassandra/net/AbstractMessageHandler.java index 5b5b8b7f1ad7..895b277ad524 100644 --- a/src/java/org/apache/cassandra/net/AbstractMessageHandler.java +++ b/src/java/org/apache/cassandra/net/AbstractMessageHandler.java @@ -33,6 +33,7 @@ import io.netty.channel.ChannelHandlerContext; import io.netty.channel.ChannelInboundHandlerAdapter; import io.netty.channel.EventLoop; +import org.apache.cassandra.concurrent.ManyToOneConcurrentLinkedQueue; import org.apache.cassandra.metrics.ClientMetrics; import org.apache.cassandra.net.FrameDecoder.CorruptFrame; import org.apache.cassandra.net.FrameDecoder.Frame; @@ -43,7 +44,7 @@ import static java.lang.Math.max; import static java.lang.Math.min; -import static org.apache.cassandra.net.Crc.InvalidCrc; +import static org.apache.cassandra.utils.Crc.InvalidCrc; import static org.apache.cassandra.utils.MonotonicClock.Global.approxTime; /** diff --git a/src/java/org/apache/cassandra/net/ForwardingInfo.java b/src/java/org/apache/cassandra/net/ForwardingInfo.java index 7a117bd99915..bb8880c5b874 100644 --- a/src/java/org/apache/cassandra/net/ForwardingInfo.java +++ b/src/java/org/apache/cassandra/net/ForwardingInfo.java @@ -40,6 +40,7 @@ */ public final class ForwardingInfo implements Serializable { + private static final long serialVersionUID = 0; // for simulator support final List targets; final long[] messageIds; diff --git a/src/java/org/apache/cassandra/net/FrameDecoderCrc.java b/src/java/org/apache/cassandra/net/FrameDecoderCrc.java index 2a54f5f6636f..86f444245398 100644 --- a/src/java/org/apache/cassandra/net/FrameDecoderCrc.java +++ b/src/java/org/apache/cassandra/net/FrameDecoderCrc.java @@ -24,7 +24,7 @@ import io.netty.channel.ChannelPipeline; -import static org.apache.cassandra.net.Crc.*; +import static org.apache.cassandra.utils.Crc.*; /** * Framing format that protects integrity of data in movement with CRCs (of both header and payload). diff --git a/src/java/org/apache/cassandra/net/FrameDecoderLZ4.java b/src/java/org/apache/cassandra/net/FrameDecoderLZ4.java index 9cc100586e23..7dafb7fedf7c 100644 --- a/src/java/org/apache/cassandra/net/FrameDecoderLZ4.java +++ b/src/java/org/apache/cassandra/net/FrameDecoderLZ4.java @@ -26,7 +26,7 @@ import net.jpountz.lz4.LZ4Factory; import net.jpountz.lz4.LZ4SafeDecompressor; -import static org.apache.cassandra.net.Crc.*; +import static org.apache.cassandra.utils.Crc.*; /** * Framing format that compresses payloads with LZ4, and protects integrity of data in movement with CRCs diff --git a/src/java/org/apache/cassandra/net/FrameEncoderCrc.java b/src/java/org/apache/cassandra/net/FrameEncoderCrc.java index 364624816526..75b84aa14f53 100644 --- a/src/java/org/apache/cassandra/net/FrameEncoderCrc.java +++ b/src/java/org/apache/cassandra/net/FrameEncoderCrc.java @@ -24,7 +24,7 @@ import io.netty.buffer.ByteBuf; import io.netty.channel.ChannelHandler; -import static org.apache.cassandra.net.Crc.*; +import static org.apache.cassandra.utils.Crc.*; /** * Please see {@link FrameDecoderCrc} for description of the framing produced by this encoder. diff --git a/src/java/org/apache/cassandra/net/FrameEncoderLZ4.java b/src/java/org/apache/cassandra/net/FrameEncoderLZ4.java index 75f15c726b79..423e377a2770 100644 --- a/src/java/org/apache/cassandra/net/FrameEncoderLZ4.java +++ b/src/java/org/apache/cassandra/net/FrameEncoderLZ4.java @@ -28,7 +28,7 @@ import org.apache.cassandra.io.compress.BufferType; import org.apache.cassandra.utils.ByteBufferUtil; -import static org.apache.cassandra.net.Crc.*; +import static org.apache.cassandra.utils.Crc.*; /** * Please see {@link FrameDecoderLZ4} for description of the framing produced by this encoder. diff --git a/src/java/org/apache/cassandra/net/HandshakeProtocol.java b/src/java/org/apache/cassandra/net/HandshakeProtocol.java index 3217aeae8a8d..085c8b966c4c 100644 --- a/src/java/org/apache/cassandra/net/HandshakeProtocol.java +++ b/src/java/org/apache/cassandra/net/HandshakeProtocol.java @@ -35,7 +35,7 @@ import static org.apache.cassandra.locator.InetAddressAndPort.Serializer.inetAddressAndPortSerializer; import static org.apache.cassandra.net.MessagingService.VERSION_40; import static org.apache.cassandra.net.Message.validateLegacyProtocolMagic; -import static org.apache.cassandra.net.Crc.*; +import static org.apache.cassandra.utils.Crc.*; import static org.apache.cassandra.net.OutboundConnectionSettings.*; /** diff --git a/src/java/org/apache/cassandra/net/InboundConnectionInitiator.java b/src/java/org/apache/cassandra/net/InboundConnectionInitiator.java index fd5710e29773..c4cbc50a2b2b 100644 --- a/src/java/org/apache/cassandra/net/InboundConnectionInitiator.java +++ b/src/java/org/apache/cassandra/net/InboundConnectionInitiator.java @@ -67,7 +67,7 @@ import static java.util.concurrent.TimeUnit.MILLISECONDS; import static org.apache.cassandra.auth.IInternodeAuthenticator.InternodeConnectionDirection.INBOUND; import static org.apache.cassandra.concurrent.ExecutorFactory.Global.executorFactory; -import static org.apache.cassandra.config.EncryptionOptions.ClientAuth.REQUIRED; +import static org.apache.cassandra.config.EncryptionOptions.ClientEncryptionOptions.ClientAuth.REQUIRED; import static org.apache.cassandra.net.InternodeConnectionUtils.DISCARD_HANDLER_NAME; import static org.apache.cassandra.net.InternodeConnectionUtils.SSL_FACTORY_CONTEXT_DESCRIPTION; import static org.apache.cassandra.net.InternodeConnectionUtils.SSL_HANDLER_NAME; @@ -530,7 +530,7 @@ void setupMessagingPipeline(InetAddressAndPort from, int useMessagingVersion, in private static SslHandler getSslHandler(String description, Channel channel, EncryptionOptions.ServerEncryptionOptions encryptionOptions) throws IOException { - final EncryptionOptions.ClientAuth verifyPeerCertificate = REQUIRED; + final EncryptionOptions.ClientEncryptionOptions.ClientAuth verifyPeerCertificate = REQUIRED; SslContext sslContext = SSLFactory.getOrCreateSslContext(encryptionOptions, verifyPeerCertificate, ISslContextFactory.SocketType.SERVER, SSL_FACTORY_CONTEXT_DESCRIPTION); diff --git a/src/java/org/apache/cassandra/net/InboundConnectionSettings.java b/src/java/org/apache/cassandra/net/InboundConnectionSettings.java index 448da62cbb8c..4a9db7ca8112 100644 --- a/src/java/org/apache/cassandra/net/InboundConnectionSettings.java +++ b/src/java/org/apache/cassandra/net/InboundConnectionSettings.java @@ -146,7 +146,7 @@ public InboundConnectionSettings withLegacySslStoragePortDefaults() ServerEncryptionOptions encryption = this.encryption; if (encryption == null) encryption = DatabaseDescriptor.getInternodeMessagingEncyptionOptions(); - encryption = encryption.withOptional(false).withInternodeEncryption(ServerEncryptionOptions.InternodeEncryption.all); + encryption = new ServerEncryptionOptions.Builder(encryption).withInternodeEncryption(ServerEncryptionOptions.InternodeEncryption.all).withOptional(false).build(); return this.withBindAddress(bindAddress.withPort(DatabaseDescriptor.getSSLStoragePort())) .withEncryption(encryption) diff --git a/src/java/org/apache/cassandra/net/InboundMessageHandler.java b/src/java/org/apache/cassandra/net/InboundMessageHandler.java index edee108bb0d3..6c697d4848b8 100644 --- a/src/java/org/apache/cassandra/net/InboundMessageHandler.java +++ b/src/java/org/apache/cassandra/net/InboundMessageHandler.java @@ -40,6 +40,7 @@ import org.apache.cassandra.tcm.ClusterMetadataService; import org.apache.cassandra.tracing.TraceState; import org.apache.cassandra.tracing.Tracing; +import org.apache.cassandra.utils.Crc; import org.apache.cassandra.utils.JVMStabilityInspector; import org.apache.cassandra.utils.NoSpamLogger; diff --git a/src/java/org/apache/cassandra/net/InboundMessageHandlers.java b/src/java/org/apache/cassandra/net/InboundMessageHandlers.java index c7b946350d09..f1b98bdbbe31 100644 --- a/src/java/org/apache/cassandra/net/InboundMessageHandlers.java +++ b/src/java/org/apache/cassandra/net/InboundMessageHandlers.java @@ -30,6 +30,7 @@ import org.apache.cassandra.locator.InetAddressAndPort; import org.apache.cassandra.metrics.InternodeInboundMetrics; import org.apache.cassandra.net.Message.Header; +import org.apache.cassandra.utils.Crc; import static java.util.concurrent.TimeUnit.NANOSECONDS; import static org.apache.cassandra.utils.MonotonicClock.Global.approxTime; diff --git a/src/java/org/apache/cassandra/net/InboundSink.java b/src/java/org/apache/cassandra/net/InboundSink.java index d07703963547..7fbc50a2019c 100644 --- a/src/java/org/apache/cassandra/net/InboundSink.java +++ b/src/java/org/apache/cassandra/net/InboundSink.java @@ -23,17 +23,18 @@ import java.util.concurrent.atomic.AtomicReferenceFieldUpdater; import java.util.function.Predicate; +import org.apache.cassandra.index.IndexBuildInProgressException; import org.slf4j.LoggerFactory; import net.openhft.chronicle.core.util.ThrowingConsumer; import org.apache.cassandra.db.filter.TombstoneOverwhelmingException; import org.apache.cassandra.exceptions.CoordinatorBehindException; import org.apache.cassandra.exceptions.InvalidRoutingException; -import org.apache.cassandra.exceptions.RequestFailureReason; +import org.apache.cassandra.exceptions.RequestFailure; import org.apache.cassandra.index.IndexNotAvailableException; import org.apache.cassandra.locator.InetAddressAndPort; -import org.apache.cassandra.tcm.Epoch; import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.tcm.Epoch; import org.apache.cassandra.tcm.NotCMSException; import org.apache.cassandra.utils.NoSpamLogger; @@ -108,9 +109,9 @@ public void fail(Message.Header header, Throwable failure) if (header.callBackOnFailure()) { InetAddressAndPort to = header.respondTo() != null ? header.respondTo() : header.from; - Message response = Message.failureResponse(header.id, - header.expiresAtNanos, - RequestFailureReason.forException(failure)); + Message response = Message.failureResponse(header.id, + header.expiresAtNanos, + RequestFailure.forException(failure)); messaging.send(response, to); } } @@ -126,13 +127,24 @@ public void accept(Message message) fail(message.header, t); if (t instanceof NotCMSException || t instanceof CoordinatorBehindException) + { noSpamLogger.warn(t.getMessage()); - else if (t instanceof TombstoneOverwhelmingException || t instanceof IndexNotAvailableException || t instanceof InvalidRoutingException) + } + else if (t instanceof TombstoneOverwhelmingException || + t instanceof IndexNotAvailableException || + t instanceof IndexBuildInProgressException || + t instanceof InvalidRoutingException) + { noSpamLogger.error(t.getMessage()); + } else if (t instanceof RuntimeException) + { throw (RuntimeException) t; + } else + { throw new RuntimeException(t); + } } } diff --git a/src/java/org/apache/cassandra/net/Message.java b/src/java/org/apache/cassandra/net/Message.java index ead4317576ab..4bf114972bb7 100644 --- a/src/java/org/apache/cassandra/net/Message.java +++ b/src/java/org/apache/cassandra/net/Message.java @@ -35,6 +35,7 @@ import org.slf4j.LoggerFactory; import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.exceptions.RequestFailure; import org.apache.cassandra.exceptions.RequestFailureReason; import org.apache.cassandra.io.IVersionedAsymmetricSerializer; import org.apache.cassandra.io.IVersionedSerializer; @@ -71,7 +72,7 @@ * * @param The type of the message payload. */ -public class Message +public class Message implements ResponseContext { private static final Logger logger = LoggerFactory.getLogger(Message.class); private static final NoSpamLogger noSpam1m = NoSpamLogger.getLogger(logger, 1, TimeUnit.MINUTES); @@ -90,22 +91,24 @@ public class Message this.payloadSerializer = verb().serializer(); } - /** Sender of the message. */ - public InetAddressAndPort from() - { - return header.from; - } - /** Whether the message has crossed the node boundary, that is whether it originated from another node. */ public boolean isCrossNode() { return !from().equals(getBroadcastAddressAndPort()); } + /** Sender of the message. */ + @Override + public InetAddressAndPort from() + { + return header.from; + } + /** * id of the request/message. In 4.0+ can be shared between multiple messages of the same logical request, * whilst in versions above a new id would be allocated for each message sent. */ + @Override public long id() { return header.id; @@ -116,6 +119,7 @@ public Epoch epoch() return header.epoch; } + @Override public Verb verb() { return header.verb; @@ -135,6 +139,7 @@ public long createdAtNanos() return header.createdAtNanos; } + @Override public long expiresAtNanos() { return header.expiresAtNanos; @@ -301,6 +306,7 @@ public static Message internalResponse(Verb verb, T payload) * Used by the {@code MultiRangeReadCommand} to split multi-range responses from a replica * into single-range responses. */ + @VisibleForTesting public static Message remoteResponse(InetAddressAndPort from, Verb verb, T payload) { assert verb.isResponse(); @@ -339,12 +345,17 @@ public static Message forgeIdentityForTests(Message msg, InetAddressAn /** Builds a response Message with provided payload, and all the right fields inferred from request Message */ public Message responseWith(T payload) { - Message msg = outWithParam(id(), verb().responseVerb, expiresAtNanos(), payload, null, null); + Message msg = responseWith(payload, this); if (header.hasFlag(MessageFlag.URGENT)) msg = msg.withFlag(MessageFlag.URGENT); return msg; } + public static Message responseWith(T payload, ResponseContext respondTo) + { + return outWithParam(respondTo.id(), respondTo.verb().responseVerb, respondTo.expiresAtNanos(), payload, null, null); + } + /** Builds a response Message with no payload, and all the right fields inferred from request Message */ public Message emptyResponse() { @@ -352,12 +363,22 @@ public Message emptyResponse() } /** Builds a failure response Message with an explicit reason, and fields inferred from request Message */ - public Message failureResponse(RequestFailureReason reason) + public Message failureResponse(RequestFailureReason reason) + { + return failureResponse(reason, null); + } + + public Message failureResponse(RequestFailureReason reason, @Nullable Throwable failure) { - return failureResponse(id(), expiresAtNanos(), reason); + return failureResponse(reason, failure, this); } - static Message failureResponse(long id, long expiresAtNanos, RequestFailureReason reason) + public static Message failureResponse(RequestFailureReason reason, @Nullable Throwable failure, ResponseContext respondTo) + { + return failureResponse(respondTo.id(), respondTo.expiresAtNanos(), new RequestFailure(reason, failure)); + } + + static Message failureResponse(long id, long expiresAtNanos, RequestFailure reason) { return outWithParam(id, Verb.FAILURE_RSP, expiresAtNanos, reason, null, null); } @@ -499,7 +520,7 @@ public String toString() * Split into a separate object to allow partial message deserialization without wasting work and allocation * afterwards, if the entire message is necessary and available. */ - public static class Header + public static class Header implements ResponseContext { public final long id; public final Epoch epoch; @@ -567,6 +588,11 @@ boolean trackWarnings() return MessageFlag.TRACK_WARNINGS.isIn(flags); } + boolean isFinal() + { + return !MessageFlag.NOT_FINAL.isIn(flags); + } + @Nullable ForwardingInfo forwardTo() { @@ -581,6 +607,31 @@ InetAddressAndPort respondTo() return respondTo; } + /** Sender of the message. */ + @Override + public InetAddressAndPort from() + { + return from; + } + + @Override + public long id() + { + return id; + } + + @Override + public Verb verb() + { + return verb; + } + + @Override + public long expiresAtNanos() + { + return expiresAtNanos; + } + @Nullable public TimeUUID traceSession() { diff --git a/src/java/org/apache/cassandra/net/MessageDelivery.java b/src/java/org/apache/cassandra/net/MessageDelivery.java index 0d052cb3d8c1..ccface225377 100644 --- a/src/java/org/apache/cassandra/net/MessageDelivery.java +++ b/src/java/org/apache/cassandra/net/MessageDelivery.java @@ -22,16 +22,18 @@ import java.util.Iterator; import java.util.Set; import java.util.concurrent.TimeUnit; - +import java.util.concurrent.TimeoutException; import javax.annotation.Nullable; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import accord.utils.Invariants; import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.exceptions.RequestFailure; import org.apache.cassandra.exceptions.RequestFailureReason; import org.apache.cassandra.locator.InetAddressAndPort; -import org.apache.cassandra.utils.Backoff; +import org.apache.cassandra.service.WaitStrategy; import org.apache.cassandra.utils.Pair; import org.apache.cassandra.utils.concurrent.Accumulator; import org.apache.cassandra.utils.concurrent.AsyncPromise; @@ -39,6 +41,7 @@ import org.apache.cassandra.utils.concurrent.Future; import org.apache.cassandra.utils.concurrent.Promise; +import static java.util.concurrent.TimeUnit.NANOSECONDS; import static org.apache.cassandra.net.MessageFlag.CALL_BACK_ON_FAILURE; public interface MessageDelivery @@ -63,7 +66,7 @@ public void onResponse(Message msg) } @Override - public void onFailure(InetAddressAndPort from, RequestFailureReason reason) + public void onFailure(InetAddressAndPort from, RequestFailure reason) { logger.info("Received failure in response to {} from {}: {}", verb, from, reason); cdl.decrement(); @@ -83,7 +86,7 @@ public void onFailure(InetAddressAndPort from, RequestFailureReason reason) public void sendWithCallback(Message message, InetAddressAndPort to, RequestCallback cb, ConnectionType specifyConnection); public Future> sendWithResult(Message message, InetAddressAndPort to); - public default Future> sendWithRetries(Backoff backoff, RetryScheduler retryThreads, + public default Future> sendWithRetries(WaitStrategy backoff, RetryScheduler retryThreads, Verb verb, REQ request, Iterator candidates, RetryPredicate shouldRetry, @@ -99,17 +102,22 @@ public default Future> sendWithRetries(Backoff backoff, return promise; } - public default void sendWithRetries(Backoff backoff, RetryScheduler retryThreads, + public default void sendWithRetries(WaitStrategy backoff, RetryScheduler retryThreads, Verb verb, REQ request, Iterator candidates, OnResult onResult, RetryPredicate shouldRetry, RetryErrorMessage errorMessage) { - sendWithRetries(this, backoff, retryThreads, verb, request, candidates, onResult, shouldRetry, errorMessage, 0); + sendWithRetries(this, backoff, retryThreads, verb, request, candidates, onResult, shouldRetry, errorMessage, 1); } public void respond(V response, Message message); public default void respondWithFailure(RequestFailureReason reason, Message message) + { + respondWithFailure(RequestFailure.forReason(reason), message); + } + + public default void respondWithFailure(RequestFailure reason, Message message) { send(Message.failureResponse(message.id(), message.expiresAtNanos(), reason), message.respondTo()); } @@ -121,16 +129,21 @@ interface OnResult interface RetryPredicate { - boolean test(int attempt, InetAddressAndPort from, RequestFailureReason failure); + static RetryPredicate times(int n) { return (attempt, from, failure) -> attempt < n; } + RetryPredicate ALWAYS_RETRY = (i1, i2, i3) -> true; + RetryPredicate NEVER_RETRY = (i1, i2, i3) -> false; + boolean test(int attempt, InetAddressAndPort from, RequestFailure failure); } interface RetryErrorMessage { - String apply(int attempt, ResponseFailureReason retryFailure, @Nullable InetAddressAndPort from, @Nullable RequestFailureReason reason); + RetryErrorMessage EMPTY = (i1, i2, i3, i4) -> null; + String apply(int attempt, ResponseFailureReason retryFailure, @Nullable InetAddressAndPort from, @Nullable RequestFailure reason); } private static void sendWithRetries(MessageDelivery messaging, - Backoff backoff, RetryScheduler retryThreads, + WaitStrategy backoff, + RetryScheduler retryThreads, Verb verb, REQ request, Iterator candidates, OnResult onResult, @@ -138,6 +151,7 @@ private static void sendWithRetries(MessageDelivery messaging, RetryErrorMessage errorMessage, int attempt) { + Invariants.require(backoff != null); if (Thread.currentThread().isInterrupted()) { onResult.result(attempt, null, new InterruptedException(errorMessage.apply(attempt, ResponseFailureReason.Interrupted, null, null))); @@ -157,11 +171,13 @@ public void onResponse(Message msg) } @Override - public void onFailure(InetAddressAndPort from, RequestFailureReason failure) + public void onFailure(InetAddressAndPort from, RequestFailure failure) { - if (!backoff.mayRetry(attempt)) + long retryDelay = backoff.computeWait(attempt + 1, NANOSECONDS); + // TODO (required): we already have a separate retry predicate, retries should not be taken into consideration when retrying + if (retryDelay < 0) { - onResult.result(attempt, null, new MaxRetriesException(attempt, errorMessage.apply(attempt, ResponseFailureReason.MaxRetries, from, failure))); + onResult.result(attempt, null, new GivingUpException(attempt, errorMessage.apply(attempt, ResponseFailureReason.GiveUp, from, failure))); return; } if (!shouldRetry.test(attempt, from, failure)) @@ -172,7 +188,7 @@ public void onFailure(InetAddressAndPort from, RequestFailureReason failure) try { retryThreads.schedule(() -> sendWithRetries(messaging, backoff, retryThreads, verb, request, candidates, onResult, shouldRetry, errorMessage, attempt + 1), - backoff.computeWaitTime(attempt), backoff.unit()); + retryDelay, NANOSECONDS); } catch (Throwable t) { @@ -183,7 +199,7 @@ public void onFailure(InetAddressAndPort from, RequestFailureReason failure) messaging.sendWithCallback(Message.outWithFlag(verb, request, CALL_BACK_ON_FAILURE), candidates.next(), new Request()); } - enum ResponseFailureReason { MaxRetries, Rejected, NoMoreCandidates, Interrupted, FailedSchedule } + enum ResponseFailureReason {GiveUp, Rejected, NoMoreCandidates, Interrupted, FailedSchedule } interface RetryScheduler { @@ -201,7 +217,7 @@ public void schedule(Runnable command, long delay, TimeUnit unit) } } - class NoMoreCandidatesException extends IllegalStateException + class NoMoreCandidatesException extends TimeoutException { public NoMoreCandidatesException(String s) { @@ -209,30 +225,30 @@ public NoMoreCandidatesException(String s) } } - class FailedResponseException extends IllegalStateException + class FailedResponseException extends RuntimeException { public final InetAddressAndPort from; - public final RequestFailureReason failure; + public final RequestFailure failure; - public FailedResponseException(InetAddressAndPort from, RequestFailureReason failure, String message) + public FailedResponseException(InetAddressAndPort from, RequestFailure failure, String message) { - super(message); + super(message, failure.failure); this.from = from; this.failure = failure; } } - class MaxRetriesException extends IllegalStateException + class GivingUpException extends TimeoutException { public final int attempts; - public MaxRetriesException(int attempts, String message) + public GivingUpException(int attempts, String message) { super(message); this.attempts = attempts; } } - class FailedScheduleException extends IllegalStateException + class FailedScheduleException extends RuntimeException { public FailedScheduleException(String message, Throwable cause) { diff --git a/src/java/org/apache/cassandra/net/MessageFlag.java b/src/java/org/apache/cassandra/net/MessageFlag.java index 1c2db557c340..4c5762f9796e 100644 --- a/src/java/org/apache/cassandra/net/MessageFlag.java +++ b/src/java/org/apache/cassandra/net/MessageFlag.java @@ -31,7 +31,10 @@ public enum MessageFlag /** allow creating warnings or aborting queries based off query - see CASSANDRA-16850 */ TRACK_WARNINGS(2), /** whether this message should be sent on an URGENT channel despite its Verb default priority */ - URGENT(3); + URGENT(3), + /** Allow a single callback to receive multiple responses until a final response is received **/ + NOT_FINAL(4) + ; private final int id; diff --git a/src/java/org/apache/cassandra/net/MessagingService.java b/src/java/org/apache/cassandra/net/MessagingService.java index f26d35ad95a4..ff5536d20811 100644 --- a/src/java/org/apache/cassandra/net/MessagingService.java +++ b/src/java/org/apache/cassandra/net/MessagingService.java @@ -39,6 +39,7 @@ import org.apache.cassandra.concurrent.Stage; import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.db.SystemKeyspace; +import org.apache.cassandra.exceptions.RequestFailure; import org.apache.cassandra.exceptions.RequestFailureReason; import org.apache.cassandra.locator.InetAddressAndPort; import org.apache.cassandra.locator.Replica; @@ -71,7 +72,7 @@ * message is received, {@link RequestCallback#onResponse(Message)} method will be invoked on the * provided callback - in case of a success response. In case of a failure response (see {@link Verb#FAILURE_RSP}), * or if a response doesn't arrive within verb's configured expiry time, - * {@link RequestCallback#onFailure(InetAddressAndPort, RequestFailureReason)} will be invoked instead. + * {@link RequestCallback#onFailure(InetAddressAndPort, RequestFailure)} will be invoked instead. * 2. To send a response back, or a message that expects no response, use {@link #send(Message, InetAddressAndPort)} * method. * @@ -223,22 +224,7 @@ public enum Version VERSION_50(13), VERSION_51(14); - public static final Version CURRENT; - - private static final Logger logger = LoggerFactory.getLogger(Version.class); - - static - { - if (DatabaseDescriptor.getStorageCompatibilityMode().isBefore(5)) - { - logger.warn("Starting in storage compatibility mode " + DatabaseDescriptor.getStorageCompatibilityMode()); - CURRENT = VERSION_40; - } - else - { - CURRENT = VERSION_51; - } - } + public static final Version MIN_ACCORD_VERSION = Version.VERSION_51; public final int value; @@ -247,6 +233,13 @@ public enum Version this.value = value; } + public static Version current() + { + // this enum is leveraged in yaml config so can not touch DatabaseDescriptor to figure out + // what the "current" is, so need to leverage MessagingService's field as it uses DatabaseDescriptor + return current; + } + public static List supportedVersions() { List versions = Lists.newArrayList(); @@ -256,6 +249,17 @@ public static List supportedVersions() return Collections.unmodifiableList(versions); } + + public List greaterThanOrEqual() + { + Version[] all = Version.values(); + if (ordinal() == all.length - 1) + return Collections.singletonList(this); + List values = new ArrayList<>(all.length - ordinal()); + for (int i = ordinal(); i < all.length; i++) + values.add(all[i]); + return values; + } } // Maintance Note: // Try to keep Version enum in-sync for testing. By having the versions in the enum tests can get access without forcing this class @@ -274,7 +278,8 @@ public static List supportedVersions() // we want to use a modified behavior for the tools and clients - that is, since they are not running a server, they // should not need to run in a compatibility mode. They should be able to connect to the server regardless whether // it uses messaving version 4 or 5 - public static final int current_version = DatabaseDescriptor.getStorageCompatibilityMode().isBefore(5) ? VERSION_40 : VERSION_51; + public static final Version current = DatabaseDescriptor.getStorageCompatibilityMode().isBefore(5) ? Version.VERSION_40 : Version.VERSION_51; + public static final int current_version = current.value; static AcceptVersions accept_messaging; static AcceptVersions accept_streaming; static @@ -368,9 +373,9 @@ public void onResponse(Message msg) } @Override - public void onFailure(InetAddressAndPort from, RequestFailureReason failureReason) + public void onFailure(InetAddressAndPort from, RequestFailure failure) { - promise.tryFailure(new FailureResponseException(from, failureReason)); + promise.tryFailure(new FailureResponseException(from, failure)); } @Override @@ -387,11 +392,11 @@ public static class FailureResponseException extends IOException private final InetAddressAndPort from; private final RequestFailureReason failureReason; - public FailureResponseException(InetAddressAndPort from, RequestFailureReason failureReason) + public FailureResponseException(InetAddressAndPort from, RequestFailure failureReason) { - super(String.format("Failure from %s: %s", from, failureReason.name())); + super(String.format("Failure from %s: %s", from, failureReason.reason.name()), failureReason.failure); this.from = from; - this.failureReason = failureReason; + this.failureReason = failureReason.reason; } public InetAddressAndPort from() @@ -467,6 +472,7 @@ public void send(Message message, InetAddressAndPort to) * @param message messages to be sent. * @param response */ + @Override public void respond(V response, Message message) { send(message.responseWith(response), message.respondTo()); @@ -485,9 +491,9 @@ public void onResponse(Message msg) } @Override - public void onFailure(InetAddressAndPort from, RequestFailureReason failureReason) + public void onFailure(InetAddressAndPort from, RequestFailure failure) { - future.setFailure(new RuntimeException(failureReason.toString())); + future.setFailure(new RuntimeException(failure.toString())); } }); @@ -496,7 +502,7 @@ public void onFailure(InetAddressAndPort from, RequestFailureReason failureReaso public void respondWithFailure(RequestFailureReason reason, Message message) { - Message r = Message.failureResponse(message.id(), message.expiresAtNanos(), reason); + Message r = Message.failureResponse(message.id(), message.expiresAtNanos(), new RequestFailure(reason, null)); if (r.header.hasFlag(MessageFlag.URGENT)) r = r.withFlag(MessageFlag.URGENT); send(r, message.respondTo()); diff --git a/src/java/org/apache/cassandra/net/MessagingUtils.java b/src/java/org/apache/cassandra/net/MessagingUtils.java new file mode 100644 index 000000000000..11735f8d7f66 --- /dev/null +++ b/src/java/org/apache/cassandra/net/MessagingUtils.java @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.net; + +import java.util.Collection; +import java.util.Iterator; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.cassandra.locator.InetAddressAndPort; +import org.apache.cassandra.repair.SharedContext; + +public class MessagingUtils +{ + private static final Logger logger = LoggerFactory.getLogger(MessagingUtils.class); + + /** + * Candidate iterator that would try all endpoints known to be alive first, and then try all endpoints + * in a round-robin manner. + *

+ * Calls onIteration every time after exhausting the peers. + */ + public static Iterator tryAliveFirst(SharedContext context, Collection peers, String verb) + { + return new Iterator<>() + { + boolean firstRun = true; + int attempt = 0; + Iterator iter = peers.iterator(); + boolean isEmpty = !iter.hasNext(); + + public boolean hasNext() + { + return !isEmpty; + } + + public InetAddressAndPort next() + { + // At first, try all alive nodes + if (firstRun) + { + while (iter.hasNext()) + { + InetAddressAndPort candidate = iter.next(); + if (context.failureDetector().isAlive(candidate)) + return candidate; + } + firstRun = false; + } + + // After that, cycle through all nodes + if (!iter.hasNext()) + { + logger.warn("Exhausted iterator on {} cycling through the set of peers: {} attempt #{}", verb, peers, attempt++); + iter = peers.iterator(); + } + + return iter.next(); + } + }; + } +} \ No newline at end of file diff --git a/src/java/org/apache/cassandra/net/NoPayload.java b/src/java/org/apache/cassandra/net/NoPayload.java index 3b2b1772a8cc..566b01df648e 100644 --- a/src/java/org/apache/cassandra/net/NoPayload.java +++ b/src/java/org/apache/cassandra/net/NoPayload.java @@ -18,6 +18,7 @@ package org.apache.cassandra.net; import org.apache.cassandra.io.IVersionedSerializer; +import org.apache.cassandra.io.UnversionedSerializer; import org.apache.cassandra.io.util.DataInputPlus; import org.apache.cassandra.io.util.DataOutputPlus; @@ -50,4 +51,22 @@ public long serializedSize(NoPayload noPayload, int version) return 0; } }; + public static final UnversionedSerializer unversionedSerializer = new UnversionedSerializer() + { + public void serialize(NoPayload noPayload, DataOutputPlus out) + { + if (noPayload != NoPayload.noPayload) + throw new IllegalArgumentException(); + } + + public NoPayload deserialize(DataInputPlus in) + { + return noPayload; + } + + public long serializedSize(NoPayload noPayload) + { + return 0; + } + }; } diff --git a/src/java/org/apache/cassandra/net/OutboundConnectionInitiator.java b/src/java/org/apache/cassandra/net/OutboundConnectionInitiator.java index 27e55f6105bf..2bddc174f2e8 100644 --- a/src/java/org/apache/cassandra/net/OutboundConnectionInitiator.java +++ b/src/java/org/apache/cassandra/net/OutboundConnectionInitiator.java @@ -66,9 +66,9 @@ import static java.util.concurrent.TimeUnit.*; import static org.apache.cassandra.auth.IInternodeAuthenticator.InternodeConnectionDirection.OUTBOUND; import static org.apache.cassandra.auth.IInternodeAuthenticator.InternodeConnectionDirection.OUTBOUND_PRECONNECT; -import static org.apache.cassandra.config.EncryptionOptions.ClientAuth.NOT_REQUIRED; -import static org.apache.cassandra.config.EncryptionOptions.ClientAuth.OPTIONAL; -import static org.apache.cassandra.config.EncryptionOptions.ClientAuth.REQUIRED; +import static org.apache.cassandra.config.EncryptionOptions.ClientEncryptionOptions.ClientAuth.NOT_REQUIRED; +import static org.apache.cassandra.config.EncryptionOptions.ClientEncryptionOptions.ClientAuth.OPTIONAL; +import static org.apache.cassandra.config.EncryptionOptions.ClientEncryptionOptions.ClientAuth.REQUIRED; import static org.apache.cassandra.net.InternodeConnectionUtils.DISCARD_HANDLER_NAME; import static org.apache.cassandra.net.InternodeConnectionUtils.SSL_FACTORY_CONTEXT_DESCRIPTION; import static org.apache.cassandra.net.InternodeConnectionUtils.SSL_HANDLER_NAME; @@ -245,7 +245,7 @@ public void initChannel(SocketChannel channel) throws Exception private SslContext getSslContext(SslFallbackConnectionType connectionType) throws IOException { - EncryptionOptions.ClientAuth requireClientAuth = NOT_REQUIRED; + EncryptionOptions.ClientEncryptionOptions.ClientAuth requireClientAuth = NOT_REQUIRED; if (connectionType == SslFallbackConnectionType.MTLS ) { requireClientAuth = REQUIRED; diff --git a/src/java/org/apache/cassandra/net/OutboundMessageQueue.java b/src/java/org/apache/cassandra/net/OutboundMessageQueue.java index 8280055e6891..5098f32bc9e4 100644 --- a/src/java/org/apache/cassandra/net/OutboundMessageQueue.java +++ b/src/java/org/apache/cassandra/net/OutboundMessageQueue.java @@ -27,6 +27,7 @@ import com.google.common.annotations.VisibleForTesting; +import org.apache.cassandra.concurrent.ManyToOneConcurrentLinkedQueue; import org.apache.cassandra.utils.concurrent.CountDownLatch; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/src/java/org/apache/cassandra/net/RequestCallback.java b/src/java/org/apache/cassandra/net/RequestCallback.java index 14e0169b858a..1265b1ea6c60 100644 --- a/src/java/org/apache/cassandra/net/RequestCallback.java +++ b/src/java/org/apache/cassandra/net/RequestCallback.java @@ -19,6 +19,7 @@ import java.util.Map; +import org.apache.cassandra.exceptions.RequestFailure; import org.apache.cassandra.exceptions.RequestFailureReason; import org.apache.cassandra.locator.InetAddressAndPort; @@ -38,7 +39,7 @@ public interface RequestCallback /** * Called when there is an exception on the remote node or timeout happens */ - default void onFailure(InetAddressAndPort from, RequestFailureReason failureReason) + default void onFailure(InetAddressAndPort from, RequestFailure failure) { } diff --git a/src/java/org/apache/cassandra/net/RequestCallbackWithFailure.java b/src/java/org/apache/cassandra/net/RequestCallbackWithFailure.java index 685797abebd1..a7d807380eb1 100644 --- a/src/java/org/apache/cassandra/net/RequestCallbackWithFailure.java +++ b/src/java/org/apache/cassandra/net/RequestCallbackWithFailure.java @@ -18,7 +18,7 @@ package org.apache.cassandra.net; -import org.apache.cassandra.exceptions.RequestFailureReason; +import org.apache.cassandra.exceptions.RequestFailure; import org.apache.cassandra.locator.InetAddressAndPort; public interface RequestCallbackWithFailure extends RequestCallback @@ -26,7 +26,7 @@ public interface RequestCallbackWithFailure extends RequestCallback /** * Called when there is an exception on the remote node or timeout happens */ - void onFailure(InetAddressAndPort from, RequestFailureReason failureReason); + void onFailure(InetAddressAndPort from, RequestFailure failure); /** * @return true if the callback should be invoked on failure diff --git a/src/java/org/apache/cassandra/net/RequestCallbacks.java b/src/java/org/apache/cassandra/net/RequestCallbacks.java index ee63c5a3e652..485efc30b1ea 100644 --- a/src/java/org/apache/cassandra/net/RequestCallbacks.java +++ b/src/java/org/apache/cassandra/net/RequestCallbacks.java @@ -21,17 +21,15 @@ import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentMap; import java.util.concurrent.TimeoutException; - import javax.annotation.Nullable; import com.google.common.annotations.VisibleForTesting; - -import org.apache.cassandra.concurrent.ScheduledExecutorPlus; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.cassandra.concurrent.ScheduledExecutorPlus; import org.apache.cassandra.config.DatabaseDescriptor; -import org.apache.cassandra.exceptions.RequestFailureReason; +import org.apache.cassandra.exceptions.RequestFailure; import org.apache.cassandra.locator.InetAddressAndPort; import org.apache.cassandra.locator.Replica; import org.apache.cassandra.metrics.InternodeOutboundMetrics; @@ -154,7 +152,7 @@ private void onExpired(CallbackInfo info) messagingService.markExpiredCallback(info.peer); if (info.invokeOnFailure()) - INTERNAL_RESPONSE.submit(() -> info.callback.onFailure(info.peer, RequestFailureReason.TIMEOUT)); + INTERNAL_RESPONSE.submit(() -> info.callback.onFailure(info.peer, RequestFailure.TIMEOUT)); } void shutdownNow(boolean expireCallbacks) diff --git a/src/java/org/apache/cassandra/net/ResponseContext.java b/src/java/org/apache/cassandra/net/ResponseContext.java new file mode 100644 index 000000000000..4f254e3e6a45 --- /dev/null +++ b/src/java/org/apache/cassandra/net/ResponseContext.java @@ -0,0 +1,29 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.net; + +import accord.messages.ReplyContext; +import org.apache.cassandra.locator.InetAddressAndPort; + +public interface ResponseContext extends ReplyContext +{ + long id(); + InetAddressAndPort from(); + Verb verb(); + long expiresAtNanos(); +} diff --git a/src/java/org/apache/cassandra/net/ResponseVerbHandler.java b/src/java/org/apache/cassandra/net/ResponseVerbHandler.java index 36e5cf067040..517a10fd2a29 100644 --- a/src/java/org/apache/cassandra/net/ResponseVerbHandler.java +++ b/src/java/org/apache/cassandra/net/ResponseVerbHandler.java @@ -24,6 +24,7 @@ import org.slf4j.LoggerFactory; import org.apache.cassandra.concurrent.Stage; +import org.apache.cassandra.exceptions.RequestFailure; import org.apache.cassandra.exceptions.RequestFailureReason; import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.tcm.ClusterMetadataService; @@ -33,6 +34,7 @@ import static java.util.concurrent.TimeUnit.NANOSECONDS; import static org.apache.cassandra.exceptions.RequestFailureReason.COORDINATOR_BEHIND; import static org.apache.cassandra.exceptions.RequestFailureReason.INVALID_ROUTING; +import static org.apache.cassandra.exceptions.RequestFailureReason.RETRY_ON_DIFFERENT_TRANSACTION_SYSTEM; import static org.apache.cassandra.utils.MonotonicClock.Global.approxTime; class ResponseVerbHandler implements IVerbHandler @@ -58,7 +60,11 @@ class ResponseVerbHandler implements IVerbHandler @Override public void doVerb(Message message) { - RequestCallbacks.CallbackInfo callbackInfo = MessagingService.instance().callbacks.remove(message.id(), message.from()); + RequestCallbacks.CallbackInfo callbackInfo; + if (message.header.isFinal()) + callbackInfo = MessagingService.instance().callbacks.remove(message.id(), message.from()); + else + callbackInfo = MessagingService.instance().callbacks.get(message.id(), message.from()); if (callbackInfo == null) { String msg = "Callback already removed for {} (from {})"; @@ -74,7 +80,7 @@ public void doVerb(Message message) RequestCallback cb = callbackInfo.callback; if (message.isFailureResponse()) { - cb.onFailure(message.from(), (RequestFailureReason) message.payload); + cb.onFailure(message.from(), (RequestFailure) message.payload); } else { @@ -97,8 +103,11 @@ private void maybeFetchLogs(Message message) // Gossip stage is single-threaded, so we may end up in a deadlock with after-commit hook // that executes something on the gossip stage as well. - if (message.isFailureResponse() && - (message.payload == COORDINATOR_BEHIND || message.payload == INVALID_ROUTING) && + boolean isFailureResponse = message.isFailureResponse(); + // RequestFailure is not a singleton so we need to extract and compare against the reason + RequestFailureReason reason = isFailureResponse ? ((RequestFailure)message.payload).reason : null; + if (isFailureResponse && + (reason == COORDINATOR_BEHIND || reason == INVALID_ROUTING || reason == RETRY_ON_DIFFERENT_TRANSACTION_SYSTEM) && // Gossip stage is single-threaded, so we may end up in a deadlock with after-commit hook // that executes something on the gossip stage as well. !Stage.GOSSIP.executor().inExecutor()) diff --git a/src/java/org/apache/cassandra/net/SocketFactory.java b/src/java/org/apache/cassandra/net/SocketFactory.java index 78256267ac67..bb4013dee59f 100644 --- a/src/java/org/apache/cassandra/net/SocketFactory.java +++ b/src/java/org/apache/cassandra/net/SocketFactory.java @@ -60,6 +60,7 @@ import io.netty.util.concurrent.ThreadPerTaskExecutor; import io.netty.util.internal.logging.InternalLoggerFactory; import io.netty.util.internal.logging.Slf4JLoggerFactory; +import org.apache.cassandra.concurrent.ManyToOneConcurrentLinkedQueue; import org.apache.cassandra.concurrent.NamedThreadFactory; import org.apache.cassandra.config.EncryptionOptions; import org.apache.cassandra.locator.InetAddressAndPort; diff --git a/src/java/org/apache/cassandra/net/Verb.java b/src/java/org/apache/cassandra/net/Verb.java index c2cce663efd6..5253124ffee2 100644 --- a/src/java/org/apache/cassandra/net/Verb.java +++ b/src/java/org/apache/cassandra/net/Verb.java @@ -41,13 +41,13 @@ import org.apache.cassandra.db.ReadRepairVerbHandler; import org.apache.cassandra.db.ReadResponse; import org.apache.cassandra.db.SnapshotCommand; +import org.apache.cassandra.db.TruncateRequest; import org.apache.cassandra.db.TruncateResponse; import org.apache.cassandra.db.TruncateVerbHandler; -import org.apache.cassandra.db.TruncateRequest; -import org.apache.cassandra.exceptions.RequestFailureReason; +import org.apache.cassandra.exceptions.RequestFailure; +import org.apache.cassandra.gms.GossipDigestAck; import org.apache.cassandra.gms.GossipDigestAck2; import org.apache.cassandra.gms.GossipDigestAck2VerbHandler; -import org.apache.cassandra.gms.GossipDigestAck; import org.apache.cassandra.gms.GossipDigestAckVerbHandler; import org.apache.cassandra.gms.GossipDigestSyn; import org.apache.cassandra.gms.GossipDigestSynVerbHandler; @@ -55,6 +55,8 @@ import org.apache.cassandra.gms.GossipShutdownVerbHandler; import org.apache.cassandra.hints.HintMessage; import org.apache.cassandra.hints.HintVerbHandler; +import org.apache.cassandra.io.AsymmetricUnversionedSerializer; +import org.apache.cassandra.io.AsymmetricVersionedSerializer; import org.apache.cassandra.io.IVersionedAsymmetricSerializer; import org.apache.cassandra.repair.RepairMessageVerbHandler; import org.apache.cassandra.repair.messages.CleanupMessage; @@ -68,93 +70,156 @@ import org.apache.cassandra.repair.messages.SnapshotMessage; import org.apache.cassandra.repair.messages.StatusRequest; import org.apache.cassandra.repair.messages.StatusResponse; -import org.apache.cassandra.repair.messages.SyncResponse; import org.apache.cassandra.repair.messages.SyncRequest; -import org.apache.cassandra.repair.messages.ValidationResponse; +import org.apache.cassandra.repair.messages.SyncResponse; import org.apache.cassandra.repair.messages.ValidationRequest; +import org.apache.cassandra.repair.messages.ValidationResponse; import org.apache.cassandra.schema.SchemaMutationsSerializer; import org.apache.cassandra.schema.SchemaPullVerbHandler; import org.apache.cassandra.schema.SchemaPushVerbHandler; import org.apache.cassandra.schema.SchemaVersionVerbHandler; +import org.apache.cassandra.service.EchoVerbHandler; +import org.apache.cassandra.service.SnapshotVerbHandler; +import org.apache.cassandra.service.accord.AccordSerializers; +import org.apache.cassandra.service.accord.AccordService; +import org.apache.cassandra.service.accord.AccordSyncPropagator; +import org.apache.cassandra.service.accord.AccordSyncPropagator.Notification; +import org.apache.cassandra.service.accord.FetchTopologies; +import org.apache.cassandra.service.accord.WatermarkCollector; +import org.apache.cassandra.service.accord.interop.AccordInteropApply; +import org.apache.cassandra.service.accord.interop.AccordInteropRead; +import org.apache.cassandra.service.accord.interop.AccordInteropReadRepair; +import org.apache.cassandra.service.accord.interop.AccordInteropStableThenRead; +import org.apache.cassandra.service.accord.serializers.AcceptSerializers; +import org.apache.cassandra.service.accord.serializers.ApplySerializers; +import org.apache.cassandra.service.accord.serializers.AwaitSerializers; +import org.apache.cassandra.service.accord.serializers.BeginInvalidationSerializers; +import org.apache.cassandra.service.accord.serializers.CheckStatusSerializers; +import org.apache.cassandra.service.accord.serializers.CommitSerializers; +import org.apache.cassandra.service.accord.serializers.EnumSerializer; +import org.apache.cassandra.service.accord.serializers.FetchSerializers; +import org.apache.cassandra.service.accord.serializers.GetEphmrlReadDepsSerializers; +import org.apache.cassandra.service.accord.serializers.GetMaxConflictSerializers; +import org.apache.cassandra.service.accord.serializers.InformDurableSerializers; +import org.apache.cassandra.service.accord.serializers.LatestDepsSerializers; +import org.apache.cassandra.service.accord.serializers.PreacceptSerializers; +import org.apache.cassandra.service.accord.serializers.GetDurableBeforeSerializers; +import org.apache.cassandra.service.accord.serializers.ReadDataSerializers; +import org.apache.cassandra.service.accord.serializers.RecoverySerializers; +import org.apache.cassandra.service.accord.serializers.SetDurableSerializers; +import org.apache.cassandra.service.accord.serializers.Version; +import org.apache.cassandra.service.consensus.migration.ConsensusKeyMigrationState; +import org.apache.cassandra.service.consensus.migration.ConsensusKeyMigrationState.ConsensusKeyMigrationFinished; +import org.apache.cassandra.service.paxos.Commit; +import org.apache.cassandra.service.paxos.Commit.Agreed; import org.apache.cassandra.service.paxos.PaxosCommit; import org.apache.cassandra.service.paxos.PaxosCommitAndPrepare; import org.apache.cassandra.service.paxos.PaxosPrepare; import org.apache.cassandra.service.paxos.PaxosPrepareRefresh; import org.apache.cassandra.service.paxos.PaxosPropose; import org.apache.cassandra.service.paxos.PaxosRepair; +import org.apache.cassandra.service.paxos.PrepareResponse; +import org.apache.cassandra.service.paxos.cleanup.PaxosCleanupComplete; import org.apache.cassandra.service.paxos.cleanup.PaxosCleanupHistory; import org.apache.cassandra.service.paxos.cleanup.PaxosCleanupRequest; import org.apache.cassandra.service.paxos.cleanup.PaxosCleanupResponse; -import org.apache.cassandra.service.paxos.cleanup.PaxosCleanupComplete; -import org.apache.cassandra.service.paxos.cleanup.PaxosStartPrepareCleanup; import org.apache.cassandra.service.paxos.cleanup.PaxosFinishPrepareCleanup; +import org.apache.cassandra.service.paxos.cleanup.PaxosStartPrepareCleanup; +import org.apache.cassandra.service.paxos.v1.PrepareVerbHandler; +import org.apache.cassandra.service.paxos.v1.ProposeVerbHandler; import org.apache.cassandra.streaming.DataMovement; import org.apache.cassandra.streaming.DataMovementVerbHandler; +import org.apache.cassandra.streaming.ReplicationDoneVerbHandler; import org.apache.cassandra.tcm.Discovery; import org.apache.cassandra.tcm.Epoch; import org.apache.cassandra.tcm.FetchCMSLog; import org.apache.cassandra.tcm.FetchPeerLog; +import org.apache.cassandra.tcm.ReconstructLogState; import org.apache.cassandra.tcm.migration.CMSInitializationResponse; import org.apache.cassandra.tcm.migration.Election; import org.apache.cassandra.tcm.migration.CMSInitializationRequest; import org.apache.cassandra.tcm.sequences.DataMovements; import org.apache.cassandra.tcm.serialization.MessageSerializers; import org.apache.cassandra.utils.BooleanSerializer; -import org.apache.cassandra.service.EchoVerbHandler; -import org.apache.cassandra.service.SnapshotVerbHandler; -import org.apache.cassandra.service.paxos.Commit; -import org.apache.cassandra.service.paxos.Commit.Agreed; -import org.apache.cassandra.service.paxos.PrepareResponse; -import org.apache.cassandra.service.paxos.v1.PrepareVerbHandler; -import org.apache.cassandra.service.paxos.v1.ProposeVerbHandler; -import org.apache.cassandra.streaming.ReplicationDoneVerbHandler; import org.apache.cassandra.utils.ReflectionUtils; import org.apache.cassandra.utils.TimeUUID; import org.apache.cassandra.utils.UUIDSerializer; import static java.util.concurrent.TimeUnit.NANOSECONDS; -import static org.apache.cassandra.concurrent.Stage.*; -import static org.apache.cassandra.net.VerbTimeouts.*; -import static org.apache.cassandra.net.Verb.Kind.*; -import static org.apache.cassandra.net.Verb.Priority.*; +import static org.apache.cassandra.concurrent.Stage.ANTI_ENTROPY; +import static org.apache.cassandra.concurrent.Stage.COUNTER_MUTATION; +import static org.apache.cassandra.concurrent.Stage.FETCH_METADATA; +import static org.apache.cassandra.concurrent.Stage.GOSSIP; +import static org.apache.cassandra.concurrent.Stage.IMMEDIATE; +import static org.apache.cassandra.concurrent.Stage.INTERNAL_METADATA; +import static org.apache.cassandra.concurrent.Stage.INTERNAL_RESPONSE; +import static org.apache.cassandra.concurrent.Stage.MIGRATION; +import static org.apache.cassandra.concurrent.Stage.MISC; +import static org.apache.cassandra.concurrent.Stage.MUTATION; +import static org.apache.cassandra.concurrent.Stage.PAXOS_REPAIR; +import static org.apache.cassandra.concurrent.Stage.READ; +import static org.apache.cassandra.concurrent.Stage.REQUEST_RESPONSE; +import static org.apache.cassandra.concurrent.Stage.TRACING; +import static org.apache.cassandra.net.ResponseHandlerSupplier.RESPONSE_HANDLER; +import static org.apache.cassandra.net.Verb.Kind.CUSTOM; +import static org.apache.cassandra.net.Verb.Kind.NORMAL; +import static org.apache.cassandra.net.Verb.Priority.P0; +import static org.apache.cassandra.net.Verb.Priority.P1; +import static org.apache.cassandra.net.Verb.Priority.P2; +import static org.apache.cassandra.net.Verb.Priority.P3; +import static org.apache.cassandra.net.Verb.Priority.P4; +import static org.apache.cassandra.net.VerbTimeouts.counterTimeout; +import static org.apache.cassandra.net.VerbTimeouts.longTimeout; +import static org.apache.cassandra.net.VerbTimeouts.noTimeout; +import static org.apache.cassandra.net.VerbTimeouts.pingTimeout; +import static org.apache.cassandra.net.VerbTimeouts.rangeTimeout; +import static org.apache.cassandra.net.VerbTimeouts.readTimeout; +import static org.apache.cassandra.net.VerbTimeouts.repairTimeout; +import static org.apache.cassandra.net.VerbTimeouts.repairValidationRspTimeout; +import static org.apache.cassandra.net.VerbTimeouts.repairWithBackoffTimeout; +import static org.apache.cassandra.net.VerbTimeouts.rpcTimeout; +import static org.apache.cassandra.net.VerbTimeouts.shortTimeout; +import static org.apache.cassandra.net.VerbTimeouts.truncateTimeout; +import static org.apache.cassandra.net.VerbTimeouts.writeTimeout; import static org.apache.cassandra.tcm.ClusterMetadataService.commitRequestHandler; import static org.apache.cassandra.tcm.ClusterMetadataService.currentEpochRequestHandler; -import static org.apache.cassandra.tcm.ClusterMetadataService.logNotifyHandler; import static org.apache.cassandra.tcm.ClusterMetadataService.fetchLogRequestHandler; +import static org.apache.cassandra.tcm.ClusterMetadataService.logNotifyHandler; import static org.apache.cassandra.tcm.ClusterMetadataService.replicationHandler; /** * Note that priorities except P0 are presently unused. P0 corresponds to urgent, i.e. what used to be the "Gossip" connection. */ +@SuppressWarnings("Convert2MethodRef") // we must defer all initialisation, which includes e.g. taking a method reference to a static object/singleton, which this inspection does not disambiguate public enum Verb { - MUTATION_RSP (60, P1, writeTimeout, REQUEST_RESPONSE, () -> NoPayload.serializer, () -> ResponseVerbHandler.instance ), + MUTATION_RSP (60, P1, writeTimeout, REQUEST_RESPONSE, () -> NoPayload.serializer, RESPONSE_HANDLER ), MUTATION_REQ (0, P3, writeTimeout, MUTATION, () -> Mutation.serializer, () -> MutationVerbHandler.instance, MUTATION_RSP ), - HINT_RSP (61, P1, writeTimeout, REQUEST_RESPONSE, () -> NoPayload.serializer, () -> ResponseVerbHandler.instance ), + HINT_RSP (61, P1, writeTimeout, REQUEST_RESPONSE, () -> NoPayload.serializer, RESPONSE_HANDLER ), HINT_REQ (1, P4, writeTimeout, MUTATION, () -> HintMessage.serializer, () -> HintVerbHandler.instance, HINT_RSP ), - READ_REPAIR_RSP (62, P1, writeTimeout, REQUEST_RESPONSE, () -> NoPayload.serializer, () -> ResponseVerbHandler.instance ), + READ_REPAIR_RSP (62, P1, writeTimeout, REQUEST_RESPONSE, () -> NoPayload.serializer, RESPONSE_HANDLER ), READ_REPAIR_REQ (2, P1, writeTimeout, MUTATION, () -> Mutation.serializer, () -> ReadRepairVerbHandler.instance, READ_REPAIR_RSP ), - BATCH_STORE_RSP (65, P1, writeTimeout, REQUEST_RESPONSE, () -> NoPayload.serializer, () -> ResponseVerbHandler.instance ), + BATCH_STORE_RSP (65, P1, writeTimeout, REQUEST_RESPONSE, () -> NoPayload.serializer, RESPONSE_HANDLER ), BATCH_STORE_REQ (5, P3, writeTimeout, MUTATION, () -> Batch.serializer, () -> BatchStoreVerbHandler.instance, BATCH_STORE_RSP ), - BATCH_REMOVE_RSP (66, P1, writeTimeout, REQUEST_RESPONSE, () -> NoPayload.serializer, () -> ResponseVerbHandler.instance ), + BATCH_REMOVE_RSP (66, P1, writeTimeout, REQUEST_RESPONSE, () -> NoPayload.serializer, RESPONSE_HANDLER ), BATCH_REMOVE_REQ (6, P3, writeTimeout, MUTATION, () -> TimeUUID.Serializer.instance, () -> BatchRemoveVerbHandler.instance, BATCH_REMOVE_RSP ), - PAXOS_PREPARE_RSP (93, P2, writeTimeout, REQUEST_RESPONSE, () -> PrepareResponse.serializer, () -> ResponseVerbHandler.instance ), + PAXOS_PREPARE_RSP (93, P2, writeTimeout, REQUEST_RESPONSE, () -> PrepareResponse.serializer, RESPONSE_HANDLER ), PAXOS_PREPARE_REQ (33, P2, writeTimeout, MUTATION, () -> Commit.serializer, () -> PrepareVerbHandler.instance, PAXOS_PREPARE_RSP ), - PAXOS_PROPOSE_RSP (94, P2, writeTimeout, REQUEST_RESPONSE, () -> BooleanSerializer.serializer, () -> ResponseVerbHandler.instance ), + PAXOS_PROPOSE_RSP (94, P2, writeTimeout, REQUEST_RESPONSE, () -> BooleanSerializer.messagingSerializer, RESPONSE_HANDLER ), PAXOS_PROPOSE_REQ (34, P2, writeTimeout, MUTATION, () -> Commit.serializer, () -> ProposeVerbHandler.instance, PAXOS_PROPOSE_RSP ), - PAXOS_COMMIT_RSP (95, P2, writeTimeout, REQUEST_RESPONSE, () -> NoPayload.serializer, () -> ResponseVerbHandler.instance ), + PAXOS_COMMIT_RSP (95, P2, writeTimeout, REQUEST_RESPONSE, () -> NoPayload.serializer, RESPONSE_HANDLER ), PAXOS_COMMIT_REQ (35, P2, writeTimeout, MUTATION, () -> Agreed.serializer, () -> PaxosCommit.requestHandler, PAXOS_COMMIT_RSP ), - TRUNCATE_RSP (79, P0, truncateTimeout, REQUEST_RESPONSE, () -> TruncateResponse.serializer, () -> ResponseVerbHandler.instance ), + TRUNCATE_RSP (79, P0, truncateTimeout, REQUEST_RESPONSE, () -> TruncateResponse.serializer, RESPONSE_HANDLER ), TRUNCATE_REQ (19, P0, truncateTimeout, MUTATION, () -> TruncateRequest.serializer, () -> TruncateVerbHandler.instance, TRUNCATE_RSP ), - COUNTER_MUTATION_RSP (84, P1, counterTimeout, REQUEST_RESPONSE, () -> NoPayload.serializer, () -> ResponseVerbHandler.instance ), + COUNTER_MUTATION_RSP (84, P1, counterTimeout, REQUEST_RESPONSE, () -> NoPayload.serializer, RESPONSE_HANDLER ), COUNTER_MUTATION_REQ (24, P2, counterTimeout, COUNTER_MUTATION, () -> CounterMutation.serializer, () -> CounterMutationVerbHandler.instance, COUNTER_MUTATION_RSP), - READ_RSP (63, P2, readTimeout, REQUEST_RESPONSE, () -> ReadResponse.serializer, () -> ResponseVerbHandler.instance ), + READ_RSP (63, P2, readTimeout, REQUEST_RESPONSE, () -> ReadResponse.serializer, RESPONSE_HANDLER ), READ_REQ (3, P3, readTimeout, READ, () -> ReadCommand.serializer, () -> ReadCommandVerbHandler.instance, READ_RSP ), - RANGE_RSP (69, P2, rangeTimeout, REQUEST_RESPONSE, () -> ReadResponse.serializer, () -> ResponseVerbHandler.instance ), + RANGE_RSP (69, P2, rangeTimeout, REQUEST_RESPONSE, () -> ReadResponse.serializer, RESPONSE_HANDLER ), RANGE_REQ (9, P3, rangeTimeout, READ, () -> ReadCommand.serializer, () -> ReadCommandVerbHandler.instance, RANGE_RSP ), GOSSIP_DIGEST_SYN (14, P0, longTimeout, GOSSIP, () -> GossipDigestSyn.serializer, () -> GossipDigestSynVerbHandler.instance ), @@ -162,26 +227,26 @@ public enum Verb GOSSIP_DIGEST_ACK2 (16, P0, longTimeout, GOSSIP, () -> GossipDigestAck2.serializer, () -> GossipDigestAck2VerbHandler.instance ), GOSSIP_SHUTDOWN (29, P0, rpcTimeout, GOSSIP, () -> GossipShutdown.serializer, () -> GossipShutdownVerbHandler.instance ), - ECHO_RSP (91, P0, rpcTimeout, GOSSIP, () -> NoPayload.serializer, () -> ResponseVerbHandler.instance ), + ECHO_RSP (91, P0, rpcTimeout, GOSSIP, () -> NoPayload.serializer, RESPONSE_HANDLER ), ECHO_REQ (31, P0, rpcTimeout, GOSSIP, () -> NoPayload.serializer, () -> EchoVerbHandler.instance, ECHO_RSP ), - PING_RSP (97, P1, pingTimeout, GOSSIP, () -> NoPayload.serializer, () -> ResponseVerbHandler.instance ), + PING_RSP (97, P1, pingTimeout, GOSSIP, () -> NoPayload.serializer, RESPONSE_HANDLER ), PING_REQ (37, P1, pingTimeout, GOSSIP, () -> PingRequest.serializer, () -> PingVerbHandler.instance, PING_RSP ), // P1 because messages can be arbitrarily large or aren't crucial @Deprecated (since = "CEP-21") - SCHEMA_PUSH_RSP (98, P1, rpcTimeout, MIGRATION, () -> NoPayload.serializer, () -> ResponseVerbHandler.instance ), + SCHEMA_PUSH_RSP (98, P1, rpcTimeout, MIGRATION, () -> NoPayload.serializer, RESPONSE_HANDLER ), @Deprecated (since = "CEP-21") SCHEMA_PUSH_REQ (18, P1, rpcTimeout, MIGRATION, () -> SchemaMutationsSerializer.instance, () -> SchemaPushVerbHandler.instance, SCHEMA_PUSH_RSP ), @Deprecated (since = "CEP-21") - SCHEMA_PULL_RSP (88, P1, rpcTimeout, MIGRATION, () -> SchemaMutationsSerializer.instance, () -> ResponseVerbHandler.instance ), + SCHEMA_PULL_RSP (88, P1, rpcTimeout, MIGRATION, () -> SchemaMutationsSerializer.instance, RESPONSE_HANDLER ), @Deprecated (since = "CEP-21") SCHEMA_PULL_REQ (28, P1, rpcTimeout, MIGRATION, () -> NoPayload.serializer, () -> SchemaPullVerbHandler.instance, SCHEMA_PULL_RSP ), - SCHEMA_VERSION_RSP (80, P1, rpcTimeout, MIGRATION, () -> UUIDSerializer.serializer, () -> ResponseVerbHandler.instance ), + SCHEMA_VERSION_RSP (80, P1, rpcTimeout, MIGRATION, () -> UUIDSerializer.serializer, RESPONSE_HANDLER ), SCHEMA_VERSION_REQ (20, P1, rpcTimeout, MIGRATION, () -> NoPayload.serializer, () -> SchemaVersionVerbHandler.instance, SCHEMA_VERSION_RSP ), // repair; mostly doesn't use callbacks and sends responses as their own request messages, with matching sessions by uuid; should eventually harmonize and make idiomatic // for the repair messages that implement retry logic, use rpcTimeout so the single request fails faster, then retries can be used to recover - REPAIR_RSP (100, P1, repairTimeout, REQUEST_RESPONSE, () -> NoPayload.serializer, () -> ResponseVerbHandler.instance ), + REPAIR_RSP (100, P1, repairTimeout, REQUEST_RESPONSE, () -> NoPayload.serializer, RESPONSE_HANDLER ), VALIDATION_RSP (102, P1, repairValidationRspTimeout, ANTI_ENTROPY, () -> ValidationResponse.serializer, () -> RepairMessageVerbHandler.instance(), REPAIR_RSP ), VALIDATION_REQ (101, P1, repairWithBackoffTimeout, ANTI_ENTROPY, () -> ValidationRequest.serializer, () -> RepairMessageVerbHandler.instance(), REPAIR_RSP ), SYNC_RSP (104, P1, repairWithBackoffTimeout, ANTI_ENTROPY, () -> SyncResponse.serializer, () -> RepairMessageVerbHandler.instance(), REPAIR_RSP ), @@ -198,57 +263,117 @@ public enum Verb STATUS_RSP (115, P1, repairTimeout, ANTI_ENTROPY, () -> StatusResponse.serializer, () -> RepairMessageVerbHandler.instance(), REPAIR_RSP ), STATUS_REQ (114, P1, repairTimeout, ANTI_ENTROPY, () -> StatusRequest.serializer, () -> RepairMessageVerbHandler.instance(), REPAIR_RSP ), - REPLICATION_DONE_RSP (82, P0, rpcTimeout, MISC, () -> NoPayload.serializer, () -> ResponseVerbHandler.instance ), + REPLICATION_DONE_RSP (82, P0, rpcTimeout, MISC, () -> NoPayload.serializer, RESPONSE_HANDLER ), REPLICATION_DONE_REQ (22, P0, rpcTimeout, MISC, () -> NoPayload.serializer, () -> ReplicationDoneVerbHandler.instance, REPLICATION_DONE_RSP), - SNAPSHOT_RSP (87, P0, rpcTimeout, MISC, () -> NoPayload.serializer, () -> ResponseVerbHandler.instance ), + SNAPSHOT_RSP (87, P0, rpcTimeout, MISC, () -> NoPayload.serializer, RESPONSE_HANDLER ), SNAPSHOT_REQ (27, P0, rpcTimeout, MISC, () -> SnapshotCommand.serializer, () -> SnapshotVerbHandler.instance, SNAPSHOT_RSP ), PAXOS2_COMMIT_REMOTE_REQ (38, P2, writeTimeout, MUTATION, () -> Mutation.serializer, () -> MutationVerbHandler.instance, MUTATION_RSP ), - PAXOS2_COMMIT_REMOTE_RSP (39, P2, writeTimeout, REQUEST_RESPONSE, () -> NoPayload.serializer, () -> ResponseVerbHandler.instance ), - PAXOS2_PREPARE_RSP (50, P2, writeTimeout, REQUEST_RESPONSE, () -> PaxosPrepare.responseSerializer, () -> ResponseVerbHandler.instance ), + PAXOS2_COMMIT_REMOTE_RSP (39, P2, writeTimeout, REQUEST_RESPONSE, () -> NoPayload.serializer, RESPONSE_HANDLER ), + PAXOS2_PREPARE_RSP (50, P2, writeTimeout, REQUEST_RESPONSE, () -> PaxosPrepare.responseSerializer, RESPONSE_HANDLER ), PAXOS2_PREPARE_REQ (40, P2, writeTimeout, MUTATION, () -> PaxosPrepare.requestSerializer, () -> PaxosPrepare.requestHandler, PAXOS2_PREPARE_RSP ), - PAXOS2_PREPARE_REFRESH_RSP (51, P2, writeTimeout, REQUEST_RESPONSE, () -> PaxosPrepareRefresh.responseSerializer, () -> ResponseVerbHandler.instance ), + PAXOS2_PREPARE_REFRESH_RSP (51, P2, writeTimeout, REQUEST_RESPONSE, () -> PaxosPrepareRefresh.responseSerializer, RESPONSE_HANDLER ), PAXOS2_PREPARE_REFRESH_REQ (41, P2, writeTimeout, MUTATION, () -> PaxosPrepareRefresh.requestSerializer, () -> PaxosPrepareRefresh.requestHandler, PAXOS2_PREPARE_REFRESH_RSP ), - PAXOS2_PROPOSE_RSP (52, P2, writeTimeout, REQUEST_RESPONSE, () -> PaxosPropose.responseSerializer, () -> ResponseVerbHandler.instance ), + PAXOS2_PROPOSE_RSP (52, P2, writeTimeout, REQUEST_RESPONSE, () -> PaxosPropose.ACCEPT_RESULT_SERIALIZER, RESPONSE_HANDLER ), PAXOS2_PROPOSE_REQ (42, P2, writeTimeout, MUTATION, () -> PaxosPropose.requestSerializer, () -> PaxosPropose.requestHandler, PAXOS2_PROPOSE_RSP ), - PAXOS2_COMMIT_AND_PREPARE_RSP (53, P2, writeTimeout, REQUEST_RESPONSE, () -> PaxosPrepare.responseSerializer, () -> ResponseVerbHandler.instance ), + PAXOS2_COMMIT_AND_PREPARE_RSP (53, P2, writeTimeout, REQUEST_RESPONSE, () -> PaxosPrepare.responseSerializer, RESPONSE_HANDLER ), PAXOS2_COMMIT_AND_PREPARE_REQ (43, P2, writeTimeout, MUTATION, () -> PaxosCommitAndPrepare.requestSerializer, () -> PaxosCommitAndPrepare.requestHandler, PAXOS2_COMMIT_AND_PREPARE_RSP ), - PAXOS2_REPAIR_RSP (54, P2, writeTimeout, PAXOS_REPAIR, () -> PaxosRepair.responseSerializer, () -> ResponseVerbHandler.instance ), + PAXOS2_REPAIR_RSP (54, P2, writeTimeout, PAXOS_REPAIR, () -> PaxosRepair.responseSerializer, RESPONSE_HANDLER ), PAXOS2_REPAIR_REQ (44, P2, writeTimeout, PAXOS_REPAIR, () -> PaxosRepair.requestSerializer, () -> PaxosRepair.requestHandler, PAXOS2_REPAIR_RSP ), - PAXOS2_CLEANUP_START_PREPARE_RSP (55, P2, repairTimeout, PAXOS_REPAIR, () -> PaxosCleanupHistory.serializer, () -> ResponseVerbHandler.instance ), + PAXOS2_CLEANUP_START_PREPARE_RSP (55, P2, repairTimeout, PAXOS_REPAIR, () -> PaxosCleanupHistory.serializer, RESPONSE_HANDLER ), PAXOS2_CLEANUP_START_PREPARE_REQ (45, P2, repairTimeout, PAXOS_REPAIR, () -> PaxosStartPrepareCleanup.serializer, () -> PaxosStartPrepareCleanup.verbHandler, PAXOS2_CLEANUP_START_PREPARE_RSP ), - PAXOS2_CLEANUP_RSP (56, P2, repairTimeout, PAXOS_REPAIR, () -> NoPayload.serializer, () -> ResponseVerbHandler.instance ), + PAXOS2_CLEANUP_RSP (56, P2, repairTimeout, PAXOS_REPAIR, () -> NoPayload.serializer, RESPONSE_HANDLER ), PAXOS2_CLEANUP_REQ (46, P2, repairTimeout, PAXOS_REPAIR, () -> PaxosCleanupRequest.serializer, () -> PaxosCleanupRequest.verbHandler, PAXOS2_CLEANUP_RSP ), PAXOS2_CLEANUP_RSP2 (57, P2, repairTimeout, PAXOS_REPAIR, () -> PaxosCleanupResponse.serializer, () -> PaxosCleanupResponse.verbHandler ), - PAXOS2_CLEANUP_FINISH_PREPARE_RSP(58, P2, repairTimeout, PAXOS_REPAIR, () -> NoPayload.serializer, () -> ResponseVerbHandler.instance ), + PAXOS2_CLEANUP_FINISH_PREPARE_RSP(58, P2, repairTimeout, PAXOS_REPAIR, () -> NoPayload.serializer, RESPONSE_HANDLER ), PAXOS2_CLEANUP_FINISH_PREPARE_REQ(47, P2, repairTimeout, IMMEDIATE, () -> PaxosCleanupHistory.serializer, () -> PaxosFinishPrepareCleanup.verbHandler, PAXOS2_CLEANUP_FINISH_PREPARE_RSP), - PAXOS2_CLEANUP_COMPLETE_RSP (59, P2, repairTimeout, PAXOS_REPAIR, () -> NoPayload.serializer, () -> ResponseVerbHandler.instance ), + PAXOS2_CLEANUP_COMPLETE_RSP (59, P2, repairTimeout, PAXOS_REPAIR, () -> NoPayload.serializer, RESPONSE_HANDLER ), PAXOS2_CLEANUP_COMPLETE_REQ (48, P2, repairTimeout, PAXOS_REPAIR, () -> PaxosCleanupComplete.serializer, () -> PaxosCleanupComplete.verbHandler, PAXOS2_CLEANUP_COMPLETE_RSP ), // transactional cluster metadata - TCM_COMMIT_RSP (801, P0, rpcTimeout, INTERNAL_METADATA, MessageSerializers::commitResultSerializer, () -> ResponseVerbHandler.instance ), + TCM_COMMIT_RSP (801, P0, rpcTimeout, INTERNAL_METADATA, MessageSerializers::commitResultSerializer, RESPONSE_HANDLER ), TCM_COMMIT_REQ (802, P0, rpcTimeout, INTERNAL_METADATA, MessageSerializers::commitSerializer, () -> commitRequestHandler(), TCM_COMMIT_RSP ), - TCM_FETCH_CMS_LOG_RSP (803, P0, rpcTimeout, FETCH_LOG, MessageSerializers::logStateSerializer, () -> ResponseVerbHandler.instance ), - TCM_FETCH_CMS_LOG_REQ (804, P0, rpcTimeout, FETCH_LOG, () -> FetchCMSLog.serializer, () -> fetchLogRequestHandler(), TCM_FETCH_CMS_LOG_RSP ), + TCM_FETCH_CMS_LOG_RSP (803, P0, shortTimeout, FETCH_METADATA, MessageSerializers::logStateSerializer, RESPONSE_HANDLER ), + TCM_FETCH_CMS_LOG_REQ (804, P0, rpcTimeout, FETCH_METADATA, () -> FetchCMSLog.serializer, () -> fetchLogRequestHandler(), TCM_FETCH_CMS_LOG_RSP ), TCM_REPLICATION (805, P0, rpcTimeout, INTERNAL_METADATA, MessageSerializers::logStateSerializer, () -> replicationHandler() ), - TCM_NOTIFY_RSP (806, P0, rpcTimeout, INTERNAL_METADATA, () -> Epoch.messageSerializer, () -> ResponseVerbHandler.instance ), + TCM_NOTIFY_RSP (806, P0, rpcTimeout, INTERNAL_METADATA, () -> Epoch.messageSerializer, RESPONSE_HANDLER ), TCM_NOTIFY_REQ (807, P0, rpcTimeout, INTERNAL_METADATA, MessageSerializers::logStateSerializer, () -> logNotifyHandler(), TCM_NOTIFY_RSP ), TCM_CURRENT_EPOCH_REQ (808, P0, rpcTimeout, INTERNAL_METADATA, () -> Epoch.messageSerializer, () -> currentEpochRequestHandler(), TCM_NOTIFY_RSP ), - TCM_INIT_MIG_RSP (809, P0, rpcTimeout, INTERNAL_METADATA, () -> CMSInitializationResponse.serializer, () -> ResponseVerbHandler.instance ), + TCM_INIT_MIG_RSP (809, P0, rpcTimeout, INTERNAL_METADATA, () -> CMSInitializationResponse.serializer, RESPONSE_HANDLER ), TCM_INIT_MIG_REQ (810, P0, rpcTimeout, INTERNAL_METADATA, MessageSerializers::initRequestSerializer, () -> Election.instance.prepareHandler, TCM_INIT_MIG_RSP ), TCM_ABORT_MIG (811, P0, rpcTimeout, INTERNAL_METADATA, () -> CMSInitializationRequest.Initiator.serializer,() -> Election.instance.abortHandler, TCM_INIT_MIG_RSP ), - TCM_DISCOVER_RSP (812, P0, rpcTimeout, INTERNAL_METADATA, () -> Discovery.serializer, () -> ResponseVerbHandler.instance ), + TCM_DISCOVER_RSP (812, P0, rpcTimeout, INTERNAL_METADATA, () -> Discovery.serializer, RESPONSE_HANDLER ), TCM_DISCOVER_REQ (813, P0, rpcTimeout, INTERNAL_METADATA, () -> NoPayload.serializer, () -> Discovery.instance.requestHandler, TCM_DISCOVER_RSP ), - TCM_FETCH_PEER_LOG_RSP (818, P0, rpcTimeout, FETCH_LOG, MessageSerializers::logStateSerializer, () -> ResponseVerbHandler.instance ), - TCM_FETCH_PEER_LOG_REQ (819, P0, rpcTimeout, FETCH_LOG, () -> FetchPeerLog.serializer, () -> FetchPeerLog.Handler.instance, TCM_FETCH_PEER_LOG_RSP ), + TCM_FETCH_PEER_LOG_RSP (818, P0, shortTimeout, FETCH_METADATA, MessageSerializers::logStateSerializer, RESPONSE_HANDLER ), + TCM_FETCH_PEER_LOG_REQ (819, P0, rpcTimeout, FETCH_METADATA, () -> FetchPeerLog.serializer, () -> FetchPeerLog.Handler.instance, TCM_FETCH_PEER_LOG_RSP ), + TCM_RECONSTRUCT_EPOCH_RSP (820, P0, rpcTimeout, FETCH_METADATA, MessageSerializers::logStateSerializer, () -> ResponseVerbHandler.instance ), + TCM_RECONSTRUCT_EPOCH_REQ (821, P0, rpcTimeout, FETCH_METADATA, () -> ReconstructLogState.serializer, () -> ReconstructLogState.Handler.instance, TCM_FETCH_PEER_LOG_RSP ), - INITIATE_DATA_MOVEMENTS_RSP (814, P1, rpcTimeout, MISC, () -> NoPayload.serializer, () -> ResponseVerbHandler.instance ), + INITIATE_DATA_MOVEMENTS_RSP (814, P1, rpcTimeout, MISC, () -> NoPayload.serializer, RESPONSE_HANDLER ), INITIATE_DATA_MOVEMENTS_REQ (815, P1, rpcTimeout, MISC, () -> DataMovement.serializer, () -> DataMovementVerbHandler.instance, INITIATE_DATA_MOVEMENTS_RSP ), - DATA_MOVEMENT_EXECUTED_RSP (816, P1, rpcTimeout, MISC, () -> NoPayload.serializer, () -> ResponseVerbHandler.instance ), + DATA_MOVEMENT_EXECUTED_RSP (816, P1, rpcTimeout, MISC, () -> NoPayload.serializer, RESPONSE_HANDLER ), DATA_MOVEMENT_EXECUTED_REQ (817, P1, rpcTimeout, MISC, () -> DataMovement.Status.serializer, () -> DataMovements.instance, DATA_MOVEMENT_EXECUTED_RSP ), + // accord + ACCORD_SIMPLE_RSP (119, P2, writeTimeout, IMMEDIATE, () -> accordEmbedded(EnumSerializer.simpleReply), AccordService::responseHandlerOrNoop ), + ACCORD_PRE_ACCEPT_RSP (120, P2, writeTimeout, IMMEDIATE, () -> accordEmbedded(PreacceptSerializers.reply), AccordService::responseHandlerOrNoop ), + ACCORD_PRE_ACCEPT_REQ (121, P2, writeTimeout, IMMEDIATE, () -> accordEmbedded(PreacceptSerializers.request), AccordService::requestHandlerOrNoop, ACCORD_PRE_ACCEPT_RSP ), + ACCORD_ACCEPT_RSP (122, P2, writeTimeout, IMMEDIATE, () -> accordEmbedded(AcceptSerializers.reply), AccordService::responseHandlerOrNoop ), + ACCORD_ACCEPT_REQ (123, P2, writeTimeout, IMMEDIATE, () -> accordEmbedded(AcceptSerializers.request), AccordService::requestHandlerOrNoop, ACCORD_ACCEPT_RSP ), + ACCORD_NOT_ACCEPT_REQ (124, P2, writeTimeout, IMMEDIATE, () -> accordEmbedded(AcceptSerializers.notAccept), AccordService::requestHandlerOrNoop, ACCORD_ACCEPT_RSP ), + ACCORD_READ_RSP (125, P2, readTimeout, IMMEDIATE, () -> accordEmbedded(ReadDataSerializers.reply), AccordService::responseHandlerOrNoop ), + ACCORD_READ_REQ (126, P2, readTimeout, IMMEDIATE, () -> accordEmbedded(ReadDataSerializers.readData), AccordService::requestHandlerOrNoop, ACCORD_READ_RSP ), + ACCORD_STABLE_THEN_READ_REQ (127, P2, writeTimeout, IMMEDIATE, () -> accordEmbedded(ReadDataSerializers.stableThenRead), AccordService::requestHandlerOrNoop, ACCORD_READ_RSP ), + ACCORD_COMMIT_REQ (128, P2, writeTimeout, IMMEDIATE, () -> accordEmbedded(CommitSerializers.request), AccordService::requestHandlerOrNoop, ACCORD_READ_RSP ), + ACCORD_COMMIT_INVALIDATE_REQ (129, P2, writeTimeout, IMMEDIATE, () -> accordEmbedded(CommitSerializers.invalidate), AccordService::requestHandlerOrNoop ), + ACCORD_APPLY_RSP (130, P2, writeTimeout, IMMEDIATE, () -> accordEmbedded(ApplySerializers.reply), AccordService::responseHandlerOrNoop ), + ACCORD_APPLY_REQ (131, P2, writeTimeout, IMMEDIATE, () -> accordEmbedded(ApplySerializers.request), AccordService::requestHandlerOrNoop, ACCORD_APPLY_RSP ), + ACCORD_APPLY_AND_WAIT_REQ (132, P2, writeTimeout, IMMEDIATE, () -> accordEmbedded(ReadDataSerializers.readData), AccordService::requestHandlerOrNoop, ACCORD_READ_RSP), + ACCORD_BEGIN_RECOVER_RSP (133, P2, writeTimeout, IMMEDIATE, () -> accordEmbedded(RecoverySerializers.reply), AccordService::responseHandlerOrNoop ), + ACCORD_BEGIN_RECOVER_REQ (134, P2, writeTimeout, IMMEDIATE, () -> accordEmbedded(RecoverySerializers.request), AccordService::requestHandlerOrNoop, ACCORD_BEGIN_RECOVER_RSP ), + ACCORD_BEGIN_INVALIDATE_RSP (135, P2, writeTimeout, IMMEDIATE, () -> accordEmbedded(BeginInvalidationSerializers.reply), AccordService::responseHandlerOrNoop ), + ACCORD_BEGIN_INVALIDATE_REQ (136, P2, writeTimeout, IMMEDIATE, () -> accordEmbedded(BeginInvalidationSerializers.request), AccordService::requestHandlerOrNoop, ACCORD_BEGIN_INVALIDATE_RSP ), + ACCORD_AWAIT_RSP (137, P2, writeTimeout, IMMEDIATE, () -> accordEmbedded(AwaitSerializers.syncReply), AccordService::responseHandlerOrNoop ), + ACCORD_AWAIT_REQ (138, P2, writeTimeout, IMMEDIATE, () -> accordEmbedded(AwaitSerializers.request), AccordService::requestHandlerOrNoop, ACCORD_AWAIT_RSP ), + ACCORD_AWAIT_ASYNC_RSP_REQ (139, P2, writeTimeout, IMMEDIATE, () -> accordEmbedded(AwaitSerializers.asyncReply), AccordService::requestHandlerOrNoop ), + ACCORD_WAIT_UNTIL_APPLIED_REQ (140, P2, writeTimeout, IMMEDIATE, () -> accordEmbedded(ReadDataSerializers.waitUntilApplied), AccordService::requestHandlerOrNoop, ACCORD_READ_RSP ), + ACCORD_RECOVER_AWAIT_RSP (141, P2, writeTimeout, IMMEDIATE, () -> accordEmbedded(AwaitSerializers.recoverReply), AccordService::responseHandlerOrNoop ), + ACCORD_RECOVER_AWAIT_REQ (142, P2, writeTimeout, IMMEDIATE, () -> accordEmbedded(AwaitSerializers.recoverRequest), AccordService::requestHandlerOrNoop, ACCORD_RECOVER_AWAIT_RSP), + ACCORD_INFORM_DURABLE_REQ (143, P2, writeTimeout, IMMEDIATE, () -> accordEmbedded(InformDurableSerializers.request), AccordService::requestHandlerOrNoop, ACCORD_SIMPLE_RSP ), + ACCORD_CHECK_STATUS_RSP (144, P2, writeTimeout, IMMEDIATE, () -> accordEmbedded(CheckStatusSerializers.reply), AccordService::responseHandlerOrNoop ), + ACCORD_CHECK_STATUS_REQ (145, P2, writeTimeout, IMMEDIATE, () -> accordEmbedded(CheckStatusSerializers.request), AccordService::requestHandlerOrNoop, ACCORD_CHECK_STATUS_RSP ), + ACCORD_FETCH_DATA_RSP (146, P2, writeTimeout, IMMEDIATE, () -> accordEmbedded(FetchSerializers.reply), AccordService::responseHandlerOrNoop ), + ACCORD_FETCH_DATA_REQ (147, P2, writeTimeout, IMMEDIATE, () -> accordEmbedded(FetchSerializers.request), AccordService::requestHandlerOrNoop, ACCORD_FETCH_DATA_RSP ), + ACCORD_GET_EPHMRL_READ_DEPS_RSP (148, P2, readTimeout, IMMEDIATE, () -> accordEmbedded(GetEphmrlReadDepsSerializers.reply), AccordService::responseHandlerOrNoop ), + ACCORD_GET_EPHMRL_READ_DEPS_REQ (149, P2, readTimeout, IMMEDIATE, () -> accordEmbedded(GetEphmrlReadDepsSerializers.request), AccordService::requestHandlerOrNoop, ACCORD_GET_EPHMRL_READ_DEPS_RSP), + ACCORD_GET_LATEST_DEPS_RSP (150, P2, readTimeout, IMMEDIATE, () -> accordEmbedded(LatestDepsSerializers.reply), AccordService::responseHandlerOrNoop ), + ACCORD_GET_LATEST_DEPS_REQ (151, P2, readTimeout, IMMEDIATE, () -> accordEmbedded(LatestDepsSerializers.request), AccordService::requestHandlerOrNoop, ACCORD_GET_LATEST_DEPS_RSP), + ACCORD_GET_MAX_CONFLICT_RSP (152, P2, readTimeout, IMMEDIATE, () -> accordEmbedded(GetMaxConflictSerializers.reply), AccordService::responseHandlerOrNoop ), + ACCORD_GET_MAX_CONFLICT_REQ (153, P2, readTimeout, IMMEDIATE, () -> accordEmbedded(GetMaxConflictSerializers.request), AccordService::requestHandlerOrNoop, ACCORD_GET_MAX_CONFLICT_RSP), + ACCORD_GET_DURABLE_BEFORE_RSP (154, P2, readTimeout, MISC, () -> accordEmbedded(GetDurableBeforeSerializers.reply), AccordService::responseHandlerOrNoop ), + ACCORD_GET_DURABLE_BEFORE_REQ (155, P2, readTimeout, MISC, () -> accordEmbedded(GetDurableBeforeSerializers.request), AccordService::requestHandlerOrNoop, ACCORD_GET_DURABLE_BEFORE_RSP ), + ACCORD_SET_SHARD_DURABLE_REQ (156, P2, rpcTimeout, MISC, () -> accordEmbedded(SetDurableSerializers.shardDurable), AccordService::requestHandlerOrNoop, ACCORD_SIMPLE_RSP ), + ACCORD_SET_GLOBALLY_DURABLE_REQ (157, P2, rpcTimeout, MISC, () -> accordEmbedded(SetDurableSerializers.globallyDurable),AccordService::requestHandlerOrNoop, ACCORD_SIMPLE_RSP ), + + ACCORD_SYNC_NOTIFY_RSP (158, P2, writeTimeout, MISC, () -> accordEmbedded(EnumSerializer.simpleReply), RESPONSE_HANDLER), + ACCORD_SYNC_NOTIFY_REQ (159, P2, writeTimeout, MISC, () -> accordEmbedded(Notification.serializer), () -> AccordSyncPropagator.verbHandler, ACCORD_SYNC_NOTIFY_RSP ), + + CONSENSUS_KEY_MIGRATION (160, P1, writeTimeout, MISC, () -> accordEmbedded(ConsensusKeyMigrationFinished.serializer),() -> ConsensusKeyMigrationState.consensusKeyMigrationFinishedHandler), + + ACCORD_INTEROP_READ_RSP (161, P2, writeTimeout, IMMEDIATE, () -> accordEmbedded(AccordInteropRead.replySerializer), AccordService::responseHandlerOrNoop), + ACCORD_INTEROP_READ_REQ (162, P2, writeTimeout, IMMEDIATE, () -> accordEmbedded(AccordInteropRead.requestSerializer), AccordService::requestHandlerOrNoop, ACCORD_INTEROP_READ_RSP), + ACCORD_INTEROP_STABLE_THEN_READ_REQ(163, P2, writeTimeout, IMMEDIATE, () -> accordEmbedded(AccordInteropStableThenRead.requestSerializer), AccordService::requestHandlerOrNoop, ACCORD_INTEROP_READ_RSP), + ACCORD_INTEROP_READ_REPAIR_RSP (164, P2, writeTimeout, IMMEDIATE, () -> accordEmbedded(AccordInteropReadRepair.replySerializer), AccordService::responseHandlerOrNoop), + ACCORD_INTEROP_READ_REPAIR_REQ (165, P2, writeTimeout, IMMEDIATE, () -> accordEmbedded(AccordInteropReadRepair.requestSerializer), AccordService::requestHandlerOrNoop, ACCORD_INTEROP_READ_REPAIR_RSP), + ACCORD_INTEROP_APPLY_REQ (166, P2, writeTimeout, IMMEDIATE, () -> accordEmbedded(AccordInteropApply.serializer), AccordService::requestHandlerOrNoop, ACCORD_APPLY_RSP), + ACCORD_FETCH_WATERMARKS_RSP (167, P0, shortTimeout, FETCH_METADATA, () -> accordEmbedded(WatermarkCollector.serializer), RESPONSE_HANDLER), + // NoPayload can not be prefixed with accord version as it is special cased in C* messaging + ACCORD_FETCH_WATERMARKS_REQ (168, P0, shortTimeout, FETCH_METADATA, () -> NoPayload.serializer, AccordService::watermarkHandlerOrNoop, ACCORD_FETCH_WATERMARKS_RSP), + ACCORD_FETCH_TOPOLOGY_RSP (169, P0, shortTimeout, FETCH_METADATA, () -> accordEmbedded(FetchTopologies.responseSerializer), RESPONSE_HANDLER), + ACCORD_FETCH_TOPOLOGY_REQ (170, P0, shortTimeout, FETCH_METADATA, () -> accordEmbedded(FetchTopologies.serializer), () -> FetchTopologies.handler, ACCORD_FETCH_TOPOLOGY_RSP), + // generic failure response - FAILURE_RSP (99, P0, noTimeout, REQUEST_RESPONSE, () -> RequestFailureReason.serializer, () -> ResponseVerbHandler.instance ), + FAILURE_RSP (99, P0, noTimeout, REQUEST_RESPONSE, () -> RequestFailure.serializer, RESPONSE_HANDLER ), // dummy verbs _TRACE (30, P1, rpcTimeout, TRACING, () -> NoPayload.serializer, () -> null ), @@ -318,7 +443,7 @@ public enum Kind */ Verb(int id, Priority priority, ToLongFunction expiration, Stage stage, Supplier> serializer, Supplier> handler) { - this(id, priority, expiration, stage, serializer, handler, null); + this(NORMAL, id, priority, expiration, stage, serializer, handler, null); } Verb(int id, Priority priority, ToLongFunction expiration, Stage stage, Supplier> serializer, Supplier> handler, Verb responseVerb) @@ -377,6 +502,11 @@ public long expiresAfterNanos() return expiration.applyAsLong(NANOSECONDS); } + public long expiresAfter(TimeUnit units) + { + return expiration.applyAsLong(units); + } + // this is a little hacky, but reduces the number of parameters up top public boolean isResponse() { @@ -506,12 +636,23 @@ private static int idForCustomVerb(int id) { return CUSTOM_VERB_START - id; } + + private static IVersionedAsymmetricSerializer accordEmbedded(AsymmetricVersionedSerializer delegate) + { + return AccordSerializers.embedded(Version.CLUSTER_SAFE_VERSION, delegate); + } + + private static IVersionedAsymmetricSerializer accordEmbedded(AsymmetricUnversionedSerializer delegate) + { + return accordEmbedded(AsymmetricVersionedSerializer.from(delegate)); + } } @SuppressWarnings("unused") class VerbTimeouts { static final ToLongFunction rpcTimeout = DatabaseDescriptor::getRpcTimeout; + static final ToLongFunction shortTimeout = DatabaseDescriptor::getShortRpcTimeout; static final ToLongFunction writeTimeout = DatabaseDescriptor::getWriteRpcTimeout; static final ToLongFunction readTimeout = DatabaseDescriptor::getReadRpcTimeout; static final ToLongFunction rangeTimeout = DatabaseDescriptor::getRangeRpcTimeout; @@ -534,3 +675,8 @@ class VerbTimeouts return rpcTimeout.applyAsLong(units); }; } + +class ResponseHandlerSupplier +{ + static final Supplier> RESPONSE_HANDLER = () -> ResponseVerbHandler.instance; +} \ No newline at end of file diff --git a/src/java/org/apache/cassandra/repair/AbstractRepairTask.java b/src/java/org/apache/cassandra/repair/AbstractRepairTask.java index f27e72deb177..ff3d5e6f9cbf 100644 --- a/src/java/org/apache/cassandra/repair/AbstractRepairTask.java +++ b/src/java/org/apache/cassandra/repair/AbstractRepairTask.java @@ -24,7 +24,6 @@ import com.google.common.collect.Lists; import com.google.common.util.concurrent.FutureCallback; - import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -59,6 +58,7 @@ private List submitRepairSessions(TimeUUID parentSession, ExecutorPlus executor, Scheduler validationScheduler, List commonRanges, + boolean excludedDeadNodes, String... cfnames) { List futures = new ArrayList<>(options.getRanges().size()); @@ -68,15 +68,17 @@ private List submitRepairSessions(TimeUUID parentSession, logger.info("Starting RepairSession for {}", commonRange); RepairSession session = coordinator.ctx.repair().submitRepairSession(parentSession, commonRange, + excludedDeadNodes, keyspace, options.getParallelism(), isIncremental, options.isPullRepair(), options.getPreviewKind(), options.optimiseStreams(), + options.repairData(), options.repairPaxos(), - options.paxosOnly(), options.dontPurgeTombstones(), + options.repairAccord(), executor, validationScheduler, cfnames); @@ -93,9 +95,10 @@ protected Future runRepair(TimeUUID parentSession, ExecutorPlus executor, Scheduler validationScheduler, List commonRanges, + boolean excludedDeadNodes, String... cfnames) { - List allSessions = submitRepairSessions(parentSession, isIncremental, executor, validationScheduler, commonRanges, cfnames); + List allSessions = submitRepairSessions(parentSession, isIncremental, executor, validationScheduler, commonRanges, excludedDeadNodes, cfnames); List>> ranges = Lists.transform(allSessions, RepairSession::ranges); Future> f = FutureCombiner.successfulOf(allSessions); return f.map(results -> { diff --git a/src/java/org/apache/cassandra/repair/IncrementalRepairTask.java b/src/java/org/apache/cassandra/repair/IncrementalRepairTask.java index 347846cf6b4e..956973c49418 100644 --- a/src/java/org/apache/cassandra/repair/IncrementalRepairTask.java +++ b/src/java/org/apache/cassandra/repair/IncrementalRepairTask.java @@ -64,7 +64,7 @@ public Future performUnsafe(ExecutorPlus executor, Sche CoordinatorSession coordinatorSession = coordinator.ctx.repair().consistent.coordinated.registerSession(parentSession, allParticipants, neighborsAndRanges.shouldExcludeDeadParticipants); - return coordinatorSession.execute(() -> runRepair(parentSession, true, executor, validationScheduler, allRanges, cfnames)); + return coordinatorSession.execute(() -> runRepair(parentSession, true, executor, validationScheduler, allRanges, neighborsAndRanges.shouldExcludeDeadParticipants, cfnames)); } } diff --git a/src/java/org/apache/cassandra/repair/NormalRepairTask.java b/src/java/org/apache/cassandra/repair/NormalRepairTask.java index e304280c5822..05b721d17c61 100644 --- a/src/java/org/apache/cassandra/repair/NormalRepairTask.java +++ b/src/java/org/apache/cassandra/repair/NormalRepairTask.java @@ -27,16 +27,19 @@ public class NormalRepairTask extends AbstractRepairTask { private final TimeUUID parentSession; private final List commonRanges; + private final boolean excludedDeadNodes; private final String[] cfnames; protected NormalRepairTask(RepairCoordinator coordinator, TimeUUID parentSession, List commonRanges, + boolean excludedDeadNodes, String[] cfnames) { super(coordinator); this.parentSession = parentSession; this.commonRanges = commonRanges; + this.excludedDeadNodes = excludedDeadNodes; this.cfnames = cfnames; } @@ -49,6 +52,6 @@ public String name() @Override public Future performUnsafe(ExecutorPlus executor, Scheduler validationScheduler) { - return runRepair(parentSession, false, executor, validationScheduler, commonRanges, cfnames); + return runRepair(parentSession, false, executor, validationScheduler, commonRanges, excludedDeadNodes, cfnames); } } diff --git a/src/java/org/apache/cassandra/repair/PreviewRepairTask.java b/src/java/org/apache/cassandra/repair/PreviewRepairTask.java index 95c7a63f9466..6323d8e2751d 100644 --- a/src/java/org/apache/cassandra/repair/PreviewRepairTask.java +++ b/src/java/org/apache/cassandra/repair/PreviewRepairTask.java @@ -26,6 +26,7 @@ import org.apache.cassandra.concurrent.ExecutorPlus; import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.db.ColumnFamilyStore; import org.apache.cassandra.db.Keyspace; import org.apache.cassandra.dht.Range; import org.apache.cassandra.dht.Token; @@ -42,14 +43,16 @@ public class PreviewRepairTask extends AbstractRepairTask { private final TimeUUID parentSession; private final List commonRanges; + private final boolean excludedDeadNodes; private final String[] cfnames; private volatile String successMessage = name() + " completed successfully"; - protected PreviewRepairTask(RepairCoordinator coordinator, TimeUUID parentSession, List commonRanges, String[] cfnames) + protected PreviewRepairTask(RepairCoordinator coordinator, TimeUUID parentSession, List commonRanges, boolean excludedDeadNodes, String[] cfnames) { super(coordinator); this.parentSession = parentSession; this.commonRanges = commonRanges; + this.excludedDeadNodes = excludedDeadNodes; this.cfnames = cfnames; } @@ -68,7 +71,7 @@ public String successMessage() @Override public Future performUnsafe(ExecutorPlus executor, Scheduler validationScheduler) { - Future f = runRepair(parentSession, false, executor, validationScheduler, commonRanges, cfnames); + Future f = runRepair(parentSession, false, executor, validationScheduler, commonRanges, excludedDeadNodes, cfnames); return f.map(result -> { if (result.hasFailed()) return result; @@ -86,10 +89,10 @@ public Future performUnsafe(ExecutorPlus executor, Sche else { message = (previewKind == PreviewKind.REPAIRED ? "Repaired data is inconsistent\n" : "Preview complete\n") + summary; - RepairMetrics.previewFailures.inc(); if (previewKind == PreviewKind.REPAIRED) maybeSnapshotReplicas(parentSession, keyspace, result.results.get()); // we know its present as summary used it } + emitMetrics(summary); successMessage += "; " + message; coordinator.notification(message); @@ -97,6 +100,21 @@ public Future performUnsafe(ExecutorPlus executor, Sche }); } + private void emitMetrics(SyncStatSummary summary) + { + if (!summary.isEmpty()) + RepairMetrics.previewFailures.inc(); + + summary.getTotals().forEach((key, table) -> { + if (table.isCounter()) + return; + + ColumnFamilyStore cfs = Keyspace.open(key.left).getColumnFamilyStore(key.right); + cfs.metric.tokenRangesPreviewedDesynchronized.mark(table.getRanges()); + cfs.metric.bytesPreviewedDesynchronized.mark(table.getBytes()); + }); + } + private void maybeSnapshotReplicas(TimeUUID parentSession, String keyspace, List results) { if (!DatabaseDescriptor.snapshotOnRepairedDataMismatch()) diff --git a/src/java/org/apache/cassandra/repair/RepairCoordinator.java b/src/java/org/apache/cassandra/repair/RepairCoordinator.java index c4b7a9fd55c9..fa2036b02f13 100644 --- a/src/java/org/apache/cassandra/repair/RepairCoordinator.java +++ b/src/java/org/apache/cassandra/repair/RepairCoordinator.java @@ -39,14 +39,6 @@ import com.google.common.collect.Iterables; import com.google.common.collect.Lists; import com.google.common.collect.Sets; - -import org.apache.cassandra.locator.RangesAtEndpoint; -import org.apache.cassandra.net.Verb; -import org.apache.cassandra.repair.messages.FailSession; -import org.apache.cassandra.repair.messages.RepairMessage; -import org.apache.cassandra.repair.state.ParticipateState; -import org.apache.cassandra.transport.Dispatcher; -import org.apache.cassandra.utils.concurrent.Future; import org.apache.commons.lang3.time.DurationFormatUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -66,12 +58,18 @@ import org.apache.cassandra.exceptions.RepairException; import org.apache.cassandra.locator.EndpointsForRange; import org.apache.cassandra.locator.InetAddressAndPort; +import org.apache.cassandra.locator.RangesAtEndpoint; import org.apache.cassandra.locator.Replica; import org.apache.cassandra.metrics.StorageMetrics; +import org.apache.cassandra.net.Verb; +import org.apache.cassandra.repair.messages.FailSession; +import org.apache.cassandra.repair.messages.RepairMessage; import org.apache.cassandra.repair.messages.RepairOption; import org.apache.cassandra.repair.state.CoordinatorState; +import org.apache.cassandra.repair.state.ParticipateState; import org.apache.cassandra.schema.SchemaConstants; import org.apache.cassandra.schema.SystemDistributedKeyspace; +import org.apache.cassandra.schema.TableMetadata; import org.apache.cassandra.service.ActiveRepairService; import org.apache.cassandra.service.ActiveRepairService.ParentRepairStatus; import org.apache.cassandra.service.ClientState; @@ -80,12 +78,14 @@ import org.apache.cassandra.tracing.TraceKeyspace; import org.apache.cassandra.tracing.TraceState; import org.apache.cassandra.tracing.Tracing; +import org.apache.cassandra.transport.Dispatcher; import org.apache.cassandra.transport.messages.ResultMessage; import org.apache.cassandra.utils.FBUtilities; import org.apache.cassandra.utils.Pair; import org.apache.cassandra.utils.Throwables; import org.apache.cassandra.utils.TimeUUID; import org.apache.cassandra.utils.WrappedRunnable; +import org.apache.cassandra.utils.concurrent.Future; import org.apache.cassandra.utils.progress.ProgressEvent; import org.apache.cassandra.utils.progress.ProgressEventNotifier; import org.apache.cassandra.utils.progress.ProgressEventType; @@ -128,7 +128,7 @@ public RepairCoordinator(StorageService storageService, int cmd, RepairOption op { this.ctx = ctx; this.validationScheduler = Scheduler.build(DatabaseDescriptor.getConcurrentMerkleTreeRequests()); - this.state = new CoordinatorState(ctx.clock(), cmd, keyspace, options); + this.state = new CoordinatorState(ctx, cmd, keyspace, options); this.tag = "repair:" + cmd; this.validColumnFamilies = validColumnFamilies; this.getLocalReplicas = getLocalReplicas; @@ -290,17 +290,35 @@ public void run() } } + private static void validate(RepairOption options, List columnFamilies) + { + if (options.paxosOnly() && options.accordOnly()) + throw new IllegalArgumentException("Cannot specify a repair as both paxos only and accord only"); + + for (ColumnFamilyStore cfs : columnFamilies) + { + TableMetadata metadata = cfs.metadata(); + if (options.paxosOnly() && !metadata.supportsPaxosOperations()) + throw new IllegalArgumentException(String.format("Cannot run paxos only repair on %s.%s, which isn't configured for paxos operations", cfs.keyspace.getName(), cfs.name)); + + if (options.accordOnly() && !metadata.requiresAccordSupport()) + throw new IllegalArgumentException(String.format("Cannot run accord only repair on %s.%s, which isn't configured for accord operations", cfs.keyspace.getName(), cfs.name)); + } + } + private void runMayThrow() throws Throwable { state.phase.setup(); ctx.repair().recordRepairStatus(state.cmd, ParentRepairStatus.IN_PROGRESS, ImmutableList.of()); List columnFamilies = getColumnFamilies(); + validate(state.options, columnFamilies); String[] cfnames = columnFamilies.stream().map(cfs -> cfs.name).toArray(String[]::new); this.traceState = maybeCreateTraceState(columnFamilies); notifyStarting(); NeighborsAndRanges neighborsAndRanges = getNeighborsAndRanges(); + // We test to validate the start JMX notification is seen before we compute neighbors and ranges // but in state (vtable) tracking, we rely on getNeighborsAndRanges to know where we are running repair... // JMX start != state start, its possible we fail in getNeighborsAndRanges and state start is never reached @@ -478,7 +496,7 @@ private Future>> repair(String[] RepairTask task; if (state.options.isPreview()) { - task = new PreviewRepairTask(this, state.id, neighborsAndRanges.filterCommonRanges(state.keyspace, cfnames), cfnames); + task = new PreviewRepairTask(this, state.id, neighborsAndRanges.filterCommonRanges(state.keyspace, cfnames), neighborsAndRanges.shouldExcludeDeadParticipants, cfnames); } else if (state.options.isIncremental()) { @@ -486,7 +504,7 @@ else if (state.options.isIncremental()) } else { - task = new NormalRepairTask(this, state.id, neighborsAndRanges.filterCommonRanges(state.keyspace, cfnames), cfnames); + task = new NormalRepairTask(this, state.id, neighborsAndRanges.filterCommonRanges(state.keyspace, cfnames), neighborsAndRanges.shouldExcludeDeadParticipants, cfnames); } ExecutorPlus executor = createExecutor(); diff --git a/src/java/org/apache/cassandra/repair/RepairJob.java b/src/java/org/apache/cassandra/repair/RepairJob.java index 7b60df94160c..f20fe189f22f 100644 --- a/src/java/org/apache/cassandra/repair/RepairJob.java +++ b/src/java/org/apache/cassandra/repair/RepairJob.java @@ -17,7 +17,14 @@ */ package org.apache.cassandra.repair; -import java.util.*; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Queue; import java.util.concurrent.CopyOnWriteArrayList; import java.util.concurrent.Executor; import java.util.function.Function; @@ -28,15 +35,11 @@ import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Preconditions; import com.google.common.collect.ImmutableMap; -import com.google.common.util.concurrent.*; - -import org.apache.cassandra.schema.Schema; -import org.apache.cassandra.schema.TableMetadata; -import org.apache.cassandra.repair.state.JobState; -import org.apache.cassandra.utils.concurrent.AsyncFuture; +import com.google.common.util.concurrent.FutureCallback; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import accord.primitives.Ranges; import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.db.ColumnFamilyStore; import org.apache.cassandra.db.Keyspace; @@ -47,17 +50,25 @@ import org.apache.cassandra.repair.asymmetric.HostDifferences; import org.apache.cassandra.repair.asymmetric.PreferedNodeFilter; import org.apache.cassandra.repair.asymmetric.ReduceHelper; +import org.apache.cassandra.repair.state.JobState; import org.apache.cassandra.schema.SystemDistributedKeyspace; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.service.accord.repair.AccordRepair; +import org.apache.cassandra.service.consensus.migration.ConsensusMigrationRepairResult; import org.apache.cassandra.service.paxos.cleanup.PaxosCleanup; import org.apache.cassandra.streaming.PreviewKind; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.tcm.Epoch; import org.apache.cassandra.tracing.Tracing; import org.apache.cassandra.utils.FBUtilities; import org.apache.cassandra.utils.MerkleTrees; import org.apache.cassandra.utils.Pair; +import org.apache.cassandra.utils.concurrent.AsyncFuture; import org.apache.cassandra.utils.concurrent.Future; import org.apache.cassandra.utils.concurrent.FutureCombiner; import org.apache.cassandra.utils.concurrent.ImmediateFuture; +import static com.google.common.util.concurrent.Futures.getUnchecked; import static org.apache.cassandra.config.DatabaseDescriptor.paxosRepairEnabled; import static org.apache.cassandra.schema.SchemaConstants.METADATA_KEYSPACE_NAME; import static org.apache.cassandra.service.paxos.Paxos.useV2; @@ -69,6 +80,8 @@ public class RepairJob extends AsyncFuture implements Runnable { private static final Logger logger = LoggerFactory.getLogger(RepairJob.class); + protected final Keyspace ks; + protected final ColumnFamilyStore cfs; private final SharedContext ctx; public final JobState state; private final RepairJobDesc desc; @@ -94,7 +107,17 @@ public RepairJob(RepairSession session, String columnFamily) this.taskExecutor = session.taskExecutor; this.parallelismDegree = session.parallelismDegree; this.desc = new RepairJobDesc(session.state.parentRepairSession, session.getId(), session.state.keyspace, columnFamily, session.state.commonRange.ranges); + this.ks = Keyspace.open(desc.keyspace); + this.cfs = ks.getColumnFamilyStore(columnFamily); this.state = new JobState(ctx.clock(), desc, session.state.commonRange.endpoints); + + TableMetadata metadata = this.cfs.metadata(); + if ((!session.repairData && !session.repairAccord) && !metadata.supportsPaxosOperations()) + throw new IllegalArgumentException(String.format("Cannot run paxos only repair on %s.%s, which isn't configured for paxos operations", cfs.keyspace.getName(), cfs.name)); + + if ((!session.repairData && !session.repairPaxos) && !metadata.requiresAccordSupport()) + throw new IllegalArgumentException(String.format("Cannot run accord only repair on %s.%s, which isn't configured for accord operations", cfs.keyspace.getName(), cfs.name)); + } public long getNowInSeconds() @@ -110,26 +133,38 @@ public long getNowInSeconds() } } + @Override + public void run() + { + state.phase.start(); + cfs.metric.repairsStarted.inc(); + runRepair(); + } + /** * Runs repair job. *

* This sets up necessary task and runs them on given {@code taskExecutor}. * After submitting all tasks, waits until validation with replica completes. */ - public void run() + protected void runRepair() { - state.phase.start(); - Keyspace ks = Keyspace.open(desc.keyspace); - ColumnFamilyStore cfs = ks.getColumnFamilyStore(desc.columnFamily); - cfs.metric.repairsStarted.inc(); List allEndpoints = new ArrayList<>(session.state.commonRange.endpoints); allEndpoints.add(ctx.broadcastAddressAndPort()); + TableMetadata metadata = cfs.metadata(); Future paxosRepair; - if (paxosRepairEnabled() && (((useV2() || isMetadataKeyspace()) && session.repairPaxos) || session.paxosOnly)) + Epoch repairStartingEpoch = ClusterMetadata.current().epoch; + + Preconditions.checkArgument(session.repairData || session.repairPaxos || session.repairAccord); + boolean doPaxosRepair = paxosRepairEnabled() + && ((useV2() || isMetadataKeyspace()) && session.repairPaxos) + && metadata.supportsPaxosOperations(); + boolean doAccordRepair = metadata.requiresAccordSupport() && session.repairAccord; + + if (doPaxosRepair) { logger.info("{} {}.{} starting paxos repair", session.previewKind.logPrefix(session.getId()), desc.keyspace, desc.columnFamily); - TableMetadata metadata = Schema.instance.getTableMetadata(desc.keyspace, desc.columnFamily); paxosRepair = PaxosCleanup.cleanup(ctx, allEndpoints, metadata, desc.ranges, session.state.commonRange.hasSkippedReplicas, taskExecutor); } else @@ -138,80 +173,101 @@ public void run() paxosRepair = ImmediateFuture.success(null); } - if (session.paxosOnly) + Future accordRepair; + if (doAccordRepair) { - paxosRepair.addCallback(new FutureCallback<>() - { - public void onSuccess(Void v) - { - logger.info("{} {}.{} paxos repair completed", session.previewKind.logPrefix(session.getId()), desc.keyspace, desc.columnFamily); - trySuccess(new RepairResult(desc, Collections.emptyList())); - } - - /** - * Snapshot, validation and sync failures are all handled here - */ - public void onFailure(Throwable t) + accordRepair = paxosRepair.flatMap(unused -> { + boolean requireAllEndpoints; + // If the session excluded dead nodes it's not eligible for migration and is not supposed to occur at ALL anyways + if (session.excludedDeadNodes) + requireAllEndpoints = false; + else { - logger.warn("{} {}.{} paxos repair failed", session.previewKind.logPrefix(session.getId()), desc.keyspace, desc.columnFamily); - tryFailure(t); + // If the session is doing a data repair (which flushes sstables if not incremental) we can do the barriers at QUORUM + if (session.repairData && !session.isIncremental) + requireAllEndpoints = false; + else + requireAllEndpoints = true; } + logger.info("{} {}.{} starting accord repair, require all endpoints {}", session.previewKind.logPrefix(session.getId()), desc.keyspace, desc.columnFamily, requireAllEndpoints); + AccordRepair repair = new AccordRepair(ctx, cfs, desc.sessionId, desc.keyspace, desc.ranges, requireAllEndpoints, allEndpoints); + return repair.repair(taskExecutor); }, taskExecutor); - return; + } + else + { + accordRepair = paxosRepair.flatMap(unused -> { + logger.info("{} {}.{} not running accord repair", session.previewKind.logPrefix(session.getId()), desc.keyspace, desc.columnFamily); + return ImmediateFuture.success(null); + }); } - // Create a snapshot at all nodes unless we're using pure parallel repairs - final Future allSnapshotTasks; - if (parallelismDegree != RepairParallelism.PARALLEL) + Future> syncResults; + if (session.repairData) { - if (session.isIncremental) + // Create a snapshot at all nodes unless we're using pure parallel repairs + final Future allSnapshotTasks; + if (parallelismDegree != RepairParallelism.PARALLEL) { - // consistent repair does it's own "snapshotting" - allSnapshotTasks = paxosRepair.map(input -> allEndpoints); + if (session.isIncremental) + { + // consistent repair does it's own "snapshotting" + allSnapshotTasks = accordRepair.map(input -> allEndpoints); + } + else + { + // Request snapshot to all replica + allSnapshotTasks = accordRepair.flatMap(input -> { + List> snapshotTasks = new ArrayList<>(allEndpoints.size()); + state.phase.snapshotsSubmitted(); + for (InetAddressAndPort endpoint : allEndpoints) + { + SnapshotTask snapshotTask = new SnapshotTask(ctx, desc, endpoint); + snapshotTasks.add(snapshotTask); + taskExecutor.execute(snapshotTask); + } + return FutureCombiner.allOf(snapshotTasks).map(a -> { + state.phase.snapshotsCompleted(); + return a; + }); + }); + } } else { - // Request snapshot to all replica - allSnapshotTasks = paxosRepair.flatMap(input -> { - List> snapshotTasks = new ArrayList<>(allEndpoints.size()); - state.phase.snapshotsSubmitted(); - for (InetAddressAndPort endpoint : allEndpoints) - { - SnapshotTask snapshotTask = new SnapshotTask(ctx, desc, endpoint); - snapshotTasks.add(snapshotTask); - taskExecutor.execute(snapshotTask); - } - return FutureCombiner.allOf(snapshotTasks).map(a -> { - state.phase.snapshotsCompleted(); - return a; - }); - }); + allSnapshotTasks = null; } + + // Run validations and the creation of sync tasks in the scheduler, so it can limit the number of Merkle trees + // that there are in memory at once. When all validations complete, submit sync tasks out of the scheduler. + syncResults = session.validationScheduler.schedule(() -> createSyncTasks(accordRepair, allSnapshotTasks, allEndpoints), taskExecutor) + .flatMap(this::executeTasks, taskExecutor); } else { - allSnapshotTasks = null; + syncResults = accordRepair.flatMap(unused -> { + logger.info("{} {}.{} not running data repair", session.previewKind.logPrefix(session.getId()), desc.keyspace, desc.columnFamily); + return ImmediateFuture.success(Collections.emptyList()); + }); } - // Run validations and the creation of sync tasks in the scheduler, so it can limit the number of Merkle trees - // that there are in memory at once. When all validations complete, submit sync tasks out of the scheduler. - Future> syncResults = session.validationScheduler.schedule(() -> createSyncTasks(paxosRepair, allSnapshotTasks, allEndpoints), taskExecutor) - .flatMap(this::executeTasks, taskExecutor); - // When all sync complete, set the final result syncResults.addCallback(new FutureCallback<>() { @Override public void onSuccess(List stats) { + logger.info("{} {}.{} Successfully did repair repairData {}, repairPaxos {}, repairAccord {}, excludedDeadNodes {}", session.previewKind.logPrefix(session.getId()), desc.keyspace, desc.columnFamily, session.repairData, session.repairPaxos, session.repairAccord, session.excludedDeadNodes); state.phase.success(); - if (!session.previewKind.isPreview()) + if (!session.previewKind.isPreview() && session.repairData) { logger.info("{} {}.{} is fully synced", session.previewKind.logPrefix(session.getId()), desc.keyspace, desc.columnFamily); SystemDistributedKeyspace.successfulRepairJob(session.getId(), desc.keyspace, desc.columnFamily); } cfs.metric.repairsCompleted.inc(); - trySuccess(new RepairResult(desc, stats)); + logger.info("Completing repair with excludedDeadNodes {}", session.excludedDeadNodes); + ConsensusMigrationRepairResult cmrs = ConsensusMigrationRepairResult.fromRepair(repairStartingEpoch, getUnchecked(accordRepair), session.repairData, doPaxosRepair, doAccordRepair, session.excludedDeadNodes); + trySuccess(new RepairResult(desc, stats, cmrs)); } /** @@ -220,10 +276,11 @@ public void onSuccess(List stats) @Override public void onFailure(Throwable t) { + logger.info("{} {}.{} Failed repair repairData {}, repairPaxos {}, repairAccord {}, excludedDeadNodes {}", session.previewKind.logPrefix(session.getId()), desc.keyspace, desc.columnFamily, session.repairData, session.repairPaxos, session.repairAccord, session.excludedDeadNodes); state.phase.fail(t); abort(t); - if (!session.previewKind.isPreview()) + if (!session.previewKind.isPreview() && session.repairData) { logger.warn("{} {}.{} sync failed", session.previewKind.logPrefix(session.getId()), desc.keyspace, desc.columnFamily); SystemDistributedKeyspace.failedRepairJob(session.getId(), desc.keyspace, desc.columnFamily, t); @@ -236,7 +293,7 @@ public void onFailure(Throwable t) }, taskExecutor); } - private Future> createSyncTasks(Future paxosRepair, Future allSnapshotTasks, List allEndpoints) + private Future> createSyncTasks(Future accordRepair, Future allSnapshotTasks, List allEndpoints) { Future> treeResponses; if (allSnapshotTasks != null) @@ -252,7 +309,7 @@ private Future> createSyncTasks(Future paxosRepair, Future< else { // If not sequential, just send validation request to all replica - treeResponses = paxosRepair.flatMap(input -> sendValidationRequest(allEndpoints)); + treeResponses = accordRepair.flatMap(input -> sendValidationRequest(allEndpoints)); } treeResponses = treeResponses.map(a -> { diff --git a/src/java/org/apache/cassandra/repair/RepairMessageVerbHandler.java b/src/java/org/apache/cassandra/repair/RepairMessageVerbHandler.java index f7771260195a..ea9742609fed 100644 --- a/src/java/org/apache/cassandra/repair/RepairMessageVerbHandler.java +++ b/src/java/org/apache/cassandra/repair/RepairMessageVerbHandler.java @@ -17,10 +17,12 @@ */ package org.apache.cassandra.repair; -import java.util.*; +import java.util.ArrayList; +import java.util.List; import java.util.function.BiFunction; import java.util.function.Function; +import org.apache.cassandra.config.DatabaseDescriptor; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -28,7 +30,14 @@ import org.apache.cassandra.locator.InetAddressAndPort; import org.apache.cassandra.net.IVerbHandler; import org.apache.cassandra.net.Message; -import org.apache.cassandra.repair.messages.*; +import org.apache.cassandra.repair.messages.CleanupMessage; +import org.apache.cassandra.repair.messages.FailSession; +import org.apache.cassandra.repair.messages.PrepareMessage; +import org.apache.cassandra.repair.messages.RepairMessage; +import org.apache.cassandra.repair.messages.StatusRequest; +import org.apache.cassandra.repair.messages.StatusResponse; +import org.apache.cassandra.repair.messages.SyncRequest; +import org.apache.cassandra.repair.messages.ValidationRequest; import org.apache.cassandra.repair.state.AbstractCompletable; import org.apache.cassandra.repair.state.AbstractState; import org.apache.cassandra.repair.state.Completable; @@ -84,8 +93,12 @@ private PreviewKind previewKind(TimeUUID sessionID) throws NoSuchRepairSessionEx return prs != null ? prs.previewKind : PreviewKind.NONE; } + @Override public void doVerb(final Message message) { + if (DatabaseDescriptor.getAccordTransactionsEnabled() + && ctx.cms().maybeFetchLogFromPeerOrCMSAsync(ctx.messaging(), message, () -> doVerb(message))) + return; // TODO add cancel/interrupt message RepairJobDesc desc = message.payload.desc; try @@ -110,6 +123,13 @@ public void doVerb(final Message message) sendFailureResponse(message); return; } + if (!ActiveRepairService.verifyDiskHeadroomThreshold(prepareMessage.parentRepairSession, prepareMessage.previewKind, prepareMessage.isIncremental)) + { + // error is logged in verifyDiskHeadroomThreshold + state.phase.fail("Not enough disk headroom to perform incremental repair"); + sendFailureResponse(message); + return; + } List columnFamilyStores = new ArrayList<>(prepareMessage.tableIds.size()); for (TableId tableId : prepareMessage.tableIds) diff --git a/src/java/org/apache/cassandra/repair/RepairResult.java b/src/java/org/apache/cassandra/repair/RepairResult.java index 333b48ad33e7..6c04f6be0760 100644 --- a/src/java/org/apache/cassandra/repair/RepairResult.java +++ b/src/java/org/apache/cassandra/repair/RepairResult.java @@ -19,6 +19,8 @@ import java.util.List; +import org.apache.cassandra.service.consensus.migration.ConsensusMigrationRepairResult; + /** * RepairJob's result */ @@ -26,10 +28,12 @@ public class RepairResult { public final RepairJobDesc desc; public final List stats; + public final ConsensusMigrationRepairResult consensusMigrationRepairResult; - public RepairResult(RepairJobDesc desc, List stats) + public RepairResult(RepairJobDesc desc, List stats, ConsensusMigrationRepairResult consensusMigrationRepairResult) { this.desc = desc; this.stats = stats; + this.consensusMigrationRepairResult = consensusMigrationRepairResult; } } diff --git a/src/java/org/apache/cassandra/repair/RepairSession.java b/src/java/org/apache/cassandra/repair/RepairSession.java index 92d56390fe3a..3285fec9dbc3 100644 --- a/src/java/org/apache/cassandra/repair/RepairSession.java +++ b/src/java/org/apache/cassandra/repair/RepairSession.java @@ -30,13 +30,11 @@ import java.util.concurrent.Executor; import java.util.concurrent.RejectedExecutionException; import java.util.concurrent.atomic.AtomicBoolean; - import javax.annotation.Nullable; import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.Lists; -import com.google.common.util.concurrent.*; - +import com.google.common.util.concurrent.FutureCallback; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -48,7 +46,9 @@ import org.apache.cassandra.dht.Range; import org.apache.cassandra.dht.Token; import org.apache.cassandra.exceptions.RepairException; -import org.apache.cassandra.gms.*; +import org.apache.cassandra.gms.EndpointState; +import org.apache.cassandra.gms.IEndpointStateChangeSubscriber; +import org.apache.cassandra.gms.IFailureDetectionEventListener; import org.apache.cassandra.locator.InetAddressAndPort; import org.apache.cassandra.net.Message; import org.apache.cassandra.repair.consistent.ConsistentSession; @@ -59,6 +59,7 @@ import org.apache.cassandra.repair.state.SessionState; import org.apache.cassandra.schema.SystemDistributedKeyspace; import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.service.consensus.migration.ConsensusTableMigration; import org.apache.cassandra.streaming.PreviewKind; import org.apache.cassandra.tracing.Tracing; import org.apache.cassandra.utils.FBUtilities; @@ -119,9 +120,11 @@ public class RepairSession extends AsyncFuture implements I /** Range to repair */ public final boolean isIncremental; public final PreviewKind previewKind; - public final boolean repairPaxos; - public final boolean paxosOnly; + public final boolean repairData; + public final boolean repairPaxos; // TODO (now): rename to repairPaxosIfSupported + public final boolean repairAccord; public final boolean dontPurgeTombstones; + public final boolean excludedDeadNodes; private final AtomicBoolean isFailed = new AtomicBoolean(false); @@ -141,36 +144,40 @@ public class RepairSession extends AsyncFuture implements I /** * Create new repair session. - * @param parentRepairSession the parent sessions id - * @param commonRange ranges to repair - * @param keyspace name of keyspace - * @param parallelismDegree specifies the degree of parallelism when calculating the merkle trees - * @param pullRepair true if the repair should be one way (from remote host to this host and only applicable between two hosts--see RepairOption) - * @param repairPaxos true if incomplete paxos operations should be completed as part of repair - * @param paxosOnly true if we should only complete paxos operations, not run a normal repair - * @param cfnames names of columnfamilies + * + * @param parentRepairSession the parent sessions id + * @param commonRange ranges to repair + * @param excludedDeadNodes Was the repair started for --force and were dead nodes excluded as a result + * @param keyspace name of keyspace + * @param parallelismDegree specifies the degree of parallelism when calculating the merkle trees + * @param pullRepair true if the repair should be one way (from remote host to this host and only applicable between two hosts--see RepairOption) + * @param repairPaxos true if incomplete paxos operations should be completed as part of repair + * @param cfnames names of columnfamilies */ public RepairSession(SharedContext ctx, Scheduler validationScheduler, TimeUUID parentRepairSession, CommonRange commonRange, + boolean excludedDeadNodes, String keyspace, RepairParallelism parallelismDegree, boolean isIncremental, boolean pullRepair, PreviewKind previewKind, boolean optimiseStreams, + boolean repairData, boolean repairPaxos, - boolean paxosOnly, boolean dontPurgeTombstones, + boolean repairAccord, String... cfnames) { this.ctx = ctx; this.validationScheduler = validationScheduler; + this.repairData = repairData; this.repairPaxos = repairPaxos; - this.paxosOnly = paxosOnly; + this.repairAccord = repairAccord; assert cfnames.length > 0 : "Repairing no column families seems pointless, doesn't it"; - this.state = new SessionState(ctx.clock(), parentRepairSession, keyspace, cfnames, commonRange); + this.state = new SessionState(ctx, parentRepairSession, keyspace, cfnames, commonRange); this.parallelismDegree = parallelismDegree; this.isIncremental = isIncremental; this.previewKind = previewKind; @@ -178,6 +185,7 @@ public RepairSession(SharedContext ctx, this.optimiseStreams = optimiseStreams; this.dontPurgeTombstones = dontPurgeTombstones; this.taskExecutor = new SafeExecutor(createExecutor(ctx)); + this.excludedDeadNodes = excludedDeadNodes; } @VisibleForTesting @@ -300,7 +308,7 @@ public void start(ExecutorPlus executor) logger.info("{} parentSessionId = {}: new session: will sync {} on range {} for {}.{}", previewKind.logPrefix(getId()), state.parentRepairSession, repairedNodes(), state.commonRange, state.keyspace, Arrays.toString(state.cfnames)); Tracing.traceRepair("Syncing range {}", state.commonRange); - if (!previewKind.isPreview() && !paxosOnly) + if (!previewKind.isPreview() && repairData) { SystemDistributedKeyspace.startRepairs(getId(), state.parentRepairSession, state.keyspace, state.cfnames, state.commonRange); } @@ -342,6 +350,8 @@ public void start(ExecutorPlus executor) for (String cfname : state.cfnames) { RepairJob job = new RepairJob(this, cfname); + // Repairs can drive forward progress for consensus migration so always check + job.addCallback(ConsensusTableMigration.completedRepairJobHandler); state.register(job.state); executor.execute(job); jobs.add(job); @@ -452,6 +462,11 @@ public void onIRStateChange(LocalSession session) } } + public boolean accordOnly() + { + return repairData && repairAccord && !repairPaxos; + } + private boolean includesTables(Set tableIds) { Keyspace ks = Keyspace.open(state.keyspace); diff --git a/src/java/org/apache/cassandra/repair/SharedContext.java b/src/java/org/apache/cassandra/repair/SharedContext.java index 54ec4214570f..790fe94886b9 100644 --- a/src/java/org/apache/cassandra/repair/SharedContext.java +++ b/src/java/org/apache/cassandra/repair/SharedContext.java @@ -39,9 +39,11 @@ import org.apache.cassandra.service.ActiveRepairService; import org.apache.cassandra.service.paxos.cleanup.PaxosRepairState; import org.apache.cassandra.streaming.StreamPlan; +import org.apache.cassandra.tcm.ClusterMetadataService; import org.apache.cassandra.utils.Clock; import org.apache.cassandra.utils.FBUtilities; import org.apache.cassandra.utils.MBeanWrapper; +import org.apache.cassandra.utils.TimeUUID; /** * Access methods to shared resources and services. @@ -50,6 +52,7 @@ * * See {@link Global#instance} for the main production path */ +// TODO (required, clarity): move under Util since this is a class with shared logic public interface SharedContext { InetAddressAndPort broadcastAddressAndPort(); @@ -81,6 +84,15 @@ public MessageDelivery messaging() TableRepairManager repairManager(ColumnFamilyStore store); StreamExecutor streamExecutor(); PaxosRepairState paxosRepairState(); + default Supplier timeUUID() + { + return TimeUUID.Generator::nextTimeUUID; + } + + default ClusterMetadataService cms() + { + return ClusterMetadataService.instance(); + } class Global implements SharedContext { diff --git a/src/java/org/apache/cassandra/repair/SnapshotTask.java b/src/java/org/apache/cassandra/repair/SnapshotTask.java index ad45070cfb27..a95e0668d81e 100644 --- a/src/java/org/apache/cassandra/repair/SnapshotTask.java +++ b/src/java/org/apache/cassandra/repair/SnapshotTask.java @@ -19,10 +19,10 @@ import java.util.concurrent.RunnableFuture; -import org.apache.cassandra.exceptions.RequestFailureReason; +import org.apache.cassandra.exceptions.RequestFailure; import org.apache.cassandra.locator.InetAddressAndPort; -import org.apache.cassandra.net.RequestCallback; import org.apache.cassandra.net.Message; +import org.apache.cassandra.net.RequestCallback; import org.apache.cassandra.repair.messages.RepairMessage; import org.apache.cassandra.repair.messages.SnapshotMessage; import org.apache.cassandra.utils.concurrent.AsyncFuture; @@ -81,9 +81,9 @@ public boolean invokeOnFailure() } @Override - public void onFailure(InetAddressAndPort from, RequestFailureReason failureReason) + public void onFailure(InetAddressAndPort from, RequestFailure failure) { - task.tryFailure(new RuntimeException("Could not create snapshot at " + from + "; " + failureReason)); + task.tryFailure(new RuntimeException("Could not create snapshot at " + from + "; " + failure.reason)); } } } diff --git a/src/java/org/apache/cassandra/repair/ValidationManager.java b/src/java/org/apache/cassandra/repair/ValidationManager.java index ca7ad3a68eea..f4229751984c 100644 --- a/src/java/org/apache/cassandra/repair/ValidationManager.java +++ b/src/java/org/apache/cassandra/repair/ValidationManager.java @@ -37,6 +37,7 @@ import org.apache.cassandra.metrics.TableMetrics; import org.apache.cassandra.metrics.TopPartitionTracker; import org.apache.cassandra.repair.state.ValidationState; +import org.apache.cassandra.streaming.PreviewKind; import org.apache.cassandra.utils.Clock; import org.apache.cassandra.utils.FBUtilities; import org.apache.cassandra.utils.MerkleTree; @@ -143,6 +144,10 @@ public static void doValidation(ColumnFamilyStore cfs, Validator validator) thro { cfs.metric.bytesValidated.update(state.estimatedTotalBytes); cfs.metric.partitionsValidated.update(state.partitionsProcessed); + if (validator.getPreviewKind() != PreviewKind.NONE) + { + cfs.metric.bytesPreviewed.mark(state.estimatedTotalBytes); + } if (topPartitionCollector != null) cfs.topPartitions.merge(topPartitionCollector); } diff --git a/src/java/org/apache/cassandra/repair/autorepair/AutoRepair.java b/src/java/org/apache/cassandra/repair/autorepair/AutoRepair.java new file mode 100644 index 000000000000..10d18fae639f --- /dev/null +++ b/src/java/org/apache/cassandra/repair/autorepair/AutoRepair.java @@ -0,0 +1,603 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.repair.autorepair; + +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.EnumMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.UUID; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeUnit; +import java.util.function.BiConsumer; +import java.util.function.Consumer; +import java.util.function.Supplier; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.Lists; +import com.google.common.util.concurrent.Uninterruptibles; + +import org.apache.cassandra.repair.RepairCoordinator; +import org.apache.cassandra.service.StorageService; +import org.apache.cassandra.config.DurationSpec; +import org.apache.cassandra.utils.Clock; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.cassandra.concurrent.ScheduledExecutorPlus; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.db.ColumnFamilyStore; +import org.apache.cassandra.db.Keyspace; +import org.apache.cassandra.dht.Range; +import org.apache.cassandra.dht.Token; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.schema.Tables; +import org.apache.cassandra.service.AutoRepairService; +import org.apache.cassandra.utils.FBUtilities; +import org.apache.cassandra.repair.autorepair.AutoRepairUtils.RepairTurn; +import org.apache.cassandra.utils.concurrent.Condition; +import org.apache.cassandra.utils.concurrent.Future; +import org.apache.cassandra.utils.progress.ProgressEvent; +import org.apache.cassandra.utils.progress.ProgressEventType; +import org.apache.cassandra.utils.progress.ProgressListener; + +import static org.apache.cassandra.concurrent.ExecutorFactory.Global.executorFactory; +import static org.apache.cassandra.repair.autorepair.AutoRepairUtils.RepairTurn.MY_TURN; +import static org.apache.cassandra.repair.autorepair.AutoRepairUtils.RepairTurn.MY_TURN_DUE_TO_PRIORITY; +import static org.apache.cassandra.repair.autorepair.AutoRepairUtils.RepairTurn.MY_TURN_FORCE_REPAIR; +import static org.apache.cassandra.utils.concurrent.Condition.newOneTimeCondition; + +/** + * AutoRepair scheduler responsible for running different types of repairs. + */ +public class AutoRepair +{ + private static final Logger logger = LoggerFactory.getLogger(AutoRepair.class); + private static final SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss,SSS"); + + @VisibleForTesting + protected static Supplier timeFunc = Clock.Global::currentTimeMillis; + + // Sleep for 5 seconds if repair finishes quickly to flush JMX metrics; it happens only for Cassandra nodes with tiny amount of data. + public static DurationSpec.IntSecondsBound SLEEP_IF_REPAIR_FINISHES_QUICKLY = new DurationSpec.IntSecondsBound("5s"); + + @VisibleForTesting + public Map repairStates; + + @VisibleForTesting + protected Map repairExecutors; + + protected Map repairRunnableExecutors; + + @VisibleForTesting + // Auto-repair is likely to be run on multiple nodes independently, we want to avoid running multiple repair + // sessions on overlapping datasets at the same time. Shuffling keyspaces reduces the likelihood of this happening. + protected static Consumer> shuffleFunc = java.util.Collections::shuffle; + + @VisibleForTesting + protected static BiConsumer sleepFunc = Uninterruptibles::sleepUninterruptibly; + + @VisibleForTesting + public boolean isSetupDone = false; + public static AutoRepair instance = new AutoRepair(); + + public volatile boolean isShutDown = false; + + private AutoRepair() + { + // Private constructor to prevent instantiation + } + + public void setup() + { + // Ensure setup is done only once; this is only for unit tests + // For production, this method should be called only once. + synchronized (this) + { + if (isSetupDone) + { + return; + } + repairExecutors = new EnumMap<>(AutoRepairConfig.RepairType.class); + repairRunnableExecutors = new EnumMap<>(AutoRepairConfig.RepairType.class); + repairStates = new EnumMap<>(AutoRepairConfig.RepairType.class); + for (AutoRepairConfig.RepairType repairType : AutoRepairConfig.RepairType.values()) + { + repairExecutors.put(repairType, executorFactory().scheduled(false, "AutoRepair-Repair-" + repairType.getConfigName(), Thread.NORM_PRIORITY)); + repairRunnableExecutors.put(repairType, executorFactory().scheduled(false, "AutoRepair-RepairRunnable-" + repairType.getConfigName(), Thread.NORM_PRIORITY)); + repairStates.put(repairType, AutoRepairConfig.RepairType.getAutoRepairState(repairType)); + } + + AutoRepairConfig config = DatabaseDescriptor.getAutoRepairConfig(); + AutoRepairUtils.setup(); + + for (AutoRepairConfig.RepairType repairType : AutoRepairConfig.RepairType.values()) + { + if (config.isAutoRepairEnabled(repairType)) + AutoRepairService.instance.checkCanRun(repairType); + + repairExecutors.get(repairType).scheduleWithFixedDelay( + () -> repair(repairType), + config.getInitialSchedulerDelay(repairType).toSeconds(), + config.getRepairCheckInterval().toSeconds(), + TimeUnit.SECONDS); + } + isSetupDone = true; + } + } + + /** + * @return The current observed system time in ms. + */ + public long currentTimeMs() + { + return timeFunc.get(); + } + + // repair runs a repair session of the given type synchronously. + public void repair(AutoRepairConfig.RepairType repairType) + { + AutoRepairConfig config = AutoRepairService.instance.getAutoRepairConfig(); + if (!config.isAutoRepairEnabled(repairType)) + { + logger.debug("Auto-repair is disabled for repair type {}", repairType); + return; + } + AutoRepairService.instance.checkCanRun(repairType); + AutoRepairState repairState = repairStates.get(repairType); + try + { + String localDC = DatabaseDescriptor.getLocalDataCenter(); + if (config.getIgnoreDCs(repairType).contains(localDC)) + { + logger.info("Not running repair as this node belongs to datacenter {}", localDC); + return; + } + + // refresh the longest unrepaired node + repairState.setLongestUnrepairedNode(AutoRepairUtils.getHostWithLongestUnrepairTime(repairType)); + + //consistency level to use for local query + UUID myId = StorageService.instance.getHostIdForEndpoint(FBUtilities.getBroadcastAddressAndPort()); + + // If it's too soon to run repair, don't bother checking if it's our turn. + if (tooSoonToRunRepair(repairType, repairState, config, myId)) + { + return; + } + + RepairTurn turn = AutoRepairUtils.myTurnToRunRepair(repairType, myId); + if (turn == MY_TURN || turn == MY_TURN_DUE_TO_PRIORITY || turn == MY_TURN_FORCE_REPAIR) + { + repairState.recordTurn(turn); + // For normal auto repair, we will use primary range only repairs (Repair with -pr option). + // For some cases, we may set the auto_repair_primary_token_range_only flag to false then we will do repair + // without -pr. We may also do force repair for certain node that we want to repair all the data on one node + // When doing force repair, we want to repair without -pr. + boolean primaryRangeOnly = config.getRepairPrimaryTokenRangeOnly(repairType) + && turn != MY_TURN_FORCE_REPAIR; + + long startTimeInMillis = timeFunc.get(); + logger.info("My host id: {}, my turn to run repair...repair primary-ranges only? {}", myId, + config.getRepairPrimaryTokenRangeOnly(repairType)); + AutoRepairUtils.updateStartAutoRepairHistory(repairType, myId, timeFunc.get(), turn); + + repairState.setRepairKeyspaceCount(0); + repairState.setRepairInProgress(true); + repairState.setTotalTablesConsideredForRepair(0); + repairState.setTotalMVTablesConsideredForRepair(0); + + CollectedRepairStats collectedRepairStats = new CollectedRepairStats(); + + List keyspaces = new ArrayList<>(); + Keyspace.all().forEach(keyspaces::add); + // Filter out keyspaces and tables to repair and group into a map by keyspace. + Map> keyspacesAndTablesToRepair = new LinkedHashMap<>(); + for (Keyspace keyspace : keyspaces) + { + if (!AutoRepairUtils.shouldConsiderKeyspace(keyspace)) + { + continue; + } + List tablesToBeRepairedList = retrieveTablesToBeRepaired(keyspace, config, repairType, repairState, collectedRepairStats); + keyspacesAndTablesToRepair.put(keyspace.getName(), tablesToBeRepairedList); + } + + // Separate out the keyspaces and tables to repair based on their priority, with each repair plan representing a uniquely occuring priority. + List repairPlans = PrioritizedRepairPlan.build(keyspacesAndTablesToRepair, repairType, shuffleFunc); + + // calculate the repair assignments for each priority:keyspace. + Iterator repairAssignmentsIterator = config.getTokenRangeSplitterInstance(repairType).getRepairAssignments(primaryRangeOnly, repairPlans); + + while (repairAssignmentsIterator.hasNext()) + { + KeyspaceRepairAssignments repairAssignments = repairAssignmentsIterator.next(); + List assignments = repairAssignments.getRepairAssignments(); + if (assignments.isEmpty()) + { + logger.info("Skipping repairs for priorityBucket={} for keyspace={} since it yielded no assignments", repairAssignments.getPriority(), repairAssignments.getKeyspaceName()); + continue; + } + + logger.info("Submitting repairs for priorityBucket={} for keyspace={} with assignmentCount={}", repairAssignments.getPriority(), repairAssignments.getKeyspaceName(), repairAssignments.getRepairAssignments().size()); + repairKeyspace(repairType, primaryRangeOnly, repairAssignments.getKeyspaceName(), repairAssignments.getRepairAssignments(), collectedRepairStats); + } + + cleanupAndUpdateStats(turn, repairType, repairState, myId, startTimeInMillis, collectedRepairStats); + } + else + { + logger.info("Waiting for my turn..."); + } + } + catch (Exception e) + { + logger.error("Exception in autorepair:", e); + } + } + + private void repairKeyspace(AutoRepairConfig.RepairType repairType, boolean primaryRangeOnly, String keyspaceName, List repairAssignments, CollectedRepairStats collectedRepairStats) + { + AutoRepairConfig config = AutoRepairService.instance.getAutoRepairConfig(); + AutoRepairState repairState = repairStates.get(repairType); + + // evaluate over each keyspace's repair assignments. + repairState.setRepairKeyspaceCount(repairState.getRepairKeyspaceCount() + 1); + + int totalRepairAssignments = repairAssignments.size(); + long keyspaceStartTime = timeFunc.get(); + RepairAssignment previousAssignment = null; + long tableStartTime = timeFunc.get(); + int totalProcessedAssignments = 0; + Set> ranges = new HashSet<>(); + for (RepairAssignment curRepairAssignment : repairAssignments) + { + try + { + totalProcessedAssignments++; + boolean repairOneTableAtATime = !config.getRepairByKeyspace(repairType); + if (previousAssignment != null && repairOneTableAtATime && !previousAssignment.tableNames.equals(curRepairAssignment.tableNames)) + { + // In the repair assignment, all the tables are appended sequnetially. + // Check if we have a different table, and if so, we should reset the table start time. + tableStartTime = timeFunc.get(); + } + previousAssignment = curRepairAssignment; + if (!config.isAutoRepairEnabled(repairType)) + { + logger.error("Auto-repair for type {} is disabled hence not running repair", repairType); + repairState.setRepairInProgress(false); + return; + } + if (AutoRepairUtils.keyspaceMaxRepairTimeExceeded(repairType, keyspaceStartTime, repairAssignments.size())) + { + collectedRepairStats.skippedTokenRanges += totalRepairAssignments - totalProcessedAssignments; + logger.info("Keyspace took too much time to repair hence skipping it {}", + keyspaceName); + break; + } + if (repairOneTableAtATime && AutoRepairUtils.tableMaxRepairTimeExceeded(repairType, tableStartTime)) + { + collectedRepairStats.skippedTokenRanges += 1; + logger.info("Table took too much time to repair hence skipping it table name {}.{}, token range {}", + keyspaceName, curRepairAssignment.tableNames, curRepairAssignment.tokenRange); + continue; + } + + Range tokenRange = curRepairAssignment.getTokenRange(); + logger.debug("Current Token Left side {}, right side {}", + tokenRange.left.toString(), + tokenRange.right.toString()); + + ranges.add(curRepairAssignment.getTokenRange()); + if ((totalProcessedAssignments % config.getRepairThreads(repairType) == 0) || + (totalProcessedAssignments == totalRepairAssignments)) + { + boolean success = false; + int retryCount = 0; + Future f = null; + while (retryCount <= config.getRepairMaxRetries(repairType)) + { + RepairCoordinator task = repairState.getRepairRunnable(keyspaceName, + Lists.newArrayList(curRepairAssignment.getTableNames()), + ranges, primaryRangeOnly); + RepairProgressListener listener = new RepairProgressListener(repairType); + task.addProgressListener(listener); + f = repairRunnableExecutors.get(repairType).submit(task); + try + { + long jobStartTime = timeFunc.get(); + listener.await(config.getRepairSessionTimeout(repairType)); + success = listener.isSuccess(); + soakAfterRepair(jobStartTime, config.getRepairTaskMinDuration().toMilliseconds()); + } + catch (InterruptedException e) + { + logger.error("Exception in cond await:", e); + } + if (success) + { + break; + } + else if (retryCount < config.getRepairMaxRetries(repairType)) + { + boolean cancellationStatus = f.cancel(true); + logger.warn("Repair failed for range {}-{} for {} tables {} with cancellationStatus: {} retrying after {} seconds...", + tokenRange.left, tokenRange.right, + keyspaceName, curRepairAssignment.getTableNames(), + cancellationStatus, config.getRepairRetryBackoff(repairType).toSeconds()); + sleepFunc.accept(config.getRepairRetryBackoff(repairType).toSeconds(), TimeUnit.SECONDS); + } + retryCount++; + } + //check repair status + if (success) + { + logger.info("Repair completed for range {}-{} for {} tables {}, total assignments: {}," + + "processed assignments: {}", tokenRange.left, tokenRange.right, + keyspaceName, curRepairAssignment.getTableNames(), totalRepairAssignments, totalProcessedAssignments); + collectedRepairStats.succeededTokenRanges += ranges.size(); + } + else + { + boolean cancellationStatus = true; + if (f != null) + { + cancellationStatus = f.cancel(true); + } + //in the future we can add retry, etc. + logger.error("Repair failed for range {}-{} for {} tables {} after {} retries, total assignments: {}," + + "processed assignments: {}, cancellationStatus: {}", tokenRange.left, tokenRange.right, keyspaceName, + curRepairAssignment.getTableNames(), retryCount, totalRepairAssignments, totalProcessedAssignments, cancellationStatus); + collectedRepairStats.failedTokenRanges += ranges.size(); + } + ranges.clear(); + } + logger.info("Repair completed for {} tables {}, range {}", keyspaceName, curRepairAssignment.getTableNames(), curRepairAssignment.getTokenRange()); + } + catch (Exception e) + { + logger.error("Exception while repairing keyspace {}:", keyspaceName, e); + } + } + } + + private boolean tooSoonToRunRepair(AutoRepairConfig.RepairType repairType, AutoRepairState repairState, AutoRepairConfig config, UUID myId) + { + if (repairState.getLastRepairTime() == 0) + { + // the node has either just boooted or has not run repair before, + // we should check for the node's repair history in the DB + repairState.setLastRepairTime(AutoRepairUtils.getLastRepairTimeForNode(repairType, myId)); + } + /* + * check if it is too soon to run repair. one of the reason we + * should not run frequent repair is that repair triggers + * memtable flush + */ + long timeElapsedSinceLastRepair = TimeUnit.MILLISECONDS.toSeconds(timeFunc.get() - repairState.getLastRepairTime()); + if (timeElapsedSinceLastRepair < config.getRepairMinInterval(repairType).toSeconds()) + { + logger.info("Too soon to run repair, last repair was done {} seconds ago", + timeElapsedSinceLastRepair); + return true; + } + return false; + } + + private List retrieveTablesToBeRepaired(Keyspace keyspace, AutoRepairConfig config, AutoRepairConfig.RepairType repairType, AutoRepairState repairState, CollectedRepairStats collectedRepairStats) + { + Tables tables = keyspace.getMetadata().tables; + List tablesToBeRepaired = new ArrayList<>(); + Iterator iter = tables.iterator(); + while (iter.hasNext()) + { + repairState.setTotalTablesConsideredForRepair(repairState.getTotalTablesConsideredForRepair() + 1); + TableMetadata tableMetadata = iter.next(); + String tableName = tableMetadata.name; + + ColumnFamilyStore columnFamilyStore = keyspace.getColumnFamilyStore(tableName); + if (!columnFamilyStore.metadata().params.autoRepair.repairEnabled(repairType)) + { + logger.info("Repair is disabled for keyspace {} for tables: {}", keyspace.getName(), tableName); + repairState.setTotalDisabledTablesRepairCount(repairState.getTotalDisabledTablesRepairCount() + 1); + collectedRepairStats.skippedTables++; + continue; + } + + // this is done to make autorepair safe as running repair on table with more sstables + // may have its own challenges + int totalSSTables = columnFamilyStore.getLiveSSTables().size(); + if (totalSSTables > config.getRepairSSTableCountHigherThreshold(repairType)) + { + logger.info("Too many SSTables for repair for table {}.{}" + + "totalSSTables {}", keyspace.getName(), tableName, totalSSTables); + collectedRepairStats.skippedTables++; + continue; + } + + tablesToBeRepaired.add(tableName); + + // See if we should repair MVs as well that are associated with this given table + List mvs = AutoRepairUtils.getAllMVs(repairType, keyspace, tableMetadata); + if (!mvs.isEmpty()) + { + tablesToBeRepaired.addAll(mvs); + repairState.setTotalMVTablesConsideredForRepair(repairState.getTotalMVTablesConsideredForRepair() + mvs.size()); + } + } + return tablesToBeRepaired; + } + + private void cleanupAndUpdateStats(RepairTurn turn, AutoRepairConfig.RepairType repairType, AutoRepairState repairState, UUID myId, + long startTimeInMillis, CollectedRepairStats collectedRepairStats) throws InterruptedException + { + //if it was due to priority then remove it now + if (turn == MY_TURN_DUE_TO_PRIORITY) + { + logger.info("Remove current host from priority list"); + AutoRepairUtils.removePriorityStatus(repairType, myId); + } + long repairScheduleElapsedInMillis = timeFunc.get() - startTimeInMillis; + if (repairScheduleElapsedInMillis < SLEEP_IF_REPAIR_FINISHES_QUICKLY.toMilliseconds()) + { + //If repair finished quickly, happens for Cassndra cluster with empty (or tiny) data, in such cases, + //wait for some duration so that the JMX metrics can detect the repairInProgress + logger.info("Wait for {}ms for repair type {}.", SLEEP_IF_REPAIR_FINISHES_QUICKLY.toMilliseconds() - repairScheduleElapsedInMillis, repairType); + Thread.sleep(SLEEP_IF_REPAIR_FINISHES_QUICKLY.toMilliseconds() - repairScheduleElapsedInMillis); + } + repairState.setFailedTokenRangesCount(collectedRepairStats.failedTokenRanges); + repairState.setSucceededTokenRangesCount(collectedRepairStats.succeededTokenRanges); + repairState.setSkippedTokenRangesCount(collectedRepairStats.skippedTokenRanges); + repairState.setSkippedTablesCount(collectedRepairStats.skippedTables); + repairState.setNodeRepairTimeInSec((int) TimeUnit.MILLISECONDS.toSeconds(timeFunc.get() - startTimeInMillis)); + long timeInHours = TimeUnit.SECONDS.toHours(repairState.getNodeRepairTimeInSec()); + logger.info("Local {} repair time {} hour(s), stats: repairKeyspaceCount {}, " + + "repairTokenRangesSuccessCount {}, repairTokenRangesFailureCount {}, " + + "repairTokenRangesSkipCount {}, repairTablesSkipCount {}", repairType, timeInHours, repairState.getRepairKeyspaceCount(), + repairState.getSucceededTokenRangesCount(), repairState.getFailedTokenRangesCount(), + repairState.getSkippedTokenRangesCount(), repairState.getSkippedTablesCount()); + if (repairState.getLastRepairTime() != 0) + { + repairState.setClusterRepairTimeInSec((int) TimeUnit.MILLISECONDS.toSeconds(timeFunc.get() - + repairState.getLastRepairTime())); + logger.info("Cluster repair time for repair type {}: {} day(s)", repairType, + TimeUnit.SECONDS.toDays(repairState.getClusterRepairTimeInSec())); + } + repairState.setLastRepairTime(timeFunc.get()); + + repairState.setRepairInProgress(false); + AutoRepairUtils.updateFinishAutoRepairHistory(repairType, myId, timeFunc.get()); + } + + public AutoRepairState getRepairState(AutoRepairConfig.RepairType repairType) + { + return repairStates.get(repairType); + } + + private void soakAfterRepair(long startTimeMilis, long minDurationMilis) + { + long currentTime = timeFunc.get(); + long timeElapsed = currentTime - startTimeMilis; + if (timeElapsed < minDurationMilis) + { + long timeToSoak = minDurationMilis - timeElapsed; + logger.info("Soaking for {} ms after repair", timeToSoak); + sleepFunc.accept(timeToSoak, TimeUnit.MILLISECONDS); + } + } + + static class CollectedRepairStats + { + int failedTokenRanges = 0; + int succeededTokenRanges = 0; + int skippedTokenRanges = 0; + int skippedTables = 0; + } + + @VisibleForTesting + protected static class RepairProgressListener implements ProgressListener + { + private final AutoRepairConfig.RepairType repairType; + @VisibleForTesting + protected boolean success; + @VisibleForTesting + protected final Condition condition = newOneTimeCondition(); + + public RepairProgressListener(AutoRepairConfig.RepairType repairType) + { + this.repairType = repairType; + } + + public void await(DurationSpec.IntSecondsBound repairSessionTimeout) throws InterruptedException + { + //if for some reason we don't hear back on repair progress for sometime + if (!condition.await(repairSessionTimeout.toSeconds(), TimeUnit.SECONDS)) + { + success = false; + } + } + + public boolean isSuccess() + { + return success; + } + + @Override + public void progress(String tag, ProgressEvent event) + { + ProgressEventType type = event.getType(); + String message = String.format("[%s] %s", format.format(timeFunc.get()), event.getMessage()); + if (type == ProgressEventType.ERROR) + { + logger.error("Repair failure for repair {}: {}", repairType.toString(), message); + success = false; + condition.signalAll(); + } + if (type == ProgressEventType.PROGRESS) + { + message = message + " (progress: " + (int) event.getProgressPercentage() + "%)"; + logger.debug("Repair progress for repair {}: {}", repairType.toString(), message); + } + if (type == ProgressEventType.COMPLETE) + { + logger.debug("Repair completed for repair {}: {}", repairType.toString(), message); + success = true; + condition.signalAll(); + } + } + } + + public synchronized void shutdownBlocking() throws ExecutionException, InterruptedException + { + if (!isSetupDone) + { + // By default, executors within AutoRepair are not initialized as the feature is opt-in. + // If the AutoRepair has not been set up, then there is no need to worry about shutting it down + return; + } + if (isShutDown) + { + throw new IllegalStateException("AutoRepair has already been shut down"); + } + isShutDown = true; + for (AutoRepairConfig.RepairType repairType : AutoRepairConfig.RepairType.values()) + { + repairRunnableExecutors.get(repairType).shutdown(); + repairExecutors.get(repairType).shutdown(); + } + logger.info("Paused AutoRepair"); + } + + public Map getRepairExecutors() + { + return repairExecutors; + } + + public Map getRepairRunnableExecutors() + { + return repairRunnableExecutors; + } +} diff --git a/src/java/org/apache/cassandra/repair/autorepair/AutoRepairConfig.java b/src/java/org/apache/cassandra/repair/autorepair/AutoRepairConfig.java new file mode 100644 index 000000000000..045e6d21a8c8 --- /dev/null +++ b/src/java/org/apache/cassandra/repair/autorepair/AutoRepairConfig.java @@ -0,0 +1,599 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.repair.autorepair; + +import java.io.Serializable; +import java.util.Collections; +import java.util.EnumMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Objects; +import java.util.Set; +import java.util.concurrent.ConcurrentMap; +import java.util.function.Function; + +import javax.annotation.Nonnull; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.Maps; + +import org.apache.cassandra.config.DurationSpec; +import org.apache.cassandra.config.ParameterizedClass; +import org.apache.cassandra.utils.LocalizeString; +import org.apache.cassandra.exceptions.ConfigurationException; +import org.apache.cassandra.utils.FBUtilities; + +/** + * Defines configurations for AutoRepair. + */ +public class AutoRepairConfig implements Serializable +{ + // Enable/Disable the auto-repair scheduler. + // If set to false, the scheduler thread will not be started. + // If set to true, the repair scheduler thread will be created. The thread will + // check for secondary configuration available for each repair type (full, incremental, + // and preview_repaired), and based on that, it will schedule repairs. + public volatile Boolean enabled; + // Time interval between successive checks to see if ongoing repairs are complete or if it is time to schedule + // repairs. + public final DurationSpec.IntSecondsBound repair_check_interval = new DurationSpec.IntSecondsBound("5m"); + // The scheduler needs to adjust its order when nodes leave the ring. Deleted hosts are tracked in metadata + // for a specified duration to ensure they are indeed removed before adjustments are made to the schedule. + public volatile DurationSpec.IntSecondsBound history_clear_delete_hosts_buffer_interval = new DurationSpec.IntSecondsBound("2h"); + // Minimum duration for the execution of a single repair task. This prevents the scheduler from overwhelming + // the node by scheduling too many repair tasks in a short period of time. + public volatile DurationSpec.LongSecondsBound repair_task_min_duration = new DurationSpec.LongSecondsBound("5s"); + + // global_settings overides Options.defaultOptions for all repair types + public volatile Options global_settings; + + public static final Class DEFAULT_SPLITTER = RepairTokenRangeSplitter.class; + + // make transient so gets consturcted in the implementation. + private final transient Map tokenRangeSplitters = new EnumMap<>(RepairType.class); + + public enum RepairType implements Serializable + { + FULL, + INCREMENTAL, + PREVIEW_REPAIRED; + + private final String configName; + + RepairType() + { + this.configName = LocalizeString.toLowerCaseLocalized(name()); + } + + /** + * @return Format of the repair type as it should be represented in configuration. + * Canonically this is the enum name in lowerCase. + */ + public String getConfigName() + { + return configName; + } + + public static AutoRepairState getAutoRepairState(RepairType repairType) + { + switch (repairType) + { + case FULL: + return new FullRepairState(); + case INCREMENTAL: + return new IncrementalRepairState(); + case PREVIEW_REPAIRED: + return new PreviewRepairedState(); + } + + throw new IllegalArgumentException("Invalid repair type: " + repairType); + } + + /** + * Case-insensitive parsing of the repair type string into {@link RepairType} + * + * @param repairTypeStr the repair type string + * @return the {@link RepairType} represented by the {@code repairTypeStr} string + * @throws IllegalArgumentException when the repair type string does not match any repair type + */ + public static RepairType parse(String repairTypeStr) + { + return RepairType.valueOf(LocalizeString.toUpperCaseLocalized(Objects.requireNonNull(repairTypeStr, "repairTypeStr cannot be null"))); + } + } + + // repair_type_overrides overrides the global_settings for a specific repair type. String used as key instead + // of enum to allow lower case key in yaml. + public volatile ConcurrentMap repair_type_overrides = Maps.newConcurrentMap(); + + public AutoRepairConfig() + { + this(false); + } + + public AutoRepairConfig(boolean enabled) + { + this.enabled = enabled; + global_settings = Options.getDefaultOptions(); + } + + public DurationSpec.IntSecondsBound getRepairCheckInterval() + { + return repair_check_interval; + } + + public boolean isAutoRepairSchedulingEnabled() + { + return enabled; + } + + @VisibleForTesting + public void setAutoRepairSchedulingEnabled(boolean enabled) + { + this.enabled = enabled; + } + + public DurationSpec.IntSecondsBound getAutoRepairHistoryClearDeleteHostsBufferInterval() + { + return history_clear_delete_hosts_buffer_interval; + } + + public void startScheduler() + { + enabled = true; + AutoRepair.instance.setup(); + } + + public void setAutoRepairHistoryClearDeleteHostsBufferInterval(String duration) + { + history_clear_delete_hosts_buffer_interval = new DurationSpec.IntSecondsBound(duration); + } + + public DurationSpec.LongSecondsBound getRepairTaskMinDuration() + { + return repair_task_min_duration; + } + + public void setRepairTaskMinDuration(String duration) + { + repair_task_min_duration = new DurationSpec.LongSecondsBound(duration); + } + + public boolean isAutoRepairEnabled(RepairType repairType) + { + return enabled && applyOverrides(repairType, opt -> opt.enabled); + } + + public void setAutoRepairEnabled(RepairType repairType, boolean enabled) + { + getOptions(repairType).enabled = enabled; + } + + public void setRepairByKeyspace(RepairType repairType, boolean repairByKeyspace) + { + getOptions(repairType).repair_by_keyspace = repairByKeyspace; + } + + public boolean getRepairByKeyspace(RepairType repairType) + { + return applyOverrides(repairType, opt -> opt.repair_by_keyspace); + } + + public int getRepairThreads(RepairType repairType) + { + return applyOverrides(repairType, opt -> opt.number_of_repair_threads); + } + + public void setRepairThreads(RepairType repairType, int repairThreads) + { + getOptions(repairType).number_of_repair_threads = repairThreads; + } + + public DurationSpec.IntSecondsBound getRepairMinInterval(RepairType repairType) + { + return applyOverrides(repairType, opt -> opt.min_repair_interval); + } + + public void setRepairMinInterval(RepairType repairType, String minRepairInterval) + { + getOptions(repairType).min_repair_interval = new DurationSpec.IntSecondsBound(minRepairInterval); + } + + public int getRepairSSTableCountHigherThreshold(RepairType repairType) + { + return applyOverrides(repairType, opt -> opt.sstable_upper_threshold); + } + + public void setRepairSSTableCountHigherThreshold(RepairType repairType, int sstableHigherThreshold) + { + getOptions(repairType).sstable_upper_threshold = sstableHigherThreshold; + } + + public DurationSpec.IntSecondsBound getAutoRepairTableMaxRepairTime(RepairType repairType) + { + return applyOverrides(repairType, opt -> opt.table_max_repair_time); + } + + public void setAutoRepairTableMaxRepairTime(RepairType repairType, String autoRepairTableMaxRepairTime) + { + getOptions(repairType).table_max_repair_time = new DurationSpec.IntSecondsBound(autoRepairTableMaxRepairTime); + } + + public Set getIgnoreDCs(RepairType repairType) + { + return applyOverrides(repairType, opt -> opt.ignore_dcs); + } + + public void setIgnoreDCs(RepairType repairType, Set ignoreDCs) + { + getOptions(repairType).ignore_dcs = ignoreDCs; + } + + public boolean getRepairPrimaryTokenRangeOnly(RepairType repairType) + { + return applyOverrides(repairType, opt -> opt.repair_primary_token_range_only); + } + + public void setRepairPrimaryTokenRangeOnly(RepairType repairType, boolean primaryTokenRangeOnly) + { + getOptions(repairType).repair_primary_token_range_only = primaryTokenRangeOnly; + } + + public int getParallelRepairPercentage(RepairType repairType) + { + return applyOverrides(repairType, opt -> opt.parallel_repair_percentage); + } + + public void setParallelRepairPercentage(RepairType repairType, int percentage) + { + getOptions(repairType).parallel_repair_percentage = percentage; + } + + public int getParallelRepairCount(RepairType repairType) + { + return applyOverrides(repairType, opt -> opt.parallel_repair_count); + } + + public void setParallelRepairCount(RepairType repairType, int count) + { + getOptions(repairType).parallel_repair_count = count; + } + + public boolean getAllowParallelReplicaRepair(RepairType repairType) + { + return applyOverrides(repairType, opt -> opt.allow_parallel_replica_repair); + } + + public void setAllowParallelReplicaRepair(RepairType repairType, boolean enabled) + { + getOptions(repairType).allow_parallel_replica_repair = enabled; + } + + public boolean getAllowParallelReplicaRepairAcrossSchedules(RepairType repairType) + { + return applyOverrides(repairType, opt -> opt.allow_parallel_replica_repair_across_schedules); + } + + public void setAllowParallelReplicaRepairAcrossSchedules(RepairType repairType, boolean enabled) + { + getOptions(repairType).allow_parallel_replica_repair_across_schedules = enabled; + } + + public boolean getMaterializedViewRepairEnabled(RepairType repairType) + { + return applyOverrides(repairType, opt -> opt.materialized_view_repair_enabled); + } + + public void setMaterializedViewRepairEnabled(RepairType repairType, boolean enabled) + { + getOptions(repairType).materialized_view_repair_enabled = enabled; + } + + public void setForceRepairNewNode(RepairType repairType, boolean forceRepairNewNode) + { + getOptions(repairType).force_repair_new_node = forceRepairNewNode; + } + + public boolean getForceRepairNewNode(RepairType repairType) + { + return applyOverrides(repairType, opt -> opt.force_repair_new_node); + } + + public ParameterizedClass getTokenRangeSplitter(RepairType repairType) + { + return applyOverrides(repairType, opt -> opt.token_range_splitter); + } + + public IAutoRepairTokenRangeSplitter getTokenRangeSplitterInstance(RepairType repairType) + { + return tokenRangeSplitters.computeIfAbsent(repairType, + key -> newAutoRepairTokenRangeSplitter(key, getTokenRangeSplitter(key))); + } + + public void setInitialSchedulerDelay(RepairType repairType, String initialSchedulerDelay) + { + getOptions(repairType).initial_scheduler_delay = new DurationSpec.IntSecondsBound(initialSchedulerDelay); + } + + public DurationSpec.IntSecondsBound getInitialSchedulerDelay(RepairType repairType) + { + return applyOverrides(repairType, opt -> opt.initial_scheduler_delay); + } + + public DurationSpec.IntSecondsBound getRepairSessionTimeout(RepairType repairType) + { + return applyOverrides(repairType, opt -> opt.repair_session_timeout); + } + + public void setRepairSessionTimeout(RepairType repairType, String repairSessionTimeout) + { + getOptions(repairType).repair_session_timeout = new DurationSpec.IntSecondsBound(repairSessionTimeout); + } + + public int getRepairMaxRetries(RepairType repairType) + { + return applyOverrides(repairType, opt -> opt.repair_max_retries); + } + + public void setRepairMaxRetries(RepairType repairType, int maxRetries) + { + getOptions(repairType).repair_max_retries = maxRetries; + } + + public DurationSpec.LongSecondsBound getRepairRetryBackoff(RepairType repairType) + { + return applyOverrides(repairType, opt -> opt.repair_retry_backoff); + } + + public void setRepairRetryBackoff(RepairType repairType, String interval) + { + getOptions(repairType).repair_retry_backoff = new DurationSpec.LongSecondsBound(interval); + } + + @VisibleForTesting + static IAutoRepairTokenRangeSplitter newAutoRepairTokenRangeSplitter(RepairType repairType, ParameterizedClass parameterizedClass) throws ConfigurationException + { + try + { + Class tokenRangeSplitterClass; + final String className; + if (parameterizedClass.class_name != null && !parameterizedClass.class_name.isEmpty()) + { + className = parameterizedClass.class_name.contains(".") ? + parameterizedClass.class_name : + "org.apache.cassandra.repair.autorepair." + parameterizedClass.class_name; + tokenRangeSplitterClass = FBUtilities.classForName(className, "token_range_splitter"); + } + else + { + // If token_range_splitter.class_name is not defined, just use default, this is for convenience. + tokenRangeSplitterClass = AutoRepairConfig.DEFAULT_SPLITTER; + } + try + { + Map parameters = parameterizedClass.parameters != null ? parameterizedClass.parameters : Collections.emptyMap(); + // first attempt to initialize with RepairType and Map arguments. + return tokenRangeSplitterClass.getConstructor(RepairType.class, Map.class).newInstance(repairType, parameters); + } + catch (NoSuchMethodException nsme) + { + // fall back on no argument constructor. + return tokenRangeSplitterClass.getConstructor().newInstance(); + } + } + catch (Exception ex) + { + throw new ConfigurationException("Unable to create instance of IAutoRepairTokenRangeSplitter", ex); + } + } + + // Options configures auto-repair behavior for a given repair type. + // All fields can be modified dynamically. + public static class Options implements Serializable + { + // defaultOptions defines the default auto-repair behavior when no overrides are defined + @VisibleForTesting + private static Map defaultOptions; + + private static Map initializeDefaultOptions() + { + Map options = new EnumMap<>(AutoRepairConfig.RepairType.class); + options.put(AutoRepairConfig.RepairType.FULL, getDefaultOptions()); + options.put(RepairType.INCREMENTAL, getDefaultOptions()); + options.put(RepairType.PREVIEW_REPAIRED, getDefaultOptions()); + + return options; + } + + public static Map getDefaultOptionsMap() + { + if (defaultOptions == null) + { + synchronized (AutoRepairConfig.class) + { + if (defaultOptions == null) + { + defaultOptions = initializeDefaultOptions(); + } + } + } + return defaultOptions; + } + + public Options() + { + } + + @VisibleForTesting + protected static Options getDefaultOptions() + { + Options opts = new Options(); + + opts.enabled = false; + opts.repair_by_keyspace = true; + opts.number_of_repair_threads = 1; + opts.parallel_repair_count = 3; + opts.parallel_repair_percentage = 3; + opts.allow_parallel_replica_repair = false; + opts.allow_parallel_replica_repair_across_schedules = true; + opts.sstable_upper_threshold = 50000; + opts.ignore_dcs = new HashSet<>(); + opts.repair_primary_token_range_only = true; + opts.force_repair_new_node = false; + opts.table_max_repair_time = new DurationSpec.IntSecondsBound("6h"); + opts.materialized_view_repair_enabled = false; + opts.token_range_splitter = new ParameterizedClass(DEFAULT_SPLITTER.getName(), Collections.emptyMap()); + opts.initial_scheduler_delay = new DurationSpec.IntSecondsBound("5m"); + opts.repair_session_timeout = new DurationSpec.IntSecondsBound("3h"); + opts.min_repair_interval = new DurationSpec.IntSecondsBound("24h"); + + return opts; + } + + // Enable/Disable full or incremental or previewed_repair auto repair + public volatile Boolean enabled; + // If true, attempts to group tables in the same keyspace into one repair; otherwise, each table is repaired + // individually. + public volatile Boolean repair_by_keyspace; + // Number of threads to use for each repair job scheduled by the scheduler. Similar to the -j option in nodetool + // repair. + public volatile Integer number_of_repair_threads; + // Number of nodes running repair in parallel. If parallel_repair_percentage is set, the larger value is used. + public volatile Integer parallel_repair_count; + // Percentage of nodes in the cluster running repair in parallel. If parallel_repair_count is set, the larger value + // is used. Recommendation is that the repair cycle on the cluster should finish within gc_grace_seconds. + public volatile Integer parallel_repair_percentage; + // Whether to allow a node to take its turn running repair while one or more of its replicas are running repair. + // Defaults to false, as running repairs concurrently on replicas can increase load and also cause + // anticompaction conflicts while running incremental repair. + public volatile Boolean allow_parallel_replica_repair; + // An addition to allow_parallel_replica_repair that also blocks repairs when replicas (including this node itself) + // are repairing in any schedule. For example, if a replica is executing full repairs, a value of false will + // prevent starting incremental repairs for this node. Defaults to true and is only evaluated when + // allow_parallel_replica_repair is false. + public volatile Boolean allow_parallel_replica_repair_across_schedules; + // Threshold to skip repairing tables with too many SSTables. Defaults to 10,000 SSTables to avoid penalizing good + // tables. + public volatile Integer sstable_upper_threshold; + // Minimum duration between repairing the same node again. This is useful for tiny clusters, such as + // clusters with 5 nodes that finish repairs quickly. The default is 24 hours. This means that if the scheduler + // completes one round on all nodes in less than 24 hours, it will not start a new repair round on a given node + // until 24 hours have passed since the last repair. + public volatile DurationSpec.IntSecondsBound min_repair_interval; + // Avoid running repairs in specific data centers. By default, repairs run in all data centers. Specify data + // centers to exclude in this list. Note that repair sessions will still consider all replicas from excluded + // data centers. Useful if you have keyspaces that are not replicated in certain data centers, and you want to + // not run repair schedule in certain data centers. + public volatile Set ignore_dcs; + // Repair only the primary ranges owned by a node. Equivalent to the -pr option in nodetool repair. Defaults + // to true. General advice is to keep this true. + public volatile Boolean repair_primary_token_range_only; + // Force immediate repair on new nodes after they join the ring. + public volatile Boolean force_repair_new_node; + // Maximum time allowed for repairing one table on a given node. If exceeded, the repair proceeds to the + // next table. + public volatile DurationSpec.IntSecondsBound table_max_repair_time; + // Repairs materialized views if true. + public volatile Boolean materialized_view_repair_enabled; + /** + * Splitter implementation to use for generating repair assignments. + *

+ * The default is {@link RepairTokenRangeSplitter}. The class should implement {@link IAutoRepairTokenRangeSplitter} + * and have a constructor accepting ({@link RepairType}, {@link java.util.Map}) + */ + public volatile ParameterizedClass token_range_splitter; + // After a node restart, wait for this much delay before scheduler starts running repair; this is to avoid starting repair immediately after a node restart. + public volatile DurationSpec.IntSecondsBound initial_scheduler_delay; + // Timeout for retrying stuck repair sessions. + public volatile DurationSpec.IntSecondsBound repair_session_timeout; + // Maximum number of retries for a repair session. + public volatile Integer repair_max_retries = 3; + // Backoff time before retrying a repair session. + public volatile DurationSpec.LongSecondsBound repair_retry_backoff = new DurationSpec.LongSecondsBound("30s"); + + public String toString() + { + return "Options{" + + "enabled=" + enabled + + ", repair_by_keyspace=" + repair_by_keyspace + + ", number_of_repair_threads=" + number_of_repair_threads + + ", parallel_repair_count=" + parallel_repair_count + + ", parallel_repair_percentage=" + parallel_repair_percentage + + ", allow_parallel_replica_repair=" + allow_parallel_replica_repair + + ", allow_parallel_replica_repair_across_schedules=" + allow_parallel_replica_repair_across_schedules + + ", sstable_upper_threshold=" + sstable_upper_threshold + + ", min_repair_interval=" + min_repair_interval + + ", ignore_dcs=" + ignore_dcs + + ", repair_primary_token_range_only=" + repair_primary_token_range_only + + ", force_repair_new_node=" + force_repair_new_node + + ", table_max_repair_time=" + table_max_repair_time + + ", materialized_view_repair_enabled=" + materialized_view_repair_enabled + + ", token_range_splitter=" + token_range_splitter + + ", intial_scheduler_delay=" + initial_scheduler_delay + + ", repair_session_timeout=" + repair_session_timeout + + '}'; + } + } + + @Nonnull + protected Options getOptions(RepairType repairType) + { + return repair_type_overrides.computeIfAbsent(repairType.getConfigName(), k -> new Options()); + } + + private static T getOverride(Options options, Function optionSupplier) + { + return options != null ? optionSupplier.apply(options) : null; + } + + @VisibleForTesting + protected T applyOverrides(RepairType repairType, Function optionSupplier) + { + // Check option by repair type first + Options repairTypeOverrides = getOptions(repairType); + T val = optionSupplier.apply(repairTypeOverrides); + + if (val != null) + return val; + + // Check option in global settings + if (global_settings != null) + { + val = getOverride(global_settings, optionSupplier); + + if (val != null) + return val; + } + + // Otherwise check defaults + return getOverride(Options.getDefaultOptionsMap().get(repairType), optionSupplier); + } + + public String toString() + { + return "AutoRepairConfig{" + + "enabled=" + enabled + + ", repair_check_interval=" + repair_check_interval + + ", history_clear_delete_hosts_buffer_interval=" + history_clear_delete_hosts_buffer_interval + + ", repair_task_min_duration=" + repair_task_min_duration + + ", global_settings=" + global_settings + + ", repair_type_overrides=" + repair_type_overrides + + "}"; + } +} diff --git a/src/java/org/apache/cassandra/repair/autorepair/AutoRepairState.java b/src/java/org/apache/cassandra/repair/autorepair/AutoRepairState.java new file mode 100644 index 000000000000..6822f20cf023 --- /dev/null +++ b/src/java/org/apache/cassandra/repair/autorepair/AutoRepairState.java @@ -0,0 +1,326 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.repair.autorepair; + +import com.google.common.annotations.VisibleForTesting; + +import org.apache.cassandra.db.ColumnFamilyStore; +import org.apache.cassandra.db.Keyspace; +import org.apache.cassandra.db.view.TableViews; +import org.apache.cassandra.dht.Range; +import org.apache.cassandra.dht.Token; +import org.apache.cassandra.metrics.AutoRepairMetricsManager; +import org.apache.cassandra.metrics.AutoRepairMetrics; +import org.apache.cassandra.repair.RepairCoordinator; +import org.apache.cassandra.repair.autorepair.AutoRepairConfig.RepairType; +import org.apache.cassandra.repair.autorepair.AutoRepairUtils.AutoRepairHistory; +import org.apache.cassandra.repair.RepairParallelism; +import org.apache.cassandra.repair.messages.RepairOption; +import org.apache.cassandra.service.AutoRepairService; +import org.apache.cassandra.service.StorageService; +import org.apache.cassandra.streaming.PreviewKind; +import org.apache.cassandra.utils.Clock; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.text.SimpleDateFormat; +import java.util.List; +import java.util.Set; +import java.util.concurrent.TimeUnit; +import java.util.function.Supplier; +import java.util.stream.Collectors; + +/** + * AutoRepairState represents the state of automated repair for a given repair type. + */ +public abstract class AutoRepairState +{ + protected static final Logger logger = LoggerFactory.getLogger(AutoRepairState.class); + private final SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss,SSS"); + @VisibleForTesting + protected static Supplier timeFunc = Clock.Global::currentTimeMillis; + + @VisibleForTesting + protected final RepairType repairType; + @VisibleForTesting + protected int totalTablesConsideredForRepair = 0; + @VisibleForTesting + protected long lastRepairTimeInMs; + @VisibleForTesting + protected int nodeRepairTimeInSec = 0; + @VisibleForTesting + protected int clusterRepairTimeInSec = 0; + @VisibleForTesting + protected boolean repairInProgress = false; + @VisibleForTesting + protected int repairKeyspaceCount = 0; + @VisibleForTesting + protected int totalMVTablesConsideredForRepair = 0; + @VisibleForTesting + protected int totalDisabledTablesRepairCount = 0; + @VisibleForTesting + protected int failedTokenRangesCount = 0; + @VisibleForTesting + protected int succeededTokenRangesCount = 0; + @VisibleForTesting + protected int skippedTokenRangesCount = 0; + @VisibleForTesting + protected int skippedTablesCount = 0; + @VisibleForTesting + protected AutoRepairHistory longestUnrepairedNode; + protected final AutoRepairMetrics metrics; + + protected AutoRepairState(RepairType repairType) + { + metrics = AutoRepairMetricsManager.getMetrics(repairType); + this.repairType = repairType; + } + + public abstract RepairCoordinator getRepairRunnable(String keyspace, List tables, Set> ranges, boolean primaryRangeOnly); + + protected RepairCoordinator getRepairRunnable(String keyspace, RepairOption options) + { + return new RepairCoordinator(StorageService.instance, StorageService.nextRepairCommand.incrementAndGet(), + options, keyspace); + } + + public long getLastRepairTime() + { + return lastRepairTimeInMs; + } + + public void setTotalTablesConsideredForRepair(int count) + { + totalTablesConsideredForRepair = count; + } + + public int getTotalTablesConsideredForRepair() + { + return totalTablesConsideredForRepair; + } + + public void setLastRepairTime(long lastRepairTime) + { + lastRepairTimeInMs = lastRepairTime; + } + + public int getClusterRepairTimeInSec() + { + return clusterRepairTimeInSec; + } + + public int getNodeRepairTimeInSec() + { + return nodeRepairTimeInSec; + } + + public void setRepairInProgress(boolean repairInProgress) + { + this.repairInProgress = repairInProgress; + } + + public boolean isRepairInProgress() + { + return repairInProgress; + } + + public int getLongestUnrepairedSec() + { + if (longestUnrepairedNode == null) + { + return 0; + } + return (int) TimeUnit.MILLISECONDS.toSeconds(timeFunc.get() - longestUnrepairedNode.getLastRepairFinishTime()); + } + + public void setTotalMVTablesConsideredForRepair(int count) + { + totalMVTablesConsideredForRepair = count; + } + + public int getTotalMVTablesConsideredForRepair() + { + return totalMVTablesConsideredForRepair; + } + + public void setNodeRepairTimeInSec(int elapsed) + { + nodeRepairTimeInSec = elapsed; + } + + public void setClusterRepairTimeInSec(int seconds) + { + clusterRepairTimeInSec = seconds; + } + + public void setRepairKeyspaceCount(int count) + { + repairKeyspaceCount = count; + } + + public int getRepairKeyspaceCount() + { + return repairKeyspaceCount; + } + + public void setLongestUnrepairedNode(AutoRepairHistory longestUnrepairedNode) + { + this.longestUnrepairedNode = longestUnrepairedNode; + } + + public void setFailedTokenRangesCount(int count) + { + failedTokenRangesCount = count; + } + + public int getFailedTokenRangesCount() + { + return failedTokenRangesCount; + } + + public void setSucceededTokenRangesCount(int count) + { + succeededTokenRangesCount = count; + } + + public int getSucceededTokenRangesCount() + { + return succeededTokenRangesCount; + } + + public void setSkippedTokenRangesCount(int count) + { + skippedTokenRangesCount = count; + } + + public int getSkippedTokenRangesCount() + { + return skippedTokenRangesCount; + } + + public void setSkippedTablesCount(int count) + { + skippedTablesCount = count; + } + + public int getSkippedTablesCount() + { + return skippedTablesCount; + } + + public void recordTurn(AutoRepairUtils.RepairTurn turn) + { + metrics.recordTurn(turn); + } + + public void setTotalDisabledTablesRepairCount(int count) + { + totalDisabledTablesRepairCount = count; + } + + public int getTotalDisabledTablesRepairCount() + { + return totalDisabledTablesRepairCount; + } +} + +class PreviewRepairedState extends AutoRepairState +{ + public PreviewRepairedState() + { + super(RepairType.PREVIEW_REPAIRED); + } + + @Override + public RepairCoordinator getRepairRunnable(String keyspace, List tables, Set> ranges, boolean primaryRangeOnly) + { + RepairOption option = new RepairOption(RepairParallelism.PARALLEL, primaryRangeOnly, false, false, + AutoRepairService.instance.getAutoRepairConfig().getRepairThreads(repairType), ranges, false, false, PreviewKind.REPAIRED, false, true, true, false, false, false); + + option.getColumnFamilies().addAll(tables); + + return getRepairRunnable(keyspace, option); + } +} + +class IncrementalRepairState extends AutoRepairState +{ + public IncrementalRepairState() + { + super(RepairType.INCREMENTAL); + } + + @Override + public RepairCoordinator getRepairRunnable(String keyspace, List tables, Set> ranges, boolean primaryRangeOnly) + { + RepairOption option = new RepairOption(RepairParallelism.PARALLEL, primaryRangeOnly, true, false, + AutoRepairService.instance.getAutoRepairConfig().getRepairThreads(repairType), ranges, + false, false, PreviewKind.NONE, true, true, true, false, false, false); + + option.getColumnFamilies().addAll(filterOutUnsafeTables(keyspace, tables)); + + return getRepairRunnable(keyspace, option); + } + + @VisibleForTesting + protected List filterOutUnsafeTables(String keyspaceName, List tables) + { + Keyspace keyspace = Keyspace.open(keyspaceName); + + return tables.stream() + .filter(table -> { + ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(table); + TableViews views = keyspace.viewManager.forTable(cfs.metadata()); + if (views != null && !views.isEmpty()) + { + logger.debug("Skipping incremental repair for {}.{} as it has materialized views", keyspaceName, table); + return false; + } + + if (cfs.metadata().params != null && cfs.metadata().params.cdc) + { + logger.debug("Skipping incremental repair for {}.{} as it has CDC enabled", keyspaceName, table); + return false; + } + + return true; + }).collect(Collectors.toList()); + } +} + +class FullRepairState extends AutoRepairState +{ + public FullRepairState() + { + super(RepairType.FULL); + } + + @Override + public RepairCoordinator getRepairRunnable(String keyspace, List tables, Set> ranges, boolean primaryRangeOnly) + { + RepairOption option = new RepairOption(RepairParallelism.PARALLEL, primaryRangeOnly, false, false, + AutoRepairService.instance.getAutoRepairConfig().getRepairThreads(repairType), ranges, + false, false, PreviewKind.NONE, true, true, true, false, false, false); + + option.getColumnFamilies().addAll(tables); + + return getRepairRunnable(keyspace, option); + } +} diff --git a/src/java/org/apache/cassandra/repair/autorepair/AutoRepairUtils.java b/src/java/org/apache/cassandra/repair/autorepair/AutoRepairUtils.java new file mode 100644 index 000000000000..6da487e5e06b --- /dev/null +++ b/src/java/org/apache/cassandra/repair/autorepair/AutoRepairUtils.java @@ -0,0 +1,1189 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.repair.autorepair; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Set; +import java.util.TreeSet; +import java.util.UUID; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import javax.annotation.Nullable; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.MoreObjects; +import com.google.common.collect.Lists; +import com.google.common.collect.Sets; + +import org.apache.cassandra.dht.Range; +import org.apache.cassandra.dht.Splitter; +import org.apache.cassandra.dht.Token; +import org.apache.cassandra.locator.EndpointsByRange; +import org.apache.cassandra.locator.EndpointsForRange; +import org.apache.cassandra.locator.LocalStrategy; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.cql3.QueryOptions; +import org.apache.cassandra.cql3.QueryProcessor; +import org.apache.cassandra.cql3.UntypedResultSet; +import org.apache.cassandra.cql3.statements.ModificationStatement; +import org.apache.cassandra.cql3.statements.SelectStatement; +import org.apache.cassandra.db.ConsistencyLevel; +import org.apache.cassandra.db.Keyspace; +import org.apache.cassandra.db.marshal.UTF8Type; +import org.apache.cassandra.db.marshal.UUIDType; +import org.apache.cassandra.gms.Gossiper; +import org.apache.cassandra.locator.AbstractReplicationStrategy; +import org.apache.cassandra.locator.InetAddressAndPort; +import org.apache.cassandra.locator.MetaStrategy; +import org.apache.cassandra.locator.NetworkTopologyStrategy; +import org.apache.cassandra.locator.RangesAtEndpoint; +import org.apache.cassandra.locator.Replica; +import org.apache.cassandra.metrics.AutoRepairMetricsManager; +import org.apache.cassandra.schema.Schema; +import org.apache.cassandra.schema.SchemaConstants; +import org.apache.cassandra.schema.SystemDistributedKeyspace; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.schema.ViewMetadata; +import org.apache.cassandra.serializers.SetSerializer; +import org.apache.cassandra.serializers.UUIDSerializer; +import org.apache.cassandra.service.AutoRepairService; +import org.apache.cassandra.service.ClientState; +import org.apache.cassandra.service.QueryState; +import org.apache.cassandra.service.StorageService; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.tcm.membership.Directory; +import org.apache.cassandra.tcm.membership.NodeAddresses; +import org.apache.cassandra.tcm.membership.NodeId; +import org.apache.cassandra.transport.Dispatcher; +import org.apache.cassandra.transport.ProtocolVersion; +import org.apache.cassandra.transport.messages.ResultMessage; +import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.FBUtilities; +import org.apache.cassandra.repair.autorepair.AutoRepairConfig.RepairType; +import org.apache.cassandra.utils.NoSpamLogger; + +import static org.apache.cassandra.repair.autorepair.AutoRepairUtils.RepairTurn.MY_TURN; +import static org.apache.cassandra.repair.autorepair.AutoRepairUtils.RepairTurn.MY_TURN_DUE_TO_PRIORITY; +import static org.apache.cassandra.repair.autorepair.AutoRepairUtils.RepairTurn.NOT_MY_TURN; +import static org.apache.cassandra.repair.autorepair.AutoRepairUtils.RepairTurn.MY_TURN_FORCE_REPAIR; +import static org.apache.cassandra.utils.Clock.Global.currentTimeMillis; + +/** + * This class serves as a utility class for AutoRepair. It contains various helper APIs + * to store/retrieve repair status, decide whose turn is next, etc. + */ +public class AutoRepairUtils +{ + private static final Logger logger = LoggerFactory.getLogger(AutoRepairUtils.class); + static final String COL_REPAIR_TYPE = "repair_type"; + static final String COL_HOST_ID = "host_id"; + static final String COL_REPAIR_START_TS = "repair_start_ts"; + static final String COL_REPAIR_FINISH_TS = "repair_finish_ts"; + static final String COL_REPAIR_PRIORITY = "repair_priority"; + static final String COL_DELETE_HOSTS = "delete_hosts"; // this set stores the host ids which think the row should be deleted + static final String COL_REPAIR_TURN = "repair_turn"; // this record the last repair turn. Normal turn or turn due to priority + static final String COL_DELETE_HOSTS_UPDATE_TIME = "delete_hosts_update_time"; // the time when delete hosts are upated + static final String COL_FORCE_REPAIR = "force_repair"; // if set to true, the node will do non-primary range rapair + + final static String SELECT_REPAIR_HISTORY = String.format( + "SELECT * FROM %s.%s WHERE %s = ?", SchemaConstants.DISTRIBUTED_KEYSPACE_NAME, + SystemDistributedKeyspace.AUTO_REPAIR_HISTORY, COL_REPAIR_TYPE); + final static String SELECT_REPAIR_PRIORITY = String.format( + "SELECT * FROM %s.%s WHERE %s = ?", SchemaConstants.DISTRIBUTED_KEYSPACE_NAME, + SystemDistributedKeyspace.AUTO_REPAIR_PRIORITY, COL_REPAIR_TYPE); + final static String DEL_REPAIR_PRIORITY = String.format( + "DELETE %s[?] FROM %s.%s WHERE %s = ?", COL_REPAIR_PRIORITY, SchemaConstants.DISTRIBUTED_KEYSPACE_NAME, + SystemDistributedKeyspace.AUTO_REPAIR_PRIORITY, COL_REPAIR_TYPE); + final static String ADD_PRIORITY_HOST = String.format( + "UPDATE %s.%s SET %s = %s + ? WHERE %s = ?", SchemaConstants.DISTRIBUTED_KEYSPACE_NAME, + SystemDistributedKeyspace.AUTO_REPAIR_PRIORITY, COL_REPAIR_PRIORITY, COL_REPAIR_PRIORITY, COL_REPAIR_TYPE); + + final static String INSERT_NEW_REPAIR_HISTORY = String.format( + "INSERT INTO %s.%s (%s, %s, %s, %s, %s, %s) values (?, ? ,?, ?, {}, ?) IF NOT EXISTS", + SchemaConstants.DISTRIBUTED_KEYSPACE_NAME, SystemDistributedKeyspace.AUTO_REPAIR_HISTORY, COL_REPAIR_TYPE, + COL_HOST_ID, COL_REPAIR_START_TS, COL_REPAIR_FINISH_TS, COL_DELETE_HOSTS, COL_DELETE_HOSTS_UPDATE_TIME); + + final static String ADD_HOST_ID_TO_DELETE_HOSTS = String.format( + "UPDATE %s.%s SET %s = %s + ?, %s = ? WHERE %s = ? AND %s = ? IF EXISTS" + , SchemaConstants.DISTRIBUTED_KEYSPACE_NAME, SystemDistributedKeyspace.AUTO_REPAIR_HISTORY, COL_DELETE_HOSTS, + COL_DELETE_HOSTS, COL_DELETE_HOSTS_UPDATE_TIME, COL_REPAIR_TYPE, COL_HOST_ID); + + final static String DEL_AUTO_REPAIR_HISTORY = String.format( + "DELETE FROM %s.%s WHERE %s = ? AND %s = ?" + , SchemaConstants.DISTRIBUTED_KEYSPACE_NAME, SystemDistributedKeyspace.AUTO_REPAIR_HISTORY, COL_REPAIR_TYPE, + COL_HOST_ID); + + final static String RECORD_START_REPAIR_HISTORY = String.format( + "UPDATE %s.%s SET %s= ?, repair_turn = ? WHERE %s = ? AND %s = ?" + , SchemaConstants.DISTRIBUTED_KEYSPACE_NAME, SystemDistributedKeyspace.AUTO_REPAIR_HISTORY, COL_REPAIR_START_TS, + COL_REPAIR_TYPE, COL_HOST_ID); + + final static String RECORD_FINISH_REPAIR_HISTORY = String.format( + "UPDATE %s.%s SET %s= ?, %s=false WHERE %s = ? AND %s = ?" + , SchemaConstants.DISTRIBUTED_KEYSPACE_NAME, SystemDistributedKeyspace.AUTO_REPAIR_HISTORY, COL_REPAIR_FINISH_TS, + COL_FORCE_REPAIR, COL_REPAIR_TYPE, COL_HOST_ID); + + final static String CLEAR_DELETE_HOSTS = String.format( + "UPDATE %s.%s SET %s= {} WHERE %s = ? AND %s = ?" + , SchemaConstants.DISTRIBUTED_KEYSPACE_NAME, SystemDistributedKeyspace.AUTO_REPAIR_HISTORY, COL_DELETE_HOSTS, + COL_REPAIR_TYPE, COL_HOST_ID); + + final static String SET_FORCE_REPAIR = String.format( + "UPDATE %s.%s SET %s=true WHERE %s = ? AND %s = ?" + , SchemaConstants.DISTRIBUTED_KEYSPACE_NAME, SystemDistributedKeyspace.AUTO_REPAIR_HISTORY, COL_FORCE_REPAIR, + COL_REPAIR_TYPE, COL_HOST_ID); + + final static String SELECT_LAST_REPAIR_TIME_FOR_NODE = String.format( + "SELECT %s FROM %s.%s WHERE %s = ? AND %s = ?", COL_REPAIR_FINISH_TS, SchemaConstants.DISTRIBUTED_KEYSPACE_NAME, + SystemDistributedKeyspace.AUTO_REPAIR_HISTORY, COL_REPAIR_TYPE, COL_HOST_ID); + + static ModificationStatement delStatementRepairHistory; + static SelectStatement selectStatementRepairHistory; + static ModificationStatement delStatementPriorityStatus; + static SelectStatement selectStatementRepairPriority; + static SelectStatement selectLastRepairTimeForNode; + static ModificationStatement addPriorityHost; + static ModificationStatement insertNewRepairHistoryStatement; + static ModificationStatement recordStartRepairHistoryStatement; + static ModificationStatement recordFinishRepairHistoryStatement; + static ModificationStatement addHostIDToDeleteHostsStatement; + static ModificationStatement clearDeleteHostsStatement; + static ModificationStatement setForceRepairStatement; + static ConsistencyLevel internalQueryCL; + + public enum RepairTurn + { + MY_TURN, + NOT_MY_TURN, + MY_TURN_DUE_TO_PRIORITY, + MY_TURN_FORCE_REPAIR + } + + public static void setup() + { + selectStatementRepairHistory = (SelectStatement) QueryProcessor.getStatement(SELECT_REPAIR_HISTORY, ClientState + .forInternalCalls()); + selectStatementRepairPriority = (SelectStatement) QueryProcessor.getStatement(SELECT_REPAIR_PRIORITY, ClientState + .forInternalCalls()); + selectLastRepairTimeForNode = (SelectStatement) QueryProcessor.getStatement(SELECT_LAST_REPAIR_TIME_FOR_NODE, ClientState + .forInternalCalls()); + delStatementPriorityStatus = (ModificationStatement) QueryProcessor.getStatement(DEL_REPAIR_PRIORITY, ClientState + .forInternalCalls()); + addPriorityHost = (ModificationStatement) QueryProcessor.getStatement(ADD_PRIORITY_HOST, ClientState + .forInternalCalls()); + insertNewRepairHistoryStatement = (ModificationStatement) QueryProcessor.getStatement(INSERT_NEW_REPAIR_HISTORY, ClientState + .forInternalCalls()); + recordStartRepairHistoryStatement = (ModificationStatement) QueryProcessor.getStatement(RECORD_START_REPAIR_HISTORY, ClientState + .forInternalCalls()); + recordFinishRepairHistoryStatement = (ModificationStatement) QueryProcessor.getStatement(RECORD_FINISH_REPAIR_HISTORY, ClientState + .forInternalCalls()); + addHostIDToDeleteHostsStatement = (ModificationStatement) QueryProcessor.getStatement(ADD_HOST_ID_TO_DELETE_HOSTS, ClientState + .forInternalCalls()); + setForceRepairStatement = (ModificationStatement) QueryProcessor.getStatement(SET_FORCE_REPAIR, ClientState + .forInternalCalls()); + clearDeleteHostsStatement = (ModificationStatement) QueryProcessor.getStatement(CLEAR_DELETE_HOSTS, ClientState + .forInternalCalls()); + delStatementRepairHistory = (ModificationStatement) QueryProcessor.getStatement(DEL_AUTO_REPAIR_HISTORY, ClientState + .forInternalCalls()); + Keyspace autoRepairKS = Schema.instance.getKeyspaceInstance(SchemaConstants.DISTRIBUTED_KEYSPACE_NAME); + internalQueryCL = autoRepairKS.getReplicationStrategy().getClass() == NetworkTopologyStrategy.class ? + ConsistencyLevel.LOCAL_QUORUM : ConsistencyLevel.ONE; + } + + public static class AutoRepairHistory + { + UUID hostId; + String repairTurn; + long lastRepairStartTime; + long lastRepairFinishTime; + Set deleteHosts; + long deleteHostsUpdateTime; + boolean forceRepair; + + public AutoRepairHistory(UUID hostId, String repairTurn, long lastRepairStartTime, long lastRepairFinishTime, + Set deleteHosts, long deleteHostsUpateTime, boolean forceRepair) + { + this.hostId = hostId; + this.repairTurn = repairTurn; + this.lastRepairStartTime = lastRepairStartTime; + this.lastRepairFinishTime = lastRepairFinishTime; + this.deleteHosts = deleteHosts; + if (this.deleteHosts == null) + { + this.deleteHosts = new HashSet<>(); + } + this.deleteHostsUpdateTime = deleteHostsUpateTime; + this.forceRepair = forceRepair; + } + + public String toString() + { + return MoreObjects.toStringHelper(this). + add("hostId", hostId). + add("repairTurn", repairTurn). + add("lastRepairStartTime", lastRepairStartTime). + add("lastRepairFinishTime", lastRepairFinishTime). + add("deleteHosts", deleteHosts). + toString(); + } + + public boolean isRepairRunning() + { + // if a repair history record has start time laster than finish time, it means the repair is running + return lastRepairStartTime > lastRepairFinishTime; + } + + public long getLastRepairFinishTime() + { + return lastRepairFinishTime; + } + } + + public static class CurrentRepairStatus + { + public Set hostIdsWithOnGoingRepair; // hosts that is running repair + public Set hostIdsWithOnGoingForceRepair; // hosts that is running repair because of force repair + Set priority; + public AutoRepairHistory myRepairHistory; + List historiesWithoutOnGoingRepair; // hosts that is NOT running repair + + public CurrentRepairStatus(List repairHistories, Set priority, UUID myId) + { + hostIdsWithOnGoingRepair = new HashSet<>(); + hostIdsWithOnGoingForceRepair = new HashSet<>(); + historiesWithoutOnGoingRepair = new ArrayList<>(); + + for (AutoRepairHistory history : repairHistories) + { + if (history.isRepairRunning()) + { + if (history.forceRepair) + { + hostIdsWithOnGoingForceRepair.add(history.hostId); + } + else + { + hostIdsWithOnGoingRepair.add(history.hostId); + } + } + else + { + historiesWithoutOnGoingRepair.add(history); + } + if (history.hostId.equals(myId)) + { + myRepairHistory = history; + } + } + this.priority = priority; + } + + public Set getAllHostsWithOngoingRepair() + { + return Sets.union(hostIdsWithOnGoingRepair, hostIdsWithOnGoingForceRepair); + } + + public String toString() + { + return MoreObjects.toStringHelper(this). + add("hostIdsWithOnGoingRepair", hostIdsWithOnGoingRepair). + add("hostIdsWithOnGoingForceRepair", hostIdsWithOnGoingForceRepair). + add("historiesWithoutOnGoingRepair", historiesWithoutOnGoingRepair). + add("priority", priority). + add("myRepairHistory", myRepairHistory). + toString(); + } + } + + @VisibleForTesting + public static List getAutoRepairHistory(RepairType repairType) + { + UntypedResultSet repairHistoryResult; + + ResultMessage.Rows repairStatusRows = selectStatementRepairHistory.execute(QueryState.forInternalCalls(), + QueryOptions.forInternalCalls(internalQueryCL, Lists.newArrayList(ByteBufferUtil.bytes(repairType.toString()))), Dispatcher.RequestTime.forImmediateExecution()); + repairHistoryResult = UntypedResultSet.create(repairStatusRows.result); + + List repairHistories = new ArrayList<>(); + if (!repairHistoryResult.isEmpty()) + { + for (UntypedResultSet.Row row : repairHistoryResult) + { + UUID hostId = row.getUUID(COL_HOST_ID); + String repairTurn = null; + if (row.has(COL_REPAIR_TURN)) + repairTurn = row.getString(COL_REPAIR_TURN); + long lastRepairStartTime = row.getLong(COL_REPAIR_START_TS, 0); + long lastRepairFinishTime = row.getLong(COL_REPAIR_FINISH_TS, 0); + Set deleteHosts = row.getSet(COL_DELETE_HOSTS, UUIDType.instance); + long deleteHostsUpdateTime = row.getLong(COL_DELETE_HOSTS_UPDATE_TIME, 0); + boolean forceRepair = row.has(COL_FORCE_REPAIR) && row.getBoolean(COL_FORCE_REPAIR); + repairHistories.add(new AutoRepairHistory(hostId, repairTurn, lastRepairStartTime, lastRepairFinishTime, + deleteHosts, deleteHostsUpdateTime, forceRepair)); + } + return repairHistories; + } + logger.info("No repair history found"); + return null; + } + + // A host may add itself in delete hosts for some other hosts due to restart or some temp gossip issue. If a node's record + // delete_hosts is not growing for more than 2 hours, we consider it as a normal node so we clear the delete_hosts for that node + public static void clearDeleteHosts(RepairType repairType, UUID hostId) + { + clearDeleteHostsStatement.execute(QueryState.forInternalCalls(), + QueryOptions.forInternalCalls(internalQueryCL, + Lists.newArrayList(ByteBufferUtil.bytes(repairType.toString()), + ByteBufferUtil.bytes(hostId))), Dispatcher.RequestTime.forImmediateExecution()); + } + + public static void setForceRepairNewNode(RepairType repairType) + { + // this function will be called when a node bootstrap finished + UUID hostId = StorageService.instance.getHostIdForEndpoint(FBUtilities.getBroadcastAddressAndPort()); + // insert the data first + insertNewRepairHistory(repairType, currentTimeMillis(), currentTimeMillis()); + setForceRepair(repairType, hostId); + } + + public static void setForceRepair(RepairType repairType, Set hosts) + { + // this function is used by nodetool + for (InetAddressAndPort host : hosts) + { + UUID hostId = StorageService.instance.getHostIdForEndpoint(host); + setForceRepair(repairType, hostId); + } + } + + public static void setForceRepair(RepairType repairType, UUID hostId) + { + setForceRepairStatement.execute(QueryState.forInternalCalls(), + QueryOptions.forInternalCalls(internalQueryCL, + Lists.newArrayList(ByteBufferUtil.bytes(repairType.toString()), + ByteBufferUtil.bytes(hostId))), + Dispatcher.RequestTime.forImmediateExecution()); + + logger.info("Set force repair repair type: {}, node: {}", repairType, hostId); + } + + public static long getLastRepairTimeForNode(RepairType repairType, UUID hostId) + { + ResultMessage.Rows rows = selectLastRepairTimeForNode.execute(QueryState.forInternalCalls(), + QueryOptions.forInternalCalls(internalQueryCL, + Lists.newArrayList( + ByteBufferUtil.bytes(repairType.toString()), + ByteBufferUtil.bytes(hostId))), + Dispatcher.RequestTime.forImmediateExecution()); + UntypedResultSet repairTime = UntypedResultSet.create(rows.result); + if (repairTime.isEmpty()) + { + return 0; + } + return repairTime.one().getLong(COL_REPAIR_FINISH_TS); + } + + @VisibleForTesting + public static CurrentRepairStatus getCurrentRepairStatus(RepairType repairType, List autoRepairHistories, UUID myId) + { + if (autoRepairHistories != null) + { + return new CurrentRepairStatus(autoRepairHistories, getPriorityHostIds(repairType), myId); + } + return null; + } + + @VisibleForTesting + protected static TreeSet getHostIdsInCurrentRing(RepairType repairType, Collection allNodesInRing) + { + TreeSet hostIdsInCurrentRing = new TreeSet<>(); + for (NodeAddresses node : allNodesInRing) + { + String nodeDC = DatabaseDescriptor.getLocator().location(node.broadcastAddress).datacenter; + if (AutoRepairService.instance.getAutoRepairConfig().getIgnoreDCs(repairType).contains(nodeDC)) + { + logger.info("Ignore node {} because its datacenter is {}", node, nodeDC); + continue; + } + /* + * Check if endpoint state exists in gossip or not. If it + * does not then this maybe a ghost node so ignore it + */ + if (Gossiper.instance.isAlive(node.broadcastAddress)) + { + UUID hostId = StorageService.instance.getHostIdForEndpoint(node.broadcastAddress); + hostIdsInCurrentRing.add(hostId); + } + else + { + logger.warn("Node is not present in Gossip cache node {}, node data center {}", node, nodeDC); + } + } + return hostIdsInCurrentRing; + } + + public static TreeSet getHostIdsInCurrentRing(RepairType repairType) + { + Collection allNodesInRing = ClusterMetadata.current().directory.addresses.values(); + return getHostIdsInCurrentRing(repairType, allNodesInRing); + } + + // This function will return the host ID for the node which has not been repaired for longest time + public static AutoRepairHistory getHostWithLongestUnrepairTime(RepairType repairType) + { + List autoRepairHistories = getAutoRepairHistory(repairType); + return getHostWithLongestUnrepairTime(autoRepairHistories); + } + + /** + * Convenience method to resolve the broadcast address of a host id from {@link ClusterMetadata} + * @return broadcast address if it exists in CMS, otherwise null. + */ + @Nullable + private static InetAddressAndPort getBroadcastAddress(UUID hostId) + { + Directory directory = ClusterMetadata.current().directory; + + NodeId nodeId = directory.nodeIdFromHostId(hostId); + if (nodeId != null) + { + NodeAddresses nodeAddresses = directory.getNodeAddresses(nodeId); + if (nodeAddresses != null) + { + return nodeAddresses.broadcastAddress; + } + } + return null; + } + + /** + * @return Map of broadcast address to host id, if a broadcast address cannot be found for a host, it is + * not included in the map. + */ + private static Map getBroadcastAddressToHostIdMap(Set hosts) + { + // Get a mapping of endpoint : host id + Map broadcastAddressMap = new HashMap<>(hosts.size()); + for (UUID hostId : hosts) + { + InetAddressAndPort broadcastAddress = getBroadcastAddress(hostId); + if (broadcastAddress == null) + { + logger.warn("Could not resolve broadcast address from host id {} in ClusterMetadata can't accurately " + + "determine if this node is a replica of the local node.", hostId); + } + else + { + broadcastAddressMap.put(broadcastAddress, hostId); + } + } + return broadcastAddressMap; + } + + /** + * @return Mapping of unique replication strategy to keyspaces using that strategy that we care about repairing. + */ + private static Map> getReplicationStrategies() + { + // Collect all unique replication strategies among all keyspaces. + Map> replicationStrategies = new HashMap<>(); + for (Keyspace keyspace : Keyspace.all()) + { + if (AutoRepairUtils.shouldConsiderKeyspace(keyspace)) + { + replicationStrategies.computeIfAbsent(keyspace.getReplicationStrategy(), k -> new ArrayList<>()) + .add(keyspace.getName()); + } + } + return replicationStrategies; + } + + /** + * Collects all hosts being repaired among all active repair schedules and their schedule if + * {@link AutoRepairConfig#getAllowParallelReplicaRepairAcrossSchedules(RepairType)} is true for this repairType. + * Accepts the currently evaluated repairType's schedule as an optimization to avoid grabbing its repair status an + * additional time. + * + * @param myRepairType The repair type schedule being evaluated. + * @param myRepairStatus The repair status for that repair type. + * @return All hosts among active schedules currently being repaired. + */ + private static Map getHostsBeingRepaired(RepairType myRepairType, CurrentRepairStatus myRepairStatus) + { + AutoRepairConfig config = AutoRepairService.instance.getAutoRepairConfig(); + + Map hostsBeingRepaired = myRepairStatus.getAllHostsWithOngoingRepair().stream() + .collect(Collectors.toMap((h) -> h, (v) -> myRepairType)); + + // If we don't allow repairing across schedules, iterate over other enabled schedules and include hosts + // actively being repaired. + if (!config.getAllowParallelReplicaRepairAcrossSchedules(myRepairType)) + { + for (RepairType repairType : RepairType.values()) + { + if (myRepairType == repairType) + continue; + + if (config.isAutoRepairEnabled(repairType)) + { + CurrentRepairStatus repairStatus = getCurrentRepairStatus(repairType, getAutoRepairHistory(repairType), null); + if (repairStatus != null) + { + for (UUID hostId : repairStatus.getAllHostsWithOngoingRepair()) + { + hostsBeingRepaired.putIfAbsent(hostId, repairType); + } + } + } + } + } + return hostsBeingRepaired; + } + + /** + * Identifies the most eligible host to repair for nodes preceding or equal to this nodes' lastRepairFinishTime. + * The criteria for this is to find the node with the oldest last repair finish time of which none of its replicas + * are currently under repair. + * @return The most eligible host to repair or null if no candidates before and including this nodes' current repair status. + */ + @VisibleForTesting + public static AutoRepairHistory getMostEligibleHostToRepair(RepairType repairType, CurrentRepairStatus currentRepairStatus, UUID myId) + { + // 0. If this repairType allows parallel replica repair, short circuit and return the host with the longest unrepair time + AutoRepairConfig config = AutoRepairService.instance.getAutoRepairConfig(); + if (config.getAllowParallelReplicaRepair(repairType)) + { + return getHostWithLongestUnrepairTime(currentRepairStatus.historiesWithoutOnGoingRepair); + } + + // 1. Sort repair histories from oldest completed to newest + Stream finishedRepairHistories = currentRepairStatus.historiesWithoutOnGoingRepair + .stream() + .sorted(Comparator.comparingLong(h -> h.lastRepairFinishTime)); + + // 2. Optimization: Truncate repair histories after myId so we don't evaluate anything more recent as if we + // aren't interested in anything that isn't this node. + final AtomicBoolean myHistoryFound = new AtomicBoolean(false); + finishedRepairHistories = finishedRepairHistories.takeWhile((history) -> { + if (myHistoryFound.get()) return false; + + myHistoryFound.set(history.hostId.equals(myId)); + return true; + }); + + // If there are any hosts with ongoing repair, filter the repair histories to not include nodes whose replicas + // are ongoing repair. + Map hostsBeingRepairedToRepairType = getHostsBeingRepaired(repairType, currentRepairStatus); + + // 3. If I am already actively being repaired in another schedule, defer submitting repairs; if already + // repairing for this type, return node so it can take its turn. + RepairType alreadyRepairingType = hostsBeingRepairedToRepairType.get(myId); + if (alreadyRepairingType != null) + { + if (repairType != alreadyRepairingType) + { + logger.info("Deferring repair because I am already actively repairing in schedule {}", hostsBeingRepairedToRepairType.get(myId)); + AutoRepairMetricsManager.getMetrics(repairType).repairDelayedBySchedule.inc(); + return null; + } + else if (currentRepairStatus.myRepairHistory != null) + { + // if the repair type matches this repair, assume the node was restarted while repairing, return node + // so it can take its turn. + logAlreadyMyTurn(); + return currentRepairStatus.myRepairHistory; + } + } + + if (!hostsBeingRepairedToRepairType.isEmpty()) + { + // 4. Extract InetAddresses for each UUID as replicas are identified by their address. + Map hostsBeingRepaired = getBroadcastAddressToHostIdMap(hostsBeingRepairedToRepairType.keySet()); + + // 5. Collect unique replication strategies and group them up with their keyspaces. + Map> replicationStrategies = getReplicationStrategies(); + + // 6. Filter out repair histories who have a replica being repaired, note that this is lazy, given the stream + // is completed using findFirst, it should stop as soon as the matching criteria is met. + finishedRepairHistories = finishedRepairHistories.filter((history) -> !hasReplicaWithOngoingRepair(history, + myId, + repairType, + hostsBeingRepaired, + hostsBeingRepairedToRepairType, + replicationStrategies)); + } + + // 7. Select the first (oldest lastRepairFinishTime) repair history without replicas being repaired + return finishedRepairHistories.findFirst().orElse(null); + } + + + /** + * @return Whether the host for the given eligibleRepairHistory has any replicas in hostsBeingRepaired. + * @param eligibleHistory History of node to check + * @param myId Host id of this node, if the repair history is for this node, additional logging will take place. + * @param myRepairType repair type being evaluated + * @param hostsBeingRepaired Hosts being repaired. + * @param hostIdToRepairType mapping of hosts being repaired to the repair type its being repaired for. + * @param replicationStrategies Mapping of unique replication strategies to keyspaces having that strategy. + */ + private static boolean hasReplicaWithOngoingRepair(AutoRepairHistory eligibleHistory, + UUID myId, + RepairType myRepairType, + Map hostsBeingRepaired, + Map hostIdToRepairType, + Map> replicationStrategies) + { + // If no broadcast address found for this host id in cluster metadata, just skip it, a node should always + // see itself in cluster metadata. + InetAddressAndPort eligibleBroadcastAddress = getBroadcastAddress(eligibleHistory.hostId); + if (eligibleBroadcastAddress == null) + { + return true; + } + + // For each replication strategy, determine if host being repaired is a replica of the local node. + for (Map.Entry> entry : replicationStrategies.entrySet()) + { + AbstractReplicationStrategy replicationStrategy = entry.getKey(); + EndpointsByRange endpointsByRange = replicationStrategy.getRangeAddresses(ClusterMetadata.current()); + + // get ranges of the eligible address for the given replication strategy. + RangesAtEndpoint rangesAtEndpoint = StorageService.instance.getReplicas(replicationStrategy, eligibleBroadcastAddress); + for (Replica replica : rangesAtEndpoint) + { + // get the endpoints involved in this range. + EndpointsForRange endpointsForRange = endpointsByRange.get(replica.range()); + // For each host in this range... + for (InetAddressAndPort inetAddressAndPort : endpointsForRange.endpoints()) + { + // If the address of the node in the range belongs to a host being repaired, return true. + UUID hostId = hostsBeingRepaired.get(inetAddressAndPort); + if (hostId != null) + { + // log if the repair history matches the current running node. + InetAddressAndPort myBroadcastAddress = getBroadcastAddress(myId); + if (myBroadcastAddress != null && myBroadcastAddress.equals(eligibleBroadcastAddress)) + { + logger.info("Deferring repair because replica {} ({}) with shared ranges for " + + "{} keyspace(s) (e.g. {}) is currently taking its turn for schedule {}", + hostId, inetAddressAndPort, entry.getValue().size(), entry.getValue().get(0), + hostIdToRepairType.get(hostId)); + AutoRepairMetricsManager.getMetrics(myRepairType).repairDelayedByReplica.inc(); + } + else if (logger.isDebugEnabled()) + { + logger.debug("Not considering node {} ({}) for repair as it has replica {} ({}) with " + + "shared ranges for {} keyspace(s) (e.g. {}) which is currently taking its " + + "turn for schedule {}", + eligibleHistory.hostId, eligibleBroadcastAddress, + hostId, inetAddressAndPort, entry.getValue().size(), entry.getValue().get(0), + hostIdToRepairType.get(hostId)); + + } + return true; + } + } + } + } + + // No replicas found of eligible host. + return false; + } + + private static AutoRepairHistory getHostWithLongestUnrepairTime(List autoRepairHistories) + { + if (autoRepairHistories == null) + { + return null; + } + AutoRepairHistory rst = null; + long oldestTimestamp = Long.MAX_VALUE; + for (AutoRepairHistory autoRepairHistory : autoRepairHistories) + { + if (autoRepairHistory.lastRepairFinishTime < oldestTimestamp) + { + rst = autoRepairHistory; + oldestTimestamp = autoRepairHistory.lastRepairFinishTime; + } + } + return rst; + } + + public static int getMaxNumberOfNodeRunAutoRepair(RepairType repairType, int groupSize) + { + AutoRepairConfig config = AutoRepairService.instance.getAutoRepairConfig(); + if (groupSize == 0) + { + return Math.max(config.getParallelRepairCount(repairType), 1); + } + // we will use the max number from config between auto_repair_parallel_repair_count_in_group and auto_repair_parallel_repair_percentage_in_group + int value = Math.max(groupSize * config.getParallelRepairPercentage(repairType) / 100, + config.getParallelRepairCount(repairType)); + // make sure at least one node getting repaired + return Math.max(1, value); + } + + private static void logAlreadyMyTurn() + { + logger.warn("This node already was considered to having an ongoing repair for this repair type, must have " + + "been restarted, taking my turn back"); + } + + @VisibleForTesting + public static RepairTurn myTurnToRunRepair(RepairType repairType, UUID myId) + { + try + { + Collection allNodesInRing = ClusterMetadata.current().directory.addresses.values(); + logger.info("Total nodes in ring {}", allNodesInRing.size()); + TreeSet hostIdsInCurrentRing = getHostIdsInCurrentRing(repairType, allNodesInRing); + logger.info("Total nodes qualified for repair {}", hostIdsInCurrentRing.size()); + + List autoRepairHistories = getAutoRepairHistory(repairType); + Set autoRepairHistoryIds = new HashSet<>(); + + // 1. Remove any node that is not part of group based on gossip info + if (autoRepairHistories != null) + { + for (AutoRepairHistory nodeHistory : autoRepairHistories) + { + autoRepairHistoryIds.add(nodeHistory.hostId); + // clear delete_hosts if the node's delete hosts is not growing for more than two hours + AutoRepairConfig config = AutoRepairService.instance.getAutoRepairConfig(); + if (!nodeHistory.deleteHosts.isEmpty() + && config.getAutoRepairHistoryClearDeleteHostsBufferInterval().toSeconds() < TimeUnit.MILLISECONDS.toSeconds( + currentTimeMillis() - nodeHistory.deleteHostsUpdateTime + )) + { + clearDeleteHosts(repairType, nodeHistory.hostId); + logger.info("Delete hosts for {} for repair type {} has not been updated for more than {} seconds. Delete hosts has been cleared. Delete hosts before clear {}" + , nodeHistory.hostId, repairType, config.getAutoRepairHistoryClearDeleteHostsBufferInterval(), nodeHistory.deleteHosts); + } + else if (!hostIdsInCurrentRing.contains(nodeHistory.hostId)) + { + if (nodeHistory.deleteHosts.size() > Math.max(2, hostIdsInCurrentRing.size() * 0.5)) + { + // More than half of the groups thinks the record should be deleted + logger.info("{} think {} is orphan node, will delete auto repair history for repair type {}.", nodeHistory.deleteHosts, nodeHistory.hostId, repairType); + deleteAutoRepairHistory(repairType, nodeHistory.hostId); + } + else + { + // I think this host should be deleted + logger.info("I({}) think {} is not part of ring, vote to delete it for repair type {}.", myId, nodeHistory.hostId, repairType); + addHostIdToDeleteHosts(repairType, myId, nodeHistory.hostId); + } + } + } + } + + // 2. Add node to auto repair history table if a node is in gossip info + for (UUID hostId : hostIdsInCurrentRing) + { + if (!autoRepairHistoryIds.contains(hostId)) + { + logger.info("{} for repair type {} doesn't exist in the auto repair history table, insert a new record.", repairType, hostId); + insertNewRepairHistory(repairType, hostId, currentTimeMillis(), currentTimeMillis()); + } + } + + // get updated current repair status + CurrentRepairStatus currentRepairStatus = getCurrentRepairStatus(repairType, getAutoRepairHistory(repairType), myId); + if (currentRepairStatus != null) + { + if (logger.isDebugEnabled()) + { + logger.debug("Latest repair status {}", currentRepairStatus); + } + //check if I am forced to run repair + for (AutoRepairHistory history : currentRepairStatus.historiesWithoutOnGoingRepair) + { + if (history.forceRepair && history.hostId.equals(myId)) + { + return MY_TURN_FORCE_REPAIR; + } + } + } + + // check if node was already indicated as having an ongoing repair, this may happen when a node restarts + // before finishing repairing. + if (currentRepairStatus != null && currentRepairStatus.getAllHostsWithOngoingRepair().contains(myId)) + { + logAlreadyMyTurn(); + + // use the previously chosen turn. + if (currentRepairStatus.myRepairHistory != null && currentRepairStatus.myRepairHistory.repairTurn != null) + { + return RepairTurn.valueOf(currentRepairStatus.myRepairHistory.repairTurn); + } + else + { + return MY_TURN; + } + } + + int parallelRepairNumber = getMaxNumberOfNodeRunAutoRepair(repairType, + autoRepairHistories == null ? 0 : autoRepairHistories.size()); + logger.info("Will run repairs concurrently on {} node(s)", parallelRepairNumber); + if (currentRepairStatus == null || parallelRepairNumber > currentRepairStatus.hostIdsWithOnGoingRepair.size()) + { + // more repairs can be run, I might be the new one + if (autoRepairHistories != null) + { + logger.info("Auto repair history table has {} records", autoRepairHistories.size()); + } + else + { + // try to fetch again + autoRepairHistories = getAutoRepairHistory(repairType); + if (autoRepairHistories == null) + { + logger.error("No record found"); + return NOT_MY_TURN; + } + + currentRepairStatus = getCurrentRepairStatus(repairType, autoRepairHistories, myId); + } + + UUID priorityHostId = null; + if (currentRepairStatus.priority != null) + { + for (UUID priorityID : currentRepairStatus.priority) + { + // remove ids doesn't belong to this ring + if (!hostIdsInCurrentRing.contains(priorityID)) + { + logger.info("{} is not part of the current ring, will be removed from priority list.", priorityID); + removePriorityStatus(repairType, priorityID); + } + else + { + priorityHostId = priorityID; + break; + } + } + } + + if (priorityHostId != null && !myId.equals(priorityHostId)) + { + logger.info("Priority list is not empty and I'm not the first node in the list, not my turn." + + "First node in priority list is {}", getBroadcastAddress(priorityHostId)); + return NOT_MY_TURN; + } + + if (myId.equals(priorityHostId)) + { + //I have a priority for repair hence its my turn now + return MY_TURN_DUE_TO_PRIORITY; + } + + // Determine if this node is the most eligible host to repair. + AutoRepairHistory nodeToBeRepaired = getMostEligibleHostToRepair(repairType, currentRepairStatus, myId); + if (nodeToBeRepaired != null) + { + if (nodeToBeRepaired.hostId.equals(myId)) + { + logger.info("This node is selected to be repaired for repair type {}", repairType); + return MY_TURN; + } + + // log which node is next, which is helpful for debugging + logger.info("Next node to be repaired for repair type {}: {} ({})", repairType, + getBroadcastAddress(nodeToBeRepaired.hostId), + nodeToBeRepaired); + } + + // If this node is not identified as most eligible, set the repair lag time. + if (currentRepairStatus.myRepairHistory != null) + { + AutoRepairMetricsManager.getMetrics(repairType) + .recordRepairStartLag(currentRepairStatus.myRepairHistory.lastRepairFinishTime); + } + } + else if (currentRepairStatus.hostIdsWithOnGoingForceRepair.contains(myId)) + { + return MY_TURN_FORCE_REPAIR; + } + // for some reason I was not done with the repair hence resume (maybe node restart in-between, etc.) + return currentRepairStatus.hostIdsWithOnGoingRepair.contains(myId) ? MY_TURN : NOT_MY_TURN; + } + catch (Exception e) + { + logger.error("Exception while deciding node's turn:", e); + } + return NOT_MY_TURN; + } + + static void deleteAutoRepairHistory(RepairType repairType, UUID hostId) + { + //delete the given hostId + delStatementRepairHistory.execute(QueryState.forInternalCalls(), + QueryOptions.forInternalCalls(internalQueryCL, + Lists.newArrayList(ByteBufferUtil.bytes(repairType.toString()), + ByteBufferUtil.bytes(hostId))), Dispatcher.RequestTime.forImmediateExecution()); + } + + static void updateStartAutoRepairHistory(RepairType repairType, UUID myId, long timestamp, RepairTurn turn) + { + recordStartRepairHistoryStatement.execute(QueryState.forInternalCalls(), + QueryOptions.forInternalCalls(internalQueryCL, + Lists.newArrayList(ByteBufferUtil.bytes(timestamp), + ByteBufferUtil.bytes(turn.name()), + ByteBufferUtil.bytes(repairType.toString()), + ByteBufferUtil.bytes(myId) + )), Dispatcher.RequestTime.forImmediateExecution()); + } + + static void updateFinishAutoRepairHistory(RepairType repairType, UUID myId, long timestamp) + { + recordFinishRepairHistoryStatement.execute(QueryState.forInternalCalls(), + QueryOptions.forInternalCalls(internalQueryCL, + Lists.newArrayList(ByteBufferUtil.bytes(timestamp), + ByteBufferUtil.bytes(repairType.toString()), + ByteBufferUtil.bytes(myId) + )), Dispatcher.RequestTime.forImmediateExecution()); + logger.info("Auto repair finished for {}", myId); + } + + public static void insertNewRepairHistory(RepairType repairType, UUID hostId, long startTime, long finishTime) + { + try + { + Keyspace autoRepairKS = Schema.instance.getKeyspaceInstance(SchemaConstants.DISTRIBUTED_KEYSPACE_NAME); + ConsistencyLevel cl = autoRepairKS.getReplicationStrategy().getClass() == NetworkTopologyStrategy.class ? + ConsistencyLevel.LOCAL_SERIAL : null; + + UntypedResultSet resultSet; + ResultMessage.Rows resultMessage = (ResultMessage.Rows) insertNewRepairHistoryStatement.execute( + QueryState.forInternalCalls(), QueryOptions.create(internalQueryCL, Lists.newArrayList( + ByteBufferUtil.bytes(repairType.toString()), + ByteBufferUtil.bytes(hostId), + ByteBufferUtil.bytes(startTime), + ByteBufferUtil.bytes(finishTime), + ByteBufferUtil.bytes(currentTimeMillis()) + ), false, -1, null, cl, ProtocolVersion.CURRENT, SchemaConstants.DISTRIBUTED_KEYSPACE_NAME), + Dispatcher.RequestTime.forImmediateExecution()); + resultSet = UntypedResultSet.create(resultMessage.result); + boolean applied = resultSet.one().getBoolean(ModificationStatement.CAS_RESULT_COLUMN.toString()); + if (applied) + { + logger.info("Successfully inserted a new auto repair history record for host id: {}", hostId); + } + else + { + logger.info("Record exists, no need to insert again for host id: {}", hostId); + } + } + catch (Exception e) + { + logger.error("Exception in inserting new repair history:", e); + } + } + + public static void insertNewRepairHistory(RepairType repairType, long startTime, long finishTime) + { + UUID hostId = StorageService.instance.getHostIdForEndpoint(FBUtilities.getBroadcastAddressAndPort()); + insertNewRepairHistory(repairType, hostId, startTime, finishTime); + } + + public static void addHostIdToDeleteHosts(RepairType repairType, UUID myID, UUID hostToBeDeleted) + { + SetSerializer serializer = SetSerializer.getInstance(UUIDSerializer.instance, UTF8Type.instance.comparatorSet); + addHostIDToDeleteHostsStatement.execute(QueryState.forInternalCalls(), + QueryOptions.forInternalCalls(internalQueryCL, + Lists.newArrayList(serializer.serialize(new HashSet<>(Arrays.asList(myID))), + ByteBufferUtil.bytes(currentTimeMillis()), + ByteBufferUtil.bytes(repairType.toString()), + ByteBufferUtil.bytes(hostToBeDeleted) + )), Dispatcher.RequestTime.forImmediateExecution()); + } + + public static void addPriorityHosts(RepairType repairType, Set hosts) + { + Set hostIds = new HashSet<>(); + for (InetAddressAndPort host : hosts) + { + //find hostId from IP address + UUID hostId = ClusterMetadata.current().directory.hostId(ClusterMetadata.current().directory.peerId(host)); + hostIds.add(hostId); + if (hostId != null) + { + logger.info("Add host {} to the priority list", hostId); + } + } + if (!hostIds.isEmpty()) + { + SetSerializer serializer = SetSerializer.getInstance(UUIDSerializer.instance, UTF8Type.instance.comparatorSet); + addPriorityHost.execute(QueryState.forInternalCalls(), + QueryOptions.forInternalCalls(internalQueryCL, + Lists.newArrayList(serializer.serialize(hostIds), + ByteBufferUtil.bytes(repairType.toString()))), + Dispatcher.RequestTime.forImmediateExecution()); + } + } + + static void removePriorityStatus(RepairType repairType, UUID hostId) + { + logger.info("Remove host {} from priority list", hostId); + delStatementPriorityStatus.execute(QueryState.forInternalCalls(), + QueryOptions.forInternalCalls(internalQueryCL, + Lists.newArrayList(ByteBufferUtil.bytes(hostId), + ByteBufferUtil.bytes(repairType.toString()))), + Dispatcher.RequestTime.forImmediateExecution()); + } + + public static Set getPriorityHostIds(RepairType repairType) + { + UntypedResultSet repairPriorityResult; + + ResultMessage.Rows repairPriorityRows = selectStatementRepairPriority.execute(QueryState.forInternalCalls(), + QueryOptions.forInternalCalls(internalQueryCL, Lists.newArrayList(ByteBufferUtil.bytes(repairType.toString()))), Dispatcher.RequestTime.forImmediateExecution()); + repairPriorityResult = UntypedResultSet.create(repairPriorityRows.result); + + Set priorities = null; + if (!repairPriorityResult.isEmpty()) + { + // there should be only one row + UntypedResultSet.Row row = repairPriorityResult.one(); + priorities = row.getSet(COL_REPAIR_PRIORITY, UUIDType.instance); + } + if (priorities != null) + { + return priorities; + } + return Collections.emptySet(); + } + + public static Set getPriorityHosts(RepairType repairType) + { + Set hosts = new HashSet<>(); + for (UUID hostId : getPriorityHostIds(repairType)) + { + InetAddressAndPort broadcastAddress = getBroadcastAddress(hostId); + if (broadcastAddress == null) + { + logger.warn("Could not resolve broadcastAddress for {}, skipping considering it as a priority host", hostId); + continue; + } + hosts.add(broadcastAddress); + } + return hosts; + } + + public static boolean shouldConsiderKeyspace(Keyspace ks) + { + AbstractReplicationStrategy replicationStrategy = ks.getReplicationStrategy(); + boolean repair = true; + if (replicationStrategy instanceof NetworkTopologyStrategy) + { + Set datacenters = ((NetworkTopologyStrategy) replicationStrategy).getDatacenters(); + String localDC = DatabaseDescriptor.getLocator().local().datacenter; + if (!datacenters.contains(localDC)) + { + repair = false; + } + } + if (replicationStrategy instanceof LocalStrategy || replicationStrategy instanceof MetaStrategy) + { + repair = false; + } + if (ks.getName().equalsIgnoreCase(SchemaConstants.TRACE_KEYSPACE_NAME)) + { + // by default, ignore the tables under system_traces as they do not have + // that much important data + repair = false; + } + return repair; + } + + public static boolean tableMaxRepairTimeExceeded(RepairType repairType, long startTime) + { + long tableRepairTimeSoFar = TimeUnit.MILLISECONDS.toSeconds + (currentTimeMillis() - startTime); + return AutoRepairService.instance.getAutoRepairConfig().getAutoRepairTableMaxRepairTime(repairType).toSeconds() < + tableRepairTimeSoFar; + } + + public static boolean keyspaceMaxRepairTimeExceeded(RepairType repairType, long startTime, int numOfTablesToBeRepaired) + { + long keyspaceRepairTimeSoFar = TimeUnit.MILLISECONDS.toSeconds((currentTimeMillis() - startTime)); + return (long) AutoRepairService.instance.getAutoRepairConfig().getAutoRepairTableMaxRepairTime(repairType).toSeconds() * + numOfTablesToBeRepaired < keyspaceRepairTimeSoFar; + } + + public static List getAllMVs(RepairType repairType, Keyspace keyspace, TableMetadata tableMetadata) + { + List allMvs = new ArrayList<>(); + if (AutoRepairService.instance.getAutoRepairConfig().getMaterializedViewRepairEnabled(repairType) && keyspace.getMetadata().views != null) + { + Iterator views = keyspace.getMetadata().views.forTable(tableMetadata.id).iterator(); + while (views.hasNext()) + { + String viewName = views.next().name(); + logger.info("Adding MV to the list {}.{}.{}", keyspace.getName(), tableMetadata.name, viewName); + allMvs.add(viewName); + } + } + return allMvs; + } + + public static void runRepairOnNewlyBootstrappedNodeIfEnabled() + { + AutoRepairConfig repairConfig = DatabaseDescriptor.getAutoRepairConfig(); + if (repairConfig.isAutoRepairSchedulingEnabled()) + { + for (AutoRepairConfig.RepairType rType : AutoRepairConfig.RepairType.values()) + if (repairConfig.isAutoRepairEnabled(rType) && repairConfig.getForceRepairNewNode(rType)) + AutoRepairUtils.setForceRepairNewNode(rType); + } + } + + public static Collection> split(Range tokenRange, int numberOfSplits) + { + Collection> ranges; + Optional splitter = DatabaseDescriptor.getPartitioner().splitter(); + if (splitter.isEmpty()) + { + NoSpamLogger.log(logger, NoSpamLogger.Level.WARN, 30, TimeUnit.MINUTES, "Partitioner {} does not support splitting, falling back to splitting by token range", DatabaseDescriptor.getPartitioner()); + ranges = Collections.singleton(tokenRange); + } + else + { + ranges = splitter.get().split(Collections.singleton(tokenRange), numberOfSplits); + } + return ranges; + } +} diff --git a/src/java/org/apache/cassandra/repair/autorepair/FixedSplitTokenRangeSplitter.java b/src/java/org/apache/cassandra/repair/autorepair/FixedSplitTokenRangeSplitter.java new file mode 100644 index 000000000000..a6dddb3060bb --- /dev/null +++ b/src/java/org/apache/cassandra/repair/autorepair/FixedSplitTokenRangeSplitter.java @@ -0,0 +1,156 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.repair.autorepair; + + +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; +import java.util.Map; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.cassandra.service.AutoRepairService; + +import org.apache.cassandra.dht.Range; +import org.apache.cassandra.dht.Token; +import org.apache.cassandra.service.StorageService; +import org.apache.cassandra.tcm.compatibility.TokenRingUtils; +import org.apache.cassandra.utils.FBUtilities; + +import static org.apache.cassandra.repair.autorepair.AutoRepairUtils.split; + +/** + * An implementation that splits token ranges into a fixed number of subranges. + */ +public class FixedSplitTokenRangeSplitter implements IAutoRepairTokenRangeSplitter +{ + private static final Logger logger = LoggerFactory.getLogger(FixedSplitTokenRangeSplitter.class); + + /** + * Selecting the default value is tricky. If we select a small number, individual repairs would be heavy. + * On the other hand, if we select a large number, too many repair sessions would be created. + *

+ * If vnodes are configured using num_tokens, attempts to evenly subdivide subranges by each range + * using the following formula: + *

+ * Math.max(1, numberOfSubranges / tokens.size()) + *

+ * To maintain balance, 32 serves as a good default that accommodates both vnodes and non-vnodes effectively. + */ + public static final int DEFAULT_NUMBER_OF_SUBRANGES = 32; + + /** + * Number of evenly split subranges to create for each node that repair runs for. + *

+ * If vnodes are configured using num_tokens, attempts to evenly subdivide subranges by each range. + * For example, for num_tokens: 16 and number_of_subranges: 32, 2 (32/16) + * repair assignments will be created for each token range. At least one repair assignment will be + * created for each token range. + */ + static final String NUMBER_OF_SUBRANGES = "number_of_subranges"; + + private final AutoRepairConfig.RepairType repairType; + private int numberOfSubranges; + + public FixedSplitTokenRangeSplitter(AutoRepairConfig.RepairType repairType, Map parameters) + { + this.repairType = repairType; + + numberOfSubranges = Integer.parseInt(parameters.getOrDefault(NUMBER_OF_SUBRANGES, Integer.toString(DEFAULT_NUMBER_OF_SUBRANGES))); + } + + @Override + public Iterator getRepairAssignments(boolean primaryRangeOnly, List repairPlans) + { + return new RepairAssignmentIterator(repairPlans) + { + @Override + protected KeyspaceRepairAssignments next(int priority, KeyspaceRepairPlan repairPlan) + { + return getRepairAssignmentsForKeyspace(primaryRangeOnly, priority, repairPlan); + } + }; + } + + private KeyspaceRepairAssignments getRepairAssignmentsForKeyspace(boolean primaryRangeOnly, int priority, KeyspaceRepairPlan repairPlan) + { + AutoRepairConfig config = AutoRepairService.instance.getAutoRepairConfig(); + List repairAssignments = new ArrayList<>(); + String keyspaceName = repairPlan.getKeyspaceName(); + List tableNames = repairPlan.getTableNames(); + + Collection> tokens = TokenRingUtils.getPrimaryRangesForEndpoint(keyspaceName, FBUtilities.getBroadcastAddressAndPort()); + if (!primaryRangeOnly) + { + // if we need to repair non-primary token ranges, then change the tokens accordingly + tokens = StorageService.instance.getLocalReplicas(keyspaceName).onlyFull().ranges(); + } + + boolean byKeyspace = config.getRepairByKeyspace(repairType); + // collect all token ranges. + List> allRanges = new ArrayList<>(); + // this is done to avoid micro splits in the case of vnodes + int splitsPerRange = Math.max(1, numberOfSubranges / tokens.size()); + for (Range token : tokens) + { + allRanges.addAll(split(token, splitsPerRange)); + } + + if (byKeyspace) + { + for (Range splitRange : allRanges) + { + // add repair assignment for each range entire keyspace's tables + repairAssignments.add(new RepairAssignment(splitRange, keyspaceName, tableNames)); + } + } + else + { + // add repair assignment per table + for (String tableName : tableNames) + { + for (Range splitRange : allRanges) + { + repairAssignments.add(new RepairAssignment(splitRange, keyspaceName, Collections.singletonList(tableName))); + } + } + } + return new KeyspaceRepairAssignments(priority, keyspaceName, repairAssignments); + } + + @Override + public void setParameter(String key, String value) + { + if (!key.equals(NUMBER_OF_SUBRANGES)) + { + throw new IllegalArgumentException("Unexpected parameter '" + key + "', must be " + NUMBER_OF_SUBRANGES); + } + logger.info("Setting {} to {} for repair type {}", key, value, repairType); + this.numberOfSubranges = Integer.parseInt(value); + } + + @Override + public Map getParameters() + { + return Collections.singletonMap(NUMBER_OF_SUBRANGES, Integer.toString(numberOfSubranges)); + } +} diff --git a/src/java/org/apache/cassandra/repair/autorepair/IAutoRepairTokenRangeSplitter.java b/src/java/org/apache/cassandra/repair/autorepair/IAutoRepairTokenRangeSplitter.java new file mode 100644 index 000000000000..8b82eac296db --- /dev/null +++ b/src/java/org/apache/cassandra/repair/autorepair/IAutoRepairTokenRangeSplitter.java @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.repair.autorepair; + +import java.util.Collections; +import java.util.Iterator; +import java.util.List; +import java.util.Map; + +import org.apache.cassandra.config.ParameterizedClass; + +/** + * Interface that defines how to generate {@link KeyspaceRepairAssignments}. + *

+ * The default is {@link RepairTokenRangeSplitter} which aims to provide sensible defaults for all repair types. + *

+ * Custom implementations class should require a constructor accepting + * ({@link AutoRepairConfig.RepairType}, {@link java.util.Map}) with the {@link java.util.Map} parameter accepting + * custom configuration for your splitter. If such a constructor does not exist, + * {@link AutoRepairConfig#newAutoRepairTokenRangeSplitter(AutoRepairConfig.RepairType, ParameterizedClass)} + * will fall back on invoking a default zero argument constructor. + */ +public interface IAutoRepairTokenRangeSplitter +{ + /** + * Split the token range you wish to repair into multiple assignments. + * The autorepair framework will repair the assignments from returned subrange iterator in the sequence it's + * provided. + * @param primaryRangeOnly Whether to repair only this node's primary ranges or all of its ranges. + * @param repairPlans A list of ordered prioritized repair plans to generate assignments for in order. + * @return iterator of repair assignments, with each element representing a grouping of repair assignments for a given keyspace. + * The iterator is traversed lazily {@link KeyspaceRepairAssignments} at a time with the intent to try to get the + * most up-to-date representation of your data (e.g. how much data exists and is unrepaired at a given time). + */ + Iterator getRepairAssignments(boolean primaryRangeOnly, List repairPlans); + + /** + * Update a configuration parameter. This is meant to be used by nodetool setautorepairconfig to + * update configuration dynamically. + * @param key parameter to update + * @param value The value to set to. + */ + default void setParameter(String key, String value) + { + throw new IllegalArgumentException(this.getClass().getName() + " does not support custom configuration"); + } + + /** + * @return custom configuration. This is meant to be used by nodetool getautorepairconfig for + * retrieving the splitter configuration. + */ + default Map getParameters() + { + return Collections.emptyMap(); + } +} diff --git a/src/java/org/apache/cassandra/repair/autorepair/KeyspaceRepairAssignments.java b/src/java/org/apache/cassandra/repair/autorepair/KeyspaceRepairAssignments.java new file mode 100644 index 000000000000..3ea91e9922f9 --- /dev/null +++ b/src/java/org/apache/cassandra/repair/autorepair/KeyspaceRepairAssignments.java @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.repair.autorepair; + +import java.util.List; + +/** + * A grouping of repair assignments that were generated for a particular keyspace for a given priority. + */ +public class KeyspaceRepairAssignments +{ + private final int priority; + private final String keyspaceName; + private final List repairAssignments; + + public KeyspaceRepairAssignments(int priority, String keyspaceName, List repairAssignments) + { + this.priority = priority; + this.keyspaceName = keyspaceName; + this.repairAssignments = repairAssignments; + } + + public int getPriority() + { + return priority; + } + + public String getKeyspaceName() + { + return keyspaceName; + } + + public List getRepairAssignments() + { + return repairAssignments; + } +} diff --git a/src/java/org/apache/cassandra/repair/autorepair/KeyspaceRepairPlan.java b/src/java/org/apache/cassandra/repair/autorepair/KeyspaceRepairPlan.java new file mode 100644 index 000000000000..3c13e3d80d08 --- /dev/null +++ b/src/java/org/apache/cassandra/repair/autorepair/KeyspaceRepairPlan.java @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.repair.autorepair; + +import java.util.List; +import java.util.Objects; + +/** + * Encapsulates an intent to repair the given keyspace's tables + */ +public class KeyspaceRepairPlan +{ + private final String keyspaceName; + + private final List tableNames; + + public KeyspaceRepairPlan(String keyspaceName, List tableNames) + { + this.keyspaceName = keyspaceName; + this.tableNames = tableNames; + } + + public String getKeyspaceName() + { + return keyspaceName; + } + + public List getTableNames() + { + return tableNames; + } + + @Override + public boolean equals(Object o) + { + if (o == null || getClass() != o.getClass()) return false; + KeyspaceRepairPlan that = (KeyspaceRepairPlan) o; + return Objects.equals(keyspaceName, that.keyspaceName) && Objects.equals(tableNames, that.tableNames); + } + + @Override + public int hashCode() + { + return Objects.hash(keyspaceName, tableNames); + } + + @Override + public String toString() + { + return "KeyspaceRepairPlan{" + + "keyspaceName='" + keyspaceName + '\'' + + ", tableNames=" + tableNames + + '}'; + } +} diff --git a/src/java/org/apache/cassandra/repair/autorepair/PrioritizedRepairPlan.java b/src/java/org/apache/cassandra/repair/autorepair/PrioritizedRepairPlan.java new file mode 100644 index 000000000000..fbedb71b7c3c --- /dev/null +++ b/src/java/org/apache/cassandra/repair/autorepair/PrioritizedRepairPlan.java @@ -0,0 +1,160 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.repair.autorepair; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Comparator; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.TreeSet; +import java.util.function.Consumer; + +import org.apache.cassandra.db.ColumnFamilyStore; + +/** + * Encapsulates a devised plan to repair tables, grouped by their keyspace and a given priority. This is used + * by {@link AutoRepair} to pass in an organized plan to + * {@link IAutoRepairTokenRangeSplitter#getRepairAssignments(boolean, List)} which + * can iterate over this plan in order to generate {@link RepairAssignment}s. + */ +public class PrioritizedRepairPlan +{ + private final int priority; + + private final List keyspaceRepairPlans; + + public PrioritizedRepairPlan(int priority, List keyspaceRepairPlans) + { + this.priority = priority; + this.keyspaceRepairPlans = keyspaceRepairPlans; + } + + public int getPriority() + { + return priority; + } + + public List getKeyspaceRepairPlans() + { + return keyspaceRepairPlans; + } + + @Override + public boolean equals(Object o) + { + if (o == null || getClass() != o.getClass()) return false; + PrioritizedRepairPlan that = (PrioritizedRepairPlan) o; + return priority == that.priority && Objects.equals(keyspaceRepairPlans, that.keyspaceRepairPlans); + } + + @Override + public int hashCode() + { + return Objects.hash(priority, keyspaceRepairPlans); + } + + @Override + public String toString() + { + return "PrioritizedRepairPlan{" + + "priority=" + priority + + ", keyspaceRepairPlans=" + keyspaceRepairPlans + + '}'; + } + + /** + * Builds a list of {@link PrioritizedRepairPlan}s for the given keyspace and table map, ordered by priority from + * highest to lowest, where priority is derived from table schema's defined priority for the given repair type. + *

+ * If a keyspace has tables with differing priorities, those tables will be included in the PrioritizedRepairPlan + * for their given priority. + * + * @param keyspacesToTableNames A mapping keyspace to table names + * @param repairType The repair type that is being executed + * @param orderFunc A function to order keyspace and tables in the returned plan. + * @return Ordered list of plan's by table priorities. + */ + public static List build(Map> keyspacesToTableNames, AutoRepairConfig.RepairType repairType, Consumer> orderFunc) + { + // Build a map of priority -> (keyspace -> tables) + Map>> plans = new HashMap<>(); + for (Map.Entry> keyspaceToTableNames : keyspacesToTableNames.entrySet()) + { + String keyspaceName = keyspaceToTableNames.getKey(); + for (String tableName : keyspaceToTableNames.getValue()) + { + int priority = getPriority(repairType, keyspaceName, tableName); + Map> keyspacesForPriority = plans.computeIfAbsent(priority, p -> new HashMap<>()); + List tableNamesAtPriority = keyspacesForPriority.computeIfAbsent(keyspaceName, k -> new ArrayList<>()); + tableNamesAtPriority.add(tableName); + } + } + + // Extract map into a List ordered by priority from highest to lowest. + List planList = new ArrayList<>(plans.size()); + TreeSet priorities = new TreeSet<>(Comparator.reverseOrder()); + priorities.addAll(plans.keySet()); + for (int priority : priorities) + { + Map> keyspacesAndTables = plans.get(priority); + List keyspaceRepairPlans = new ArrayList<>(keyspacesAndTables.size()); + planList.add(new PrioritizedRepairPlan(priority, keyspaceRepairPlans)); + + // Order keyspace and table names based on the input function (typically, this would shuffle the keyspace + // and table names randomly). + List keyspaceNames = new ArrayList<>(keyspacesAndTables.keySet()); + orderFunc.accept(keyspaceNames); + + for(String keyspaceName : keyspaceNames) + { + List tableNames = keyspacesAndTables.get(keyspaceName); + orderFunc.accept(tableNames); + KeyspaceRepairPlan keyspaceRepairPlan = new KeyspaceRepairPlan(keyspaceName, new ArrayList<>(tableNames)); + keyspaceRepairPlans.add(keyspaceRepairPlan); + } + } + + return planList; + } + + /** + * Convenience method to build a repair plan for a single keyspace with tables. Primarily useful in testing. + * @param keyspaceName Keyspace to repair + * @param tableNames tables to repair for the given keyspace. + * @return Single repair plan. + */ + static List buildSingleKeyspacePlan(AutoRepairConfig.RepairType repairType, String keyspaceName, String ... tableNames) + { + Map> keyspaceMap = new HashMap<>(); + keyspaceMap.put(keyspaceName, Arrays.asList(tableNames)); + return build(keyspaceMap, repairType, (l) -> {}); + } + + /** + * @return The priority of the given table if defined, otherwise 0. + */ + private static int getPriority(AutoRepairConfig.RepairType repairType, String keyspaceName, String tableName) + { + ColumnFamilyStore cfs = ColumnFamilyStore.getIfExists(keyspaceName, tableName); + return cfs != null ? cfs.metadata().params.autoRepair.priority() : 0; + } +} diff --git a/src/java/org/apache/cassandra/repair/autorepair/RepairAssignment.java b/src/java/org/apache/cassandra/repair/autorepair/RepairAssignment.java new file mode 100644 index 000000000000..63f8fbed4426 --- /dev/null +++ b/src/java/org/apache/cassandra/repair/autorepair/RepairAssignment.java @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.repair.autorepair; + +import java.util.List; +import java.util.Objects; + +import org.apache.cassandra.dht.Range; +import org.apache.cassandra.dht.Token; + +/** + * Defines a repair assignment to be issued by the autorepair framework. + */ +public class RepairAssignment +{ + final Range tokenRange; + + final String keyspaceName; + + final List tableNames; + + public RepairAssignment(Range tokenRange, String keyspaceName, List tableNames) + { + this.tokenRange = tokenRange; + this.keyspaceName = keyspaceName; + this.tableNames = tableNames; + } + + public Range getTokenRange() + { + return tokenRange; + } + + public String getKeyspaceName() + { + return keyspaceName; + } + + public List getTableNames() + { + return tableNames; + } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + RepairAssignment that = (RepairAssignment) o; + return Objects.equals(tokenRange, that.tokenRange) && Objects.equals(keyspaceName, that.keyspaceName) && Objects.equals(tableNames, that.tableNames); + } + + @Override + public int hashCode() + { + return Objects.hash(tokenRange, keyspaceName, tableNames); + } + + @Override + public String toString() + { + return "RepairAssignment{" + + "tokenRange=" + tokenRange + + ", keyspaceName='" + keyspaceName + '\'' + + ", tableNames=" + tableNames + + '}'; + } +} diff --git a/src/java/org/apache/cassandra/repair/autorepair/RepairAssignmentIterator.java b/src/java/org/apache/cassandra/repair/autorepair/RepairAssignmentIterator.java new file mode 100644 index 000000000000..44d9f5ef5e55 --- /dev/null +++ b/src/java/org/apache/cassandra/repair/autorepair/RepairAssignmentIterator.java @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.repair.autorepair; + +import java.util.Iterator; +import java.util.List; +import java.util.NoSuchElementException; + +/** + * Convenience {@link Iterator} implementation to assist implementations of + * {@link IAutoRepairTokenRangeSplitter#getRepairAssignments(boolean, List)} by passing {@link KeyspaceRepairPlan} + * to a custom {@link #next(int, KeyspaceRepairPlan)} method in priority order. + */ +public abstract class RepairAssignmentIterator implements Iterator +{ + private final Iterator repairPlanIterator; + + private Iterator currentIterator = null; + private PrioritizedRepairPlan currentPlan = null; + + public RepairAssignmentIterator(List repairPlans) + { + this.repairPlanIterator = repairPlans.iterator(); + } + + private synchronized Iterator currentIterator() + { + if (currentIterator == null || !currentIterator.hasNext()) + { + // Advance the repair plan iterator if the current repair plan is exhausted, but only + // if there are more repair plans. + if (repairPlanIterator.hasNext()) + { + currentPlan = repairPlanIterator.next(); + currentIterator = currentPlan.getKeyspaceRepairPlans().iterator(); + } + } + return currentIterator; + } + + @Override + public boolean hasNext() + { + Iterator iterator = currentIterator(); + return (iterator != null && iterator.hasNext()); + } + + @Override + public KeyspaceRepairAssignments next() + { + if (!hasNext()) + { + throw new NoSuchElementException("No remaining repair plans"); + } + + final KeyspaceRepairPlan repairPlan = currentIterator().next(); + return next(currentPlan.getPriority(), repairPlan); + } + + /** + * Invoked by {@link #next()} with the next {@link KeyspaceRepairPlan} for the given priority. + * @param priority current priority being processed. + * @param repairPlan the next keyspace repair plan to process + * @return assignments for the given keyspace at this priority. Should never return null, if one desires to + * short-circuit the iterator, override {@link #hasNext()}. + */ + protected abstract KeyspaceRepairAssignments next(int priority, KeyspaceRepairPlan repairPlan); +} diff --git a/src/java/org/apache/cassandra/repair/autorepair/RepairTokenRangeSplitter.java b/src/java/org/apache/cassandra/repair/autorepair/RepairTokenRangeSplitter.java new file mode 100644 index 000000000000..20a79adc8e8b --- /dev/null +++ b/src/java/org/apache/cassandra/repair/autorepair/RepairTokenRangeSplitter.java @@ -0,0 +1,949 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.repair.autorepair; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.EnumMap; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Optional; +import java.util.Set; +import java.util.function.Function; +import java.util.stream.Collectors; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.ImmutableList; + +import org.apache.cassandra.tcm.compatibility.TokenRingUtils; +import org.apache.cassandra.utils.FBUtilities; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.clearspring.analytics.stream.cardinality.CardinalityMergeException; +import com.clearspring.analytics.stream.cardinality.HyperLogLogPlus; +import com.clearspring.analytics.stream.cardinality.ICardinality; +import org.apache.cassandra.config.DataStorageSpec; +import org.apache.cassandra.db.ColumnFamilyStore; +import org.apache.cassandra.db.PartitionPosition; +import org.apache.cassandra.db.lifecycle.SSTableIntervalTree; +import org.apache.cassandra.db.lifecycle.SSTableSet; +import org.apache.cassandra.db.lifecycle.View; +import org.apache.cassandra.dht.Range; +import org.apache.cassandra.dht.Token; +import org.apache.cassandra.io.sstable.format.SSTableReader; +import org.apache.cassandra.io.sstable.metadata.CompactionMetadata; +import org.apache.cassandra.io.sstable.metadata.MetadataType; +import org.apache.cassandra.io.util.FileUtils; +import org.apache.cassandra.service.AutoRepairService; +import org.apache.cassandra.service.StorageService; +import org.apache.cassandra.utils.concurrent.Refs; + +import static org.apache.cassandra.repair.autorepair.AutoRepairUtils.split; + +/** + * The default implementation of {@link IAutoRepairTokenRangeSplitter} that attempts to: + *

    + *
  1. Create smaller, consistent repair times
  2. + *
  3. Minimize the impact on hosts
  4. + *
  5. Reduce overstreaming
  6. + *
  7. Reduce number of repairs
  8. + *
+ *

+ * To achieve these goals, this implementation inspects SSTable metadata to estimate the bytes and number of partitions + * within a range and splits it accordingly to bound the size of the token ranges used for repair assignments. + *

+ *

+ * Refer to + * Auto Repair documentation for this implementation + * for a more thorough breakdown of this implementation. + *

+ *

+ * While this splitter has a lot of tuning parameters, the expectation is that the established default configuration + * shall be sensible for all {@link org.apache.cassandra.repair.autorepair.AutoRepairConfig.RepairType}'s. The following + * configuration parameters are offered. + *

+ * + *

Configuration parameters:

+ *
    + *
  • bytes_per_assignment – Target size (in compressed bytes) for each repair. Throttles incremental repair + * and anticompaction per schedule after incremental repairs are enabled.
  • + * + *
  • max_bytes_per_schedule – Maximum data (in compressed bytes) to cover in a single schedule. Acts as a + * throttle for the repair cycle workload. Tune this up if writes are outpacing repair, or down if repairs are too + * disruptive. Alternatively, adjust {@code min_repair_interval}.
  • + * + *
  • partitions_per_assignment – Maximum number of partitions per repair assignment. Limits the number of + * partitions in Merkle tree leaves to prevent overstreaming.
  • + * + *
  • max_tables_per_assignment – Maximum number of tables to include in a single repair assignment. + * Especially useful for keyspaces with many tables. Prevents excessive batching of tables that exceed other + * parameters like {@code bytes_per_assignment} or {@code partitions_per_assignment}.
  • + *
+ */ +public class RepairTokenRangeSplitter implements IAutoRepairTokenRangeSplitter +{ + private static final Logger logger = LoggerFactory.getLogger(RepairTokenRangeSplitter.class); + + // Default max bytes to 100TiB, which is much more readable than Long.MAX_VALUE + private static final DataStorageSpec.LongBytesBound MAX_BYTES = new DataStorageSpec.LongBytesBound(102_400, DataStorageSpec.DataStorageUnit.GIBIBYTES); + + /** + * The target bytes that should be included in a repair assignment + */ + static final String BYTES_PER_ASSIGNMENT = "bytes_per_assignment"; + + /** + * Maximum number of partitions to include in a repair assignment + */ + static final String PARTITIONS_PER_ASSIGNMENT = "partitions_per_assignment"; + + /** + * Maximum number of tables to include in a repair assignment if {@link AutoRepairConfig.Options#repair_by_keyspace} + * is enabled + */ + static final String MAX_TABLES_PER_ASSIGNMENT = "max_tables_per_assignment"; + + /** + * The maximum number of bytes to cover in an individual schedule + */ + static final String MAX_BYTES_PER_SCHEDULE = "max_bytes_per_schedule"; + + static final List PARAMETERS = Arrays.asList(BYTES_PER_ASSIGNMENT, PARTITIONS_PER_ASSIGNMENT, MAX_TABLES_PER_ASSIGNMENT, MAX_BYTES_PER_SCHEDULE); + + private final AutoRepairConfig.RepairType repairType; + + private final Map givenParameters = new HashMap<>(); + + private DataStorageSpec.LongBytesBound bytesPerAssignment; + private long partitionsPerAssignment; + private int maxTablesPerAssignment; + private DataStorageSpec.LongBytesBound maxBytesPerSchedule; + + /** + * Established default for each {@link org.apache.cassandra.repair.autorepair.AutoRepairConfig.RepairType}, meant to + * choose sensible defaults for each. + *

+ * Defaults if not specified for the given repair type: + *

  • + *
      bytes_per_assignment: 50GiB
    + *
      partitions_per_assignment: 1048576 (2^20)
    + *
      max_tables_per_assignment: 64
    + *
      max_bytes_per_schedule: 1000GiB
    + *
  • + * It's expected that these defaults should work well for everything except incremental, where we set + * max_bytes_per_schedule to 100GiB. This should strike a good balance between the amount of data that will be + * repaired during an initial migration to incremental repair and should move the entire repaired set from + * unrepaired to repaired at steady state, assuming not more the 100GiB of data is written to a node per + * min_repair_interval. + */ + private static final Map DEFAULTS_BY_REPAIR_TYPE = new EnumMap<>(AutoRepairConfig.RepairType.class) + {{ + put(AutoRepairConfig.RepairType.FULL, RepairTypeDefaults.builder(AutoRepairConfig.RepairType.FULL) + .build()); + // Restrict incremental repair to 100GiB max bytes per schedule to confine the amount of possible autocompaction. + put(AutoRepairConfig.RepairType.INCREMENTAL, RepairTypeDefaults.builder(AutoRepairConfig.RepairType.INCREMENTAL) + .withMaxBytesPerSchedule(new DataStorageSpec.LongBytesBound("100GiB")) + .build()); + put(AutoRepairConfig.RepairType.PREVIEW_REPAIRED, RepairTypeDefaults.builder(AutoRepairConfig.RepairType.PREVIEW_REPAIRED) + .build()); + }}; + + public RepairTokenRangeSplitter(AutoRepairConfig.RepairType repairType, Map parameters) + { + this.repairType = repairType; + this.givenParameters.putAll(parameters); + + reinitParameters(); + } + + private void reinitParameters() + { + RepairTypeDefaults defaults = DEFAULTS_BY_REPAIR_TYPE.get(repairType); + + DataStorageSpec.LongBytesBound bytesPerAssignmentTmp = getPropertyOrDefault(BYTES_PER_ASSIGNMENT, DataStorageSpec.LongBytesBound::new, defaults.bytesPerAssignment); + DataStorageSpec.LongBytesBound maxBytesPerScheduleTmp = getPropertyOrDefault(MAX_BYTES_PER_SCHEDULE, DataStorageSpec.LongBytesBound::new, defaults.maxBytesPerSchedule); + + // Validate that bytesPerAssignment <= maxBytesPerSchedule + if (bytesPerAssignmentTmp.toBytes() > maxBytesPerScheduleTmp.toBytes()) + { + throw new IllegalArgumentException(String.format("%s='%s' cannot be greater than %s='%s' for %s", + BYTES_PER_ASSIGNMENT, + bytesPerAssignmentTmp, + MAX_BYTES_PER_SCHEDULE, + maxBytesPerScheduleTmp, + repairType.getConfigName())); + } + + bytesPerAssignment = bytesPerAssignmentTmp; + maxBytesPerSchedule = maxBytesPerScheduleTmp; + + partitionsPerAssignment = getPropertyOrDefault(PARTITIONS_PER_ASSIGNMENT, Long::parseLong, defaults.partitionsPerAssignment); + maxTablesPerAssignment = getPropertyOrDefault(MAX_TABLES_PER_ASSIGNMENT, Integer::parseInt, defaults.maxTablesPerAssignment); + + logger.info("Configured {}[{}] with {}={}, {}={}, {}={}, {}={}", RepairTokenRangeSplitter.class.getName(), + repairType.getConfigName(), + BYTES_PER_ASSIGNMENT, bytesPerAssignment, + PARTITIONS_PER_ASSIGNMENT, partitionsPerAssignment, + MAX_TABLES_PER_ASSIGNMENT, maxTablesPerAssignment, + MAX_BYTES_PER_SCHEDULE, maxBytesPerSchedule); + } + + private T getPropertyOrDefault(String propertyName, Function mapper, T defaultValue) + { + return Optional.ofNullable(this.givenParameters.get(propertyName)).map(mapper).orElse(defaultValue); + } + + @Override + public Iterator getRepairAssignments(boolean primaryRangeOnly, List repairPlans) + { + return new BytesBasedRepairAssignmentIterator(primaryRangeOnly, repairPlans); + } + + /** + * A custom {@link RepairAssignmentIterator} that confines the number of repair assignments to + * max_bytes_per_schedule. + */ + private class BytesBasedRepairAssignmentIterator extends RepairAssignmentIterator { + + private final boolean primaryRangeOnly; + private long bytesSoFar = 0; + + BytesBasedRepairAssignmentIterator(boolean primaryRangeOnly, List repairPlans) + { + super(repairPlans); + this.primaryRangeOnly = primaryRangeOnly; + } + + @Override + protected KeyspaceRepairAssignments next(int priority, KeyspaceRepairPlan repairPlan) + { + // short circuit if we've accumulated too many bytes by returning a KeyspaceRepairAssignments with + // no assignments. We do this rather than returning false in hasNext() because we want to signal + // to AutoRepair that a keyspace generated no assignments. + if (bytesSoFar >= maxBytesPerSchedule.toBytes()) + { + return new KeyspaceRepairAssignments(priority, repairPlan.getKeyspaceName(), Collections.emptyList()); + } + + List> tokenRanges = getTokenRanges(primaryRangeOnly, repairPlan.getKeyspaceName()); + // shuffle token ranges to unbias selection of ranges + Collections.shuffle(tokenRanges); + List repairAssignments = new ArrayList<>(); + // Generate assignments for each range speparately + for (Range tokenRange : tokenRanges) + { + repairAssignments.addAll(getRepairAssignmentsForKeyspace(repairType, repairPlan.getKeyspaceName(), repairPlan.getTableNames(), tokenRange)); + } + + FilteredRepairAssignments filteredRepairAssignments = filterRepairAssignments(priority, repairPlan.getKeyspaceName(), repairAssignments, bytesSoFar); + bytesSoFar = filteredRepairAssignments.newBytesSoFar; + return new KeyspaceRepairAssignments(priority, repairPlan.getKeyspaceName(), filteredRepairAssignments.repairAssignments); + } + } + + @VisibleForTesting + List getRepairAssignmentsForKeyspace(AutoRepairConfig.RepairType repairType, String keyspaceName, List tableNames, Range tokenRange) + { + List repairAssignments = new ArrayList<>(); + // this is used for batching minimal single assignment tables together + List currentAssignments = new ArrayList<>(); + + AutoRepairConfig config = AutoRepairService.instance.getAutoRepairConfig(); + + // If we can repair by keyspace, sort the tables by size so can batch the smallest ones together + boolean repairByKeyspace = config.getRepairByKeyspace(repairType); + List tablesToProcess = tableNames; + if (repairByKeyspace) + { + tablesToProcess = tableNames.stream().sorted((t1, t2) -> { + ColumnFamilyStore cfs1 = ColumnFamilyStore.getIfExists(keyspaceName, t1); + ColumnFamilyStore cfs2 = ColumnFamilyStore.getIfExists(keyspaceName, t2); + // If for whatever reason the CFS is not retrievable, we can assume it has been deleted, so give the + // other cfs precedence. + if (cfs1 == null) + { + // cfs1 is lesser than because its null + return -1; + } + else if (cfs2 == null) + { + // cfs1 is greather than because cfs2 is null + return 1; + } + return Long.compare(cfs1.metric.totalDiskSpaceUsed.getCount(), cfs2.metric.totalDiskSpaceUsed.getCount()); + }).collect(Collectors.toList()); + } + + for (String tableName : tablesToProcess) + { + List tableAssignments = getRepairAssignmentsForTable(keyspaceName, tableName, tokenRange); + + if (tableAssignments.isEmpty()) + continue; + + // if not repairing by keyspace don't attempt to batch them with others. + if (!repairByKeyspace) + { + repairAssignments.addAll(tableAssignments); + } + // If the table assignments are for the same token range, and we have room to add more tables to the current assignment + else if (tableAssignments.size() == 1 && + currentAssignments.size() < maxTablesPerAssignment && + (currentAssignments.isEmpty() || currentAssignments.get(0).getTokenRange().equals(tableAssignments.get(0).getTokenRange()))) + { + long currentAssignmentsBytes = getEstimatedBytes(currentAssignments); + long tableAssignmentsBytes = getEstimatedBytes(tableAssignments); + // only add assignments together if they don't exceed max bytes per schedule. + if (currentAssignmentsBytes + tableAssignmentsBytes < maxBytesPerSchedule.toBytes()) + { + currentAssignments.addAll(tableAssignments); + } + else + { + // add table assignments by themselves + repairAssignments.addAll(tableAssignments); + } + } + else + { + if (!currentAssignments.isEmpty()) + { + repairAssignments.add(merge(currentAssignments)); + currentAssignments.clear(); + } + repairAssignments.addAll(tableAssignments); + } + } + + if (!currentAssignments.isEmpty()) + repairAssignments.add(merge(currentAssignments)); + + return repairAssignments; + } + + /** + * Given a repair type and map of sized-based repair assignments, confine them by maxBytesPerSchedule. + * @param repairAssignments the assignments to filter. + * @param bytesSoFar repair assignment bytes accumulated so far. + * @return A list of repair assignments confined by maxBytesPerSchedule. + */ + @VisibleForTesting + FilteredRepairAssignments filterRepairAssignments(int priority, String keyspaceName, List repairAssignments, long bytesSoFar) + { + // Confine repair assignments by maxBytesPerSchedule. + long bytesSoFarThisIteration = 0L; + long bytesNotRepaired = 0L; + int assignmentsNotRepaired = 0; + int assignmentsToRepair = 0; + int totalAssignments = 0; + + List assignmentsToReturn = new ArrayList<>(repairAssignments.size()); + for (SizedRepairAssignment repairAssignment : repairAssignments) + { + totalAssignments++; + // skip any repair assignments that would accumulate us past the maxBytesPerSchedule + if (bytesSoFar + repairAssignment.getEstimatedBytes() > maxBytesPerSchedule.toBytes()) + { + // log that repair assignment was skipped. + bytesNotRepaired += repairAssignment.getEstimatedBytes(); + assignmentsNotRepaired++; + logger.warn("Skipping {} because it would increase total repair bytes to {}", + repairAssignment, + getBytesOfMaxBytesPerSchedule(bytesSoFar + repairAssignment.getEstimatedBytes())); + } + else + { + bytesSoFar += repairAssignment.getEstimatedBytes(); + bytesSoFarThisIteration += repairAssignment.getEstimatedBytes(); + assignmentsToRepair++; + logger.info("Adding {}, increasing repair bytes to {}", + repairAssignment, + getBytesOfMaxBytesPerSchedule(bytesSoFar)); + assignmentsToReturn.add(repairAssignment); + } + } + + String message = "Returning {} assignment(s) for priorityBucket {} and keyspace {}, totaling {} ({} overall)"; + if (assignmentsNotRepaired != 0) + { + message += ". Skipping {} of {} assignment(s), totaling {}"; + if (repairType != AutoRepairConfig.RepairType.INCREMENTAL) + { + message += ". The entire primary range will not be repaired this schedule. " + + "Consider increasing maxBytesPerSchedule, reducing node density or monitoring to ensure " + + "all ranges do get repaired within gc_grace_seconds"; + logger.warn(message, assignmentsToRepair, priority, keyspaceName, + FileUtils.stringifyFileSize(bytesSoFarThisIteration), + getBytesOfMaxBytesPerSchedule(bytesSoFar), + assignmentsNotRepaired, totalAssignments, + FileUtils.stringifyFileSize(bytesNotRepaired)); + } + else + { + logger.info(message, assignmentsToRepair, priority, keyspaceName, + FileUtils.stringifyFileSize(bytesSoFarThisIteration), + getBytesOfMaxBytesPerSchedule(bytesSoFar), + assignmentsNotRepaired, totalAssignments, + FileUtils.stringifyFileSize(bytesNotRepaired)); + } + } + else + { + logger.info(message, assignmentsToRepair, priority, keyspaceName, + FileUtils.stringifyFileSize(bytesSoFarThisIteration), + getBytesOfMaxBytesPerSchedule(bytesSoFar)); + } + + return new FilteredRepairAssignments(assignmentsToReturn, bytesSoFar); + } + + @VisibleForTesting + static class FilteredRepairAssignments + { + final List repairAssignments; + final long newBytesSoFar; + + private FilteredRepairAssignments(List repairAssignments, long newBytesSoFar) + { + this.repairAssignments = repairAssignments; + this.newBytesSoFar = newBytesSoFar; + } + } + + private String getBytesOfMaxBytesPerSchedule(long bytes) + { + if (maxBytesPerSchedule.equals(MAX_BYTES)) + return FileUtils.stringifyFileSize(bytes); + else + return String.format("%s of %s", FileUtils.stringifyFileSize(bytes), maxBytesPerSchedule); + } + + /** + * @return The sum of {@link SizedRepairAssignment#getEstimatedBytes()} of all given + * repairAssignments. + * @param repairAssignments The assignments to sum + */ + @VisibleForTesting + protected static long getEstimatedBytes(List repairAssignments) + { + return repairAssignments + .stream() + .mapToLong(SizedRepairAssignment::getEstimatedBytes) + .sum(); + } + + @VisibleForTesting + static SizedRepairAssignment merge(List assignments) + { + if (assignments.isEmpty()) + throw new IllegalStateException("Cannot merge empty assignments"); + + Set mergedTableNames = new HashSet<>(); + Range referenceTokenRange = assignments.get(0).getTokenRange(); + String referenceKeyspaceName = assignments.get(0).getKeyspaceName(); + + for (SizedRepairAssignment assignment : assignments) + { + // These checks _should_ be unnecessary but are here to ensure that the assignments are consistent + if (!assignment.getTokenRange().equals(referenceTokenRange)) + throw new IllegalStateException("All assignments must have the same token range"); + if (!assignment.getKeyspaceName().equals(referenceKeyspaceName)) + throw new IllegalStateException("All assignments must have the same keyspace name"); + + mergedTableNames.addAll(assignment.getTableNames()); + } + + long sizeForAssignment = getEstimatedBytes(assignments); + return new SizedRepairAssignment(referenceTokenRange, referenceKeyspaceName, new ArrayList<>(mergedTableNames), + "full primary range for " + mergedTableNames.size() + " tables", sizeForAssignment); + } + + @VisibleForTesting + protected List getRepairAssignmentsForTable(String keyspaceName, String tableName, Range tokenRange) + { + List sizeEstimates = getRangeSizeEstimate(keyspaceName, tableName, tokenRange); + return getRepairAssignments(sizeEstimates); + } + + private static void logSkippingTable(String keyspaceName, String tableName) + { + logger.warn("Could not resolve table data for {}.{} assuming it has since been deleted, skipping", keyspaceName, tableName); + } + + @VisibleForTesting + protected List getRepairAssignments(List sizeEstimates) + { + List repairAssignments = new ArrayList<>(); + + // since its possible for us to hit maxBytesPerSchedule before seeing all ranges, shuffle so there is chance + // at least of hitting all the ranges _eventually_ for the worst case scenarios + Collections.shuffle(sizeEstimates); + int totalExpectedSubRanges = 0; + for (SizeEstimate estimate : sizeEstimates) + { + if (estimate.sizeForRepair != 0) + { + boolean needsSplitting = estimate.sizeForRepair > bytesPerAssignment.toBytes() || estimate.partitions > partitionsPerAssignment; + if (needsSplitting) + { + totalExpectedSubRanges += calculateNumberOfSplits(estimate); + } + } + } + for (SizeEstimate estimate : sizeEstimates) + { + if (estimate.sizeForRepair == 0) + { + ColumnFamilyStore cfs = ColumnFamilyStore.getIfExists(estimate.keyspace, estimate.table); + + if (cfs == null) + { + logSkippingTable(estimate.keyspace, estimate.table); + continue; + } + + long memtableSize = cfs.getTracker().getView().getCurrentMemtable().getLiveDataSize(); + if (memtableSize > 0L) + { + logger.debug("Included {}.{} range {}, had no unrepaired SSTables, but memtableSize={}, adding single repair assignment", estimate.keyspace, estimate.table, estimate.tokenRange, memtableSize); + SizedRepairAssignment assignment = new SizedRepairAssignment(estimate.tokenRange, estimate.keyspace, Collections.singletonList(estimate.table), "full primary rangee for table with memtable only detected", memtableSize); + repairAssignments.add(assignment); + } + else + { + logger.debug("Included {}.{} range {}, has no SSTables or memtable data, but adding single repair assignment for entire range in case writes were missed", estimate.keyspace, estimate.table, estimate.tokenRange); + SizedRepairAssignment assignment = new SizedRepairAssignment(estimate.tokenRange, estimate.keyspace, Collections.singletonList(estimate.table), "full primary range for table with no data detected", 0L); + repairAssignments.add(assignment); + } + } + else + { + // Check if the estimate needs splitting based on the criteria + boolean needsSplitting = estimate.sizeForRepair > bytesPerAssignment.toBytes() || estimate.partitions > partitionsPerAssignment; + if (needsSplitting) + { + int numberOfSplits = calculateNumberOfSplits(estimate); + long approximateBytesPerSplit = estimate.sizeForRepair / numberOfSplits; + Collection> subranges = split(estimate.tokenRange, numberOfSplits); + for (Range subrange : subranges) + { + SizedRepairAssignment assignment = new SizedRepairAssignment(subrange, estimate.keyspace, Collections.singletonList(estimate.table), + String.format("subrange %d of %d", repairAssignments.size()+1, totalExpectedSubRanges), + approximateBytesPerSplit); + repairAssignments.add(assignment); + } + } + else + { + // No splitting needed, repair the entire range as-is + SizedRepairAssignment assignment = new SizedRepairAssignment(estimate.tokenRange, estimate.keyspace, + Collections.singletonList(estimate.table), + "full primary range for table", estimate.sizeForRepair); + repairAssignments.add(assignment); + } + } + } + return repairAssignments; + } + + private int calculateNumberOfSplits(SizeEstimate estimate) + { + // Calculate the number of splits needed for size and partitions + int splitsForSize = (int) Math.ceil((double) estimate.sizeForRepair / bytesPerAssignment.toBytes()); + int splitsForPartitions = (int) Math.ceil((double) estimate.partitions / partitionsPerAssignment); + + // Split the token range into subranges based on whichever (partitions, bytes) would generate the most splits. + boolean splitBySize = splitsForSize > splitsForPartitions; + int splits = splitBySize ? splitsForSize : splitsForPartitions; + + // calculate approximation for logging purposes + long approximateBytesPerSplit = estimate.sizeForRepair / splits; + long approximatePartitionsPerSplit = estimate.partitions / splits; + + logger.info("Splitting {}.{} for range {} into {} sub ranges by {} (splitsForSize={}, splitsForPartitions={}, " + + "approximateBytesInRange={}, approximatePartitionsInRange={}, " + + "approximateBytesPerSplit={}, approximatePartitionsPerSplit={})", + estimate.keyspace, estimate.table, estimate.tokenRange, + splits, splitBySize ? "size" : "partitions", + splitsForSize, splitsForPartitions, + FileUtils.stringifyFileSize(estimate.sizeForRepair), estimate.partitions, + FileUtils.stringifyFileSize(approximateBytesPerSplit), approximatePartitionsPerSplit + ); + return splits; + } + + private List> getTokenRanges(boolean primaryRangeOnly, String keyspaceName) + { + // Collect all applicable token ranges + Collection> wrappedRanges; + if (primaryRangeOnly) + { + wrappedRanges = TokenRingUtils.getPrimaryRangesForEndpoint(keyspaceName, FBUtilities.getBroadcastAddressAndPort()); + } + else + { + wrappedRanges = StorageService.instance.getLocalRanges(keyspaceName); + } + + // Unwrap each range as we need to account for ranges that overlap the ring + List> ranges = new ArrayList<>(); + for (Range wrappedRange : wrappedRanges) + { + ranges.addAll(wrappedRange.unwrap()); + } + return ranges; + } + + private List getRangeSizeEstimate(String keyspace, String table, Range tokenRange) + { + List sizeEstimates = new ArrayList<>(); + logger.debug("Calculating size estimate for {}.{} for range {}", keyspace, table, tokenRange); + try (Refs refs = getSSTableReaderRefs(repairType, keyspace, table, tokenRange)) + { + SizeEstimate estimate = getSizesForRangeOfSSTables(repairType, keyspace, table, tokenRange, refs); + logger.debug("Generated size estimate {}", estimate); + sizeEstimates.add(estimate); + } + return sizeEstimates; + } + + @VisibleForTesting + static SizeEstimate getSizesForRangeOfSSTables(AutoRepairConfig.RepairType repairType, String keyspace, String table, Range tokenRange, Refs refs) + { + List> singletonRange = Collections.singletonList(tokenRange); + ICardinality cardinality = new HyperLogLogPlus(13, 25); + long approxBytesInRange = 0L; + long totalBytes = 0L; + + for (SSTableReader reader : refs) + { + try + { + if (reader.openReason == SSTableReader.OpenReason.EARLY) + continue; + CompactionMetadata metadata = (CompactionMetadata) reader.descriptor.getMetadataSerializer().deserialize(reader.descriptor, MetadataType.COMPACTION); + if (metadata != null) + cardinality = cardinality.merge(metadata.cardinalityEstimator); + + // use onDiskLength, which is the actual size of the SSTable data file. + long sstableSize = reader.onDiskLength(); + totalBytes += sstableSize; + + // get the on disk size for the token range, note for compressed data this includes the full + // chunks the start and end ranges are found in. + long approximateRangeBytesInSSTable = reader.onDiskSizeForPartitionPositions(reader.getPositionsForRanges(singletonRange)); + approxBytesInRange += Math.min(approximateRangeBytesInSSTable, sstableSize); + } + catch (IOException | CardinalityMergeException e) + { + logger.error("Error calculating size estimate for {}.{} for range {} on {}", keyspace, table, tokenRange, reader, e); + } + } + + long partitions = 0L; + if (totalBytes > 0) + { + // use the ratio from size to estimate the partitions in the range as well + double ratio = approxBytesInRange / (double) totalBytes; + partitions = (long) Math.max(1, Math.ceil(cardinality.cardinality() * ratio)); + } + return new SizeEstimate(repairType, keyspace, table, tokenRange, partitions, approxBytesInRange, totalBytes); + } + + @VisibleForTesting + static Refs getSSTableReaderRefs(AutoRepairConfig.RepairType repairType, String keyspaceName, String tableName, Range tokenRange) + { + final ColumnFamilyStore cfs = ColumnFamilyStore.getIfExists(keyspaceName, tableName); + if (cfs == null) + { + logSkippingTable(keyspaceName, tableName); + return Refs.ref(Collections.emptyList()); + } + + Refs refs = null; + while (refs == null) + { + Iterable sstables = cfs.getTracker().getView().select(SSTableSet.CANONICAL); + SSTableIntervalTree tree = SSTableIntervalTree.buildSSTableIntervalTree(ImmutableList.copyOf(sstables)); + Range r = Range.makeRowRange(tokenRange); + List canonicalSSTables = View.sstablesInBounds(r.left, r.right, tree); + if (repairType == AutoRepairConfig.RepairType.INCREMENTAL) + { + canonicalSSTables = canonicalSSTables.stream().filter((sstable) -> !sstable.isRepaired()).collect(Collectors.toList()); + } + refs = Refs.tryRef(canonicalSSTables); + } + return refs; + } + + @Override + public void setParameter(String key, String value) + { + if (!PARAMETERS.contains(key)) + { + throw new IllegalArgumentException("Unexpected parameter '" + key + "', must be one of " + PARAMETERS); + } + + logger.info("Setting {} to {} for repair type {}", key, value, repairType); + givenParameters.put(key, value); + reinitParameters(); + } + + @Override + public Map getParameters() + { + final Map parameters = new LinkedHashMap<>(); + for (String parameter : PARAMETERS) + { + // Use the parameter as provided if present. + if (givenParameters.containsKey(parameter)) + { + parameters.put(parameter, givenParameters.get(parameter)); + continue; + } + + switch (parameter) + { + case BYTES_PER_ASSIGNMENT: + parameters.put(parameter, bytesPerAssignment.toString()); + continue; + case PARTITIONS_PER_ASSIGNMENT: + parameters.put(parameter, Long.toString(partitionsPerAssignment)); + continue; + case MAX_TABLES_PER_ASSIGNMENT: + parameters.put(parameter, Integer.toString(maxTablesPerAssignment)); + continue; + case MAX_BYTES_PER_SCHEDULE: + parameters.put(parameter, maxBytesPerSchedule.toString()); + continue; + default: + // not expected + parameters.put(parameter, ""); + } + } + return Collections.unmodifiableMap(parameters); + } + + /** + * Represents a size estimate by both bytes and partition count for a given keyspace and table for a token range. + */ + @VisibleForTesting + protected static class SizeEstimate + { + public final AutoRepairConfig.RepairType repairType; + public final String keyspace; + public final String table; + public final Range tokenRange; + public final long partitions; + public final long sizeInRange; + public final long totalSize; + /** + * Size to consider in the repair. For incremental repair, we want to consider the total size + * of the estimate as we have to factor in anticompacting the entire SSTable. + * For full repair, just use the size containing the range. + */ + public final long sizeForRepair; + + public SizeEstimate(AutoRepairConfig.RepairType repairType, + String keyspace, String table, Range tokenRange, + long partitions, long sizeInRange, long totalSize) + { + this.repairType = repairType; + this.keyspace = keyspace; + this.table = table; + this.tokenRange = tokenRange; + this.partitions = partitions; + this.sizeInRange = sizeInRange; + this.totalSize = totalSize; + + this.sizeForRepair = repairType == AutoRepairConfig.RepairType.INCREMENTAL ? totalSize : sizeInRange; + } + + @Override + public String toString() + { + return "SizeEstimate{" + + "repairType=" + repairType + + ", keyspace='" + keyspace + '\'' + + ", table='" + table + '\'' + + ", tokenRange=" + tokenRange + + ", partitions=" + partitions + + ", sizeInRange=" + sizeInRange + + ", totalSize=" + totalSize + + ", sizeForRepair=" + sizeForRepair + + '}'; + } + } + + /** + * Implementation of RepairAssignment that also assigns an estimation of bytes involved + * in the repair. + */ + @VisibleForTesting + protected static class SizedRepairAssignment extends RepairAssignment { + + final String description; + final long estimatedBytes; + + public SizedRepairAssignment(Range tokenRange, String keyspaceName, List tableNames) + { + this(tokenRange, keyspaceName, tableNames, "", 0L); + } + + public SizedRepairAssignment(Range tokenRange, String keyspaceName, List tableNames, + String description, + long estimatedBytes) + { + super(tokenRange, keyspaceName, tableNames); + this.description = description; + this.estimatedBytes = estimatedBytes; + } + + /** + * @return Additional metadata about the repair assignment. + */ + public String getDescription() + { + return description; + } + + /** + * Estimated bytes involved in the assignment. Typically Derived from {@link SizeEstimate#sizeForRepair}. + * @return estimated bytes involved in the assignment. + */ + public long getEstimatedBytes() + { + return estimatedBytes; + } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + if (!super.equals(o)) return false; + SizedRepairAssignment that = (SizedRepairAssignment) o; + return estimatedBytes == that.estimatedBytes && Objects.equals(description, that.description); + } + + @Override + public int hashCode() + { + return Objects.hash(super.hashCode(), description, estimatedBytes); + } + + @Override + public String toString() + { + return "SizedRepairAssignment{" + + "description='" + description + '\'' + + ", tokenRange=" + tokenRange + + ", keyspaceName='" + keyspaceName + '\'' + + ", tableNames=" + tableNames + + ", estimatedBytes=" + FileUtils.stringifyFileSize(estimatedBytes) + + '}'; + } + } + + /** + * Conveinence builder for establishing defaults by repair type. + */ + protected static class RepairTypeDefaults + { + final AutoRepairConfig.RepairType repairType; + final DataStorageSpec.LongBytesBound bytesPerAssignment; + final long partitionsPerAssignment; + final int maxTablesPerAssignment; + final DataStorageSpec.LongBytesBound maxBytesPerSchedule; + + public RepairTypeDefaults(AutoRepairConfig.RepairType repairType, + DataStorageSpec.LongBytesBound bytesPerAssignment, + long partitionsPerAssignment, + int maxTablesPerAssignment, + DataStorageSpec.LongBytesBound maxBytesPerSchedule) + { + this.repairType = repairType; + this.bytesPerAssignment = bytesPerAssignment; + this.partitionsPerAssignment = partitionsPerAssignment; + this.maxTablesPerAssignment = maxTablesPerAssignment; + this.maxBytesPerSchedule = maxBytesPerSchedule; + } + + static RepairTypeDefaultsBuilder builder(AutoRepairConfig.RepairType repairType) + { + return new RepairTypeDefaultsBuilder(repairType); + } + + static class RepairTypeDefaultsBuilder + { + private final AutoRepairConfig.RepairType repairType; + private DataStorageSpec.LongBytesBound bytesPerAssignment = new DataStorageSpec.LongBytesBound("50GiB"); + // Aims to target at most 1 partitions per leaf assuming a merkle tree of depth 20 (2^20 = 1,048,576) + private long partitionsPerAssignment = 1_048_576; + private int maxTablesPerAssignment = 64; + private DataStorageSpec.LongBytesBound maxBytesPerSchedule = MAX_BYTES; + + private RepairTypeDefaultsBuilder(AutoRepairConfig.RepairType repairType) + { + this.repairType = repairType; + } + + @SuppressWarnings("unused") + public RepairTypeDefaultsBuilder withBytesPerAssignment(DataStorageSpec.LongBytesBound bytesPerAssignment) + { + this.bytesPerAssignment = bytesPerAssignment; + return this; + } + + @SuppressWarnings("unused") + public RepairTypeDefaultsBuilder withPartitionsPerAssignment(long partitionsPerAssignment) + { + this.partitionsPerAssignment = partitionsPerAssignment; + return this; + } + + @SuppressWarnings("unused") + public RepairTypeDefaultsBuilder withMaxTablesPerAssignment(int maxTablesPerAssignment) + { + this.maxTablesPerAssignment = maxTablesPerAssignment; + return this; + } + + public RepairTypeDefaultsBuilder withMaxBytesPerSchedule(DataStorageSpec.LongBytesBound maxBytesPerSchedule) + { + this.maxBytesPerSchedule = maxBytesPerSchedule; + return this; + } + + public RepairTokenRangeSplitter.RepairTypeDefaults build() + { + return new RepairTypeDefaults(repairType, bytesPerAssignment, partitionsPerAssignment, maxTablesPerAssignment, maxBytesPerSchedule); + } + } + } +} diff --git a/src/java/org/apache/cassandra/repair/consistent/SyncStatSummary.java b/src/java/org/apache/cassandra/repair/consistent/SyncStatSummary.java index 855ad4bad344..820c6b011ba6 100644 --- a/src/java/org/apache/cassandra/repair/consistent/SyncStatSummary.java +++ b/src/java/org/apache/cassandra/repair/consistent/SyncStatSummary.java @@ -81,7 +81,7 @@ public String toString() } } - private static class Table + public static class Table { final String keyspace; @@ -94,7 +94,7 @@ private static class Table final Map, Session> sessions = new HashMap<>(); - Table(String keyspace, String table) + public Table(String keyspace, String table) { this.keyspace = keyspace; this.table = table; @@ -138,7 +138,7 @@ void calculateTotals() totalsCalculated = true; } - boolean isCounter() + public boolean isCounter() { TableMetadata tmd = Schema.instance.getTableMetadata(keyspace, table); return tmd != null && tmd.isCounter(); @@ -174,6 +174,16 @@ public String toString() } return output.toString(); } + + public long getBytes() + { + return this.bytes; + } + + public long getRanges() + { + return this.ranges.size(); + } } private final Map, Table> summaries = new HashMap<>(); @@ -233,6 +243,12 @@ private void calculateTotals() totalsCalculated = true; } + public Map, Table> getTotals() + { + calculateTotals(); + return summaries; + } + public String toString() { List> tables = Lists.newArrayList(summaries.keySet()); diff --git a/src/java/org/apache/cassandra/repair/messages/RepairMessage.java b/src/java/org/apache/cassandra/repair/messages/RepairMessage.java index 835f90fc6804..c615bf9f52f3 100644 --- a/src/java/org/apache/cassandra/repair/messages/RepairMessage.java +++ b/src/java/org/apache/cassandra/repair/messages/RepairMessage.java @@ -25,7 +25,6 @@ import java.util.concurrent.TimeUnit; import java.util.function.BiConsumer; import java.util.function.Supplier; - import javax.annotation.Nullable; import com.google.common.annotations.VisibleForTesting; @@ -36,6 +35,7 @@ import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.config.RepairRetrySpec; import org.apache.cassandra.config.RetrySpec; +import org.apache.cassandra.exceptions.RequestFailure; import org.apache.cassandra.metrics.RepairMetrics; import org.apache.cassandra.repair.SharedContext; import org.apache.cassandra.exceptions.RepairException; @@ -45,8 +45,8 @@ import org.apache.cassandra.net.RequestCallback; import org.apache.cassandra.net.Verb; import org.apache.cassandra.repair.RepairJobDesc; +import org.apache.cassandra.service.WaitStrategy; import org.apache.cassandra.streaming.PreviewKind; -import org.apache.cassandra.utils.Backoff; import org.apache.cassandra.utils.CassandraVersion; import org.apache.cassandra.utils.NoSpamLogger; import org.apache.cassandra.utils.TimeUUID; @@ -73,7 +73,7 @@ public void onResponse(Message msg) } @Override - public void onFailure(InetAddressAndPort from, RequestFailureReason failureReason) + public void onFailure(InetAddressAndPort from, RequestFailure failure) { } }; @@ -134,11 +134,11 @@ public interface RepairFailureCallback void onFailure(Exception e); } - private static Backoff backoff(SharedContext ctx, Verb verb) + private static WaitStrategy backoff(SharedContext ctx, Verb verb) { RepairRetrySpec retrySpec = DatabaseDescriptor.getRepairRetrySpec(); RetrySpec spec = verb == Verb.VALIDATION_RSP ? retrySpec.getMerkleTreeResponseSpec() : retrySpec; - return Backoff.fromConfig(ctx, spec); + return RetrySpec.toStrategy(ctx, spec); } public static Supplier notDone(Future f) @@ -172,12 +172,12 @@ public static void sendMessageWithRetries(SharedContext ctx, Supplier a } @VisibleForTesting - static void sendMessageWithRetries(SharedContext ctx, Backoff backoff, Supplier allowRetry, RepairMessage request, Verb verb, InetAddressAndPort endpoint, RequestCallback finalCallback) + static void sendMessageWithRetries(SharedContext ctx, WaitStrategy backoff, Supplier allowRetry, RepairMessage request, Verb verb, InetAddressAndPort endpoint, RequestCallback finalCallback) { if (!ALLOWS_RETRY.contains(verb)) throw new AssertionError("Repair verb " + verb + " does not support retry, but a request to send with retry was given!"); BiConsumer maybeRecordRetry = (attempt, reason) -> { - if (attempt <= 0) + if (attempt <= 1) return; // we don't know what the prefix kind is... so use NONE... this impacts logPrefix as it will cause us to use "repair" rather than "preview repair" which may not be correct... but close enough... String prefix = PreviewKind.NONE.logPrefix(request.parentRepairSession()); @@ -217,9 +217,9 @@ else if (reason == RequestFailureReason.TIMEOUT) finalCallback.onFailure(from, failure); return false; case RETRY: - if (failure == RequestFailureReason.TIMEOUT && allowRetry.get()) + if (failure.reason == RequestFailureReason.TIMEOUT && allowRetry.get()) return true; - maybeRecordRetry.accept(attempt, failure); + maybeRecordRetry.accept(attempt, failure.reason); finalCallback.onFailure(from, failure); return false; default: @@ -229,8 +229,8 @@ else if (reason == RequestFailureReason.TIMEOUT) (attempt, retryReason, from, failure) -> { switch (retryReason) { - case MaxRetries: - maybeRecordRetry.accept(attempt, failure); + case GiveUp: + maybeRecordRetry.accept(attempt, failure.reason); finalCallback.onFailure(from, failure); return null; case Interrupted: @@ -255,9 +255,9 @@ public void onResponse(Message msg) } @Override - public void onFailure(InetAddressAndPort from, RequestFailureReason failureReason) + public void onFailure(InetAddressAndPort from, RequestFailure failure) { - failureCallback.onFailure(RepairException.error(request.desc, PreviewKind.NONE, String.format("Got %s failure from %s: %s", verb, from, failureReason))); + failureCallback.onFailure(RepairException.error(request.desc, PreviewKind.NONE, String.format("Got %s failure from %s: %s", verb, from, failure.reason))); } @Override diff --git a/src/java/org/apache/cassandra/repair/messages/RepairOption.java b/src/java/org/apache/cassandra/repair/messages/RepairOption.java index bc9231dcc142..3c8a260164d1 100644 --- a/src/java/org/apache/cassandra/repair/messages/RepairOption.java +++ b/src/java/org/apache/cassandra/repair/messages/RepairOption.java @@ -17,10 +17,15 @@ */ package org.apache.cassandra.repair.messages; -import java.util.*; +import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; +import java.util.StringTokenizer; import com.google.common.base.Joiner; -import com.google.common.base.Preconditions; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -29,8 +34,10 @@ import org.apache.cassandra.dht.Range; import org.apache.cassandra.dht.Token; import org.apache.cassandra.locator.MetaStrategy; -import org.apache.cassandra.streaming.PreviewKind; import org.apache.cassandra.repair.RepairParallelism; +import org.apache.cassandra.streaming.PreviewKind; + +import static com.google.common.base.Preconditions.checkArgument; /** * Repair options. @@ -52,10 +59,10 @@ public class RepairOption public static final String PREVIEW = "previewKind"; public static final String OPTIMISE_STREAMS_KEY = "optimiseStreams"; public static final String IGNORE_UNREPLICATED_KS = "ignoreUnreplicatedKeyspaces"; + public static final String REPAIR_DATA_KEY = "repairData"; public static final String REPAIR_PAXOS_KEY = "repairPaxos"; - public static final String PAXOS_ONLY_KEY = "paxosOnly"; public static final String NO_TOMBSTONE_PURGING = "nopurge"; - + public static final String REPAIR_ACCORD_KEY = "repairAccord"; // we don't want to push nodes too much for repair public static final int MAX_JOB_THREADS = 4; @@ -86,6 +93,7 @@ public static Set> parseRanges(String rangesStr, IPartitioner parti } return ranges; } + /** * Construct RepairOptions object from given map of Strings. *

    @@ -167,6 +175,12 @@ public static Set> parseRanges(String rangesStr, IPartitioner parti * ranges to the same host multiple times *

    * + * + * + * + * + * * *
    false
    accordRepair"true" if the repair should be of Accord in flight transactions. Will ensure + * that once repair completes all Accord transactions are replicated at quorumfalse
    * @@ -185,14 +199,21 @@ public static RepairOption parse(Map options, IPartitioner parti boolean force = Boolean.parseBoolean(options.get(FORCE_REPAIR_KEY)); boolean pullRepair = Boolean.parseBoolean(options.get(PULL_REPAIR_KEY)); boolean ignoreUnreplicatedKeyspaces = Boolean.parseBoolean(options.get(IGNORE_UNREPLICATED_KS)); + // Default to true because historically it was a default and some tests were written to expect it + boolean repairData = Boolean.parseBoolean(options.getOrDefault(REPAIR_DATA_KEY, "true")); boolean repairPaxos = Boolean.parseBoolean(options.get(REPAIR_PAXOS_KEY)); - boolean paxosOnly = Boolean.parseBoolean(options.get(PAXOS_ONLY_KEY)); boolean dontPurgeTombstones = Boolean.parseBoolean(options.get(NO_TOMBSTONE_PURGING)); + boolean repairAccord = Boolean.parseBoolean(options.get(REPAIR_ACCORD_KEY)); + if (repairAccord && !DatabaseDescriptor.getAccordTransactionsEnabled()) + { + logger.info("Overriding and disabling Accord repair because Accord is not enabled"); + repairAccord = false; + } if (previewKind != PreviewKind.NONE) { - Preconditions.checkArgument(!repairPaxos, "repairPaxos must be set to false for preview repairs"); - Preconditions.checkArgument(!paxosOnly, "paxosOnly must be set to false for preview repairs"); + checkArgument(!repairPaxos, "repairPaxos must be set to false for preview repairs"); + checkArgument(!repairAccord, "repairAccord must be set to false for preview repairs"); } int jobThreads = 1; @@ -212,7 +233,7 @@ public static RepairOption parse(Map options, IPartitioner parti boolean asymmetricSyncing = Boolean.parseBoolean(options.get(OPTIMISE_STREAMS_KEY)); - RepairOption option = new RepairOption(parallelism, primaryRange, incremental, trace, jobThreads, ranges, !ranges.isEmpty(), pullRepair, force, previewKind, asymmetricSyncing, ignoreUnreplicatedKeyspaces, repairPaxos, paxosOnly, dontPurgeTombstones); + RepairOption option = new RepairOption(parallelism, primaryRange, incremental, trace, jobThreads, ranges, pullRepair, force, previewKind, asymmetricSyncing, ignoreUnreplicatedKeyspaces, repairData, repairPaxos, dontPurgeTombstones, repairAccord); // data centers String dataCentersStr = options.get(DATACENTERS_KEY); @@ -286,39 +307,39 @@ else if (ranges.isEmpty()) private final boolean incremental; private final boolean trace; private final int jobThreads; - private final boolean isSubrangeRepair; private final boolean pullRepair; private final boolean forceRepair; private final PreviewKind previewKind; private final boolean optimiseStreams; private final boolean ignoreUnreplicatedKeyspaces; + private final boolean repairData; private final boolean repairPaxos; - private final boolean paxosOnly; private final boolean dontPurgeTombstones; + private final boolean repairAccord; private final Collection columnFamilies = new HashSet<>(); private final Collection dataCenters = new HashSet<>(); private final Collection hosts = new HashSet<>(); private final Collection> ranges = new HashSet<>(); - public RepairOption(RepairParallelism parallelism, boolean primaryRange, boolean incremental, boolean trace, int jobThreads, Collection> ranges, boolean isSubrangeRepair, boolean pullRepair, boolean forceRepair, PreviewKind previewKind, boolean optimiseStreams, boolean ignoreUnreplicatedKeyspaces, boolean repairPaxos, boolean paxosOnly, boolean dontPurgeTombstones) + public RepairOption(RepairParallelism parallelism, boolean primaryRange, boolean incremental, boolean trace, int jobThreads, Collection> ranges, boolean pullRepair, boolean forceRepair, PreviewKind previewKind, boolean optimiseStreams, boolean ignoreUnreplicatedKeyspaces, boolean repairData, boolean repairPaxos, boolean dontPurgeTombstones, boolean repairAccord) { - + checkArgument(repairData || repairAccord || repairPaxos, "Repair needs to repair at least one of data, Paxos, or Accord"); this.parallelism = parallelism; this.primaryRange = primaryRange; this.incremental = incremental; this.trace = trace; this.jobThreads = jobThreads; this.ranges.addAll(ranges); - this.isSubrangeRepair = isSubrangeRepair; this.pullRepair = pullRepair; this.forceRepair = forceRepair; this.previewKind = previewKind; this.optimiseStreams = optimiseStreams; this.ignoreUnreplicatedKeyspaces = ignoreUnreplicatedKeyspaces; + this.repairData = repairData; this.repairPaxos = repairPaxos; - this.paxosOnly = paxosOnly; this.dontPurgeTombstones = dontPurgeTombstones; + this.repairAccord = repairAccord; } public RepairParallelism getParallelism() @@ -381,11 +402,6 @@ public boolean isGlobal() return dataCenters.isEmpty() && hosts.isEmpty(); } - public boolean isSubrangeRepair() - { - return isSubrangeRepair; - } - public PreviewKind getPreviewKind() { return previewKind; @@ -424,6 +440,11 @@ public boolean ignoreUnreplicatedKeyspaces() return ignoreUnreplicatedKeyspaces; } + public boolean repairData() + { + return repairData; + } + public boolean repairPaxos() { return repairPaxos; @@ -431,7 +452,12 @@ public boolean repairPaxos() public boolean paxosOnly() { - return paxosOnly; + return !repairAccord && !repairData && repairPaxos; + } + + public boolean repairAccord() + { + return repairAccord; } public boolean dontPurgeTombstones() @@ -439,6 +465,11 @@ public boolean dontPurgeTombstones() return dontPurgeTombstones; } + public boolean accordOnly() + { + return !repairPaxos && !repairData && repairAccord; + } + @Override public String toString() { @@ -456,9 +487,10 @@ public String toString() ", force repair: " + forceRepair + ", optimise streams: "+ optimiseStreams() + ", ignore unreplicated keyspaces: "+ ignoreUnreplicatedKeyspaces + + ", repairData: " + repairData + ", repairPaxos: " + repairPaxos + - ", paxosOnly: " + paxosOnly + ", dontPurgeTombstones: " + dontPurgeTombstones + + ", repairAccord: " + repairAccord + ')'; } @@ -472,16 +504,16 @@ public Map asMap() options.put(COLUMNFAMILIES_KEY, Joiner.on(",").join(columnFamilies)); options.put(DATACENTERS_KEY, Joiner.on(",").join(dataCenters)); options.put(HOSTS_KEY, Joiner.on(",").join(hosts)); - options.put(SUB_RANGE_REPAIR_KEY, Boolean.toString(isSubrangeRepair)); options.put(TRACE_KEY, Boolean.toString(trace)); options.put(RANGES_KEY, Joiner.on(",").join(ranges)); options.put(PULL_REPAIR_KEY, Boolean.toString(pullRepair)); options.put(FORCE_REPAIR_KEY, Boolean.toString(forceRepair)); options.put(PREVIEW, previewKind.toString()); options.put(OPTIMISE_STREAMS_KEY, Boolean.toString(optimiseStreams)); + options.put(REPAIR_DATA_KEY, Boolean.toString(repairData)); options.put(REPAIR_PAXOS_KEY, Boolean.toString(repairPaxos)); - options.put(PAXOS_ONLY_KEY, Boolean.toString(paxosOnly)); options.put(NO_TOMBSTONE_PURGING, Boolean.toString(dontPurgeTombstones)); + options.put(REPAIR_ACCORD_KEY, Boolean.toString(repairAccord)); return options; } } diff --git a/src/java/org/apache/cassandra/repair/messages/SyncResponse.java b/src/java/org/apache/cassandra/repair/messages/SyncResponse.java index e7e7985fff34..e7b5446badb5 100644 --- a/src/java/org/apache/cassandra/repair/messages/SyncResponse.java +++ b/src/java/org/apache/cassandra/repair/messages/SyncResponse.java @@ -23,12 +23,13 @@ import java.util.Objects; import org.apache.cassandra.db.TypeSizes; -import org.apache.cassandra.io.IVersionedSerializer; +import org.apache.cassandra.dht.IPartitioner; +import org.apache.cassandra.dht.IPartitionerDependentSerializer; import org.apache.cassandra.io.util.DataInputPlus; import org.apache.cassandra.io.util.DataOutputPlus; import org.apache.cassandra.locator.InetAddressAndPort; -import org.apache.cassandra.repair.SyncNodePair; import org.apache.cassandra.repair.RepairJobDesc; +import org.apache.cassandra.repair.SyncNodePair; import org.apache.cassandra.streaming.SessionSummary; /** @@ -78,7 +79,7 @@ public int hashCode() return Objects.hash(desc, success, nodes, summaries); } - public static final IVersionedSerializer serializer = new IVersionedSerializer() + public static final IPartitionerDependentSerializer serializer = new IPartitionerDependentSerializer() { public void serialize(SyncResponse message, DataOutputPlus out, int version) throws IOException { @@ -93,7 +94,8 @@ public void serialize(SyncResponse message, DataOutputPlus out, int version) thr } } - public SyncResponse deserialize(DataInputPlus in, int version) throws IOException + @Override + public SyncResponse deserialize(DataInputPlus in, IPartitioner partitioner, int version) throws IOException { RepairJobDesc desc = RepairJobDesc.serializer.deserialize(in, version); SyncNodePair nodes = SyncNodePair.serializer.deserialize(in, version); @@ -103,7 +105,7 @@ public SyncResponse deserialize(DataInputPlus in, int version) throws IOExceptio List summaries = new ArrayList<>(numSummaries); for (int i=0; i { public enum State @@ -56,14 +54,34 @@ public enum State // API to split function calls for phase changes from getting the state public final Phase phase = new Phase(); - public CoordinatorState(Clock clock, int cmd, String keyspace, RepairOption options) + public CoordinatorState(SharedContext ctx, int cmd, String keyspace, RepairOption options) { - super(clock, nextTimeUUID(), State.class); + super(ctx.clock(), ctx.timeUUID().get(), State.class); this.cmd = cmd; this.keyspace = Objects.requireNonNull(keyspace); this.options = Objects.requireNonNull(options); } + public String getType() + { + if (options.isPreview()) + { + switch (options.getPreviewKind()) + { + case ALL: return "preview full"; + case REPAIRED: return "preview repaired"; + case UNREPAIRED: return "preview unrepaired"; + case NONE: throw new AssertionError("NONE preview kind not expected when preview repair is set"); + default: throw new AssertionError("Unknown preview kind: " + options.getPreviewKind()); + } + } + else if (options.isIncremental()) + { + return "incremental"; + } + return "full"; + } + public Collection getSessions() { return sessions.values(); diff --git a/src/java/org/apache/cassandra/repair/state/SessionState.java b/src/java/org/apache/cassandra/repair/state/SessionState.java index 32be08935077..352643a2474b 100644 --- a/src/java/org/apache/cassandra/repair/state/SessionState.java +++ b/src/java/org/apache/cassandra/repair/state/SessionState.java @@ -26,11 +26,9 @@ import org.apache.cassandra.locator.InetAddressAndPort; import org.apache.cassandra.repair.CommonRange; -import org.apache.cassandra.utils.Clock; +import org.apache.cassandra.repair.SharedContext; import org.apache.cassandra.utils.TimeUUID; -import static org.apache.cassandra.utils.TimeUUID.Generator.nextTimeUUID; - public class SessionState extends AbstractState { public enum State @@ -46,9 +44,9 @@ public enum State public final Phase phase = new Phase(); - public SessionState(Clock clock, TimeUUID parentRepairSession, String keyspace, String[] cfnames, CommonRange commonRange) + public SessionState(SharedContext ctx, TimeUUID parentRepairSession, String keyspace, String[] cfnames, CommonRange commonRange) { - super(clock, nextTimeUUID(), State.class); + super(ctx.clock(), ctx.timeUUID().get(), State.class); this.parentRepairSession = parentRepairSession; this.keyspace = keyspace; this.cfnames = cfnames; diff --git a/src/java/org/apache/cassandra/schema/AutoRepairParams.java b/src/java/org/apache/cassandra/schema/AutoRepairParams.java new file mode 100644 index 000000000000..1fe80f766f3e --- /dev/null +++ b/src/java/org/apache/cassandra/schema/AutoRepairParams.java @@ -0,0 +1,189 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.schema; + +import java.util.Arrays; +import java.util.Map; +import java.util.Objects; +import java.util.TreeMap; + +import com.google.common.base.MoreObjects; +import com.google.common.collect.ImmutableMap; +import org.apache.commons.lang3.StringUtils; + +import org.apache.cassandra.exceptions.ConfigurationException; +import org.apache.cassandra.repair.autorepair.AutoRepairConfig; +import org.apache.cassandra.utils.LocalizeString; + +import static java.lang.String.format; +import static org.apache.cassandra.utils.LocalizeString.toLowerCaseLocalized; + +/** + * AutoRepair table parameters - used to define the auto-repair configuration for a table. + */ +public final class AutoRepairParams +{ + public enum Option + { + FULL_ENABLED, + INCREMENTAL_ENABLED, + PREVIEW_REPAIRED_ENABLED, + PRIORITY; + + @Override + public String toString() + { + return toLowerCaseLocalized(name()); + } + } + + private final ImmutableMap options; + + public static final Map DEFAULT_OPTIONS = ImmutableMap.of( + LocalizeString.toLowerCaseLocalized(Option.FULL_ENABLED.name()), Boolean.toString(true), + LocalizeString.toLowerCaseLocalized(Option.INCREMENTAL_ENABLED.name()), Boolean.toString(true), + LocalizeString.toLowerCaseLocalized(Option.PREVIEW_REPAIRED_ENABLED.name()), Boolean.toString(true), + Option.PRIORITY.toString(), "0" + ); + + AutoRepairParams(Map options) + { + this.options = ImmutableMap.copyOf(options); + } + + public static final AutoRepairParams DEFAULT = + new AutoRepairParams(DEFAULT_OPTIONS); + + public static AutoRepairParams create(Map options) + { + Map optionsMap = new TreeMap<>(DEFAULT_OPTIONS); + if (options != null) + { + for (Map.Entry entry : options.entrySet()) + { + if (Arrays.stream(Option.values()).noneMatch(option -> option.toString().equalsIgnoreCase(entry.getKey()))) + { + throw new ConfigurationException(format("Unknown property '%s'", entry.getKey())); + } + optionsMap.put(entry.getKey(), entry.getValue()); + } + } + return new AutoRepairParams(optionsMap); + } + + public boolean repairEnabled(AutoRepairConfig.RepairType type) + { + String option = LocalizeString.toLowerCaseLocalized(type.toString()) + "_enabled"; + String enabled = options.getOrDefault(option, DEFAULT_OPTIONS.get(option)); + return Boolean.parseBoolean(enabled); + } + + public int priority() + { + String priority = options.getOrDefault(Option.PRIORITY.toString(), DEFAULT_OPTIONS.get(Option.PRIORITY.toString())); + return Integer.parseInt(priority); + } + + public void validate() + { + for (Option option : Option.values()) + { + if (!options.containsKey(LocalizeString.toLowerCaseLocalized(option.toString()))) + { + throw new ConfigurationException(format("Missing repair sub-option '%s'", option)); + } + } + if (options.get(LocalizeString.toLowerCaseLocalized(Option.FULL_ENABLED.toString())) != null && !isValidBoolean(options.get(LocalizeString.toLowerCaseLocalized(Option.FULL_ENABLED.toString())))) + { + throw new ConfigurationException(format("Invalid value %s for '%s' repair sub-option - must be a boolean", + options.get(LocalizeString.toLowerCaseLocalized(Option.FULL_ENABLED.toString())), + Option.FULL_ENABLED)); + } + if (options.get(LocalizeString.toLowerCaseLocalized(Option.INCREMENTAL_ENABLED.toString())) != null && !isValidBoolean(options.get(LocalizeString.toLowerCaseLocalized(Option.INCREMENTAL_ENABLED.toString())))) + { + throw new ConfigurationException(format("Invalid value %s for '%s' repair sub-option - must be a boolean", + options.get(LocalizeString.toLowerCaseLocalized(Option.INCREMENTAL_ENABLED.toString())), + Option.INCREMENTAL_ENABLED)); + } + if (options.get(LocalizeString.toLowerCaseLocalized(Option.PREVIEW_REPAIRED_ENABLED.toString())) != null && !isValidBoolean(options.get(LocalizeString.toLowerCaseLocalized(Option.PREVIEW_REPAIRED_ENABLED.toString())))) + { + throw new ConfigurationException(format("Invalid value %s for '%s' repair sub-option - must be a boolean", + options.get(LocalizeString.toLowerCaseLocalized(Option.PREVIEW_REPAIRED_ENABLED.toString())), + Option.PREVIEW_REPAIRED_ENABLED)); + } + if (options.get(LocalizeString.toLowerCaseLocalized(Option.PRIORITY.toString())) != null && !isValidInt(options.get(LocalizeString.toLowerCaseLocalized(Option.PRIORITY.toString())))) + { + throw new ConfigurationException(format("Invalid value %s for '%s' repair sub-option - must be an integer", + options.get(LocalizeString.toLowerCaseLocalized(Option.PRIORITY.toString())), + Option.PRIORITY)); + } + } + + public static boolean isValidBoolean(String value) + { + return StringUtils.equalsIgnoreCase(value, "true") || StringUtils.equalsIgnoreCase(value, "false"); + } + + public static boolean isValidInt(String value) + { + return StringUtils.isNumeric(value); + } + + public Map options() + { + return options; + } + + public static AutoRepairParams fromMap(Map map) + { + return create(map); + } + + public Map asMap() + { + return options; + } + + @Override + public String toString() + { + return MoreObjects.toStringHelper(this) + .add("options", options) + .toString(); + } + + @Override + public boolean equals(Object o) + { + if (this == o) + return true; + + if (!(o instanceof AutoRepairParams)) + return false; + + AutoRepairParams cp = (AutoRepairParams) o; + + return options.equals(cp.options); + } + + @Override + public int hashCode() + { + return Objects.hash(options); + } +} diff --git a/src/java/org/apache/cassandra/schema/ColumnMetadata.java b/src/java/org/apache/cassandra/schema/ColumnMetadata.java index e28312532be0..d5e7b23e4060 100644 --- a/src/java/org/apache/cassandra/schema/ColumnMetadata.java +++ b/src/java/org/apache/cassandra/schema/ColumnMetadata.java @@ -19,7 +19,13 @@ import java.io.IOException; import java.nio.ByteBuffer; -import java.util.*; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Comparator; +import java.util.Iterator; +import java.util.List; +import java.util.Objects; +import java.util.Optional; import java.util.function.Predicate; import javax.annotation.Nonnull; @@ -29,24 +35,33 @@ import com.google.common.base.MoreObjects; import com.google.common.collect.Collections2; import com.google.common.collect.Lists; +import org.github.jamm.Unmetered; -import org.apache.cassandra.cql3.*; +import org.apache.cassandra.cql3.ColumnIdentifier; +import org.apache.cassandra.cql3.ColumnSpecification; +import org.apache.cassandra.cql3.CqlBuilder; +import org.apache.cassandra.cql3.VariableSpecifications; import org.apache.cassandra.cql3.constraints.ColumnConstraint; import org.apache.cassandra.cql3.constraints.ColumnConstraints; import org.apache.cassandra.cql3.functions.masking.ColumnMask; import org.apache.cassandra.cql3.selection.Selectable; import org.apache.cassandra.cql3.selection.Selector; import org.apache.cassandra.cql3.selection.SimpleSelector; -import org.apache.cassandra.db.rows.*; -import org.apache.cassandra.db.marshal.*; +import org.apache.cassandra.db.marshal.AbstractType; +import org.apache.cassandra.db.marshal.CollectionType; +import org.apache.cassandra.db.marshal.ReversedType; +import org.apache.cassandra.db.marshal.UTF8Type; +import org.apache.cassandra.db.marshal.UserType; +import org.apache.cassandra.db.rows.Cell; +import org.apache.cassandra.db.rows.CellPath; +import org.apache.cassandra.db.rows.ColumnData; import org.apache.cassandra.exceptions.InvalidRequestException; import org.apache.cassandra.io.util.DataInputPlus; import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.serializers.MarshalException; import org.apache.cassandra.tcm.serialization.UDTAndFunctionsAwareMetadataSerializer; import org.apache.cassandra.tcm.serialization.Version; -import org.apache.cassandra.serializers.MarshalException; import org.apache.cassandra.utils.ByteBufferUtil; -import org.github.jamm.Unmetered; import static org.apache.cassandra.db.TypeSizes.BOOL_SIZE; import static org.apache.cassandra.db.TypeSizes.sizeof; @@ -59,6 +74,7 @@ public final class ColumnMetadata extends ColumnSpecification implements Selecta (a, b) -> ((ColumnData) a).column().compareTo((ColumnMetadata) b); public static final int NO_POSITION = -1; + public static final int NO_UNIQUE_ID = Integer.MIN_VALUE; public enum ClusteringOrder { @@ -99,6 +115,7 @@ public boolean isPrimaryKeyKind() * the first clustering column is 0. */ private final int position; + public final int uniqueId; private final Comparator cellPathComparator; private final Comparator asymmetricCellPathComparator; @@ -132,47 +149,48 @@ private static long comparisonOrder(Kind kind, boolean isComplex, long position, public static ColumnMetadata partitionKeyColumn(TableMetadata table, ByteBuffer name, AbstractType type, int position) { - return new ColumnMetadata(table, name, type, position, Kind.PARTITION_KEY, null, ColumnConstraints.NO_OP); + return new ColumnMetadata(table, name, type, position, position, Kind.PARTITION_KEY, null, ColumnConstraints.NO_OP); } public static ColumnMetadata partitionKeyColumn(String keyspace, String table, String name, AbstractType type, int position) { - return new ColumnMetadata(keyspace, table, ColumnIdentifier.getInterned(name, true), type, position, Kind.PARTITION_KEY, null, ColumnConstraints.NO_OP); + return new ColumnMetadata(keyspace, table, ColumnIdentifier.getInterned(name, true), type, position, position, Kind.PARTITION_KEY, null, ColumnConstraints.NO_OP); } public static ColumnMetadata clusteringColumn(TableMetadata table, ByteBuffer name, AbstractType type, int position) { - return new ColumnMetadata(table, name, type, position, Kind.CLUSTERING, null, ColumnConstraints.NO_OP); + return new ColumnMetadata(table, name, type, position, position, Kind.CLUSTERING, null, ColumnConstraints.NO_OP); } public static ColumnMetadata clusteringColumn(String keyspace, String table, String name, AbstractType type, int position) { - return new ColumnMetadata(keyspace, table, ColumnIdentifier.getInterned(name, true), type, position, Kind.CLUSTERING, null, ColumnConstraints.NO_OP); + return new ColumnMetadata(keyspace, table, ColumnIdentifier.getInterned(name, true), type, position, position, Kind.CLUSTERING, null, ColumnConstraints.NO_OP); } - public static ColumnMetadata regularColumn(TableMetadata table, ByteBuffer name, AbstractType type) + public static ColumnMetadata regularColumn(TableMetadata table, ByteBuffer name, AbstractType type, int uniqueId) { - return new ColumnMetadata(table, name, type, NO_POSITION, Kind.REGULAR, null, ColumnConstraints.NO_OP); + return new ColumnMetadata(table, name, type, uniqueId, NO_POSITION, Kind.REGULAR, null, ColumnConstraints.NO_OP); } - public static ColumnMetadata regularColumn(String keyspace, String table, String name, AbstractType type) + public static ColumnMetadata regularColumn(String keyspace, String table, String name, AbstractType type, int uniqueId) { - return new ColumnMetadata(keyspace, table, ColumnIdentifier.getInterned(name, true), type, NO_POSITION, Kind.REGULAR, null, ColumnConstraints.NO_OP); + return new ColumnMetadata(keyspace, table, ColumnIdentifier.getInterned(name, true), type, uniqueId, NO_POSITION, Kind.REGULAR, null, ColumnConstraints.NO_OP); } - public static ColumnMetadata staticColumn(TableMetadata table, ByteBuffer name, AbstractType type) + public static ColumnMetadata staticColumn(TableMetadata table, ByteBuffer name, AbstractType type, int uniqueId) { - return new ColumnMetadata(table, name, type, NO_POSITION, Kind.STATIC, null, ColumnConstraints.NO_OP); + return new ColumnMetadata(table, name, type, uniqueId, NO_POSITION, Kind.STATIC, null, ColumnConstraints.NO_OP); } - public static ColumnMetadata staticColumn(String keyspace, String table, String name, AbstractType type) + public static ColumnMetadata staticColumn(String keyspace, String table, String name, AbstractType type, int uniqueId) { - return new ColumnMetadata(keyspace, table, ColumnIdentifier.getInterned(name, true), type, NO_POSITION, Kind.STATIC, null, ColumnConstraints.NO_OP); + return new ColumnMetadata(keyspace, table, ColumnIdentifier.getInterned(name, true), type, uniqueId, NO_POSITION, Kind.STATIC, null, ColumnConstraints.NO_OP); } public ColumnMetadata(TableMetadata table, ByteBuffer name, AbstractType type, + int uniqueId, int position, Kind kind, @Nullable ColumnMask mask, @@ -182,6 +200,7 @@ public ColumnMetadata(TableMetadata table, table.name, ColumnIdentifier.getInterned(name, UTF8Type.instance), type, + uniqueId, position, kind, mask, @@ -191,6 +210,7 @@ public ColumnMetadata(TableMetadata table, public ColumnMetadata(TableMetadata table, ByteBuffer name, AbstractType type, + int uniqueId, int position, Kind kind, @Nullable ColumnMask mask) @@ -199,6 +219,7 @@ public ColumnMetadata(TableMetadata table, table.name, ColumnIdentifier.getInterned(name, UTF8Type.instance), type, + uniqueId, position, kind, mask, @@ -210,11 +231,12 @@ public ColumnMetadata(String ksName, String cfName, ColumnIdentifier name, AbstractType type, + int uniqueId, int position, Kind kind, @Nullable ColumnMask mask) { - this(ksName, cfName, name, type, position, kind, mask, ColumnConstraints.NO_OP); + this(ksName, cfName, name, type, uniqueId, position, kind, mask, ColumnConstraints.NO_OP); } @VisibleForTesting @@ -222,12 +244,14 @@ public ColumnMetadata(String ksName, String cfName, ColumnIdentifier name, AbstractType type, + int uniqueId, int position, Kind kind, @Nullable ColumnMask mask, @Nonnull ColumnConstraints columnConstraints) { super(ksName, cfName, name, type); + this.uniqueId = uniqueId; assert name != null && type != null && kind != null; assert (position == NO_POSITION) == !kind.isPrimaryKeyKind(); // The position really only make sense for partition and clustering columns (and those must have one), // so make sure we don't sneak it for something else since it'd breaks equals() @@ -245,6 +269,7 @@ public ColumnMetadata(String ksName, this.comparisonOrder = comparisonOrder(kind, isComplex(), Math.max(0, position), name); this.mask = mask; this.columnConstraints = columnConstraints; + this.columnConstraints.setColumnName(name); } private static Comparator makeCellPathComparator(Kind kind, AbstractType type) @@ -276,22 +301,22 @@ private static Comparator makeCellPathComparator(Kind kind, AbstractTy public ColumnMetadata copy() { - return new ColumnMetadata(ksName, cfName, name, type, position, kind, mask, columnConstraints); + return new ColumnMetadata(ksName, cfName, name, type, uniqueId, position, kind, mask, columnConstraints); } public ColumnMetadata withNewName(ColumnIdentifier newName) { - return new ColumnMetadata(ksName, cfName, newName, type, position, kind, mask, columnConstraints); + return new ColumnMetadata(ksName, cfName, newName, type, uniqueId, position, kind, mask, columnConstraints); } public ColumnMetadata withNewType(AbstractType newType) { - return new ColumnMetadata(ksName, cfName, name, newType, position, kind, mask, columnConstraints); + return new ColumnMetadata(ksName, cfName, name, newType, uniqueId, position, kind, mask, columnConstraints); } public ColumnMetadata withNewMask(@Nullable ColumnMask newMask) { - return new ColumnMetadata(ksName, cfName, name, type, position, kind, newMask, columnConstraints); + return new ColumnMetadata(ksName, cfName, name, type, uniqueId, position, kind, newMask, columnConstraints); } public boolean isPartitionKey() @@ -371,7 +396,7 @@ public ColumnConstraints setColumnConstraints() public ColumnMetadata withNewColumnConstraints(ColumnConstraints constraints) { constraints.validate(this); - return new ColumnMetadata(ksName, cfName, name, type, position, kind, mask, constraints); + return new ColumnMetadata(ksName, cfName, name, type, uniqueId, position, kind, mask, constraints); } public void removeColumnConstraints() @@ -396,12 +421,12 @@ public boolean equals(Object o) private boolean equalsWithoutType(ColumnMetadata other) { return name.equals(other.name) - && kind == other.kind - && position == other.position - && ksName.equals(other.ksName) - && cfName.equals(other.cfName) - && Objects.equals(mask, other.mask) - && Objects.equals(columnConstraints, other.columnConstraints); + && kind == other.kind + && position == other.position + && ksName.equals(other.ksName) + && cfName.equals(other.cfName) + && Objects.equals(mask, other.mask) + && Objects.equals(columnConstraints, other.columnConstraints); } Optional compare(ColumnMetadata other) @@ -711,6 +736,8 @@ public void serialize(ColumnMetadata t, DataOutputPlus out, Version version) thr if (hasConstraints) ColumnConstraints.serializer.serialize(t.columnConstraints, out, version); } + if (version.isAtLeast(Version.V7)) + out.writeInt(t.uniqueId); } public ColumnMetadata deserialize(DataInputPlus in, Types types, UserFunctions functions, Version version) throws IOException @@ -734,7 +761,10 @@ public ColumnMetadata deserialize(DataInputPlus in, Types types, UserFunctions f constraints = ColumnConstraints.serializer.deserialize(in, version); else constraints = ColumnConstraints.NO_OP; - return new ColumnMetadata(ksName, tableName, new ColumnIdentifier(nameBB, name), type, position, kind, mask, constraints); + int uniqueId = NO_UNIQUE_ID; + if (version.isAtLeast(Version.V7)) + uniqueId = in.readInt(); + return new ColumnMetadata(ksName, tableName, new ColumnIdentifier(nameBB, name), type, uniqueId, position, kind, mask, constraints); } public long serializedSize(ColumnMetadata t, Version version) @@ -756,7 +786,8 @@ public long serializedSize(ColumnMetadata t, Version version) ByteBufferUtil.serializedSizeWithShortLength(t.name.bytes) + BOOL_SIZE + ((t.mask == null) ? 0 : ColumnMask.serializer.serializedSize(t.mask, version)) + - constraintsSize; + constraintsSize + + (version.isAtLeast(Version.V7) ? 4 : 0); } } } diff --git a/src/java/org/apache/cassandra/schema/DistributedMetadataLogKeyspace.java b/src/java/org/apache/cassandra/schema/DistributedMetadataLogKeyspace.java index 37af87f3e400..edc9afe17f9c 100644 --- a/src/java/org/apache/cassandra/schema/DistributedMetadataLogKeyspace.java +++ b/src/java/org/apache/cassandra/schema/DistributedMetadataLogKeyspace.java @@ -26,8 +26,6 @@ import com.google.common.collect.ImmutableMap; -import org.apache.cassandra.locator.MetaStrategy; -import org.apache.cassandra.utils.JVMStabilityInspector; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -36,6 +34,8 @@ import org.apache.cassandra.cql3.statements.schema.CreateTableStatement; import org.apache.cassandra.db.ConsistencyLevel; import org.apache.cassandra.exceptions.CasWriteTimeoutException; +import org.apache.cassandra.locator.MetaStrategy; +import org.apache.cassandra.service.accord.fastpath.FastPathStrategy; import org.apache.cassandra.tcm.ClusterMetadataService; import org.apache.cassandra.tcm.Epoch; import org.apache.cassandra.tcm.MetadataSnapshots; @@ -44,6 +44,7 @@ import org.apache.cassandra.tcm.log.LogReader; import org.apache.cassandra.tcm.log.LogState; import org.apache.cassandra.tcm.transformations.cms.PreInitialize; +import org.apache.cassandra.utils.JVMStabilityInspector; import static org.apache.cassandra.tcm.Epoch.FIRST; @@ -162,6 +163,20 @@ public static LogState getLogState(Epoch since, boolean consistentFetch) return (consistentFetch ? serialLogReader : localLogReader).getLogState(since); } + /** + * Reconstructs the log state by returning a _consistent_ base snapshot of a start epoch, and + * a list of transformations between start and end. + * + * TODO: this is a rather expensive operation, and should be use sparingly. If we decide we need to + * rely on reconstructing arbitrary epochs during normal operation, we need to add a caching mechanism + * here. One more alternative is to keep a lazily-initialized AccordTopology table on CMS nodes for a + * number of recent epochs, and keep a node-local cache of this table on other nodes. + */ + public static LogState getLogState(Epoch start, Epoch end, boolean includeSnapshot) + { + return serialLogReader.getLogState(start, end, includeSnapshot); + } + public static class DistributedTableLogReader implements LogReader { private final ConsistencyLevel consistencyLevel; @@ -199,6 +214,27 @@ public EntryHolder getEntries(Epoch since) throws IOException return entryHolder; } + public EntryHolder getEntries(Epoch since, Epoch until) throws IOException + { + // during gossip upgrade we have epoch = Long.MIN_VALUE + 1 (and the reverse partitioner doesn't support negative keys) + since = since.isBefore(Epoch.EMPTY) ? Epoch.EMPTY : since; + // note that we want all entries with epoch >= since - but since we use a reverse partitioner, we actually + // want all entries where the token is less than token(since) + UntypedResultSet resultSet = execute(String.format("SELECT epoch, kind, transformation, entry_id FROM %s.%s WHERE token(epoch) <= token(?) AND token(epoch) >= token(?)", + SchemaConstants.METADATA_KEYSPACE_NAME, TABLE_NAME), + consistencyLevel, since.getEpoch(), until.getEpoch()); + EntryHolder entryHolder = new EntryHolder(since); + for (UntypedResultSet.Row row : resultSet) + { + long entryId = row.getLong("entry_id"); + Epoch epoch = Epoch.create(row.getLong("epoch")); + Transformation.Kind kind = Transformation.Kind.fromId(row.getInt("kind")); + Transformation transform = kind.fromVersionedBytes(row.getBlob("transformation")); + entryHolder.add(new Entry(new Entry.Id(entryId), epoch, transform)); + } + return entryHolder; + } + @Override public MetadataSnapshots snapshots() { @@ -223,7 +259,7 @@ private static TableMetadata.Builder parse(String cql, String table, String desc public static KeyspaceMetadata initialMetadata(Set knownDatacenters) { - return KeyspaceMetadata.create(SchemaConstants.METADATA_KEYSPACE_NAME, new KeyspaceParams(true, ReplicationParams.simpleMeta(1, knownDatacenters)), Tables.of(Log)); + return KeyspaceMetadata.create(SchemaConstants.METADATA_KEYSPACE_NAME, new KeyspaceParams(true, ReplicationParams.simpleMeta(1, knownDatacenters), FastPathStrategy.simple()), Tables.of(Log)); } public static KeyspaceMetadata initialMetadata(String datacenter) diff --git a/src/java/org/apache/cassandra/schema/DistributedSchema.java b/src/java/org/apache/cassandra/schema/DistributedSchema.java index e4eead15b5ea..b13cfc7db3f6 100644 --- a/src/java/org/apache/cassandra/schema/DistributedSchema.java +++ b/src/java/org/apache/cassandra/schema/DistributedSchema.java @@ -18,19 +18,8 @@ package org.apache.cassandra.schema; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Collection; -import java.util.Collections; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Objects; -import java.util.Set; -import java.util.UUID; - import com.google.common.base.Preconditions; - +import com.google.common.collect.ImmutableMap; import org.apache.cassandra.auth.AuthKeyspace; import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.cql3.functions.UserFunction; @@ -46,6 +35,18 @@ import org.apache.cassandra.tracing.TraceKeyspace; import org.apache.cassandra.utils.FBUtilities; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Optional; +import java.util.Set; +import java.util.UUID; + import static org.apache.cassandra.db.TypeSizes.sizeof; /** @@ -79,10 +80,18 @@ public static DistributedSchema first(Set knownDatacenters) return new DistributedSchema(Keyspaces.of(DistributedMetadataLogKeyspace.initialMetadata(knownDatacenters)), Epoch.FIRST); } + private static ImmutableMap keyspacesToTableMap(Keyspaces keyspaces) + { + ImmutableMap.Builder builder = ImmutableMap.builder(); + keyspaces.forEach(ksm -> ksm.tablesAndViews().forEach(tbl -> builder.put(tbl.id, tbl))); + return builder.build(); + } + private final Keyspaces keyspaces; private final Epoch epoch; private final UUID version; private final Map keyspaceInstances = new HashMap<>(); + private final transient ImmutableMap tables; public DistributedSchema(Keyspaces keyspaces) { @@ -95,6 +104,7 @@ public DistributedSchema(Keyspaces keyspaces, Epoch epoch) this.keyspaces = keyspaces; this.epoch = epoch; this.version = new UUID(0, epoch.getEpoch()); + this.tables = keyspacesToTableMap(keyspaces); validate(); } @@ -120,6 +130,22 @@ public KeyspaceMetadata getKeyspaceMetadata(String keyspace) return keyspaces.get(keyspace).get(); } + public Optional maybeGetKeyspaceMetadata(String keyspace) + { + return keyspaces.get(keyspace); + } + + public TableMetadata getTableMetadata(TableId id) + { + return tables.get(id); + } + + public TableMetadata getTableMetadata(String keyspace, String cf) + { + KeyspaceMetadata ks = keyspaces.getNullable(keyspace); + return ks == null ? null : ks.tables.getNullable(cf); + } + public static DistributedSchema fromSystemTables(Keyspaces keyspaces, Set knownDatacenters) { if (!keyspaces.containsKeyspace(SchemaConstants.METADATA_KEYSPACE_NAME)) diff --git a/src/java/org/apache/cassandra/schema/KeyspaceMetadata.java b/src/java/org/apache/cassandra/schema/KeyspaceMetadata.java index 8065c5929007..16a5c1b67a60 100644 --- a/src/java/org/apache/cassandra/schema/KeyspaceMetadata.java +++ b/src/java/org/apache/cassandra/schema/KeyspaceMetadata.java @@ -53,6 +53,7 @@ import static com.google.common.collect.Iterables.any; import static java.lang.String.format; import static org.apache.cassandra.db.TypeSizes.sizeof; +import static org.apache.cassandra.utils.LocalizeString.toLowerCaseLocalized; /** * An immutable representation of keyspace metadata (name, params, tables, types, and functions). @@ -364,9 +365,16 @@ public String toCqlString(boolean withWarnings, boolean withInternals, boolean i params.replication.appendCqlTo(builder); builder.append(" AND durable_writes = ") - .append(params.durableWrites) - .append(';') - .toString(); + .append(params.durableWrites); + + if (params.fastPath != null) + { + builder.append(" AND fast_path = '") + .append(toLowerCaseLocalized(params.fastPath.toString())) + .append("'"); + } + + builder.append(';'); } return builder.toString(); } diff --git a/src/java/org/apache/cassandra/schema/KeyspaceParams.java b/src/java/org/apache/cassandra/schema/KeyspaceParams.java index 76516334b8d5..09afed84ba89 100644 --- a/src/java/org/apache/cassandra/schema/KeyspaceParams.java +++ b/src/java/org/apache/cassandra/schema/KeyspaceParams.java @@ -28,10 +28,12 @@ import org.apache.cassandra.db.TypeSizes; import org.apache.cassandra.io.util.DataInputPlus; import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.service.accord.fastpath.FastPathStrategy; import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.tcm.serialization.MetadataSerializer; import org.apache.cassandra.tcm.serialization.Version; +import static org.apache.cassandra.tcm.serialization.Version.MIN_ACCORD_VERSION; import static org.apache.cassandra.utils.LocalizeString.toLowerCaseLocalized; /** @@ -54,7 +56,8 @@ public final class KeyspaceParams public enum Option { DURABLE_WRITES, - REPLICATION; + REPLICATION, + FAST_PATH; @Override public String toString() @@ -65,41 +68,53 @@ public String toString() public final boolean durableWrites; public final ReplicationParams replication; + public final FastPathStrategy fastPath; - public KeyspaceParams(boolean durableWrites, ReplicationParams replication) + public KeyspaceParams(boolean durableWrites, ReplicationParams replication, FastPathStrategy fastPath) { this.durableWrites = durableWrites; this.replication = replication; + this.fastPath = fastPath; + } + + public static KeyspaceParams create(boolean durableWrites, Map replication, FastPathStrategy fastPath) + { + return new KeyspaceParams(durableWrites, ReplicationParams.fromMap(replication), fastPath); + } + + public static KeyspaceParams create(boolean durableWrites, Map replication, Map fastPath) + { + return create(durableWrites, replication, FastPathStrategy.fromMap(fastPath)); } public static KeyspaceParams create(boolean durableWrites, Map replication) { - return new KeyspaceParams(durableWrites, ReplicationParams.fromMap(replication)); + return create(durableWrites, replication, FastPathStrategy.simple()); } public static KeyspaceParams local() { - return new KeyspaceParams(DEFAULT_LOCAL_DURABLE_WRITES, ReplicationParams.local()); + return new KeyspaceParams(DEFAULT_LOCAL_DURABLE_WRITES, ReplicationParams.local(), FastPathStrategy.simple()); } public static KeyspaceParams simple(int replicationFactor) { - return new KeyspaceParams(true, ReplicationParams.simple(replicationFactor)); + return new KeyspaceParams(true, ReplicationParams.simple(replicationFactor), FastPathStrategy.simple()); } public static KeyspaceParams simple(String replicationFactor) { - return new KeyspaceParams(true, ReplicationParams.simple(replicationFactor)); + return new KeyspaceParams(true, ReplicationParams.simple(replicationFactor), FastPathStrategy.simple()); } public static KeyspaceParams simpleTransient(int replicationFactor) { - return new KeyspaceParams(false, ReplicationParams.simple(replicationFactor)); + return new KeyspaceParams(false, ReplicationParams.simple(replicationFactor), FastPathStrategy.simple()); } public static KeyspaceParams nts(Object... args) { - return new KeyspaceParams(true, ReplicationParams.nts(args)); + return new KeyspaceParams(true, ReplicationParams.nts(args), FastPathStrategy.simple()); } public void validate(String name, ClientState state, ClusterMetadata metadata) @@ -118,13 +133,13 @@ public boolean equals(Object o) KeyspaceParams p = (KeyspaceParams) o; - return durableWrites == p.durableWrites && replication.equals(p.replication); + return durableWrites == p.durableWrites && replication.equals(p.replication) && fastPath.equals(p.fastPath); } @Override public int hashCode() { - return Objects.hashCode(durableWrites, replication); + return Objects.hashCode(durableWrites, replication, fastPath); } @Override @@ -133,6 +148,7 @@ public String toString() return MoreObjects.toStringHelper(this) .add(Option.DURABLE_WRITES.toString(), durableWrites) .add(Option.REPLICATION.toString(), replication) + .add(Option.FAST_PATH.toString(), fastPath.toString()) .toString(); } @@ -142,19 +158,25 @@ public void serialize(KeyspaceParams t, DataOutputPlus out, Version version) thr { ReplicationParams.serializer.serialize(t.replication, out, version); out.writeBoolean(t.durableWrites); + if (version.isAtLeast(MIN_ACCORD_VERSION)) + FastPathStrategy.serializer.serialize(t.fastPath, out, version); } public KeyspaceParams deserialize(DataInputPlus in, Version version) throws IOException { ReplicationParams params = ReplicationParams.serializer.deserialize(in, version); boolean durableWrites = in.readBoolean(); - return new KeyspaceParams(durableWrites, params); + FastPathStrategy fastPath = version.isAtLeast(MIN_ACCORD_VERSION) + ? FastPathStrategy.serializer.deserialize(in, version) + : FastPathStrategy.simple(); + return new KeyspaceParams(durableWrites, params, fastPath); } public long serializedSize(KeyspaceParams t, Version version) { return ReplicationParams.serializer.serializedSize(t.replication, version) + - TypeSizes.sizeof(t.durableWrites); + TypeSizes.sizeof(t.durableWrites) + + (version.isAtLeast(MIN_ACCORD_VERSION) ? FastPathStrategy.serializer.serializedSize(t.fastPath, version) : 0); } } } diff --git a/src/java/org/apache/cassandra/schema/ReplicationParams.java b/src/java/org/apache/cassandra/schema/ReplicationParams.java index da44292e0d63..40a92c803fe6 100644 --- a/src/java/org/apache/cassandra/schema/ReplicationParams.java +++ b/src/java/org/apache/cassandra/schema/ReplicationParams.java @@ -70,6 +70,11 @@ public static ReplicationParams fromStrategy(AbstractReplicationStrategy strateg return new ReplicationParams(strategy.getClass(), strategy.configOptions); } + public static ReplicationParams copy(AbstractReplicationStrategy strategy) + { + return new ReplicationParams(strategy.getClass(), strategy.configOptions); + } + public static ReplicationParams local() { return new ReplicationParams(LocalStrategy.class, ImmutableMap.of()); diff --git a/src/java/org/apache/cassandra/schema/Schema.java b/src/java/org/apache/cassandra/schema/Schema.java index afb987a7a0c1..fcfae3f1515d 100644 --- a/src/java/org/apache/cassandra/schema/Schema.java +++ b/src/java/org/apache/cassandra/schema/Schema.java @@ -31,6 +31,7 @@ import org.slf4j.LoggerFactory; import org.apache.cassandra.config.CassandraRelevantProperties; +import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.db.Keyspace; import org.apache.cassandra.db.SystemKeyspace; import org.apache.cassandra.db.virtual.VirtualKeyspaceRegistry; @@ -41,6 +42,7 @@ import org.apache.cassandra.exceptions.UnauthorizedException; import org.apache.cassandra.io.sstable.Descriptor; import org.apache.cassandra.locator.LocalStrategy; +import org.apache.cassandra.service.accord.AccordKeyspace; import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.tcm.ClusterMetadataService; import org.apache.cassandra.tcm.transformations.AlterSchema; @@ -73,10 +75,14 @@ public final class Schema implements SchemaProvider private static Schema initialize() { - Keyspaces initialLocal = ((FORCE_LOAD_LOCAL_KEYSPACES || isDaemonInitialized() || isToolInitialized())) - ? Keyspaces.of(SchemaKeyspace.metadata(), - SystemKeyspace.metadata()) - : Keyspaces.NONE; + Keyspaces initialLocal = Keyspaces.NONE; + + if (FORCE_LOAD_LOCAL_KEYSPACES || isDaemonInitialized() || isToolInitialized()) + { + initialLocal = Keyspaces.of(SchemaKeyspace.metadata(), SystemKeyspace.metadata()); + initialLocal = DatabaseDescriptor.getAccordTransactionsEnabled() ? initialLocal.with(AccordKeyspace.metadata()) : initialLocal; + } + Schema schema = new Schema(initialLocal); for (KeyspaceMetadata ks : schema.localKeyspaces) schema.localKeyspaceInstances.put(ks.name, new LazyVariable<>(() -> Keyspace.forSchema(ks.name, schema))); @@ -135,7 +141,10 @@ public Keyspace getKeyspaceInstance(String keyspaceName) if (SchemaConstants.isVirtualSystemKeyspace(keyspaceName)) return null; else if (SchemaConstants.isLocalSystemKeyspace(keyspaceName)) - return localKeyspaceInstances.get(keyspaceName).get(); + { + Supplier supplier = localKeyspaceInstances.get(keyspaceName); + return supplier == null ? null : supplier.get(); + } else return ClusterMetadata.current().schema.getKeyspace(keyspaceName); } @@ -270,10 +279,15 @@ public TableMetadata getTableMetadata(String keyspace, String table) @Override public TableMetadata getTableMetadata(TableId id) { - return ObjectUtils.getFirstNonNull(() -> localKeyspaces.getTableOrViewNullable(id), - () -> distributedKeyspaces().getTableOrViewNullable(id), - () -> VirtualKeyspaceRegistry.instance.getTableMetadataNullable(id)); + TableMetadata metadata = localKeyspaces.getTableOrViewNullable(id); + if (metadata != null) + return metadata; + + metadata = distributedKeyspaces().getTableOrViewNullable(id); + if (metadata != null) + return metadata; + return VirtualKeyspaceRegistry.instance.getTableMetadataNullable(id); } public TableMetadata getTableMetadata(Descriptor descriptor) diff --git a/src/java/org/apache/cassandra/schema/SchemaConstants.java b/src/java/org/apache/cassandra/schema/SchemaConstants.java index 2323893c4b3b..03e911d675b4 100644 --- a/src/java/org/apache/cassandra/schema/SchemaConstants.java +++ b/src/java/org/apache/cassandra/schema/SchemaConstants.java @@ -30,6 +30,7 @@ import org.apache.cassandra.auth.AuthKeyspace; import org.apache.cassandra.db.Digest; import org.apache.cassandra.db.SystemKeyspace; +import org.apache.cassandra.service.accord.AccordKeyspace; import org.apache.cassandra.tracing.TraceKeyspace; import static org.apache.cassandra.utils.LocalizeString.toLowerCaseLocalized; @@ -46,23 +47,24 @@ public final class SchemaConstants public static final String METADATA_KEYSPACE_NAME = "system_cluster_metadata"; public static final String TRACE_KEYSPACE_NAME = "system_traces"; + public static final String ACCORD_KEYSPACE_NAME = "system_accord"; public static final String AUTH_KEYSPACE_NAME = "system_auth"; public static final String DISTRIBUTED_KEYSPACE_NAME = "system_distributed"; public static final String VIRTUAL_SCHEMA = "system_virtual_schema"; - public static final String VIRTUAL_VIEWS = "system_views"; public static final String VIRTUAL_METRICS = "system_metrics"; + public static final String VIRTUAL_ACCORD_DEBUG = "system_accord_debug"; public static final String DUMMY_KEYSPACE_OR_TABLE_NAME = "--dummy--"; /* system keyspace names (the ones with LocalStrategy replication strategy) */ public static final Set LOCAL_SYSTEM_KEYSPACE_NAMES = - ImmutableSet.of(SYSTEM_KEYSPACE_NAME, SCHEMA_KEYSPACE_NAME); + ImmutableSet.of(SYSTEM_KEYSPACE_NAME, SCHEMA_KEYSPACE_NAME, ACCORD_KEYSPACE_NAME); /* virtual table system keyspace names */ public static final Set VIRTUAL_SYSTEM_KEYSPACE_NAMES = - ImmutableSet.of(VIRTUAL_VIEWS, VIRTUAL_SCHEMA); + ImmutableSet.of(VIRTUAL_SCHEMA, VIRTUAL_VIEWS, VIRTUAL_METRICS); /* replicate system keyspace names (the ones with a "true" replication strategy) */ public static final Set REPLICATED_SYSTEM_KEYSPACE_NAMES = @@ -165,6 +167,7 @@ public static Set getLocalAndReplicatedSystemTableNames() .addAll(TraceKeyspace.TABLE_NAMES) .addAll(AuthKeyspace.TABLE_NAMES) .addAll(SystemDistributedKeyspace.TABLE_NAMES) + .addAll(AccordKeyspace.TABLE_NAMES) .build(); } } diff --git a/src/java/org/apache/cassandra/schema/SchemaKeyspace.java b/src/java/org/apache/cassandra/schema/SchemaKeyspace.java index b3ed1c2702e1..9001480c2494 100644 --- a/src/java/org/apache/cassandra/schema/SchemaKeyspace.java +++ b/src/java/org/apache/cassandra/schema/SchemaKeyspace.java @@ -44,6 +44,7 @@ import org.apache.cassandra.db.partitions.*; import org.apache.cassandra.db.rows.*; import org.apache.cassandra.exceptions.InvalidRequestException; +import org.apache.cassandra.service.accord.fastpath.FastPathStrategy; import org.apache.cassandra.service.reads.SpeculativeRetryPolicy; import org.apache.cassandra.schema.ColumnMetadata.ClusteringOrder; import org.apache.cassandra.schema.Keyspaces.KeyspacesDiff; @@ -61,6 +62,7 @@ import static org.apache.cassandra.config.CassandraRelevantProperties.TEST_FLUSH_LOCAL_SCHEMA_CHANGES; import static org.apache.cassandra.cql3.QueryProcessor.executeInternal; import static org.apache.cassandra.cql3.QueryProcessor.executeOnceInternal; +import static org.apache.cassandra.schema.ColumnMetadata.NO_UNIQUE_ID; import static org.apache.cassandra.schema.SchemaKeyspaceTables.*; import static org.apache.cassandra.utils.LocalizeString.toLowerCaseLocalized; import static org.apache.cassandra.utils.LocalizeString.toUpperCaseLocalized; @@ -97,6 +99,7 @@ private SchemaKeyspace() + "keyspace_name text," + "durable_writes boolean," + "replication frozen>," + + "fast_path frozen>," + "PRIMARY KEY ((keyspace_name)))"); private static final TableMetadata Tables = @@ -128,6 +131,8 @@ private SchemaKeyspace() + "additional_write_policy text," + "cdc boolean," + "read_repair text," + + "fast_path frozen>," + + "auto_repair frozen>," + "PRIMARY KEY ((keyspace_name), table_name))"); private static final TableMetadata Columns = @@ -212,6 +217,7 @@ private SchemaKeyspace() + "additional_write_policy text," + "cdc boolean," + "read_repair text," + + "auto_repair frozen>," + "PRIMARY KEY ((keyspace_name), view_name))"); private static final TableMetadata Indexes = @@ -343,7 +349,7 @@ static void saveSystemKeyspacesSchema() for (String schemaTable : ALL) { String query = String.format("DELETE FROM %s.%s USING TIMESTAMP ? WHERE keyspace_name = ?", SchemaConstants.SCHEMA_KEYSPACE_NAME, schemaTable); - for (String systemKeyspace : SchemaConstants.LOCAL_SYSTEM_KEYSPACE_NAMES) + for (String systemKeyspace : Schema.instance.localKeyspaces().names()) executeOnceInternal(query, timestamp, systemKeyspace); } @@ -491,7 +497,8 @@ private static Mutation.SimpleBuilder makeCreateKeyspaceMutation(String name, Ke .row() .add(KeyspaceParams.Option.DURABLE_WRITES.toString(), params.durableWrites) .add(KeyspaceParams.Option.REPLICATION.toString(), - (params.replication.isMeta() ? params.replication.asNonMeta() : params.replication).asMap()); + (params.replication.isMeta() ? params.replication.asNonMeta() : params.replication).asMap()) + .add(KeyspaceParams.Option.FAST_PATH.toString(), params.fastPath.asMap()); return builder; } @@ -551,7 +558,7 @@ private static void addTableToSchemaMutation(TableMetadata table, boolean withCo .add("id", table.id.asUUID()) .add("flags", TableMetadata.Flag.toStringSet(table.flags)); - addTableParamsToRowBuilder(table.params, rowBuilder); + addTableParamsToRowBuilder(table.params, rowBuilder, false); if (withColumnsAndTriggers) { @@ -569,7 +576,7 @@ private static void addTableToSchemaMutation(TableMetadata table, boolean withCo } } - private static void addTableParamsToRowBuilder(TableParams params, Row.SimpleBuilder builder) + public static void addTableParamsToRowBuilder(TableParams params, Row.SimpleBuilder builder, boolean forView) { builder.add("bloom_filter_fp_chance", params.bloomFilterFpChance) .add("comment", params.comment) @@ -608,6 +615,17 @@ private static void addTableParamsToRowBuilder(TableParams params, Row.SimpleBui // incremental_backups is enabled, to avoid RTE in pre-4.2 versioned node during upgrades if (!params.incrementalBackups) builder.add("incremental_backups", false); + + if (DatabaseDescriptor.getAccordTransactionsEnabled() && !forView) + builder.add("fast_path", params.fastPath.asMap()); + + // As above, only add the auto_repair column if the scheduler is enabled + // to avoid RTE in pre-5.1 versioned node during upgrades + if (DatabaseDescriptor.getRawConfig() != null + && DatabaseDescriptor.getAutoRepairConfig().isAutoRepairSchedulingEnabled()) + { + builder.add("auto_repair", params.autoRepair.asMap()); + } } private static void addAlterTableToSchemaMutation(TableMetadata oldTable, TableMetadata newTable, Mutation.SimpleBuilder builder) @@ -820,7 +838,7 @@ private static void addViewToSchemaMutation(ViewMetadata view, boolean includeCo .add("where_clause", view.whereClause.toCQLString()) .add("id", table.id.asUUID()); - addTableParamsToRowBuilder(table.params, rowBuilder); + addTableParamsToRowBuilder(table.params, rowBuilder, true); if (includeColumns) { @@ -966,9 +984,11 @@ private static KeyspaceParams fetchKeyspaceParams(String keyspaceName) UntypedResultSet.Row row = query(query, keyspaceName).one(); boolean durableWrites = row.getBoolean(KeyspaceParams.Option.DURABLE_WRITES.toString()); Map replication = row.getFrozenTextMap(KeyspaceParams.Option.REPLICATION.toString()); - KeyspaceParams params = KeyspaceParams.create(durableWrites, replication); + Map fastPath = row.getFrozenTextMap(KeyspaceParams.Option.FAST_PATH.toString()); + KeyspaceParams params = KeyspaceParams.create(durableWrites, replication, fastPath); + if (keyspaceName.equals(SchemaConstants.METADATA_KEYSPACE_NAME)) - params = new KeyspaceParams(params.durableWrites, params.replication.asMeta()); + params = new KeyspaceParams(params.durableWrites, params.replication.asMeta(), FastPathStrategy.simple()); return params; } @@ -1070,7 +1090,8 @@ static TableParams createTableParamsFromRow(UntypedResultSet.Row row) SpeculativeRetryPolicy.fromString(row.getString("additional_write_policy")) : SpeculativeRetryPolicy.fromString("99PERCENTILE")) .cdc(row.has("cdc") && row.getBoolean("cdc")) - .readRepair(getReadRepairStrategy(row)); + .readRepair(getReadRepairStrategy(row)) + .fastPath(getFastPathStrategy(row)); // allow_auto_snapshot column was introduced in 4.2 if (row.has("allow_auto_snapshot")) @@ -1080,6 +1101,12 @@ static TableParams createTableParamsFromRow(UntypedResultSet.Row row) if (row.has("incremental_backups")) builder.incrementalBackups(row.getBoolean("incremental_backups")); + // auto_repair column was introduced in 5.1 + if (row.has("auto_repair")) + { + builder.automatedRepair(AutoRepairParams.fromMap(row.getFrozenTextMap("auto_repair"))); + } + return builder.build(); } @@ -1162,7 +1189,7 @@ else if (!(function instanceof ScalarFunction)) mask = new ColumnMask((ScalarFunction) function, values); } - return new ColumnMetadata(keyspace, table, name, type, position, kind, mask); + return new ColumnMetadata(keyspace, table, name, type, NO_UNIQUE_ID, position, kind, mask); } private static Map fetchDroppedColumns(String keyspace, String table) @@ -1194,7 +1221,7 @@ private static DroppedColumn createDroppedColumnFromRow(UntypedResultSet.Row row assert kind == ColumnMetadata.Kind.REGULAR || kind == ColumnMetadata.Kind.STATIC : "Unexpected dropped column kind: " + kind; - ColumnMetadata column = new ColumnMetadata(keyspace, table, ColumnIdentifier.getInterned(name, true), type, ColumnMetadata.NO_POSITION, kind, null); + ColumnMetadata column = new ColumnMetadata(keyspace, table, ColumnIdentifier.getInterned(name, true), type, NO_UNIQUE_ID, ColumnMetadata.NO_POSITION, kind, null); long droppedTime = TimeUnit.MILLISECONDS.toMicros(row.getLong("dropped_time")); return new DroppedColumn(column, droppedTime); } @@ -1448,4 +1475,11 @@ private static ReadRepairStrategy getReadRepairStrategy(UntypedResultSet.Row row ? ReadRepairStrategy.fromString(row.getString("read_repair")) : ReadRepairStrategy.BLOCKING; } + + private static FastPathStrategy getFastPathStrategy(UntypedResultSet.Row row) + { + return row.has("fast_path") + ? FastPathStrategy.fromMap(row.getFrozenTextMap("fast_path")) + : FastPathStrategy.inheritKeyspace(); + } } diff --git a/src/java/org/apache/cassandra/schema/SchemaProvider.java b/src/java/org/apache/cassandra/schema/SchemaProvider.java index 0e34ee55095d..2e137433af7a 100644 --- a/src/java/org/apache/cassandra/schema/SchemaProvider.java +++ b/src/java/org/apache/cassandra/schema/SchemaProvider.java @@ -18,6 +18,7 @@ package org.apache.cassandra.schema; +import java.nio.ByteBuffer; import java.util.Collection; import java.util.Collections; import java.util.List; @@ -26,6 +27,7 @@ import java.util.UUID; import javax.annotation.Nullable; +import org.apache.cassandra.cql3.ColumnIdentifier; import org.apache.cassandra.cql3.functions.Function; import org.apache.cassandra.cql3.functions.FunctionName; import org.apache.cassandra.cql3.functions.UserFunction; @@ -33,6 +35,7 @@ import org.apache.cassandra.db.Keyspace; import org.apache.cassandra.db.KeyspaceNotDefinedException; import org.apache.cassandra.db.marshal.AbstractType; +import org.apache.cassandra.dht.IPartitioner; import org.apache.cassandra.exceptions.InvalidRequestException; import org.apache.cassandra.exceptions.UnknownTableException; import org.apache.cassandra.io.sstable.Descriptor; @@ -133,6 +136,18 @@ default ColumnFamilyStore getColumnFamilyStoreInstance(TableId id) @Nullable TableMetadata getTableMetadata(TableId id); + @Nullable + default IPartitioner getTablePartitioner(TableId id) + { + TableMetadata metadata = getTableMetadata(id); + return metadata == null ? null : metadata.partitioner; + } + + default IPartitioner getExistingTablePartitioner(TableId id) throws UnknownTableException + { + return getExistingTableMetadata(id).partitioner; + } + @Nullable default TableMetadataRef getTableMetadataRef(TableId id) { @@ -147,16 +162,30 @@ default TableMetadataRef getTableMetadataRef(String keyspace, String table) return getTableMetadata(keyspace, table).ref; } + @Nullable + default ColumnMetadata getColumnMetadata(String keyspace, String table, ColumnIdentifier name) + { + TableMetadata metadata = getTableMetadata(keyspace, table); + if (metadata == null) return null; + return metadata.getColumn(name); + } + + @Nullable + default ColumnMetadata getColumnMetadata(String keyspace, String table, ByteBuffer name) + { + TableMetadata metadata = getTableMetadata(keyspace, table); + if (metadata == null) return null; + return metadata.getColumn(name); + } + default TableMetadata getExistingTableMetadata(TableId id) throws UnknownTableException { TableMetadata metadata = getTableMetadata(id); if (metadata != null) return metadata; - String message = - String.format("Couldn't find table with id %s. If a table was just created, this is likely due to the schema " - + "not being fully propagated. Please wait for schema agreement on table creation.", - id); + String message = "Couldn't find table with id " + id + ". If a table was just created, this is likely due to the schema " + + "not being fully propagated. Please wait for schema agreement on table creation."; throw new UnknownTableException(message, id); } diff --git a/src/java/org/apache/cassandra/schema/SystemDistributedKeyspace.java b/src/java/org/apache/cassandra/schema/SystemDistributedKeyspace.java index 08640007df1b..d50621a3a15c 100644 --- a/src/java/org/apache/cassandra/schema/SystemDistributedKeyspace.java +++ b/src/java/org/apache/cassandra/schema/SystemDistributedKeyspace.java @@ -32,11 +32,10 @@ import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Joiner; +import com.google.common.collect.ImmutableMap; import com.google.common.collect.ImmutableSet; import com.google.common.collect.Lists; -import com.google.common.collect.ImmutableMap; import com.google.common.collect.Sets; - import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -83,10 +82,11 @@ private SystemDistributedKeyspace() * gen 4: compression chunk length reduced to 16KiB, memtable_flush_period_in_ms now unset on all tables in 4.0 * gen 5: add ttl and TWCS to repair_history tables * gen 6: add denylist table + * gen 7: add auto_repair_history and auto_repair_priority tables for AutoRepair feature * * // TODO: TCM - how do we evolve these tables? */ - public static final long GENERATION = 6; + public static final long GENERATION = 7; public static final String REPAIR_HISTORY = "repair_history"; @@ -96,7 +96,11 @@ private SystemDistributedKeyspace() public static final String PARTITION_DENYLIST_TABLE = "partition_denylist"; - public static final Set TABLE_NAMES = ImmutableSet.of(REPAIR_HISTORY, PARENT_REPAIR_HISTORY, VIEW_BUILD_STATUS, PARTITION_DENYLIST_TABLE); + public static final String AUTO_REPAIR_HISTORY = "auto_repair_history"; + + public static final String AUTO_REPAIR_PRIORITY = "auto_repair_priority"; + + public static final Set TABLE_NAMES = ImmutableSet.of(REPAIR_HISTORY, PARENT_REPAIR_HISTORY, VIEW_BUILD_STATUS, PARTITION_DENYLIST_TABLE, AUTO_REPAIR_HISTORY, AUTO_REPAIR_PRIORITY); public static final String REPAIR_HISTORY_CQL = "CREATE TABLE IF NOT EXISTS %s (" + "keyspace_name text," @@ -159,6 +163,28 @@ private SystemDistributedKeyspace() private static final TableMetadata PartitionDenylistTable = parse(PARTITION_DENYLIST_TABLE, "Partition keys which have been denied access", PARTITION_DENYLIST_CQL).build(); + public static final String AUTO_REPAIR_HISTORY_CQL = "CREATE TABLE IF NOT EXISTS %s (" + + "host_id uuid," + + "repair_type text," + + "repair_turn text," + + "repair_start_ts timestamp," + + "repair_finish_ts timestamp," + + "delete_hosts set," + + "delete_hosts_update_time timestamp," + + "force_repair boolean," + + "PRIMARY KEY (repair_type, host_id))"; + + private static final TableMetadata AutoRepairHistoryTable = + parse(AUTO_REPAIR_HISTORY, "Auto repair history for each node", AUTO_REPAIR_HISTORY_CQL).build(); + + public static final String AUTO_REPAIR_PRIORITY_CQL = "CREATE TABLE IF NOT EXISTS %s (" + + "repair_type text," + + "repair_priority set," + + "PRIMARY KEY (repair_type))"; + + private static final TableMetadata AutoRepairPriorityTable = + parse(AUTO_REPAIR_PRIORITY, "Auto repair priority for each group", AUTO_REPAIR_PRIORITY_CQL).build(); + private static TableMetadata.Builder parse(String table, String description, String cql) { return CreateTableStatement.parse(format(cql, table), SchemaConstants.DISTRIBUTED_KEYSPACE_NAME) @@ -171,7 +197,7 @@ public static KeyspaceMetadata metadata() { return KeyspaceMetadata.create(SchemaConstants.DISTRIBUTED_KEYSPACE_NAME, KeyspaceParams.simple(Math.max(DEFAULT_RF, DatabaseDescriptor.getDefaultKeyspaceRF())), - Tables.of(RepairHistory, ParentRepairHistory, ViewBuildStatus, PartitionDenylistTable)); + Tables.of(RepairHistory, ParentRepairHistory, ViewBuildStatus, PartitionDenylistTable, AutoRepairHistoryTable, AutoRepairPriorityTable)); } public static void startParentRepair(TimeUUID parent_id, String keyspaceName, String[] cfnames, RepairOption options) diff --git a/src/java/org/apache/cassandra/schema/TableId.java b/src/java/org/apache/cassandra/schema/TableId.java index d6b7b141a94a..4c69326bc7f1 100644 --- a/src/java/org/apache/cassandra/schema/TableId.java +++ b/src/java/org/apache/cassandra/schema/TableId.java @@ -22,14 +22,23 @@ import java.io.IOException; import java.nio.ByteBuffer; import java.util.UUID; +import java.util.function.LongUnaryOperator; import javax.annotation.Nullable; - import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.commons.lang3.ArrayUtils; +import org.apache.cassandra.db.TypeSizes; +import org.apache.cassandra.db.marshal.ValueAccessor; +import org.apache.cassandra.io.IVersionedSerializer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.tcm.serialization.MetadataSerializer; +import org.apache.cassandra.tcm.serialization.Version; import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.ObjectSizes; import org.apache.cassandra.utils.Pair; +import org.apache.cassandra.utils.UUIDGen; import static java.nio.charset.StandardCharsets.UTF_8; import static org.apache.cassandra.utils.TimeUUID.Generator.nextTimeUUID; @@ -40,15 +49,23 @@ * This is essentially a UUID, but we wrap it as it's used quite a bit in the code and having a nicely named class make * the code more readable. */ -public class TableId implements Comparable +public final class TableId implements Comparable { public static final long MAGIC = 1956074401491665062L; - // TODO: should this be a TimeUUID? - private final UUID id; + public static final long EMPTY_SIZE = ObjectSizes.measureDeep(new UUID(0, 0)); + private static final int MAGIC_BYTE = (int) ((flipSign(MAGIC) >>> 56) & 0xf0); + + final long msb, lsb; private TableId(UUID id) { - this.id = id; + this(id.getMostSignificantBits(), id.getLeastSignificantBits()); + } + + private TableId(long msb, long lsb) + { + this.msb = msb; + this.lsb = lsb; } public static TableId fromUUID(UUID id) @@ -56,6 +73,11 @@ public static TableId fromUUID(UUID id) return new TableId(id); } + public static TableId fromRaw(long msb, long lsb) + { + return new TableId(msb, lsb); + } + // TODO: should we be using UUID.randomUUID()? public static TableId generate() { @@ -131,51 +153,273 @@ public static TableId unsafeDeterministic(String keyspace, String table) public String toHexString() { - return ByteBufferUtil.bytesToHex(ByteBufferUtil.bytes(id)); + return ByteBufferUtil.bytesToHex(ByteBuffer.wrap(UUIDGen.decompose(msb, lsb))); } public UUID asUUID() { - return id; + return new UUID(msb, lsb); } @Override - public final int hashCode() + public int hashCode() { - return id.hashCode(); + long hilo = msb ^ lsb; + return ((int)(hilo >> 32)) ^ (int) hilo; } @Override public final boolean equals(Object o) { - return this == o || (o instanceof TableId && this.id.equals(((TableId) o).id)); + if (o == this | o == null) return o == this; + if (o.getClass() != TableId.class) return false; + TableId that = (TableId) o; + return this.msb == that.msb && this.lsb == that.lsb; } @Override public String toString() { - return id.toString(); + return asUUID().toString(); } public void serialize(DataOutput out) throws IOException { - out.writeLong(id.getMostSignificantBits()); - out.writeLong(id.getLeastSignificantBits()); + out.writeLong(msb); + out.writeLong(lsb); + } + + public int serialize(V dst, ValueAccessor accessor, int offset) + { + int position = offset; + position += accessor.putLong(dst, position, msb); + position += accessor.putLong(dst, position, lsb); + return position - offset; + } + + public final long msb() + { + return msb; + } + + public final long lsb() + { + return lsb; } - public int serializedSize() + public final int serializedSize() { return 16; } + public void serializeCompact(DataOutputPlus out) throws IOException + { + serializeCompact(out, Long.compare(msb, MAGIC), msb, lsb); + } + + public void serializeCompactComparable(DataOutputPlus out) throws IOException + { + serializeCompact(out, Long.compare(msb, MAGIC), flipSign(msb), flipSign(lsb)); + } + + private static void serializeCompact(DataOutputPlus out, int compareMagic, long msb, long lsb) throws IOException + { + // make this an ordered compact serialization at the cost of one byte + // TODO (desired): we could use 6 bits of the byte for encoding the vint header and avoid any extra bytes in most cases + if (compareMagic == 0) + { + int bytes = numberOfBytes(lsb); + out.writeByte(MAGIC_BYTE | bytes); + out.writeLeastSignificantBytes(lsb, bytes); + } + else + { + out.writeByte(MAGIC_BYTE + (compareMagic > 0 ? 0x10 : -0x10)); + out.writeLong(msb); + out.writeLong(lsb); + } + } + + public int serializeCompact(V dst, ValueAccessor accessor, int offset) + { + return serializeCompact(dst, accessor, offset, Long.compare(msb, MAGIC), msb, lsb); + } + + public int serializeCompactComparable(V dst, ValueAccessor accessor, int offset) + { + return serializeCompact(dst, accessor, offset, Long.compare(msb, MAGIC), flipSign(msb), flipSign(lsb)); + } + + private static int serializeCompact(V dst, ValueAccessor accessor, int offset, int compareMagic, long msb, long lsb) + { + if (compareMagic == 0) + { + int bytes = numberOfBytes(lsb); + accessor.putByte(dst, offset, (byte) (MAGIC_BYTE | bytes)); + accessor.putLeastSignificantBytes(dst, offset + 1, lsb, bytes); + return 1 + bytes; + } + else + { + int position = offset; + position += accessor.putByte(dst, position, (byte) (MAGIC_BYTE + (compareMagic > 0 ? 0x10 : -0x10))); + position += accessor.putLong(dst, position, msb); + position += accessor.putLong(dst, position, lsb); + return position - offset; + } + } + + private static int numberOfBytes(long lsb) + { + return (64 + 7 - Long.numberOfLeadingZeros(lsb)) / 8; + } + + public final int serializedCompactSize() + { + // make this an ordered compact serialization at the cost of one byte + return msb == MAGIC ? 1 + numberOfBytes(lsb) : 17; + } + + public final int serializedCompactComparableSize() + { + // make this an ordered compact serialization at the cost of one byte + return msb == MAGIC ? 1 + numberOfBytes(flipSign(lsb)) : 17; + } + + public static int staticSerializedSize() + { + return 16; + } + + public static void skip(DataInputPlus in) throws IOException + { + in.skipBytesFully(16); + } + + public static void skipCompact(DataInputPlus in) throws IOException + { + int b = in.readByte(); + if ((b & 0xf0) != MAGIC_BYTE) + in.skipBytesFully(16); + else + in.skipBytesFully(b & 0xf); + } + public static TableId deserialize(DataInput in) throws IOException { - return new TableId(new UUID(in.readLong(), in.readLong())); + return new TableId(in.readLong(), in.readLong()); + } + + private static TableId deserialize(DataInput in, LongUnaryOperator transform) throws IOException + { + return new TableId(transform.applyAsLong(in.readLong()), transform.applyAsLong(in.readLong())); + } + + private static long flipSign(long bits) + { + return bits ^ Long.MIN_VALUE; + } + + private static long keepSign(long bits) + { + return bits; + } + + public static TableId deserialize(V src, ValueAccessor accessor, int offset) + { + return deserialize(src, accessor, offset, TableId::keepSign); + } + + public static TableId deserializeComparable(V src, ValueAccessor accessor, int offset) + { + return deserialize(src, accessor, offset, TableId::flipSign); + } + + private static TableId deserialize(V src, ValueAccessor accessor, int offset, LongUnaryOperator transform) + { + return new TableId(transform.applyAsLong(accessor.getLong(src, offset)), transform.applyAsLong(accessor.getLong(src, offset + TypeSizes.LONG_SIZE))); + } + + public static TableId deserializeCompact(DataInputPlus in) throws IOException + { + return deserializeCompact(in, TableId::keepSign); + } + + public static TableId deserializeCompactComparable(DataInputPlus in) throws IOException + { + return deserializeCompact(in, TableId::flipSign); + } + + private static TableId deserializeCompact(DataInputPlus in, LongUnaryOperator transform) throws IOException + { + int b = in.readByte(); + if ((b & 0xf0) != MAGIC_BYTE) return deserialize(in, transform); + else return new TableId(MAGIC, transform.applyAsLong(in.readLeastSignificantBytes(b & 0xf))); + } + + public static TableId deserializeCompact(V src, ValueAccessor accessor, int offset) + { + return deserializeCompact(src, accessor, offset, TableId::keepSign); + } + + public static TableId deserializeCompactComparable(V src, ValueAccessor accessor, int offset) + { + return deserializeCompact(src, accessor, offset, TableId::flipSign); + } + + private static TableId deserializeCompact(V src, ValueAccessor accessor, int offset, LongUnaryOperator transform) + { + int b = accessor.getByte(src, offset++); + if ((b & 0xf0) != MAGIC_BYTE) return deserialize(src, accessor, offset, transform); + else return new TableId(MAGIC, transform.applyAsLong(accessor.getLeastSignificantBytes(src, offset, b & 0x0f))); } @Override - public int compareTo(TableId o) + public int compareTo(TableId that) { - return id.compareTo(o.id); + int c = Long.compare(this.msb, that.msb); + return c != 0 ? c : Long.compare(this.lsb, that.lsb); } + + public static final IVersionedSerializer serializer = new IVersionedSerializer<>() + { + @Override + public void serialize(TableId t, DataOutputPlus out, int version) throws IOException + { + t.serialize(out); + } + + @Override + public TableId deserialize(DataInputPlus in, int version) throws IOException + { + return TableId.deserialize(in); + } + + @Override + public long serializedSize(TableId t, int version) + { + return t.serializedSize(); + } + }; + + public static final MetadataSerializer metadataSerializer = new MetadataSerializer() + { + @Override + public void serialize(TableId t, DataOutputPlus out, Version version) throws IOException + { + t.serialize(out); + } + + @Override + public TableId deserialize(DataInputPlus in, Version version) throws IOException + { + return TableId.deserialize(in); + } + + @Override + public long serializedSize(TableId t, Version version) + { + return t.serializedSize(); + } + }; } diff --git a/src/java/org/apache/cassandra/schema/TableMetadata.java b/src/java/org/apache/cassandra/schema/TableMetadata.java index 9fdf5e821542..c2909dcb612f 100644 --- a/src/java/org/apache/cassandra/schema/TableMetadata.java +++ b/src/java/org/apache/cassandra/schema/TableMetadata.java @@ -35,6 +35,7 @@ import javax.annotation.Nonnull; import javax.annotation.Nullable; +import com.google.common.annotations.VisibleForTesting; import com.google.common.base.MoreObjects; import com.google.common.collect.ImmutableCollection; import com.google.common.collect.ImmutableList; @@ -42,9 +43,8 @@ import com.google.common.collect.ImmutableSet; import com.google.common.collect.Iterables; import com.google.common.collect.Sets; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +import accord.utils.Invariants; import org.apache.cassandra.auth.DataResource; import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.cql3.constraints.ColumnConstraint; @@ -72,21 +72,26 @@ import org.apache.cassandra.exceptions.InvalidRequestException; import org.apache.cassandra.io.util.DataInputPlus; import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.service.accord.fastpath.FastPathStrategy; +import org.apache.cassandra.service.consensus.TransactionalMode; +import org.apache.cassandra.service.consensus.migration.TransactionalMigrationFromMode; +import org.apache.cassandra.service.reads.SpeculativeRetryPolicy; import org.apache.cassandra.tcm.Epoch; import org.apache.cassandra.tcm.serialization.UDTAndFunctionsAwareMetadataSerializer; import org.apache.cassandra.tcm.serialization.Version; -import org.apache.cassandra.service.reads.SpeculativeRetryPolicy; import org.apache.cassandra.utils.AbstractIterator; import org.apache.cassandra.utils.ByteBufferUtil; import org.apache.cassandra.utils.FBUtilities; import org.github.jamm.Unmetered; +import static accord.utils.Invariants.require; import static com.google.common.collect.Iterables.any; import static com.google.common.collect.Iterables.transform; import static java.lang.String.format; import static java.util.stream.Collectors.toList; import static java.util.stream.Collectors.toSet; import static org.apache.cassandra.db.TypeSizes.sizeof; +import static org.apache.cassandra.schema.ColumnMetadata.NO_UNIQUE_ID; import static org.apache.cassandra.schema.IndexMetadata.isNameValid; @Unmetered @@ -94,7 +99,7 @@ public class TableMetadata implements SchemaElement { public static final Serializer serializer = new Serializer(); - private static final Logger logger = LoggerFactory.getLogger(TableMetadata.class); + public static final String UNDEFINED_COLUMN_NAME_MESSAGE = "Undefined column name %s in table %s"; // Please note that currently the only one truly useful flag is COUNTER, as the rest of the flags were about // differencing between CQL tables and the various types of COMPACT STORAGE tables (pre-4.0). As those "compact" @@ -189,6 +194,8 @@ public enum Kind protected final ImmutableList partitionKeyColumns; protected final ImmutableList clusteringColumns; protected final RegularAndStaticColumns regularAndStaticColumns; + protected final RegularAndStaticColumns regularAndStaticAndDroppedColumns; + private final ColumnMetadata[] columnsById; public final Indexes indexes; public final Triggers triggers; @@ -204,7 +211,7 @@ public enum Kind // We cache the columns with constraints to avoid iterations over columns // Partition keys columns are evaluated separately, so we keep the two of them in // two different variables. - public final List partitionKeyConstraints; + public final List> partitionKeyConstraints; public final List columnsWithConstraints; public final List notNullColumns; @@ -227,6 +234,18 @@ protected TableMetadata(Builder builder) Collections.sort(builder.clusteringColumns); clusteringColumns = ImmutableList.copyOf(builder.clusteringColumns); regularAndStaticColumns = RegularAndStaticColumns.builder().addAll(builder.regularAndStaticColumns).build(); + regularAndStaticAndDroppedColumns = RegularAndStaticColumns.builder() + .addAll(builder.regularAndStaticColumns) + .addAll(droppedColumns.values().stream().map(c -> c.column).filter(c -> !regularAndStaticColumns.contains(c))::iterator) + .build(); + + columnsById = new ColumnMetadata[regularAndStaticAndDroppedColumns.size() + partitionKeyColumns.size() + clusteringColumns.size()]; + for (ColumnMetadata column : regularAndStaticAndDroppedColumns) + columnsById[column.uniqueId] = column; + for (ColumnMetadata column : partitionKeyColumns) + columnsById[column.uniqueId] = column; + for (ColumnMetadata column : clusteringColumns) + columnsById[column.uniqueId] = column; columns = ImmutableMap.copyOf(builder.columns); indexes = builder.indexes; @@ -248,7 +267,7 @@ else if (isIndex()) else ref = TableMetadataRef.withInitialReference(new TableMetadataRef(Schema.instance, keyspace, name, id), this); - List pkConstraints = new ArrayList<>(this.partitionKeyColumns.size()); + List> pkConstraints = new ArrayList<>(this.partitionKeyColumns.size()); for (ColumnMetadata column : this.partitionKeyColumns) { if (column.hasConstraint()) @@ -322,6 +341,11 @@ public TableMetadata withSwapped(Indexes indexes) return unbuild().indexes(indexes).build(); } + public TableId id() + { + return id; + } + public boolean isView() { return kind == Kind.VIEW; @@ -346,7 +370,7 @@ public boolean isCompactTable() { return false; } - + public boolean isIncrementalBackupsEnabled() { return params.incrementalBackups; @@ -357,6 +381,27 @@ public boolean isStaticCompactTable() return false; } + public boolean isAccordEnabled() + { + return params.transactionalMode.accordIsEnabled; + } + + public boolean migratingFromAccord() + { + return params.transactionalMigrationFrom.migratingFromAccord(); + } + + public boolean requiresAccordSupport() + { + return isAccordEnabled() || migratingFromAccord(); + } + + public boolean supportsPaxosOperations() + { + return params.transactionalMode == TransactionalMode.off + || params.transactionalMigrationFrom.from == TransactionalMode.off; + } + public ImmutableCollection columns() { return columns.values(); @@ -395,6 +440,11 @@ public RegularAndStaticColumns regularAndStaticColumns() return regularAndStaticColumns; } + public RegularAndStaticColumns regularAndStaticAndDroppedColumns() + { + return regularAndStaticAndDroppedColumns; + } + public Columns regularColumns() { return regularAndStaticColumns.regulars; @@ -472,7 +522,7 @@ public ColumnMetadata getExistingColumn(ColumnIdentifier name) { ColumnMetadata def = getColumn(name); if (def == null) - throw new InvalidRequestException(format("Undefined column name %s in table %s", name.toCQLString(), this)); + throw new InvalidRequestException(format(UNDEFINED_COLUMN_NAME_MESSAGE, name.toCQLString(), this)); return def; } /* @@ -486,6 +536,11 @@ public ColumnMetadata getColumn(ByteBuffer name) return columns.get(name); } + public ColumnMetadata getColumnById(int uniqueId) + { + return columnsById[uniqueId]; + } + public ColumnMetadata getDroppedColumn(ByteBuffer name) { DroppedColumn dropped = droppedColumns.get(name); @@ -506,7 +561,7 @@ public ColumnMetadata getDroppedColumn(ByteBuffer name, boolean isStatic) return null; if (isStatic && !dropped.column.isStatic()) - return ColumnMetadata.staticColumn(this, name, dropped.column.type); + return ColumnMetadata.staticColumn(this, name, dropped.column.type, dropped.column.uniqueId); return dropped.column; } @@ -590,6 +645,8 @@ public void validate() throw new InvalidRequestException(e.getMessage(), e); } } + + require((params.transactionalMode == TransactionalMode.off && params.transactionalMigrationFrom == TransactionalMigrationFromMode.none) || !isCounter(), "Counters are not supported with Accord for table " + this); } /** @@ -714,6 +771,19 @@ public static TableMetadata minimal(String keyspace, String name) .build(); } + /** + * There is a couple of places in the code where we need a TableMetadata object and don't have one readily available + * and know that only the keyspace and name matter. This creates such "fake" metadata. Use only if you know what + * you're doing. + */ + @VisibleForTesting + public static TableMetadata minimal(String keyspace, String name, TableId tableId) + { + return TableMetadata.builder(keyspace, name, tableId) + .addPartitionKeyColumn("key", BytesType.instance) + .build(); + } + public TableMetadata updateIndexTableMetadata(TableParams baseTableParams) { TableParams.Builder builder = baseTableParams.unbuild().gcGraceSeconds(0); @@ -862,6 +932,8 @@ public static final class Builder private final List partitionKeyColumns = new ArrayList<>(); private final List clusteringColumns = new ArrayList<>(); private final List regularAndStaticColumns = new ArrayList<>(); + private int maxAssignedUniqueId = NO_UNIQUE_ID; + private boolean assignUniqueIds = false; private Builder(String keyspace, String name, TableId id) { @@ -893,12 +965,62 @@ public TableMetadata build() id = TableId.generate(); } + if (assignUniqueIds) + { + int nextId = Math.max(0, maxAssignedUniqueId + 1); + for (int i = 0 ; i < partitionKeyColumns.size() ; ++i) + { + ColumnMetadata prev = partitionKeyColumns.get(i); + int expectedId = prev.position(); + Invariants.require(prev.uniqueId == expectedId || (prev.uniqueId == NO_UNIQUE_ID && nextId == expectedId)); + if (prev.uniqueId == NO_UNIQUE_ID) + partitionKeyColumns.set(i, setUniqueId(prev, nextId++)); + } + for (int i = 0 ; i < clusteringColumns.size() ; ++i) + { + ColumnMetadata prev = clusteringColumns.get(i); + int expectedId = partitionKeyColumns.size() + prev.position(); + Invariants.require(prev.uniqueId == expectedId || (prev.uniqueId == NO_UNIQUE_ID && nextId == expectedId)); + if (prev.uniqueId == NO_UNIQUE_ID) + clusteringColumns.set(i, setUniqueId(prev, nextId++)); + } + for (Map.Entry e : droppedColumns.entrySet()) + { + Invariants.require(e.getValue().column.uniqueId != NO_UNIQUE_ID || maxAssignedUniqueId == NO_UNIQUE_ID); + if (e.getValue().column.uniqueId == NO_UNIQUE_ID) + e.setValue(new DroppedColumn(withUniqueId(e.getValue().column, nextId++), e.getValue().droppedTime)); + } + for (int i = 0 ; i < regularAndStaticColumns.size() ; ++i) + { + ColumnMetadata prev = regularAndStaticColumns.get(i); + if (prev.uniqueId == NO_UNIQUE_ID) + { + DroppedColumn restoring = droppedColumns.get(prev.name.bytes); + int uniqueId = restoring != null ? restoring.column.uniqueId : nextId++; + regularAndStaticColumns.set(i, setUniqueId(prev, uniqueId)); + } + } + } + if (Flag.isCQLTable(flags)) return new TableMetadata(this); else return new CompactTableMetadata(this); } + ColumnMetadata setUniqueId(ColumnMetadata prev, int uniqueId) + { + ColumnMetadata next = withUniqueId(prev, uniqueId); + ColumnMetadata replaced = columns.put(next.name.bytes, next); + Invariants.require(prev == replaced); + return next; + } + + static ColumnMetadata withUniqueId(ColumnMetadata prev, int uniqueId) + { + return new ColumnMetadata(prev.ksName, prev.cfName, prev.name, prev.type, uniqueId, prev.position(), prev.kind, prev.getMask(), prev.getColumnConstraints()); + } + public Builder id(TableId val) { id = val; @@ -970,6 +1092,12 @@ public Builder compression(CompressionParams val) return this; } + public Builder fastPath(FastPathStrategy val) + { + params.fastPath(val); + return this; + } + public Builder defaultTimeToLive(int val) { params.defaultTimeToLive(val); @@ -1082,7 +1210,7 @@ public Builder addPartitionKeyColumn(ColumnIdentifier name, AbstractType type public Builder addPartitionKeyColumn(ColumnIdentifier name, AbstractType type, @Nullable ColumnMask mask, @Nonnull ColumnConstraints cqlConstraints) { - return addColumn(new ColumnMetadata(keyspace, this.name, name, type, partitionKeyColumns.size(), ColumnMetadata.Kind.PARTITION_KEY, mask, cqlConstraints)); + return addColumn(new ColumnMetadata(keyspace, this.name, name, type, NO_UNIQUE_ID, partitionKeyColumns.size(), ColumnMetadata.Kind.PARTITION_KEY, mask, cqlConstraints)); } public Builder addClusteringColumn(String name, AbstractType type) @@ -1107,7 +1235,7 @@ public Builder addClusteringColumn(ColumnIdentifier name, AbstractType type, public Builder addClusteringColumn(ColumnIdentifier name, AbstractType type, @Nullable ColumnMask mask, @Nonnull ColumnConstraints cqlConstraints) { - return addColumn(new ColumnMetadata(keyspace, this.name, name, type, clusteringColumns.size(), ColumnMetadata.Kind.CLUSTERING, mask, cqlConstraints)); + return addColumn(new ColumnMetadata(keyspace, this.name, name, type, NO_UNIQUE_ID, clusteringColumns.size(), ColumnMetadata.Kind.CLUSTERING, mask, cqlConstraints)); } public Builder addRegularColumn(String name, AbstractType type) @@ -1132,7 +1260,7 @@ public Builder addRegularColumn(ColumnIdentifier name, AbstractType type, @Nu public Builder addRegularColumn(ColumnIdentifier name, AbstractType type, @Nullable ColumnMask mask, @Nonnull ColumnConstraints cqlConstraints) { - return addColumn(new ColumnMetadata(keyspace, this.name, name, type, ColumnMetadata.NO_POSITION, ColumnMetadata.Kind.REGULAR, mask, cqlConstraints)); + return addColumn(new ColumnMetadata(keyspace, this.name, name, type, NO_UNIQUE_ID, ColumnMetadata.NO_POSITION, ColumnMetadata.Kind.REGULAR, mask, cqlConstraints)); } public Builder addStaticColumn(String name, AbstractType type) @@ -1157,7 +1285,7 @@ public Builder addStaticColumn(ColumnIdentifier name, AbstractType type, @Nul public Builder addStaticColumn(ColumnIdentifier name, AbstractType type, @Nullable ColumnMask mask, @Nonnull ColumnConstraints cqlConstraints) { - return addColumn(new ColumnMetadata(keyspace, this.name, name, type, ColumnMetadata.NO_POSITION, ColumnMetadata.Kind.STATIC, mask, cqlConstraints)); + return addColumn(new ColumnMetadata(keyspace, this.name, name, type, NO_UNIQUE_ID, ColumnMetadata.NO_POSITION, ColumnMetadata.Kind.STATIC, mask, cqlConstraints)); } public Builder addColumn(ColumnMetadata column) @@ -1181,6 +1309,8 @@ public Builder addColumn(ColumnMetadata column) } columns.put(column.name.bytes, column); + assignUniqueIds |= column.uniqueId == NO_UNIQUE_ID; + maxAssignedUniqueId = Math.max(maxAssignedUniqueId, column.uniqueId); return this; } @@ -1195,6 +1325,11 @@ public Builder droppedColumns(Map droppedColumns) { this.droppedColumns.clear(); this.droppedColumns.putAll(droppedColumns); + for (DroppedColumn column : droppedColumns.values()) + { + assignUniqueIds |= column.column.uniqueId == NO_UNIQUE_ID; + maxAssignedUniqueId = Math.max(maxAssignedUniqueId, column.column.uniqueId); + } return this; } @@ -1205,7 +1340,7 @@ public Builder recordDeprecatedSystemColumn(String name, AbstractType type) { // As we play fast and loose with the removal timestamp, make sure this is misued for a non system table. assert SchemaConstants.isLocalSystemKeyspace(keyspace); - recordColumnDrop(ColumnMetadata.regularColumn(keyspace, this.name, name, type), Long.MAX_VALUE); + recordColumnDrop(ColumnMetadata.regularColumn(keyspace, this.name, name, type, NO_UNIQUE_ID), Long.MAX_VALUE); return this; } @@ -1771,7 +1906,7 @@ else if (isStaticCompactTable) for (ColumnMetadata c : regularAndStaticColumns) { if (c.isStatic()) - columns.add(new ColumnMetadata(c.ksName, c.cfName, c.name, c.type, -1, ColumnMetadata.Kind.REGULAR, c.getMask(), c.getColumnConstraints())); + columns.add(new ColumnMetadata(c.ksName, c.cfName, c.name, c.type, c.uniqueId, -1, ColumnMetadata.Kind.REGULAR, c.getMask(), c.getColumnConstraints())); } otherColumns = columns.iterator(); } diff --git a/src/java/org/apache/cassandra/schema/TableParams.java b/src/java/org/apache/cassandra/schema/TableParams.java index 6903179525b7..e44af5ca5613 100644 --- a/src/java/org/apache/cassandra/schema/TableParams.java +++ b/src/java/org/apache/cassandra/schema/TableParams.java @@ -27,23 +27,49 @@ import com.google.common.collect.ImmutableMap; import com.google.common.collect.Maps; +import org.apache.cassandra.config.CassandraRelevantProperties; +import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.cql3.Attributes; import org.apache.cassandra.cql3.CqlBuilder; import org.apache.cassandra.exceptions.ConfigurationException; import org.apache.cassandra.io.util.DataInputPlus; import org.apache.cassandra.io.util.DataOutputPlus; -import org.apache.cassandra.tcm.serialization.MetadataSerializer; -import org.apache.cassandra.tcm.serialization.Version; +import org.apache.cassandra.service.accord.fastpath.FastPathStrategy; +import org.apache.cassandra.service.consensus.TransactionalMode; +import org.apache.cassandra.service.consensus.migration.TransactionalMigrationFromMode; import org.apache.cassandra.service.reads.PercentileSpeculativeRetryPolicy; import org.apache.cassandra.service.reads.SpeculativeRetryPolicy; import org.apache.cassandra.service.reads.repair.ReadRepairStrategy; +import org.apache.cassandra.tcm.serialization.MetadataSerializer; +import org.apache.cassandra.tcm.serialization.Version; import org.apache.cassandra.utils.BloomCalculations; import org.apache.cassandra.utils.ByteBufferUtil; import static java.lang.String.format; import static java.util.stream.Collectors.toMap; -import static org.apache.cassandra.schema.TableParams.Option.*; +import static com.google.common.base.Preconditions.checkNotNull; import static org.apache.cassandra.db.TypeSizes.sizeof; +import static org.apache.cassandra.schema.TableParams.Option.ADDITIONAL_WRITE_POLICY; +import static org.apache.cassandra.schema.TableParams.Option.ALLOW_AUTO_SNAPSHOT; +import static org.apache.cassandra.schema.TableParams.Option.BLOOM_FILTER_FP_CHANCE; +import static org.apache.cassandra.schema.TableParams.Option.CACHING; +import static org.apache.cassandra.schema.TableParams.Option.CDC; +import static org.apache.cassandra.schema.TableParams.Option.COMMENT; +import static org.apache.cassandra.schema.TableParams.Option.COMPACTION; +import static org.apache.cassandra.schema.TableParams.Option.COMPRESSION; +import static org.apache.cassandra.schema.TableParams.Option.CRC_CHECK_CHANCE; +import static org.apache.cassandra.schema.TableParams.Option.DEFAULT_TIME_TO_LIVE; +import static org.apache.cassandra.schema.TableParams.Option.EXTENSIONS; +import static org.apache.cassandra.schema.TableParams.Option.FAST_PATH; +import static org.apache.cassandra.schema.TableParams.Option.GC_GRACE_SECONDS; +import static org.apache.cassandra.schema.TableParams.Option.INCREMENTAL_BACKUPS; +import static org.apache.cassandra.schema.TableParams.Option.MAX_INDEX_INTERVAL; +import static org.apache.cassandra.schema.TableParams.Option.MEMTABLE; +import static org.apache.cassandra.schema.TableParams.Option.MEMTABLE_FLUSH_PERIOD_IN_MS; +import static org.apache.cassandra.schema.TableParams.Option.MIN_INDEX_INTERVAL; +import static org.apache.cassandra.schema.TableParams.Option.PENDING_DROP; +import static org.apache.cassandra.schema.TableParams.Option.READ_REPAIR; +import static org.apache.cassandra.schema.TableParams.Option.SPECULATIVE_RETRY; import static org.apache.cassandra.utils.LocalizeString.toLowerCaseLocalized; public final class TableParams @@ -69,7 +95,12 @@ public enum Option ADDITIONAL_WRITE_POLICY, CRC_CHECK_CHANCE, CDC, - READ_REPAIR; + READ_REPAIR, + FAST_PATH, + TRANSACTIONAL_MODE, + TRANSACTIONAL_MIGRATION_FROM, + PENDING_DROP, + AUTO_REPAIR; @Override public String toString() @@ -97,6 +128,12 @@ public String toString() public final ImmutableMap extensions; public final boolean cdc; public final ReadRepairStrategy readRepair; + public final FastPathStrategy fastPath; + public final TransactionalMode transactionalMode; + public final TransactionalMigrationFromMode transactionalMigrationFrom; + public final boolean pendingDrop; + + public final AutoRepairParams autoRepair; private TableParams(Builder builder) { @@ -121,6 +158,12 @@ private TableParams(Builder builder) extensions = builder.extensions; cdc = builder.cdc; readRepair = builder.readRepair; + fastPath = builder.fastPath; + transactionalMode = builder.transactionalMode != null ? builder.transactionalMode : TransactionalMode.off; + transactionalMigrationFrom = builder.transactionalMigrationFrom; + pendingDrop = builder.pendingDrop; + checkNotNull(transactionalMigrationFrom); + autoRepair = builder.autoRepair; } public static Builder builder() @@ -148,7 +191,12 @@ public static Builder builder(TableParams params) .additionalWritePolicy(params.additionalWritePolicy) .extensions(params.extensions) .cdc(params.cdc) - .readRepair(params.readRepair); + .readRepair(params.readRepair) + .fastPath(params.fastPath) + .transactionalMode(params.transactionalMode) + .transactionalMigrationFrom(params.transactionalMigrationFrom) + .pendingDrop(params.pendingDrop) + .automatedRepair(params.autoRepair); } public Builder unbuild() @@ -162,7 +210,7 @@ public void validate() compression.validate(); double minBloomFilterFpChanceValue = BloomCalculations.minSupportedBloomFilterFpChance(); - if (bloomFilterFpChance <= minBloomFilterFpChanceValue || bloomFilterFpChance > 1) + if (bloomFilterFpChance <= minBloomFilterFpChanceValue || bloomFilterFpChance > 1) { fail("%s must be larger than %s and less than or equal to 1.0 (got %s)", BLOOM_FILTER_FP_CHANCE, @@ -203,6 +251,11 @@ public void validate() if (cdc && memtable.factory().writesShouldSkipCommitLog()) fail("CDC cannot work if writes skip the commit log. Check your memtable configuration."); + + if (transactionalMode.isTestMode() && !CassandraRelevantProperties.ACCORD_ALLOW_TEST_MODES.getBoolean()) + fail("Transactional mode " + transactionalMode + " can't be used if " + CassandraRelevantProperties.ACCORD_ALLOW_TEST_MODES.getKey() + " is not set"); + + autoRepair.validate(); } private static void fail(String format, Object... args) @@ -239,7 +292,12 @@ public boolean equals(Object o) && memtable.equals(p.memtable) && extensions.equals(p.extensions) && cdc == p.cdc - && readRepair == p.readRepair; + && readRepair == p.readRepair + && fastPath.equals(fastPath) + && transactionalMode == p.transactionalMode + && transactionalMigrationFrom == p.transactionalMigrationFrom + && pendingDrop == p.pendingDrop + && autoRepair.equals(p.autoRepair); } @Override @@ -263,7 +321,12 @@ public int hashCode() memtable, extensions, cdc, - readRepair); + readRepair, + fastPath, + transactionalMode, + transactionalMigrationFrom, + pendingDrop, + autoRepair); } @Override @@ -275,6 +338,7 @@ public String toString() .add(ALLOW_AUTO_SNAPSHOT.toString(), allowAutoSnapshot) .add(BLOOM_FILTER_FP_CHANCE.toString(), bloomFilterFpChance) .add(CRC_CHECK_CHANCE.toString(), crcCheckChance) + .add(FAST_PATH.toString(), fastPath) .add(GC_GRACE_SECONDS.toString(), gcGraceSeconds) .add(DEFAULT_TIME_TO_LIVE.toString(), defaultTimeToLive) .add(INCREMENTAL_BACKUPS.toString(), incrementalBackups) @@ -289,6 +353,11 @@ public String toString() .add(EXTENSIONS.toString(), extensions) .add(CDC.toString(), cdc) .add(READ_REPAIR.toString(), readRepair) + .add(Option.FAST_PATH.toString(), fastPath) + .add(Option.TRANSACTIONAL_MODE.toString(), transactionalMode) + .add(Option.TRANSACTIONAL_MIGRATION_FROM.toString(), transactionalMigrationFrom) + .add(PENDING_DROP.toString(), pendingDrop) + .add(Option.AUTO_REPAIR.toString(), autoRepair) .toString(); } @@ -318,8 +387,8 @@ public void appendCqlTo(CqlBuilder builder, boolean isView) if (!isView) { - builder.append("AND default_time_to_live = ").append(defaultTimeToLive) - .newLine(); + builder.append("AND fast_path = ").append(fastPath.asCQL()).newLine(); + builder.append("AND default_time_to_live = ").append(defaultTimeToLive).newLine(); } builder.append("AND extensions = ").append(extensions.entrySet() @@ -339,8 +408,23 @@ public void appendCqlTo(CqlBuilder builder, boolean isView) .append("AND min_index_interval = ").append(minIndexInterval) .newLine() .append("AND read_repair = ").appendWithSingleQuotes(readRepair.toString()) - .newLine() - .append("AND speculative_retry = ").appendWithSingleQuotes(speculativeRetry.toString()); + .newLine(); + + if (!isView) + { + builder.append("AND transactional_mode = ").appendWithSingleQuotes(transactionalMode.toString()) + .newLine() + .append("AND transactional_migration_from = ").appendWithSingleQuotes(transactionalMigrationFrom.toString()) + .newLine(); + } + + builder.append("AND speculative_retry = ").appendWithSingleQuotes(speculativeRetry.toString()); + if (DatabaseDescriptor.getRawConfig() != null + && DatabaseDescriptor.getAutoRepairConfig().isAutoRepairSchedulingEnabled()) + { + builder.newLine() + .append("AND auto_repair = ").append(autoRepair.asMap()); + } } public static final class Builder @@ -364,7 +448,12 @@ public static final class Builder private ImmutableMap extensions = ImmutableMap.of(); private boolean cdc; private ReadRepairStrategy readRepair = ReadRepairStrategy.BLOCKING; + private FastPathStrategy fastPath = FastPathStrategy.inheritKeyspace(); + private TransactionalMode transactionalMode = TransactionalMode.off; + public TransactionalMigrationFromMode transactionalMigrationFrom = TransactionalMigrationFromMode.none; + public boolean pendingDrop = false; + private AutoRepairParams autoRepair = AutoRepairParams.DEFAULT; public Builder() { } @@ -482,11 +571,41 @@ public Builder readRepair(ReadRepairStrategy val) return this; } + public Builder fastPath(FastPathStrategy val) + { + fastPath = val; + return this; + } + + public Builder transactionalMode(TransactionalMode val) + { + transactionalMode = val; + return this; + } + + public Builder transactionalMigrationFrom(TransactionalMigrationFromMode val) + { + transactionalMigrationFrom = val; + return this; + } + public Builder extensions(Map val) { extensions = ImmutableMap.copyOf(val); return this; } + + public Builder pendingDrop(boolean pendingDrop) + { + this.pendingDrop = pendingDrop; + return this; + } + + public Builder automatedRepair(AutoRepairParams val) + { + autoRepair = val; + return this; + } } public static class Serializer implements MetadataSerializer @@ -516,6 +635,13 @@ public void serialize(TableParams t, DataOutputPlus out, Version version) throws out.writeBoolean(t.allowAutoSnapshot); out.writeBoolean(t.incrementalBackups); } + if (version.isAtLeast(Version.MIN_ACCORD_VERSION)) + { + FastPathStrategy.serializer.serialize(t.fastPath, out, version); + out.writeInt(t.transactionalMode.ordinal()); + out.writeInt(t.transactionalMigrationFrom.ordinal()); + out.writeBoolean(t.pendingDrop); + } } public TableParams deserialize(DataInputPlus in, Version version) throws IOException @@ -540,12 +666,19 @@ public TableParams deserialize(DataInputPlus in, Version version) throws IOExcep .readRepair(ReadRepairStrategy.fromString(in.readUTF())) .allowAutoSnapshot(!version.isAtLeast(Version.V4) || in.readBoolean()) .incrementalBackups(!version.isAtLeast(Version.V4) || in.readBoolean()); + if (version.isAtLeast(Version.MIN_ACCORD_VERSION)) + { + builder.fastPath(FastPathStrategy.serializer.deserialize(in, version)) + .transactionalMode(TransactionalMode.fromOrdinal(in.readInt())) + .transactionalMigrationFrom(TransactionalMigrationFromMode.fromOrdinal(in.readInt())) + .pendingDrop(in.readBoolean()); + } return builder.build(); } public long serializedSize(TableParams t, Version version) { - return sizeof(t.comment) + + long size = sizeof(t.comment) + sizeof(t.bloomFilterFpChance) + sizeof(t.crcCheckChance) + sizeof(t.gcGraceSeconds) + @@ -564,6 +697,14 @@ public long serializedSize(TableParams t, Version version) sizeof(t.readRepair.name()) + (version.isAtLeast(Version.V4) ? sizeof(t.allowAutoSnapshot) : 0) + (version.isAtLeast(Version.V4) ? sizeof(t.incrementalBackups) : 0); + if (version.isAtLeast(Version.MIN_ACCORD_VERSION)) + { + size += FastPathStrategy.serializer.serializedSize(t.fastPath, version) + + sizeof(t.transactionalMode.ordinal()) + + sizeof(t.transactionalMigrationFrom.ordinal()) + + sizeof(t.pendingDrop); + } + return size; } private void serializeMap(Map map, DataOutputPlus out) throws IOException diff --git a/src/java/org/apache/cassandra/security/AbstractSslContextFactory.java b/src/java/org/apache/cassandra/security/AbstractSslContextFactory.java index c2a7fdb97510..0fc8f94352d8 100644 --- a/src/java/org/apache/cassandra/security/AbstractSslContextFactory.java +++ b/src/java/org/apache/cassandra/security/AbstractSslContextFactory.java @@ -37,8 +37,8 @@ import io.netty.handler.ssl.SslProvider; import org.apache.cassandra.config.EncryptionOptions; -import static org.apache.cassandra.config.EncryptionOptions.ClientAuth.NOT_REQUIRED; -import static org.apache.cassandra.config.EncryptionOptions.ClientAuth.REQUIRED; +import static org.apache.cassandra.config.EncryptionOptions.ClientEncryptionOptions.ClientAuth.NOT_REQUIRED; +import static org.apache.cassandra.config.EncryptionOptions.ClientEncryptionOptions.ClientAuth.REQUIRED; import static org.apache.cassandra.config.CassandraRelevantProperties.DISABLE_TCACTIVE_OPENSSL; @@ -68,7 +68,7 @@ abstract public class AbstractSslContextFactory implements ISslContextFactory protected final List accepted_protocols; protected final String algorithm; protected final String store_type; - protected final EncryptionOptions.ClientAuth clientAuth; + protected final EncryptionOptions.ClientEncryptionOptions.ClientAuth clientAuth; protected final boolean require_endpoint_verification; /* ServerEncryptionOptions does not use the enabled flag at all instead using the existing @@ -105,7 +105,7 @@ protected AbstractSslContextFactory(Map parameters) accepted_protocols = getStringList("accepted_protocols"); algorithm = getString("algorithm"); store_type = getString("store_type", "JKS"); - clientAuth = parameters.get("require_client_auth") == null ? NOT_REQUIRED : EncryptionOptions.ClientAuth.from(getString("require_client_auth")); + clientAuth = parameters.get("require_client_auth") == null ? NOT_REQUIRED : EncryptionOptions.ClientEncryptionOptions.ClientAuth.from(getString("require_client_auth")); require_endpoint_verification = getBoolean("require_endpoint_verification", false); enabled = getBoolean("enabled"); optional = getBoolean("optional"); @@ -158,7 +158,7 @@ public SSLContext createJSSESslContext(boolean verifyPeerCertificate) throws SSL } @Override - public SSLContext createJSSESslContext(EncryptionOptions.ClientAuth clientAuth) throws SSLException + public SSLContext createJSSESslContext(EncryptionOptions.ClientEncryptionOptions.ClientAuth clientAuth) throws SSLException { TrustManager[] trustManagers = null; if (clientAuth != NOT_REQUIRED) @@ -186,7 +186,7 @@ public SslContext createNettySslContext(boolean verifyPeerCertificate, SocketTyp } @Override - public SslContext createNettySslContext(EncryptionOptions.ClientAuth clientAuth, SocketType socketType, + public SslContext createNettySslContext(EncryptionOptions.ClientEncryptionOptions.ClientAuth clientAuth, SocketType socketType, CipherSuiteFilter cipherFilter) throws SSLException { /* @@ -291,7 +291,7 @@ protected SslProvider getSslProvider() */ abstract protected KeyManagerFactory buildOutboundKeyManagerFactory() throws SSLException; - private ClientAuth toNettyClientAuth(EncryptionOptions.ClientAuth clientAuth) + private ClientAuth toNettyClientAuth(EncryptionOptions.ClientEncryptionOptions.ClientAuth clientAuth) { switch (clientAuth) { diff --git a/src/java/org/apache/cassandra/security/ISslContextFactory.java b/src/java/org/apache/cassandra/security/ISslContextFactory.java index 1db5f579b16d..2f8b53d16694 100644 --- a/src/java/org/apache/cassandra/security/ISslContextFactory.java +++ b/src/java/org/apache/cassandra/security/ISslContextFactory.java @@ -72,7 +72,7 @@ public interface ISslContextFactory * @return JSSE's {@link SSLContext} * @throws SSLException in case the Ssl Context creation fails for some reason */ - default SSLContext createJSSESslContext(EncryptionOptions.ClientAuth clientAuth) throws SSLException + default SSLContext createJSSESslContext(EncryptionOptions.ClientEncryptionOptions.ClientAuth clientAuth) throws SSLException { switch (clientAuth) { @@ -112,7 +112,7 @@ SslContext createNettySslContext(boolean verifyPeerCertificate, SocketType socke * @return Netty's {@link SslContext} * @throws SSLException in case the Ssl Context creation fails for some reason */ - default SslContext createNettySslContext(EncryptionOptions.ClientAuth clientAuth, SocketType socketType, + default SslContext createNettySslContext(EncryptionOptions.ClientEncryptionOptions.ClientAuth clientAuth, SocketType socketType, CipherSuiteFilter cipherFilter) throws SSLException { switch (clientAuth) diff --git a/src/java/org/apache/cassandra/security/SSLFactory.java b/src/java/org/apache/cassandra/security/SSLFactory.java index a9b4be9d5c39..65a6dfa24f1a 100644 --- a/src/java/org/apache/cassandra/security/SSLFactory.java +++ b/src/java/org/apache/cassandra/security/SSLFactory.java @@ -47,7 +47,7 @@ import static org.apache.cassandra.config.CassandraRelevantProperties.DISABLE_TCACTIVE_OPENSSL; -import static org.apache.cassandra.config.EncryptionOptions.ClientAuth.REQUIRED; +import static org.apache.cassandra.config.EncryptionOptions.ClientEncryptionOptions.ClientAuth.REQUIRED; import static org.apache.cassandra.utils.LocalizeString.toLowerCaseLocalized; /** @@ -125,7 +125,7 @@ public static List tlsInstanceProtocolSubstitution() /** * Create a JSSE {@link SSLContext}. */ - public static SSLContext createSSLContext(EncryptionOptions options, EncryptionOptions.ClientAuth clientAuth) throws IOException + public static SSLContext createSSLContext(EncryptionOptions options, EncryptionOptions.ClientEncryptionOptions.ClientAuth clientAuth) throws IOException { return options.sslContextFactoryInstance.createJSSESslContext(clientAuth); } @@ -133,7 +133,7 @@ public static SSLContext createSSLContext(EncryptionOptions options, EncryptionO /** * get a netty {@link SslContext} instance */ - public static SslContext getOrCreateSslContext(EncryptionOptions options, EncryptionOptions.ClientAuth clientAuth, + public static SslContext getOrCreateSslContext(EncryptionOptions options, EncryptionOptions.ClientEncryptionOptions.ClientAuth clientAuth, SocketType socketType, String contextDescription) throws IOException { @@ -157,7 +157,7 @@ public static SslContext getOrCreateSslContext(EncryptionOptions options, Encryp /** * Create a Netty {@link SslContext} */ - static SslContext createNettySslContext(EncryptionOptions options, EncryptionOptions.ClientAuth clientAuth, + static SslContext createNettySslContext(EncryptionOptions options, EncryptionOptions.ClientEncryptionOptions.ClientAuth clientAuth, SocketType socketType) throws IOException { return createNettySslContext(options, clientAuth, socketType, @@ -167,7 +167,7 @@ static SslContext createNettySslContext(EncryptionOptions options, EncryptionOpt /** * Create a Netty {@link SslContext} with a supplied cipherFilter */ - static SslContext createNettySslContext(EncryptionOptions options, EncryptionOptions.ClientAuth clientAuth, + static SslContext createNettySslContext(EncryptionOptions options, EncryptionOptions.ClientEncryptionOptions.ClientAuth clientAuth, SocketType socketType, CipherSuiteFilter cipherFilter) throws IOException { return options.sslContextFactoryInstance.createNettySslContext(clientAuth, socketType, @@ -356,7 +356,7 @@ private static boolean filterOutSSLv2Hello(String string) return !string.equals("SSLv2Hello"); } - public static void validateSslContext(String contextDescription, EncryptionOptions options, EncryptionOptions.ClientAuth clientAuth, boolean logProtocolAndCiphers) throws IOException + public static void validateSslContext(String contextDescription, EncryptionOptions options, EncryptionOptions.ClientEncryptionOptions.ClientAuth clientAuth, boolean logProtocolAndCiphers) throws IOException { if (options != null && options.tlsEncryptionPolicy() != EncryptionOptions.TlsEncryptionPolicy.UNENCRYPTED) { diff --git a/src/java/org/apache/cassandra/serializers/ListSerializer.java b/src/java/org/apache/cassandra/serializers/ListSerializer.java index 020abd2f4647..6bd688858a14 100644 --- a/src/java/org/apache/cassandra/serializers/ListSerializer.java +++ b/src/java/org/apache/cassandra/serializers/ListSerializer.java @@ -20,17 +20,20 @@ import java.nio.BufferUnderflowException; import java.nio.ByteBuffer; -import java.util.*; +import java.util.ArrayList; +import java.util.List; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentMap; import java.util.function.Predicate; +import com.google.common.base.Preconditions; import com.google.common.collect.Range; import org.apache.cassandra.db.TypeSizes; import org.apache.cassandra.db.marshal.AbstractType; import org.apache.cassandra.db.marshal.ByteBufferAccessor; import org.apache.cassandra.db.marshal.ValueAccessor; +import org.apache.cassandra.utils.ByteBufferUtil; public class ListSerializer extends CollectionSerializer> { @@ -223,10 +226,29 @@ public Class> getType() } @Override - public ByteBuffer getSerializedValue(ByteBuffer collection, ByteBuffer key, AbstractType comparator) + public ByteBuffer getSerializedValue(ByteBuffer collection, ByteBuffer index, AbstractType comparator) { - // We don't allow selecting an element of a list, so we don't need this. - throw new UnsupportedOperationException(); + try + { + int n = readCollectionSize(collection, ByteBufferAccessor.instance); + // Start the offset after the (size of) the collection size we just read + int offset = sizeOfCollectionSize(); + int idx = ByteBufferUtil.toInt(index); + + Preconditions.checkElementIndex(idx, n); + + for (int i = 0; i <= idx; i++) + { + if (i == idx) + return readValue(collection, ByteBufferAccessor.instance, offset); + offset += skipValue(collection, ByteBufferAccessor.instance, offset); + } + throw new AssertionError("Asked to read index " + idx + " but never read the index"); + } + catch (BufferUnderflowException | IndexOutOfBoundsException e) + { + throw new MarshalException("Not enough bytes to read a list"); + } } @Override diff --git a/src/java/org/apache/cassandra/serializers/SetSerializer.java b/src/java/org/apache/cassandra/serializers/SetSerializer.java index 72a652d4a8e0..eb49e5c3d13c 100644 --- a/src/java/org/apache/cassandra/serializers/SetSerializer.java +++ b/src/java/org/apache/cassandra/serializers/SetSerializer.java @@ -116,7 +116,7 @@ public Set deserialize(V input, ValueAccessor accessor) l.add(elements.deserialize(value, accessor)); } if (!accessor.isEmptyFromOffset(input, offset)) - throw new MarshalException("Unexpected extraneous bytes after set value"); + throw new MarshalException("Unexpected extraneous bytes after set value" + l + "," + accessor.toHex(input)); return l; } catch (BufferUnderflowException | IndexOutOfBoundsException e) diff --git a/src/java/org/apache/cassandra/service/AbstractWriteResponseHandler.java b/src/java/org/apache/cassandra/service/AbstractWriteResponseHandler.java index 343bac1c4f80..84dfe49b5702 100644 --- a/src/java/org/apache/cassandra/service/AbstractWriteResponseHandler.java +++ b/src/java/org/apache/cassandra/service/AbstractWriteResponseHandler.java @@ -25,7 +25,6 @@ import java.util.concurrent.atomic.AtomicIntegerFieldUpdater; import java.util.function.Function; import java.util.function.Supplier; -import java.util.stream.Collectors; import javax.annotation.Nullable; import org.slf4j.Logger; @@ -36,7 +35,10 @@ import org.apache.cassandra.db.IMutation; import org.apache.cassandra.db.Mutation; import org.apache.cassandra.db.WriteType; +import org.apache.cassandra.exceptions.CoordinatorBehindException; +import org.apache.cassandra.exceptions.RequestFailure; import org.apache.cassandra.exceptions.RequestFailureReason; +import org.apache.cassandra.exceptions.RetryOnDifferentSystemException; import org.apache.cassandra.exceptions.WriteFailureException; import org.apache.cassandra.exceptions.WriteTimeoutException; import org.apache.cassandra.locator.EndpointsForToken; @@ -50,6 +52,7 @@ import org.apache.cassandra.utils.concurrent.Condition; import org.apache.cassandra.utils.concurrent.UncheckedInterruptedException; +import static com.google.common.collect.ImmutableMap.toImmutableMap; import static java.lang.Long.MAX_VALUE; import static java.lang.Math.min; import static java.util.concurrent.TimeUnit.MICROSECONDS; @@ -58,6 +61,8 @@ import static org.apache.cassandra.config.DatabaseDescriptor.getCounterWriteRpcTimeout; import static org.apache.cassandra.config.DatabaseDescriptor.getWriteRpcTimeout; import static org.apache.cassandra.db.WriteType.COUNTER; +import static org.apache.cassandra.exceptions.RequestFailureReason.COORDINATOR_BEHIND; +import static org.apache.cassandra.exceptions.RequestFailureReason.RETRY_ON_DIFFERENT_TRANSACTION_SYSTEM; import static org.apache.cassandra.locator.Replicas.countInOurDc; import static org.apache.cassandra.schema.Schema.instance; import static org.apache.cassandra.service.StorageProxy.WritePerformer; @@ -78,6 +83,10 @@ public abstract class AbstractWriteResponseHandler implements RequestCallback private static final AtomicIntegerFieldUpdater failuresUpdater = AtomicIntegerFieldUpdater.newUpdater(AbstractWriteResponseHandler.class, "failures"); private volatile int failures = 0; + private static final AtomicIntegerFieldUpdater alreadyHintedForRetryOnDifferentSystemUpdater = + AtomicIntegerFieldUpdater.newUpdater(AbstractWriteResponseHandler.class, "alreadyHintedForRetryOnDifferentSystem"); + // Only write a hint to be applied as a transaction once + private volatile int alreadyHintedForRetryOnDifferentSystem = 0; private volatile Map failureReasonByEndpoint; private final Dispatcher.RequestTime requestTime; private @Nullable final Supplier hintOnFailure; @@ -110,7 +119,7 @@ protected AbstractWriteResponseHandler(ForWrite replicaPlan, Runnable callback, this.requestTime = requestTime; } - public void get() throws WriteTimeoutException, WriteFailureException + public void get() throws WriteTimeoutException, WriteFailureException, RetryOnDifferentSystemException { long timeoutNanos = currentTimeoutNanos(); @@ -127,14 +136,42 @@ public void get() throws WriteTimeoutException, WriteFailureException if (!signaled) throwTimeout(); - if (blockFor() + failures > candidateReplicaCount()) + int candidateReplicaCount = candidateReplicaCount(); + if (blockFor() + failures > candidateReplicaCount) { - if (RequestCallback.isTimeout(this.getFailureReasonByEndpointMap().keySet().stream() - .filter(this::waitingFor) // DatacenterWriteResponseHandler filters errors from remote DCs - .collect(Collectors.toMap(Function.identity(), this.getFailureReasonByEndpointMap()::get)))) + // failures keeps incrementing, and this.failureReasonByEndpoint keeps getting new entries after signaling. + // Simpler to reason about what happened by copying this.failureReasonByEndpoint and then inferring + // failures from it + final Map failureReasonByEndpoint = getFailureReasonByEndpointMap().keySet().stream() + .filter(this::waitingFor) // DatacenterWriteResponseHandler filters errors from remote DCs + .collect(toImmutableMap(Function.identity(), getFailureReasonByEndpointMap()::get)); + final int failures = failureReasonByEndpoint.size(); + if (RequestCallback.isTimeout(failureReasonByEndpoint)) throwTimeout(); - throw new WriteFailureException(replicaPlan.consistencyLevel(), ackCount(), blockFor(), writeType, this.getFailureReasonByEndpointMap()); + int transactionRetryErrors = 0; + int coordinatorBehindErrors = 0; + for (RequestFailureReason reason : failureReasonByEndpoint.values()) + { + if (reason == RETRY_ON_DIFFERENT_TRANSACTION_SYSTEM) + transactionRetryErrors++; + if (reason == COORDINATOR_BEHIND) + coordinatorBehindErrors++; + } + int totalRetriableFailures = transactionRetryErrors + coordinatorBehindErrors; + + // Retrying might fix this + if (candidateReplicaCount - failures + totalRetriableFailures >= blockFor()) + { + // Doesn't matter which we throw really but for clarity/metrics be specific + // Retrying on the correct system might make this write succeed + if (transactionRetryErrors > 0) + throw new RetryOnDifferentSystemException(); + if (coordinatorBehindErrors > 0) + throw new CoordinatorBehindException("Write request failed due to coordinator behind"); + } + + throw new WriteFailureException(replicaPlan.consistencyLevel(), ackCount(), blockFor(), writeType, getFailureReasonByEndpointMap()); } if (replicaPlan.stillAppliesTo(ClusterMetadata.current())) @@ -295,9 +332,9 @@ protected void signal() } @Override - public void onFailure(InetAddressAndPort from, RequestFailureReason failureReason) + public void onFailure(InetAddressAndPort from, RequestFailure failure) { - logger.trace("Got failure from {}", from); + logger.trace("Got failure {} from {}", failure, from); int n = waitingFor(from) ? failuresUpdater.incrementAndGet(this) @@ -309,15 +346,27 @@ public void onFailure(InetAddressAndPort from, RequestFailureReason failureReaso if (failureReasonByEndpoint == null) failureReasonByEndpoint = new ConcurrentHashMap<>(); } - failureReasonByEndpoint.put(from, failureReason); + failureReasonByEndpoint.put(from, failure.reason); logFailureOrTimeoutToIdealCLDelegate(); if (blockFor() + n > candidateReplicaCount()) signal(); - if (hintOnFailure != null && StorageProxy.shouldHint(replicaPlan.lookup(from)) && requestTime.shouldSendHints()) - StorageProxy.submitHint(hintOnFailure.get(), replicaPlan.lookup(from), null); + // If the failure was RETRY_ON_DIFFERENT_TRANSACTION_SYSTEM then we only want to hint once + // and not for each instance since odds are it will be applied as a transaction at all replicas + if (hintOnFailure != null && StorageProxy.shouldHint(replicaPlan.lookup(from)) ) + { + if (failure.reason == RETRY_ON_DIFFERENT_TRANSACTION_SYSTEM) + { + if (alreadyHintedForRetryOnDifferentSystemUpdater.compareAndSet(this, 0, 1)) + StorageProxy.submitHintForRetryOnDifferentSystem(hintOnFailure.get()); + } + else + { + StorageProxy.submitHint(hintOnFailure.get(), replicaPlan.lookup(from), null); + } + } } @Override diff --git a/src/java/org/apache/cassandra/service/ActiveRepairService.java b/src/java/org/apache/cassandra/service/ActiveRepairService.java index 1a60bd306745..bee41a7fa3a3 100644 --- a/src/java/org/apache/cassandra/service/ActiveRepairService.java +++ b/src/java/org/apache/cassandra/service/ActiveRepairService.java @@ -35,12 +35,12 @@ import java.util.concurrent.ThreadPoolExecutor; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; -import javax.management.openmbean.CompositeData; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicInteger; import java.util.function.Predicate; import java.util.function.Supplier; import java.util.stream.Collectors; +import javax.management.openmbean.CompositeData; import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Preconditions; @@ -50,7 +50,6 @@ import com.google.common.collect.Iterables; import com.google.common.collect.Lists; import com.google.common.collect.Multimap; - import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -64,6 +63,7 @@ import org.apache.cassandra.dht.Range; import org.apache.cassandra.dht.Token; import org.apache.cassandra.exceptions.ConfigurationException; +import org.apache.cassandra.exceptions.RequestFailure; import org.apache.cassandra.exceptions.RequestFailureReason; import org.apache.cassandra.gms.ApplicationState; import org.apache.cassandra.gms.EndpointState; @@ -104,6 +104,7 @@ import org.apache.cassandra.schema.ReplicationParams; import org.apache.cassandra.schema.TableId; import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.service.disk.usage.DiskUsageMonitor; import org.apache.cassandra.service.paxos.PaxosRepair; import org.apache.cassandra.service.paxos.cleanup.PaxosCleanup; import org.apache.cassandra.service.snapshot.SnapshotManager; @@ -130,7 +131,14 @@ import static org.apache.cassandra.config.CassandraRelevantProperties.SKIP_PAXOS_REPAIR_ON_TOPOLOGY_CHANGE; import static org.apache.cassandra.config.CassandraRelevantProperties.SKIP_PAXOS_REPAIR_ON_TOPOLOGY_CHANGE_KEYSPACES; import static org.apache.cassandra.config.Config.RepairCommandPoolFullStrategy.reject; -import static org.apache.cassandra.config.DatabaseDescriptor.*; +import static org.apache.cassandra.config.DatabaseDescriptor.getRepairCommandPoolFullStrategy; +import static org.apache.cassandra.config.DatabaseDescriptor.getRepairCommandPoolSize; +import static org.apache.cassandra.config.DatabaseDescriptor.getRepairRetrySpec; +import static org.apache.cassandra.config.DatabaseDescriptor.getRepairRpcTimeout; +import static org.apache.cassandra.config.DatabaseDescriptor.getRepairStateExpires; +import static org.apache.cassandra.config.DatabaseDescriptor.getRepairStateSize; +import static org.apache.cassandra.config.DatabaseDescriptor.getRpcTimeout; +import static org.apache.cassandra.config.DatabaseDescriptor.paxosRepairEnabled; import static org.apache.cassandra.net.Verb.PREPARE_MSG; import static org.apache.cassandra.repair.messages.RepairMessage.notDone; import static org.apache.cassandra.utils.Simulate.With.MONITORS; @@ -447,15 +455,17 @@ public int parentRepairSessionsCount() */ public RepairSession submitRepairSession(TimeUUID parentRepairSession, CommonRange range, + boolean excludedDeadNodes, String keyspace, RepairParallelism parallelismDegree, boolean isIncremental, boolean pullRepair, PreviewKind previewKind, boolean optimiseStreams, + boolean repairData, boolean repairPaxos, - boolean paxosOnly, boolean dontPurgeTombstones, + boolean repairAccord, ExecutorPlus executor, Scheduler validationScheduler, String... cfnames) @@ -469,9 +479,11 @@ public RepairSession submitRepairSession(TimeUUID parentRepairSession, if (cfnames.length == 0) return null; - final RepairSession session = new RepairSession(ctx, validationScheduler, parentRepairSession, range, keyspace, + final RepairSession session = new RepairSession(ctx, validationScheduler, parentRepairSession, + range, excludedDeadNodes, keyspace, parallelismDegree, isIncremental, pullRepair, - previewKind, optimiseStreams, repairPaxos, paxosOnly, dontPurgeTombstones, cfnames); + previewKind, optimiseStreams, repairData, repairPaxos, + dontPurgeTombstones, repairAccord, cfnames); repairs.getIfPresent(parentRepairSession).register(session.state); sessions.put(session.getId(), session); @@ -658,6 +670,9 @@ public boolean verifyCompactionsPendingThreshold(TimeUUID parentRepairSession, P public Future prepareForRepair(TimeUUID parentRepairSession, InetAddressAndPort coordinator, Set endpoints, RepairOption options, boolean isForcedRepair, List columnFamilyStores) { + if (!verifyDiskHeadroomThreshold(parentRepairSession, options.getPreviewKind(), options.isIncremental())) + failRepair(parentRepairSession, "Rejecting incoming repair, disk usage above threshold"); // failRepair throws exception + if (!verifyCompactionsPendingThreshold(parentRepairSession, options.getPreviewKind())) failRepair(parentRepairSession, "Rejecting incoming repair, pending compactions above threshold"); // failRepair throws exception @@ -715,6 +730,24 @@ public Future prepareForRepair(TimeUUID parentRepairSession, InetAddressAndPo return promise; } + public static boolean verifyDiskHeadroomThreshold(TimeUUID parentRepairSession, PreviewKind previewKind, boolean isIncremental) + { + if (!isIncremental) // disk headroom is required for anti-compaction which is only performed by incremental repair + return true; + + double diskUsage = DiskUsageMonitor.instance.getDiskUsage(); + double rejectRatio = ActiveRepairService.instance().getIncrementalRepairDiskHeadroomRejectRatio(); + + if (diskUsage + rejectRatio > 1) + { + logger.error("[{}] Rejecting incoming repair, disk usage ({}%) above threshold ({}%)", + previewKind.logPrefix(parentRepairSession), String.format("%.2f", diskUsage * 100), String.format("%.2f", (1 - rejectRatio) * 100)); + return false; + } + + return true; + } + private void sendPrepareWithRetries(TimeUUID parentRepairSession, AtomicInteger pending, Set failedNodes, @@ -731,10 +764,10 @@ public void onResponse(Message msg) } @Override - public void onFailure(InetAddressAndPort from, RequestFailureReason failureReason) + public void onFailure(InetAddressAndPort from, RequestFailure failure) { failedNodes.add(from.toString()); - if (failureReason == RequestFailureReason.TIMEOUT) + if (failure.reason == RequestFailureReason.TIMEOUT) { pending.set(-1); promise.setFailure(failRepairException(parentRepairSession, "Did not get replies from all endpoints.")); @@ -787,7 +820,7 @@ public void onResponse(Message msg) } @Override - public void onFailure(InetAddressAndPort from, RequestFailureReason failureReason) + public void onFailure(InetAddressAndPort from, RequestFailure failure) { logger.debug("Failed to clean up parent repair session {} on {}. The uncleaned sessions will " + "be removed on a node restart. This should not be a problem unless you see thousands " + @@ -1075,6 +1108,16 @@ public void setRepairPendingCompactionRejectThreshold(int value) DatabaseDescriptor.setRepairPendingCompactionRejectThreshold(value); } + public double getIncrementalRepairDiskHeadroomRejectRatio() + { + return DatabaseDescriptor.getIncrementalRepairDiskHeadroomRejectRatio(); + } + + public void setIncrementalRepairDiskHeadroomRejectRatio(double value) + { + DatabaseDescriptor.setIncrementalRepairDiskHeadroomRejectRatio(value); + } + /** * Remove any parent repair sessions matching predicate */ diff --git a/src/java/org/apache/cassandra/service/ActiveRepairServiceMBean.java b/src/java/org/apache/cassandra/service/ActiveRepairServiceMBean.java index 851dc6c802bb..c739b048d68f 100644 --- a/src/java/org/apache/cassandra/service/ActiveRepairServiceMBean.java +++ b/src/java/org/apache/cassandra/service/ActiveRepairServiceMBean.java @@ -74,4 +74,8 @@ public interface ActiveRepairServiceMBean int parentRepairSessionsCount(); public int getPaxosRepairParallelism(); public void setPaxosRepairParallelism(int v); + + public double getIncrementalRepairDiskHeadroomRejectRatio(); + + public void setIncrementalRepairDiskHeadroomRejectRatio(double value); } diff --git a/src/java/org/apache/cassandra/service/AutoRepairService.java b/src/java/org/apache/cassandra/service/AutoRepairService.java new file mode 100644 index 000000000000..db1b29c38968 --- /dev/null +++ b/src/java/org/apache/cassandra/service/AutoRepairService.java @@ -0,0 +1,326 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.service; + +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.config.ParameterizedClass; +import org.apache.cassandra.exceptions.ConfigurationException; +import org.apache.cassandra.locator.InetAddressAndPort; +import org.apache.cassandra.repair.autorepair.AutoRepairConfig; +import org.apache.cassandra.repair.autorepair.AutoRepairConfig.RepairType; +import org.apache.cassandra.repair.autorepair.AutoRepairUtils; +import org.apache.cassandra.utils.MBeanWrapper; + +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.UUID; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Joiner; + +/** + * Implement all the MBeans for AutoRepair. + */ +public class AutoRepairService implements AutoRepairServiceMBean +{ + public static final String MBEAN_NAME = "org.apache.cassandra.db:type=AutoRepairService"; + + @VisibleForTesting + protected AutoRepairConfig config; + + public static final AutoRepairService instance = new AutoRepairService(); + + @VisibleForTesting + protected AutoRepairService() + { + } + + public static void setup() + { + instance.config = DatabaseDescriptor.getAutoRepairConfig(); + } + + static + { + MBeanWrapper.instance.registerMBean(instance, MBEAN_NAME); + } + + public void checkCanRun(String repairType) + { + checkCanRun(RepairType.parse(repairType)); + } + + public void checkCanRun(RepairType repairType) + { + if (!config.isAutoRepairSchedulingEnabled()) + throw new ConfigurationException("Auto-repair scheduler is disabled."); + + if (repairType != RepairType.INCREMENTAL) + return; + + if (config.getMaterializedViewRepairEnabled(repairType) && DatabaseDescriptor.isMaterializedViewsOnRepairEnabled()) + throw new ConfigurationException("Cannot run incremental repair while materialized view replay is enabled. Set materialized_views_on_repair_enabled to false."); + + if (DatabaseDescriptor.isCDCEnabled() && DatabaseDescriptor.isCDCOnRepairEnabled()) + throw new ConfigurationException("Cannot run incremental repair while CDC replay is enabled. Set cdc_on_repair_enabled to false."); + } + + public AutoRepairConfig getAutoRepairConfig() + { + return config; + } + + @Override + public boolean isAutoRepairDisabled() + { + return config == null || !config.isAutoRepairSchedulingEnabled(); + } + + @Override + public String getAutoRepairConfiguration() + { + StringBuilder sb = new StringBuilder(); + sb.append("repair scheduler configuration:"); + appendConfig(sb, "repair_check_interval", config.getRepairCheckInterval()); + appendConfig(sb, "repair_task_min_duration", config.getRepairTaskMinDuration()); + appendConfig(sb, "history_clear_delete_hosts_buffer_interval", config.getAutoRepairHistoryClearDeleteHostsBufferInterval()); + for (RepairType repairType : RepairType.values()) + { + sb.append(formatRepairTypeConfig(repairType, config)); + } + return sb.toString(); + } + + @Override + public void setAutoRepairEnabled(String repairType, boolean enabled) + { + checkCanRun(repairType); + config.setAutoRepairEnabled(RepairType.parse(repairType), enabled); + } + + @Override + public void setRepairThreads(String repairType, int repairThreads) + { + config.setRepairThreads(RepairType.parse(repairType), repairThreads); + } + + @Override + public void setRepairPriorityForHosts(String repairType, String commaSeparatedHostSet) + { + Set hosts = InetAddressAndPort.parseHosts(commaSeparatedHostSet, false); + if (!hosts.isEmpty()) + { + AutoRepairUtils.addPriorityHosts(RepairType.parse(repairType), hosts); + } + } + + @Override + public void setForceRepairForHosts(String repairType, String commaSeparatedHostSet) + { + Set hosts = InetAddressAndPort.parseHosts(commaSeparatedHostSet, false); + if (!hosts.isEmpty()) + { + AutoRepairUtils.setForceRepair(RepairType.parse(repairType), hosts); + } + } + + @Override + public void setRepairMinInterval(String repairType, String minRepairInterval) + { + config.setRepairMinInterval(RepairType.parse(repairType), minRepairInterval); + } + + @Override + public void startScheduler() + { + config.startScheduler(); + } + + @Override + public void setAutoRepairHistoryClearDeleteHostsBufferDuration(String duration) + { + config.setAutoRepairHistoryClearDeleteHostsBufferInterval(duration); + } + + @Override + public void setAutoRepairMinRepairTaskDuration(String duration) + { + config.setRepairTaskMinDuration(duration); + } + + @Override + public void setRepairSSTableCountHigherThreshold(String repairType, int sstableHigherThreshold) + { + config.setRepairSSTableCountHigherThreshold(RepairType.parse(repairType), sstableHigherThreshold); + } + + @Override + public void setAutoRepairTableMaxRepairTime(String repairType, String autoRepairTableMaxRepairTime) + { + config.setAutoRepairTableMaxRepairTime(RepairType.parse(repairType), autoRepairTableMaxRepairTime); + } + + @Override + public void setIgnoreDCs(String repairType, Set ignoreDCs) + { + config.setIgnoreDCs(RepairType.parse(repairType), ignoreDCs); + } + + @Override + public void setPrimaryTokenRangeOnly(String repairType, boolean primaryTokenRangeOnly) + { + config.setRepairPrimaryTokenRangeOnly(RepairType.parse(repairType), primaryTokenRangeOnly); + } + + @Override + public void setParallelRepairPercentage(String repairType, int percentage) + { + config.setParallelRepairPercentage(RepairType.parse(repairType), percentage); + } + + @Override + public void setParallelRepairCount(String repairType, int count) + { + config.setParallelRepairCount(RepairType.parse(repairType), count); + } + + @Override + public void setAllowParallelReplicaRepair(String repairType, boolean enabled) + { + config.setAllowParallelReplicaRepair(RepairType.parse(repairType), enabled); + } + + @Override + public void setAllowParallelReplicaRepairAcrossSchedules(String repairType, boolean enabled) + { + config.setAllowParallelReplicaRepairAcrossSchedules(RepairType.parse(repairType), enabled); + } + + @Override + public void setMVRepairEnabled(String repairType, boolean enabled) + { + config.setMaterializedViewRepairEnabled(RepairType.parse(repairType), enabled); + } + + @Override + public void setRepairSessionTimeout(String repairType, String timeout) + { + config.setRepairSessionTimeout(RepairType.parse(repairType), timeout); + } + + @Override + public Set getOnGoingRepairHostIds(String repairType) + { + List histories = AutoRepairUtils.getAutoRepairHistory(RepairType.parse(repairType)); + if (histories == null) + { + return Collections.emptySet(); + } + Set hostIds = new HashSet<>(); + AutoRepairUtils.CurrentRepairStatus currentRepairStatus = new AutoRepairUtils.CurrentRepairStatus(histories, AutoRepairUtils.getPriorityHostIds(RepairType.parse(repairType)), null); + for (UUID id : currentRepairStatus.hostIdsWithOnGoingRepair) + { + hostIds.add(id.toString()); + } + for (UUID id : currentRepairStatus.hostIdsWithOnGoingForceRepair) + { + hostIds.add(id.toString()); + } + return Collections.unmodifiableSet(hostIds); + } + + @Override + public void setAutoRepairTokenRangeSplitterParameter(String repairType, String key, String value) + { + config.getTokenRangeSplitterInstance(RepairType.parse(repairType)).setParameter(key, value); + } + + @Override + public void setRepairByKeyspace(String repairType, boolean repairByKeyspace) + { + config.setRepairByKeyspace(RepairType.parse(repairType), repairByKeyspace); + } + + @Override + public void setAutoRepairMaxRetriesCount(String repairType, int retries) + { + config.setRepairMaxRetries(RepairType.parse(repairType), retries); + } + + @Override + public void setAutoRepairRetryBackoff(String repairType, String interval) + { + config.setRepairRetryBackoff(RepairType.parse(repairType), interval); + } + + private String formatRepairTypeConfig(RepairType repairType, AutoRepairConfig config) + { + StringBuilder sb = new StringBuilder(); + sb.append("\nconfiguration for repair_type: ").append(repairType.getConfigName()); + sb.append("\n\tenabled: ").append(config.isAutoRepairEnabled(repairType)); + // Only show configuration if enabled + if (config.isAutoRepairEnabled(repairType)) + { + Set priorityHosts = AutoRepairUtils.getPriorityHosts(repairType); + if (!priorityHosts.isEmpty()) + { + appendConfig(sb, "priority_hosts", Joiner.on(',').skipNulls().join(priorityHosts)); + } + + appendConfig(sb, "min_repair_interval", config.getRepairMinInterval(repairType)); + appendConfig(sb, "repair_by_keyspace", config.getRepairByKeyspace(repairType)); + appendConfig(sb, "number_of_repair_threads", config.getRepairThreads(repairType)); + appendConfig(sb, "sstable_upper_threshold", config.getRepairSSTableCountHigherThreshold(repairType)); + appendConfig(sb, "table_max_repair_time", config.getAutoRepairTableMaxRepairTime(repairType)); + appendConfig(sb, "ignore_dcs", config.getIgnoreDCs(repairType)); + appendConfig(sb, "repair_primary_token_range_only", config.getRepairPrimaryTokenRangeOnly(repairType)); + appendConfig(sb, "parallel_repair_count", config.getParallelRepairCount(repairType)); + appendConfig(sb, "parallel_repair_percentage", config.getParallelRepairPercentage(repairType)); + appendConfig(sb, "allow_parallel_replica_repair", config.getAllowParallelReplicaRepair(repairType)); + appendConfig(sb, "allow_parallel_replica_repair_across_schedules", config.getAllowParallelReplicaRepairAcrossSchedules(repairType)); + appendConfig(sb, "materialized_view_repair_enabled", config.getMaterializedViewRepairEnabled(repairType)); + appendConfig(sb, "initial_scheduler_delay", config.getInitialSchedulerDelay(repairType)); + appendConfig(sb, "repair_session_timeout", config.getRepairSessionTimeout(repairType)); + appendConfig(sb, "force_repair_new_node", config.getForceRepairNewNode(repairType)); + appendConfig(sb, "repair_max_retries", config.getRepairMaxRetries(repairType)); + appendConfig(sb, "repair_retry_backoff", config.getRepairRetryBackoff(repairType)); + + final ParameterizedClass splitterClass = config.getTokenRangeSplitter(repairType); + final String splitterClassName = splitterClass.class_name != null ? splitterClass.class_name : AutoRepairConfig.DEFAULT_SPLITTER.getName(); + appendConfig(sb, "token_range_splitter", splitterClassName); + Map tokenRangeSplitterParameters = config.getTokenRangeSplitterInstance(repairType).getParameters(); + if (!tokenRangeSplitterParameters.isEmpty()) + { + for (Map.Entry param : tokenRangeSplitterParameters.entrySet()) + { + appendConfig(sb, String.format("token_range_splitter.%s", param.getKey()), param.getValue()); + } + } + } + + return sb.toString(); + } + + private void appendConfig(StringBuilder sb, String config, T value) + { + sb.append(String.format("%s%s: %s", "\n\t", config, value)); + } +} diff --git a/src/java/org/apache/cassandra/service/AutoRepairServiceMBean.java b/src/java/org/apache/cassandra/service/AutoRepairServiceMBean.java new file mode 100644 index 000000000000..181c6008f533 --- /dev/null +++ b/src/java/org/apache/cassandra/service/AutoRepairServiceMBean.java @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.service; + + +import java.util.Set; + +/** + * Defines all the MBeans exposed for AutoRepair. + */ +public interface AutoRepairServiceMBean +{ + public void setAutoRepairEnabled(String repairType, boolean enabled); + + public void setRepairThreads(String repairType, int repairThreads); + + public void setRepairPriorityForHosts(String repairType, String commaSeparatedHostSet); + + public void setForceRepairForHosts(String repairType, String commaSeparatedHostSet); + + public void setRepairMinInterval(String repairType, String minRepairInterval); + + void startScheduler(); + + public void setAutoRepairHistoryClearDeleteHostsBufferDuration(String duration); + + public void setAutoRepairMinRepairTaskDuration(String duration); + + public void setRepairSSTableCountHigherThreshold(String repairType, int ssTableHigherThreshold); + + public void setAutoRepairTableMaxRepairTime(String repairType, String autoRepairTableMaxRepairTime); + + public void setIgnoreDCs(String repairType, Set ignorDCs); + + public void setPrimaryTokenRangeOnly(String repairType, boolean primaryTokenRangeOnly); + + public void setParallelRepairPercentage(String repairType, int percentage); + + public void setParallelRepairCount(String repairType, int count); + + public void setAllowParallelReplicaRepair(String repairType, boolean enabled); + + public void setAllowParallelReplicaRepairAcrossSchedules(String repairType, boolean enabled); + + public void setMVRepairEnabled(String repairType, boolean enabled); + + public boolean isAutoRepairDisabled(); + + public String getAutoRepairConfiguration(); + + public void setRepairSessionTimeout(String repairType, String timeout); + + public Set getOnGoingRepairHostIds(String repairType); + + public void setAutoRepairTokenRangeSplitterParameter(String repairType, String key, String value); + + public void setRepairByKeyspace(String repairType, boolean repairByKeyspace); + + public void setAutoRepairMaxRetriesCount(String repairType, int retries); + + public void setAutoRepairRetryBackoff(String repairType, String interval); +} diff --git a/src/java/org/apache/cassandra/service/BatchlogResponseHandler.java b/src/java/org/apache/cassandra/service/BatchlogResponseHandler.java index 0fa284770080..41b68de395f5 100644 --- a/src/java/org/apache/cassandra/service/BatchlogResponseHandler.java +++ b/src/java/org/apache/cassandra/service/BatchlogResponseHandler.java @@ -20,7 +20,7 @@ import java.util.concurrent.atomic.AtomicIntegerFieldUpdater; -import org.apache.cassandra.exceptions.RequestFailureReason; +import org.apache.cassandra.exceptions.RequestFailure; import org.apache.cassandra.exceptions.WriteFailureException; import org.apache.cassandra.exceptions.WriteTimeoutException; import org.apache.cassandra.locator.InetAddressAndPort; @@ -29,9 +29,10 @@ public class BatchlogResponseHandler extends AbstractWriteResponseHandler { - AbstractWriteResponseHandler wrapped; - BatchlogCleanup cleanup; + final AbstractWriteResponseHandler wrapped; + final BatchlogCleanup cleanup; protected volatile int requiredBeforeFinish; + private static final AtomicIntegerFieldUpdater requiredBeforeFinishUpdater = AtomicIntegerFieldUpdater.newUpdater(BatchlogResponseHandler.class, "requiredBeforeFinish"); @@ -55,9 +56,9 @@ public void onResponse(Message msg) cleanup.ackMutation(); } - public void onFailure(InetAddressAndPort from, RequestFailureReason failureReason) + public void onFailure(InetAddressAndPort from, RequestFailure failure) { - wrapped.onFailure(from, failureReason); + wrapped.onFailure(from, failure); } public boolean invokeOnFailure() @@ -104,6 +105,11 @@ public BatchlogCleanup(int mutationsWaitingFor, BatchlogCleanupCallback callback this.callback = callback; } + public BatchlogCleanup(BatchlogCleanupCallback callback) + { + this.callback = callback; + } + public int decrement() { return mutationsWaitingForUpdater.decrementAndGet(this); @@ -114,6 +120,11 @@ public void ackMutation() if (decrement() == 0) callback.invoke(); } + + public void setMutationsWaitingFor(int mutationsWaitingFor) + { + mutationsWaitingForUpdater.lazySet(this, mutationsWaitingFor); + } } public interface BatchlogCleanupCallback diff --git a/src/java/org/apache/cassandra/service/CASRequest.java b/src/java/org/apache/cassandra/service/CASRequest.java index 50ea5852a63a..9a0592f4cd2e 100644 --- a/src/java/org/apache/cassandra/service/CASRequest.java +++ b/src/java/org/apache/cassandra/service/CASRequest.java @@ -17,13 +17,19 @@ */ package org.apache.cassandra.service; +import accord.primitives.Txn; +import org.apache.cassandra.db.ConsistencyLevel; import org.apache.cassandra.db.SinglePartitionReadCommand; import org.apache.cassandra.db.partitions.FilteredPartition; import org.apache.cassandra.db.partitions.PartitionUpdate; import org.apache.cassandra.exceptions.InvalidRequestException; +import org.apache.cassandra.service.accord.txn.TxnResult; import org.apache.cassandra.service.paxos.Ballot; +import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.transport.Dispatcher; +import static org.apache.cassandra.service.StorageProxy.ConsensusAttemptResult; + /** * Abstract the conditions and updates for a CAS operation. */ @@ -34,17 +40,21 @@ public interface CASRequest /** * The command to use to fetch the value to compare for the CAS. */ - public SinglePartitionReadCommand readCommand(long nowInSec); + SinglePartitionReadCommand readCommand(long nowInSec); /** * Returns whether the provided CF, that represents the values fetched using the * readFilter(), match the CAS conditions this object stands for. */ - public boolean appliesTo(FilteredPartition current) throws InvalidRequestException; + boolean appliesTo(FilteredPartition current) throws InvalidRequestException; /** * The updates to perform of a CAS success. The values fetched using the readFilter() * are passed as argument. */ - public PartitionUpdate makeUpdates(FilteredPartition current, ClientState clientState, Ballot ballot) throws InvalidRequestException; + PartitionUpdate makeUpdates(FilteredPartition current, ClientState clientState, Ballot ballot) throws InvalidRequestException; + + Txn toAccordTxn(ClusterMetadata cm, ConsistencyLevel consistencyLevel, ConsistencyLevel commitConsistencyLevel, ClientState clientState, long nowInSecs); + + ConsensusAttemptResult toCasResult(TxnResult txnResult); } diff --git a/src/java/org/apache/cassandra/service/CassandraDaemon.java b/src/java/org/apache/cassandra/service/CassandraDaemon.java index 57f38e93e9a0..694725c8586d 100644 --- a/src/java/org/apache/cassandra/service/CassandraDaemon.java +++ b/src/java/org/apache/cassandra/service/CassandraDaemon.java @@ -58,6 +58,9 @@ import org.apache.cassandra.db.SystemKeyspace; import org.apache.cassandra.db.SystemKeyspaceMigrator41; import org.apache.cassandra.db.commitlog.CommitLog; +import org.apache.cassandra.db.virtual.AccordDebugKeyspace; +import org.apache.cassandra.db.virtual.LogMessagesTable; +import org.apache.cassandra.db.virtual.SlowQueriesTable; import org.apache.cassandra.db.virtual.SystemViewsKeyspace; import org.apache.cassandra.db.virtual.VirtualKeyspace; import org.apache.cassandra.db.virtual.VirtualKeyspaceRegistry; @@ -78,6 +81,7 @@ import org.apache.cassandra.schema.Schema; import org.apache.cassandra.schema.SchemaConstants; import org.apache.cassandra.security.ThreadAwareSecurityManager; +import org.apache.cassandra.service.accord.AccordOperations; import org.apache.cassandra.service.paxos.PaxosState; import org.apache.cassandra.service.snapshot.SnapshotManager; import org.apache.cassandra.streaming.StreamManager; @@ -92,6 +96,7 @@ import org.apache.cassandra.utils.concurrent.Future; import org.apache.cassandra.utils.concurrent.FutureCombiner; import org.apache.cassandra.utils.logging.LoggingSupportFactory; +import org.apache.cassandra.utils.logging.SlowQueriesAppender; import org.apache.cassandra.utils.logging.VirtualTableAppender; import static java.util.concurrent.TimeUnit.NANOSECONDS; @@ -269,6 +274,7 @@ protected void setup() Startup.initialize(DatabaseDescriptor.getSeeds()); disableAutoCompaction(Schema.instance.distributedKeyspaces().names()); CMSOperations.initJmx(); + AccordOperations.initJmx(); if (ClusterMetadata.current().myNodeId() != null) RegistrationStatus.instance.onRegistration(); } @@ -400,6 +406,8 @@ protected void setup() AuditLogManager.instance.initialize(); + StorageService.instance.doAutoRepairSetup(); + // schedule periodic background compaction task submission. this is simply a backstop against compactions stalling // due to scheduling errors or race conditions ScheduledExecutors.optionalTasks.scheduleWithFixedDelay(ColumnFamilyStore.getBackgroundCompactionTaskSubmitter(), 5, 1, TimeUnit.MINUTES); @@ -420,7 +428,6 @@ protected void setup() logger.info("Prewarming of auth caches is disabled"); PaxosState.startAutoRepairs(); - completeSetup(); } @@ -434,7 +441,6 @@ public void runStartupChecks() { exitOrFail(e.returnCode, e.getMessage(), e.getCause()); } - } /** @@ -550,11 +556,20 @@ public void setupVirtualKeyspaces() VirtualKeyspaceRegistry.instance.register(SystemViewsKeyspace.instance); VirtualKeyspaceRegistry.instance.register(new VirtualKeyspace(VIRTUAL_METRICS, createMetricsKeyspaceTables())); - // flush log messages to system_views.system_logs virtual table as there were messages already logged - // before that virtual table was instantiated + if (DatabaseDescriptor.getAccord().enable_virtual_debug_only_keyspace) + VirtualKeyspaceRegistry.instance.register(AccordDebugKeyspace.instance); + + // Flush log messages to system_views.system_logs virtual table as there were messages already logged + // before that virtual table was instantiated. + // In general, there is no need to do same treatment for slow queries as by the time queries are processed + // the logging framework if fully setup already but for the sake of it and to be sure, just do it as well. LoggingSupportFactory.getLoggingSupport() .getAppender(VirtualTableAppender.class, VirtualTableAppender.APPENDER_NAME) - .ifPresent(appender -> ((VirtualTableAppender) appender).flushBuffer()); + .ifPresent(appender -> appender.flushBuffer(LogMessagesTable.class, LogMessagesTable.TABLE_NAME)); + + LoggingSupportFactory.getLoggingSupport() + .getAppender(SlowQueriesAppender.class, SlowQueriesAppender.APPENDER_NAME) + .ifPresent(appender -> appender.flushBuffer(SlowQueriesTable.class, SlowQueriesTable.TABLE_NAME)); } public synchronized void initializeClientTransports() diff --git a/src/java/org/apache/cassandra/service/DiskErrorsHandler.java b/src/java/org/apache/cassandra/service/DiskErrorsHandler.java index b4fe9d67db67..14add63feefa 100644 --- a/src/java/org/apache/cassandra/service/DiskErrorsHandler.java +++ b/src/java/org/apache/cassandra/service/DiskErrorsHandler.java @@ -18,8 +18,6 @@ package org.apache.cassandra.service; -import com.google.common.annotations.VisibleForTesting; - import org.apache.cassandra.io.FSError; import org.apache.cassandra.io.sstable.CorruptSSTableException; @@ -43,8 +41,7 @@ class NoOpDiskErrorHandler implements DiskErrorsHandler { public static final DiskErrorsHandler NO_OP = new NoOpDiskErrorHandler(); - @VisibleForTesting - NoOpDiskErrorHandler() {} + private NoOpDiskErrorHandler() {} @Override public void inspectCommitLogError(Throwable t) {} diff --git a/src/java/org/apache/cassandra/service/DiskErrorsHandlerService.java b/src/java/org/apache/cassandra/service/DiskErrorsHandlerService.java index 97e7ecde5fba..98fb7ee609d9 100644 --- a/src/java/org/apache/cassandra/service/DiskErrorsHandlerService.java +++ b/src/java/org/apache/cassandra/service/DiskErrorsHandlerService.java @@ -35,7 +35,7 @@ public class DiskErrorsHandlerService private static volatile DiskErrorsHandler instance = NO_OP; @VisibleForTesting - public static synchronized void set(DiskErrorsHandler newInstance) + public static synchronized void set(DiskErrorsHandler newInstance) throws ConfigurationException { if (newInstance == null) return; @@ -58,7 +58,7 @@ public static synchronized void set(DiskErrorsHandler newInstance) } catch (Throwable t) { - logger.warn("Exception occured while initializing disk error handler of class " + newInstance.getClass().getName(), t); + throw new ConfigurationException("Exception occured while initializing disk error handler of class " + newInstance.getClass().getName(), t); } } diff --git a/src/java/org/apache/cassandra/service/FailureRecordingCallback.java b/src/java/org/apache/cassandra/service/FailureRecordingCallback.java index c4ca8e22f5ed..aebeea49df80 100644 --- a/src/java/org/apache/cassandra/service/FailureRecordingCallback.java +++ b/src/java/org/apache/cassandra/service/FailureRecordingCallback.java @@ -25,6 +25,7 @@ import java.util.Set; import java.util.concurrent.atomic.AtomicReferenceFieldUpdater; +import org.apache.cassandra.exceptions.RequestFailure; import org.apache.cassandra.exceptions.RequestFailureReason; import org.apache.cassandra.locator.InetAddressAndPort; import org.apache.cassandra.net.RequestCallbackWithFailure; @@ -65,7 +66,7 @@ public RequestFailureReason setValue(RequestFailureReason value) public static void push(AtomicReferenceFieldUpdater headUpdater, O owner, InetAddressAndPort from, RequestFailureReason reason) { - push(headUpdater, owner, new FailureResponses(from, reason)); + getAndPush(headUpdater, owner, new FailureResponses(from, reason)); } public static void pushExclusive(AtomicReferenceFieldUpdater headUpdater, O owner, InetAddressAndPort from, RequestFailureReason reason) @@ -136,9 +137,9 @@ public int failureCount() private static final AtomicReferenceFieldUpdater responsesUpdater = AtomicReferenceFieldUpdater.newUpdater(FailureRecordingCallback.class, FailureResponses.class, "failureResponses"); @Override - public void onFailure(InetAddressAndPort from, RequestFailureReason failureReason) + public void onFailure(InetAddressAndPort from, RequestFailure failure) { - FailureResponses.push(responsesUpdater, this, from, failureReason); + FailureResponses.push(responsesUpdater, this, from, failure.reason); } protected void onFailureWithMutex(InetAddressAndPort from, RequestFailureReason failureReason) diff --git a/src/java/org/apache/cassandra/service/FileSystemOwnershipCheck.java b/src/java/org/apache/cassandra/service/FileSystemOwnershipCheck.java index 3d69c9e7631c..6e6ebd67b7a6 100644 --- a/src/java/org/apache/cassandra/service/FileSystemOwnershipCheck.java +++ b/src/java/org/apache/cassandra/service/FileSystemOwnershipCheck.java @@ -75,9 +75,6 @@ public class FileSystemOwnershipCheck implements StartupCheck { private static final Logger logger = LoggerFactory.getLogger(FileSystemOwnershipCheck.class); - public static final String FILE_SYSTEM_CHECK_OWNERSHIP_TOKEN = "CassandraOwnershipToken"; - public static final String DEFAULT_FS_OWNERSHIP_FILENAME = ".cassandra_fs_ownership"; - // Ownership file properties static final String VERSION = "version"; static final String VOLUME_COUNT = "volume_count"; @@ -230,7 +227,7 @@ protected String constructTokenFromProperties(Map config) throws { String cluster = getOwnershipToken(config); if (null == cluster || cluster.isEmpty()) - throw exception(String.format(MISSING_PROPERTY, FILE_SYSTEM_CHECK_OWNERSHIP_TOKEN)); + throw exception(String.format(MISSING_PROPERTY, CassandraRelevantProperties.FILE_SYSTEM_CHECK_OWNERSHIP_TOKEN.getKey())); return cluster; } diff --git a/src/java/org/apache/cassandra/service/Rebuild.java b/src/java/org/apache/cassandra/service/Rebuild.java index 421595bab637..b6e4d61d1bf4 100644 --- a/src/java/org/apache/cassandra/service/Rebuild.java +++ b/src/java/org/apache/cassandra/service/Rebuild.java @@ -44,11 +44,15 @@ import org.apache.cassandra.locator.Replica; import org.apache.cassandra.schema.ReplicationParams; import org.apache.cassandra.schema.Schema; +import org.apache.cassandra.service.accord.AccordService; import org.apache.cassandra.streaming.StreamOperation; +import org.apache.cassandra.streaming.StreamResultFuture; import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.tcm.ownership.DataPlacement; import org.apache.cassandra.tcm.ownership.DataPlacements; import org.apache.cassandra.tcm.ownership.MovementMap; +import org.apache.cassandra.utils.concurrent.Future; +import org.apache.cassandra.utils.concurrent.FutureCombiner; import org.apache.cassandra.utils.concurrent.UncheckedInterruptedException; import static org.apache.cassandra.utils.FBUtilities.getBroadcastAddressAndPort; @@ -109,7 +113,8 @@ public static void rebuild(String sourceDc, String keyspace, String tokens, Stri false, DatabaseDescriptor.getStreamingConnectionsPerHost(), rebuildMovements, - null); + null, + true); if (sourceDc != null) streamer.addSourceFilter(new RangeStreamer.SingleDatacenterFilter(metadata.locator, sourceDc)); @@ -153,7 +158,12 @@ else if (tokens == null) streamer.addKeyspaceToFetch(keyspace); } - streamer.fetchAsync().get(); + StreamResultFuture resultFuture = streamer.fetchAsync(); + // wait for result + Future accordReady = AccordService.instance().epochReady(metadata.epoch); + Future ready = FutureCombiner.allOf(resultFuture, accordReady); + // wait for result + ready.get(); } catch (InterruptedException e) { diff --git a/src/java/org/apache/cassandra/service/RetryStrategy.java b/src/java/org/apache/cassandra/service/RetryStrategy.java new file mode 100644 index 000000000000..91c87c520d02 --- /dev/null +++ b/src/java/org/apache/cassandra/service/RetryStrategy.java @@ -0,0 +1,326 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service; + +import com.google.common.annotations.VisibleForTesting; + +import accord.utils.Invariants; +import accord.utils.RandomSource; +import org.apache.cassandra.service.TimeoutStrategy.LatencySourceFactory; +import org.apache.cassandra.service.TimeoutStrategy.Wait; + +import java.util.concurrent.ThreadLocalRandom; +import java.util.concurrent.TimeUnit; +import java.util.function.DoubleSupplier; +import java.util.function.LongBinaryOperator; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import javax.annotation.Nonnull; +import javax.annotation.Nullable; + +import static java.util.concurrent.TimeUnit.*; +import static org.apache.cassandra.service.TimeoutStrategy.parseInMicros; +import static org.apache.cassandra.utils.Clock.Global.nanoTime; + +/** + *

    A strategy for making retry timing decisions for operations. + * The strategy is defined by six factors:

      + *
    • {@link #minMinMicros} + *
    • {@link #maxMaxMicros} + *
    • {@link #min} + *
    • {@link #max} + *
    • {@link #waitRandomizer} + *
    • {@link #maxAttempts} + *
    + * + *

    The first two represent the absolute upper and lower bound times we are permitted to produce as constants

    + *

    The next two represent time periods, and may be defined dynamically based on a simple calculation over:

      + *
    • {@code pX()} recent experienced latency distribution for successful operations, + * e.g. {@code p50(rw)} the maximum of read and write median latencies, + * {@code p999(r)} the 99.9th percentile of read latencies + *
    • {@code attempts} the number of failed attempts made by the operation so far + *
    • {@code constant} a user provided floating point constant + *
    + * + *

    Their calculation may take any of these forms + *

  • constant {@code $constant$[mu]s} + *
  • dynamic constant {@code pX() * constant} + *
  • dynamic linear {@code pX() * constant * attempts} + *
  • dynamic exponential {@code pX() * constant ^ attempts} + * + * e.g. + *
  • {@code 10ms <= p50(rw)*0.66...p99(rw)} + *
  • {@code 10ms <= p95(rw)*1.8^attempts <= 100ms} + *
  • {@code 5ms <= p50(rw)*0.5} + * + *

    These calculations are put together to construct a range from which we draw a random number. + * The period we wait for {@code X} will be drawn so that {@code minMin <= min <= max <= maxMax}. + * + *

    With the constraint that {@code max} must be {@code spread} greater than {@code min}, + * but no greater than its expression-defined maximum. {@code max} will be increased up until + * this point, after which {@code min} will be decreased until this gap is imposed. + * + *

    The {@link #waitRandomizer} property specifies the manner in which a random value is drawn from the range. + * It is defined using one of the following specifiers: + *

  • uniform + *
  • exp($power$) or exponential($power$) + *
  • qexp($power$) or qexponential($power$) or quantizedexponential($power$) + * + * The uniform specifier is self-explanatory, selecting all values in the range with equal probability. + * The exponential specifier draws values towards the end of the range with higher probability, raising + * a floating point number in the range [0..1.0) to the power provided, and translating the resulting value + * to a uniform value in the range. + * The quantized exponential specifier partitions the range into {@code attempts} buckets, then applies the pure + * exponential approach to draw values from [0..attempts), before drawing a uniform value from the corresponding bucket + */ +public class RetryStrategy implements WaitStrategy +{ + private static final Pattern RANDOMIZER = Pattern.compile( + "uniform|exp(onential)?[(](?[0-9.]+)[)]|q(uantized)?exp(onential)?[(](?[0-9.]+)[)]"); + + static final Pattern PARSE = Pattern.compile( + "(\\s*(?0|[0-9]+[mu]?s)\\s*<=)?" + + "(\\s*(?[^=]+)([(]?\\s*<=\\s*(?0|[0-9]+[mu]?s)\\s*[)]?)?\\s*[.]{3})?" + + "(\\s*(?[^=]+))" + + "(\\s*<=\\s*(?0|[0-9]+[mu]?s))?"); + + public static final WaitRandomizerFactory randomizers = new WaitRandomizerFactory(){}; + + public interface WaitRandomizer + { + long wait(long min, long max, int attempts); + } + + public static WaitRandomizerFactory randomizers(RandomSource random) + { + return new WaitRandomizerFactory() + { + @Override public LongBinaryOperator uniformLongSupplier() { return random::nextLong; } + @Override public DoubleSupplier uniformDoubleSupplier() { return random::nextDouble; } + }; + } + + public interface WaitRandomizerFactory + { + default LongBinaryOperator uniformLongSupplier() { return (min, max) -> ThreadLocalRandom.current().nextLong(min, max); } // DO NOT USE METHOD HANDLES (want to fetch afresh each time) + default DoubleSupplier uniformDoubleSupplier() { return () -> ThreadLocalRandom.current().nextDouble(); } + + default WaitRandomizer uniform() { return new Uniform(uniformLongSupplier()); } + default WaitRandomizer exponential(double power) { return new Exponential(uniformLongSupplier(), uniformDoubleSupplier(), power); } + default WaitRandomizer quantizedExponential(double power) { return new QuantizedExponential(uniformLongSupplier(), uniformDoubleSupplier(), power); } + + class Uniform implements WaitRandomizer + { + final LongBinaryOperator uniformLong; + + public Uniform(LongBinaryOperator uniformLong) + { + this.uniformLong = uniformLong; + } + + @Override + public long wait(long min, long max, int attempts) + { + return uniformLong.applyAsLong(min, max); + } + } + + abstract class AbstractExponential implements WaitRandomizer + { + final LongBinaryOperator uniformLong; + final DoubleSupplier uniformDouble; + final double power; + + public AbstractExponential(LongBinaryOperator uniformLong, DoubleSupplier uniformDouble, double power) + { + this.uniformLong = uniformLong; + this.uniformDouble = uniformDouble; + this.power = power; + } + } + + class Exponential extends AbstractExponential + { + public Exponential(LongBinaryOperator uniformLong, DoubleSupplier uniformDouble, double power) + { + super(uniformLong, uniformDouble, power); + } + + @Override + public long wait(long min, long max, int attempts) + { + if (attempts == 1) + return uniformLong.applyAsLong(min, max); + + double p = uniformDouble.getAsDouble(); + long delta = max - min; + delta *= Math.pow(p, power); + return max - delta; + } + } + + class QuantizedExponential extends AbstractExponential + { + public QuantizedExponential(LongBinaryOperator uniformLong, DoubleSupplier uniformDouble, double power) + { + super(uniformLong, uniformDouble, power); + } + + @Override + public long wait(long min, long max, int attempts) + { + long quanta = (max - min) / attempts; + if (attempts == 1 || quanta == 0) + return uniformLong.applyAsLong(min, max); + + double p = uniformDouble.getAsDouble(); + int base = (int) (attempts * Math.pow(p, power)); + return max - ThreadLocalRandom.current().nextLong(quanta * base, quanta * (base + 1)); + } + } + } + + public final WaitRandomizer waitRandomizer; + public final long minMinMicros, maxMinMicros, maxMaxMicros; + public final @Nullable Wait min; + public final @Nonnull Wait max; + public final int maxAttempts; + + protected RetryStrategy(WaitRandomizer waitRandomizer, long minMinMicros, Wait min, long maxMinMicros, Wait max, long maxMaxMicros, int retries) + { + this.waitRandomizer = waitRandomizer; + this.minMinMicros = minMinMicros; + this.min = min; + this.maxMinMicros = maxMinMicros; + this.max = max; + this.maxMaxMicros = maxMaxMicros; + this.maxAttempts = retries == Integer.MAX_VALUE ? Integer.MAX_VALUE : retries + 1; + Invariants.require(maxAttempts >= 1); + } + + public long computeWaitUntil(int attempts) + { + long wait = computeWait(attempts, NANOSECONDS); + if (wait < 0) + return -1; + return nanoTime() + wait; + } + + public long computeWait(int attempt, TimeUnit units) + { + if (attempt > maxAttempts) + return -1; + + long result; + if (min == null) + { + result = max.getMicros(attempt); + } + else + { + long min = this.min.getMicros(attempt); + if (min > maxMinMicros) + min = maxMinMicros; + long max = this.max.getMicros(attempt); + result = min >= max ? min : waitRandomizer.wait(min, max, attempt); + } + + if (result > maxMaxMicros) result = maxMaxMicros; + if (result < minMinMicros) result = minMinMicros; + return units.convert(result, MICROSECONDS); + } + + public static RetryStrategy parse(String spec, LatencySourceFactory latencies) + { + return parse(spec, latencies, null); + } + + public static RetryStrategy parse(String spec, LatencySourceFactory latencies, WaitRandomizer randomizer) + { + String original = spec; + int retries = Integer.MAX_VALUE; + int end = spec.length(); + { + int next; + while ((next = spec.lastIndexOf(',', end - 1)) >= 0) + { + int mid = spec.indexOf('=', next + 1); + if (mid <= next || mid >= end) + throw new IllegalArgumentException("Invalid modifier specification: '" + spec.substring(next, end) + "'; expecting '=' for value assignment"); + String key = spec.substring(next + 1, mid).trim(); + String value = spec.substring(mid + 1, end).trim(); + switch (key) + { + default: throw new IllegalArgumentException("Invalid modifier specification: unrecognised property '" + key + '\''); + case "retries": + retries = Integer.parseInt(value); + break; + case "rnd": + if (randomizer != null) + throw new IllegalArgumentException("Randomizer already specified, cannot re-specify: " + value); + randomizer = parseWaitRandomizer(value); + break; + } + end = next; + } + if (end != spec.length()) + spec = spec.substring(0, end); + } + + Matcher m = PARSE.matcher(spec); + if (!m.matches()) + throw new IllegalArgumentException("Invalid specification: '" + spec + "'; does not match " + PARSE); + + long minMin = parseInMicros(m.group("minmin"), 0); + long maxMax = parseInMicros(m.group("maxmax"), Long.MAX_VALUE); + Wait max = TimeoutStrategy.parseWait(m.group("max"), latencies); + String minSpec = m.group("min"); + Wait min = minSpec == null ? null : TimeoutStrategy.parseWait(minSpec, latencies); + if (min == null && randomizer != null) + throw new IllegalArgumentException("Invalid to specify randomiser when no range specified: '" + original + '\''); + if (min instanceof Wait.Constant && minMin != 0) + throw new IllegalArgumentException("Invalid to specify an absolute minimum constant when the min bound is itself a constant: '" + original + '\''); + long maxMin = parseInMicros(m.group("maxmin"), Long.MAX_VALUE); + if (min instanceof Wait.Constant && maxMin != Long.MAX_VALUE) + throw new IllegalArgumentException("Invalid to specify an absolute max(min) constant when the min bound is itself a constant: '" + original + '\''); + if (max instanceof Wait.Constant && maxMax != Long.MAX_VALUE) + throw new IllegalArgumentException("Invalid to specify an absolute maximum constant when the max bound is itself a constant: '" + original + '\''); + if (randomizer == null) + randomizer = randomizers.uniform(); + return new RetryStrategy(randomizer, minMin, min, maxMin, max, maxMax, retries); + } + + @VisibleForTesting + protected static WaitRandomizer parseWaitRandomizer(String input) + { + Matcher m = RANDOMIZER.matcher(input); + if (!m.matches()) + throw new IllegalArgumentException(input + " does not match" + RANDOMIZER); + + String exp; + exp = m.group("exp"); + if (exp != null) + return randomizers.exponential(Double.parseDouble(exp)); + exp = m.group("qexp"); + if (exp != null) + return randomizers.quantizedExponential(Double.parseDouble(exp)); + return randomizers.uniform(); + } +} diff --git a/src/java/org/apache/cassandra/service/StorageProxy.java b/src/java/org/apache/cassandra/service/StorageProxy.java index f7219a4b580d..4d0857b40c47 100644 --- a/src/java/org/apache/cassandra/service/StorageProxy.java +++ b/src/java/org/apache/cassandra/service/StorageProxy.java @@ -22,8 +22,10 @@ import java.util.Arrays; import java.util.Collection; import java.util.Collections; +import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; +import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Objects; @@ -31,22 +33,27 @@ import java.util.Set; import java.util.UUID; import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.Future; import java.util.concurrent.ThreadLocalRandom; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; import java.util.function.Function; +import java.util.function.IntPredicate; import java.util.stream.Collectors; +import javax.annotation.Nonnull; +import javax.annotation.Nullable; -import com.google.common.base.Preconditions; import com.google.common.cache.CacheLoader; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableSet; import com.google.common.collect.Iterables; +import com.google.common.collect.Iterators; import com.google.common.util.concurrent.Uninterruptibles; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import accord.primitives.Txn; import org.apache.cassandra.batchlog.Batch; import org.apache.cassandra.batchlog.BatchlogManager; import org.apache.cassandra.concurrent.DebuggableTask.RunnableDebuggableTask; @@ -58,10 +65,12 @@ import org.apache.cassandra.db.ConsistencyLevel; import org.apache.cassandra.db.CounterMutation; import org.apache.cassandra.db.DecoratedKey; +import org.apache.cassandra.db.EmptyIterators; import org.apache.cassandra.db.IMutation; import org.apache.cassandra.db.Keyspace; import org.apache.cassandra.db.MessageParams; import org.apache.cassandra.db.Mutation; +import org.apache.cassandra.db.PartitionPosition; import org.apache.cassandra.db.PartitionRangeReadCommand; import org.apache.cassandra.db.ReadCommand; import org.apache.cassandra.db.ReadExecutionController; @@ -78,9 +87,11 @@ import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator; import org.apache.cassandra.db.rows.RowIterator; import org.apache.cassandra.db.view.ViewUtils; +import org.apache.cassandra.dht.AbstractBounds; import org.apache.cassandra.dht.Token; import org.apache.cassandra.exceptions.CasWriteTimeoutException; import org.apache.cassandra.exceptions.CasWriteUnknownResultException; +import org.apache.cassandra.exceptions.CoordinatorBehindException; import org.apache.cassandra.exceptions.InvalidRequestException; import org.apache.cassandra.exceptions.IsBootstrappingException; import org.apache.cassandra.exceptions.OverloadedException; @@ -88,9 +99,10 @@ import org.apache.cassandra.exceptions.ReadAbortException; import org.apache.cassandra.exceptions.ReadFailureException; import org.apache.cassandra.exceptions.ReadTimeoutException; +import org.apache.cassandra.exceptions.RequestFailure; import org.apache.cassandra.exceptions.RequestFailureException; -import org.apache.cassandra.exceptions.RequestFailureReason; import org.apache.cassandra.exceptions.RequestTimeoutException; +import org.apache.cassandra.exceptions.RetryOnDifferentSystemException; import org.apache.cassandra.exceptions.UnavailableException; import org.apache.cassandra.exceptions.WriteFailureException; import org.apache.cassandra.exceptions.WriteTimeoutException; @@ -122,6 +134,25 @@ import org.apache.cassandra.schema.SchemaConstants; import org.apache.cassandra.schema.TableId; import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.schema.TableParams; +import org.apache.cassandra.service.accord.AccordService; +import org.apache.cassandra.service.accord.IAccordService; +import org.apache.cassandra.service.accord.IAccordService.IAccordResult; +import org.apache.cassandra.service.accord.serializers.TableMetadatas; +import org.apache.cassandra.service.accord.serializers.TableMetadatasAndKeys; +import org.apache.cassandra.service.accord.txn.TxnData; +import org.apache.cassandra.service.accord.txn.TxnDataKeyValue; +import org.apache.cassandra.service.accord.txn.TxnDataValue; +import org.apache.cassandra.service.accord.txn.TxnQuery; +import org.apache.cassandra.service.accord.txn.TxnRangeReadResult; +import org.apache.cassandra.service.accord.txn.TxnRead; +import org.apache.cassandra.service.accord.txn.TxnResult; +import org.apache.cassandra.service.consensus.TransactionalMode; +import org.apache.cassandra.service.consensus.migration.ConsensusMigrationMutationHelper.SplitConsumer; +import org.apache.cassandra.service.consensus.migration.ConsensusMigrationMutationHelper.SplitMutations; +import org.apache.cassandra.service.consensus.migration.ConsensusRequestRouter; +import org.apache.cassandra.service.consensus.migration.ConsensusRequestRouter.SplitReads; +import org.apache.cassandra.service.consensus.migration.TransactionalMigrationFromMode; import org.apache.cassandra.service.paxos.Ballot; import org.apache.cassandra.service.paxos.Commit; import org.apache.cassandra.service.paxos.ContentionStrategy; @@ -131,6 +162,7 @@ import org.apache.cassandra.service.paxos.v1.ProposeCallback; import org.apache.cassandra.service.reads.AbstractReadExecutor; import org.apache.cassandra.service.reads.ReadCallback; +import org.apache.cassandra.service.reads.ReadCoordinator; import org.apache.cassandra.service.reads.range.RangeCommands; import org.apache.cassandra.service.reads.repair.ReadRepair; import org.apache.cassandra.tcm.ClusterMetadata; @@ -145,14 +177,19 @@ import org.apache.cassandra.utils.MonotonicClock; import org.apache.cassandra.utils.NoSpamLogger; import org.apache.cassandra.utils.Pair; +import org.apache.cassandra.utils.Throwables; import org.apache.cassandra.utils.TimeUUID; import org.apache.cassandra.utils.concurrent.CountDownLatch; +import org.apache.cassandra.utils.concurrent.Future; import org.apache.cassandra.utils.concurrent.UncheckedInterruptedException; +import static accord.primitives.Txn.Kind.Read; +import static com.google.common.base.Preconditions.checkNotNull; import static com.google.common.collect.Iterables.concat; import static java.util.concurrent.TimeUnit.MILLISECONDS; import static java.util.concurrent.TimeUnit.NANOSECONDS; import static org.apache.cassandra.db.ConsistencyLevel.SERIAL; +import static org.apache.cassandra.db.partitions.PartitionIterators.singletonIterator; import static org.apache.cassandra.metrics.ClientRequestsMetricsHolder.casReadMetrics; import static org.apache.cassandra.metrics.ClientRequestsMetricsHolder.casWriteMetrics; import static org.apache.cassandra.metrics.ClientRequestsMetricsHolder.readMetrics; @@ -170,6 +207,17 @@ import static org.apache.cassandra.net.Verb.SCHEMA_VERSION_REQ; import static org.apache.cassandra.net.Verb.TRUNCATE_REQ; import static org.apache.cassandra.service.BatchlogResponseHandler.BatchlogCleanup; +import static org.apache.cassandra.service.StorageProxy.ConsensusAttemptResult.RETRY_NEW_PROTOCOL; +import static org.apache.cassandra.service.StorageProxy.ConsensusAttemptResult.casResult; +import static org.apache.cassandra.service.StorageProxy.ConsensusAttemptResult.serialReadResult; +import static org.apache.cassandra.service.accord.txn.TxnResult.Kind.range_read; +import static org.apache.cassandra.service.accord.txn.TxnResult.Kind.retry_new_protocol; +import static org.apache.cassandra.service.consensus.migration.ConsensusMigrationMutationHelper.mutateWithAccordAsync; +import static org.apache.cassandra.service.consensus.migration.ConsensusMigrationMutationHelper.splitMutationsIntoAccordAndNormal; +import static org.apache.cassandra.service.consensus.migration.ConsensusRequestRouter.ConsensusRoutingDecision; +import static org.apache.cassandra.service.consensus.migration.ConsensusRequestRouter.getTableMetadata; +import static org.apache.cassandra.service.consensus.migration.ConsensusRequestRouter.shouldReadEphemerally; +import static org.apache.cassandra.service.consensus.migration.ConsensusRequestRouter.splitReadsIntoAccordAndNormal; import static org.apache.cassandra.service.paxos.Ballot.Flag.GLOBAL; import static org.apache.cassandra.service.paxos.Ballot.Flag.LOCAL; import static org.apache.cassandra.service.paxos.BallotGenerator.Global.nextBallot; @@ -179,6 +227,8 @@ import static org.apache.cassandra.utils.Clock.Global.nanoTime; import static org.apache.cassandra.utils.LocalizeString.toUpperCaseLocalized; import static org.apache.cassandra.utils.MonotonicClock.Global.approxTime; +import static org.apache.cassandra.utils.Throwables.getStackTraceAsToString; +import static org.apache.cassandra.utils.Throwables.unchecked; import static org.apache.cassandra.utils.TimeUUID.Generator.nextTimeUUID; import static org.apache.cassandra.utils.concurrent.CountDownLatch.newCountDownLatch; import static org.apache.commons.lang3.StringUtils.join; @@ -322,26 +372,64 @@ public static RowIterator cas(String keyspaceName, key, keyspaceName, cfName)); } - return (Paxos.useV2() || keyspaceName.equals(SchemaConstants.METADATA_KEYSPACE_NAME)) - ? Paxos.cas(key, request, consistencyForPaxos, consistencyForCommit, clientState) - : legacyCas(keyspaceName, cfName, key, request, consistencyForPaxos, consistencyForCommit, clientState, nowInSeconds, requestTime); - } - - public static RowIterator legacyCas(String keyspaceName, - String cfName, - DecoratedKey key, - CASRequest request, - ConsistencyLevel consistencyForPaxos, - ConsistencyLevel consistencyForCommit, - ClientState clientState, - long nowInSeconds, - Dispatcher.RequestTime requestTime) + ConsensusAttemptResult lastAttemptResult; + do + { + ClusterMetadata cm = ClusterMetadata.current(); + TableMetadata metadata = Schema.instance.validateTable(keyspaceName, cfName); + ConsensusRoutingDecision decision = consensusRouting(cm, metadata, key, consistencyForPaxos, requestTime, true); + switch (decision) + { + case paxosV2: + lastAttemptResult = Paxos.cas(key, + request, + consistencyForPaxos, + consistencyForCommit, + clientState, + requestTime); + break; + case paxosV1: + lastAttemptResult = legacyCas(metadata, + key, + request, + consistencyForPaxos, + consistencyForCommit, + clientState, + nowInSeconds, + requestTime); + break; + case accord: + Txn txn = request.toAccordTxn(cm, + consistencyForPaxos, + consistencyForCommit, + clientState, + nowInSeconds); + IAccordService accordService = AccordService.instance(); + TxnResult txnResult = accordService.coordinate(metadata.epoch.getEpoch(), + txn, + consistencyForPaxos, + requestTime); + lastAttemptResult = request.toCasResult(txnResult); + break; + default: + throw new IllegalStateException("Unsupported consensus " + decision); + } + } while (lastAttemptResult.shouldRetryOnNewConsensusProtocol); + return lastAttemptResult.casResult; + } + + private static ConsensusAttemptResult legacyCas(TableMetadata metadata, + DecoratedKey key, + CASRequest request, + ConsistencyLevel consistencyForPaxos, + ConsistencyLevel consistencyForCommit, + ClientState clientState, + long nowInSeconds, + Dispatcher.RequestTime requestTime) throws UnavailableException, IsBootstrappingException, RequestFailureException, RequestTimeoutException, InvalidRequestException { try { - TableMetadata metadata = Schema.instance.validateTable(keyspaceName, cfName); - Function> updateProposer = ballot -> { // read the current values and check they validate the conditions @@ -359,7 +447,7 @@ public static RowIterator legacyCas(String keyspaceName, { Tracing.trace("CAS precondition does not match current values {}", current); casWriteMetrics.conditionNotMet.inc(); - return Pair.create(PartitionUpdate.emptyUpdate(metadata, key), current.rowIterator()); + return Pair.create(PartitionUpdate.emptyUpdate(metadata, key), current.rowIterator(false)); } // Create the desired updates @@ -384,15 +472,14 @@ public static RowIterator legacyCas(String keyspaceName, return Pair.create(updates, null); }; - return doPaxos(metadata, - key, - consistencyForPaxos, - consistencyForCommit, - consistencyForCommit, - requestTime, - casWriteMetrics, - updateProposer); - + return casResult(doPaxos(metadata, + key, + consistencyForPaxos, + consistencyForCommit, + consistencyForCommit, + requestTime, + casWriteMetrics, + updateProposer)); } catch (CasWriteUnknownResultException e) { @@ -657,7 +744,7 @@ private static PaxosBallotAndContention beginAndRepairPaxos(Dispatcher.RequestTi // https://issues.apache.org/jira/browse/CASSANDRA-5062?focusedCommentId=13619810&page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel#comment-13619810) // Since we waited for quorum nodes, if some of them haven't seen the last commit (which may just be a timing issue, but may also // mean we lost messages), we pro-actively "repair" those nodes, and retry. - Iterable missingMRC = summary.replicasMissingMostRecentCommit(); + Iterable missingMRC = summary.replicasMissingMostRecentCommit(metadata); if (Iterables.size(missingMRC) > 0) { Tracing.trace("Repairing replicas that missed the most recent commit"); @@ -846,7 +933,7 @@ public void runMayThrow() { if (!(ex instanceof WriteTimeoutException)) logger.error("Failed to apply paxos commit locally : ", ex); - responseHandler.onFailure(FBUtilities.getBroadcastAddressAndPort(), RequestFailureReason.forException(ex)); + responseHandler.onFailure(FBUtilities.getBroadcastAddressAndPort(), RequestFailure.forException(ex)); } } @@ -1148,24 +1235,142 @@ public static void mutateWithTriggers(List mutations, } } - Collection augmented = TriggerExecutor.instance.execute(mutations); + List augmented = TriggerExecutor.instance.execute(mutations); - boolean updatesView = Keyspace.open(mutations.iterator().next().getKeyspaceName()) + String keyspaceName = mutations.iterator().next().getKeyspaceName(); + boolean updatesView = Keyspace.open(keyspaceName) .viewManager .updatesAffectView(mutations, true); - long size = IMutation.dataSize(mutations); + long size = IMutation.dataSize(augmented != null ? augmented : mutations); writeMetrics.mutationSize.update(size); writeMetricsForLevel(consistencyLevel).mutationSize.update(size); - - if (augmented != null) - mutateAtomically(augmented, consistencyLevel, updatesView, requestTime); + if (augmented != null || mutateAtomically || updatesView) + mutateAtomically(augmented != null ? augmented : (List)mutations, consistencyLevel, updatesView, requestTime); else + dispatchMutationsWithRetryOnDifferentSystem(mutations, consistencyLevel, requestTime); + } + + public static void dispatchMutationsWithRetryOnDifferentSystem(List mutations, ConsistencyLevel consistencyLevel, Dispatcher.RequestTime requestTime) + { + while (true) { - if (mutateAtomically || updatesView) - mutateAtomically((Collection) mutations, consistencyLevel, updatesView, requestTime); - else - mutate(mutations, consistencyLevel, requestTime); + ClusterMetadata cm = ClusterMetadata.current(); + try + { + SplitMutations splitMutations = splitMutationsIntoAccordAndNormal(cm, (List)mutations); + List accordMutations = splitMutations.accordMutations(); + IAccordResult accordResult = accordMutations != null ? mutateWithAccordAsync(cm, accordMutations, consistencyLevel, requestTime) : null; + List normalMutations = splitMutations.normalMutations(); + Tracing.trace("Split mutations into Accord {} and normal {}", accordMutations, normalMutations); + + Throwable failure = null; + try + { + if (normalMutations != null) + { + mutate(normalMutations, consistencyLevel, requestTime); + Tracing.trace("Successfully wrote normal mutations"); + } + } + catch (RetryOnDifferentSystemException e) + { + writeMetrics.retryDifferentSystem.mark(); + writeMetricsForLevel(consistencyLevel).retryDifferentSystem.mark(); + logger.debug("Retrying mutations on different system because some mutations were misrouted according to Cassandra"); + Tracing.trace("Got {} from normal mutations, will retry", e); + continue; + } + catch (CoordinatorBehindException e) + { + writeMetrics.retryCoordinatorBehind.mark(); + writeMetricsForLevel(consistencyLevel).retryCoordinatorBehind.mark(); + mutations.forEach(IMutation::clearCachedSerializationsForRetry); + logger.debug("Retrying mutations now that coordinator has caught up to cluster metadata"); + Tracing.trace("Got {} from normal mutations, will retry", e); + continue; + } + catch (Exception e) + { + failure = Throwables.merge(failure, e); + } + + // Check if the Accord mutations succeeded asynchronously + try + { + if (accordResult != null) + { + TxnResult.Kind kind = accordResult.awaitAndGet().kind(); + if (kind == retry_new_protocol && failure == null) + { + Tracing.trace("Accord returned retry new protocol"); + logger.debug("Retrying mutations on different system because some mutations were misrouted according to Accord"); + continue; + } + Tracing.trace("Successfully wrote Accord mutations"); + } + } + catch (Exception e) + { + failure = Throwables.merge(failure, e); + } + + if (failure != null) + throw unchecked(failure); + } + catch (Exception t) + { + // Unexpected error so it would be helpful to have details + Tracing.trace("{}", getStackTraceAsToString(t)); + throw t; + } + break; + } + } + + private static ConsistencyLevel consistencyLevelForBatchLog(ConsistencyLevel consistencyLevel, boolean requireQuorumForRemove) + { + // If we are requiring quorum nodes for removal, we upgrade consistency level to QUORUM unless we already + // require ALL, or EACH_QUORUM. This is so that *at least* QUORUM nodes see the update. + ConsistencyLevel batchConsistencyLevel = requireQuorumForRemove + ? ConsistencyLevel.QUORUM + : consistencyLevel; + + switch (consistencyLevel) + { + case ALL: + case EACH_QUORUM: + batchConsistencyLevel = consistencyLevel; + } + return batchConsistencyLevel; + } + + private static void doFallibleWriteWithMetricTracking(Runnable r, ConsistencyLevel consistencyLevel) + { + try + { + r.run(); + } + catch (UnavailableException e) + { + writeMetrics.unavailables.mark(); + writeMetricsForLevel(consistencyLevel).unavailables.mark(); + Tracing.trace("Unavailable"); + throw e; + } + catch (WriteTimeoutException e) + { + writeMetrics.timeouts.mark(); + writeMetricsForLevel(consistencyLevel).timeouts.mark(); + Tracing.trace("Write timeout; received {} of {} required replies", e.received, e.blockFor); + throw e; + } + catch (WriteFailureException e) + { + writeMetrics.failures.mark(); + writeMetricsForLevel(consistencyLevel).failures.mark(); + Tracing.trace("Write failure; received {} of {} required replies", e.received, e.blockFor); + throw e; } } @@ -1176,92 +1381,170 @@ public static void mutateWithTriggers(List mutations, * After: remove the batchlog entry (after writing hints for the batch rows, if necessary). * * @param mutations the Mutations to be applied across the replicas - * @param consistency_level the consistency level for the operation + * @param consistencyLevel the consistency level for the operation * @param requireQuorumForRemove at least a quorum of nodes will see update before deleting batchlog * @param requestTime object holding times when request got enqueued and started execution */ - public static void mutateAtomically(Collection mutations, - ConsistencyLevel consistency_level, + public static void mutateAtomically(List mutations, + ConsistencyLevel consistencyLevel, boolean requireQuorumForRemove, Dispatcher.RequestTime requestTime) throws UnavailableException, OverloadedException, WriteTimeoutException { Tracing.trace("Determining replicas for atomic batch"); long startTime = nanoTime(); - - List wrappers = new ArrayList<>(mutations.size()); + boolean attributeNonAccordLatency = true; + long nonAccordEndTime = -1; if (mutations.stream().anyMatch(mutation -> Keyspace.open(mutation.getKeyspaceName()).getReplicationStrategy().hasTransientReplicas())) throw new AssertionError("Logged batches are unsupported with transient replication"); try { + ConsistencyLevel batchConsistencyLevel = consistencyLevelForBatchLog(consistencyLevel, requireQuorumForRemove); + // This can't be updated for each iteration because cleanup has to go to the correct replicas which is where the batchlog is originally written + ReplicaPlan.ForWrite batchlogReplicaPlan = ReplicaPlans.forBatchlogWrite(ClusterMetadata.current(), batchConsistencyLevel == ConsistencyLevel.ANY); + final TimeUUID batchUUID = nextTimeUUID(); + boolean wroteToBatchLog = false; + while (true) + { + ClusterMetadata cm = ClusterMetadata.current(); + // In case we hit an error in before/during splitting + attributeNonAccordLatency = true; + List wrappers = new ArrayList<>(mutations.size()); + List accordMutations = new ArrayList<>(mutations.size()); + BatchlogCleanup cleanup = new BatchlogCleanup(() -> asyncRemoveFromBatchlog(batchlogReplicaPlan, batchUUID, requestTime)); - // If we are requiring quorum nodes for removal, we upgrade consistency level to QUORUM unless we already - // require ALL, or EACH_QUORUM. This is so that *at least* QUORUM nodes see the update. - ConsistencyLevel batchConsistencyLevel = requireQuorumForRemove - ? ConsistencyLevel.QUORUM - : consistency_level; + // add a handler for each mutation that will not be written on Accord - includes checking availability, but doesn't initiate any writes, yet + SplitConsumer splitConsumer = (accordMutation, normalMutation, originalMutations, mutationIndex) -> { + Mutation eitherMutation = normalMutation != null ? normalMutation : accordMutation; + Keyspace keyspace = Keyspace.open(eitherMutation.getKeyspaceName()); + Token tk = eitherMutation.key().getToken(); - switch (consistency_level) - { - case ALL: - case EACH_QUORUM: - batchConsistencyLevel = consistency_level; - } + if (accordMutation != null) + accordMutations.add(accordMutation); - ReplicaPlan.ForWrite replicaPlan = ReplicaPlans.forBatchlogWrite(batchConsistencyLevel == ConsistencyLevel.ANY); + if (normalMutation == null) + return; - final TimeUUID batchUUID = nextTimeUUID(); - BatchlogCleanup cleanup = new BatchlogCleanup(mutations.size(), - () -> asyncRemoveFromBatchlog(replicaPlan, batchUUID, requestTime)); + // Always construct the replica plan to check availability + ReplicaPlan.ForWrite dataReplicaPlan = ReplicaPlans.forWrite(cm, keyspace, consistencyLevel, tk, ReplicaPlans.writeNormal); - // add a handler for each mutation - includes checking availability, but doesn't initiate any writes, yet - for (Mutation mutation : mutations) - { - WriteResponseHandlerWrapper wrapper = wrapBatchResponseHandler(mutation, - consistency_level, - batchConsistencyLevel, - WriteType.BATCH, - cleanup, - requestTime); - // exit early if we can't fulfill the CL at this time. - wrappers.add(wrapper); - } + if (dataReplicaPlan.lookup(FBUtilities.getBroadcastAddressAndPort()) != null) + writeMetrics.localRequests.mark(); + else + writeMetrics.remoteRequests.mark(); + + WriteResponseHandlerWrapper wrapper = wrapBatchResponseHandler(normalMutation, + dataReplicaPlan, + batchConsistencyLevel, + WriteType.BATCH, + cleanup, + requestTime); + wrappers.add(wrapper); + }; + splitMutationsIntoAccordAndNormal(cm, mutations, splitConsumer); + attributeNonAccordLatency = !wrappers.isEmpty(); + cleanup.setMutationsWaitingFor(wrappers.size() + (accordMutations.isEmpty() ? 0 : 1)); + Tracing.trace("Split batch into Accord {} and normal {}", accordMutations, wrappers); + + // If the entire batch can execute on Accord then we can skip the batch log entirely + // Write to the batch log first in case it fails so we don't end up with Accord applying + // part of the batch independently + if (!wrappers.isEmpty() && !wroteToBatchLog) + { + // write to the batchlog, including writes that will be routed to Accord to preserve the behavior + // of the batch log where if part of a batch is visible then eventually the entire batch is visible. + // If the Accord routed mutations depend on the Accord txn succeeding then it is no longer consistent + // with the mutations delivered by the batch log since an unacknowledged Accord txn won't be retried + // unless those mutations are also written to the batch log + // Only write to the log once and reuse the batchUUID for every attempt to route the mutations correctly + doFallibleWriteWithMetricTracking(() -> syncWriteToBatchlog(mutations, batchlogReplicaPlan, batchUUID, requestTime), consistencyLevel); + Tracing.trace("Successfully wrote to batchlog"); + wroteToBatchLog = true; + } - // write to the batchlog - syncWriteToBatchlog(mutations, replicaPlan, batchUUID, requestTime); + // Start Accord executing so it executes while the mutations are synchronously applied + IAccordResult accordResult = !accordMutations.isEmpty() ? mutateWithAccordAsync(cm, accordMutations, consistencyLevel, requestTime) : null; - // now actually perform the writes and wait for them to complete - syncWriteBatchedMutations(wrappers, Stage.MUTATION, requestTime); - } - catch (UnavailableException e) - { - writeMetrics.unavailables.mark(); - writeMetricsForLevel(consistency_level).unavailables.mark(); - Tracing.trace("Unavailable"); - throw e; + Throwable failure = null; + try + { + // now actually perform the writes and wait for them to complete + if (!wrappers.isEmpty()) + { + doFallibleWriteWithMetricTracking(() -> syncWriteBatchedMutations(wrappers, Stage.MUTATION, requestTime), consistencyLevel); + Tracing.trace("Successfully wrote normal mutations"); + } + } + catch (RetryOnDifferentSystemException e) + { + writeMetrics.retryDifferentSystem.mark(); + writeMetricsForLevel(consistencyLevel).retryDifferentSystem.mark(); + logger.debug("Retrying batch txn on different system because some mutations were misrouted"); + Tracing.trace("Got {} from normal mutations, will retry", e); + continue; + } + catch (CoordinatorBehindException e) + { + writeMetrics.retryCoordinatorBehind.mark(); + writeMetricsForLevel(consistencyLevel).retryCoordinatorBehind.mark(); + mutations.forEach(IMutation::clearCachedSerializationsForRetry); + logger.debug("Retrying batch now that coordinator has caught up to cluster metadata"); + Tracing.trace("Got {} from normal mutations, will retry", e); + continue; + } + catch (Exception e) + { + failure = Throwables.merge(failure, e); + } + finally + { + // Try to exclude most of the Accord time + nonAccordEndTime = nanoTime(); + } + + // Check if the Accord mutations succeeded asynchronously + try + { + // It's notable here that the Accord portion of the batch will not be hinted + // while the regular mutations are hinted on failure and also going to be replayed later from + // the batch log. It wouldn't be difficult to add hinting here, but it does seem redundant with + // the batch log. + if (accordResult != null) + { + TxnResult.Kind kind = accordResult.awaitAndGet().kind(); + if (kind == retry_new_protocol && failure == null) + continue; + Tracing.trace("Successfully wrote Accord mutations"); + cleanup.ackMutation(); + } + } + catch (Exception e) + { + failure = Throwables.merge(failure, e); + } + if (failure != null) + throw unchecked(failure); + break; + } } - catch (WriteTimeoutException e) + catch (Exception t) { - writeMetrics.timeouts.mark(); - writeMetricsForLevel(consistency_level).timeouts.mark(); - Tracing.trace("Write timeout; received {} of {} required replies", e.received, e.blockFor); - throw e; - } - catch (WriteFailureException e) - { - writeMetrics.failures.mark(); - writeMetricsForLevel(consistency_level).failures.mark(); - Tracing.trace("Write failure; received {} of {} required replies", e.received, e.blockFor); - throw e; + // Unexpected error so it would be helpful to have details + Tracing.trace("{}", getStackTraceAsToString(t)); + throw t; } finally { - long latency = nanoTime() - startTime; - writeMetrics.addNano(latency); - writeMetricsForLevel(consistency_level).addNano(latency); - updateCoordinatorWriteLatencyTableMetric(mutations, latency); + if (attributeNonAccordLatency) + { + // On the exception path nonAccordEndTime will be -1 + long latency = nonAccordEndTime != -1 ? nonAccordEndTime : nanoTime() - startTime; + writeMetrics.addNano(latency); + writeMetricsForLevel(consistencyLevel).addNano(latency); + updateCoordinatorWriteLatencyTableMetric(mutations, latency); + } } } @@ -1358,12 +1641,12 @@ private static void asyncWriteBatchedMutations(List } catch (OverloadedException | WriteTimeoutException e) { - wrapper.handler.onFailure(FBUtilities.getBroadcastAddressAndPort(), RequestFailureReason.forException(e)); + wrapper.handler.onFailure(FBUtilities.getBroadcastAddressAndPort(), RequestFailure.forException(e)); } } } - private static void syncWriteBatchedMutations(List wrappers, Stage stage, Dispatcher.RequestTime requestTime) + private static void syncWriteBatchedMutations(Iterable wrappers, Stage stage, Dispatcher.RequestTime requestTime) throws WriteTimeoutException, OverloadedException { String localDataCenter = DatabaseDescriptor.getLocator().local().datacenter; @@ -1421,22 +1704,12 @@ public static AbstractWriteResponseHandler performWrite(IMutation mut // same as performWrites except does not initiate writes (but does perform availability checks). private static WriteResponseHandlerWrapper wrapBatchResponseHandler(Mutation mutation, - ConsistencyLevel consistencyLevel, + ReplicaPlan.ForWrite replicaPlan, ConsistencyLevel batchConsistencyLevel, WriteType writeType, BatchlogResponseHandler.BatchlogCleanup cleanup, Dispatcher.RequestTime requestTime) { - Keyspace keyspace = Keyspace.open(mutation.getKeyspaceName()); - Token tk = mutation.key().getToken(); - - ReplicaPlan.ForWrite replicaPlan = ReplicaPlans.forWrite(keyspace, consistencyLevel, tk, ReplicaPlans.writeNormal); - - if (replicaPlan.lookup(FBUtilities.getBroadcastAddressAndPort()) != null) - writeMetrics.localRequests.mark(); - else - writeMetrics.remoteRequests.mark(); - AbstractReplicationStrategy rs = replicaPlan.replicationStrategy(); AbstractWriteResponseHandler writeHandler = rs.getWriteResponseHandler(replicaPlan, null, writeType, mutation, requestTime); BatchlogResponseHandler batchHandler = new BatchlogResponseHandler<>(writeHandler, batchConsistencyLevel.blockFor(rs), cleanup, requestTime); @@ -1465,13 +1738,17 @@ private static WriteResponseHandlerWrapper wrapViewBatchResponseHandler(Mutation } // used by atomic_batch_mutate to decouple availability check from the write itself, caches consistency level and endpoints. - private static class WriteResponseHandlerWrapper + public static class WriteResponseHandlerWrapper { - final BatchlogResponseHandler handler; - final Mutation mutation; + @Nonnull + public final BatchlogResponseHandler handler; + @Nonnull + public final Mutation mutation; - WriteResponseHandlerWrapper(BatchlogResponseHandler handler, Mutation mutation) + public WriteResponseHandlerWrapper(@Nonnull BatchlogResponseHandler handler, @Nonnull Mutation mutation) { + checkNotNull(handler); + checkNotNull(mutation); this.handler = handler; this.mutation = mutation; } @@ -1587,7 +1864,7 @@ public static void sendToHintedReplicas(final Mutation mutation, if (insertLocal) { - Preconditions.checkNotNull(localReplica); + checkNotNull(localReplica); performLocally(stage, localReplica, mutation::apply, responseHandler, mutation, requestTime); } @@ -1706,9 +1983,9 @@ public void runMayThrow() } catch (Exception ex) { - if (!(ex instanceof WriteTimeoutException)) + if (!(ex instanceof WriteTimeoutException) && !(ex instanceof RetryOnDifferentSystemException)) logger.error("Failed to apply mutation locally : ", ex); - handler.onFailure(FBUtilities.getBroadcastAddressAndPort(), RequestFailureReason.forException(ex)); + handler.onFailure(FBUtilities.getBroadcastAddressAndPort(), RequestFailure.forException(ex)); } } @@ -1850,8 +2127,8 @@ public static PartitionIterator read(SinglePartitionReadCommand.Group group, Con } return consistencyLevel.isSerialConsistency() - ? readWithPaxos(group, consistencyLevel, requestTime) - : readRegular(group, consistencyLevel, requestTime); + ? readWithConsensus(group, consistencyLevel, requestTime) + : dispatchReadWithRetryOnDifferentSystem(group, consistencyLevel, ReadCoordinator.DEFAULT, requestTime); } public static boolean hasJoined() @@ -1866,15 +2143,148 @@ public static boolean hasJoined() return metadata.myNodeState() == NodeState.JOINED; } - private static PartitionIterator readWithPaxos(SinglePartitionReadCommand.Group group, ConsistencyLevel consistencyLevel, Dispatcher.RequestTime requestTime) + private static ConsensusRoutingDecision consensusRouting(ClusterMetadata cm, TableMetadata metadata, DecoratedKey partitionKey, ConsistencyLevel consistencyLevel, Dispatcher.RequestTime requestTime, boolean isForWrite) + { + if (metadata.keyspace.equals(SchemaConstants.METADATA_KEYSPACE_NAME)) + return ConsensusRoutingDecision.paxosV2; + return ConsensusRequestRouter.instance.routeAndMaybeMigrate(cm, + partitionKey, + metadata.id, + consistencyLevel, + requestTime, + DatabaseDescriptor.getCasContentionTimeout(NANOSECONDS), + isForWrite); + } + + private static PartitionIterator readWithConsensus(SinglePartitionReadCommand.Group group, ConsistencyLevel consistencyLevel, Dispatcher.RequestTime requestTime) throws InvalidRequestException, UnavailableException, ReadFailureException, ReadTimeoutException { - return (Paxos.useV2() || group.metadata().keyspace.equals(SchemaConstants.METADATA_KEYSPACE_NAME)) - ? Paxos.read(group, consistencyLevel, requestTime) - : legacyReadWithPaxos(group, consistencyLevel, requestTime); + ConsensusAttemptResult lastResult; + do + { + ClusterMetadata cm = ClusterMetadata.current(); + SinglePartitionReadCommand command = group.queries.get(0); + ConsensusRoutingDecision decision = consensusRouting(cm, group.metadata(), command.partitionKey(), consistencyLevel, requestTime, false); + switch (decision) + { + case paxosV2: + lastResult = Paxos.read(group, consistencyLevel, requestTime); + break; + case paxosV1: + lastResult = legacyReadWithPaxos(group, consistencyLevel, requestTime); + break; + case accord: + lastResult = readWithAccord(cm, group, consistencyLevel, requestTime); + break; + default: + throw new IllegalStateException("Unsupported consensus " + decision); + } + } while (lastResult.shouldRetryOnNewConsensusProtocol); + return lastResult.serialReadResult; + } + + public static ConsistencyLevel consistencyLevelForAccordRead(ClusterMetadata cm, TableId tableId, SinglePartitionReadCommand.Group group, @Nullable ConsistencyLevel consistencyLevel) + { + // Null means no specific consistency behavior is required from Accord, it's functionally similar to + // reading at ONE if you are reading data that wasn't written via Accord + if (consistencyLevel == null) + return null; + + TableParams tableParams = getTableMetadata(cm, tableId).params; + TransactionalMode mode = tableParams.transactionalMode; + TransactionalMigrationFromMode migrationFromMode = tableParams.transactionalMigrationFrom; + for (SinglePartitionReadCommand command : group.queries) + { + // readCLForMode should return either null or the supplied consistency level + // in which case we will read everything at that CL since Accord doesn't support per table + // read consistency + ConsistencyLevel readCL = mode.readCLForMode(migrationFromMode, consistencyLevel, cm, tableId, command.partitionKey().getToken()); + if (readCL != null) + return readCL; + } + return null; } - private static PartitionIterator legacyReadWithPaxos(SinglePartitionReadCommand.Group group, ConsistencyLevel consistencyLevel, Dispatcher.RequestTime requestTime) + public static IAccordResult readWithAccord(ClusterMetadata cm, PartitionRangeReadCommand command, AbstractBounds range, ConsistencyLevel consistencyLevel, Dispatcher.RequestTime requestTime) + { + if (consistencyLevel != null && !IAccordService.SUPPORTED_READ_CONSISTENCY_LEVELS.contains(consistencyLevel)) + throw new InvalidRequestException(consistencyLevel + " is not supported by Accord"); + + TableMetadata tableMetadata = getTableMetadata(cm, command.metadata().id); + TableParams tableParams = tableMetadata.params; + consistencyLevel = tableParams.transactionalMode.readCLForMode(tableParams.transactionalMigrationFrom, consistencyLevel, cm, tableMetadata.id, command.dataRange().keyRange()); + TableMetadatas tables = TableMetadatas.of(tableMetadata); + TxnRead read = TxnRead.createRangeRead(tables, command, range, consistencyLevel); + Txn.Kind kind = shouldReadEphemerally(read.keys(), tableParams, Read); + TableMetadatasAndKeys tablesAndKeys = new TableMetadatasAndKeys(tables, read.keys()); + Txn txn = new Txn.InMemory(kind, read.keys(), read, TxnQuery.RANGE_QUERY, null, tablesAndKeys); + IAccordService accordService = AccordService.instance(); + return accordService.coordinateAsync(tableMetadata.epoch.getEpoch(), txn, consistencyLevel, requestTime); + } + + private static IAccordResult readWithAccordAsync(ClusterMetadata cm, SinglePartitionReadCommand.Group group, ConsistencyLevel consistencyLevel, Dispatcher.RequestTime requestTime) + { + if (consistencyLevel != null && !IAccordService.SUPPORTED_READ_CONSISTENCY_LEVELS.contains(consistencyLevel)) + throw new InvalidRequestException(consistencyLevel + " is not supported by Accord"); + + // If the non-SERIAL write strategy is sending all writes through Accord there is no need to use the supplied consistency + // level since Accord will manage reading safely + TableMetadata tableMetadata = getTableMetadata(cm, group.metadata().id); + TableMetadatas tables = TableMetadatas.of(tableMetadata); + TableParams tableParams = tableMetadata.params; + TableMetadatasAndKeys.KeyCollector keyCollector = new TableMetadatasAndKeys.KeyCollector(tables); + consistencyLevel = consistencyLevelForAccordRead(cm, tableMetadata.id, group, consistencyLevel); + TxnRead read = TxnRead.createSerialRead(group.queries, consistencyLevel, keyCollector); + Txn.Kind kind = shouldReadEphemerally(read.keys(), tableParams, Read); + Txn txn = new Txn.InMemory(kind, read.keys(), read, TxnQuery.ALL, null, keyCollector.buildTablesAndKeys()); + return AccordService.instance().coordinateAsync(tableMetadata.epoch.getEpoch(), txn, consistencyLevel, requestTime); + } + + private static ConsensusAttemptResult readWithAccord(ClusterMetadata cm, SinglePartitionReadCommand.Group group, ConsistencyLevel consistencyLevel, Dispatcher.RequestTime requestTime) + { + IAccordResult accordResult = readWithAccordAsync(cm, group, consistencyLevel, requestTime); + return getConsensusAttemptResultFromAsyncTxnResult(accordResult, group.queries.size(), index -> group.queries.get(index).isReversed()); + } + + /* + * Used for both the SERIAL and non-SERIAL read path into Accord + */ + public static ConsensusAttemptResult getConsensusAttemptResultFromAsyncTxnResult(IAccordResult accordResult, int numQueries, IntPredicate isQueryReversed) + { + TxnResult txnResult = accordResult.awaitAndGet(); + // TODO (required): Converge on a single approach to RETRY_NEW_PROTOCOL, this works for now because reads don't support it anyways + if (txnResult.kind() == retry_new_protocol) + return RETRY_NEW_PROTOCOL; + if (txnResult.kind() == range_read) + return serialReadResult(((TxnRangeReadResult)txnResult).partitions.get()); + TxnData data = (TxnData) txnResult; + + if (data.isEmpty()) + { + return serialReadResult(EmptyIterators.partition()); + } + else if (data.size() == 1) + { + TxnDataKeyValue value = ((TxnDataKeyValue)data.values().iterator().next()); + return serialReadResult(singletonIterator(value.rowIterator(isQueryReversed.test(0)))); + } + else + { + // TODO (review): 95% sure this isn't actually needed and the consumer is going consume these by DecoratedKey not iteration order, but the non-transactional path does preserve the order of the iterators + List partitionIterators = new ArrayList<>(numQueries); + for (int i = 0; i < numQueries; i++) + partitionIterators.add(null); + for (Map.Entry e : data.entrySet()) + { + int queryIndex = e.getKey(); + TxnDataKeyValue value = ((TxnDataKeyValue)e.getValue()); + partitionIterators.set(queryIndex, singletonIterator(value.rowIterator(isQueryReversed.test(queryIndex)))); + } + return serialReadResult(partitionIterators.size() == 1 ? partitionIterators.get(0) : PartitionIterators.concat(partitionIterators)); + } + } + + private static ConsensusAttemptResult legacyReadWithPaxos(SinglePartitionReadCommand.Group group, ConsistencyLevel consistencyLevel, Dispatcher.RequestTime requestTime) throws InvalidRequestException, UnavailableException, ReadFailureException, ReadTimeoutException { long start = nanoTime(); @@ -1887,7 +2297,6 @@ private static PartitionIterator legacyReadWithPaxos(SinglePartitionReadCommand. // calculate the blockFor before repair any paxos round to avoid RS being altered in between. int blockForRead = consistencyLevel.blockFor(Keyspace.open(metadata.keyspace).getReplicationStrategy()); - PartitionIterator result = null; try { final ConsistencyLevel consistencyForReplayCommitsOrFetch = consistencyLevel == ConsistencyLevel.LOCAL_SERIAL @@ -1924,7 +2333,7 @@ private static PartitionIterator legacyReadWithPaxos(SinglePartitionReadCommand. throw new ReadFailureException(consistencyLevel, e.received, e.blockFor, false, e.failureReasonByEndpoint); } - result = fetchRows(group.queries, consistencyForReplayCommitsOrFetch, requestTime); + return serialReadResult(fetchRows(group.queries, consistencyForReplayCommitsOrFetch, ReadCoordinator.DEFAULT, requestTime)); } catch (UnavailableException e) { @@ -1968,26 +2377,126 @@ private static PartitionIterator legacyReadWithPaxos(SinglePartitionReadCommand. readMetricsForLevel(consistencyLevel).addNano(latency); Keyspace.open(metadata.keyspace).getColumnFamilyStore(metadata.name).metric.coordinatorReadLatency.update(latency, TimeUnit.NANOSECONDS); } + } - return result; + public static PartitionIterator dispatchReadWithRetryOnDifferentSystem(SinglePartitionReadCommand.Group group, ConsistencyLevel consistencyLevel, ReadCoordinator coordinator, Dispatcher.RequestTime requestTime) + throws UnavailableException, ReadFailureException, ReadTimeoutException + { + while (true) + { + ClusterMetadata cm = ClusterMetadata.current(); + try + { + SplitReads splitReads = splitReadsIntoAccordAndNormal(cm, group, coordinator, requestTime); + SinglePartitionReadCommand.Group accordReads = splitReads.accordReads; + IAccordResult accordResult = accordReads != null ? readWithAccordAsync(cm, accordReads, consistencyLevel, requestTime) : null; + SinglePartitionReadCommand.Group normalReads = splitReads.normalReads; + Tracing.trace("Split reads into Accord {} and normal {}", accordReads, normalReads); + + Throwable failure = null; + PartitionIterator normalPartitions = null; + try + { + if (normalReads != null) + { + normalPartitions = readRegular(normalReads, consistencyLevel, coordinator, requestTime); + Tracing.trace("Successfully executed normal reads"); + } + } + catch (RetryOnDifferentSystemException e) + { + readMetrics.retryDifferentSystem.mark(); + readMetricsForLevel(consistencyLevel).retryDifferentSystem.mark(); + logger.debug("Retrying reads on different system because some reads were misrouted according to Accord"); + Tracing.trace("Got {} from normal reads, will retry", e); + continue; + } + catch (CoordinatorBehindException e) + { + readMetrics.retryCoordinatorBehind.mark(); + readMetricsForLevel(consistencyLevel).retryCoordinatorBehind.mark(); + logger.debug("Retrying reads now that coordinator has caught up to cluster metadata"); + Tracing.trace("Got {} from normal reads, will retry", e); + continue; + } + catch (Exception e) + { + failure = Throwables.merge(failure, e); + } + + // Check if the Accord reads succeeded asynchronously + PartitionIterator accordPartitions = null; + try + { + if (accordResult != null) + { + ConsensusAttemptResult consensusResult = getConsensusAttemptResultFromAsyncTxnResult(accordResult, accordReads.queries.size(), index -> group.queries.get(index).isReversed()); + if (consensusResult == RETRY_NEW_PROTOCOL) + { + readMetrics.retryDifferentSystem.mark(); + readMetricsForLevel(consistencyLevel).retryDifferentSystem.mark(); + Tracing.trace("Accord returned retry new protocol"); + logger.debug("Retrying reads on different system because some reads were misrouted according to Accord"); + continue; + } + Tracing.trace("Successfully executed Accord reads"); + accordPartitions = consensusResult.serialReadResult; + } + } + catch (Exception e) + { + failure = Throwables.merge(failure, e); + } + + if (failure != null) + throw unchecked(failure); + + PartitionIterator resultIterator = null; + if (normalPartitions != null && (accordPartitions == null || !accordPartitions.hasNext())) + resultIterator = normalPartitions; + else if ((normalPartitions == null || !normalPartitions.hasNext()) && accordPartitions != null) + resultIterator = accordPartitions; + else + { + // Merge into partition key order + List partitions = new ArrayList<>(group.queries.size()); + Iterator mergeIterator = Iterators.mergeSorted(ImmutableList.of(normalPartitions, accordPartitions), Comparator.comparing(RowIterator::partitionKey)); + while (mergeIterator.hasNext()) + { + partitions.add(singletonIterator(mergeIterator.next())); + } + resultIterator = PartitionIterators.concat(partitions);} + return maybeEnforceLimits(resultIterator, group); + } + catch (Exception t) + { + // Unexpected error so it would be helpful to have details + Tracing.trace("{}", getStackTraceAsToString(t)); + throw t; + } + } + } + + public static PartitionIterator maybeEnforceLimits(PartitionIterator iterator, SinglePartitionReadCommand.Group group) + { + // Note that the only difference between the command in a group must be the partition key on which + // they applied. + boolean enforceStrictLiveness = group.queries.get(0).metadata().enforceStrictLiveness(); + // If we have more than one command, then despite each read command honoring the limit, the total result + // might not honor it and so we should enforce it + if (group.queries.size() > 1) + return group.limits().filter(iterator, group.nowInSec(), group.selectsFullPartition(), enforceStrictLiveness); + return iterator; } @SuppressWarnings("resource") - private static PartitionIterator readRegular(SinglePartitionReadCommand.Group group, ConsistencyLevel consistencyLevel, Dispatcher.RequestTime requestTime) + private static PartitionIterator readRegular(SinglePartitionReadCommand.Group group, ConsistencyLevel consistencyLevel, ReadCoordinator coordinator, Dispatcher.RequestTime requestTime) throws UnavailableException, ReadFailureException, ReadTimeoutException { long start = nanoTime(); try { - PartitionIterator result = fetchRows(group.queries, consistencyLevel, requestTime); - // Note that the only difference between the command in a group must be the partition key on which - // they applied. - boolean enforceStrictLiveness = group.queries.get(0).metadata().enforceStrictLiveness(); - // If we have more than one command, then despite each read command honoring the limit, the total result - // might not honor it and so we should enforce it - if (group.queries.size() > 1) - result = group.limits().filter(result, group.nowInSec(), group.selectsFullPartition(), enforceStrictLiveness); - return result; + return fetchRows(group.queries, consistencyLevel, coordinator, requestTime); } catch (UnavailableException e) { @@ -2073,9 +2582,13 @@ public RowIterator next() * 3. Wait for a response from R replicas * 4. If the digests (if any) match the data return the data * 5. else carry out read repair by getting data from all the nodes. + * + * This should not be called directly because it bypasses statistics and error handling. It is public + * so it can be used by Accord to fetch rows and the statistics will be tracked by Accord. */ - private static PartitionIterator fetchRows(List commands, + public static PartitionIterator fetchRows(List commands, ConsistencyLevel consistencyLevel, + ReadCoordinator coordinator, Dispatcher.RequestTime requestTime) throws UnavailableException, ReadFailureException, ReadTimeoutException { @@ -2088,7 +2601,7 @@ private static PartitionIterator fetchRows(List comm // for type of speculation we'll use in this read for (int i=0; i> getSchemaVersions() @@ -2612,17 +3126,18 @@ public final void run() long timeTakenNanos = now - startTimeNanos(); MessagingService.instance().metrics.recordSelfDroppedMessage(Verb.MUTATION_REQ, timeTakenNanos, NANOSECONDS); - if (requestTime.shouldSendHints()) + // Don't submit a hint if this replica is transient + if (localReplica.isTransient()) + return; + + HintRunnable runnable = new HintRunnable(ImmutableSet.of(localReplica.endpoint())) { - HintRunnable runnable = new HintRunnable(EndpointsForToken.of(localReplica.range().right, localReplica)) + protected void runMayThrow() throws Exception { - protected void runMayThrow() throws Exception - { - LocalMutationRunnable.this.runMayThrow(); - } - }; - submitHint(runnable); - } + LocalMutationRunnable.this.runMayThrow(); + } + }; + submitHint(runnable); return; } @@ -2677,9 +3192,9 @@ public static void logRequestException(Exception exception, Collection targets; - protected HintRunnable(EndpointsForToken targets) + protected HintRunnable(Set targets) { this.targets = targets; } @@ -2697,7 +3212,7 @@ public void run() finally { StorageMetrics.totalHintsInProgress.dec(targets.size()); - for (InetAddressAndPort target : targets.endpoints()) + for (InetAddressAndPort target : targets) getHintsInProgressFor(target).decrementAndGet(); } } @@ -2743,25 +3258,43 @@ private static AtomicInteger getHintsInProgressFor(InetAddressAndPort destinatio } } - public static void submitHint(Mutation mutation, Replica target, AbstractWriteResponseHandler responseHandler) + public static void submitHintForRetryOnDifferentSystem(Mutation mutation) + { + submitHint(mutation, ImmutableSet.of(HintsService.RETRY_ON_DIFFERENT_SYSTEM_ADDRESS), null); + } + + public static Future submitHint(Mutation mutation, Replica target, AbstractWriteResponseHandler responseHandler) + { + return submitHint(mutation, EndpointsForToken.of(target.range().right, target), responseHandler); + } + + private static Future submitHint(Mutation mutation, + EndpointsForToken targets, + AbstractWriteResponseHandler responseHandler) { - submitHint(mutation, EndpointsForToken.of(target.range().right, target), responseHandler); + // hints should not be written for transient replicas because there is no point if they didn't contribute + // to quorum, they would eventually be removed anyways after running incremental repair. + // This logic assumes we don't always write to transient replicas to minimize incremental repair mismatches + // so we may want to walk this back when revisiting transient replication + Replicas.assertFull(targets); + return submitHint(mutation, targets.endpoints(), responseHandler); } - private static void submitHint(Mutation mutation, - EndpointsForToken targets, - AbstractWriteResponseHandler responseHandler) + private static Future submitHint(Mutation mutation, + Set targets, + AbstractWriteResponseHandler responseHandler) { - Replicas.assertFull(targets); // hints should not be written for transient replicas HintRunnable runnable = new HintRunnable(targets) { public void runMayThrow() { Set validTargets = new HashSet<>(targets.size()); Set hostIds = new HashSet<>(targets.size()); - for (InetAddressAndPort target : targets.endpoints()) + for (InetAddressAndPort target : targets) { - UUID hostId = StorageService.instance.getHostIdForEndpoint(target); + UUID hostId = target == HintsService.RETRY_ON_DIFFERENT_SYSTEM_ADDRESS ? + HintsService.RETRY_ON_DIFFERENT_SYSTEM_UUID : + StorageService.instance.getHostIdForEndpoint(target); if (hostId != null) { hostIds.add(hostId); @@ -2786,14 +3319,14 @@ public void runMayThrow() } }; - submitHint(runnable); + return submitHint(runnable); } private static Future submitHint(HintRunnable runnable) { StorageMetrics.totalHintsInProgress.inc(runnable.targets.size()); - for (Replica target : runnable.targets) - getHintsInProgressFor(target.endpoint()).incrementAndGet(); + for (InetAddressAndPort target : runnable.targets) + getHintsInProgressFor(target).incrementAndGet(); return (Future) Stage.MUTATION.submit(runnable); } @@ -2975,6 +3508,37 @@ public final boolean equals(Object o) } } + public static class ConsensusAttemptResult + { + public static final ConsensusAttemptResult RETRY_NEW_PROTOCOL = new ConsensusAttemptResult(null, null, true); + + @Nullable + RowIterator casResult; + + @Nonnull + public final PartitionIterator serialReadResult; + + public final boolean shouldRetryOnNewConsensusProtocol; + + private ConsensusAttemptResult(@Nullable RowIterator casResult, @Nullable PartitionIterator serialReadResult, boolean shouldRetryOnNewConsensusProtocol) + { + this.casResult = casResult; + this.serialReadResult = serialReadResult; + this.shouldRetryOnNewConsensusProtocol = shouldRetryOnNewConsensusProtocol; + } + + public static ConsensusAttemptResult serialReadResult(@Nonnull PartitionIterator serialReadResult) + { + checkNotNull(serialReadResult, "serialReadResult should not be null"); + return new ConsensusAttemptResult(null, serialReadResult, false); + } + + public static ConsensusAttemptResult casResult(@Nullable RowIterator casResult) + { + return new ConsensusAttemptResult(casResult, null, false); + } + } + @Override public boolean getSnapshotOnDuplicateRowDetectionEnabled() { @@ -3161,7 +3725,7 @@ public boolean isLoggingReadRepairs() @Override public void setPaxosVariant(String variant) { - Preconditions.checkNotNull(variant); + checkNotNull(variant); Paxos.setPaxosVariant(Config.PaxosVariant.valueOf(variant)); } diff --git a/src/java/org/apache/cassandra/service/StorageProxyMBean.java b/src/java/org/apache/cassandra/service/StorageProxyMBean.java index 395b49a84834..5c2d5ec22cd7 100644 --- a/src/java/org/apache/cassandra/service/StorageProxyMBean.java +++ b/src/java/org/apache/cassandra/service/StorageProxyMBean.java @@ -21,7 +21,6 @@ import java.util.Map; import java.util.Set; - public interface StorageProxyMBean { public long getTotalHints(); diff --git a/src/java/org/apache/cassandra/service/StorageService.java b/src/java/org/apache/cassandra/service/StorageService.java index 812dc31cd492..0caf26a3a6e8 100644 --- a/src/java/org/apache/cassandra/service/StorageService.java +++ b/src/java/org/apache/cassandra/service/StorageService.java @@ -50,6 +50,7 @@ import java.util.function.Supplier; import java.util.stream.Collectors; import java.util.stream.Stream; +import javax.annotation.Nonnull; import javax.annotation.Nullable; import javax.management.ListenerNotFoundException; import javax.management.NotificationBroadcasterSupport; @@ -61,7 +62,6 @@ import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Joiner; -import com.google.common.base.Preconditions; import com.google.common.base.Predicate; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; @@ -71,6 +71,9 @@ import com.google.common.collect.Ordering; import com.google.common.collect.Sets; import com.google.common.util.concurrent.Uninterruptibles; + +import org.apache.cassandra.io.sstable.format.SSTableReader; +import org.apache.cassandra.repair.autorepair.AutoRepair; import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -124,6 +127,7 @@ import org.apache.cassandra.gms.Gossiper; import org.apache.cassandra.gms.IEndpointStateChangeSubscriber; import org.apache.cassandra.gms.VersionedValue; +import org.apache.cassandra.gms.VersionedValue.VersionedValueFactory; import org.apache.cassandra.hints.Hint; import org.apache.cassandra.hints.HintsService; import org.apache.cassandra.index.IndexStatusManager; @@ -142,11 +146,11 @@ import org.apache.cassandra.locator.InetAddressAndPort; import org.apache.cassandra.locator.LocalStrategy; import org.apache.cassandra.locator.MetaStrategy; +import org.apache.cassandra.locator.NodeProximity; import org.apache.cassandra.locator.RangesAtEndpoint; import org.apache.cassandra.locator.RangesByEndpoint; import org.apache.cassandra.locator.Replica; import org.apache.cassandra.locator.Replicas; -import org.apache.cassandra.locator.NodeProximity; import org.apache.cassandra.locator.SnitchAdapter; import org.apache.cassandra.locator.SystemReplicas; import org.apache.cassandra.metrics.Sampler; @@ -167,6 +171,9 @@ import org.apache.cassandra.schema.TableMetadata; import org.apache.cassandra.schema.TableMetadataRef; import org.apache.cassandra.schema.ViewMetadata; +import org.apache.cassandra.service.accord.AccordService; +import org.apache.cassandra.service.consensus.migration.ConsensusMigrationState; +import org.apache.cassandra.service.consensus.migration.ConsensusMigrationTarget; import org.apache.cassandra.service.disk.usage.DiskUsageBroadcaster; import org.apache.cassandra.service.paxos.Paxos; import org.apache.cassandra.service.paxos.PaxosCommit; @@ -186,6 +193,7 @@ import org.apache.cassandra.tcm.compatibility.GossipHelper; import org.apache.cassandra.tcm.compatibility.TokenRingUtils; import org.apache.cassandra.tcm.membership.Directory; +import org.apache.cassandra.tcm.membership.Location; import org.apache.cassandra.tcm.membership.NodeAddresses; import org.apache.cassandra.tcm.membership.NodeId; import org.apache.cassandra.tcm.membership.NodeState; @@ -199,6 +207,7 @@ import org.apache.cassandra.tcm.sequences.SingleNodeSequences; import org.apache.cassandra.tcm.transformations.Assassinate; import org.apache.cassandra.tcm.transformations.CancelInProgressSequence; +import org.apache.cassandra.tcm.transformations.AlterTopology; import org.apache.cassandra.tcm.transformations.Register; import org.apache.cassandra.tcm.transformations.Startup; import org.apache.cassandra.tcm.transformations.Unregister; @@ -224,6 +233,8 @@ import org.apache.cassandra.utils.progress.jmx.JMXProgressSupport; import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkNotNull; +import static com.google.common.base.Preconditions.checkState; import static java.util.Arrays.asList; import static java.util.Arrays.stream; import static java.util.concurrent.TimeUnit.MILLISECONDS; @@ -242,6 +253,7 @@ import static org.apache.cassandra.index.SecondaryIndexManager.getIndexName; import static org.apache.cassandra.index.SecondaryIndexManager.isIndexColumnFamily; import static org.apache.cassandra.io.util.FileUtils.ONE_MIB; +import static org.apache.cassandra.locator.InetAddressAndPort.stringify; import static org.apache.cassandra.schema.SchemaConstants.isLocalSystemKeyspace; import static org.apache.cassandra.service.ActiveRepairService.ParentRepairStatus; import static org.apache.cassandra.service.ActiveRepairService.repairCommandExecutor; @@ -251,6 +263,8 @@ import static org.apache.cassandra.service.StorageService.Mode.LEAVING; import static org.apache.cassandra.service.StorageService.Mode.MOVE_FAILED; import static org.apache.cassandra.service.StorageService.Mode.NORMAL; +import static org.apache.cassandra.service.consensus.migration.ConsensusTableMigration.finishMigrationToConsensusProtocol; +import static org.apache.cassandra.service.consensus.migration.ConsensusTableMigration.startMigrationToConsensusProtocol; import static org.apache.cassandra.tcm.membership.NodeState.BOOTSTRAPPING; import static org.apache.cassandra.tcm.membership.NodeState.BOOT_REPLACING; import static org.apache.cassandra.tcm.membership.NodeState.JOINED; @@ -258,6 +272,7 @@ import static org.apache.cassandra.tcm.membership.NodeState.REGISTERED; import static org.apache.cassandra.utils.Clock.Global.currentTimeMillis; import static org.apache.cassandra.utils.FBUtilities.getBroadcastAddressAndPort; +import static org.apache.cassandra.utils.PojoToString.pojoMapToString; /** * This abstraction contains the token/identifier of this node @@ -313,6 +328,9 @@ private static int getRingDelay() @VisibleForTesting // this is used for dtests only, see CASSANDRA-18152 public volatile boolean skipNotificationListeners = false; + // For tests that unsafely change the partitioner store the original here + private IPartitioner originalPartitioner; + private final java.util.function.Predicate anyOutOfRangeOpsRecorded = keyspace -> keyspace.metric.outOfRangeTokenReads.getCount() > 0 || keyspace.metric.outOfRangeTokenWrites.getCount() > 0 @@ -370,8 +388,12 @@ public RangesAtEndpoint getLocalReplicas(String keyspaceName) public RangesAtEndpoint getReplicas(String keyspaceName, InetAddressAndPort endpoint) { - return Keyspace.open(keyspaceName).getReplicationStrategy() - .getAddressReplicas(ClusterMetadata.current(), endpoint); + return getReplicas(Keyspace.open(keyspaceName).getReplicationStrategy(), endpoint); + } + + public RangesAtEndpoint getReplicas(AbstractReplicationStrategy replicationStrategy, InetAddressAndPort endpoint) + { + return replicationStrategy.getAddressReplicas(ClusterMetadata.current(), endpoint); } public List> getLocalRanges(String ks) @@ -455,12 +477,12 @@ public enum Mode { STARTING, NORMAL, JOINING, JOINING_FAILED, LEAVING, DECOMMISS private volatile int totalCFs, remainingCFs; - private static final AtomicInteger nextRepairCommand = new AtomicInteger(); - private final List lifecycleSubscribers = new CopyOnWriteArrayList<>(); private final String jmxObjectName; + public static final AtomicInteger nextRepairCommand = new AtomicInteger(); + // true when keeping strict consistency while bootstrapping public static final boolean useStrictConsistency = CONSISTENT_RANGE_MOVEMENT.getBoolean(); private boolean joinRing = JOIN_RING.getBoolean(); @@ -800,8 +822,8 @@ public void runMayThrow() throws InterruptedException, ExecutionException, IOExc Gossiper.instance.addLocalApplicationState(ApplicationState.SSTABLE_VERSIONS, valueFactory.sstableVersions(sstablesTracker.versionsInUse())); - if (ClusterMetadataService.state() == ClusterMetadataService.State.REMOTE) - Gossiper.instance.triggerRoundWithCMS(); + Gossiper.instance.triggerRoundWithCMS(); + // Has to be called after the host id has potentially changed try { @@ -1114,6 +1136,17 @@ public void doAuthSetup(boolean async) } } + public void doAutoRepairSetup() + { + AutoRepairService.setup(); + if (DatabaseDescriptor.getAutoRepairConfig().isAutoRepairSchedulingEnabled()) + { + logger.info("Enabling auto-repair scheduling"); + AutoRepair.instance.setup(); + logger.info("AutoRepair setup complete!"); + } + } + public boolean isAuthSetupComplete() { return authSetupComplete; @@ -1651,6 +1684,36 @@ public void abortBootstrap(String nodeStr, String endpointStr) } } + @Override + public void migrateConsensusProtocol(@Nonnull List keyspaceNames, + @Nullable List maybeTableNames, + @Nullable String maybeRangesStr) + { + checkArgument(!keyspaceNames.contains(SchemaConstants.METADATA_KEYSPACE_NAME)); + startMigrationToConsensusProtocol(keyspaceNames, Optional.ofNullable(maybeTableNames), Optional.ofNullable(maybeRangesStr)); + } + + @Override + public Integer finishConsensusMigration(@Nonnull String keyspace, + @Nullable List maybeTableNames, + @Nullable String maybeRangesStr, + @Nonnull String target) + { + checkArgument(!keyspace.equals(SchemaConstants.METADATA_KEYSPACE_NAME)); + return finishMigrationToConsensusProtocol(keyspace, Optional.ofNullable(maybeTableNames), Optional.ofNullable(maybeRangesStr), ConsensusMigrationTarget.valueOf(target)); + } + + @Override + public String listConsensusMigrations(@Nullable Set keyspaceNames, + @Nullable Set tableNames, + @Nonnull String format) + { + ClusterMetadata cm = ClusterMetadata.current(); + ConsensusMigrationState snapshot = cm.consensusMigrationState; + Map snapshotAsMap = snapshot.toMap(keyspaceNames, tableNames); + return pojoMapToString(snapshotAsMap, format); + } + public Map> getConcurrency(List stageNames) { Stream stageStream = stageNames.isEmpty() ? stream(Stage.values()) : stageNames.stream().map(Stage::fromPoolName); @@ -2568,16 +2631,6 @@ public String getSavedCachesLocation() return FileUtils.getCanonicalPath(DatabaseDescriptor.getSavedCachesLocation()); } - private List stringify(Iterable endpoints, boolean withPort) - { - List stringEndpoints = new ArrayList<>(); - for (InetAddressAndPort ep : endpoints) - { - stringEndpoints.add(ep.getHostAddress(withPort)); - } - return stringEndpoints; - } - public int getCurrentGenerationNumber() { return Gossiper.instance.getCurrentGenerationNumber(getBroadcastAddressAndPort()); @@ -3753,7 +3806,7 @@ protected synchronized void drain(boolean isFinalShutdown) throws IOException, I if (daemon != null) shutdownClientServers(); - ScheduledExecutors.optionalTasks.shutdown(); + Gossiper.instance.stop(); ActiveRepairService.instance().stop(); @@ -3763,6 +3816,9 @@ protected synchronized void drain(boolean isFinalShutdown) throws IOException, I transientMode = Optional.of(Mode.DRAINING); } + if (AccordService.isSetup()) + AccordService.instance().shutdownAndWait(1, MINUTES); + // In-progress writes originating here could generate hints to be written, // which is currently scheduled on the mutation stage. So shut down MessagingService // before mutation stage, so we can get all the hints saved before shutting down. @@ -3777,6 +3833,9 @@ protected synchronized void drain(boolean isFinalShutdown) throws IOException, I logger.error("Messaging service timed out shutting down", t); } + // ScheduledExecutors shuts down after MessagingService, as MessagingService may issue tasks to it. + ScheduledExecutors.optionalTasks.shutdown(); + if (!isFinalShutdown) { logger.debug("clearing mutation stage"); @@ -3854,6 +3913,8 @@ protected synchronized void drain(boolean isFinalShutdown) throws IOException, I CommitLog.instance.shutdownBlocking(); + AutoRepair.instance.shutdownBlocking(); + // wait for miscellaneous tasks like sstable and commitlog segment deletion ColumnFamilyStore.shutdownPostFlushExecutor(); @@ -3967,9 +4028,20 @@ synchronized void checkServiceAllowedToStart(String service) @VisibleForTesting public IPartitioner setPartitionerUnsafe(IPartitioner newPartitioner) { - IPartitioner oldPartitioner = DatabaseDescriptor.setPartitionerUnsafe(newPartitioner); + checkNotNull(newPartitioner, "newPartitioner is null"); + checkState(originalPartitioner == null, "Already changed the partitioner without resetting"); + originalPartitioner = DatabaseDescriptor.setPartitionerUnsafe(newPartitioner); valueFactory = new VersionedValue.VersionedValueFactory(newPartitioner); - return oldPartitioner; + return originalPartitioner; + } + + @VisibleForTesting + public void resetPartitionerUnsafe() + { + checkState(originalPartitioner != null, "Original partitioner was never changed"); + DatabaseDescriptor.setPartitionerUnsafe(originalPartitioner); + valueFactory = new VersionedValueFactory(originalPartitioner); + originalPartitioner = null; } public void truncate(String keyspace, String table) throws TimeoutException, IOException @@ -4152,6 +4224,28 @@ public List getNonLocalStrategyKeyspaces() return Lists.newArrayList(Schema.instance.distributedKeyspaces().names()); } + @Override + public List getAccordManagedKeyspaces() + { + Keyspaces keyspaces = Schema.instance.getNonLocalStrategyKeyspaces(); + return keyspaces.stream().flatMap(ks -> ks.tables.stream()) + .filter(TableMetadata::requiresAccordSupport) + .map(tbm -> tbm.keyspace) + .distinct() + .sorted() + .collect(toList()); + } + + @Override + public List getAccordManagedTables() + { + Keyspaces keyspaces = Schema.instance.getNonLocalStrategyKeyspaces(); + return keyspaces.stream().flatMap(ks -> ks.tables.stream()) + .filter(TableMetadata::requiresAccordSupport) + .map(tbm -> tbm.keyspace + '.' + tbm.name) + .collect(toList()); + } + public Map getViewBuildStatuses(String keyspace, String view, boolean withPort) { Map coreViewStatus = SystemDistributedKeyspace.viewStatus(keyspace, view); @@ -4981,7 +5075,7 @@ public void enableFullQueryLogger(String path, String rollCycle, Boolean blockin archiveCommand = archiveCommand != null ? archiveCommand : fqlOptions.archive_command; maxArchiveRetries = maxArchiveRetries != Integer.MIN_VALUE ? maxArchiveRetries : fqlOptions.max_archive_retries; - Preconditions.checkNotNull(path, "cassandra.yaml did not set log_dir and not set as parameter"); + checkNotNull(path, "cassandra.yaml did not set log_dir and not set as parameter"); FullQueryLogger.instance.enableWithoutClean(File.getPath(path), rollCycle, blocking, maxQueueWeight, maxLogSize, archiveCommand, maxArchiveRetries); } @@ -5346,7 +5440,7 @@ public Long getRepairRpcTimeout() public void setRepairRpcTimeout(Long timeoutInMillis) { - Preconditions.checkState(timeoutInMillis > 0); + checkState(timeoutInMillis > 0); DatabaseDescriptor.setRepairRpcTimeout(timeoutInMillis); logger.info("RepairRpcTimeout set to {}ms via JMX", timeoutInMillis); } @@ -5541,4 +5635,72 @@ public void setPrioritizeSAIOverLegacyIndex(boolean value) { DatabaseDescriptor.setPrioritizeSAIOverLegacyIndex(value); } + + @Override + public void setPaxosRepairRaceWait(boolean paxosRepairRaceWait) + { + DatabaseDescriptor.setPaxosRepairRaceWait(paxosRepairRaceWait); + } + + @Override + public boolean getPaxosRepairRaceWait() + { + return DatabaseDescriptor.getPaxosRepairRaceWait(); + } + + public void alterTopology(String changes) + { + Map updates = AlterTopology.parseArgs(changes, ClusterMetadata.current().directory); + logger.info("Received request to modify rack assignments. Proposed changes: {}", updates); + if (updates.isEmpty()) + return; + + AlterTopology transform = new AlterTopology(updates, ClusterMetadataService.instance().placementProvider()); + ClusterMetadataService.instance() + .commit(transform, + m -> { + logger.info("Rack changes committed successfully"); + return m; + }, + (c, r) -> { + throw new IllegalArgumentException("Unable to commit rack changes: " + r); + }); + } + + @Override + public List getTablesForKeyspace(String keyspace) + { + return Keyspace.open(keyspace).getColumnFamilyStores().stream().map(cfs -> cfs.name).collect(Collectors.toList()); + } + + @Override + public List mutateSSTableRepairedState(boolean repaired, boolean preview, String keyspace, List tableNames) + { + Map tables = Keyspace.open(keyspace).getColumnFamilyStores() + .stream().collect(Collectors.toMap(c -> c.name, c -> c)); + for (String tableName : tableNames) + { + if (!tables.containsKey(tableName)) + throw new RuntimeException("Table " + tableName + " does not exist in keyspace " + keyspace); + } + + // only select SSTables that are unrepaired when repaired is true and vice versa + Predicate predicate = sst -> repaired != sst.isRepaired(); + + // mutate SSTables + long repairedAt = !repaired ? 0 : currentTimeMillis(); + List sstablesTouched = new ArrayList<>(); + for (String tableName : tableNames) + { + ColumnFamilyStore table = tables.get(tableName); + Set result = table.runWithCompactionsDisabled(() -> { + Set sstables = table.getLiveSSTables().stream().filter(predicate).collect(Collectors.toSet()); + if (!preview) + table.getCompactionStrategyManager().mutateRepaired(sstables, repairedAt, null, false); + return sstables; + }, predicate, OperationType.ANTICOMPACTION, true, false, true); + sstablesTouched.addAll(result.stream().map(sst -> sst.descriptor.baseFile().name()).collect(Collectors.toList())); + } + return sstablesTouched; + } } diff --git a/src/java/org/apache/cassandra/service/StorageServiceMBean.java b/src/java/org/apache/cassandra/service/StorageServiceMBean.java index 7760ea01883e..5d4781c54651 100644 --- a/src/java/org/apache/cassandra/service/StorageServiceMBean.java +++ b/src/java/org/apache/cassandra/service/StorageServiceMBean.java @@ -27,6 +27,7 @@ import java.util.Set; import java.util.concurrent.ExecutionException; import java.util.concurrent.TimeoutException; +import javax.annotation.Nonnull; import javax.annotation.Nullable; import javax.management.NotificationEmitter; import javax.management.openmbean.CompositeData; @@ -585,7 +586,6 @@ default int upgradeSSTables(String keyspaceName, boolean excludeCurrentVersion, * If classQualifer is not empty but level is empty/null, it will set the level to null for the defined classQualifer
    * If level cannot be parsed, then the level will be defaulted to DEBUG
    *
    - * The logback configuration should have {@code < jmxConfigurator />} set * * @param classQualifier The logger's classQualifer * @param level The log level @@ -1138,6 +1138,20 @@ default int upgradeSSTables(String keyspaceName, boolean excludeCurrentVersion, public String getBootstrapState(); void abortBootstrap(String nodeId, String endpoint); + void migrateConsensusProtocol(@Nullable List keyspaceNames, + @Nullable List maybeTableNames, + @Nullable String maybeRangesStr); + + Integer finishConsensusMigration(@Nonnull String keyspace, + @Nullable List maybeTableNames, + @Nullable String maybeRangesStr, + @Nonnull String target); + + String listConsensusMigrations(@Nullable Set keyspaceNames, @Nullable Set tableNames, @Nonnull String format); + + List getAccordManagedKeyspaces(); + List getAccordManagedTables(); + /** Gets the concurrency settings for processing stages*/ static class StageConcurrency implements Serializable { @@ -1185,6 +1199,7 @@ public void enableAuditLog(String loggerName, String includedKeyspaces, String e /** * Start the fully query logger. + * * @param path Path where the full query log will be stored. If null cassandra.yaml value is used. * @param rollCycle How often to create a new file for query data (MINUTELY, DAILY, HOURLY) * @param blocking Whether threads submitting queries to the query log should block if they can't be drained to the filesystem or alternatively drops samples and log @@ -1356,4 +1371,15 @@ public void enableAuditLog(String loggerName, String includedKeyspaces, String e boolean getPrioritizeSAIOverLegacyIndex(); void setPrioritizeSAIOverLegacyIndex(boolean value); + + void setPaxosRepairRaceWait(boolean paxosRepairCoordinatorWait); + + boolean getPaxosRepairRaceWait(); + // Comma delimited list of "nodeId=dc:rack" or "endpoint=dc:rack" + void alterTopology(String updates); + /** Gets the names of all tables for the given keyspace */ + public List getTablesForKeyspace(String keyspace); + + /** Mutates the repaired state of all SSTables for the given SSTables */ + public List mutateSSTableRepairedState(boolean repaired, boolean preview, String keyspace, List tables); } diff --git a/src/java/org/apache/cassandra/service/TimeoutStrategy.java b/src/java/org/apache/cassandra/service/TimeoutStrategy.java new file mode 100644 index 000000000000..7ca39db222b0 --- /dev/null +++ b/src/java/org/apache/cassandra/service/TimeoutStrategy.java @@ -0,0 +1,411 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service; + +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicReference; +import java.util.function.Supplier; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import javax.annotation.Nullable; + +import com.google.common.annotations.VisibleForTesting; + +import accord.utils.Invariants; +import com.codahale.metrics.Snapshot; +import org.apache.cassandra.metrics.ClientRequestMetrics; +import org.apache.cassandra.service.TimeoutStrategy.LatencySupplier.Constant; +import org.apache.cassandra.service.TimeoutStrategy.LatencySupplier.Percentile; + +import static java.lang.Double.parseDouble; +import static java.lang.Integer.parseInt; +import static java.lang.Math.max; +import static java.lang.Math.pow; +import static java.util.concurrent.TimeUnit.MICROSECONDS; +import static java.util.concurrent.TimeUnit.NANOSECONDS; +import static java.util.concurrent.TimeUnit.SECONDS; +import static org.apache.cassandra.config.DatabaseDescriptor.getCasContentionTimeout; +import static org.apache.cassandra.config.DatabaseDescriptor.getReadRpcTimeout; +import static org.apache.cassandra.config.DatabaseDescriptor.getWriteRpcTimeout; +import static org.apache.cassandra.utils.Clock.Global.nanoTime; + +/** + *

    A strategy for making timeout decisions for operations. This is a simplified single-value version of + * the RetryStrategy + * + *

    This represent a computed time period, that may be defined dynamically based on a simple calculation over:

      + *
    • {@code pX()} recent experienced latency distribution for successful operations, + * e.g. {@code p50(rw)} the maximum of read and write median latencies, + * {@code p999(r)} the 99.9th percentile of read latencies + *
    • {@code attempts} the number of failed attempts made by the operation so far + *
    • {@code constant} a user provided floating point constant + *
    + * + *

    The calculation may take any of these forms + *

  • constant {@code $constant$[mu]s} + *
  • dynamic constant {@code pX() * constant} + *
  • dynamic linear {@code pX() * constant * attempts} + *
  • dynamic exponential {@code pX() * constant ^ attempts} + * + *

    Furthermore, the dynamic calculations can be bounded with a min/max, like so: + * {@code min[mu]s <= dynamic expr <= max[mu]s} + * + * e.g. + *

  • {@code 10ms <= p50(rw)*0.66} + *
  • {@code 10ms <= p95(rw)*1.8^attempts <= 100ms} + *
  • {@code 5ms <= p50(rw)*0.5} + * + * TODO (expected): permit simple constant addition (e.g. p50+5ms) + * TODO (required): track separate stats per-DC as inputs to these decisions + */ +public class TimeoutStrategy implements WaitStrategy +{ + static final Pattern PARSE = Pattern.compile( + "(\\s*(?0|[0-9]+[mu]?s)\\s*<=)?" + + "(\\s*(?[^=]+))" + + "(\\s*<=\\s*(?0|[0-9]+[mu]?s))?"); + + static final Pattern WAIT = Pattern.compile( + "\\s*(?0|[0-9]+[mu]?s)" + + "|\\s*((p(?[0-9]+)(\\((?r|w|rw|wr)\\))?)?|(?0|[0-9]+[mu]?s))" + + "\\s*(([*]\\s*(?[0-9.]+))?\\s*(?[*^]\\s*attempts)?)?\\s*"); + static final Pattern TIME = Pattern.compile( + "0|[0-9]+[mu]?s"); + + // Factories can be useful for testing purposes, to supply custom implementations of selectors and modifiers. + final static LatencyModifierFactory modifiers = new LatencyModifierFactory(){}; + + interface LatencyModifierFactory + { + default LatencyModifier identity() { return (l, a) -> l; } + default LatencyModifier multiply(double constant) { return (l, a) -> saturatedCast(l * constant); } + default LatencyModifier multiplyByAttempts(double multiply) { return (l, a) -> saturatedCast(l * multiply * a); } + default LatencyModifier multiplyByAttemptsExp(double base) { return (l, a) -> saturatedCast(l * pow(base, a)); } + } + + public interface Wait + { + long getMicros(int attempts); + + class Constant implements Wait + { + final long micros; + public Constant(long micros) { this.micros = micros; } + @Override public long getMicros(int attempts) { return micros; } + } + + class Modifying implements Wait + { + final LatencySupplier supplier; + final LatencyModifier modifier; + + Modifying(LatencySupplier supplier, LatencyModifier modifier) + { + this.supplier = supplier; + this.modifier = modifier; + } + + @Override + public long getMicros(int attempts) + { + return modifier.modify(supplier.getMicros(), attempts); + } + } + } + + public interface LatencySupplier + { + long getMicros(); + + class Constant implements LatencySupplier + { + final long micros; + public Constant(long micros) {this.micros = micros; } + @Override public long getMicros() { return micros; } + } + + class Percentile implements LatencySupplier + { + final LatencySource latencies; + final double percentile; + + public Percentile(LatencySource latencies, double percentile) + { + this.latencies = latencies; + this.percentile = percentile; + } + + @Override + public long getMicros() + { + return latencies.get(percentile); + } + } + } + + public interface LatencySource + { + long get(double percentile); + } + + public interface LatencySourceFactory + { + LatencySource source(String params); + + static LatencySourceFactory rw(ClientRequestMetrics reads, ClientRequestMetrics writes) + { + return new ReadWriteLatencySourceFactory(reads, writes); + } + + static LatencySourceFactory of(ClientRequestMetrics latencies) + { + LatencySource source = new TimeLimitedLatencySupplier(latencies.latency::getSnapshot, 10, SECONDS); + return ignore -> source; + } + + static LatencySourceFactory none() + { + return ignore -> ignore2 -> { throw new UnsupportedOperationException(); }; + } + } + + public static class ReadWriteLatencySourceFactory implements LatencySourceFactory + { + final LatencySource reads, writes; + + public ReadWriteLatencySourceFactory(ClientRequestMetrics reads, ClientRequestMetrics writes) + { + this(reads.latency::getSnapshot, writes.latency::getSnapshot); + } + + public ReadWriteLatencySourceFactory(Supplier reads, Supplier writes) + { + this.reads = new TimeLimitedLatencySupplier(reads, 10, SECONDS); + this.writes = new TimeLimitedLatencySupplier(writes, 10, SECONDS); + } + + @Override + public LatencySource source(String rw) + { + if (rw.length() == 2) + return percentile -> Math.max(reads.get(percentile), writes.get(percentile)); + else if ("r".equals(rw)) + return reads; + else + return writes; + } + } + + interface LatencyModifier + { + long modify(long latency, int attempts); + } + + static class SnapshotAndTime + { + final long validUntil; + final Snapshot snapshot; + + SnapshotAndTime(long validUntil, Snapshot snapshot) + { + this.validUntil = validUntil; + this.snapshot = snapshot; + } + } + + static class TimeLimitedLatencySupplier extends AtomicReference implements LatencySource + { + final Supplier snapshotSupplier; + final long validForNanos; + + TimeLimitedLatencySupplier(Supplier snapshotSupplier, long time, TimeUnit units) + { + this.snapshotSupplier = snapshotSupplier; + this.validForNanos = units.toNanos(time); + } + + private Snapshot getSnapshot() + { + long now = nanoTime(); + + SnapshotAndTime cur = get(); + if (cur != null && cur.validUntil > now) + return cur.snapshot; + + Snapshot newSnapshot = snapshotSupplier.get(); + SnapshotAndTime next = new SnapshotAndTime(now + validForNanos, newSnapshot); + if (compareAndSet(cur, next)) + return next.snapshot; + + return accumulateAndGet(next, (a, b) -> a.validUntil > b.validUntil ? a : b).snapshot; + } + + @Override + public long get(double percentile) + { + return (long)getSnapshot().getValue(percentile); + } + } + + final Wait wait; + final long minMicros, maxMicros; + + public TimeoutStrategy(Wait wait, long minMicros, long maxMicros) + { + this.minMicros = minMicros; + this.maxMicros = maxMicros; + this.wait = wait; + } + + public long computeWait(int attempts, TimeUnit units) + { + long wait = this.wait.getMicros(attempts); + if (wait < minMicros) wait = minMicros; + else if (wait > maxMicros) wait = maxMicros; + return units.convert(wait, MICROSECONDS); + } + + public long computeWaitUntil(int attempts) + { + long nanos = computeWait(attempts, NANOSECONDS); + return nanoTime() + nanos; + } + + private static LatencySupplier parseLatencySupplier(Matcher m, LatencySourceFactory latenciesFactory) + { + String perc = m.group("perc"); + if (perc == null) + return new Constant(parseInMicros(m.group("constbase"))); + + String rw = m.group("rw"); + if (rw == null) rw = "rw"; + LatencySource latencies = latenciesFactory.source(rw); + double percentile = parseDouble("0." + perc); + return new Percentile(latencies, percentile); + } + + private static @Nullable LatencyModifier parseLatencyModifier(String spec, Matcher m, LatencyModifierFactory modifiers) + { + String mod = m.group("mod"); + String modkind = m.group("modkind"); + double modifier = 1.0; + if (mod != null) modifier = Double.parseDouble(mod); + else if (modkind == null) return null; + else if (!modkind.startsWith("*")) + throw new IllegalArgumentException("Invalid latency modifier specification: " + spec + ". Expect constant factor as base for exponent."); + + if (modkind == null) + return modifiers.multiply(modifier); + + if (modkind.startsWith("*")) + return modifiers.multiplyByAttempts(modifier); + else if (modkind.startsWith("^")) + return modifiers.multiplyByAttemptsExp(modifier); + else + throw new IllegalArgumentException("Unrecognised attempt modifier: " + modkind); + } + + static long saturatedCast(double v) + { + if (v > Long.MAX_VALUE) + return Long.MAX_VALUE; + return (long) v; + } + + public static TimeoutStrategy parse(String input, LatencySourceFactory latencies) + { + Matcher m = PARSE.matcher(input); + if (!m.matches()) + throw new IllegalArgumentException("Invalid specification: '" + input + "'; does not match " + PARSE); + long min = parseInMicros(m.group("min"), 0); + long max = parseInMicros(m.group("max"), Long.MAX_VALUE); + Wait wait = TimeoutStrategy.parseWait(m.group("wait"), latencies); + return new TimeoutStrategy(wait, min, max); + } + + public static Wait parseWait(String input, LatencySourceFactory latencies) + { + return parseWait(input, latencies, modifiers); + } + + @VisibleForTesting + static Wait parseWait(String input, LatencySourceFactory latencies, LatencyModifierFactory modifiers) + { + Matcher m = WAIT.matcher(input); + if (!m.matches()) + throw new IllegalArgumentException(input + " does not match " + WAIT); + + String maybeConst = m.group("const"); + if (maybeConst != null) + { + long v = parseInMicros(maybeConst); + return new Wait.Constant(v); + } + + LatencySupplier supplier = parseLatencySupplier(m, latencies); + LatencyModifier modifier = parseLatencyModifier(input, m, modifiers); + if (modifier == null && supplier instanceof LatencySupplier.Constant) + return new Wait.Constant(((Constant) supplier).micros); + if (modifier == null) + modifier = modifiers.identity(); + return new Wait.Modifying(supplier, modifier); + } + + public static long parseInMicros(String input, long orElse) + { + if (input == null) + return orElse; + + return parseInMicros(input); + } + + public static long parseInMicros(String text) + { + Matcher m = TIME.matcher(text); + if (!m.matches()) + throw new IllegalArgumentException(text + " does not match " + TIME); + + if (text.length() == 1) + { + Invariants.require(text.charAt(0) == '0'); + return 0; + } + + char penultimate = text.charAt(text.length() - 2); + switch (penultimate) + { + default: return parseInt(text.substring(0, text.length() - 1)) * 1000_000L; + case 'm': return parseInt(text.substring(0, text.length() - 2)) * 1000L; + case 'u': return parseInt(text.substring(0, text.length() - 2)); + } + } + + private static String orElse(Supplier get, String orElse) + { + String result = get.get(); + return result != null ? result : orElse; + } + + @VisibleForTesting + public static long maxQueryTimeoutMicros() + { + return max(max(getCasContentionTimeout(MICROSECONDS), getWriteRpcTimeout(MICROSECONDS)), getReadRpcTimeout(MICROSECONDS)); + } +} diff --git a/src/java/org/apache/cassandra/service/TruncateResponseHandler.java b/src/java/org/apache/cassandra/service/TruncateResponseHandler.java index 54b1241006d7..566d9fa02240 100644 --- a/src/java/org/apache/cassandra/service/TruncateResponseHandler.java +++ b/src/java/org/apache/cassandra/service/TruncateResponseHandler.java @@ -23,19 +23,19 @@ import java.util.concurrent.TimeoutException; import java.util.concurrent.atomic.AtomicInteger; -import org.apache.cassandra.utils.concurrent.Condition; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.cassandra.db.TruncateResponse; +import org.apache.cassandra.exceptions.RequestFailure; import org.apache.cassandra.exceptions.RequestFailureReason; import org.apache.cassandra.exceptions.TruncateException; import org.apache.cassandra.locator.InetAddressAndPort; -import org.apache.cassandra.net.RequestCallback; import org.apache.cassandra.net.Message; +import org.apache.cassandra.net.RequestCallback; +import org.apache.cassandra.utils.concurrent.Condition; import org.apache.cassandra.utils.concurrent.UncheckedInterruptedException; - import static java.util.concurrent.TimeUnit.NANOSECONDS; import static org.apache.cassandra.config.DatabaseDescriptor.getTruncateRpcTimeout; import static org.apache.cassandra.utils.Clock.Global.nanoTime; @@ -100,10 +100,10 @@ public void onResponse(Message message) } @Override - public void onFailure(InetAddressAndPort from, RequestFailureReason failureReason) + public void onFailure(InetAddressAndPort from, RequestFailure failure) { // If the truncation hasn't succeeded on some replica, abort and indicate this back to the client. - failureReasonByEndpoint.put(from, failureReason); + failureReasonByEndpoint.put(from, failure.reason); condition.signalAll(); } diff --git a/src/java/org/apache/cassandra/service/WaitStrategy.java b/src/java/org/apache/cassandra/service/WaitStrategy.java new file mode 100644 index 000000000000..cba13ca98728 --- /dev/null +++ b/src/java/org/apache/cassandra/service/WaitStrategy.java @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service; + +import java.util.concurrent.TimeUnit; + +public interface WaitStrategy +{ + // a value of below 0 means give up + long computeWaitUntil(int attempts); + // a value of below 0 means give up + long computeWait(int attempts, TimeUnit units); + + enum None implements WaitStrategy + { + INSTANCE; + + @Override + public long computeWait(int attempts, TimeUnit units) + { + return -1; + } + + @Override + public long computeWaitUntil(int attempts) + { + return -1; + } + } + +} diff --git a/src/java/org/apache/cassandra/service/accord/AbstractAccordSegmentCompactor.java b/src/java/org/apache/cassandra/service/accord/AbstractAccordSegmentCompactor.java new file mode 100644 index 000000000000..18f3de0633ba --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/AbstractAccordSegmentCompactor.java @@ -0,0 +1,226 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.service.accord; + +import java.io.IOException; +import java.util.Collection; +import java.util.Collections; +import java.util.PriorityQueue; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import accord.utils.Invariants; +import accord.utils.btree.BTree; +import accord.utils.btree.BulkIterator; +import accord.utils.btree.UpdateFunction; +import org.apache.cassandra.db.BufferClustering; +import org.apache.cassandra.db.ColumnFamilyStore; +import org.apache.cassandra.db.DecoratedKey; +import org.apache.cassandra.db.LivenessInfo; +import org.apache.cassandra.db.marshal.Int32Type; +import org.apache.cassandra.db.marshal.LongType; +import org.apache.cassandra.db.partitions.PartitionUpdate; +import org.apache.cassandra.db.rows.BTreeRow; +import org.apache.cassandra.db.rows.BufferCell; +import org.apache.cassandra.db.rows.ColumnData; +import org.apache.cassandra.db.rows.Row; +import org.apache.cassandra.exceptions.UnknownTableException; +import org.apache.cassandra.io.sstable.SSTableTxnWriter; +import org.apache.cassandra.io.util.DataInputBuffer; +import org.apache.cassandra.io.util.DataOutputBuffer; +import org.apache.cassandra.journal.SegmentCompactor; +import org.apache.cassandra.journal.StaticSegment; +import org.apache.cassandra.journal.StaticSegment.KeyOrderReader; +import org.apache.cassandra.service.ClientState; +import org.apache.cassandra.service.accord.AccordJournalValueSerializers.FlyweightImage; +import org.apache.cassandra.service.accord.AccordJournalValueSerializers.FlyweightSerializer; +import org.apache.cassandra.utils.NoSpamLogger; + +import static java.util.concurrent.TimeUnit.MINUTES; +import org.apache.cassandra.service.accord.serializers.Version; + +/** + * Segment compactor: takes static segments and compacts them into a single SSTable. + */ +public abstract class AbstractAccordSegmentCompactor implements SegmentCompactor +{ + protected static final Logger logger = LoggerFactory.getLogger(AbstractAccordSegmentCompactor.class); + private static final NoSpamLogger.NoSpamLogStatement unknownTable = NoSpamLogger.getStatement(logger, "Unknown (probably dropped) TableId {} reading {}; skipping record", 1L, MINUTES); + + static final Object[] rowTemplate = BTree.build(BulkIterator.of(new Object[2]), 2, UpdateFunction.noOp); + + protected final Version userVersion; + protected final ColumnData userVersionCell; + protected final ColumnFamilyStore cfs; + protected final long timestamp = ClientState.getTimestamp(); + + public AbstractAccordSegmentCompactor(Version userVersion, ColumnFamilyStore cfs) + { + this.userVersion = userVersion; + this.userVersionCell = BufferCell.live(AccordKeyspace.JournalColumns.user_version, timestamp, Int32Type.instance.decompose(userVersion.version)); + this.cfs = cfs; + } + + void switchPartitions() {} + + boolean considerWritingKey() + { + return false; + } + + abstract void initializeWriter(); + abstract SSTableTxnWriter writer(); + abstract void finishAndAddWriter(); + abstract Throwable cleanupWriter(Throwable t); + + @Override + public Collection> compact(Collection> segments) + { + Invariants.require(segments.size() >= 2, () -> String.format("Can only compact 2 or more segments, but got %d", segments.size())); + logger.info("Compacting {} static segments: {}", segments.size(), segments); + + PriorityQueue> readers = new PriorityQueue<>(); + for (StaticSegment segment : segments) + { + KeyOrderReader reader = segment.keyOrderReader(); + if (reader.advance()) + readers.add(reader); + else + reader.close(); + } + + // nothing to compact (all segments empty, should never happen, but it is theoretically possible?) - exit early + // TODO (required): investigate how this comes to be, check if there is a cleanup issue + if (readers.isEmpty()) + return Collections.emptyList(); + + initializeWriter(); + + JournalKey key = null; + FlyweightImage builder = null; + FlyweightSerializer serializer = null; + long firstDescriptor = -1, lastDescriptor = -1; + int firstOffset = -1, lastOffset = -1; + try + { + KeyOrderReader reader; + while ((reader = readers.poll()) != null) + { + if (key == null || !reader.key().equals(key)) + { + maybeWritePartition(key, builder, serializer, firstDescriptor, firstOffset); + switchPartitions(); + key = reader.key(); + serializer = (FlyweightSerializer) key.type.serializer; + builder = serializer.mergerFor(); + builder.reset(key); + firstDescriptor = lastDescriptor = -1; + firstOffset = lastOffset = -1; + } + + Version realVersion = Version.fromVersion(reader.descriptor.userVersion); + + boolean advanced; + do + { + if (builder == null) + { + builder = serializer.mergerFor(); + builder.reset(key); + } + + try (DataInputBuffer in = new DataInputBuffer(reader.record(), false)) + { + if (lastDescriptor != -1) + { + Invariants.require(reader.descriptor.timestamp <= lastDescriptor, + "Descriptors were accessed out of order: %d was accessed after %d", reader.descriptor.timestamp, lastDescriptor); + Invariants.require(reader.descriptor.timestamp != lastDescriptor || + reader.offset() < lastOffset, + "Offsets were accessed out of order: %d was accessed after %s", reader.offset(), lastOffset); + } + serializer.deserialize(key, builder, in, realVersion); + lastDescriptor = reader.descriptor.timestamp; + lastOffset = reader.offset(); + if (firstDescriptor == -1) + { + firstDescriptor = lastDescriptor; + firstOffset = lastOffset; + } + } + + if (considerWritingKey()) + { + maybeWritePartition(key, builder, serializer, firstDescriptor, firstOffset); + builder = null; + firstDescriptor = lastDescriptor = -1; + firstOffset = lastOffset = -1; + } + } + while ((advanced = reader.advance()) && reader.key().equals(key)); + + if (advanced) readers.offer(reader); // there is more to this reader, but not with this key + else reader.close(); + } + + maybeWritePartition(key, builder, serializer, firstDescriptor, firstOffset); + switchPartitions(); + } + catch (UnknownTableException e) + { + unknownTable.info(e.id, key); + } + catch (Throwable t) + { + t = cleanupWriter(t); + throw new RuntimeException(String.format("Caught exception while serializing. Last seen key: %s", key), t); + } + + finishAndAddWriter(); + return Collections.emptyList(); + } + + private JournalKey prevKey; + private DecoratedKey prevDecoratedKey; + + private void maybeWritePartition(JournalKey key, FlyweightImage builder, FlyweightSerializer serializer, long descriptor, int offset) throws IOException + { + if (builder != null) + { + DecoratedKey decoratedKey = AccordKeyspace.JournalColumns.decorate(key); + Invariants.requireArgument(prevKey == null || ((decoratedKey.compareTo(prevDecoratedKey) >= 0 ? 1 : -1) == (JournalKey.SUPPORT.compare(key, prevKey) >= 0 ? 1 : -1)), + "Partition key and JournalKey didn't have matching order, which may imply a serialization issue.\n%s (%s)\n%s (%s)", + key, decoratedKey, prevKey, prevDecoratedKey); + prevKey = key; + prevDecoratedKey = decoratedKey; + + Object[] rowData = rowTemplate.clone(); + try (DataOutputBuffer out = DataOutputBuffer.scratchBuffer.get()) + { + serializer.reserialize(key, builder, out, userVersion); + rowData[0] = BufferCell.live(AccordKeyspace.JournalColumns.record, timestamp, out.asNewBuffer()); + } + rowData[1] = userVersionCell; + Row row = BTreeRow.create(BufferClustering.make(LongType.instance.decompose(descriptor), Int32Type.instance.decompose(offset)), LivenessInfo.EMPTY, Row.Deletion.LIVE, rowData); + PartitionUpdate update = PartitionUpdate.singleRowUpdate(AccordKeyspace.Journal, decoratedKey, row); + writer().append(update.unfilteredIterator()); + } + } +} + diff --git a/src/java/org/apache/cassandra/service/accord/AccordCache.java b/src/java/org/apache/cassandra/service/accord/AccordCache.java new file mode 100644 index 000000000000..704ed3188192 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/AccordCache.java @@ -0,0 +1,1292 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.service.accord; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Collection; +import java.util.IdentityHashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.CopyOnWriteArrayList; +import java.util.concurrent.TimeUnit; +import java.util.function.BiConsumer; +import java.util.function.BiFunction; +import java.util.function.Function; +import java.util.function.ToLongFunction; +import java.util.stream.Stream; + +import javax.annotation.Nullable; + +import com.google.common.annotations.VisibleForTesting; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import accord.api.RoutingKey; +import accord.local.Command; +import accord.local.cfk.CommandsForKey; +import accord.local.cfk.Serialize; +import accord.primitives.Routable; +import accord.primitives.SaveStatus; +import accord.primitives.Txn; +import accord.primitives.TxnId; +import accord.utils.IntrusiveLinkedList; +import accord.utils.Invariants; +import accord.utils.QuadFunction; +import accord.utils.TriFunction; +import accord.utils.async.Cancellable; +import org.agrona.collections.Object2ObjectHashMap; +import org.apache.cassandra.cache.CacheSize; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.exceptions.UnknownTableException; +import org.apache.cassandra.io.util.DataInputBuffer; +import org.apache.cassandra.metrics.AccordCacheMetrics; +import org.apache.cassandra.metrics.CacheAccessMetrics; +import org.apache.cassandra.service.accord.AccordCacheEntry.Status; +import org.apache.cassandra.service.accord.events.CacheEvents; +import org.apache.cassandra.service.accord.serializers.Version; +import org.apache.cassandra.utils.NoSpamLogger; +import org.apache.cassandra.utils.NoSpamLogger.NoSpamLogStatement; +import org.apache.cassandra.utils.ObjectSizes; + +import static accord.utils.Invariants.illegalState; +import static accord.utils.Invariants.require; +import static org.apache.cassandra.service.accord.AccordCacheEntry.Status.EVICTED; +import static org.apache.cassandra.service.accord.AccordCacheEntry.Status.LOADED; +import static org.apache.cassandra.service.accord.AccordCacheEntry.Status.MODIFIED; + +/** + * Cache for AccordCommand and AccordCommandsForKey, available memory is shared between the two object types. + *

    + * Supports dynamic object sizes. After each acquire/free cycle, the cacheable objects size is recomputed to + * account for data added/removed during txn processing if it's modified flag is set + * + * TODO (required): we only iterate over unreferenced entries + */ +public class AccordCache implements CacheSize +{ + private static final Logger logger = LoggerFactory.getLogger(AccordCache.class); + private static final NoSpamLogStatement evictNoEvict = NoSpamLogger.getStatement(logger, "Found and expired {} marked no evict, with age {}, exceeding its expected max age of {}", 1L, TimeUnit.MINUTES); + + // Debug mode to verify that loading from journal + system tables results in + // functionally identical (or superceding) command to the one we've just evicted. + private static boolean VALIDATE_LOAD_ON_EVICT = false; + + @VisibleForTesting + public static void validateLoadOnEvict(boolean value) + { + VALIDATE_LOAD_ON_EVICT = value; + } + + public interface Adapter + { + @Nullable V load(AccordCommandStore commandStore, K key); + @Nullable Runnable save(AccordCommandStore commandStore, K key, @Nullable V value, @Nullable Object shrunk); + // a result of null means we can immediately evict, without saving + @Nullable V quickShrink(V value); + // a result of null means we cannot shrink, and should save/evict as appropriate + @Nullable Object fullShrink(K key, V value); + @Nullable V inflate(AccordCommandStore commandStore, K key, Object shrunk); + long estimateHeapSize(V value); + long estimateShrunkHeapSize(Object shrunk); + boolean validate(AccordCommandStore commandStore, K key, V value); + S safeRef(AccordCacheEntry node); + + default AccordCacheEntry newEntry(K key, AccordCache.Type.Instance owner) + { + return AccordCacheEntry.createReadyToLoad(key, owner); + } + } + + static class Stats + { + long queries; + long hits; + long misses; + } + + public static final class ImmutableStats + { + public final long queries; + public final long hits; + public final long misses; + + public ImmutableStats(Stats stats) + { + queries = stats.queries; + hits = stats.hits; + misses = stats.misses; + } + } + + private final List> types = new CopyOnWriteArrayList<>(); + private final Function saveExecutor; + private final AccordCacheEntry.OnSaved onSaved; + // TODO (required): monitor this queue and periodically clean up entries, or implement an eviction deadline system + private final IntrusiveLinkedList> evictQueue = new IntrusiveLinkedList<>(); + private final IntrusiveLinkedList> noEvictQueue = new IntrusiveLinkedList<>(); + + private int unreferencedBytes; + private int unreferenced; + private long maxSizeInBytes; + private long bytesCached; + private int noEvictGeneration; + private boolean shrinkingOn = true; + + @VisibleForTesting + final AccordCacheMetrics metrics; + final Stats stats = new Stats(); + + public AccordCache(Function saveExecutor, AccordCacheEntry.OnSaved onSaved, long maxSizeInBytes, AccordCacheMetrics metrics) + { + this.saveExecutor = saveExecutor; + this.onSaved = onSaved; + this.maxSizeInBytes = maxSizeInBytes; + this.metrics = metrics; + } + + @Override + public void setCapacity(long sizeInBytes) + { + maxSizeInBytes = sizeInBytes; + maybeShrinkOrEvictSomeNodes(); + } + + public void setShrinkingOn(boolean shrinkingOn) + { + this.shrinkingOn = shrinkingOn; + } + + @Override + public long capacity() + { + return maxSizeInBytes; + } + + /** + * Make sure we don't have any items lingering too long in the no evict queue, to avoid cache memory leaks + */ + void processNoEvictQueue() + { + noEvictGeneration = (noEvictGeneration + 1) & 0xffff; + if (noEvictQueue.isEmpty()) + return; + + Iterator> iter = noEvictQueue.iterator(); + int skipCount = 3; + while (skipCount > 0 && iter.hasNext()) + { + AccordCacheEntry entry = iter.next(); + int age = (noEvictGeneration - entry.noEvictGeneration()) & 0xffff; + if (age >= entry.noEvictMaxAge()) + { + evictNoEvict.warn(entry, age, entry.noEvictMaxAge()); + entry.unlink(); + evict(entry, true); + } + else + { + --skipCount; + } + } + } + + /* + * Roughly respects LRU semantics when evicting. Might consider prioritising keeping MODIFIED nodes around + * for longer to maximise the chances of hitting system tables fewer times (or not at all). + */ + private void maybeShrinkOrEvictSomeNodes() + { + while (bytesCached > maxSizeInBytes && !evictQueue.isEmpty()) + { + AccordCacheEntry node = evictQueue.peek(); + shrinkOrEvict(node); + } + } + + @VisibleForTesting + private void shrinkOrEvict(AccordCacheEntry node) + { + require(node.references() == 0); + + if (shrinkingOn && node.tryShrink()) + { + IntrusiveLinkedList> queue; + queue = node.isNoEvict() ? noEvictQueue : evictQueue; + node.unlink(); + queue.addLast(node); + } + else + { + tryEvict(node); + } + } + + @VisibleForTesting + public void tryEvict(AccordCacheEntry node) + { + require(node.references() == 0); + + if (node.isNoEvict()) + { + node.unlink(); + noEvictQueue.addLast(node); + return; + } + + Status status = node.status(); + switch (status) + { + default: throw new IllegalStateException("Unhandled status " + status); + case LOADING: + node.loading().loading.cancel(); + case WAITING_TO_LOAD: + Invariants.paranoid(node.loadingOrWaiting().waiters == null); + case LOADED: + node.unlink(); + evict(node, true); + break; + case MODIFIED: + Type parent = node.owner.parent(); + node.save(saveExecutor, parent.adapter, onSaved); + boolean evict = node.status() == LOADED; + node.unlink(); + if (evict) evict(node, true); + } + } + + private void evict(AccordCacheEntry node, boolean updateUnreferenced) + { + if (logger.isTraceEnabled()) + logger.trace("Evicting {}", node); + + require(node.isUnqueued()); + + if (updateUnreferenced) + { + unreferencedBytes -= node.sizeOnHeap; + --unreferenced; + } + bytesCached -= node.sizeOnHeap; + Type.Instance owner = node.owner; + Type parent = owner.parent(); + parent.bytesCached -= node.sizeOnHeap; + --parent.size; + + // TODO (expected): use listeners + if (node.status() == LOADED && VALIDATE_LOAD_ON_EVICT) + owner.validateLoadEvicted(node); + + AccordCacheEntry self = node.owner.cache.remove(node.key()); + Invariants.require(self.references() == 0); + require(self == node, "Leaked node detected; was attempting to remove %s but cache had %s", node, self); + node.notifyListeners(Listener::onEvict); + node.evicted(); + } + + Collection> load(BiFunction loadExecutor, P param, AccordCacheEntry node, AccordCacheEntry.OnLoaded onLoaded) + { + Type parent = node.owner.parent(); + return node.load(loadExecutor, param, parent.adapter, onLoaded).waiters(); + } + + void loaded(AccordCacheEntry node, V value) + { + node.loaded(value); + node.notifyListeners(Listener::onUpdate); + } + + void failedToLoad(AccordCacheEntry node) + { + Invariants.require(node.references() == 0); + if (node.isUnqueued()) + { + Invariants.require(node.status() == EVICTED); + return; + } + node.unlink(); + node.failedToLoad(); + evict(node, true); + } + + void saved(AccordCacheEntry node, Object identity, Throwable fail) + { + if (node.saved(identity, fail) && node.references() == 0) + evictQueue.addFirst(node); // add to front since we have just saved, so we were eligible for eviction + } + + public > void release(S safeRef, AccordTask owner) + { + safeRef.global().owner.release(safeRef, owner); + } + + public ImmutableStats stats() + { + return new ImmutableStats(stats); + } + + public > Type newType(Class keyClass, Adapter adapter) + { + Type instance = new Type<>(keyClass, adapter); + types.add(instance); + return instance; + } + + public > Type newType( + Class keyClass, + BiFunction loadFunction, + QuadFunction saveFunction, + Function quickShrink, + TriFunction validateFunction, + ToLongFunction heapEstimator, + Function, S> safeRefFactory) + { + return newType(keyClass, loadFunction, saveFunction, quickShrink, (i, j) -> j, (c, i, j) -> (V)j, validateFunction, heapEstimator, i -> 0, safeRefFactory); + } + + public > Type newType( + Class keyClass, + BiFunction loadFunction, + QuadFunction saveFunction, + Function quickShrink, + BiFunction fullShrink, + TriFunction inflate, + TriFunction validateFunction, + ToLongFunction heapEstimator, + ToLongFunction shrunkHeapEstimator, + Function, S> safeRefFactory) + { + return newType(keyClass, new FunctionalAdapter<>(loadFunction, saveFunction, quickShrink, + fullShrink, inflate, + validateFunction, heapEstimator, shrunkHeapEstimator, + safeRefFactory, AccordCacheEntry::createReadyToLoad)); + } + + public Collection> types() + { + return types; + } + + public interface Listener + { + default void onAdd(AccordCacheEntry state) {} + default void onUpdate(AccordCacheEntry state) {} + default void onEvict(AccordCacheEntry state) {} + } + + public class Type> implements CacheSize + { + public class Instance implements Iterable> + { + final AccordCommandStore commandStore; + // TODO (desired): don't need to store key separately as stored in node; ideally use a hash set that allows us to get the current entry + private final Map> cache = new Object2ObjectHashMap<>(); + private List> listeners = null; + + public Instance(AccordCommandStore commandStore) + { + this.commandStore = commandStore; + } + + public S acquire(K key) + { + AccordCacheEntry node = acquire(key, false); + return adapter.safeRef(node); + } + + public S acquireIfLoaded(K key) + { + AccordCacheEntry node = acquire(key, true); + if (node == null) + return null; + return adapter.safeRef(node); + } + + public S acquire(AccordCacheEntry node) + { + Invariants.require(node.owner == this); + acquireExisting(node, false); + return adapter.safeRef(node); + } + + public void recordPreAcquired(AccordSafeState ref) + { + Invariants.require(ref.global().owner == this); + incrementCacheHits(); + } + + private AccordCacheEntry acquire(K key, boolean onlyIfLoaded) + { + incrementCacheQueries(); + @SuppressWarnings("unchecked") + AccordCacheEntry node = cache.get(key); + return node == null + ? acquireAbsent(key, onlyIfLoaded) + : acquireExisting(node, onlyIfLoaded); + } + + /* + * Can only return a LOADING Node (or null) + */ + private AccordCacheEntry acquireAbsent(K key, boolean onlyIfLoaded) + { + incrementCacheMisses(); + if (onlyIfLoaded) + return null; + AccordCacheEntry node = adapter.newEntry(key, this); + node.increment(); + + Object prev = cache.put(key, node); + node.initSize(parent()); + Invariants.require(prev == null, "%s not absent from cache: %s already present", key, node); + ++size; + node.notifyListeners(Listener::onAdd); + maybeShrinkOrEvictSomeNodes(); + return node; + } + + /* + * Can't return EVICTED or INITIALIZED + */ + private AccordCacheEntry acquireExisting(AccordCacheEntry node, boolean onlyIfLoaded) + { + boolean isLoaded = node.isLoaded(); + if (isLoaded) + incrementCacheHits(); + else + incrementCacheMisses(); + + if (onlyIfLoaded && !isLoaded) + return null; + + if (node.increment() == 1) + { + --unreferenced; + unreferencedBytes -= node.sizeOnHeap; + node.unlink(); + } + + return node; + } + + public void release(AccordSafeState safeRef, AccordTask owner) + { + K key = safeRef.global().key(); + logger.trace("Releasing resources for {}: {}", key, safeRef); + + AccordCacheEntry node = cache.get(key); + + require(!safeRef.invalidated()); + require(safeRef.global() != null, "safeRef node is null for %s", key); + require(safeRef.global() == node, "safeRef node not in map: %s != %s", safeRef.global(), node); + require(node.references() > 0, "references (%d) are zero for %s (%s)", node.references(), key, node); + require(node.isUnqueued()); + + boolean evict = false; + if (safeRef.hasUpdate()) + { + V update = safeRef.current(); + if (update != null) + update = adapter.quickShrink(update); + node.setExclusive(update); + if (update == null) + { + if (node.is(MODIFIED)) + node.saved(); + evict = true; + } + node.notifyListeners(Listener::onUpdate); + } + else if (node.isLoadingOrWaiting()) + { + node.loadingOrWaiting().remove(owner); + } + else + { + evict = node.is(LOADED) && node.isNull(); + } + safeRef.invalidate(); + + if (node.decrement() == 0) + { + if (evict) + { + evict(node, false); + return; + } + + ++unreferenced; + unreferencedBytes += node.sizeOnHeap; + Status status = node.status(); // status() completes + switch (status) + { + default: throw new IllegalStateException("Unhandled status " + status); + case WAITING_TO_LOAD: + case LOADING: + case LOADED: + case MODIFIED: + logger.trace("Moving {} with status {} to eviction queue", key, status); + evictQueue.addLast(node); + + case SAVING: + case FAILED_TO_SAVE: + break; // can never evict, so no point in adding to eviction queue either + } + } + + maybeShrinkOrEvictSomeNodes(); + } + + public Stream> stream() + { + return cache.values().stream(); + } + + Type parent() + { + return Type.this; + } + + @Override + public Iterator> iterator() + { + return stream().iterator(); + } + + void validateLoadEvicted(AccordCacheEntry node) + { + @SuppressWarnings("unchecked") + AccordCacheEntry state = (AccordCacheEntry) node; + K key = state.key(); + V evicted = state.tryGetFull(); + if (evicted == null) + { + try + { + Object shrunk = state.tryGetShrunk(); + if (shrunk != null) + evicted = adapter.inflate(commandStore, key, shrunk); + } + catch (RuntimeException rte) + { + if (rte.getCause() instanceof UnknownTableException) + return; + throw rte; + } + } + if (!adapter.validate(node.owner.commandStore, key, evicted)) + throw new IllegalStateException("Reloaded value for key " + key + " is not equal to or fuller than evicted value " + evicted); + } + + @VisibleForTesting + public AccordCacheEntry getUnsafe(K key) + { + return cache.get(key); + } + + public Set keySet() + { + return cache.keySet(); + } + + @VisibleForTesting + public boolean isReferenced(K key) + { + AccordCacheEntry node = cache.get(key); + return node != null && node.references() > 0; + } + + @VisibleForTesting + boolean keyIsReferenced(Object key, Class> valClass) + { + AccordCacheEntry node = cache.get(key); + return node != null && node.references() > 0; + } + + @VisibleForTesting + boolean keyIsCached(Object key, Class> valClass) + { + AccordCacheEntry node = cache.get(key); + return node != null; + } + + @VisibleForTesting + int references(Object key, Class> valClass) + { + AccordCacheEntry node = cache.get(key); + return node != null ? node.references() : 0; + } + + void notifyListeners(BiConsumer, AccordCacheEntry> notify, AccordCacheEntry node) + { + notifyListeners(listeners, notify, node); + notifyListeners(typeListeners, notify, node); + } + + void notifyListeners(List> listeners, BiConsumer, AccordCacheEntry> notify, AccordCacheEntry node) + { + if (listeners != null) + { + for (int i = 0, size = listeners.size() ; i < size ; ++i) + notify.accept(listeners.get(i), node); + + } + } + + public void register(Listener l) + { + if (listeners == null) + listeners = new ArrayList<>(); + listeners.add(l); + } + + public void unregister(Listener l) + { + if (!tryUnregister(l)) + throw illegalState("Listener was not registered"); + } + + public boolean tryUnregister(Listener l) + { + if (listeners == null || !listeners.remove(l)) + return false; + if (listeners.isEmpty()) + listeners = null; + return true; + } + + } + + private final Class keyClass; + private Adapter adapter; + private long bytesCached; + private int size; + + @VisibleForTesting + final CacheAccessMetrics typeMetrics; + private final Stats stats = new Stats(); + private List> typeListeners = null; + + public Type( + Class keyClass, + Adapter adapter) + { + this.keyClass = keyClass; + this.adapter = adapter; + this.typeMetrics = metrics.forInstance(keyClass); + } + + void updateSize(long newSize, long delta, boolean isUnreferenced, boolean updateHistogram) + { + // TODO (expected): deprecate this in favour of a histogram snapshot of any point in time + bytesCached += delta; + AccordCache.this.bytesCached += delta; + if (updateHistogram) metrics.objectSize.update(newSize); + if (isUnreferenced) AccordCache.this.unreferencedBytes += delta; + } + + // can be safely garbage collected if empty + Instance newInstance(AccordCommandStore commandStore) + { + return new Instance(commandStore); + } + + private void incrementCacheQueries() + { + typeMetrics.requests.mark(); + metrics.requests.mark(); + stats.queries++; + AccordCache.this.stats.queries++; + } + + private void incrementCacheHits() + { + typeMetrics.hits.mark(); + metrics.hits.mark(); + stats.hits++; + AccordCache.this.stats.hits++; + } + + private void incrementCacheMisses() + { + typeMetrics.misses.mark(); + metrics.misses.mark(); + stats.misses++; + AccordCache.this.stats.misses++; + } + + AccordCache parent() + { + return AccordCache.this; + } + + public Stats stats() + { + return stats; + } + + public ImmutableStats statsSnapshot() + { + return new ImmutableStats(stats); + } + + public Stats globalStats() + { + return AccordCache.this.stats; + } + + @VisibleForTesting + public void unsafeSetLoadFunction(BiFunction loadFunction) + { + if (adapter.getClass() != SettableWrapper.class) + adapter = new SettableWrapper<>(adapter); + ((SettableWrapper)adapter).load = loadFunction; + } + + public BiFunction unsafeGetLoadFunction() + { + if (adapter.getClass() != SettableWrapper.class) + adapter = new SettableWrapper<>(adapter); + return ((SettableWrapper)adapter).load; + } + + Adapter adapter() + { + return adapter; + } + + @Override + public long capacity() + { + return AccordCache.this.capacity(); + } + + @Override + public void setCapacity(long capacity) + { + throw new UnsupportedOperationException("Capacity is shared between all instances. Please set the capacity on the global cache"); + } + + @Override + public int size() + { + return size; + } + + @Override + public long weightedSize() + { + return bytesCached; + } + + public long globalAllocated() + { + return AccordCache.this.bytesCached; + } + + public int globalReferencedEntries() + { + return AccordCache.this.numReferencedEntries(); + } + + public int globalUnreferencedEntries() + { + return AccordCache.this.numUnreferencedEntries(); + } + + public void register(Listener l) + { + if (typeListeners == null) + typeListeners = new ArrayList<>(); + typeListeners.add(l); + } + + public void unregister(Listener l) + { + if (typeListeners == null) + throw new AssertionError("No listeners exist"); + if (!typeListeners.remove(l)) + throw new AssertionError("Listener was not registered"); + if (typeListeners.isEmpty()) + typeListeners = null; + } + + @Override + public String toString() + { + return "Instance{" + + ", keyClass=" + keyClass + + '}'; + } + } + + @VisibleForTesting + AccordCacheEntry head() + { + Iterator> iter = evictQueue.iterator(); + return iter.hasNext() ? iter.next() : null; + } + + @VisibleForTesting + AccordCacheEntry tail() + { + AccordCacheEntry last = null; + Iterator> iter = evictQueue.iterator(); + while (iter.hasNext()) + last = iter.next(); + return last; + } + + public boolean isEmpty() + { + return size() == 0; + } + + Iterable> evictionQueue() + { + return evictQueue::iterator; + } + + private int cacheSize() + { + int size = 0; + for (Type type : types) + size += type.size(); + return size; + } + + @VisibleForTesting + int numReferencedEntries() + { + return cacheSize() - unreferenced; + } + + @VisibleForTesting + int numUnreferencedEntries() + { + return unreferenced; + } + + @VisibleForTesting + int unreferencedBytes() + { + return unreferencedBytes; + } + + @Override + public int size() + { + return cacheSize(); + } + + @Override + public long weightedSize() + { + return bytesCached; + } + + static void registerJfrListener(int shardId, AccordCache.Type type, String name) + { + if (!DatabaseDescriptor.getAccordStateCacheListenerJFREnabled()) + return; + + type.register(new AccordCache.Listener<>() { + private final IdentityHashMap, CacheEvents.Evict> pendingEvicts = new IdentityHashMap<>(); + + @Override + public void onAdd(AccordCacheEntry state) + { + CacheEvents.Add add = new CacheEvents.Add(); + CacheEvents.Evict evict = new CacheEvents.Evict(); + if (!add.isEnabled()) + return; + add.begin(); + evict.begin(); + add.shard = evict.shard = shardId; + add.instance = evict.instance = name; + add.key = evict.key = state.key().toString(); + updateMutable(type, state, add); + add.commit(); + pendingEvicts.put(state, evict); + } + + @Override + public void onEvict(AccordCacheEntry state) + { + CacheEvents.Evict event = pendingEvicts.remove(state); + if (event == null) return; + updateMutable(type, state, event); + event.commit(); + } + }); + } + + private static void updateMutable(AccordCache.Type type, AccordCacheEntry state, CacheEvents event) + { + event.status = state.status().name(); + + event.lastQueriedEstimatedSizeOnHeap = state.sizeOnHeap(); + + event.instanceAllocated = type.weightedSize(); + AccordCache.Stats stats = type.stats(); + event.instanceStatsQueries = stats.queries; + event.instanceStatsHits = stats.hits; + event.instanceStatsMisses = stats.misses; + + event.globalSize = type.size(); + event.globalReferenced = type.globalReferencedEntries(); + event.globalUnreferenced = type.globalUnreferencedEntries(); + event.globalCapacity = type.capacity(); + event.globalAllocated = type.globalAllocated(); + + stats = type.globalStats(); + event.globalStatsQueries = stats.queries; + event.globalStatsHits = stats.hits; + event.globalStatsMisses = stats.misses; + + event.update(); + } + + static class FunctionalAdapter implements Adapter + { + final BiFunction load; + final QuadFunction save; + final Function quickShrink; + final BiFunction shrink; + final TriFunction inflate; + final TriFunction validate; + final ToLongFunction estimateHeapSize; + final ToLongFunction estimateShrunkHeapSize; + final Function, S> newSafeRef; + final BiFunction.Instance, AccordCacheEntry> newNode; + + FunctionalAdapter(BiFunction load, + QuadFunction save, + Function quickShrink, BiFunction shrink, + TriFunction inflate, + TriFunction validate, + ToLongFunction estimateHeapSize, + ToLongFunction estimateShrunkHeapSize, + Function, S> newSafeRef, + BiFunction.Instance, AccordCacheEntry> newNode) + { + this.load = load; + this.save = save; + this.shrink = shrink; + this.quickShrink = quickShrink; + this.inflate = inflate; + this.validate = validate; + this.estimateHeapSize = estimateHeapSize; + this.estimateShrunkHeapSize = estimateShrunkHeapSize; + this.newSafeRef = newSafeRef; + this.newNode = newNode; + } + + FunctionalAdapter(Adapter wrap) + { + this(wrap::load, wrap::save, wrap::quickShrink, wrap::fullShrink, wrap::inflate, wrap::validate, wrap::estimateHeapSize, wrap::estimateShrunkHeapSize, wrap::safeRef, wrap::newEntry); + } + + @Override + public V load(AccordCommandStore commandStore, K key) + { + return load.apply(commandStore, key); + } + + @Override + public Runnable save(AccordCommandStore commandStore, K key, @Nullable V value, @Nullable Object shrunk) + { + return save.apply(commandStore, key, value, shrunk); + } + + @Override + public V quickShrink(V value) + { + return quickShrink.apply(value); + } + + @Override + public Object fullShrink(K key, V value) + { + return shrink.apply(key, value); + } + + @Override + public V inflate(AccordCommandStore commandStore, K key, Object shrunk) + { + return inflate.apply(commandStore, key, shrunk); + } + + @Override + public boolean validate(AccordCommandStore commandStore, K key, V value) + { + return validate.apply(commandStore, key, value); + } + + @Override + public long estimateHeapSize(V value) + { + return estimateHeapSize.applyAsLong(value); + } + + @Override + public long estimateShrunkHeapSize(Object shrunk) + { + return estimateShrunkHeapSize.applyAsLong(shrunk); + } + + @Override + public S safeRef(AccordCacheEntry node) + { + return newSafeRef.apply(node); + } + + @Override + public AccordCacheEntry newEntry(K key, Type.Instance owner) + { + return newNode.apply(key, owner); + } + } + + static class SettableWrapper extends FunctionalAdapter + { + volatile BiFunction load; + + SettableWrapper(Adapter wrapper) + { + super(wrapper); + this.load = super.load; + } + + public static Adapter loadOnly(BiFunction load) + { + SettableWrapper result = new SettableWrapper<>(new NoOpAdapter<>()); + result.load = load; + return result; + } + + @Override + public V load(AccordCommandStore commandStore, K key) + { + return load.apply(commandStore, key); + } + } + + static class NoOpAdapter implements Adapter + { + @Override public V load(AccordCommandStore commandStore, K key) { return null; } + @Override public Runnable save(AccordCommandStore commandStore, K key, @Nullable V value, @Nullable Object shrunk) { return null; } + @Override public V quickShrink(V value) { return null; } + @Override public Object fullShrink(K key, V value) { return null; } + @Override public V inflate(AccordCommandStore commandStore, K key, Object shrunk) { return null; } + @Override public long estimateHeapSize(V value) { return 0; } + @Override public long estimateShrunkHeapSize(Object shrunk) { return 0; } + @Override public boolean validate(AccordCommandStore commandStore, K key, V value) { return false; } + @Override public S safeRef(AccordCacheEntry node) { return null; } + } + + public static class CommandsForKeyAdapter implements Adapter + { + public static final CommandsForKeyAdapter CFK_ADAPTER = new CommandsForKeyAdapter(); + private CommandsForKeyAdapter() {} + + @Override + public CommandsForKey load(AccordCommandStore commandStore, RoutingKey key) + { + return commandStore.loadCommandsForKey(key); + } + + @Override + public Runnable save(AccordCommandStore commandStore, RoutingKey key, @Nullable CommandsForKey value, @Nullable Object serialized) + { + return commandStore.saveCommandsForKey(key, value, serialized); + } + + @Override + public CommandsForKey quickShrink(CommandsForKey value) + { + return value; + } + + @Override + public Object fullShrink(RoutingKey key, CommandsForKey value) + { + if (value.isEmpty()) + return null; + + return Serialize.toBytesWithoutKey(value.maximalPrune()); + } + + @Override + public CommandsForKey inflate(AccordCommandStore commandStore, RoutingKey key, Object shrunk) + { + return Serialize.fromBytes(key, (ByteBuffer)shrunk); + } + + @Override + public long estimateHeapSize(CommandsForKey value) + { + return AccordObjectSizes.commandsForKey(value); + } + + @Override + public long estimateShrunkHeapSize(Object shrunk) + { + return ObjectSizes.sizeOnHeapOf((ByteBuffer) shrunk); + } + + @Override + public boolean validate(AccordCommandStore commandStore, RoutingKey key, CommandsForKey value) + { + return commandStore.validateCommandsForKey(key, value); + } + + @Override + public AccordSafeCommandsForKey safeRef(AccordCacheEntry node) + { + return new AccordSafeCommandsForKey(node); + } + } + + public static class CommandAdapter implements Adapter + { + public static final CommandAdapter COMMAND_ADAPTER = new CommandAdapter(); + private CommandAdapter() {} + + @Override + public Command load(AccordCommandStore commandStore, TxnId txnId) + { + Invariants.require(!txnId.is(Txn.Kind.EphemeralRead)); + return commandStore.loadCommand(txnId); + } + + @Override + public Runnable save(AccordCommandStore commandStore, TxnId txnId, @Nullable Command value, @Nullable Object serialized) + { + if (txnId.is(Routable.Domain.Key)) + return null; + + if (value == null) + { + value = inflate(commandStore, txnId, serialized); + if (value == null) + return null; + } + + return null; + } + + @Override + public Command quickShrink(Command value) + { + if (value.saveStatus() == SaveStatus.Uninitialised) + return null; + if (value.txnId().is(Txn.Kind.EphemeralRead) && value.saveStatus().compareTo(SaveStatus.ReadyToExecute) >= 0) + return null; // TODO (expected): should we manage this with the waiting callback? more work, but maybe cleaner/clearer/safer + return AccordCommandStore.prepareToCache(value); + } + + @Override + public Object fullShrink(TxnId txnId, Command value) + { + if (txnId.is(Txn.Kind.EphemeralRead)) + Invariants.require(value.saveStatus().compareTo(SaveStatus.ReadyToExecute) < 0); + + try + { + return AccordJournal.asSerializedChange(null, value, Version.LATEST); + } + catch (IOException e) + { + logger.warn("Failed to serialize {}", value, e); + return null; + } + } + + @Override + public @Nullable Command inflate(AccordCommandStore commandStore, TxnId key, Object serialized) + { + AccordJournal.Builder builder = new AccordJournal.Builder(key); + ByteBuffer buffer = (ByteBuffer) serialized; + buffer.mark(); + try (DataInputBuffer buf = new DataInputBuffer(buffer, false)) + { + builder.deserializeNext(buf, Version.LATEST); + return builder.construct(commandStore.unsafeGetRedundantBefore()); + } + catch (UnknownTableException e) + { + // TODO (required): log, and make sure callers correctly handle null + return null; + } + catch (IOException e) + { + // TODO (required): test and make sure recover safely from exceptions OR log and return null + throw new RuntimeException(e); + } + finally + { + buffer.reset(); + } + } + + @Override + public long estimateHeapSize(Command value) + { + return AccordObjectSizes.command(value); + } + + @Override + public long estimateShrunkHeapSize(Object shrunk) + { + return ObjectSizes.sizeOnHeapOf((ByteBuffer) shrunk); + } + + @Override + public boolean validate(AccordCommandStore commandStore, TxnId key, Command value) + { + return commandStore.validateCommand(key, value); + } + + @Override + public AccordSafeCommand safeRef(AccordCacheEntry node) + { + return new AccordSafeCommand(node); + } + + @Override + public AccordCacheEntry newEntry(TxnId txnId, Type.Instance owner) + { + AccordCacheEntry node = new AccordCacheEntry<>(txnId, owner); + if (txnId.is(Txn.Kind.EphemeralRead)) + { + node.initialize(null); + int maxAge = (int)Math.min(0xff, 1 + DatabaseDescriptor.getReadRpcTimeout(TimeUnit.SECONDS)); + node.markNoEvict(owner.parent().parent().noEvictGeneration, maxAge); + } + else + { + node.readyToLoad(); + } + return node; + } + } +} diff --git a/src/java/org/apache/cassandra/service/accord/AccordCacheEntry.java b/src/java/org/apache/cassandra/service/accord/AccordCacheEntry.java new file mode 100644 index 000000000000..f03d01c55851 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/AccordCacheEntry.java @@ -0,0 +1,670 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.service.accord; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.concurrent.atomic.AtomicIntegerFieldUpdater; +import java.util.function.BiConsumer; +import java.util.function.BiFunction; +import java.util.function.Function; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.primitives.Ints; + +import accord.utils.ArrayBuffers.BufferList; +import accord.utils.IntrusiveLinkedListNode; +import accord.utils.Invariants; +import accord.utils.async.Cancellable; +import org.apache.cassandra.service.accord.AccordCache.Adapter; +import org.apache.cassandra.utils.ObjectSizes; + +import static org.apache.cassandra.service.accord.AccordCacheEntry.Status.EVICTED; +import static org.apache.cassandra.service.accord.AccordCacheEntry.Status.FAILED_TO_LOAD; +import static org.apache.cassandra.service.accord.AccordCacheEntry.Status.FAILED_TO_SAVE; +import static org.apache.cassandra.service.accord.AccordCacheEntry.Status.LOADED; +import static org.apache.cassandra.service.accord.AccordCacheEntry.Status.LOADING; +import static org.apache.cassandra.service.accord.AccordCacheEntry.Status.MODIFIED; +import static org.apache.cassandra.service.accord.AccordCacheEntry.Status.SAVING; +import static org.apache.cassandra.service.accord.AccordCacheEntry.Status.WAITING_TO_LOAD; + +/** + * Global (per CommandStore) state of a cached entity (Command or CommandsForKey). + */ +public class AccordCacheEntry extends IntrusiveLinkedListNode +{ + public enum Status + { + UNINITIALIZED, + WAITING_TO_LOAD(UNINITIALIZED), + LOADING(WAITING_TO_LOAD), + /** + * Consumers should never see this state + */ + FAILED_TO_LOAD(LOADING), + + LOADED(true, false, UNINITIALIZED, LOADING), + MODIFIED(true, false, LOADED), + SAVING(true, true, MODIFIED), + + /** + * Attempted to save but failed. Shouldn't normally happen unless we have a bug in serialization, + * or commit log has been stopped. + */ + FAILED_TO_SAVE(true, true, SAVING), + + UNUSED, // spacing to permit easier bit masks + + EVICTED(WAITING_TO_LOAD, LOADING, LOADED, FAILED_TO_LOAD), + ; + + static final Status[] VALUES = values(); + static + { + MODIFIED.permittedFrom |= 1 << MODIFIED.ordinal(); + MODIFIED.permittedFrom |= 1 << SAVING.ordinal(); + MODIFIED.permittedFrom |= 1 << FAILED_TO_SAVE.ordinal(); + LOADED.permittedFrom |= 1 << SAVING.ordinal(); + LOADED.permittedFrom |= 1 << MODIFIED.ordinal(); + for (Status status : VALUES) + { + Invariants.require((status.ordinal() & IS_LOADED) != 0 == status.loaded); + Invariants.require(((status.ordinal() & IS_LOADED) != 0 && (status.ordinal() & IS_NESTED) != 0) == status.nested); + } + } + + final boolean loaded; + final boolean nested; + int permittedFrom; + + Status(Status ... statuses) + { + this(false, false, statuses); + } + + Status(boolean loaded, boolean nested, Status ... statuses) + { + this.loaded = loaded; + this.nested = nested; + for (Status status : statuses) + permittedFrom |= 1 << status.ordinal(); + } + } + + static final int STATUS_MASK = 0x0000001F; + static final int SHRUNK = 0x00000040; + static final int NO_EVICT = 0x00000020; + static final int IS_LOADED = 0x4; + static final int IS_NESTED = 0x2; // only valid to test if already tested NORMAL + static final int IS_LOADING_OR_WAITING_MASK = 0x6; // only valid to test if already tested NORMAL + static final int IS_LOADING_OR_WAITING = 0x2; // only valid to test if already tested NORMAL + static final long EMPTY_SIZE = ObjectSizes.measure(new AccordCacheEntry<>(null, null)); + + private final K key; + final AccordCache.Type.Instance owner; + + private Object state; + private int status; + int sizeOnHeap; + private volatile int references; + private static final AtomicIntegerFieldUpdater referencesUpdater = AtomicIntegerFieldUpdater.newUpdater(AccordCacheEntry.class, "references"); + + AccordCacheEntry(K key, AccordCache.Type.Instance owner) + { + this.key = key; + this.owner = owner; + } + + void unlink() + { + remove(); + } + + boolean isUnqueued() + { + return isFree(); + } + + public K key() + { + return key; + } + + public int references() + { + return references; + } + + public int increment() + { + return referencesUpdater.incrementAndGet(this); + } + + public int decrement() + { + return referencesUpdater.decrementAndGet(this); + } + + boolean isLoaded() + { + return (status & IS_LOADED) != 0; + } + + boolean isNested() + { + Invariants.require(isLoaded()); + return (status & IS_NESTED) != 0; + } + + boolean isShrunk() + { + return (status & SHRUNK) != 0; + } + + public boolean is(Status status) + { + return (this.status & STATUS_MASK) == status.ordinal(); + } + + boolean isLoadingOrWaiting() + { + return (status & IS_LOADING_OR_WAITING_MASK) == IS_LOADING_OR_WAITING; + } + + public boolean isComplete() + { + return !is(LOADING) && !is(SAVING); + } + + int noEvictGeneration() + { + Invariants.require(isNoEvict()); + return (status >>> 8) & 0xffff; + } + + int noEvictMaxAge() + { + Invariants.require(isNoEvict()); + return status >>> 24; + } + + boolean isNoEvict() + { + return (status & NO_EVICT) != 0; + } + + int sizeOnHeap() + { + return sizeOnHeap; + } + + void updateSize(AccordCache.Type parent) + { + // TODO (expected): we aren't weighing the keys + int newSizeOnHeap = Ints.saturatedCast(EMPTY_SIZE + estimateOnHeapSize(parent.adapter())); + parent.updateSize(newSizeOnHeap, newSizeOnHeap - sizeOnHeap, references == 0, true); + sizeOnHeap = newSizeOnHeap; + } + + void initSize(AccordCache.Type parent) + { + // TODO (expected): we aren't weighing the keys + sizeOnHeap = Ints.saturatedCast(EMPTY_SIZE); + parent.updateSize(sizeOnHeap, sizeOnHeap, false, false); + } + + @Override + public String toString() + { + return "Node{" + status() + + ", key=" + key() + + ", references=" + references + + "}@" + Integer.toHexString(System.identityHashCode(this)); + } + + public Status status() + { + return Status.VALUES[(status & STATUS_MASK)]; + } + + private void setStatus(Status newStatus) + { + Invariants.require((newStatus.permittedFrom & (1 << (status & STATUS_MASK))) != 0, "%s not permitted from %s", newStatus, status()); + setStatusUnsafe(newStatus); + } + + private void setStatusUnsafe(Status newStatus) + { + status &= ~STATUS_MASK; + status |= newStatus.ordinal(); + } + + public void initialize(V value) + { + Invariants.require(state == null); + setStatus(LOADED); + state = value; + } + + public void readyToLoad() + { + Invariants.require(state == null); + setStatus(WAITING_TO_LOAD); + state = new WaitingToLoad(); + } + + public void markNoEvict(int generation, int maxAge) + { + Invariants.require((maxAge & ~0xff) == 0); + Invariants.require((generation & ~0xffff) == 0); + status |= NO_EVICT; + status |= generation << 8; + status |= maxAge << 24; + } + + public LoadingOrWaiting loadingOrWaiting() + { + return (LoadingOrWaiting)state; + } + + void notifyListeners(BiConsumer, AccordCacheEntry> notify) + { + owner.notifyListeners(notify, this); + } + + public interface OnLoaded + { + void onLoaded(AccordCacheEntry state, V value, Throwable fail); + + static OnLoaded immediate() + { + return new OnLoaded() + { + @Override + public void onLoaded(AccordCacheEntry state, V value, Throwable fail) + { + if (fail == null) state.loaded(value); + else state.failedToLoad(); + } + }; + } + } + + public interface OnSaved + { + void onSaved(AccordCacheEntry state, Object identity, Throwable fail); + + static OnSaved immediate() + { + return new OnSaved() + { + @Override + public void onSaved(AccordCacheEntry state, Object identity, Throwable fail) + { + state.saved(identity, fail); + } + }; + } + } + + public

    Loading load(BiFunction loadExecutor, P param, Adapter adapter, OnLoaded onLoaded) + { + Invariants.require(is(WAITING_TO_LOAD), "%s", this); + Loading loading = ((WaitingToLoad)state).load(loadExecutor.apply(param, () -> { + V result; + try + { + result = adapter.load(owner.commandStore, key); + } + catch (Throwable t) + { + onLoaded.onLoaded(this, null, t); + throw t; + } + onLoaded.onLoaded(this, result, null); + })); + setStatus(LOADING); + state = loading; + return loading; + } + + public Loading testLoad() + { + Invariants.require(is(WAITING_TO_LOAD)); + Loading loading = ((WaitingToLoad)state).load(() -> {}); + setStatus(LOADING); + state = loading; + return loading; + } + + public Loading loading() + { + Invariants.require(is(LOADING), "%s", this); + return (Loading) state; + } + + // must own the cache's lock when invoked. this is true of most methods in the class, + // but this one is less obvious so named as to draw attention + public V getExclusive() + { + Invariants.require(owner == null || owner.commandStore == null || owner.commandStore.executor().isOwningThread()); + Invariants.require(isLoaded(), "%s", this); + if (isShrunk()) + { + AccordCache.Type parent = owner.parent(); + inflate(owner.commandStore, key, parent.adapter()); + updateSize(parent); + } + + return (V)unwrap(); + } + + private Object unwrap() + { + return isNested() ? ((Nested)state).state : state; + } + + // must own the cache's lock when invoked + void setExclusive(V value) + { + if (value == state) + return; + + Saving cancel = is(SAVING) ? ((Saving)state) : null; + setStatus(MODIFIED); + state = value; + updateSize(owner.parent()); + // TODO (expected): do we want to always cancel in-progress saving? + if (cancel != null) + cancel.saving.cancel(); + } + + public void loaded(V value) + { + setStatus(LOADED); + state = value; + updateSize(owner.parent()); + } + + public void testLoaded(V value) + { + setStatus(LOADED); + state = value; + } + + public void failedToLoad() + { + setStatus(FAILED_TO_LOAD); + state = null; + } + + boolean tryShrink() + { + if (!isLoaded()) + return false; + + AccordCache.Type parent = owner.parent(); + if (!tryShrink(key, parent.adapter())) + return false; + updateSize(parent); + return true; + } + + V tryGetFull() + { + return isShrunk() ? null : (V)unwrap(); + } + + Object tryGetShrunk() + { + return isShrunk() ? unwrap() : null; + } + + boolean isNull() + { + return state == null; + } + + /** + * Submits a save runnable to the specified executor. When the runnable + * has completed, the state save will have either completed or failed. + */ + @VisibleForTesting + void save(Function saveExecutor, Adapter adapter, OnSaved onSaved) + { + V full = isShrunk() ? null : (V)state; + Object shrunk = isShrunk() ? state : null; + Runnable save = adapter.save(owner.commandStore, key, full, shrunk); + if (null == save) // null mutation -> null Runnable -> no change on disk + { + setStatus(LOADED); + } + else + { + setStatus(SAVING); + Object identity = new Object(); + Cancellable saving = saveExecutor.apply(() -> { + try + { + save.run(); + } + catch (Throwable t) + { + onSaved.onSaved(this, identity, t); + throw t; + } + onSaved.onSaved(this, identity, null); + }); + state = new Saving(saving, identity, state); + } + } + + boolean saved(Object identity, Throwable fail) + { + if (!is(SAVING)) + return false; + + Saving saving = (Saving) state; + if (saving.identity != identity) + return false; + + if (fail != null) + { + setStatus(FAILED_TO_SAVE); + state = new FailedToSave(fail, ((Saving)state).state); + return false; + } + else + { + setStatus(LOADED); + state = saving.state; + return true; + } + } + + protected void saved() + { + Invariants.require(is(MODIFIED)); + setStatus(LOADED); + } + + public Cancellable saving() + { + return ((Saving)state).saving; + } + + public AccordCacheEntry evicted() + { + if (isNoEvict()) + setStatusUnsafe(EVICTED); + else setStatus(EVICTED); + state = null; + return this; + } + + public Throwable failure() + { + return ((FailedToSave)state).cause; + } + + private boolean tryShrink(K key, Adapter adapter) + { + Invariants.require(!isNested()); + if (isShrunk() || state == null) + return false; + + Object update = adapter.fullShrink(key, (V)state); + if (update == null || update == state) + return false; + + state = update; + status |= SHRUNK; + return true; + } + + private void inflate(AccordCommandStore commandStore, K key, Adapter adapter) + { + Invariants.require(isShrunk()); + if (isNested()) + { + Nested nested = (Nested) state; + nested.state = adapter.inflate(commandStore, key, nested.state); + } + else + { + state = adapter.inflate(commandStore, key, state); + } + status &= ~SHRUNK; + } + + private long estimateOnHeapSize(Adapter adapter) + { + Object current = unwrap(); + if (current == null) return 0; + else if (isShrunk()) return adapter.estimateShrunkHeapSize(current); + return adapter.estimateHeapSize((V)current); + } + + public static abstract class LoadingOrWaiting + { + Collection> waiters; + + public LoadingOrWaiting() + { + } + + public LoadingOrWaiting(Collection> waiters) + { + this.waiters = waiters; + } + + public Collection> waiters() + { + return waiters != null ? waiters : Collections.emptyList(); + } + + public BufferList> copyWaiters() + { + BufferList> list = new BufferList<>(); + if (waiters != null) + list.addAll(waiters); + return list; + } + + public void add(AccordTask waiter) + { + if (waiters == null) + waiters = new ArrayList<>(); + waiters.add(waiter); + } + + public void remove(AccordTask waiter) + { + if (waiters != null) + { + waiters.remove(waiter); + if (waiters.isEmpty()) + waiters = null; + } + } + } + + static class WaitingToLoad extends LoadingOrWaiting + { + public Loading load(Cancellable loading) + { + Invariants.paranoid(waiters == null || !waiters.isEmpty()); + Loading result = new Loading(waiters, loading); + waiters = Collections.emptyList(); + return result; + } + } + + static class Loading extends LoadingOrWaiting + { + public final Cancellable loading; + + public Loading(Collection> waiters, Cancellable loading) + { + super(waiters); + this.loading = loading; + } + } + + static class Nested + { + Object state; + } + + static class Saving extends Nested + { + final Cancellable saving; + final Object identity; + + Saving(Cancellable saving, Object identity, Object state) + { + this.saving = saving; + this.identity = identity; + this.state = state; + } + } + + static class FailedToSave extends Nested + { + final Throwable cause; + + FailedToSave(Throwable cause, Object state) + { + this.cause = cause; + this.state = state; + } + + public Throwable failure() + { + return cause; + } + } + + public static AccordCacheEntry createReadyToLoad(K key, AccordCache.Type.Instance owner) + { + AccordCacheEntry node = new AccordCacheEntry<>(key, owner); + node.readyToLoad(); + return node; + } +} diff --git a/src/java/org/apache/cassandra/service/accord/AccordCommandStore.java b/src/java/org/apache/cassandra/service/accord/AccordCommandStore.java new file mode 100644 index 000000000000..ab74a128a70d --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/AccordCommandStore.java @@ -0,0 +1,597 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import java.util.List; +import java.util.NavigableMap; +import java.util.Objects; +import java.util.concurrent.Callable; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.Executor; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicLong; +import java.util.concurrent.atomic.AtomicReferenceFieldUpdater; +import java.util.concurrent.locks.Lock; +import java.util.function.Consumer; +import java.util.function.Function; +import java.util.function.IntFunction; +import javax.annotation.Nullable; + +import com.google.common.annotations.VisibleForTesting; + +import accord.api.Agent; +import accord.api.DataStore; +import accord.api.Journal; +import accord.api.LocalListeners; +import accord.api.ProgressLog; +import accord.api.RoutingKey; +import accord.impl.AbstractLoader; +import accord.impl.AbstractSafeCommandStore.CommandStoreCaches; +import accord.local.Command; +import accord.local.CommandStore; +import accord.local.CommandStores; +import accord.local.Commands; +import accord.local.NodeCommandStoreService; +import accord.local.PreLoadContext; +import accord.local.RedundantBefore; +import accord.local.SafeCommandStore; +import accord.local.cfk.CommandsForKey; +import accord.primitives.PartialTxn; +import accord.primitives.RangeDeps; +import accord.primitives.Ranges; +import accord.primitives.RoutableKey; +import accord.primitives.Status; +import accord.primitives.Timestamp; +import accord.primitives.TxnId; +import accord.utils.Invariants; +import accord.utils.async.AsyncChain; +import accord.utils.async.AsyncChains; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.service.accord.AccordKeyspace.CommandsForKeyAccessor; +import org.apache.cassandra.service.accord.IAccordService.AccordCompactionInfo; +import org.apache.cassandra.service.accord.api.TokenKey; +import org.apache.cassandra.service.accord.txn.TxnRead; +import org.apache.cassandra.utils.Clock; +import org.apache.cassandra.utils.concurrent.UncheckedInterruptedException; + +import static accord.api.Journal.CommandUpdate; +import static accord.api.Journal.FieldUpdates; +import static accord.api.Journal.Load.MINIMAL; +import static accord.api.Journal.Loader; +import static accord.utils.Invariants.require; + +public class AccordCommandStore extends CommandStore +{ + // TODO (required): track this via a PhantomReference, so that if we remove a CommandStore without clearing the caches we can be sure to release them + public static class Caches + { + private final AccordCache global; + private final AccordCache.Type.Instance commands; + private final AccordCache.Type.Instance commandsForKeys; + + Caches(AccordCache global, AccordCache.Type.Instance commandCache, AccordCache.Type.Instance commandsForKeyCache) + { + this.global = global; + this.commands = commandCache; + this.commandsForKeys = commandsForKeyCache; + } + + public final AccordCache global() + { + return global; + } + + public final AccordCache.Type.Instance commands() + { + return commands; + } + + public final AccordCache.Type.Instance commandsForKeys() + { + return commandsForKeys; + } + } + + public static final class ExclusiveCaches extends Caches implements CommandStoreCaches + { + private final Lock lock; + + public ExclusiveCaches(Lock lock, AccordCache global, AccordCache.Type.Instance commands, AccordCache.Type.Instance commandsForKeys) + { + super(global, commands, commandsForKeys); + this.lock = lock; + } + + + @Override + public AccordSafeCommand acquireIfLoaded(TxnId txnId) + { + return commands().acquireIfLoaded(txnId); + } + + @Override + public AccordSafeCommandsForKey acquireIfLoaded(RoutingKey key) + { + return commandsForKeys().acquireIfLoaded(key); + } + + @Override + public void close() + { + lock.unlock(); + } + } + + static final AtomicReferenceFieldUpdater safeRedundantBeforeUpdater + = AtomicReferenceFieldUpdater.newUpdater(AccordCommandStore.class, SafeRedundantBefore.class, "safeRedundantBefore"); + static final AtomicLong nextSafeRedundantBeforeTicket = new AtomicLong(); + + public final String loggingId; + private final Journal journal; + private final RangeSearcher rangeSearcher; + private final AccordExecutor executor; + private final Executor taskExecutor; + private final ExclusiveCaches caches; + private long lastSystemTimestampMicros = Long.MIN_VALUE; + private final CommandsForRanges.Manager commandsForRanges; + private final TableId tableId; + volatile SafeRedundantBefore safeRedundantBefore; + + private AccordSafeCommandStore current; + private Thread currentThread; + + private final CommandStoreLoader loader; + + public AccordCommandStore(int id, + NodeCommandStoreService node, + Agent agent, + DataStore dataStore, + ProgressLog.Factory progressLogFactory, + LocalListeners.Factory listenerFactory, + EpochUpdateHolder epochUpdateHolder, + Journal journal, + AccordExecutor executor) + { + super(id, node, agent, dataStore, progressLogFactory, listenerFactory, epochUpdateHolder); + this.loggingId = String.format("[%s]", id); + this.journal = journal; + this.rangeSearcher = RangeSearcher.extractRangeSearcher(journal); + this.executor = executor; + + final AccordCache.Type.Instance commands; + final AccordCache.Type.Instance commandsForKey; + try (AccordExecutor.ExclusiveGlobalCaches exclusive = executor.lockCaches()) + { + commands = exclusive.commands.newInstance(this); + commandsForKey = exclusive.commandsForKey.newInstance(this); + this.caches = new ExclusiveCaches(executor.lock, exclusive.global, commands, commandsForKey); + } + + this.taskExecutor = executor.executor(this); + this.commandsForRanges = new CommandsForRanges.Manager(this); + this.loader = new CommandStoreLoader(this); + + maybeLoadRedundantBefore(journal.loadRedundantBefore(id())); + maybeLoadBootstrapBeganAt(journal.loadBootstrapBeganAt(id())); + maybeLoadSafeToRead(journal.loadSafeToRead(id())); + maybeLoadRangesForEpoch(journal.loadRangesForEpoch(id())); + + CommandStores.RangesForEpoch ranges = this.rangesForEpoch; + if (ranges == null || ranges.all().isEmpty()) + { + EpochUpdate update = epochUpdateHolder.get(); + if (update != null) + ranges = update.newRangesForEpoch; + Invariants.require(ranges != null, "CommandStore %d created with no ranges", id); + } + tableId = (TableId)ranges.all().stream().map(r -> r.start().prefix()).reduce((a, b) -> { + Invariants.require(a.equals(b), "CommandStore created with multiple distinct TableId (%s and %s)", a, b); + return a; + }).orElseThrow(() -> Invariants.illegalState("CommandStore %d created with no ranges", id)); + } + + static Factory factory(IntFunction executorFactory) + { + return (id, node, agent, dataStore, progressLogFactory, listenerFactory, rangesForEpoch, journal) -> + new AccordCommandStore(id, node, agent, dataStore, progressLogFactory, listenerFactory, rangesForEpoch, journal, executorFactory.apply(id)); + } + + public CommandsForRanges.Manager diskCommandsForRanges() + { + return commandsForRanges; + } + + @Override + public void markShardDurable(SafeCommandStore safeStore, TxnId globalSyncId, Ranges ranges, Status.Durability durability) + { + if (durability.compareTo(Status.Durability.UniversalOrInvalidated) >= 0) + store.snapshot(ranges, globalSyncId); + super.markShardDurable(safeStore, globalSyncId, ranges, durability); + if (durability.compareTo(Status.Durability.UniversalOrInvalidated) >= 0) + commandsForRanges.gcBefore(globalSyncId, ranges); + } + + @Override + public boolean inStore() + { + return currentThread == Thread.currentThread(); + } + + void tryPreSetup(AccordTask task) + { + if (inStore() && current != null) + task.presetup(current.task); + } + + public final TableId tableId() + { + return tableId; + } + + public AccordExecutor executor() + { + return executor; + } + + // TODO (desired): we use this for executing callbacks with mutual exclusivity, + // but we don't need to block the actual CommandStore - could quite easily + // inflate a separate queue dynamically in AccordExecutor + public Executor taskExecutor() + { + return taskExecutor; + } + + public ExclusiveCaches lockCaches() + { + //noinspection LockAcquiredButNotSafelyReleased + caches.lock.lock(); + return caches; + } + + public ExclusiveCaches tryLockCaches() + { + if (caches.lock.tryLock()) + return caches; + return null; + } + + public Caches cachesExclusive() + { + Invariants.require(executor.isOwningThread()); + return caches; + } + + public Caches cachesUnsafe() + { + return caches; + } + + public void persistFieldUpdates(FieldUpdates fieldUpdates, Runnable onFlush) + { + journal.saveStoreState(id, fieldUpdates, onFlush); + } + + @Nullable + @VisibleForTesting + public void appendToLog(Command before, Command after, Runnable onFlush) + { + journal.saveCommand(id, new CommandUpdate(before, after), onFlush); + } + + boolean validateCommand(TxnId txnId, Command evicting) + { + if (!Invariants.isParanoid()) + return true; + + Command reloaded = loadCommand(txnId); + return Objects.equals(evicting, reloaded); + } + + @VisibleForTesting + public void sanityCheckCommand(RedundantBefore redundantBefore, Command command) + { + ((AccordJournal) journal).sanityCheck(id, redundantBefore, command); + } + + CommandsForKey loadCommandsForKey(RoutableKey key) + { + return CommandsForKeyAccessor.load(id, (TokenKey) key); + } + + boolean validateCommandsForKey(RoutableKey key, CommandsForKey evicting) + { + if (!Invariants.isParanoid()) + return true; + + CommandsForKey reloaded = CommandsForKeyAccessor.load(id, (TokenKey) key); + return Objects.equals(evicting, reloaded); + } + + @Nullable + Runnable saveCommandsForKey(RoutingKey key, CommandsForKey after, Object serialized) + { + return CommandsForKeyAccessor.systemTableUpdater(id, (TokenKey) key, after, serialized, nextSystemTimestampMicros()); + } + + public long nextSystemTimestampMicros() + { + lastSystemTimestampMicros = Math.max(TimeUnit.MILLISECONDS.toMicros(Clock.Global.currentTimeMillis()), lastSystemTimestampMicros + 1); + return lastSystemTimestampMicros; + } + @Override + public AsyncChain build(PreLoadContext loadCtx, Function function) + { + return AccordTask.create(this, loadCtx, function).chain(); + } + + @Override + public AsyncChain build(Callable task) + { + return AsyncChains.ofCallable(taskExecutor(), task); + } + + public DataStore dataStore() + { + return store; + } + + NodeCommandStoreService node() + { + return node; + } + + ProgressLog progressLog() + { + return progressLog; + } + + @Override + public AsyncChain build(PreLoadContext preLoadContext, Consumer consumer) + { + return AccordTask.create(this, preLoadContext, consumer).chain(); + } + + public void executeBlocking(Runnable runnable) + { + try + { + executor.submit(runnable).get(); + } + catch (InterruptedException e) + { + throw new UncheckedInterruptedException(e); + } + catch (ExecutionException e) + { + throw new RuntimeException(e); + } + } + + public AccordSafeCommandStore begin(AccordTask operation, + @Nullable CommandsForRanges commandsForRanges) + { + require(current == null); + current = AccordSafeCommandStore.create(operation, commandsForRanges, this); + return current; + } + + void setOwner(Thread thread, Thread self) + { + Invariants.require(thread == null ? currentThread == self : currentThread == null); + currentThread = thread; + if (thread != null) CommandStore.register(this); + } + + public boolean hasSafeStore() + { + return current != null; + } + + public void complete(AccordSafeCommandStore store) + { + require(current == store); + current.postExecute(); + current = null; + } + + public void abort(AccordSafeCommandStore store) + { + checkInStore(); + Invariants.require(store == current); + current = null; + } + + @Override + public void shutdown() + { + } + + public void registerTransitive(SafeCommandStore safeStore, RangeDeps rangeDeps) + { + if (rangeDeps.isEmpty()) + return; + + RedundantBefore redundantBefore = unsafeGetRedundantBefore(); + CommandStores.RangesForEpoch ranges = safeStore.ranges(); + // used in places such as accord.local.CommandStore.fetchMajorityDeps + // We find a set of dependencies for a range then update CommandsFor to know about them + Ranges allRanges = safeStore.ranges().all(); + Ranges coordinateRanges = Ranges.EMPTY; + long coordinateEpoch = -1; + try (ExclusiveCaches caches = lockCaches()) + { + for (int i = 0; i < rangeDeps.txnIdCount(); i++) + { + TxnId txnId = rangeDeps.txnId(i); + AccordCacheEntry state = caches.commands().getUnsafe(txnId); + if (state != null && state.isLoaded() && state.getExclusive() != null && state.getExclusive().known().isDefinitionKnown()) + continue; + + Ranges addRanges = rangeDeps.ranges(i).slice(allRanges); + if (addRanges.isEmpty()) continue; + + if (coordinateEpoch != txnId.epoch()) + { + coordinateEpoch = txnId.epoch(); + coordinateRanges = ranges.allAt(txnId.epoch()); + } + if (addRanges.intersects(coordinateRanges)) continue; + addRanges = redundantBefore.removeGcBefore(txnId, txnId, addRanges); + if (addRanges.isEmpty()) continue; + diskCommandsForRanges().mergeTransitive(txnId, addRanges, Ranges::with); + } + } + } + + public void appendCommands(List diffs, Runnable onFlush) + { + for (int i = 0; i < diffs.size(); i++) + { + boolean isLast = i == diffs.size() - 1; + CommandUpdate change = diffs.get(i); + journal.saveCommand(id, change, isLast ? onFlush : null); + } + } + + @VisibleForTesting + public Command loadCommand(TxnId txnId) + { + return journal.loadCommand(id, txnId, unsafeGetRedundantBefore(), durableBefore()); + } + + public static Command prepareToCache(Command command) + { + // TODO (required): validate we don't have duplicate objects + if (command != null) + { + PartialTxn txn = command.partialTxn(); + if (txn != null) + { + TxnRead read = (TxnRead) txn.read(); + read.unmemoize(); + } + } + return command; + } + + public Command.Minimal loadMinimal(TxnId txnId) + { + return journal.loadMinimal(id, txnId, MINIMAL, unsafeGetRedundantBefore(), durableBefore()); + } + + public AccordCompactionInfo getCompactionInfo() + { + SafeRedundantBefore safeRedundantBefore = this.safeRedundantBefore; + RedundantBefore redundantBefore; + if (safeRedundantBefore == null) redundantBefore = RedundantBefore.EMPTY; + else redundantBefore = safeRedundantBefore.redundantBefore; + CommandStores.RangesForEpoch ranges = this.rangesForEpoch; + if (ranges == null) ranges = CommandStores.RangesForEpoch.EMPTY; + return new AccordCompactionInfo(id, redundantBefore, ranges, tableId); + } + + public RangeSearcher rangeSearcher() + { + return rangeSearcher; + } + + public Loader loader() + { + return loader; + } + + @VisibleForTesting + public void unsafeUpsertRedundantBefore(RedundantBefore addRedundantBefore) + { + super.unsafeUpsertRedundantBefore(addRedundantBefore); + } + + private static class CommandStoreLoader extends AbstractLoader + { + private final AccordCommandStore store; + + private CommandStoreLoader(AccordCommandStore store) + { + this.store = store; + } + + @Override + public AsyncChain load(TxnId txnId) + { + return store.submit(txnId, safeStore -> { + maybeApplyWrites(txnId, safeStore, (safeCommand, cmd) -> { + Commands.applyWrites(safeStore, txnId, cmd).begin(store.agent); + }); + return safeStore.unsafeGet(txnId).current(); + }); + } + } + + /** + * Replay/state reloading + */ + + void maybeLoadRedundantBefore(RedundantBefore redundantBefore) + { + if (redundantBefore != null) + { + loadRedundantBefore(redundantBefore); + Invariants.require(safeRedundantBefore == null); + safeRedundantBefore = new SafeRedundantBefore(0, redundantBefore); + } + } + + void maybeLoadBootstrapBeganAt(NavigableMap bootstrapBeganAt) + { + if (bootstrapBeganAt != null) + loadBootstrapBeganAt(bootstrapBeganAt); + } + + void maybeLoadSafeToRead(NavigableMap safeToRead) + { + if (safeToRead != null) + loadSafeToRead(safeToRead); + } + + void maybeLoadRangesForEpoch(CommandStores.RangesForEpoch rangesForEpoch) + { + if (rangesForEpoch != null) + loadRangesForEpoch(rangesForEpoch); + } + + // TODO (expected): handle journal failures, and consider how we handle partial failures. + // Very likely we will not be able to safely or cleanly handle partial failures of this logic, but decide and document. + // TODO (desired): consider merging with PersistentField? This version is cheaper to manage which may be preferable at the CommandStore level. + static class SafeRedundantBefore + { + final long ticket; + final RedundantBefore redundantBefore; + + SafeRedundantBefore(long ticket, RedundantBefore redundantBefore) + { + this.ticket = ticket; + this.redundantBefore = redundantBefore; + } + + static SafeRedundantBefore max(SafeRedundantBefore a, SafeRedundantBefore b) + { + return a.ticket >= b.ticket ? a : b; + } + } +} diff --git a/src/java/org/apache/cassandra/service/accord/AccordCommandStores.java b/src/java/org/apache/cassandra/service/accord/AccordCommandStores.java new file mode 100644 index 000000000000..4cec60f50c24 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/AccordCommandStores.java @@ -0,0 +1,268 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.service.accord; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; + +import accord.api.Agent; +import accord.api.DataStore; +import accord.api.Journal; +import accord.api.LocalListeners; +import accord.api.ProgressLog; +import accord.local.CommandStores; +import accord.local.Node; +import accord.local.NodeCommandStoreService; +import accord.local.ShardDistributor; +import accord.primitives.Range; +import accord.topology.Topology; +import accord.utils.RandomSource; +import org.apache.cassandra.cache.CacheSize; +import org.apache.cassandra.concurrent.Stage; +import org.apache.cassandra.config.AccordSpec.QueueShardModel; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.metrics.AccordCacheMetrics; +import org.apache.cassandra.metrics.CacheSizeMetrics; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.service.accord.AccordExecutor.AccordExecutorFactory; +import org.apache.cassandra.service.accord.api.TokenKey; +import org.apache.cassandra.utils.concurrent.UncheckedInterruptedException; + +import static org.apache.cassandra.config.AccordSpec.QueueShardModel.THREAD_PER_SHARD; +import static org.apache.cassandra.config.DatabaseDescriptor.getAccordQueueShardCount; +import static org.apache.cassandra.config.DatabaseDescriptor.getAccordQueueSubmissionModel; +import static org.apache.cassandra.service.accord.AccordExecutor.Mode.RUN_WITHOUT_LOCK; +import static org.apache.cassandra.service.accord.AccordExecutor.Mode.RUN_WITH_LOCK; +import static org.apache.cassandra.service.accord.AccordExecutor.constant; +import static org.apache.cassandra.service.accord.AccordExecutor.constantFactory; + +public class AccordCommandStores extends CommandStores implements CacheSize +{ + public static final String ACCORD_STATE_CACHE = "AccordStateCache"; + + private final CacheSizeMetrics cacheSizeMetrics; + private final AccordExecutor[] executors; + + private long cacheSize, workingSetSize; + private int maxQueuedLoads, maxQueuedRangeLoads; + private boolean shrinkingOn; + + AccordCommandStores(NodeCommandStoreService node, Agent agent, DataStore store, RandomSource random, + ShardDistributor shardDistributor, ProgressLog.Factory progressLogFactory, LocalListeners.Factory listenerFactory, + Journal journal, AccordExecutor[] executors) + { + super(node, agent, store, random, journal, shardDistributor, progressLogFactory, listenerFactory, + AccordCommandStore.factory(id -> executors[id % executors.length])); + this.executors = executors; + this.cacheSizeMetrics = new CacheSizeMetrics(ACCORD_STATE_CACHE, this); + cacheSize = DatabaseDescriptor.getAccordCacheSizeInMiB() << 20; + workingSetSize = DatabaseDescriptor.getAccordWorkingSetSizeInMiB() << 20; + maxQueuedLoads = DatabaseDescriptor.getAccordMaxQueuedLoadCount(); + maxQueuedRangeLoads = DatabaseDescriptor.getAccordMaxQueuedRangeLoadCount(); + shrinkingOn = DatabaseDescriptor.getAccordCacheShrinkingOn(); + refreshCapacities(); + } + + static Factory factory() + { + return (NodeCommandStoreService time, Agent agent, DataStore store, RandomSource random, Journal journal, ShardDistributor shardDistributor, ProgressLog.Factory progressLogFactory, LocalListeners.Factory listenersFactory) -> { + AccordExecutor[] executors = new AccordExecutor[getAccordQueueShardCount()]; + AccordExecutorFactory factory; + int maxThreads = Integer.MAX_VALUE; + switch (getAccordQueueSubmissionModel()) + { + default: throw new AssertionError("Unhandled QueueSubmissionModel: " + getAccordQueueSubmissionModel()); + case SYNC: factory = AccordExecutorSyncSubmit::new; break; + case SEMI_SYNC: factory = AccordExecutorSemiSyncSubmit::new; break; + case ASYNC: factory = AccordExecutorAsyncSubmit::new; break; + case EXEC_ST: + factory = AccordExecutorSimple::new; + maxThreads = 1; + break; + } + + for (int id = 0; id < executors.length; id++) + { + AccordCacheMetrics metrics = new AccordCacheMetrics(ACCORD_STATE_CACHE); + QueueShardModel shardModel = DatabaseDescriptor.getAccordQueueShardModel(); + String baseName = AccordExecutor.class.getSimpleName() + '[' + id; + int threads = Math.min(maxThreads, Math.max(DatabaseDescriptor.getAccordConcurrentOps() / getAccordQueueShardCount(), 1)); + switch (shardModel) + { + case THREAD_PER_SHARD: + case THREAD_PER_SHARD_SYNC_QUEUE: + executors[id] = factory.get(id, shardModel == THREAD_PER_SHARD ? RUN_WITHOUT_LOCK : RUN_WITH_LOCK, 1, constant(baseName + ']'), metrics, constantFactory(Stage.READ.executor()), constantFactory(Stage.MUTATION.executor()), constantFactory(Stage.READ.executor()), agent); + break; + case THREAD_POOL_PER_SHARD: + executors[id] = factory.get(id, RUN_WITHOUT_LOCK, threads, num -> baseName + ',' + num + ']', metrics, AccordExecutor::submitIOToSelf, AccordExecutor::submitIOToSelf, AccordExecutor::submitIOToSelf, agent); + break; + case THREAD_POOL_PER_SHARD_EXCLUDES_IO: + executors[id] = factory.get(id, RUN_WITHOUT_LOCK, threads, num -> baseName + ',' + num + ']', metrics, constantFactory(Stage.READ.executor()), constantFactory(Stage.MUTATION.executor()), constantFactory(Stage.READ.executor()), agent); + break; + } + } + + return new AccordCommandStores(time, agent, store, random, shardDistributor, progressLogFactory, listenersFactory, journal, executors); + }; + } + + @Override + protected boolean shouldBootstrap(Node node, Topology previous, Topology updated, Range range) + { + if (!super.shouldBootstrap(node, previous, updated, range)) + return false; + // we see new ranges when a new keyspace is added, so avoid bootstrap in these cases + return contains(previous, ((TokenKey) range.start()).table()); + } + + private static boolean contains(Topology previous, TableId searchTable) + { + for (Range range : previous.ranges()) + { + TableId table = ((TokenKey) range.start()).table(); + if (table.equals(searchTable)) + return true; + } + return false; + } + + public synchronized void setCapacity(long bytes) + { + cacheSize = bytes; + refreshCapacities(); + } + + public synchronized void setWorkingSetSize(long bytes) + { + workingSetSize = bytes; + refreshCapacities(); + } + + public synchronized void setCapacityAndWorkingSetSize(long newCacheSize, long newWorkingSetSize) + { + cacheSize = newCacheSize; + workingSetSize = newWorkingSetSize; + refreshCapacities(); + } + + public synchronized void setMaxQueuedLoads(int total, int range) + { + maxQueuedLoads = total; + maxQueuedRangeLoads = range; + refreshCapacities(); + } + + @Override + public long capacity() + { + return cacheSize; + } + + @Override + public int size() + { + int size = 0; + for (AccordExecutor executor : executors) + size += executor.size(); + return size; + } + + @Override + public long weightedSize() + { + long size = 0; + for (AccordExecutor executor : executors) + size += executor.weightedSize(); + return size; + } + + synchronized void refreshCapacities() + { + long capacityPerExecutor = cacheSize / executors.length; + long workingSetPerExecutor = workingSetSize < 0 ? Long.MAX_VALUE : workingSetSize / executors.length; + int maxLoadsPerExecutor = (maxQueuedLoads + executors.length - 1) / executors.length; + int maxRangeLoadsPerExecutor = (maxQueuedRangeLoads + executors.length - 1) / executors.length; + for (AccordExecutor executor : executors) + { + executor.executeDirectlyWithLock(() -> { + executor.setCapacity(capacityPerExecutor); + executor.setWorkingSetSize(workingSetPerExecutor); + executor.setMaxQueuedLoads(maxLoadsPerExecutor, maxRangeLoadsPerExecutor); + executor.cacheExclusive().setShrinkingOn(shrinkingOn); + }); + } + } + + public List executors() + { + return Arrays.asList(executors.clone()); + } + + public void waitForQuiescense() + { + boolean hadPending; + try + { + do + { + hadPending = false; + List> futures = new ArrayList<>(); + for (AccordExecutor executor : this.executors) + { + hadPending |= executor.hasTasks(); + futures.add(executor.submit(() -> {})); + } + for (Future future : futures) + future.get(); + futures.clear(); + } + while (hadPending); + } + catch (ExecutionException e) + { + throw new IllegalStateException("Should have never been thrown", e); + } + catch (InterruptedException e) + { + throw new UncheckedInterruptedException(e); + } + } + + @Override + public synchronized void shutdown() + { + super.shutdown(); + for (AccordExecutor executor : executors) + { + executor.shutdown(); + try + { + executor.awaitTermination(1, TimeUnit.MINUTES); + } + catch (InterruptedException e) + { + throw new RuntimeException(e); + } + } + //TODO (expected): shutdown isn't useful by itself, we need a way to "wait" as well. Should be AutoCloseable or offer awaitTermination as well (think Shutdownable interface) + } +} diff --git a/src/java/org/apache/cassandra/service/accord/AccordConfigurationService.java b/src/java/org/apache/cassandra/service/accord/AccordConfigurationService.java new file mode 100644 index 000000000000..fade204b1f7f --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/AccordConfigurationService.java @@ -0,0 +1,644 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import java.util.HashSet; +import java.util.Map; +import java.util.Objects; +import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.TimeUnit; +import java.util.function.BiConsumer; +import java.util.stream.Collectors; +import javax.annotation.Nullable; +import javax.annotation.concurrent.GuardedBy; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.Sets; + +import accord.impl.AbstractConfigurationService; +import accord.local.Node; +import accord.primitives.Ranges; +import accord.topology.Shard; +import accord.topology.Topology; +import accord.utils.Invariants; +import accord.utils.SortedListSet; +import accord.utils.async.AsyncResult; +import accord.utils.async.AsyncResults; +import org.agrona.collections.LongArrayList; +import org.apache.cassandra.concurrent.ScheduledExecutorPlus; +import org.apache.cassandra.concurrent.ScheduledExecutors; +import org.apache.cassandra.concurrent.Shutdownable; +import org.apache.cassandra.concurrent.Stage; +import org.apache.cassandra.gms.FailureDetector; +import org.apache.cassandra.gms.IFailureDetector; +import org.apache.cassandra.locator.InetAddressAndPort; +import org.apache.cassandra.net.MessageDelivery; +import org.apache.cassandra.net.MessagingService; +import org.apache.cassandra.repair.SharedContext; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.tcm.ClusterMetadataService; +import org.apache.cassandra.tcm.Epoch; +import org.apache.cassandra.tcm.listeners.ChangeListener; +import org.apache.cassandra.tcm.membership.NodeState; +import org.apache.cassandra.utils.FBUtilities; +import org.apache.cassandra.utils.Simulate; +import org.apache.cassandra.utils.concurrent.AsyncPromise; +import org.apache.cassandra.utils.concurrent.Future; + +import static org.apache.cassandra.service.accord.AccordTopology.tcmIdToAccord; +import static org.apache.cassandra.utils.Simulate.With.MONITORS; + +// TODO (desired): listen to FailureDetector and rearrange fast path accordingly +@Simulate(with=MONITORS) +public class AccordConfigurationService extends AbstractConfigurationService implements AccordEndpointMapper, AccordSyncPropagator.Listener, Shutdownable +{ + private final AccordSyncPropagator syncPropagator; + public final WatermarkCollector watermarkCollector; + + private enum State { INITIALIZED, STARTED, SHUTDOWN } + + @GuardedBy("this") + private State state = State.INITIALIZED; + private volatile EndpointMapping mapping = EndpointMapping.EMPTY; + + public enum SyncStatus { NOT_STARTED, NOTIFYING, COMPLETED } + + static class EpochState extends AbstractConfigurationService.AbstractEpochState + { + private volatile SyncStatus syncStatus = SyncStatus.NOT_STARTED; + protected final AsyncResult.Settable localSyncNotified = AsyncResults.settable(); + + public EpochState(long epoch) + { + super(epoch); + } + + void setSyncStatus(SyncStatus status) + { + this.syncStatus = status; + if (status == SyncStatus.COMPLETED) + localSyncNotified.trySuccess(null); + } + + AsyncResult received() + { + return received; + } + + AsyncResult acknowledged() + { + return acknowledged; + } + + @Nullable AsyncResult reads() + { + return reads; + } + + AsyncResult.Settable localSyncNotified() + { + return localSyncNotified; + } + } + + static class EpochHistory extends AbstractConfigurationService.AbstractEpochHistory + { + @Override + protected EpochState createEpochState(long epoch) + { + return new EpochState(epoch); + } + } + + //TODO (required): should not be public + public final ChangeListener listener = new MetadataChangeListener(); + private class MetadataChangeListener implements ChangeListener + { + @Override + public void notifyPostCommit(ClusterMetadata prev, ClusterMetadata next, boolean fromSnapshot) + { + maybeReportMetadata(next); + } + } + + public AccordConfigurationService(Node.Id node, MessageDelivery messagingService, IFailureDetector failureDetector, ScheduledExecutorPlus scheduledTasks) + { + super(node); + this.syncPropagator = new AccordSyncPropagator(localId, this, messagingService, failureDetector, scheduledTasks, this); + this.watermarkCollector = new WatermarkCollector(); + listeners.add(watermarkCollector); + } + + public AccordConfigurationService(Node.Id node) + { + this(node, MessagingService.instance(), FailureDetector.instance, ScheduledExecutors.scheduledTasks); + } + + @Override + protected EpochHistory createEpochHistory() + { + return new EpochHistory(); + } + + /** + * On restart, loads topologies. On bootstrap, discovers existing topologies and initializes the node. + */ + public synchronized void start() + { + Invariants.require(state == State.INITIALIZED, "Expected state to be INITIALIZED but was %s", state); + state = State.STARTED; + + // for all nodes removed, or pending removal, mark them as removed, so we don't wait on their replies + Map removedNodes = mapping.removedNodes(); + for (Map.Entry e : removedNodes.entrySet()) + onNodeRemoved(e.getValue(), currentTopology(), e.getKey()); + } + + @Override + public synchronized boolean isTerminated() + { + return state == State.SHUTDOWN; + } + + @Override + public synchronized void shutdown() + { + if (isTerminated()) + return; + ClusterMetadataService.instance().log().removeListener(listener); + state = State.SHUTDOWN; + } + + @Override + public Object shutdownNow() + { + shutdown(); + return null; + } + + @Override + public boolean awaitTermination(long timeout, TimeUnit units) throws InterruptedException + { + return isTerminated(); + } + + @Override + public Node.Id mappedIdOrNull(InetAddressAndPort endpoint) + { + return mapping.mappedIdOrNull(endpoint); + } + + @Override + public InetAddressAndPort mappedEndpointOrNull(Node.Id id) + { + return mapping.mappedEndpointOrNull(id); + } + + @VisibleForTesting + synchronized void updateMapping(EndpointMapping mapping) + { + if (mapping.epoch() > this.mapping.epoch()) + this.mapping = mapping; + } + + public synchronized void updateMapping(ClusterMetadata metadata) + { + updateMapping(AccordTopology.directoryToMapping(metadata.epoch.getEpoch(), metadata.directory)); + } + + private void reportMetadata(ClusterMetadata metadata) + { + Stage.MISC.submit(() -> reportMetadataInternal(metadata)); + } + + void reportMetadataInternal(ClusterMetadata metadata) + { + updateMapping(metadata); + Topology topology = AccordTopology.createAccordTopology(metadata); + if (Invariants.isParanoid()) + { + for (Node.Id node : topology.nodes()) + { + if (mapping.mappedEndpointOrNull(node) == null) + throw new IllegalStateException(String.format("Epoch %d has node %s but mapping does not!", topology.epoch(), node)); + } + } + reportTopology(topology); + Set stillLiveNodes = metadata.directory.states.entrySet() + .stream() + .filter(e -> e.getValue() != NodeState.LEFT && e.getValue() != NodeState.LEAVING) + .map(e -> tcmIdToAccord(e.getKey())) + .collect(Collectors.toSet()); + if (epochs.lastAcknowledged() >= topology.epoch()) checkIfNodesRemoved(topology, stillLiveNodes); + else epochs.acknowledgeFuture(topology.epoch()).invokeIfSuccess(() -> checkIfNodesRemoved(topology, stillLiveNodes)); + } + + private void checkIfNodesRemoved(Topology topology, Set stillLiveNodes) + { + if (epochs.minEpoch() == topology.epoch()) return; + Topology previous = getTopologyForEpoch(topology.epoch() - 1); + // for all nodes removed, or pending removal, mark them as removed so we don't wait on their replies + Set removedNodes = Sets.difference(previous.nodes(), topology.nodes()); + removedNodes = Sets.filter(removedNodes, id -> !stillLiveNodes.contains(id)); + // TODO (desired, efficiency): there should be no need to notify every epoch for every removed node + for (Node.Id removedNode : removedNodes) + { + if (topology.epoch() >= epochs.minEpoch()) + onNodeRemoved(topology.epoch(), previous, removedNode); + } + } + + private static boolean shareShard(Topology current, Node.Id target, Node.Id self) + { + for (Shard shard : current.shards()) + { + if (!shard.contains(target)) continue; + if (shard.contains(self)) return true; + } + return false; + } + + public void onNodeRemoved(long epoch, Topology current, Node.Id removed) + { + syncPropagator.onNodesRemoved(removed); + // TODO (now): it seems to be incorrect to mark remote syncs complete if/when node got removed. + for (long oldEpoch : nonCompletedEpochsBefore(epoch)) + receiveRemoteSyncCompletePreListenerNotify(removed, oldEpoch); + + listeners.forEach(l -> l.onRemoveNode(epoch, removed)); + } + + private long[] nonCompletedEpochsBefore(long max) + { + LongArrayList notComplete = new LongArrayList(); + synchronized (epochs) + { + for (long epoch = epochs.minEpoch(), maxKnown = epochs.maxEpoch(); epoch <= max && epoch <= maxKnown; epoch++) + { + EpochSnapshot snapshot = getEpochSnapshot(epoch); + if (snapshot.syncStatus != SyncStatus.COMPLETED) + notComplete.add(epoch); + } + } + return notComplete.toLongArray(); + } + + @VisibleForTesting + void maybeReportMetadata(ClusterMetadata metadata) + { + // don't report metadata until the previous one has been acknowledged + long epoch = metadata.epoch.getEpoch(); + synchronized (epochs) + { + // On first boot, we have 2 options: + // + // - we can start listening to TCM _before_ we replay topologies + // - we can start listening to TCM _after_ we replay topologies + // + // If we start listening to TCM _before_ we replay topologies from other nodes, + // we may end up in a situation where TCM reports metadata that would create an + // `epoch - 1` epoch state that is not associated with any topologies, and + // therefore should not be listened upon. + // + // If we start listening to TCM _after_ we replay topologies, we may end up in a + // situation where TCM reports metadata that is 1 (or more) epochs _ahead_ of the + // last known epoch. Previous implementations were using TCM peer catch up, which + // could have resulted in gaps. + // + // Current protocol solves both problems by _first_ replaying topologies form peers, + // then subscribing to TCM _and_, if there are still any gaps, filling them again. + // However, it still has a slight chance of creating an `epoch - 1` epoch state + // not associated with any topologies, which under "right" circumstances could + // have been waited upon with `epochReady`. This check precludes creation of this + // epoch: by the time this code can be called, remote topology replay is already + // done, so TCM listener will only report epochs that are _at least_ min epoch. + if (epochs.maxEpoch() == 0 || epochs.minEpoch() == metadata.epoch.getEpoch()) + { + getOrCreateEpochState(epoch); // touch epoch state so subsequent calls see it + reportMetadata(metadata); + return; + } + } + + getOrCreateEpochState(epoch - 1).acknowledged().invokeIfSuccess(() -> reportMetadata(metadata)); + } + + private final Map> pendingTopologies = new ConcurrentHashMap<>(); + + @Override + public void fetchTopologyForEpoch(long epoch) + { + long minEpoch = currentEpoch() + 1; + // Find and fetch all epochs in-between + for (long i = minEpoch; i <= epoch; ++i) + fetchTopologyInternal(i); + } + + private static final Object Success = new Object(); + + protected void fetchTopologyInternal(long epoch) + { + pendingTopologies.computeIfAbsent(epoch, (epoch_) -> { + AsyncPromise future = new AsyncPromise<>(); + fetchTopologyAsync(epoch_, + (success, throwable) -> { + Future removed = pendingTopologies.remove(epoch_); + Invariants.require(future == removed, "%s should be equal to %s", future, removed); + if (success != null) + future.setSuccess(null); + else + { + future.setFailure(Invariants.nonNull(throwable)); + fetchTopologyForEpoch(epoch_); + } + }); + return future; + }); + } + + private void fetchTopologyAsync(long epoch, BiConsumer onResult) + { + // It's not safe for this to block on CMS so for now pick a thread pool to handle it + Stage.ACCORD_MIGRATION.execute(() -> { + try + { + if (ClusterMetadata.current().epoch.getEpoch() < epoch) + ClusterMetadataService.instance().fetchLogFromCMS(Epoch.create(epoch)); + } + catch (Throwable t) + { + onResult.accept(null, t); + return; + } + + // In most cases, after fetching log from CMS, we will be caught up to the required epoch. + // This TCM will also notify Accord via reportMetadata, so we do not need to fetch topologies. + // If metadata has reported has skipped one or more epochs, and is _ahead_ of the requested epoch, + // we need to fetch topologies from peers to fill in the gap. + ClusterMetadata metadata = ClusterMetadata.current(); + if (metadata.epoch.getEpoch() == epoch) + { + onResult.accept(Success, null); + return; + } + + Set peers = new HashSet<>(metadata.directory.allJoinedEndpoints()); + peers.remove(FBUtilities.getBroadcastAddressAndPort()); + if (peers.isEmpty()) + { + onResult.accept(Success, null); + return; + } + + // Fetching only one epoch here since later epochs might have already been requested concurrently + FetchTopologies.fetch(SharedContext.Global.instance, peers, epoch, epoch) + .addCallback((topologyRange, t) -> { + if (t != null) + { + if (currentEpoch() >= epoch) + onResult.accept(Success, null); + else + onResult.accept(null, t); + } + else + { + topologyRange.forEach(this::reportTopology, epoch, 1); + onResult.accept(Success, null); + } + }); + }); + } + + @Override + public void reportTopology(Topology topology, boolean isLoad, boolean startSync) + { + long tcmEpoch = ClusterMetadata.current().epoch.getEpoch(); + Invariants.require(topology.epoch() <= tcmEpoch, + "Reported topology %s not known to TCM", topology.epoch(), tcmEpoch); + super.reportTopology(topology, isLoad, startSync); + } + + @Override + protected void localSyncComplete(Topology topology, boolean startSync) + { + long epoch = topology.epoch(); + EpochState epochState = getOrCreateEpochState(epoch); + if (!startSync || epochState.syncStatus != SyncStatus.NOT_STARTED) + return; + + synchronized (this) + { + if (epochState.syncStatus != SyncStatus.NOT_STARTED) + return; + epochState.setSyncStatus(SyncStatus.NOTIFYING); + } + + Set notify = SortedListSet.allOf(topology.nodes()); + notify.remove(localId); + syncPropagator.reportSyncComplete(epoch, notify, localId); + } + + @Override + public synchronized void onEndpointAck(Node.Id id, long epoch) + { + } + + @Override + public void onComplete(long epoch) + { + if (epochs.wasTruncated(epoch)) + return; + + EpochState epochState = getOrCreateEpochState(epoch); + synchronized (this) + { + epochState.setSyncStatus(SyncStatus.COMPLETED); + } + } + + @Override + protected synchronized void receiveRemoteSyncCompletePreListenerNotify(Node.Id node, long epoch) + { + } + + @Override + public void reportEpochClosed(Ranges ranges, long epoch) + { + checkStarted(); + EpochHistory epochs = this.epochs; + if (epoch < minEpoch() || epochs.wasTruncated(epoch)) + return; + + Topology topology = getTopologyForEpoch(epoch); + syncPropagator.reportClosed(epoch, topology.nodes(), ranges); + } + + @VisibleForTesting + public AccordSyncPropagator syncPropagator() + { + return syncPropagator; + } + + @Override + public void reportEpochRetired(Ranges ranges, long epoch) + { + checkStarted(); + // TODO (expected): ensure we aren't fetching a truncated epoch; otherwise this should be non-null + Topology topology = getTopologyForEpoch(epoch); + syncPropagator.reportRetired(epoch, topology.nodes(), ranges); + } + + @Override + public void receiveClosed(Ranges ranges, long epoch) + { + super.receiveClosed(ranges, epoch); + } + + @Override + public void receiveRetired(Ranges ranges, long epoch) + { + super.receiveRetired(ranges, epoch); + } + + @Override + public void reportEpochRemoved(long epoch) + { + epochs.truncateUntil(epoch); + } + + private synchronized void checkStarted() + { + State state = this.state; + Invariants.require(state == State.STARTED, "Expected state to be STARTED but was %s", state); + } + + @VisibleForTesting + public static class EpochSnapshot + { + public enum ResultStatus + { + PENDING, SUCCESS, FAILURE; + + static ResultStatus of(AsyncResult result) + { + if (result == null || !result.isDone()) + return PENDING; + + return result.isSuccess() ? SUCCESS : FAILURE; + } + } + + public final long epoch; + public final SyncStatus syncStatus; + public final ResultStatus received; + public final ResultStatus acknowledged; + public final ResultStatus reads; + + private EpochSnapshot(EpochState state) + { + this.epoch = state.epoch(); + this.syncStatus = state.syncStatus; + this.received = ResultStatus.of(state.received()); + this.acknowledged = ResultStatus.of(state.acknowledged()); + this.reads = ResultStatus.of(state.reads()); + } + + public EpochSnapshot(long epoch, SyncStatus syncStatus, ResultStatus received, ResultStatus acknowledged, ResultStatus reads) + { + this.epoch = epoch; + this.syncStatus = syncStatus; + this.received = received; + this.acknowledged = acknowledged; + this.reads = reads; + } + + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + EpochSnapshot that = (EpochSnapshot) o; + return epoch == that.epoch && syncStatus == that.syncStatus && received == that.received && acknowledged == that.acknowledged && reads == that.reads; + } + + public int hashCode() + { + return Objects.hash(epoch, syncStatus, received, acknowledged, reads); + } + + public String toString() + { + return "EpochSnapshot{" + + "epoch=" + epoch + + ", syncStatus=" + syncStatus + + ", received=" + received + + ", acknowledged=" + acknowledged + + ", reads=" + reads + + '}'; + } + + public static EpochSnapshot completed(long epoch) + { + return new EpochSnapshot(epoch, SyncStatus.COMPLETED, ResultStatus.SUCCESS, ResultStatus.SUCCESS, ResultStatus.SUCCESS); + } + } + + @VisibleForTesting + public EpochSnapshot getEpochSnapshot(long epoch) + { + EpochState state; + // If epoch truncate happens then getting the epoch again will recreate an empty one + synchronized (epochs) + { + if (epoch < epochs.minEpoch() || epoch > epochs.maxEpoch()) + return null; + + state = getOrCreateEpochState(epoch); + } + return new EpochSnapshot(state); + } + + @VisibleForTesting + public long minEpoch() + { + return epochs.minEpoch(); + } + + @VisibleForTesting + public long maxEpoch() + { + return epochs.maxEpoch(); + } + + /** + * The callback is resolved while holding the object lock, which can cause the future chain to resolve while also + * holding the lock! This behavior is exposed for tests and is unsafe due to the lock behind held while resolving + * the callback + */ + @VisibleForTesting + public Future unsafeLocalSyncNotified(long epoch) + { + AsyncPromise promise = new AsyncPromise<>(); + getOrCreateEpochState(epoch).localSyncNotified().invoke((result, failure) -> { + if (failure != null) promise.tryFailure(failure); + else promise.trySuccess(result); + }); + return promise; + } +} diff --git a/src/java/org/apache/cassandra/service/accord/AccordDataStore.java b/src/java/org/apache/cassandra/service/accord/AccordDataStore.java new file mode 100644 index 000000000000..2ab5c98641ab --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/AccordDataStore.java @@ -0,0 +1,183 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.concurrent.TimeUnit; + +import accord.api.DataStore; +import accord.local.Node; +import accord.local.SafeCommandStore; +import accord.primitives.Range; +import accord.primitives.Ranges; +import accord.primitives.SyncPoint; +import accord.primitives.TxnId; +import accord.utils.async.AsyncResult; +import accord.utils.async.AsyncResults; +import org.agrona.collections.Object2ObjectHashMap; +import org.apache.cassandra.concurrent.ScheduledExecutors; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.db.ColumnFamilyStore; +import org.apache.cassandra.db.DataRange; +import org.apache.cassandra.db.Keyspace; +import org.apache.cassandra.db.filter.ColumnFilter; +import org.apache.cassandra.db.lifecycle.View; +import org.apache.cassandra.db.memtable.Memtable; +import org.apache.cassandra.db.memtable.TrieMemtable; +import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator; +import org.apache.cassandra.dht.Token; +import org.apache.cassandra.io.sstable.SSTableReadsListener; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.utils.concurrent.Future; +import org.apache.cassandra.utils.concurrent.FutureCombiner; + +import static accord.utils.Invariants.require; +import static org.apache.cassandra.db.ColumnFamilyStore.FlushReason.ACCORD_TXN_GC; + +public class AccordDataStore implements DataStore +{ + @Override + public FetchResult fetch(Node node, SafeCommandStore safeStore, Ranges ranges, SyncPoint syncPoint, FetchRanges callback) + { + AccordFetchCoordinator coordinator = new AccordFetchCoordinator(node, ranges, syncPoint, callback, safeStore.commandStore()); + coordinator.start(); + return coordinator.result(); + } + + static class SnapshotBounds + { + final List> ranges = new ArrayList<>(); + long id; + } + + @Override + public AsyncResult snapshot(Ranges ranges, TxnId before) + { + AsyncResults.SettableResult result = new AsyncResults.SettableResult<>(); + // TODO (desired): maintain a list of Accord tables, perhaps in ClusterMetadata? + ClusterMetadata metadata = ClusterMetadata.current(); + Object2ObjectHashMap tables = new Object2ObjectHashMap<>(); + for (Range range : ranges) + { + tables.computeIfAbsent(((TokenRange)range).table(), ignore -> new SnapshotBounds()) + .ranges.add(((TokenRange) range).toKeyspaceRange()); + } + + for (Map.Entry e : tables.entrySet()) + { + // TODO (required): is it safe to ignore null table metadata / cfs? + TableMetadata tableMetadata = metadata.schema.getTableMetadata(e.getKey()); + if (tableMetadata == null || !tableMetadata.isAccordEnabled()) + continue; + + ColumnFamilyStore cfs = Keyspace.openAndGetStoreIfExists(tableMetadata); + if (cfs == null) + continue; + + // TODO (required): when we can safely map TxnId.hlc() -> local timestamp, consult Memtable timestamps + Memtable memtable = cfs.getCurrentMemtable(); + e.getValue().id = memtable.getMemtableId(); + } + + ScheduledExecutors.scheduledTasks.schedule(() -> { + List> futures = new ArrayList<>(); + for (Map.Entry e : tables.entrySet()) + { + TableMetadata tableMetadata = metadata.schema.getTableMetadata(e.getKey()); + SnapshotBounds bounds = e.getValue(); + ColumnFamilyStore cfs = Keyspace.openAndGetStoreIfExists(tableMetadata); + + // TODO (required): is it safe to ignore null cfs? + if (cfs == null) continue; + + View view = cfs.getTracker().getView(); + for (Memtable memtable : view.getAllMemtables()) + { + if (memtable.getMemtableId() > bounds.id) continue; + if (!intersects(cfs, memtable, bounds.ranges)) continue; + + futures.add(cfs.forceFlush(ACCORD_TXN_GC)); + break; + } + } + + FutureCombiner.allOf(futures).addCallback((objects, throwable) -> { + if (throwable != null) + result.setFailure(throwable); + else + result.setSuccess(null); + }); + }, DatabaseDescriptor.getAccordGCDelay(TimeUnit.MILLISECONDS), TimeUnit.MILLISECONDS); + + return result; + } + + private boolean intersects(ColumnFamilyStore cfs, Memtable memtable, List> tableRanges) + { + boolean intersects = false; + // TrieMemtable doesn't support reverse iteration so can't find the last token + if (memtable instanceof TrieMemtable) + intersects = true; + else + { + Token firstToken = null; + try (UnfilteredPartitionIterator iterator = memtable.partitionIterator(ColumnFilter.all(cfs.metadata()), DataRange.allData(cfs.getPartitioner()), SSTableReadsListener.NOOP_LISTENER)) + { + if (iterator.hasNext()) + firstToken = iterator.next().partitionKey().getToken(); + } + Token lastToken = memtable.lastToken(); + + if (firstToken != null) + { + require(lastToken != null); + if (firstToken.equals(lastToken)) + { + for (org.apache.cassandra.dht.Range tableRange : tableRanges) + { + if (tableRange.contains(firstToken)) + { + intersects = true; + break; + } + } + } + else + { + require(firstToken.compareTo(lastToken) < 0); + org.apache.cassandra.dht.Range memtableRange = new org.apache.cassandra.dht.Range<>(firstToken, lastToken); + for (org.apache.cassandra.dht.Range tableRange : tableRanges) + { + if (tableRange.intersects(memtableRange)) + { + intersects = true; + break; + } + } + } + } + } + + return intersects; + } +} diff --git a/src/java/org/apache/cassandra/service/accord/AccordEndpointMapper.java b/src/java/org/apache/cassandra/service/accord/AccordEndpointMapper.java new file mode 100644 index 000000000000..fd0e8cf73618 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/AccordEndpointMapper.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import accord.local.Node; +import accord.utils.Invariants; +import org.apache.cassandra.locator.InetAddressAndPort; + +/** + * Maps network addresses to accord ids + */ +public interface AccordEndpointMapper +{ + Node.Id mappedIdOrNull(InetAddressAndPort endpoint); + InetAddressAndPort mappedEndpointOrNull(Node.Id id); + + default Node.Id mappedId(InetAddressAndPort endpoint) + { + return Invariants.nonNull(mappedIdOrNull(endpoint), "Unable to map address %s to a Node.Id", endpoint); + } + + default InetAddressAndPort mappedEndpoint(Node.Id id) + { + return Invariants.nonNull(mappedEndpointOrNull(id), "Unable to map node id %s to a InetAddressAndPort", id); + } +} diff --git a/src/java/org/apache/cassandra/service/accord/AccordExecutor.java b/src/java/org/apache/cassandra/service/accord/AccordExecutor.java new file mode 100644 index 000000000000..9d5c97b46385 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/AccordExecutor.java @@ -0,0 +1,1111 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import java.util.concurrent.Callable; +import java.util.concurrent.Executor; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.locks.Lock; +import java.util.function.BiConsumer; +import java.util.function.BiFunction; +import java.util.function.Function; +import java.util.function.IntFunction; + +import accord.api.Agent; +import accord.api.RoutingKey; +import accord.local.AgentExecutor; +import accord.local.Command; +import accord.local.cfk.CommandsForKey; +import accord.primitives.TxnId; +import accord.utils.ArrayBuffers.BufferList; +import accord.utils.IntrusivePriorityHeap; +import accord.utils.Invariants; +import accord.utils.QuadConsumer; +import accord.utils.QuadFunction; +import accord.utils.QuintConsumer; +import accord.utils.TriConsumer; +import accord.utils.TriFunction; +import accord.utils.UnhandledEnum; +import accord.utils.async.AsyncChain; +import accord.utils.async.AsyncChains; +import accord.utils.async.Cancellable; +import org.agrona.collections.Object2ObjectHashMap; +import org.apache.cassandra.cache.CacheSize; +import org.apache.cassandra.concurrent.ExecutorPlus; +import org.apache.cassandra.concurrent.ScheduledExecutors; +import org.apache.cassandra.concurrent.Shutdownable; +import org.apache.cassandra.metrics.AccordCacheMetrics; +import org.apache.cassandra.utils.concurrent.AsyncPromise; +import org.apache.cassandra.utils.concurrent.Future; + +import static org.apache.cassandra.service.accord.AccordCache.CommandAdapter.COMMAND_ADAPTER; +import static org.apache.cassandra.service.accord.AccordCache.CommandsForKeyAdapter.CFK_ADAPTER; +import static org.apache.cassandra.service.accord.AccordCache.registerJfrListener; +import static org.apache.cassandra.service.accord.AccordCacheEntry.Status.EVICTED; +import static org.apache.cassandra.service.accord.AccordTask.State.LOADING; +import static org.apache.cassandra.service.accord.AccordTask.State.SCANNING_RANGES; +import static org.apache.cassandra.service.accord.AccordTask.State.WAITING_TO_LOAD; +import static org.apache.cassandra.service.accord.AccordTask.State.WAITING_TO_RUN; +import static org.apache.cassandra.utils.Clock.Global.nanoTime; + +public abstract class AccordExecutor implements CacheSize, AccordCacheEntry.OnLoaded, AccordCacheEntry.OnSaved, Shutdownable, AgentExecutor +{ + public interface AccordExecutorFactory + { + AccordExecutor get(int executorId, Mode mode, int threads, IntFunction name, AccordCacheMetrics metrics, ExecutorFunctionFactory loadExecutor, ExecutorFunctionFactory saveExecutor, ExecutorFunctionFactory rangeLoadExecutor, Agent agent); + } + + public enum Mode { RUN_WITH_LOCK, RUN_WITHOUT_LOCK } + + public interface ExecutorFunction extends BiFunction {} + public interface ExecutorFunctionFactory extends Function {} + + // WARNING: this is a shared object, so close is NOT idempotent + public static final class ExclusiveGlobalCaches extends GlobalCaches implements AutoCloseable + { + final Lock lock; + + public ExclusiveGlobalCaches(Lock lock, AccordCache global, AccordCache.Type commands, AccordCache.Type commandsForKey) + { + super(global, commands, commandsForKey); + this.lock = lock; + } + + @Override + public void close() + { + lock.unlock(); + } + } + + public static class GlobalCaches + { + public final AccordCache global; + public final AccordCache.Type commands; + public final AccordCache.Type commandsForKey; + + public GlobalCaches(AccordCache global, AccordCache.Type commands, AccordCache.Type commandsForKey) + { + this.global = global; + this.commands = commands; + this.commandsForKey = commandsForKey; + } + } + + final Lock lock; + final Agent agent; + final int executorId; + private final AccordCache cache; + private final ExecutorFunction loadExecutor; + private final ExecutorFunction rangeLoadExecutor; + + private final TaskQueue> scanningRanges = new TaskQueue<>(SCANNING_RANGES); // never queried, just parked here while scanning + private final TaskQueue> loading = new TaskQueue<>(LOADING); // never queried, just parked here while loading + + private final TaskQueue> waitingToLoadRangeTxns = new TaskQueue<>(WAITING_TO_LOAD); + + private final TaskQueue> waitingToLoad = new TaskQueue<>(WAITING_TO_LOAD); + private final TaskQueue waitingToRun = new TaskQueue<>(WAITING_TO_RUN); + private final Object2ObjectHashMap commandStoreQueues = new Object2ObjectHashMap<>(); + + private final AccordCacheEntry.OnLoaded onRangeLoaded = this::onRangeLoaded; + private final ExclusiveGlobalCaches caches; + + /** + * The maximum total number of loads we can queue at once - this includes loads for range transactions, + * which are subject to this limit as well as that imposed by {@link #maxQueuedRangeLoads} + */ + private int maxQueuedLoads = 64; + /** + * The maximum number of loads exclusively for range transactions we can queue at once; the {@link #maxQueuedLoads} limit also applies. + */ + private int maxQueuedRangeLoads = 8; + + private long maxWorkingSetSizeInBytes; + private long maxWorkingCapacityInBytes; + private int nextPosition; + private int activeLoads, activeRangeLoads; + private boolean hasPausedLoading; + int tasks; + int running; + + AccordExecutor(Lock lock, int executorId, AccordCacheMetrics metrics, ExecutorFunctionFactory loadExecutor, ExecutorFunctionFactory saveExecutor, ExecutorFunctionFactory rangeLoadExecutor, Agent agent) + { + this.lock = lock; + this.executorId = executorId; + this.cache = new AccordCache(alwaysNullTask(saveExecutor.apply(this)), this, 0, metrics); + this.loadExecutor = loadExecutor.apply(this); + this.rangeLoadExecutor = rangeLoadExecutor.apply(this); + this.agent = agent; + + final AccordCache.Type commands; + final AccordCache.Type commandsForKey; + commands = cache.newType(TxnId.class, COMMAND_ADAPTER); + registerJfrListener(executorId, commands, "Command"); + + commandsForKey = cache.newType(RoutingKey.class, CFK_ADAPTER); + registerJfrListener(executorId, commandsForKey, "CommandsForKey"); + + this.caches = new ExclusiveGlobalCaches(lock, cache, commands, commandsForKey); + ScheduledExecutors.scheduledFastTasks.scheduleAtFixedRate(() -> { + executeDirectlyWithLock(cache::processNoEvictQueue); + }, 1L, 1L, TimeUnit.SECONDS); + } + + public int executorId() + { + return executorId; + } + + public ExclusiveGlobalCaches lockCaches() + { + //noinspection LockAcquiredButNotSafelyReleased + lock.lock(); + return caches; + } + + public AccordCache cacheExclusive() + { + Invariants.require(isOwningThread()); + return cache; + } + + public AccordCache cacheUnsafe() + { + return cache; + } + + boolean hasWaitingToRun() + { + updateWaitingToRunExclusive(); + return !waitingToRun.isEmpty(); + } + + Task pollWaitingToRunExclusive() + { + updateWaitingToRunExclusive(); + return waitingToRun.poll(); + } + + void updateWaitingToRunExclusive() + { + maybeUnpauseLoading(); + } + + void maybeUnpauseLoading() + { + if (!hasPausedLoading) + return; + + if (cache.weightedSize() < maxWorkingCapacityInBytes || (loading.isEmpty() && waitingToRun.isEmpty())) + { + hasPausedLoading = false; + enqueueLoadsExclusive(); + } + } + + public abstract boolean hasTasks(); + abstract boolean isOwningThread(); + + private void enqueueLoadsExclusive() + { + outer: while (true) + { + TaskQueue> queue = waitingToLoadRangeTxns.isEmpty() || activeRangeLoads >= maxQueuedRangeLoads ? waitingToLoad : waitingToLoadRangeTxns; + AccordTask next = queue.peek(); + if (next == null) + return; + + if (hasPausedLoading || cache.weightedSize() >= maxWorkingCapacityInBytes) + { + // we have too much in memory already, and we have work waiting to run, so let that complete before queueing more + if (!loading.isEmpty() || !waitingToRun.isEmpty()) + { + hasPausedLoading = true; + return; + } + } + + switch (next.state()) + { + default: + { + failExclusive(next, new AssertionError("Unexpected state: " + next.toDescription())); + break; + } + case WAITING_TO_SCAN_RANGES: + if (activeRangeLoads >= maxQueuedRangeLoads) + { + parkRangeLoad(next); + } + else + { + ++activeRangeLoads; + ++activeLoads; + next.rangeScanner().start(rangeLoadExecutor); + updateQueue(next); + } + break; + + case WAITING_TO_LOAD: + while (true) + { + AccordCacheEntry load = next.peekWaitingToLoad(); + boolean isForRange = isForRange(next, load); + if (isForRange && activeRangeLoads >= maxQueuedRangeLoads) + { + parkRangeLoad(next); + continue outer; + } + + Invariants.require(load != null); + AccordCacheEntry.OnLoaded onLoaded = this; + ++activeLoads; + if (isForRange) + { + ++activeRangeLoads; + onLoaded = onRangeLoaded; + } + + for (AccordTask task : cache.load(loadExecutor, next, load, onLoaded)) + { + if (task == next) continue; + if (task.onLoading(load)) + updateQueue(task); + } + Object prev = next.pollWaitingToLoad(); + Invariants.require(prev == load); + if (next.peekWaitingToLoad() == null) + break; + + Invariants.require(next.state() == WAITING_TO_LOAD, "Invalid state: %s", next); + if (activeLoads >= maxQueuedLoads) + return; + } + Invariants.require(next.state().compareTo(LOADING) >= 0, "Invalid state: %s", next); + updateQueue(next); + } + } + } + + private boolean isForRange(AccordTask task, AccordCacheEntry load) + { + boolean isForRangeTxn = task.hasRanges(); + if (!isForRangeTxn) + return false; + + for (AccordTask t : load.loadingOrWaiting().waiters()) + { + if (!t.hasRanges()) + return false; + } + return true; + } + + @Override + public Agent agent() + { + return agent; + } + + @Override + public AsyncChain build(Callable task) + { + return AsyncChains.ofCallable(this, task); + } + + private void parkRangeLoad(AccordTask task) + { + if (task.queued() != waitingToLoadRangeTxns) + { + task.unqueueIfQueued(); + task.addToQueue(waitingToLoadRangeTxns); + } + } + + void consumeExclusive(Object object) + { + try + { + if (object instanceof AccordTask) + loadExclusive((AccordTask) object); + else + ((SubmitAsync) object).acceptExclusive(this); + } + catch (Throwable t) + { + agent.onUncaughtException(t); + } + } + + private void updateQueue(AccordTask task) + { + task.unqueueIfQueued(); + switch (task.state()) + { + default: throw new AssertionError("Unexpected state: " + task.toDescription()); + case WAITING_TO_SCAN_RANGES: + case WAITING_TO_LOAD: + task.addToQueue(waitingToLoad); + break; + case SCANNING_RANGES: + task.addToQueue(scanningRanges); + break; + case LOADING: + task.addToQueue(loading); + break; + case WAITING_TO_RUN: + task.runQueuedAt = nanoTime(); + commandStoreQueues.computeIfAbsent(task.commandStore, CommandStoreQueue::new) + .appendOrSetNext(task); + break; + } + } + + private void waitingToRun(Task task) + { + if (task.commandStore == null) + { + waitingToRun.append(task); + } + else + { + commandStoreQueues.computeIfAbsent(task.commandStore, CommandStoreQueue::new) + .appendOrSetNext(task); + } + } + + private Cancellable submitIOExclusive(Task parent, Runnable run) + { + Invariants.require(isOwningThread()); + ++tasks; + PlainRunnable task = new PlainRunnable(null, run, null); + // TODO (expected): adopt queue position of the submitting task + if (parent == null) assignNewQueuePosition(task); + else assignQueueSubPosition(parent, task); + waitingToRun.append(task); + return task; + } + + private void assignNewQueuePosition(Task task) + { + task.queuePosition = (((long)++nextPosition) & 0xffffffffL) << 31; + } + + private void assignQueueSubPosition(Task parent, Task task) + { + task.queuePosition = parent.queuePosition | (++nextPosition & 0x7fffffff); + } + + public Executor executor(AccordCommandStore commandStore) + { + return task -> AccordExecutor.this.submit(task, commandStore); + } + + public void submit(AccordTask operation) + { + submit(AccordExecutor::loadExclusive, Function.identity(), operation); + } + + public void cancel(AccordTask task) + { + Invariants.require(task.commandStore.executor() == this, + "%s is a wrong command store for %s, should be %s", + this, task, task); + submit(AccordExecutor::cancelExclusive, CancelAsync::new, task); + } + + public void onScannedRanges(AccordTask task, Throwable fail) + { + submit(AccordExecutor::onScannedRangesExclusive, OnScannedRanges::new, task, fail); + } + + public void onSaved(AccordCacheEntry saved, Object identity, Throwable fail) + { + submit(AccordExecutor::onSavedExclusive, OnSaved::new, saved, identity, fail); + } + + @Override + public void onLoaded(AccordCacheEntry loaded, V value, Throwable fail) + { + submit(AccordExecutor::onLoadedExclusive, OnLoaded::new, loaded, value, fail, false); + } + + public void onRangeLoaded(AccordCacheEntry loaded, V value, Throwable fail) + { + submit(AccordExecutor::onLoadedExclusive, OnLoaded::new, loaded, value, fail, true); + } + + private void submit(BiConsumer sync, Function async, P1 p1) + { + submit((e, c, p1a, p2a, p3) -> c.accept(e, p1a), (f, p1a, p2a, p3) -> f.apply(p1a), sync, async, p1, null, null); + } + + private void submit(TriConsumer sync, BiFunction async, P1 p1, P2 p2) + { + submit((e, c, p1a, p2a, p3) -> c.accept(e, p1a, p2a), (f, p1a, p2a, p3) -> f.apply(p1a, p2a), sync, async, p1, p2, null); + } + + private void submit(QuadConsumer sync, TriFunction async, P1 p1, P2 p2, P3 p3) + { + submit((e, c, p1a, p2a, p3a) -> c.accept(e, p1a, p2a, p3a), TriFunction::apply, sync, async, p1, p2, p3); + } + + private void submit(QuintConsumer sync, QuadFunction async, P1 p1, P2 p2, P3 p3, P4 p4) + { + submit(sync, async, p1, p1, p2, p3, p4); + } + + abstract void submit(QuintConsumer sync, QuadFunction async, P1s p1s, P1a p1a, P2 p2, P3 p3, P4 p4); + + private void submitExclusive(AsyncPromise result, Runnable run, AccordCommandStore commandStore) + { + ++tasks; + PlainRunnable task = new PlainRunnable(result, run, commandStore); + task.queuePosition = ++nextPosition; + waitingToRun(task); + } + + private void submitExclusive(AsyncPromise result, PlainRunnable task) + { + ++tasks; + task.queuePosition = ++nextPosition; + waitingToRun(task); + } + + private void loadExclusive(AccordTask task) + { + ++tasks; + assignNewQueuePosition(task); + task.setupExclusive(); + updateQueue(task); + enqueueLoadsExclusive(); + } + + private void cancelExclusive(AccordTask task) + { + switch (task.state()) + { + default: throw new UnhandledEnum(task.state()); + case INITIALIZED: + // we could be cancelled before we even reach the queue + task.cancelExclusive(); + break; + + case SCANNING_RANGES: + case LOADING: + case WAITING_TO_LOAD: + case WAITING_TO_SCAN_RANGES: + case WAITING_TO_RUN: + --tasks; + task.unqueueIfQueued(); + task.cancelExclusive(); + break; + + case FAILING: + case RUNNING: + case PERSISTING: + case FINISHED: + case CANCELLED: + case FAILED: + // cannot safely cancel + } + } + + private void onScannedRangesExclusive(AccordTask task, Throwable fail) + { + --activeLoads; + --activeRangeLoads; + // the task may have already been cancelled, in which case we don't need to fail it + if (!task.state().isExecuted()) + { + if (fail != null) + { + failExclusive(task, fail); + } + else + { + task.rangeScanner().scannedExclusive(); + updateQueue(task); + } + } + enqueueLoadsExclusive(); + } + + private void failExclusive(AccordTask task, Throwable fail) + { + if (task.state().isExecuted()) + return; + + --tasks; + try { task.failExclusive(fail); } + catch (Throwable t) { agent.onUncaughtException(t); } + finally + { + task.unqueueIfQueued(); + task.cleanupExclusive(); + } + } + + private void onSavedExclusive(AccordCacheEntry state, Object identity, Throwable fail) + { + cache.saved(state, identity, fail); + } + + private void onLoadedExclusive(AccordCacheEntry loaded, V value, Throwable fail, boolean isForRange) + { + --activeLoads; + if (isForRange) + --activeRangeLoads; + + if (loaded.status() != EVICTED) + { + try (BufferList> tasks = loaded.loading().copyWaiters()) + { + if (fail != null) + { + for (AccordTask task : tasks) + failExclusive(task, fail); + cache.failedToLoad(loaded); + } + else + { + cache.loaded(loaded, value); + for (AccordTask task : tasks) + { + if (task.onLoad(loaded)) + { + Invariants.require(task.queued() == loading); + task.unqueue(); + waitingToRun(task); + } + } + } + } + } + + enqueueLoadsExclusive(); + } + + public Future submit(Runnable run) + { + return submit(run, null); + } + + // TODO (expected): offer queue jumping/priorities + public Future submit(Runnable run, AccordCommandStore commandStore) + { + PlainRunnable task = new PlainRunnable(new AsyncPromise<>(), run, commandStore); + AsyncPromise result = new AsyncPromise<>(); + submit(AccordExecutor::submitExclusive, SubmitPlainRunnable::new, result, run, commandStore); + return result; + } + + public void execute(Runnable command) + { + submit(command); + } + + public void executeDirectlyWithLock(Runnable command) + { + lock.lock(); + try + { + command.run(); + } + finally + { + lock.unlock(); + } + } + + public void execute(Runnable command, AccordCommandStore commandStore) + { + submit(command, commandStore); + } + + @Override + public void setCapacity(long bytes) + { + Invariants.require(isOwningThread()); + cache.setCapacity(bytes); + maxWorkingCapacityInBytes = cache.capacity() + maxWorkingSetSizeInBytes; + } + + public void setWorkingSetSize(long bytes) + { + Invariants.require(isOwningThread()); + maxWorkingSetSizeInBytes = bytes; + maxWorkingCapacityInBytes = cache.capacity() + maxWorkingSetSizeInBytes; + if (maxWorkingCapacityInBytes < maxWorkingSetSizeInBytes) + maxWorkingCapacityInBytes = Long.MAX_VALUE; + } + + public void setMaxQueuedLoads(int total, int range) + { + Invariants.require(isOwningThread()); + maxQueuedLoads = total; + maxQueuedRangeLoads = range; + } + + @Override + public long capacity() + { + return cache.capacity(); + } + + @Override + public int size() + { + return cache.size(); + } + + @Override + public long weightedSize() + { + return cache.weightedSize(); + } + + public static abstract class Task extends IntrusivePriorityHeap.Node + { + final AccordCommandStore commandStore; + long queuePosition; + + protected Task(AccordCommandStore commandStore) + { + this.commandStore = commandStore; + } + + /** + * Prepare to run while holding the state cache lock + */ + abstract protected void preRunExclusive(); + + /** + * Run the command; the state cache lock may or may not be held depending on the executor implementation + */ + abstract protected void run(); + /** + * Fail the command; the state cache lock may or may not be held depending on the executor implementation + */ + abstract protected void fail(Throwable fail); + + /** + * Cleanup the command while holding the state cache lock + */ + abstract protected void cleanupExclusive(); + + abstract protected void addToQueue(TaskQueue queue); + } + + static class CommandStoreQueueTask extends Task + { + private final CommandStoreQueue queue; + private Task task; + + CommandStoreQueueTask(CommandStoreQueue queue, AccordCommandStore commandStore) + { + super(commandStore); + this.queue = queue; + } + + public boolean isSet() + { + return this.task != null; + } + + public void reset() + { + queuePosition = -1; + this.task = null; + } + + public void setNext(Task task) + { + queuePosition = task.queuePosition; + this.task = task; + } + + @Override + protected void preRunExclusive() + { + Invariants.require(task != null); + Thread self = Thread.currentThread(); + commandStore.setOwner(self, self); + task.preRunExclusive(); + } + + @Override + protected void run() + { + task.run(); + } + + @Override + protected void fail(Throwable t) + { + task.fail(t); + } + + @Override + protected void cleanupExclusive() + { + task.cleanupExclusive(); + commandStore.setOwner(null, Thread.currentThread()); + queue.updateNext(); + } + + @Override + protected void addToQueue(TaskQueue queue) + { + throw new UnsupportedOperationException(); + } + } + + class CommandStoreQueue extends TaskQueue + { + final CommandStoreQueueTask next; + + CommandStoreQueue(AccordCommandStore commandStore) + { + super(WAITING_TO_RUN); + this.next = new CommandStoreQueueTask(this, commandStore); + } + + void updateNext() + { + updateNext(super.poll()); + } + + void updateNext(Task task) + { + next.reset(); + if (task != null) + task.addToQueue(this); + } + + public void appendOrSetNext(Task task) + { + if (!next.isSet()) + task.addToQueue(this); + else + super.append(task); + } + + @Override + public void append(Task task) + { + Invariants.require(!next.isSet()); + // TODO (expected): if the new task is higher priority, replace next + next.setNext(task); + waitingToRun.append(next); + } + + @Override + public void remove(Task remove) + { + if (next.isSet() && next.task == remove) + { + next.reset(); + waitingToRun.remove(next); + return; + } + + super.remove(remove); + } + + @Override + public Task poll() + { + throw new UnsupportedOperationException(); + } + + @Override + public Task peek() + { + throw new UnsupportedOperationException(); + } + + @Override + public boolean contains(Task contains) + { + throw new UnsupportedOperationException(); + } + } + + static class TaskQueue extends IntrusivePriorityHeap + { + final AccordTask.State kind; + + TaskQueue(AccordTask.State kind) + { + this.kind = kind; + } + + @Override + public int compare(T o1, T o2) + { + return Long.compare(o1.queuePosition, o2.queuePosition); + } + public void append(T task) + { + super.append(task); + } + + public T poll() + { + ensureHeapified(); + return pollNode(); + } + + public T peek() + { + ensureHeapified(); + return peekNode(); + } + + public void remove(T remove) + { + Invariants.require(super.contains(remove)); + super.remove(remove); + Invariants.require(!super.contains(remove)); + } + + public boolean contains(T contains) + { + return super.contains(contains); + } + } + + private abstract static class SubmitAsync + { + abstract void acceptExclusive(AccordExecutor executor); + } + + private static class SubmitPlainRunnable extends SubmitAsync + { + final AsyncPromise result; + final Runnable run; + final AccordCommandStore commandStore; + + private SubmitPlainRunnable(AsyncPromise result, Runnable run, AccordCommandStore commandStore) + { + this.result = result; + this.run = run; + this.commandStore = commandStore; + } + + @Override + void acceptExclusive(AccordExecutor executor) + { + executor.submitExclusive(result, run, commandStore); + } + } + + private static class OnLoaded extends SubmitAsync + { + static final int FAIL = 1; + static final int RANGE = 2; + final AccordCacheEntry loaded; + final Object result; + final int flags; + + OnLoaded(AccordCacheEntry loaded, V success, Throwable fail, boolean isForRange) + { + this.loaded = loaded; + int flags = isForRange ? RANGE : 0; + if (fail == null) + { + result = success; + } + else + { + result = fail; + flags |= FAIL; + } + this.flags = flags; + } + + V success() + { + return (flags & FAIL) == 0 ? (V) result : null; + } + + Throwable fail() + { + return (flags & FAIL) == 0 ? null : (Throwable) result; + } + + boolean isForRange() + { + return (flags & RANGE) != 0; + } + + @Override + void acceptExclusive(AccordExecutor executor) + { + executor.onLoadedExclusive(loaded, success(), fail(), isForRange()); + } + } + + private static class OnScannedRanges extends SubmitAsync + { + final AccordTask scanned; + final Throwable fail; + + private OnScannedRanges(AccordTask scanned, Throwable fail) + { + this.scanned = scanned; + this.fail = fail; + } + + @Override + void acceptExclusive(AccordExecutor executor) + { + executor.onScannedRangesExclusive(scanned, fail); + } + } + + private static class OnSaved extends SubmitAsync + { + final AccordCacheEntry state; + final Object identity; + final Throwable fail; + + private OnSaved(AccordCacheEntry state, Object identity, Throwable fail) + { + this.state = state; + this.identity = identity; + this.fail = fail; + } + + @Override + void acceptExclusive(AccordExecutor executor) + { + executor.onSavedExclusive(state, identity, fail); + } + } + + private static class CancelAsync extends SubmitAsync + { + final AccordTask cancel; + + private CancelAsync(AccordTask cancel) + { + this.cancel = cancel; + } + + @Override + void acceptExclusive(AccordExecutor executor) + { + executor.cancelExclusive(cancel); + } + } + + static IntFunction constant(O out) + { + return ignore -> out; + } + + static ExecutorFunctionFactory constantFactory(ExecutorFunction exec) + { + return ignore -> exec; + } + + static ExecutorFunctionFactory constantFactory(ExecutorPlus exec) + { + return ignore -> wrap(exec); + } + + static ExecutorFunction wrap(ExecutorPlus exec) + { + return (t, r) -> wrap(exec.submit(r)); + } + + static Cancellable wrap(Future f) + { + return () -> f.cancel(false); + } + + public static ExecutorFunction submitIOToSelf(AccordExecutor executor) + { + return executor::submitIOExclusive; + } + + private static Function alwaysNullTask(ExecutorFunction f) + { + return r -> f.apply(null, r); + } + + class PlainRunnable extends Task implements Cancellable + { + final AsyncPromise result; + final Runnable run; + + PlainRunnable(AsyncPromise result, Runnable run, AccordCommandStore commandStore) + { + super(commandStore); + this.result = result; + this.run = run; + } + + @Override + protected void preRunExclusive() {} + + @Override + protected void run() + { + run.run(); + if (result != null) + result.trySuccess(null); + } + + @Override + protected void fail(Throwable t) + { + if (result != null) + result.tryFailure(t); + agent.onUncaughtException(t); + } + + @Override + protected void cleanupExclusive() {} + + @Override + protected void addToQueue(TaskQueue queue) + { + Invariants.require(queue.kind == WAITING_TO_RUN); + queue.append(this); + } + + @Override + public void cancel() + { + executeDirectlyWithLock(() -> { + if (isInHeap()) + { + waitingToRun.remove(this); + if (result != null) + result.cancel(false); + } + }); + } + } + +} diff --git a/src/java/org/apache/cassandra/service/accord/AccordExecutorAbstractLockLoop.java b/src/java/org/apache/cassandra/service/accord/AccordExecutorAbstractLockLoop.java new file mode 100644 index 000000000000..8ef72c95b09f --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/AccordExecutorAbstractLockLoop.java @@ -0,0 +1,291 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import java.util.concurrent.locks.Lock; + +import accord.api.Agent; +import accord.utils.QuadFunction; +import accord.utils.QuintConsumer; +import org.apache.cassandra.metrics.AccordCacheMetrics; +import org.apache.cassandra.utils.concurrent.ConcurrentLinkedStack; + +import static org.apache.cassandra.service.accord.AccordExecutor.Mode.RUN_WITH_LOCK; + +abstract class AccordExecutorAbstractLockLoop extends AccordExecutor +{ + final ConcurrentLinkedStack submitted = new ConcurrentLinkedStack<>(); + boolean isHeldByExecutor; + boolean shutdown; + + AccordExecutorAbstractLockLoop(Lock lock, int executorId, AccordCacheMetrics metrics, ExecutorFunctionFactory loadExecutor, ExecutorFunctionFactory saveExecutor, ExecutorFunctionFactory rangeLoadExecutor, Agent agent) + { + super(lock, executorId, metrics, loadExecutor, saveExecutor, rangeLoadExecutor, agent); + } + + abstract void notifyWork(); + abstract void notifyWorkExclusive(); + abstract void awaitExclusive() throws InterruptedException; + abstract boolean isInLoop(); + abstract void submitExternal(QuintConsumer sync, QuadFunction async, P1s p1s, P1a p1a, P2 p2, P3 p3, P4 p4); + + void submit(QuintConsumer sync, QuadFunction async, P1s p1s, P1a p1a, P2 p2, P3 p3, P4 p4) + { + // if we're a loop thread, we will poll the waitingToRun queue when we come around + if (isInLoop()) submitted.push(async.apply(p1a, p2, p3, p4)); + else submitExternal(sync, async, p1s, p1a, p2, p3, p4); + } + + void submitExternalExclusive(QuintConsumer sync, QuadFunction async, P1s p1s, P1a p1a, P2 p2, P3 p3, P4 p4) + { + try + { + try + { + drainSubmittedExclusive(); + } + catch (Throwable t) + { + try { sync.accept(this, p1s, p2, p3, p4); } + catch (Throwable t2) { t.addSuppressed(t2); } + throw t; + } + sync.accept(this, p1s, p2, p3, p4); + } + finally + { + notifyIfMoreWorkExclusive(); + } + } + + public boolean hasTasks() + { + if (tasks > 0 || !submitted.isEmpty() || running > 0) + return true; + + lock.lock(); + try + { + return tasks > 0 || !submitted.isEmpty() || running > 0; + } + finally + { + lock.unlock(); + } + } + + void updateWaitingToRunExclusive() + { + drainSubmittedExclusive(); + super.updateWaitingToRunExclusive(); + } + + void drainSubmittedExclusive() + { + submitted.drain(AccordExecutor::consumeExclusive, this, true); + } + + void notifyIfMoreWorkExclusive() + { + if (hasWaitingToRun()) + notifyWorkExclusive(); + } + + private void enterLockExclusive() + { + isHeldByExecutor = true; + } + + private void exitLockExclusive() + { + notifyIfMoreWorkExclusive(); + } + + private void pauseExclusive() + { + isHeldByExecutor = false; + --running; + } + + private void resumeExclusive() + { + isHeldByExecutor = true; + ++running; + } + + Runnable task(Mode mode) + { + return mode == RUN_WITH_LOCK ? this::runWithLock : this::runWithoutLock; + } + + protected void runWithLock() + { + while (true) + { + lock.lock(); + try + { + resumeExclusive(); + enterLockExclusive(); + while (true) + { + Task task = pollWaitingToRunExclusive(); + + if (task != null) + { + --tasks; + try + { + task.preRunExclusive(); + task.run(); + } + catch (Throwable t) + { + task.fail(t); + } + finally + { + task.cleanupExclusive(); + } + } + else + { + if (shutdown) + { + pauseExclusive(); + exitLockExclusive(); + notifyWorkExclusive(); // always notify on shutdown + return; + } + + pauseExclusive(); + awaitExclusive(); + resumeExclusive(); + } + } + } + catch (Throwable t) + { + pauseExclusive(); + exitLockExclusive(); + + try { agent.onUncaughtException(t); } + catch (Throwable t2) { } + } + finally + { + lock.unlock(); + } + } + } + + protected void runWithoutLock() + { + Task task = null; + while (true) + { + lock.lock(); + try + { + if (task != null) task.cleanupExclusive(); + else resumeExclusive(); + enterLockExclusive(); + + while (true) + { + task = pollWaitingToRunExclusive(); + if (task != null) + { + exitLockExclusive(); + break; + } + + if (shutdown) + { + exitLockExclusive(); + notifyWorkExclusive(); + return; + } + + pauseExclusive(); + awaitExclusive(); + resumeExclusive(); + } + --tasks; + task.preRunExclusive(); + } + catch (Throwable t) + { + if (task != null) + { + try { task.fail(t); } + catch (Throwable t2) { t.addSuppressed(t2); } + try { task.cleanupExclusive(); } + catch (Throwable t2) { t.addSuppressed(t2); } + try { agent.onUncaughtException(t); } + catch (Throwable t2) { /* nothing we can sensibly do after already reporting */ } + task = null; + } + if (isHeldByExecutor) + pauseExclusive(); + exitLockExclusive(); + continue; + } + finally + { + lock.unlock(); + } + + try + { + task.run(); + } + catch (Throwable t) + { + try { task.fail(t); } + catch (Throwable t2) + { + try + { + t2.addSuppressed(t); + agent.onUncaughtException(t2); + } + catch (Throwable t3) + { + // empty to ensure we definitely loop so we cleanup the task + } + } + } + } + } + + @Override + public void shutdown() + { + shutdown = true; + notifyWork(); + } + + @Override + public Object shutdownNow() + { + shutdown(); + return null; + } +} diff --git a/src/java/org/apache/cassandra/service/accord/AccordExecutorAbstractSemiSyncSubmit.java b/src/java/org/apache/cassandra/service/accord/AccordExecutorAbstractSemiSyncSubmit.java new file mode 100644 index 000000000000..42cde814cf3a --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/AccordExecutorAbstractSemiSyncSubmit.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import java.util.concurrent.locks.Lock; + +import accord.api.Agent; +import accord.utils.QuadFunction; +import accord.utils.QuintConsumer; +import org.apache.cassandra.metrics.AccordCacheMetrics; + +abstract class AccordExecutorAbstractSemiSyncSubmit extends AccordExecutorAbstractLockLoop +{ + AccordExecutorAbstractSemiSyncSubmit(Lock lock, int executorId, AccordCacheMetrics metrics, ExecutorFunctionFactory loadExecutor, ExecutorFunctionFactory saveExecutor, ExecutorFunctionFactory rangeLoadExecutor, Agent agent) + { + super(lock, executorId, metrics, loadExecutor, saveExecutor, rangeLoadExecutor, agent); + } + + abstract void awaitExclusive() throws InterruptedException; + + void submitExternal(QuintConsumer sync, QuadFunction async, P1s p1s, P1a p1a, P2 p2, P3 p3, P4 p4) + { + if (!lock.tryLock()) + { + submitted.push(async.apply(p1a, p2, p3, p4)); + notifyWork(); + return; + } + + try + { + submitExternalExclusive(sync, async, p1s, p1a, p2, p3, p4); + } + finally + { + lock.unlock(); + } + } +} diff --git a/src/java/org/apache/cassandra/service/accord/AccordExecutorAsyncSubmit.java b/src/java/org/apache/cassandra/service/accord/AccordExecutorAsyncSubmit.java new file mode 100644 index 000000000000..7510e6159ac2 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/AccordExecutorAsyncSubmit.java @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import java.util.concurrent.TimeUnit; +import java.util.function.IntFunction; + +import accord.api.Agent; +import org.apache.cassandra.metrics.AccordCacheMetrics; +import org.apache.cassandra.utils.concurrent.LockWithAsyncSignal; + +// WARNING: experimental - needs more testing +class AccordExecutorAsyncSubmit extends AccordExecutorAbstractSemiSyncSubmit +{ + private final AccordExecutorLoops loops; + private final LockWithAsyncSignal lock; + + public AccordExecutorAsyncSubmit(int executorId, Mode mode, int threads, IntFunction name, AccordCacheMetrics metrics, ExecutorFunctionFactory loadExecutor, ExecutorFunctionFactory saveExecutor, ExecutorFunctionFactory rangeLoadExecutor, Agent agent) + { + this(new LockWithAsyncSignal(), executorId, mode, threads, name, metrics, loadExecutor, saveExecutor, rangeLoadExecutor, agent); + } + + private AccordExecutorAsyncSubmit(LockWithAsyncSignal lock, int executorId, Mode mode, int threads, IntFunction name, AccordCacheMetrics metrics, ExecutorFunctionFactory loadExecutor, ExecutorFunctionFactory saveExecutor, ExecutorFunctionFactory rangeLoadExecutor, Agent agent) + { + super(lock, executorId, metrics, loadExecutor, saveExecutor, rangeLoadExecutor, agent); + this.lock = lock; + this.loops = new AccordExecutorLoops(mode, threads, name, this::task); + } + + @Override + void awaitExclusive() throws InterruptedException + { + lock.clearSignal(); + if (submitted.isEmpty()) + lock.await(); + } + + @Override + boolean isInLoop() + { + return loops.isInLoop(); + } + + @Override + void notifyWork() + { + lock.signal(); + } + + @Override + void notifyWorkExclusive() + { + lock.signal(); + } + + @Override + boolean isOwningThread() + { + return lock.isOwner(Thread.currentThread()); + } + + @Override + public boolean isTerminated() + { + return loops.isTerminated(); + } + + @Override + public boolean awaitTermination(long timeout, TimeUnit unit) throws InterruptedException + { + return loops.awaitTermination(timeout, unit); + } +} diff --git a/src/java/org/apache/cassandra/service/accord/AccordExecutorLoops.java b/src/java/org/apache/cassandra/service/accord/AccordExecutorLoops.java new file mode 100644 index 000000000000..c5442c644da4 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/AccordExecutorLoops.java @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.function.Function; +import java.util.function.IntFunction; + +import accord.utils.Invariants; +import org.agrona.collections.Long2ObjectHashMap; +import org.apache.cassandra.service.accord.AccordExecutor.Mode; +import org.apache.cassandra.utils.concurrent.Condition; + +import static org.apache.cassandra.concurrent.ExecutorFactory.Global.executorFactory; +import static org.apache.cassandra.concurrent.ExecutorFactory.SimulatorThreadTag.INFINITE_LOOP; +import static org.apache.cassandra.concurrent.ExecutorFactory.SystemThreadTag.NON_DAEMON; +import static org.apache.cassandra.service.accord.AccordExecutor.Mode.RUN_WITH_LOCK; +import static org.apache.cassandra.utils.Clock.Global.nanoTime; + +class AccordExecutorLoops +{ + private final Long2ObjectHashMap loops; + + private final AtomicInteger running = new AtomicInteger(); + private final Condition terminated = Condition.newOneTimeCondition(); + + public AccordExecutorLoops(Mode mode, int threads, IntFunction name, Function loopFactory) + { + Invariants.require(mode == RUN_WITH_LOCK ? threads == 1 : threads >= 1); + running.addAndGet(threads); + loops = new Long2ObjectHashMap<>(threads, 0.65f); + for (int i = 0; i < threads; ++i) + { + Thread thread = executorFactory().startThread(name.apply(i), wrap(loopFactory.apply(mode)), NON_DAEMON, INFINITE_LOOP); + Thread conflict = loops.putIfAbsent(thread.getId(), thread); + Invariants.require(conflict == null || !conflict.isAlive(), "Allocated two threads with the same threadId!"); + } + } + + private Runnable wrap(Runnable run) + { + return () -> + { + try + { + run.run(); + } + finally + { + if (0 == running.decrementAndGet()) + terminated.signalAll(); + } + }; + } + + public boolean isInLoop() + { + Thread thread = Thread.currentThread(); + return loops.get(thread.getId()) == thread; + } + + public boolean isTerminated() + { + return terminated.isSignalled(); + } + + public boolean awaitTermination(long timeout, TimeUnit unit) throws InterruptedException + { + long deadline = nanoTime() + unit.toNanos(timeout); + return terminated.awaitUntil(deadline); + } +} diff --git a/src/java/org/apache/cassandra/service/accord/AccordExecutorSemiSyncSubmit.java b/src/java/org/apache/cassandra/service/accord/AccordExecutorSemiSyncSubmit.java new file mode 100644 index 000000000000..0e26e38537b1 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/AccordExecutorSemiSyncSubmit.java @@ -0,0 +1,103 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import java.util.concurrent.TimeUnit; +import java.util.concurrent.locks.Condition; +import java.util.concurrent.locks.ReentrantLock; +import java.util.function.IntFunction; + +import accord.api.Agent; +import org.apache.cassandra.metrics.AccordCacheMetrics; + +// WARNING: experimental - needs more testing +class AccordExecutorSemiSyncSubmit extends AccordExecutorAbstractSemiSyncSubmit +{ + private final AccordExecutorLoops loops; + private final ReentrantLock lock; + private final Condition hasWork; + + public AccordExecutorSemiSyncSubmit(int executorId, Mode mode, int threads, IntFunction name, AccordCacheMetrics metrics, ExecutorFunctionFactory loadExecutor, ExecutorFunctionFactory saveExecutor, ExecutorFunctionFactory rangeLoadExecutor, Agent agent) + { + this(new ReentrantLock(), executorId, mode, threads, name, metrics, loadExecutor, saveExecutor, rangeLoadExecutor, agent); + } + + private AccordExecutorSemiSyncSubmit(ReentrantLock lock, int executorId, Mode mode, int threads, IntFunction name, AccordCacheMetrics metrics, ExecutorFunctionFactory loadExecutor, ExecutorFunctionFactory saveExecutor, ExecutorFunctionFactory rangeLoadExecutor, Agent agent) + { + super(lock, executorId, metrics, loadExecutor, saveExecutor, rangeLoadExecutor, agent); + this.lock = lock; + this.hasWork = lock.newCondition(); + this.loops = new AccordExecutorLoops(mode, threads, name, this::task); + } + + @Override + void awaitExclusive() throws InterruptedException + { + if (submitted.isEmpty()) + hasWork.await(); + } + + @Override + boolean isInLoop() + { + return loops.isInLoop(); + } + + @Override + void notifyWork() + { + // we check running both sides of tryLock for ordering guarantees + boolean hadRunning = isHeldByExecutor; + if (lock.tryLock()) + { + try { hasWork.signal(); } + finally { lock.unlock(); } + } + else if (!hadRunning || !isHeldByExecutor) + { + lock.lock(); + try { hasWork.signal(); } + finally { lock.unlock(); } + } + } + + @Override + void notifyWorkExclusive() + { + hasWork.signal(); + } + + @Override + boolean isOwningThread() + { + return lock.isHeldByCurrentThread(); + } + + @Override + public boolean isTerminated() + { + return loops.isTerminated(); + } + + @Override + public boolean awaitTermination(long timeout, TimeUnit unit) throws InterruptedException + { + return loops.awaitTermination(timeout, unit); + } +} diff --git a/src/java/org/apache/cassandra/service/accord/AccordExecutorSimple.java b/src/java/org/apache/cassandra/service/accord/AccordExecutorSimple.java new file mode 100644 index 000000000000..cd192e43c536 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/AccordExecutorSimple.java @@ -0,0 +1,160 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import java.util.concurrent.TimeUnit; +import java.util.concurrent.locks.ReentrantLock; +import java.util.function.IntFunction; + +import accord.api.Agent; +import accord.utils.Invariants; +import accord.utils.QuadFunction; +import accord.utils.QuintConsumer; +import org.apache.cassandra.concurrent.ExecutorPlus; +import org.apache.cassandra.concurrent.Stage; +import org.apache.cassandra.metrics.AccordCacheMetrics; + +import static org.apache.cassandra.concurrent.ExecutorFactory.Global.executorFactory; + +class AccordExecutorSimple extends AccordExecutor +{ + final ExecutorPlus executor; + final ReentrantLock lock; + + public AccordExecutorSimple(int executorId, String name, AccordCacheMetrics metrics, Agent agent) + { + this(executorId, name, metrics, Stage.READ.executor(), Stage.MUTATION.executor(), Stage.READ.executor(), agent); + } + + public AccordExecutorSimple(int executorId, String name, AccordCacheMetrics metrics, ExecutorPlus loadExecutor, ExecutorPlus saveExecutor, ExecutorPlus rangeLoadExecutor, Agent agent) + { + this(executorId, name, metrics, wrap(loadExecutor), wrap(saveExecutor), wrap(rangeLoadExecutor), agent); + } + + public AccordExecutorSimple(int executorId, String name, AccordCacheMetrics metrics, ExecutorFunction loadExecutor, ExecutorFunction saveExecutor, ExecutorFunction rangeLoadExecutor, Agent agent) + { + this(new ReentrantLock(), executorId, name, metrics, loadExecutor, saveExecutor, rangeLoadExecutor, agent); + } + + private AccordExecutorSimple(ReentrantLock lock, int executorId, String name, AccordCacheMetrics metrics, ExecutorFunction loadExecutor, ExecutorFunction saveExecutor, ExecutorFunction rangeLoadExecutor, Agent agent) + { + super(lock, executorId, metrics, constantFactory(loadExecutor), constantFactory(saveExecutor), constantFactory(rangeLoadExecutor), agent); + this.lock = lock; + this.executor = executorFactory().sequential(name); + } + + public AccordExecutorSimple(int executorId, Mode mode, int threads, IntFunction name, AccordCacheMetrics metrics, ExecutorFunctionFactory loadExecutor, ExecutorFunctionFactory saveExecutor, ExecutorFunctionFactory rangeLoadExecutor, Agent agent) + { + this(new ReentrantLock(), executorId, mode, threads, name, metrics, loadExecutor, saveExecutor, rangeLoadExecutor, agent); + } + + public AccordExecutorSimple(ReentrantLock lock, int executorId, Mode mode, int threads, IntFunction name, AccordCacheMetrics metrics, ExecutorFunctionFactory loadExecutor, ExecutorFunctionFactory saveExecutor, ExecutorFunctionFactory rangeLoadExecutor, Agent agent) + { + super(lock, executorId, metrics, loadExecutor, saveExecutor, rangeLoadExecutor, agent); + Invariants.requireArgument(threads == 1); + this.lock = lock; + this.executor = executorFactory().sequential(name.apply(0)); + + } + + @Override + public boolean hasTasks() + { + return tasks + executor.getActiveTaskCount() + executor.getPendingTaskCount() > 0; + } + + protected void run() + { + lock.lock(); + try + { + running = 1; + while (true) + { + Task task = pollWaitingToRunExclusive(); + if (task == null) + return; + + --tasks; + try { task.preRunExclusive(); task.run(); } + catch (Throwable t) { task.fail(t); } + finally { task.cleanupExclusive(); } + } + } + catch (Throwable t) + { + throw t; + } + finally + { + running = 0; + if (hasWaitingToRun()) + executor.execute(this::run); + lock.unlock(); + } + } + + @Override + void submit(QuintConsumer sync, QuadFunction async, P1s p1s, P1a p1a, P2 p2, P3 p3, P4 p4) + { + lock.lock(); + try + { + sync.accept(this, p1s, p2, p3, p4); + } + finally + { + if (hasWaitingToRun()) + executor.execute(this::run); + + lock.unlock(); + } + } + + @Override + boolean isOwningThread() + { + return lock.isHeldByCurrentThread(); + } + + @Override + public boolean isTerminated() + { + return executor.isTerminated(); + } + + @Override + public void shutdown() + { + executor.shutdown(); + } + + @Override + public Object shutdownNow() + { + return executor.shutdownNow(); + } + + @Override + public boolean awaitTermination(long timeout, TimeUnit units) throws InterruptedException + { + return executor.awaitTermination(timeout, units); + } + +} diff --git a/src/java/org/apache/cassandra/service/accord/AccordExecutorSyncSubmit.java b/src/java/org/apache/cassandra/service/accord/AccordExecutorSyncSubmit.java new file mode 100644 index 000000000000..5fb5d7895b3b --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/AccordExecutorSyncSubmit.java @@ -0,0 +1,123 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import java.util.concurrent.TimeUnit; +import java.util.concurrent.locks.Condition; +import java.util.concurrent.locks.ReentrantLock; +import java.util.function.IntFunction; + +import accord.api.Agent; +import accord.utils.QuadFunction; +import accord.utils.QuintConsumer; +import org.apache.cassandra.concurrent.ExecutorPlus; +import org.apache.cassandra.metrics.AccordCacheMetrics; + +class AccordExecutorSyncSubmit extends AccordExecutorAbstractLockLoop +{ + private final AccordExecutorLoops loops; + private final ReentrantLock lock; + private final Condition hasWork; + + public AccordExecutorSyncSubmit(int executorId, Mode mode, String name, AccordCacheMetrics metrics, ExecutorPlus loadExecutor, ExecutorPlus saveExecutor, ExecutorPlus rangeLoadExecutor, Agent agent) + { + this(executorId, mode, 1, constant(name), metrics, loadExecutor, saveExecutor, rangeLoadExecutor, agent); + } + + public AccordExecutorSyncSubmit(int executorId, Mode mode, int threads, IntFunction name, AccordCacheMetrics metrics, ExecutorPlus loadExecutor, ExecutorPlus saveExecutor, ExecutorPlus rangeLoadExecutor, Agent agent) + { + this(executorId, mode, threads, name, metrics, constantFactory(loadExecutor), constantFactory(saveExecutor), constantFactory(rangeLoadExecutor), agent); + } + + public AccordExecutorSyncSubmit(int executorId, Mode mode, int threads, IntFunction name, AccordCacheMetrics metrics, ExecutorFunctionFactory loadExecutor, ExecutorFunctionFactory saveExecutor, ExecutorFunctionFactory rangeLoadExecutor, Agent agent) + { + this(new ReentrantLock(), executorId, mode, threads, name, metrics, loadExecutor, saveExecutor, rangeLoadExecutor, agent); + } + + private AccordExecutorSyncSubmit(ReentrantLock lock, int executorId, Mode mode, int threads, IntFunction name, AccordCacheMetrics metrics, ExecutorFunctionFactory loadExecutor, ExecutorFunctionFactory saveExecutor, ExecutorFunctionFactory rangeLoadExecutor, Agent agent) + { + super(lock, executorId, metrics, loadExecutor, saveExecutor, rangeLoadExecutor, agent); + this.lock = lock; + this.hasWork = lock.newCondition(); + this.loops = new AccordExecutorLoops(mode, threads, name, this::task); + } + + @Override + void awaitExclusive() throws InterruptedException + { + hasWork.await(); + } + + @Override + boolean isInLoop() + { + return loops.isInLoop(); + } + + @Override + boolean isOwningThread() + { + return lock.isHeldByCurrentThread(); + } + + @Override + void notifyWork() + { + lock.lock(); + try + { + hasWork.signal(); + } + finally + { + lock.unlock(); + } + } + + @Override + void notifyWorkExclusive() + { + hasWork.signal(); + } + + void submitExternal(QuintConsumer sync, QuadFunction async, P1s p1s, P1a p1a, P2 p2, P3 p3, P4 p4) + { + lock.lock(); + try + { + submitExternalExclusive(sync, async, p1s, p1a, p2, p3, p4); + } + finally + { + lock.unlock(); + } + } + + @Override + public boolean isTerminated() + { + return loops.isTerminated(); + } + + @Override + public boolean awaitTermination(long timeout, TimeUnit unit) throws InterruptedException + { + return loops.awaitTermination(timeout, unit); + } +} diff --git a/src/java/org/apache/cassandra/service/accord/AccordFastPath.java b/src/java/org/apache/cassandra/service/accord/AccordFastPath.java new file mode 100644 index 000000000000..71150355dd8b --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/AccordFastPath.java @@ -0,0 +1,302 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import java.io.IOException; +import java.util.Map; +import java.util.Objects; + +import accord.local.Node; +import com.google.common.collect.ImmutableMap; + +import com.google.common.collect.ImmutableSet; +import org.apache.cassandra.db.TypeSizes; +import org.apache.cassandra.exceptions.InvalidRequestException; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.service.accord.serializers.TopologySerializers; +import org.apache.cassandra.tcm.Epoch; +import org.apache.cassandra.tcm.MetadataValue; +import org.apache.cassandra.tcm.membership.NodeId; +import org.apache.cassandra.tcm.serialization.MetadataSerializer; +import org.apache.cassandra.tcm.serialization.Version; + +/** + * Cluster availability info for services that need a consistent view of availability for a given epoch, such + * as accord topology calculation + */ +public class AccordFastPath implements MetadataValue +{ + public static final AccordFastPath EMPTY = new AccordFastPath(ImmutableMap.of(), Epoch.EMPTY); + + public enum Status + { + NORMAL, SHUTDOWN, UNAVAILABLE; + + public boolean isUnavailable() + { + switch (this) + { + case UNAVAILABLE: + case SHUTDOWN: + return true; + default: + return false; + } + } + + public static final MetadataSerializer serializer = new MetadataSerializer() + { + @Override + public void serialize(Status status, DataOutputPlus out, Version version) throws IOException + { + switch (status) + { + case NORMAL: out.write(0); break; + case SHUTDOWN: out.write(1); break; + case UNAVAILABLE: out.write(2); break; + default: throw new IllegalStateException("Unhandled status: " + this); + } + } + + @Override + public Status deserialize(DataInputPlus in, Version version) throws IOException + { + byte b = in.readByte(); + switch (b) + { + case 0: return NORMAL; + case 1: return SHUTDOWN; + case 2: return UNAVAILABLE; + default: throw new IllegalArgumentException("Unhandled status byte: " + b); + } + } + + @Override + public long serializedSize(Status status, Version version) + { + return TypeSizes.BYTE_SIZE; + } + }; + }; + + public static class NodeInfo + { + public final Status status; + public final long updated; + + public NodeInfo(Status status, long updated) + { + this.status = status; + this.updated = updated; + } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + NodeInfo nodeInfo = (NodeInfo) o; + return updated == nodeInfo.updated && status == nodeInfo.status; + } + + @Override + public int hashCode() + { + return Objects.hash(status, updated); + } + + @Override + public String toString() + { + return "NodeInfo{" + + "status=" + status + + ", updated=" + updated + + '}'; + } + + private static final MetadataSerializer serializer = new MetadataSerializer() + { + @Override + public void serialize(NodeInfo info, DataOutputPlus out, Version version) throws IOException + { + Status.serializer.serialize(info.status, out, version); + out.writeUnsignedVInt(info.updated); + } + + @Override + public NodeInfo deserialize(DataInputPlus in, Version version) throws IOException + { + return new NodeInfo(Status.serializer.deserialize(in, version), in.readUnsignedVInt()); + } + + @Override + public long serializedSize(NodeInfo info, Version version) + { + return Status.serializer.serializedSize(info.status, version) + TypeSizes.sizeofUnsignedVInt(info.updated); + } + }; + } + + public final ImmutableMap info; + + private final Epoch lastModified; + + AccordFastPath(ImmutableMap info, Epoch lastModified) + { + this.info = info; + this.lastModified = lastModified; + } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + AccordFastPath that = (AccordFastPath) o; + return info.equals(that.info) && lastModified.equals(that.lastModified); + } + + @Override + public int hashCode() + { + return Objects.hash(info, lastModified); + } + + public AccordFastPath withoutNode(NodeId tcmId) + { + Node.Id node = AccordTopology.tcmIdToAccord(tcmId); + if (!info.containsKey(node)) + return this; + + ImmutableMap.Builder builder = ImmutableMap.builder(); + info.forEach((n, info) -> { + if (!n.equals(node)) + builder.put(n, info); + }); + return new AccordFastPath(builder.build(), lastModified); + } + + public AccordFastPath withNodeStatusSince(Node.Id node, Status status, long updateTimeMillis, long updateDelayMillis) + { + NodeInfo current = info.get(node); + if (status == Status.SHUTDOWN && current != null) + { + // nodes report when they're being shutdown and aren't superseded + updateTimeMillis = Math.max(updateTimeMillis, current.updated + 1); + } + + if (!canUpdateNodeTo(current, status, updateTimeMillis, updateDelayMillis)) + throw new InvalidRequestException(String.format("cannot transition %s to %s at %s; current %s", node, status, updateTimeMillis, current)); + + ImmutableMap.Builder builder = ImmutableMap.builder(); + builder.put(node, new NodeInfo(status, updateTimeMillis)); + info.forEach((n, info) -> { + if (!n.equals(node)) + builder.put(n, info); + }); + return new AccordFastPath(builder.build(), lastModified); + } + + public boolean canUpdateNodeTo(NodeInfo current, Status status, long updateTimeMillis, long updateDelayMillis) + { + if (current == null) + return status != Status.NORMAL; + + if (current.status == status) + return false; + + return updateTimeMillis > current.updated + (status == Status.SHUTDOWN ? 0 : updateDelayMillis); + } + + public AccordFastPath withLastModified(Epoch epoch) + { + return new AccordFastPath(info, epoch); + } + + public Epoch lastModified() + { + return lastModified; + } + + public ImmutableSet unavailableIds() + { + ImmutableSet.Builder builder = ImmutableSet.builder(); + info.entrySet().stream() + .filter(entry -> entry.getValue().status.isUnavailable()) + .map(Map.Entry::getKey) + .forEach(builder::add); + return builder.build(); + } + + public static final MetadataSerializer serializer = new MetadataSerializer() + { + private void serializeMap(Map map, DataOutputPlus out, Version version) throws IOException + { + out.writeInt(map.size()); + for (Map.Entry entry : map.entrySet()) + { + TopologySerializers.nodeId.serialize(entry.getKey(), out); + NodeInfo.serializer.serialize(entry.getValue(), out, version); + } + } + + public void serialize(AccordFastPath accordFastPath, DataOutputPlus out, Version version) throws IOException + { + serializeMap(accordFastPath.info, out, version); + Epoch.serializer.serialize(accordFastPath.lastModified, out, version); + } + + private ImmutableMap deserializeMap(DataInputPlus in, Version version) throws IOException + { + int size = in.readInt(); + if (size == 0) + return ImmutableMap.of(); + + ImmutableMap.Builder builder = ImmutableMap.builder(); + for (int i=0; i map, Version version) + { + long size = TypeSizes.INT_SIZE; + for (Map.Entry entry : map.entrySet()) + { + size += TopologySerializers.nodeId.serializedSize(entry.getKey()); + size += NodeInfo.serializer.serializedSize(entry.getValue(), version); + } + return size; + } + + public long serializedSize(AccordFastPath accordFastPath, Version version) + { + return serializedMapSize(accordFastPath.info, version) + + Epoch.serializer.serializedSize(accordFastPath.lastModified, version); + } + }; +} diff --git a/src/java/org/apache/cassandra/service/accord/AccordFastPathCoordinator.java b/src/java/org/apache/cassandra/service/accord/AccordFastPathCoordinator.java new file mode 100644 index 000000000000..4e214e664f27 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/AccordFastPathCoordinator.java @@ -0,0 +1,347 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import accord.api.ConfigurationService; +import accord.local.Node; +import accord.primitives.Ranges; +import accord.topology.Topology; +import accord.utils.Invariants; +import accord.utils.async.AsyncResult; +import accord.utils.async.AsyncResults; +import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.ImmutableSet; +import org.apache.cassandra.concurrent.ScheduledExecutors; +import org.apache.cassandra.concurrent.Stage; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.gms.EndpointState; +import org.apache.cassandra.gms.Gossiper; +import org.apache.cassandra.gms.IEndpointStateChangeSubscriber; +import org.apache.cassandra.locator.InetAddressAndPort; +import org.apache.cassandra.service.StorageService; +import org.apache.cassandra.service.accord.AccordFastPath.NodeInfo; +import org.apache.cassandra.service.accord.AccordFastPath.Status; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.tcm.ClusterMetadataService; +import org.apache.cassandra.tcm.Epoch; +import org.apache.cassandra.tcm.listeners.ChangeListener; +import org.apache.cassandra.tcm.transformations.ReconfigureAccordFastPath; +import org.apache.cassandra.utils.Clock; + +import java.util.*; +import java.util.concurrent.TimeUnit; + +/** + * Listens to availability status of peers and updates tcm fast path data accordingly + */ +public abstract class AccordFastPathCoordinator implements ChangeListener, ConfigurationService.Listener +{ + private static final AsyncResult SUCCESS = AsyncResults.success(null); + + private static class PeerStatus + { + final Node.Id peer; + final Status status; + + public PeerStatus(Node.Id peer, Status status) + { + this.peer = peer; + this.status = status; + } + + boolean shouldUpdateFastPath(AccordFastPath fastPath, long nowMillis, long delayMillis) + { + NodeInfo info = fastPath.info.get(peer); + + if (info == null) + return status != Status.NORMAL; + + if (info.status == status || info.status == Status.SHUTDOWN) + return false; + + return nowMillis - info.updated > delayMillis; + } + } + + private static class Peers + { + static final Peers EMPTY = new Peers(0, ImmutableSet.of(), Collections.emptyMap()); + final long epoch; + final ImmutableSet peers; + final Map statusMap; + + public Peers(long epoch, ImmutableSet peers, Map statusMap) + { + this.epoch = epoch; + this.peers = peers; + this.statusMap = statusMap; + } + + public boolean contains(Node.Id node) + { + return peers.contains(node); + } + + public static Peers from(Node.Id localId, Topology topology, Peers prev) + { + Set peers = new HashSet<>(); + topology.forEachOn(localId, (shard, index) -> peers.addAll(shard.nodes)); + peers.remove(localId); + + Map statusMap = new HashMap<>(); + for (Node.Id peer : peers) + { + PeerStatus status = prev.statusMap.get(peer); + if (status != null) + statusMap.put(peer, status); + } + + return new Peers(topology.epoch(), ImmutableSet.copyOf(peers), statusMap); + } + + public PeerStatus onUpdate(Node.Id node, Status status) + { + Invariants.requireArgument(contains(node)); + PeerStatus peerStatus = new PeerStatus(node, status); + statusMap.put(node, peerStatus); + return peerStatus; + } + + public Iterable statusIterable() + { + return statusMap.values(); + } + } + + private boolean receivedShutdownSignal = false; + private volatile Epoch startupEpoch = null; + private volatile boolean issuedStartupUpdate = false; + private boolean hasRegistered = false; + private Peers peers = Peers.EMPTY; + private final Node.Id localId; + + public AccordFastPathCoordinator(Node.Id localId) + { + this.localId = localId; + } + + private boolean isShutdown(AccordFastPath fastPath) + { + NodeInfo info = fastPath.info.get(localId); + return info != null && info.status == Status.SHUTDOWN; + } + + public synchronized void start() + { + if (hasRegistered) + return; + + ClusterMetadata cm = currentMetadata(); + startupEpoch = cm.epoch; + registerAsListener(); + + // TODO: start check routine + + hasRegistered = true; + + AccordFastPath fastPath = cm.accordFastPath; + + long updateDelayMillis = getAccordFastPathUpdateDelayMillis(); + if (updateDelayMillis < 0) + return; + + if (isShutdown(fastPath)) + { + updateFastPath(localId, Status.NORMAL, Clock.Global.currentTimeMillis(), updateDelayMillis); + issuedStartupUpdate = true; + } + + scheduleMaintenanceTask(updateDelayMillis); + } + + abstract ClusterMetadata currentMetadata(); + abstract void registerAsListener(); + abstract void updateFastPath(Node.Id node, Status status, long updateTimeMillis, long updateDelayMillis); + abstract long getAccordFastPathUpdateDelayMillis(); + + private static class Impl extends AccordFastPathCoordinator implements IEndpointStateChangeSubscriber + { + private final AccordConfigurationService configService; + + public Impl(Node.Id localId, AccordConfigurationService configService) + { + super(localId); + this.configService = configService; + } + + @Override + ClusterMetadata currentMetadata() + { + return ClusterMetadata.current(); + } + + @Override + void registerAsListener() + { + Gossiper.instance.register(this); + StorageService.instance.addPreShutdownHook(this::onShutdown); + configService.registerListener(this); + } + + @Override + void updateFastPath(Node.Id node, Status status, long updateTimeMillis, long updateDelayMillis) + { + Stage.MISC.submit(() -> { + ClusterMetadataService.instance().commit(new ReconfigureAccordFastPath(node, status, updateTimeMillis, updateDelayMillis), + metadata -> metadata, ((code, message) -> null)); + }); + } + + @Override + long getAccordFastPathUpdateDelayMillis() + { + return DatabaseDescriptor.getAccordFastPathUpdateDelayMillis(); + } + + @Override + public void onAlive(InetAddressAndPort endpoint, EndpointState state) + { + Node.Id node = configService.mappedIdOrNull(endpoint); + if (node != null) onAlive(node); + } + + @Override + public void onDead(InetAddressAndPort endpoint, EndpointState state) + { + Node.Id node = configService.mappedIdOrNull(endpoint); + if (node != null) onDead(node); + } + } + + public static AccordFastPathCoordinator create(Node.Id localId, AccordConfigurationService configService) + { + return new Impl(localId, configService); + } + + synchronized void maybeUpdateFastPath(Node.Id node, Status status) + { + long nowMillis = Clock.Global.currentTimeMillis(); + long delayMillis = getAccordFastPathUpdateDelayMillis(); + + // don't schedule updates for nodes we don't share shards with + if (!peers.contains(node)) + return; + + PeerStatus peerStatus = peers.onUpdate(node, status); + ClusterMetadata metadata = currentMetadata(); + if (peerStatus.shouldUpdateFastPath(metadata.accordFastPath, nowMillis, delayMillis)) + updateFastPath(node, status, nowMillis, delayMillis); + } + + private void scheduleMaintenanceTask(long delayMillis) + { + ScheduledExecutors.scheduledTasks.scheduleSelfRecurring(this::maintenance, delayMillis, TimeUnit.MILLISECONDS); + } + + synchronized void maintenance() + { + long nowMillis = Clock.Global.currentTimeMillis(); + long delayMillis = getAccordFastPathUpdateDelayMillis(); + try + { + ClusterMetadata metadata = currentMetadata(); + for (PeerStatus status : peers.statusIterable()) + { + if (status.shouldUpdateFastPath(metadata.accordFastPath, nowMillis, delayMillis)) + updateFastPath(status.peer, status.status, nowMillis, delayMillis); + } + } + finally + { + scheduleMaintenanceTask(delayMillis); + } + } + + void onAlive(Node.Id node) + { + maybeUpdateFastPath(node, Status.NORMAL); + } + + public void onDead(Node.Id node) + { + maybeUpdateFastPath(node, Status.UNAVAILABLE); + } + + public void onShutdown() + { + synchronized (this) + { + receivedShutdownSignal = true; + } + + updateFastPath(localId, Status.SHUTDOWN, Clock.Global.currentTimeMillis(), getAccordFastPathUpdateDelayMillis()); + } + + /** + * In case we somehow missed that we've marked ourselves shutdown on startup + */ + @Override + public void notifyPostCommit(ClusterMetadata prev, ClusterMetadata next, boolean fromSnapshot) + { + if (next.epoch.compareTo(startupEpoch) <= 0) + return; + + if (!isShutdown(next.accordFastPath)) + return; + + synchronized (this) + { + if (receivedShutdownSignal || issuedStartupUpdate) + return; + issuedStartupUpdate = true; + } + + updateFastPath(localId, Status.NORMAL, Clock.Global.currentTimeMillis(), getAccordFastPathUpdateDelayMillis()); + } + + synchronized void updatePeers(Topology topology) + { + if (topology.epoch() <= peers.epoch) + return; + + peers = Peers.from(localId, topology, peers); + } + + @VisibleForTesting + synchronized boolean isPeer(Node.Id node) + { + return peers.contains(node); + } + + @Override + public AsyncResult onTopologyUpdate(Topology topology, boolean isLoad, boolean startSync) + { + updatePeers(topology); + return SUCCESS; + } + + @Override public void onRemoteSyncComplete(Node.Id node, long epoch) {} + @Override public void onEpochClosed(Ranges ranges, long epoch) {} + @Override public void onEpochRetired(Ranges ranges, long epoch) {} +} diff --git a/src/java/org/apache/cassandra/service/accord/AccordFetchCoordinator.java b/src/java/org/apache/cassandra/service/accord/AccordFetchCoordinator.java new file mode 100644 index 000000000000..1ceedc461227 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/AccordFetchCoordinator.java @@ -0,0 +1,440 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import java.io.IOException; +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; + +import com.google.common.collect.ImmutableMap; + +import accord.api.Data; +import accord.api.DataStore; +import accord.api.Query; +import accord.api.Read; +import accord.api.Update; +import accord.impl.AbstractFetchCoordinator; +import accord.local.CommandStore; +import accord.local.Node; +import accord.local.SafeCommandStore; +import accord.primitives.PartialDeps; +import accord.primitives.PartialTxn; +import accord.primitives.Participants; +import accord.primitives.Range; +import accord.primitives.Ranges; +import accord.primitives.Routable; +import accord.primitives.Seekable; +import accord.primitives.Seekables; +import accord.primitives.SyncPoint; +import accord.primitives.Timestamp; +import accord.primitives.Txn; +import accord.primitives.TxnId; +import accord.utils.Invariants; +import accord.utils.async.AsyncChain; +import accord.utils.async.AsyncChains; +import org.apache.cassandra.db.TypeSizes; +import org.apache.cassandra.io.ParameterisedVersionedSerializer; +import org.apache.cassandra.io.UnversionedSerializer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.locator.InetAddressAndPort; +import org.apache.cassandra.locator.RangesAtEndpoint; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.service.accord.serializers.CommandSerializers; +import org.apache.cassandra.service.accord.serializers.IVersionedSerializer; +import org.apache.cassandra.service.accord.serializers.KeySerializers; +import org.apache.cassandra.service.accord.serializers.TableMetadatasAndKeys; +import org.apache.cassandra.service.accord.serializers.Version; +import org.apache.cassandra.streaming.PreviewKind; +import org.apache.cassandra.streaming.StreamCoordinator; +import org.apache.cassandra.streaming.StreamManager; +import org.apache.cassandra.streaming.StreamOperation; +import org.apache.cassandra.streaming.StreamPlan; +import org.apache.cassandra.streaming.StreamResultFuture; +import org.apache.cassandra.streaming.StreamSession; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.utils.FBUtilities; +import org.apache.cassandra.utils.TimeUUID; + +import static org.apache.cassandra.utils.CollectionSerializers.deserializeMap; +import static org.apache.cassandra.utils.CollectionSerializers.serializeMap; +import static org.apache.cassandra.utils.CollectionSerializers.serializedMapSize; + +public class AccordFetchCoordinator extends AbstractFetchCoordinator implements StreamManager.StreamListener +{ + private static final Query noopQuery = (txnId, executeAt, keys, data, read, update) -> null; + + public static class StreamData implements Data + { + public static class SessionInfo + { + final TimeUUID planId; + final boolean hasData; + + public SessionInfo(TimeUUID planId, boolean hasData) + { + this.planId = planId; + this.hasData = hasData; + } + + static final UnversionedSerializer serializer = new UnversionedSerializer<>() + { + public void serialize(SessionInfo info, DataOutputPlus out) throws IOException + { + TimeUUID.Serializer.instance.serialize(info.planId, out); + out.writeBoolean(info.hasData); + + } + + public SessionInfo deserialize(DataInputPlus in) throws IOException + { + return new SessionInfo(TimeUUID.Serializer.instance.deserialize(in), in.readBoolean()); + } + + public long serializedSize(SessionInfo info) + { + return TimeUUID.Serializer.instance.serializedSize(info.planId) + TypeSizes.BOOL_SIZE; + } + }; + } + public static final UnversionedSerializer serializer = new UnversionedSerializer<>() + { + @Override + public void serialize(StreamData data, DataOutputPlus out) throws IOException + { + serializeMap(data.streams, out, TokenRange.serializer, SessionInfo.serializer); + } + + @Override + public StreamData deserialize(DataInputPlus in) throws IOException + { + + return new StreamData(ImmutableMap.copyOf(deserializeMap(in, + TokenRange.serializer, + SessionInfo.serializer))); + } + + @Override + public long serializedSize(StreamData data) + { + return serializedMapSize(data.streams, TokenRange.serializer, SessionInfo.serializer); + } + }; + + private final ImmutableMap streams; + + public StreamData(ImmutableMap streams) + { + this.streams = streams; + } + + public static StreamData of(TokenRange range, TimeUUID streamId, boolean hasData) + { + return new StreamData(ImmutableMap.of(range, new SessionInfo(streamId, hasData))); + } + + @Override + public StreamData merge(Data data) + { + StreamData that = (StreamData) data; + if (that.streams.keySet().stream().anyMatch(this.streams::containsKey)) + throw new IllegalStateException(String.format("Unable to merge: key found in multiple StreamData %s %s", + this.streams.keySet(), that.streams.keySet())); + Invariants.require(!that.streams.keySet().stream().anyMatch(this.streams::containsKey)); + ImmutableMap.Builder builder = ImmutableMap.builder(); + builder.putAll(this.streams); + builder.putAll(that.streams); + return new StreamData(builder.build()); + } + } + + // needs to be externally synchronized + private class IncomingStream + { + private final TimeUUID planId; + private Range range; + private Node.Id from; + private StreamResultFuture future; + + public IncomingStream(TimeUUID planId) + { + this.planId = planId; + } + + private void rangeReceived(Range range, Node.Id from) + { + Invariants.nonNull(range); + Invariants.nonNull(from); + Invariants.require(this.range == null, "range was not null: %s", this.range); + Invariants.require(this.from == null, "from was not null: %s", this.from); + this.range = range; + this.from = from; + maybeListen(); + } + + private void futureReceived(StreamResultFuture future) + { + Invariants.nonNull(future); + Invariants.require(this.future == null, "future was not null: %s", this.future); + this.future = future; + maybeListen(); + } + + private void maybeListen() + { + if (range == null || future == null) + return; + + Invariants.nonNull(from); + + future.addCallback((state, fail) -> { + if (fail == null) success(from, Ranges.of(range)); + else fail(from, Ranges.of(range), fail); + }, ((AccordCommandStore) commandStore()).taskExecutor()); + } + } + + public static class StreamingRead implements Read + { + public static final ParameterisedVersionedSerializer serializer = new ParameterisedVersionedSerializer() + { + @Override + public void serialize(StreamingRead read, TableMetadatasAndKeys seekables, DataOutputPlus out, Version version) throws IOException + { + InetAddressAndPort.Serializer.inetAddressAndPortSerializer.serialize(read.to, out, version.messageVersion()); + KeySerializers.ranges.serialize(read.ranges, out); + } + + @Override + public StreamingRead deserialize(TableMetadatasAndKeys seekables, DataInputPlus in, Version version) throws IOException + { + return new StreamingRead(InetAddressAndPort.Serializer.inetAddressAndPortSerializer.deserialize(in, version.messageVersion()), + KeySerializers.ranges.deserialize(in)); + } + + @Override + public long serializedSize(StreamingRead read, TableMetadatasAndKeys seekables, Version version) + { + return InetAddressAndPort.Serializer.inetAddressAndPortSerializer.serializedSize(read.to, version.messageVersion()) + + KeySerializers.ranges.serializedSize(read.ranges); + } + }; + + private final InetAddressAndPort to; + private final Ranges ranges; + + public StreamingRead(InetAddressAndPort to, Ranges ranges) + { + this.to = to; + this.ranges = ranges; + } + + @Override + public Seekables keys() { return ranges; } + + private static boolean hasDataToStream(StreamCoordinator coordinator, InetAddressAndPort to) + { + for (StreamSession session : coordinator.getAllStreamSessions()) + { + if (!session.peer.equals(to)) + continue; + + Invariants.require(session.getNumRequests() == 0, "Requested to send data: %s", session); + if (session.getNumTransfers() > 0) + return true; + } + return false; + } + + @Override + public AsyncChain read(Seekable key, SafeCommandStore commandStore, Timestamp executeAt, DataStore store) + { + try + { + Invariants.requireArgument(key.domain() == Routable.Domain.Range, "Required Range but saw %s: %s", key.domain(), key); + TokenRange range = (TokenRange) key; + + // TODO (required): check epoch + // TODO (required): handle dropped tables + TableId tableId = range.table(); + TableMetadata table = ClusterMetadata.current().schema.getKeyspaces().getTableOrViewNullable(tableId); + Invariants.require(table != null, "Table with id %s not found", tableId); + + // TODO (required): may also be relocation + StreamPlan plan = new StreamPlan(StreamOperation.BOOTSTRAP, 1, false, + null, PreviewKind.NONE).flushBeforeTransfer(true); + + RangesAtEndpoint ranges = RangesAtEndpoint.toDummyList(Collections.singleton(range.toKeyspaceRange())); + plan.transferRanges(to, table.keyspace, ranges, table.name); + StreamResultFuture future = plan.execute(); + return AsyncChains.success(StreamData.of(range, future.planId, hasDataToStream(future.getCoordinator(), to))); + } + catch (Throwable t) + { + return AsyncChains.failure(t); + } + } + + @Override + public Read slice(Ranges ranges) { return new StreamingRead(to, this.ranges.slice(ranges)); } + + @Override + public Read intersecting(Participants participants) { return new StreamingRead(to, this.ranges.slice(ranges)); } + + @Override + public Read merge(Read other) { throw new UnsupportedOperationException(); } + } + + public static class StreamingTxn + { + private static final ParameterisedVersionedSerializer read = (ParameterisedVersionedSerializer)StreamingRead.serializer; + + private static final UnversionedSerializer query = new UnversionedSerializer<>() + { + @Override + public void serialize(Query t, DataOutputPlus out) + { + Invariants.requireArgument(t == noopQuery); + } + + @Override + public Query deserialize(DataInputPlus in) + { + return noopQuery; + } + + @Override + public long serializedSize(Query t) + { + Invariants.requireArgument(t == noopQuery); + return 0; + } + }; + + private static final ParameterisedVersionedSerializer update = new ParameterisedVersionedSerializer<>() + { + @Override + public void serialize(Update t, TableMetadatasAndKeys seekables, DataOutputPlus out, Version version) + { + Invariants.requireArgument(t == null); + } + + @Override + public Update deserialize(TableMetadatasAndKeys seekables, DataInputPlus in, Version version) + { + return null; + } + + @Override + public long serializedSize(Update t, TableMetadatasAndKeys seekables, Version version) + { + Invariants.requireArgument(t == null); + return 0; + } + }; + + // TODO (desired): this could be serialized as an InetAddressAndPort and Ranges if we had a special case PartialTxn implementation + public static final IVersionedSerializer serializer = new CommandSerializers.PartialTxnSerializer(read, query, update, TableMetadatasAndKeys.serializer); + } + + private final Map streams = new HashMap<>(); + + public AccordFetchCoordinator(Node node, Ranges ranges, SyncPoint syncPoint, DataStore.FetchRanges fetchRanges, CommandStore commandStore) + { + super(node, ranges, syncPoint, fetchRanges, commandStore); + } + + @Override + public void start() + { + StreamManager.instance.addListener(this); + super.start(); + } + + private IncomingStream stream(TimeUUID id) + { + return streams.computeIfAbsent(id, IncomingStream::new); + } + + // called from stream thread + @Override + public synchronized void onRegister(StreamResultFuture result) + { + stream(result.planId).futureReceived(result); + } + + protected void onDone(Ranges success, Throwable failure) + { + StreamManager.instance.removeListener(this); + super.onDone(success, failure); + } + + @Override + protected PartialTxn rangeReadTxn(Ranges ranges) + { + StreamingRead read = new StreamingRead(FBUtilities.getBroadcastAddressAndPort(), ranges); + return new PartialTxn.InMemory(Txn.Kind.Read, ranges, read, noopQuery, null, TableMetadatasAndKeys.none(Routable.Domain.Range)); + } + + @Override + protected synchronized void onReadOk(Node.Id from, CommandStore commandStore, Data data, Ranges received) + { + if (data == null) + return; + + StreamData streamData = (StreamData) data; + streamData.streams.forEach((range, streamInfo) -> { + if (streamInfo.hasData) + { + stream(streamInfo.planId).rangeReceived(range, from); + } + else + { + // if there was no data to stream, no connection is initiated, and we aren't notified via the stream + // listener, so the stream initiator notifies us and we mark it complete here + success(from, Ranges.of(range)); + } + }); + } + + public static class AccordFetchRequest extends FetchRequest + { + public AccordFetchRequest(long sourceEpoch, TxnId syncId, Ranges ranges, PartialDeps partialDeps, PartialTxn partialTxn) + { + super(sourceEpoch, syncId, ranges, partialDeps, partialTxn); + } + + @Override + protected AsyncChain beginRead(SafeCommandStore safeStore, Timestamp executeAt, PartialTxn txn, Participants execute) + { + AsyncChain result = super.beginRead(safeStore, executeAt, txn, execute); + // TODO (required): verify that streaming snapshots have all been created by now, so we won't stream any data that arrives after this + readStarted(safeStore); + return result; + } + } + + @Override + protected FetchRequest newFetchRequest(long sourceEpoch, TxnId syncId, Ranges ranges, PartialDeps partialDeps, PartialTxn partialTxn) + { + return new AccordFetchRequest(sourceEpoch, syncId, ranges, partialDeps, partialTxn); + } +} diff --git a/src/java/org/apache/cassandra/service/accord/AccordJournal.java b/src/java/org/apache/cassandra/service/accord/AccordJournal.java new file mode 100644 index 000000000000..09477015f3c3 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/AccordJournal.java @@ -0,0 +1,851 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.service.accord; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Collection; +import java.util.Collections; +import java.util.Iterator; +import java.util.NavigableMap; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; +import javax.annotation.Nullable; + +import com.google.common.annotations.VisibleForTesting; + +import accord.impl.CommandChange; +import accord.impl.CommandChange.Field; +import accord.local.Cleanup; +import accord.local.Command; +import accord.local.CommandStore; +import accord.local.CommandStores; +import accord.local.CommandStores.RangesForEpoch; +import accord.local.DurableBefore; +import accord.local.Node; +import accord.local.RedundantBefore; +import accord.primitives.EpochSupplier; +import accord.primitives.Ranges; +import accord.primitives.SaveStatus; +import accord.primitives.Status.Durability; +import accord.primitives.Timestamp; +import accord.primitives.TxnId; +import accord.utils.Invariants; +import accord.utils.PersistentField; +import accord.utils.UnhandledEnum; +import accord.utils.async.AsyncChains; +import accord.utils.async.AsyncResult; +import accord.utils.async.AsyncResults; +import org.apache.cassandra.concurrent.Shutdownable; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.db.ColumnFamilyStore; +import org.apache.cassandra.db.Keyspace; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputBuffer; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.io.util.File; +import org.apache.cassandra.journal.Compactor; +import org.apache.cassandra.journal.Journal; +import org.apache.cassandra.journal.Params; +import org.apache.cassandra.journal.RecordPointer; +import org.apache.cassandra.journal.SegmentCompactor; +import org.apache.cassandra.journal.StaticSegment; +import org.apache.cassandra.journal.ValueSerializer; +import org.apache.cassandra.service.accord.AccordJournalValueSerializers.FlyweightImage; +import org.apache.cassandra.service.accord.AccordJournalValueSerializers.IdentityAccumulator; +import org.apache.cassandra.service.accord.JournalKey.JournalKeySupport; +import org.apache.cassandra.service.accord.journal.AccordTopologyUpdate; +import org.apache.cassandra.service.accord.serializers.CommandSerializers; +import org.apache.cassandra.service.accord.serializers.CommandSerializers.ExecuteAtSerializer; +import org.apache.cassandra.service.accord.serializers.DepsSerializers; +import org.apache.cassandra.service.accord.serializers.ResultSerializers; +import org.apache.cassandra.service.accord.serializers.Version; +import org.apache.cassandra.service.accord.serializers.WaitingOnSerializer; +import org.apache.cassandra.utils.CloseableIterator; +import org.apache.cassandra.utils.ExecutorUtils; + +import static accord.impl.CommandChange.Field.CLEANUP; +import static accord.impl.CommandChange.anyFieldChanged; +import static accord.impl.CommandChange.describeFlags; +import static accord.impl.CommandChange.getFlags; +import static accord.impl.CommandChange.isChanged; +import static accord.impl.CommandChange.isNull; +import static accord.impl.CommandChange.nextSetField; +import static accord.impl.CommandChange.toIterableNonNullFields; +import static accord.impl.CommandChange.toIterableSetFields; +import static accord.impl.CommandChange.unsetIterable; +import static accord.impl.CommandChange.validateFlags; +import static accord.local.Cleanup.Input.FULL; +import static org.apache.cassandra.service.accord.AccordJournalValueSerializers.DurableBeforeAccumulator; + +public class AccordJournal implements accord.api.Journal, RangeSearcher.Supplier, Shutdownable +{ + static final ThreadLocal keyCRCBytes = ThreadLocal.withInitial(() -> new byte[JournalKeySupport.TOTAL_SIZE]); + + @VisibleForTesting + protected final Journal journal; + @VisibleForTesting + protected final AccordJournalTable journalTable; + private final Params params; + Node node; + + enum Status { INITIALIZED, STARTING, REPLAY, STARTED, TERMINATING, TERMINATED } + private volatile Status status = Status.INITIALIZED; + + public AccordJournal(Params params) + { + this(params, new File(DatabaseDescriptor.getAccordJournalDirectory()), Keyspace.open(AccordKeyspace.metadata().name).getColumnFamilyStore(AccordKeyspace.JOURNAL)); + } + + @VisibleForTesting + public AccordJournal(Params params, File directory, ColumnFamilyStore cfs) + { + Version userVersion = Version.fromVersion(params.userVersion()); + this.journal = new Journal<>("AccordJournal", directory, params, JournalKey.SUPPORT, + // In Accord, we are using streaming serialization, i.e. Reader/Writer interfaces instead of materializing objects + new ValueSerializer<>() + { + @Override + public void serialize(JournalKey key, Object value, DataOutputPlus out, int userVersion) + { + throw new UnsupportedOperationException(); + } + + @Override + public Object deserialize(JournalKey key, DataInputPlus in, int userVersion) + { + throw new UnsupportedOperationException(); + } + }, + compactor(cfs, userVersion)); + this.journalTable = new AccordJournalTable<>(journal, JournalKey.SUPPORT, cfs, userVersion); + this.params = params; + } + + protected SegmentCompactor compactor(ColumnFamilyStore cfs, Version userVersion) + { + return new AccordSegmentCompactor<>(userVersion, cfs) { + @Nullable + @Override + public Collection> compact(Collection> staticSegments) + { + if (journalTable == null) + throw new IllegalStateException("Unsafe access to AccordJournal during ; journalTable was touched before it was published"); + Collection> result = super.compact(staticSegments); + journalTable.safeNotify(index -> index.remove(staticSegments)); + return result; + } + }; + } + + @VisibleForTesting + public int inMemorySize() + { + return journal.currentActiveSegment().index().size(); + } + + public void start(Node node) + { + Invariants.require(status == Status.INITIALIZED); + this.node = node; + status = Status.STARTING; + journal.start(); + } + + public boolean started() + { + return status == Status.STARTED; + } + + public Params configuration() + { + return params; + } + + public Compactor compactor() + { + return journal.compactor(); + } + + @Override + public boolean isTerminated() + { + return status == Status.TERMINATED; + } + + @Override + public void shutdown() + { + Invariants.require(status == Status.REPLAY || status == Status.STARTED, "%s", status); + status = Status.TERMINATING; + journal.shutdown(); + status = Status.TERMINATED; + } + + @Override + public Object shutdownNow() + { + shutdown(); + return null; + } + + @Override + public boolean awaitTermination(long timeout, TimeUnit units) throws InterruptedException + { + try + { + ExecutorUtils.awaitTermination(timeout, units, Collections.singletonList(journal)); + return true; + } + catch (TimeoutException e) + { + return false; + } + } + + @Override + public Command loadCommand(int commandStoreId, TxnId txnId, RedundantBefore redundantBefore, DurableBefore durableBefore) + { + Builder builder = load(commandStoreId, txnId); + builder.maybeCleanup(true, FULL, redundantBefore, durableBefore); + return builder.construct(redundantBefore); + } + + @Override + public Command.Minimal loadMinimal(int commandStoreId, TxnId txnId, Load load, RedundantBefore redundantBefore, DurableBefore durableBefore) + { + Builder builder = loadDiffs(commandStoreId, txnId, load); + if (builder.isEmpty()) + return null; + + Cleanup cleanup = builder.shouldCleanup(FULL, redundantBefore, durableBefore); + switch (cleanup) + { + case VESTIGIAL: + case EXPUNGE: + case ERASE: + return null; + } + Invariants.require(builder.saveStatus() != null, "No saveSatus loaded, but next was called and cleanup was not: %s", builder); + return builder.asMinimal(); + } + + @Override + public RedundantBefore loadRedundantBefore(int commandStoreId) + { + IdentityAccumulator accumulator = readAll(new JournalKey(TxnId.NONE, JournalKey.Type.REDUNDANT_BEFORE, commandStoreId)); + return accumulator.get(); + } + + @Override + public NavigableMap loadBootstrapBeganAt(int commandStoreId) + { + IdentityAccumulator> accumulator = readAll(new JournalKey(TxnId.NONE, JournalKey.Type.BOOTSTRAP_BEGAN_AT, commandStoreId)); + return accumulator.get(); + } + + @Override + public NavigableMap loadSafeToRead(int commandStoreId) + { + IdentityAccumulator> accumulator = readAll(new JournalKey(TxnId.NONE, JournalKey.Type.SAFE_TO_READ, commandStoreId)); + return accumulator.get(); + } + + @Override + public CommandStores.RangesForEpoch loadRangesForEpoch(int commandStoreId) + { + IdentityAccumulator accumulator = readAll(new JournalKey(TxnId.NONE, JournalKey.Type.RANGES_FOR_EPOCH, commandStoreId)); + return accumulator.get(); + } + + @Override + public void saveCommand(int commandStoreId, CommandUpdate update, @Nullable Runnable onFlush) + { + Writer diff = Writer.make(update.before, update.after); + if (diff == null) + { + if (onFlush != null) + onFlush.run(); + return; + } + + JournalKey key = new JournalKey(update.txnId, JournalKey.Type.COMMAND_DIFF, commandStoreId); + RecordPointer pointer = journal.asyncWrite(key, diff); + if (journalTable.shouldIndex(key) + && diff.hasParticipants() + && diff.after.route() != null) + journal.onDurable(pointer, () -> + journalTable.safeNotify(index -> + index.update(pointer.segment, key.commandStoreId, key.id, diff.after.route()))); + if (onFlush != null) + journal.onDurable(pointer, onFlush); + } + + @Override + public Iterator replayTopologies() + { + AccordTopologyUpdate.Accumulator accumulator = readAll(TopologyUpdateKey); + return accumulator.images(); + } + + private static final JournalKey TopologyUpdateKey = new JournalKey(TxnId.NONE, JournalKey.Type.TOPOLOGY_UPDATE, 0); + @Override + public void saveTopology(TopologyUpdate topologyUpdate, Runnable onFlush) + { + RecordPointer pointer = appendInternal(TopologyUpdateKey, AccordTopologyUpdate.newTopology(topologyUpdate)); + if (onFlush != null) + journal.onDurable(pointer, onFlush); + } + + @Override + public PersistentField.Persister durableBeforePersister() + { + return new PersistentField.Persister<>() + { + @Override + public AsyncResult persist(DurableBefore addDurableBefore, DurableBefore newDurableBefore) + { + AsyncResult.Settable result = AsyncResults.settable(); + JournalKey key = new JournalKey(TxnId.NONE, JournalKey.Type.DURABLE_BEFORE, 0); + RecordPointer pointer = appendInternal(key, addDurableBefore); + // TODO (required): what happens on failure? + journal.onDurable(pointer, () -> result.setSuccess(null)); + return result; + } + + @Override + public DurableBefore load() + { + DurableBeforeAccumulator accumulator = readAll(new JournalKey(TxnId.NONE, JournalKey.Type.DURABLE_BEFORE, 0)); + return accumulator.get(); + } + }; + } + + @Override + public void saveStoreState(int commandStoreId, FieldUpdates fieldUpdates, Runnable onFlush) + { + RecordPointer pointer = null; + // TODO: avoid allocating keys + if (fieldUpdates.newRedundantBefore != null) + pointer = appendInternal(new JournalKey(TxnId.NONE, JournalKey.Type.REDUNDANT_BEFORE, commandStoreId), fieldUpdates.newRedundantBefore); + if (fieldUpdates.newBootstrapBeganAt != null) + pointer = appendInternal(new JournalKey(TxnId.NONE, JournalKey.Type.BOOTSTRAP_BEGAN_AT, commandStoreId), fieldUpdates.newBootstrapBeganAt); + if (fieldUpdates.newSafeToRead != null) + pointer = appendInternal(new JournalKey(TxnId.NONE, JournalKey.Type.SAFE_TO_READ, commandStoreId), fieldUpdates.newSafeToRead); + if (fieldUpdates.newRangesForEpoch != null) + pointer = appendInternal(new JournalKey(TxnId.NONE, JournalKey.Type.RANGES_FOR_EPOCH, commandStoreId), fieldUpdates.newRangesForEpoch); + + if (onFlush == null) + return; + + if (pointer != null) + journal.onDurable(pointer, onFlush); + else + onFlush.run(); + } + + private Builder loadDiffs(int commandStoreId, TxnId txnId, Load load) + { + JournalKey key = new JournalKey(txnId, JournalKey.Type.COMMAND_DIFF, commandStoreId); + Builder builder = new Builder(txnId, load); + journalTable.readAll(key, builder::deserializeNext); + return builder; + } + + @VisibleForTesting + public Builder load(int commandStoreId, TxnId txnId) + { + return loadDiffs(commandStoreId, txnId, Load.ALL); + } + + private BUILDER readAll(JournalKey key) + { + BUILDER builder = (BUILDER) key.type.serializer.mergerFor(); + // TODO (expected): this can be further improved to avoid allocating lambdas + AccordJournalValueSerializers.FlyweightSerializer serializer = (AccordJournalValueSerializers.FlyweightSerializer) key.type.serializer; + // TODO (expected): for those where we store an image, read only the first entry we find in DESC order + journalTable.readAll(key, (in, userVersion) -> serializer.deserialize(key, builder, in, userVersion)); + return builder; + } + + private RecordPointer appendInternal(JournalKey key, T write) + { + AccordJournalValueSerializers.FlyweightSerializer serializer = (AccordJournalValueSerializers.FlyweightSerializer) key.type.serializer; + return journal.asyncWrite(key, (out, userVersion) -> serializer.serialize(key, write, out, Version.fromVersion(userVersion))); + } + + @VisibleForTesting + public void closeCurrentSegmentForTestingIfNonEmpty() + { + journal.closeCurrentSegmentForTestingIfNonEmpty(); + } + + public void sanityCheck(int commandStoreId, RedundantBefore redundantBefore, Command orig) + { + Builder builder = load(commandStoreId, orig.txnId()); + builder.forceResult(orig.result()); + // We can only use strict equality if we supply result. + Command reconstructed = builder.construct(redundantBefore); + Invariants.require(orig.equals(reconstructed), + '\n' + + "Original: %s\n" + + "Reconstructed: %s\n" + + "Diffs: %s", orig, reconstructed, builder); + } + + @VisibleForTesting + public void truncateForTesting() + { + journal.truncateForTesting(); + journalTable.safeNotify(RouteInMemoryIndex::truncateForTesting); + } + + @VisibleForTesting + public void runCompactorForTesting() + { + journal.runCompactorForTesting(); + } + + @Override + public void purge(CommandStores commandStores, EpochSupplier minEpoch) + { + journal.closeCurrentSegmentForTestingIfNonEmpty(); + journal.runCompactorForTesting(); + journalTable.forceCompaction(); + } + + @SuppressWarnings("unchecked") @Override + public void replay(CommandStores commandStores) + { + try (CloseableIterator> iter = journalTable.keyIterator()) + { + while (iter.hasNext()) + { + Journal.KeyRefs ref = iter.next(); + + if (ref.key().type != JournalKey.Type.COMMAND_DIFF) + continue; + + CommandStore commandStore = commandStores.forId(ref.key().commandStoreId); + Loader loader = commandStore.loader(); + AsyncChains.getUnchecked(loader.load(ref.key().id) + .map(command -> { + if (journalTable.shouldIndex(ref.key()) + && command.participants() != null + && command.participants().route() != null) + { + ref.segments(segment -> { + journalTable.safeNotify(index -> index.update(segment, ref.key().commandStoreId, ref.key().id, command.participants().route())); + }); + } + return command; + }) + .beginAsResult()); + } + } + catch (Throwable t) + { + throw new RuntimeException("Can not replay journal.", t); + } + } + + public static @Nullable ByteBuffer asSerializedChange(Command before, Command after, Version userVersion) throws IOException + { + try (DataOutputBuffer out = new DataOutputBuffer()) + { + Writer writer = Writer.make(before, after); + if (writer == null) + return null; + + writer.write(out, userVersion); + return out.asNewBuffer(); + } + } + + @VisibleForTesting + public void unsafeSetStarted() + { + status = Status.STARTED; + } + + @Override + public RangeSearcher rangeSearcher() + { + return journalTable.rangeSearcher(); + } + + public static class Writer implements Journal.Writer + { + private final Command after; + private final int flags; + + private Writer(Command after, int flags) + { + this.after = after; + this.flags = flags; + } + + public static Writer make(Command before, Command after) + { + if (before == after + || after == null + || after.saveStatus() == SaveStatus.Uninitialised) + return null; + + int flags = validateFlags(getFlags(before, after)); + if (!anyFieldChanged(flags)) + return null; + + return new Writer(after, flags); + } + + @Override + public void write(DataOutputPlus out, int userVersion) throws IOException + { + write(out, Version.fromVersion(userVersion)); + } + + public void write(DataOutputPlus out, Version userVersion) throws IOException + { + serialize(after, flags, out, userVersion); + } + + private static void serialize(Command command, int flags, DataOutputPlus out, Version userVersion) throws IOException + { + Invariants.require(flags != 0); + out.writeInt(flags); + + int iterable = toIterableSetFields(flags); + while (iterable != 0) + { + Field field = nextSetField(iterable); + if (isNull(field, flags)) + { + iterable = unsetIterable(field, iterable); + continue; + } + + switch (field) + { + case EXECUTE_AT: + ExecuteAtSerializer.serialize(command.txnId(), command.executeAt(), out); + break; + case EXECUTES_AT_LEAST: + ExecuteAtSerializer.serialize(command.executesAtLeast(), out); + break; + case MIN_UNIQUE_HLC: + out.writeUnsignedVInt(command.waitingOn().minUniqueHlc()); + break; + case SAVE_STATUS: + out.writeByte(command.saveStatus().ordinal()); + break; + case DURABILITY: + out.writeByte(command.durability().ordinal()); + break; + case ACCEPTED: + CommandSerializers.ballot.serialize(command.acceptedOrCommitted(), out); + break; + case PROMISED: + CommandSerializers.ballot.serialize(command.promised(), out); + break; + case PARTICIPANTS: + CommandSerializers.participants.serialize(command.participants(), out); + break; + case PARTIAL_TXN: + CommandSerializers.partialTxn.serialize(command.partialTxn(), out, userVersion); + break; + case PARTIAL_DEPS: + DepsSerializers.partialDeps.serialize(command.partialDeps(), out); + break; + case WAITING_ON: + Command.WaitingOn waitingOn = command.waitingOn(); + WaitingOnSerializer.serializeBitSetsOnly(command.txnId(), waitingOn, out); + break; + case WRITES: + CommandSerializers.writes.serialize(command.writes(), out, userVersion); + break; + case RESULT: + ResultSerializers.result.serialize(command.result(), out); + break; + case CLEANUP: + throw new IllegalStateException(); + } + + iterable = unsetIterable(field, iterable); + } + } + + private boolean hasField(Field fields) + { + return !isNull(fields, flags); + } + + public boolean hasParticipants() + { + return hasField(Field.PARTICIPANTS); + } + + @Override + public String toString() + { + return after.saveStatus() + " " + describeFlags(flags); + } + } + + public static class Builder extends CommandChange.Builder implements FlyweightImage + { + public Builder() + { + this(Load.ALL); + } + + public Builder(Load load) + { + super(null, load); + } + + public Builder(TxnId txnId) + { + super(txnId, Load.ALL); + } + + public Builder(TxnId txnId, Load load) + { + super(txnId, load); + } + + public void reset(JournalKey key) + { + reset(key.id); + } + + public ByteBuffer asByteBuffer(Version userVersion) throws IOException + { + try (DataOutputBuffer out = new DataOutputBuffer()) + { + serialize(out, userVersion); + return out.asNewBuffer(); + } + } + + public void serialize(DataOutputPlus out, Version userVersion) throws IOException + { + Invariants.require(mask == 0); + Invariants.require(flags != 0); + + int flags = validateFlags(this.flags); + serialize(flags, out, userVersion); + } + + private void serialize(int flags, DataOutputPlus out, Version userVersion) throws IOException + { + Invariants.require(flags != 0); + out.writeInt(flags); + + int iterable = toIterableNonNullFields(flags); + for (Field field = nextSetField(iterable) ; field != null; iterable = unsetIterable(field, iterable), field = nextSetField(iterable)) + { + switch (field) + { + default: throw new UnhandledEnum(field); + case CLEANUP: + out.writeByte(cleanup.ordinal()); + break; + case EXECUTE_AT: + Invariants.require(txnId != null); + Invariants.require(executeAt != null); + ExecuteAtSerializer.serialize(txnId, executeAt, out); + break; + case EXECUTES_AT_LEAST: + Invariants.require(executesAtLeast != null); + ExecuteAtSerializer.serialize(executesAtLeast, out); + break; + case MIN_UNIQUE_HLC: + Invariants.require(minUniqueHlc != 0); + out.writeUnsignedVInt(minUniqueHlc); + break; + case SAVE_STATUS: + Invariants.require(saveStatus != null); + out.writeByte(saveStatus.ordinal()); + break; + case DURABILITY: + Invariants.require(durability != null); + out.writeByte(durability.ordinal()); + break; + case ACCEPTED: + Invariants.require(acceptedOrCommitted != null); + CommandSerializers.ballot.serialize(acceptedOrCommitted, out); + break; + case PROMISED: + Invariants.require(promised != null); + CommandSerializers.ballot.serialize(promised, out); + break; + case PARTICIPANTS: + Invariants.require(participants != null); + CommandSerializers.participants.serialize(participants, out); + break; + case PARTIAL_TXN: + Invariants.require(partialTxn != null); + CommandSerializers.partialTxn.serialize(partialTxn, out, userVersion); + break; + case PARTIAL_DEPS: + Invariants.require(partialDeps != null); + DepsSerializers.partialDeps.serialize(partialDeps, out); + break; + case WAITING_ON: + Invariants.require(waitingOn != null); + ((WaitingOnSerializer.Provider)waitingOn).reserialize(out); + break; + case WRITES: + Invariants.require(writes != null); + CommandSerializers.writes.serialize(writes, out, userVersion); + break; + case RESULT: + Invariants.require(result != null); + ResultSerializers.result.serialize(result, out); + break; + } + } + } + + public void deserializeNext(DataInputPlus in, Version userVersion) throws IOException + { + Invariants.require(txnId != null); + int readFlags = in.readInt(); + Invariants.require(readFlags != 0); + hasUpdate = true; + count++; + + // batch-apply any new nulls + setNulls(false, readFlags); + // iterator sets low 16 bits; low readFlag bits are nulls, so masking with ~readFlags restricts to non-null changed fields + int iterable = toIterableSetFields(readFlags) & ~readFlags; + for (Field field = nextSetField(iterable) ; field != null; field = nextSetField(iterable = unsetIterable(field, iterable))) + { + // Since we are iterating in reverse order, we skip the fields that were + // set by entries writter later (i.e. already read ones). + if (isChanged(field, flags) && field != CLEANUP) + skip(txnId, field, in, userVersion); + else + deserialize(field, in, userVersion); + } + + // upper 16 bits are changed flags, lower are nulls; by masking upper by ~lower we restrict to only non-null changed fields + this.flags |= readFlags & (~readFlags << 16); + } + + private void deserialize(Field field, DataInputPlus in, Version userVersion) throws IOException + { + switch (field) + { + case EXECUTE_AT: + executeAt = ExecuteAtSerializer.deserialize(txnId, in); + break; + case EXECUTES_AT_LEAST: + executesAtLeast = ExecuteAtSerializer.deserialize(in); + break; + case MIN_UNIQUE_HLC: + minUniqueHlc = in.readUnsignedVInt(); + break; + case SAVE_STATUS: + saveStatus = SaveStatus.values()[in.readByte()]; + break; + case DURABILITY: + durability = Durability.values()[in.readByte()]; + break; + case ACCEPTED: + acceptedOrCommitted = CommandSerializers.ballot.deserialize(in); + break; + case PROMISED: + promised = CommandSerializers.ballot.deserialize(in); + break; + case PARTICIPANTS: + participants = CommandSerializers.participants.deserialize(in); + break; + case PARTIAL_TXN: + partialTxn = CommandSerializers.partialTxn.deserialize(in, userVersion); + break; + case PARTIAL_DEPS: + partialDeps = DepsSerializers.partialDeps.deserialize(in); + break; + case WAITING_ON: + waitingOn = WaitingOnSerializer.deserializeProvider(txnId, in); + break; + case WRITES: + writes = CommandSerializers.writes.deserialize(in, userVersion); + break; + case CLEANUP: + Cleanup newCleanup = Cleanup.forOrdinal(in.readByte()); + if (cleanup == null || newCleanup.compareTo(cleanup) > 0) + cleanup = newCleanup; + break; + case RESULT: + result = ResultSerializers.result.deserialize(in); + break; + } + } + + private static void skip(TxnId txnId, Field field, DataInputPlus in, Version userVersion) throws IOException + { + switch (field) + { + default: throw new UnhandledEnum(field); + case EXECUTE_AT: + ExecuteAtSerializer.skip(txnId, in); + break; + case EXECUTES_AT_LEAST: + ExecuteAtSerializer.skip(in); + break; + case MIN_UNIQUE_HLC: + in.readUnsignedVInt(); + break; + case SAVE_STATUS: + case DURABILITY: + case CLEANUP: + in.readByte(); + break; + case ACCEPTED: + case PROMISED: + CommandSerializers.ballot.skip(in); + break; + case PARTICIPANTS: + CommandSerializers.participants.deserialize(in); + break; + case PARTIAL_TXN: + CommandSerializers.partialTxn.deserialize(in, userVersion); + break; + case PARTIAL_DEPS: + // TODO (expected): skip + DepsSerializers.partialDeps.deserialize(in); + break; + case WAITING_ON: + WaitingOnSerializer.skip(txnId, in); + break; + case WRITES: + // TODO (expected): skip + CommandSerializers.writes.deserialize(in, userVersion); + break; + case RESULT: + // TODO (expected): skip + ResultSerializers.result.deserialize(in); + break; + } + } + } +} diff --git a/src/java/org/apache/cassandra/service/accord/AccordJournalTable.java b/src/java/org/apache/cassandra/service/accord/AccordJournalTable.java new file mode 100644 index 000000000000..ed1594b30e8f --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/AccordJournalTable.java @@ -0,0 +1,502 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.service.accord; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.function.Consumer; +import javax.annotation.CheckForNull; +import javax.annotation.Nullable; + +import com.google.common.collect.AbstractIterator; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import accord.primitives.Timestamp; +import accord.primitives.TxnId; +import accord.utils.Invariants; +import org.agrona.collections.LongHashSet; +import org.apache.cassandra.cql3.ColumnIdentifier; +import org.apache.cassandra.cql3.Operator; +import org.apache.cassandra.db.ColumnFamilyStore; +import org.apache.cassandra.db.ColumnFamilyStore.RefViewFragment; +import org.apache.cassandra.db.DataRange; +import org.apache.cassandra.db.DecoratedKey; +import org.apache.cassandra.db.EmptyIterators; +import org.apache.cassandra.db.PartitionRangeReadCommand; +import org.apache.cassandra.db.ReadExecutionController; +import org.apache.cassandra.db.Slices; +import org.apache.cassandra.db.StorageHook; +import org.apache.cassandra.db.filter.ColumnFilter; +import org.apache.cassandra.db.filter.DataLimits; +import org.apache.cassandra.db.filter.RowFilter; +import org.apache.cassandra.db.lifecycle.SSTableSet; +import org.apache.cassandra.db.lifecycle.View; +import org.apache.cassandra.db.marshal.AbstractType; +import org.apache.cassandra.db.marshal.BytesType; +import org.apache.cassandra.db.marshal.Int32Type; +import org.apache.cassandra.db.marshal.LongType; +import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator; +import org.apache.cassandra.db.partitions.UnfilteredPartitionIterators; +import org.apache.cassandra.db.rows.Row; +import org.apache.cassandra.db.rows.Unfiltered; +import org.apache.cassandra.db.rows.UnfilteredRowIterator; +import org.apache.cassandra.db.rows.UnfilteredRowIterators; +import org.apache.cassandra.index.Index; +import org.apache.cassandra.index.accord.OrderedRouteSerializer; +import org.apache.cassandra.index.accord.RouteJournalIndex; +import org.apache.cassandra.io.sstable.ISSTableScanner; +import org.apache.cassandra.io.sstable.format.SSTableReader; +import org.apache.cassandra.io.util.DataInputBuffer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.journal.EntrySerializer.EntryHolder; +import org.apache.cassandra.journal.Journal; +import org.apache.cassandra.journal.KeySupport; +import org.apache.cassandra.journal.RecordConsumer; +import org.apache.cassandra.schema.ColumnMetadata; +import org.apache.cassandra.service.accord.api.TokenKey; +import org.apache.cassandra.service.accord.serializers.Version; +import org.apache.cassandra.utils.CloseableIterator; +import org.apache.cassandra.utils.FBUtilities; +import org.apache.cassandra.utils.JVMStabilityInspector; +import org.apache.cassandra.utils.MergeIterator; + +import static org.apache.cassandra.io.sstable.SSTableReadsListener.NOOP_LISTENER; + +public class AccordJournalTable implements RangeSearcher.Supplier +{ + private static final Logger logger = LoggerFactory.getLogger(AccordJournalTable.class); + + private final Journal journal; + private final ColumnFamilyStore cfs; + + private final ColumnMetadata recordColumn; + private final ColumnMetadata versionColumn; + + private final KeySupport keySupport; + /** + * Access to this field should only ever be handled by {@link #safeNotify(Consumer)}. There is an assumption that + * an error in the index should not cause the journal to crash, so {@link #safeNotify(Consumer)} exists to make sure + * this property holds true. + */ + @Nullable + private final RouteInMemoryIndex index; + private final Version accordJournalVersion; + + public AccordJournalTable(Journal journal, KeySupport keySupport, ColumnFamilyStore cfs, Version accordJournalVersion) + { + this.journal = journal; + this.cfs = cfs; + this.recordColumn = cfs.metadata().getColumn(ColumnIdentifier.getInterned("record", false)); + this.versionColumn = cfs.metadata().getColumn(ColumnIdentifier.getInterned("user_version", false)); + this.keySupport = keySupport; + this.accordJournalVersion = accordJournalVersion; + + this.index = cfs.indexManager.getIndexByName(AccordKeyspace.JOURNAL_INDEX_NAME) != null + ? new RouteInMemoryIndex<>() + : null; + } + + boolean shouldIndex(JournalKey key) + { + if (index == null) return false; + return RouteJournalIndex.allowed(key); + } + + void safeNotify(Consumer> fn) + { + if (index == null) + return; + try + { + fn.accept(index); + } + catch (Throwable t) + { + JVMStabilityInspector.inspectThrowable(t); + logger.warn("Failure updating index", t); + } + } + + public void forceCompaction() + { + cfs.forceMajorCompaction(); + } + + @Override + public RangeSearcher rangeSearcher() + { + if (index == null) + return RangeSearcher.NoopRangeSearcher.instance; + return new TableRangeSearcher(); + } + + public interface Reader + { + void read(DataInputPlus input, Version userVersion) throws IOException; + } + + private class RecordConsumerAdapter implements RecordConsumer + { + protected final Reader reader; + + RecordConsumerAdapter(Reader reader) + { + this.reader = reader; + } + + private long prevSegment = -1; + private long prevPosition = -1; + + @Override + public void accept(long segment, int position, K key, ByteBuffer buffer, int userVersion) + { + Invariants.require(prevSegment == -1 || segment <= prevSegment, + "Records should always be iterated over in a reverse order, but %s was seen after %s", segment, prevSegment); + if (prevSegment != segment) + prevPosition = -1; + Invariants.require(prevPosition == -1 || position < prevPosition, + "Records should always be iterated over in a reverse order, but %s was seen after %s", position, prevPosition); + readBuffer(buffer, reader, Version.fromVersion(userVersion)); + prevSegment = segment; + prevPosition = position; + } + } + + private class TableRecordConsumer implements RecordConsumer + { + protected LongHashSet visited = null; + protected RecordConsumer delegate; + + TableRecordConsumer(RecordConsumer delegate) + { + this.delegate = delegate; + } + + void visit(long segment) + { + if (visited == null) + visited = new LongHashSet(); + visited.add(segment); + } + + boolean visited(long segment) + { + return visited != null && visited.contains(segment); + } + + @Override + public void accept(long segment, int position, K key, ByteBuffer buffer, int userVersion) + { + visit(segment); + delegate.accept(segment, position, key, buffer, userVersion); + } + } + + private class JournalAndTableRecordConsumer implements RecordConsumer + { + private final K key; + private final TableRecordConsumer tableRecordConsumer; + private final RecordConsumer delegate; + + JournalAndTableRecordConsumer(K key, RecordConsumer reader) + { + this.key = key; + this.tableRecordConsumer = new TableRecordConsumer(reader); + this.delegate = reader; + } + + void readTable() + { + readAllFromTable(key, tableRecordConsumer); + } + + @Override + public void accept(long segment, int position, K key, ByteBuffer buffer, int userVersion) + { + if (!tableRecordConsumer.visited(segment)) //TODO (required): don't need this anymore + delegate.accept(segment, position, key, buffer, userVersion); + } + } + + /** + * When using {@link PartitionRangeReadCommand} we need to work with {@link RowFilter} which works with columns. + * But the index doesn't care about table based queries and needs to be queried using the fields in the index, to + * support that this enum exists. This enum represents the fields present in the index and can be used to apply + * filters to the index. + */ + public enum SyntheticColumn + { + participants("participants", BytesType.instance), + store_id("store_id", Int32Type.instance), + txn_id("txn_id", AccordKeyspace.TIMESTAMP_TYPE); + + public final ColumnMetadata metadata; + + SyntheticColumn(String name, AbstractType type) + { + this.metadata = new ColumnMetadata("journal", "routes", new ColumnIdentifier(name, false), type, ColumnMetadata.NO_UNIQUE_ID, ColumnMetadata.NO_POSITION, ColumnMetadata.Kind.REGULAR, null); + } + } + + private class TableRangeSearcher implements RangeSearcher + { + private final Index tableIndex; + + private TableRangeSearcher() + { + this.tableIndex = cfs.indexManager.getIndexByName("record"); + if (!cfs.indexManager.isIndexQueryable(tableIndex)) + throw new AssertionError("Journal record index is not queryable"); + } + + @Override + public Result search(int commandStoreId, TokenRange range, TxnId minTxnId, Timestamp maxTxnId) + { + CloseableIterator inMemory = index.search(commandStoreId, range, minTxnId, maxTxnId).results(); + CloseableIterator table = tableSearch(commandStoreId, range.start(), range.end()); + return new DefaultResult(minTxnId, maxTxnId, MergeIterator.get(Arrays.asList(inMemory, table))); + } + + @Override + public Result search(int commandStoreId, TokenKey key, TxnId minTxnId, Timestamp maxTxnId) + { + CloseableIterator inMemory = index.search(commandStoreId, key, minTxnId, maxTxnId).results(); + CloseableIterator table = tableSearch(commandStoreId, key); + return new DefaultResult(minTxnId, maxTxnId, MergeIterator.get(Arrays.asList(inMemory, table))); + } + + private CloseableIterator tableSearch(int store, TokenKey start, TokenKey end) + { + RowFilter rowFilter = RowFilter.create(false); + rowFilter.add(AccordJournalTable.SyntheticColumn.participants.metadata, Operator.GT, OrderedRouteSerializer.serialize(start)); + rowFilter.add(AccordJournalTable.SyntheticColumn.participants.metadata, Operator.LTE, OrderedRouteSerializer.serialize(end)); + rowFilter.add(AccordJournalTable.SyntheticColumn.store_id.metadata, Operator.EQ, Int32Type.instance.decompose(store)); + + return process(store, rowFilter); + } + + private CloseableIterator tableSearch(int store, TokenKey key) + { + RowFilter rowFilter = RowFilter.create(false); + rowFilter.add(AccordJournalTable.SyntheticColumn.participants.metadata, Operator.GTE, OrderedRouteSerializer.serialize(key)); + rowFilter.add(AccordJournalTable.SyntheticColumn.participants.metadata, Operator.LTE, OrderedRouteSerializer.serialize(key)); + rowFilter.add(AccordJournalTable.SyntheticColumn.store_id.metadata, Operator.EQ, Int32Type.instance.decompose(store)); + + return process(store, rowFilter); + } + + private CloseableIterator process(int storeId, RowFilter rowFilter) + { + PartitionRangeReadCommand cmd = PartitionRangeReadCommand.create(cfs.metadata(), + FBUtilities.nowInSeconds(), + ColumnFilter.selectionBuilder() + .add(AccordJournalTable.SyntheticColumn.store_id.metadata) + .add(AccordJournalTable.SyntheticColumn.txn_id.metadata) + .build(), + rowFilter, + DataLimits.NONE, + DataRange.allData(cfs.getPartitioner())); + Index.Searcher s = tableIndex.searcherFor(cmd); + try (ReadExecutionController controller = cmd.executionController()) + { + UnfilteredPartitionIterator partitionIterator = s.search(controller); + return new CloseableIterator<>() + { + + @Override + public void close() + { + partitionIterator.close(); + } + + @Override + public boolean hasNext() + { + return partitionIterator.hasNext(); + } + + @Override + public TxnId next() + { + UnfilteredRowIterator next = partitionIterator.next(); + JournalKey partitionKeyComponents = AccordKeyspace.JournalColumns.getJournalKey(next.partitionKey()); + Invariants.require(partitionKeyComponents.commandStoreId == storeId, + () -> String.format("table index returned a command store other than the exepcted one; expected %d != %d", storeId, partitionKeyComponents.commandStoreId)); + return partitionKeyComponents.id; + } + }; + } + } + } + + /** + * Perform a read from Journal table, followed by the reads from all journal segments. + *

    + * When reading from journal segments, skip descriptors that were read from the table. + */ + public void readAll(K key, Reader reader) + { + readAll(key, new RecordConsumerAdapter(reader)); + } + + public void readAll(K key, RecordConsumer reader) + { + JournalAndTableRecordConsumer consumer = new JournalAndTableRecordConsumer(key, reader); + journal.readAll(key, consumer); + consumer.readTable(); + } + + private void readAllFromTable(K key, TableRecordConsumer onEntry) + { + DecoratedKey pk = AccordKeyspace.JournalColumns.decorate(key); + try (RefViewFragment view = cfs.selectAndReference(View.select(SSTableSet.LIVE, pk))) + { + if (view.sstables.isEmpty()) + return; + + List iters = new ArrayList<>(view.sstables.size()); + for (SSTableReader sstable : view.sstables) + if (sstable.mayContainAssumingKeyIsInRange(pk)) + iters.add(StorageHook.instance.makeRowIterator(cfs, sstable, pk, Slices.ALL, ColumnFilter.all(cfs.metadata()), false, NOOP_LISTENER)); + + if (!iters.isEmpty()) + { + EntryHolder into = new EntryHolder<>(); + try (UnfilteredRowIterator iter = UnfilteredRowIterators.merge(iters)) + { + while (iter.hasNext()) readRow(key, iter.next(), into, onEntry); + } + } + } + } + + private void readRow(K key, Unfiltered unfiltered, EntryHolder into, RecordConsumer onEntry) + { + Invariants.require(unfiltered.isRow()); + Row row = (Row) unfiltered; + + long descriptor = LongType.instance.compose(ByteBuffer.wrap((byte[]) row.clustering().get(0))); + int position = Int32Type.instance.compose(ByteBuffer.wrap((byte[]) row.clustering().get(1))); + into.key = key; + into.value = row.getCell(recordColumn).buffer(); + into.userVersion = Int32Type.instance.compose(row.getCell(versionColumn).buffer()); + + onEntry.accept(descriptor, position, into.key, into.value, into.userVersion); + } + + @SuppressWarnings("resource") // Auto-closeable iterator will release related resources + public CloseableIterator> keyIterator() + { + return new JournalAndTableKeyIterator(); + } + + private class TableIterator extends AbstractIterator implements CloseableIterator + { + private final UnfilteredPartitionIterator mergeIterator; + private final RefViewFragment view; + + private TableIterator() + { + view = cfs.selectAndReference(v -> v.select(SSTableSet.LIVE)); + List scanners = new ArrayList<>(); + for (SSTableReader sstable : view.sstables) + scanners.add(sstable.getScanner()); + + mergeIterator = view.sstables.isEmpty() + ? EmptyIterators.unfilteredPartition(cfs.metadata()) + : UnfilteredPartitionIterators.merge(scanners, UnfilteredPartitionIterators.MergeListener.NOOP); + } + + @CheckForNull + protected K computeNext() + { + if (mergeIterator.hasNext()) + { + try (UnfilteredRowIterator partition = mergeIterator.next()) + { + return (K) AccordKeyspace.JournalColumns.getJournalKey(partition.partitionKey()); + } + } + else + return endOfData(); + } + + @Override + public void close() + { + mergeIterator.close(); + view.close(); + } + } + + private class JournalAndTableKeyIterator extends AbstractIterator> implements CloseableIterator> + { + final TableIterator tableIterator; + final Journal.StaticSegmentKeyIterator journalIterator; + + private JournalAndTableKeyIterator() + { + this.tableIterator = new TableIterator(); + this.journalIterator = journal.staticSegmentKeyIterator(); + } + + protected Journal.KeyRefs computeNext() + { + K tableKey = tableIterator.hasNext() ? tableIterator.peek() : null; + Journal.KeyRefs journalKey = journalIterator.hasNext() ? journalIterator.peek() : null; + + if (tableKey == null) + return journalKey == null ? endOfData() : journalIterator.next(); + + if (journalKey == null) + return new Journal.KeyRefs<>(tableIterator.next()); + + int cmp = keySupport.compare(tableKey, journalKey.key()); + if (cmp == 0) + { + tableIterator.next(); + return journalIterator.next(); + } + + return cmp > 0 ? new Journal.KeyRefs<>(tableIterator.next()) : journalIterator.next(); + } + + public void close() + { + tableIterator.close(); + journalIterator.close(); + } + } + + public static void readBuffer(ByteBuffer buffer, Reader reader, Version userVersion) + { + try (DataInputBuffer in = new DataInputBuffer(buffer, false)) + { + reader.read(in, userVersion); + } + catch (IOException e) + { + // can only throw if serializer is buggy + throw new RuntimeException(e); + } + } +} \ No newline at end of file diff --git a/src/java/org/apache/cassandra/service/accord/AccordJournalValueSerializers.java b/src/java/org/apache/cassandra/service/accord/AccordJournalValueSerializers.java new file mode 100644 index 000000000000..58b238d31f59 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/AccordJournalValueSerializers.java @@ -0,0 +1,330 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import java.io.IOException; +import java.util.NavigableMap; + +import com.google.common.collect.ImmutableSortedMap; + +import accord.local.DurableBefore; +import accord.local.RedundantBefore; +import accord.primitives.Ranges; +import accord.primitives.Timestamp; +import accord.primitives.TxnId; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.service.accord.journal.AccordTopologyUpdate; +import org.apache.cassandra.service.accord.serializers.CommandStoreSerializers; +import org.apache.cassandra.service.accord.serializers.Version; + +import static accord.local.CommandStores.RangesForEpoch; + +// TODO (required): test with large collection values, and perhaps split out some fields if they have a tendency to grow larger +// TODO (required): alert on metadata size +// TODO (required): versioning +public class AccordJournalValueSerializers +{ + public interface FlyweightImage + { + void reset(JournalKey key); + } + + public interface FlyweightSerializer + { + IMAGE mergerFor(); + + void serialize(JournalKey key, ENTRY from, DataOutputPlus out, Version userVersion) throws IOException; + + void reserialize(JournalKey key, IMAGE from, DataOutputPlus out, Version userVersion) throws IOException; + + void deserialize(JournalKey key, IMAGE into, DataInputPlus in, Version userVersion) throws IOException; + } + + public static class CommandDiffSerializer + implements FlyweightSerializer + { + @Override + public AccordJournal.Builder mergerFor() + { + return new AccordJournal.Builder(); + } + + @Override + public void serialize(JournalKey key, AccordJournal.Writer writer, DataOutputPlus out, Version userVersion) + { + try + { + writer.write(out, userVersion); + } + catch (IOException e) + { + throw new RuntimeException(e); + } + } + + @Override + public void reserialize(JournalKey key, AccordJournal.Builder from, DataOutputPlus out, Version userVersion) throws IOException + { + from.serialize(out, + // In CompactionIterator, we are dealing with relatively recent records, so we do not pass redundant before here. + // However, we do on load and during Journal SSTable compaction. + userVersion); + } + + @Override + public void deserialize(JournalKey journalKey, AccordJournal.Builder into, DataInputPlus in, Version userVersion) throws IOException + { + into.deserializeNext(in, userVersion); + } + } + + public abstract static class Accumulator implements FlyweightImage + { + protected A accumulated; + + public Accumulator(A initial) + { + this.accumulated = initial; + } + + protected void update(V newValue) + { + accumulated = accumulate(accumulated, newValue); + } + + protected abstract A accumulate(A oldValue, V newValue); + + public A get() + { + return accumulated; + } + } + + public static class IdentityAccumulator extends Accumulator + { + final T initial; + boolean hasRead; + public IdentityAccumulator(T initial) + { + super(initial); + this.initial = initial; + } + + @Override + public void reset(JournalKey key) + { + hasRead = false; + accumulated = initial; + } + + @Override + protected T accumulate(T oldValue, T newValue) + { + if (hasRead) + return oldValue; + hasRead = true; + return newValue; + } + } + + public static class RedundantBeforeSerializer + implements FlyweightSerializer> + { + @Override + public IdentityAccumulator mergerFor() + { + return new IdentityAccumulator<>(RedundantBefore.EMPTY); + } + + @Override + public void serialize(JournalKey key, RedundantBefore entry, DataOutputPlus out, Version userVersion) + { + try + { + if (entry == RedundantBefore.EMPTY) + { + out.writeInt(0); + return; + } + out.writeInt(1); + CommandStoreSerializers.redundantBefore.serialize(entry, out); + } + catch (IOException e) + { + throw new RuntimeException(e); + } + } + + @Override + public void reserialize(JournalKey key, IdentityAccumulator from, DataOutputPlus out, Version userVersion) throws IOException + { + serialize(key, from.get(), out, userVersion); + } + + @Override + public void deserialize(JournalKey journalKey, IdentityAccumulator into, DataInputPlus in, Version userVersion) throws IOException + { + if (in.readInt() == 0) + { + into.update(RedundantBefore.EMPTY); + return; + } + into.update(CommandStoreSerializers.redundantBefore.deserialize(in)); + } + } + + public static class DurableBeforeAccumulator extends Accumulator + { + public DurableBeforeAccumulator() + { + super(DurableBefore.EMPTY); + } + + @Override + public void reset(JournalKey key) + { + accumulated = DurableBefore.EMPTY; + } + + @Override + protected DurableBefore accumulate(DurableBefore oldValue, DurableBefore newValue) + { + return DurableBefore.merge(oldValue, newValue); + } + } + + public static class DurableBeforeSerializer + implements FlyweightSerializer + { + public DurableBeforeAccumulator mergerFor() + { + return new DurableBeforeAccumulator(); + } + + @Override + public void serialize(JournalKey key, DurableBefore entry, DataOutputPlus out, Version userVersion) + { + try + { + CommandStoreSerializers.durableBefore.serialize(entry, out); + } + catch (IOException e) + { + throw new RuntimeException(e); + } + } + + @Override + public void reserialize(JournalKey key, DurableBeforeAccumulator from, DataOutputPlus out, Version userVersion) throws IOException + { + serialize(key, from.get(), out, userVersion); + } + + @Override + public void deserialize(JournalKey journalKey, DurableBeforeAccumulator into, DataInputPlus in, Version userVersion) throws IOException + { + into.update(CommandStoreSerializers.durableBefore.deserialize(in)); + } + } + + public static class BootstrapBeganAtSerializer + implements FlyweightSerializer, IdentityAccumulator>> + { + @Override + public IdentityAccumulator> mergerFor() + { + return new IdentityAccumulator<>(ImmutableSortedMap.of(TxnId.NONE, Ranges.EMPTY)); + } + + @Override + public void serialize(JournalKey key, NavigableMap entry, DataOutputPlus out, Version userVersion) throws IOException + { + CommandStoreSerializers.bootstrapBeganAt.serialize(entry, out); + } + + @Override + public void reserialize(JournalKey key, IdentityAccumulator> image, DataOutputPlus out, Version userVersion) throws IOException + { + serialize(key, image.get(), out, userVersion); + } + + @Override + public void deserialize(JournalKey key, IdentityAccumulator> into, DataInputPlus in, Version userVersion) throws IOException + { + into.update(CommandStoreSerializers.bootstrapBeganAt.deserialize(in)); + } + } + + public static class SafeToReadSerializer + implements FlyweightSerializer, IdentityAccumulator>> + { + @Override + public IdentityAccumulator> mergerFor() + { + return new IdentityAccumulator<>(ImmutableSortedMap.of(Timestamp.NONE, Ranges.EMPTY)); + } + + @Override + public void serialize(JournalKey key, NavigableMap from, DataOutputPlus out, Version userVersion) throws IOException + { + CommandStoreSerializers.safeToRead.serialize(from, out); + } + + @Override + public void reserialize(JournalKey key, IdentityAccumulator> from, DataOutputPlus out, Version userVersion) throws IOException + { + serialize(key, from.get(), out, userVersion); + } + + @Override + public void deserialize(JournalKey key, IdentityAccumulator> into, DataInputPlus in, Version userVersion) throws IOException + { + into.update(CommandStoreSerializers.safeToRead.deserialize(in)); + } + } + + public static class RangesForEpochSerializer + implements FlyweightSerializer> + { + public static final RangesForEpochSerializer instance = new RangesForEpochSerializer(); + public IdentityAccumulator mergerFor() + { + return new IdentityAccumulator<>(null); + } + + @Override + public void serialize(JournalKey key, RangesForEpoch from, DataOutputPlus out, Version userVersion) throws IOException + { + AccordTopologyUpdate.RangesForEpochSerializer.instance.serialize(from, out); + } + + @Override + public void reserialize(JournalKey key, Accumulator from, DataOutputPlus out, Version userVersion) throws IOException + { + serialize(key, from.get(), out, userVersion); + } + + @Override + public void deserialize(JournalKey key, Accumulator into, DataInputPlus in, Version userVersion) throws IOException + { + into.update(AccordTopologyUpdate.RangesForEpochSerializer.instance.deserialize(in)); + } + } +} \ No newline at end of file diff --git a/src/java/org/apache/cassandra/service/accord/AccordKeyspace.java b/src/java/org/apache/cassandra/service/accord/AccordKeyspace.java new file mode 100644 index 000000000000..e597d7d4155c --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/AccordKeyspace.java @@ -0,0 +1,590 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Iterator; +import java.util.List; +import java.util.Set; +import java.util.concurrent.TimeUnit; +import java.util.function.Consumer; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Lists; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import accord.local.RedundantBefore; +import accord.local.cfk.CommandsForKey; +import accord.local.cfk.Serialize; +import accord.primitives.TxnId; +import accord.utils.Invariants; +import org.apache.cassandra.cql3.ColumnIdentifier; +import org.apache.cassandra.cql3.statements.schema.CreateTableStatement; +import org.apache.cassandra.db.Clustering; +import org.apache.cassandra.db.ClusteringComparator; +import org.apache.cassandra.db.ColumnFamilyStore; +import org.apache.cassandra.db.Columns; +import org.apache.cassandra.db.DataRange; +import org.apache.cassandra.db.DecoratedKey; +import org.apache.cassandra.db.Keyspace; +import org.apache.cassandra.db.PartitionPosition; +import org.apache.cassandra.db.ReadExecutionController; +import org.apache.cassandra.db.RegularAndStaticColumns; +import org.apache.cassandra.db.SinglePartitionReadCommand; +import org.apache.cassandra.db.Slices; +import org.apache.cassandra.db.WriteContext; +import org.apache.cassandra.db.filter.ClusteringIndexFilter; +import org.apache.cassandra.db.filter.ClusteringIndexNamesFilter; +import org.apache.cassandra.db.filter.ClusteringIndexSliceFilter; +import org.apache.cassandra.db.filter.ColumnFilter; +import org.apache.cassandra.db.filter.DataLimits; +import org.apache.cassandra.db.filter.RowFilter; +import org.apache.cassandra.db.lifecycle.View; +import org.apache.cassandra.db.marshal.ByteBufferAccessor; +import org.apache.cassandra.db.marshal.BytesType; +import org.apache.cassandra.db.marshal.Int32Type; +import org.apache.cassandra.db.marshal.LongType; +import org.apache.cassandra.db.marshal.TupleType; +import org.apache.cassandra.db.memtable.Memtable; +import org.apache.cassandra.db.partitions.PartitionUpdate; +import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator; +import org.apache.cassandra.db.rows.BTreeRow; +import org.apache.cassandra.db.rows.BufferCell; +import org.apache.cassandra.db.rows.Cell; +import org.apache.cassandra.db.rows.Row; +import org.apache.cassandra.db.rows.RowIterator; +import org.apache.cassandra.db.transform.FilteredPartitions; +import org.apache.cassandra.dht.AbstractBounds; +import org.apache.cassandra.dht.Bounds; +import org.apache.cassandra.dht.ExcludingBounds; +import org.apache.cassandra.dht.IPartitioner; +import org.apache.cassandra.dht.IncludingExcludingBounds; +import org.apache.cassandra.dht.LocalPartitioner; +import org.apache.cassandra.dht.LocalPartitioner.LocalToken; +import org.apache.cassandra.dht.Range; +import org.apache.cassandra.dht.Token; +import org.apache.cassandra.index.accord.RouteJournalIndex; +import org.apache.cassandra.index.transactions.UpdateTransaction; +import org.apache.cassandra.io.sstable.SSTableReadsListener; +import org.apache.cassandra.io.sstable.format.SSTableReader; +import org.apache.cassandra.schema.ColumnMetadata; +import org.apache.cassandra.schema.CompactionParams; +import org.apache.cassandra.schema.IndexMetadata; +import org.apache.cassandra.schema.Indexes; +import org.apache.cassandra.schema.KeyspaceMetadata; +import org.apache.cassandra.schema.KeyspaceParams; +import org.apache.cassandra.schema.Schema; +import org.apache.cassandra.schema.SchemaConstants; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.schema.Tables; +import org.apache.cassandra.schema.Types; +import org.apache.cassandra.schema.UserFunctions; +import org.apache.cassandra.schema.Views; +import org.apache.cassandra.service.accord.api.TokenKey; +import org.apache.cassandra.service.accord.serializers.CommandSerializers; +import org.apache.cassandra.utils.AbstractIterator; +import org.apache.cassandra.utils.Clock.Global; +import org.apache.cassandra.utils.CloseableIterator; +import org.apache.cassandra.utils.MergeIterator; +import org.apache.cassandra.utils.btree.BTreeSet; +import org.apache.cassandra.utils.concurrent.OpOrder; +import org.apache.cassandra.utils.vint.VIntCoding; + +import static java.lang.String.format; +import static java.util.Collections.emptyMap; +import static org.apache.cassandra.db.partitions.PartitionUpdate.singleRowUpdate; +import static org.apache.cassandra.db.rows.BTreeRow.singleCellRow; +import static org.apache.cassandra.schema.SchemaConstants.ACCORD_KEYSPACE_NAME; + +public class AccordKeyspace +{ + private static final Logger logger = LoggerFactory.getLogger(AccordKeyspace.class); + + public static final String JOURNAL = "journal"; + public static final String COMMANDS_FOR_KEY = "commands_for_key"; + public static final String JOURNAL_INDEX_NAME = "record"; + + public static final Set TABLE_NAMES = ImmutableSet.of(COMMANDS_FOR_KEY, JOURNAL); + + public static final TupleType TIMESTAMP_TYPE = new TupleType(Lists.newArrayList(LongType.instance, LongType.instance, Int32Type.instance)); + + private static final ClusteringIndexFilter FULL_PARTITION = new ClusteringIndexNamesFilter(BTreeSet.of(new ClusteringComparator(), Clustering.EMPTY), false); + + public static TableMetadata journalMetadata(String tableName, boolean index) + { + TableMetadata.Builder builder = parse(tableName, + "accord journal", + "CREATE TABLE %s (" + + "key blob," + + "descriptor bigint," + + "offset int," + + "user_version int," + + "record blob," + + "PRIMARY KEY((key), descriptor, offset)" + + ") WITH CLUSTERING ORDER BY (descriptor DESC, offset DESC)" + + " WITH compression = {'class':'NoopCompressor'};") + .compaction(CompactionParams.lcs(emptyMap())) + .bloomFilterFpChance(0.01) + .partitioner(new LocalPartitioner(BytesType.instance)); + if (index) + builder.indexes(Indexes.builder() + .add(IndexMetadata.fromSchemaMetadata(JOURNAL_INDEX_NAME, IndexMetadata.Kind.CUSTOM, ImmutableMap.of("class_name", RouteJournalIndex.class.getCanonicalName(), "target", "record,user_version"))) + .build()); + return builder.build(); + } + + public static final TableMetadata Journal = journalMetadata(JOURNAL, true); + + private static ColumnMetadata getColumn(TableMetadata metadata, String name) + { + ColumnMetadata column = metadata.getColumn(new ColumnIdentifier(name, true)); + if (column == null) + throw new IllegalArgumentException(format("Unknown column %s for %s.%s", name, metadata.keyspace, metadata.name)); + return column; + } + + private static final LocalPartitioner CFKPartitioner = new LocalPartitioner(BytesType.instance); + public static final TableMetadata CommandsForKeys = commandsForKeysTable(COMMANDS_FOR_KEY); + public static final CommandsForKeyAccessor CFKAccessor = new CommandsForKeyAccessor(CommandsForKeys); + private static TableMetadata commandsForKeysTable(String tableName) + { + return parse(tableName, + "accord commands per key", + "CREATE TABLE %s (" + + "key blob, " + + "data blob, " + + "PRIMARY KEY(key)" + + ')' + + " WITH compression = {'class':'NoopCompressor'};") + .partitioner(CFKPartitioner) + .compaction(CompactionParams.lcs(emptyMap())) + .bloomFilterFpChance(0.01) + .build(); + } + + public static class CommandsForKeyAccessor + { + final TableMetadata table; + final ClusteringComparator keyComparator; + final ColumnFilter allColumns; + final ColumnMetadata data; + + final RegularAndStaticColumns columns; + + public CommandsForKeyAccessor(TableMetadata table) + { + this.table = table; + this.keyComparator = table.partitionKeyAsClusteringComparator(); + this.allColumns = ColumnFilter.all(table); + this.data = getColumn(table, "data"); + this.columns = new RegularAndStaticColumns(Columns.NONE, Columns.from(Lists.newArrayList(data))); + } + + public static int getCommandStoreId(ByteBuffer partitionKey) + { + return partitionKey.getInt(partitionKey.position()); + } + + public static TokenKey getUserTableKey(TableId tableId, DecoratedKey key) + { + return getUserTableKey(tableId, key.getKey()); + } + + public static TokenKey getUserTableKey(TableId tableId, ByteBuffer partitionKey) + { + return TokenKey.serializer.deserializeWithPrefixAndImpliedLength(tableId, partitionKey, ByteBufferAccessor.instance, 4); + } + + public static TokenKey getUserTableKey(TableId tableId, DecoratedKey key, IPartitioner partitioner) + { + return getUserTableKey(tableId, key.getKey(), partitioner); + } + + public static TokenKey getUserTableKey(TableId tableId, ByteBuffer partitionKey, IPartitioner partitioner) + { + return TokenKey.serializer.deserializeWithPrefixAndImpliedLength(tableId, partitionKey, ByteBufferAccessor.instance, 4, partitioner); + } + + public static DecoratedKey makeSystemTableKey(int commandStoreId, TokenKey key) + { + return CFKPartitioner.decorateKey(makeSystemTableKeyBytes(commandStoreId, key)); + } + + public static LocalToken makeSystemTableToken(int commandStore, TokenKey key) + { + return CFKPartitioner.getToken(makeSystemTableKeyBytes(commandStore, key)); + } + + public static ByteBuffer makeSystemTableKeyBytes(int commandStore, TokenKey key) + { + ByteBuffer result = ByteBuffer.allocate(4 + TokenKey.serializer.serializedSizeWithoutPrefix(key)); + result.putInt(commandStore); + TokenKey.serializer.serializeWithoutPrefixOrLength(key, result); + result.flip(); + return result; + } + + public static ByteBuffer serializeUserTableKey(TokenKey key) + { + return TokenKey.serializer.serializeWithoutPrefixOrLength(key); + } + + public CommandsForKey fromRow(TokenKey key, Row row) + { + Cell cell = row.getCell(data); + if (cell == null) + return null; + + return Serialize.fromBytes(key, cell.buffer()); + } + + public static CommandsForKey load(int commandStoreId, TokenKey key) + { + return unsafeLoad(CFKAccessor, commandStoreId, key); + } + + static CommandsForKey unsafeLoad(CommandsForKeyAccessor accessor, int commandStoreId, TokenKey key) + { + long timestampMicros = TimeUnit.MILLISECONDS.toMicros(Global.currentTimeMillis()); + int nowInSeconds = (int) TimeUnit.MICROSECONDS.toSeconds(timestampMicros); + + SinglePartitionReadCommand command = makeRead(accessor, commandStoreId, key, nowInSeconds); + + try (ReadExecutionController controller = command.executionController(); + FilteredPartitions partitions = FilteredPartitions.filter(command.executeLocally(controller), nowInSeconds)) + { + if (!partitions.hasNext()) + return null; + + try (RowIterator partition = partitions.next()) + { + Invariants.require(partition.hasNext()); + Row row = partition.next(); + ByteBuffer data = cellValue(row, accessor.data); + return Serialize.fromBytes(key, data); + } + } + catch (Throwable t) + { + logger.error("Exception loading AccordCommandsForKey " + key, t); + throw t; + } + } + + // TODO (expected): garbage-free filtering, reusing encoding + public Row withoutRedundantCommands(TokenKey key, Row row, RedundantBefore.Bounds redundantBefore) + { + Invariants.require(row.columnCount() == 1); + Cell cell = row.getCell(data); + if (cell == null) + return row; + + CommandsForKey current = Serialize.fromBytes(key, cell.buffer()); + if (current == null) + return null; + + CommandsForKey updated = current.withRedundantBeforeAtLeast(redundantBefore.gcBefore()); + if (current == updated) + return row; + + if (updated.isEmpty()) + return null; + + ByteBuffer buffer = Serialize.toBytesWithoutKey(updated); + return BTreeRow.singleCellRow(Clustering.EMPTY, BufferCell.live(data, cell.timestamp(), buffer)); + } + + public static SinglePartitionReadCommand makeRead(int storeId, TokenKey key, int nowInSeconds) + { + return makeRead(CFKAccessor, storeId, key, nowInSeconds); + } + + private static SinglePartitionReadCommand makeRead(CommandsForKeyAccessor accessor, int storeId, TokenKey key, long nowInSeconds) + { + return SinglePartitionReadCommand.create(accessor.table, nowInSeconds, + accessor.allColumns, + RowFilter.none(), + DataLimits.NONE, + makeSystemTableKey(storeId, key), + FULL_PARTITION); + } + + private static PartitionUpdate makeUpdate(int storeId, TokenKey key, CommandsForKey commandsForKey, Object serialized, long timestampMicros) + { + ByteBuffer bytes; + if (serialized instanceof ByteBuffer) bytes = (ByteBuffer) serialized; + else bytes = Serialize.toBytesWithoutKey(commandsForKey); + return makeUpdate(storeId, key, timestampMicros, bytes); + } + + @VisibleForTesting + public static PartitionUpdate makeUpdate(int storeId, TokenKey key, long timestampMicros, ByteBuffer bytes) + { + return singleRowUpdate(CFKAccessor.table, + CommandsForKeyAccessor.makeSystemTableKey(storeId, key), + singleCellRow(Clustering.EMPTY, BufferCell.live(CFKAccessor.data, timestampMicros, bytes))); + } + + public static Runnable systemTableUpdater(int storeId, TokenKey key, CommandsForKey update, Object serialized, long timestampMicros) + { + PartitionUpdate upd = makeUpdate(storeId, key, update, serialized, timestampMicros); + return () -> { + ColumnFamilyStore cfs = AccordColumnFamilyStores.commandsForKey; + try (OpOrder.Group group = Keyspace.writeOrder.start()) + { + cfs.getCurrentMemtable().put(upd, UpdateTransaction.NO_OP, group, true); + } + }; + } + + /** + * Calculates token bounds based on key prefixes. + */ + public static void findAllKeysBetween(int commandStore, TableId tableId, IPartitioner partitioner, + TokenKey start, boolean startInclusive, + TokenKey end, boolean endInclusive, + Consumer consumer) + { + + Token startToken = CommandsForKeyAccessor.makeSystemTableToken(commandStore, start); + Token endToken = CommandsForKeyAccessor.makeSystemTableToken(commandStore, end); + + if (start.isTableSentinel()) + startInclusive = true; + if (end.isTableSentinel()) + endInclusive = true; + + PartitionPosition startPosition = startInclusive ? startToken.minKeyBound() : startToken.maxKeyBound(); + PartitionPosition endPosition = endInclusive ? endToken.maxKeyBound() : endToken.minKeyBound(); + AbstractBounds bounds; + if (startInclusive && endInclusive) + bounds = new Bounds<>(startPosition, endPosition); + else if (endInclusive) + bounds = new Range<>(startPosition, endPosition); + else if (startInclusive) + bounds = new IncludingExcludingBounds<>(startPosition, endPosition); + else + bounds = new ExcludingBounds<>(startPosition, endPosition); + + ColumnFamilyStore baseCfs = AccordColumnFamilyStores.commandsForKey; + try (OpOrder.Group baseOp = baseCfs.readOrdering.start(); + WriteContext writeContext = baseCfs.keyspace.getWriteHandler().createContextForRead(); + CloseableIterator iter = keyIterator(CommandsForKeys, bounds)) + { + // Need the second try to handle callback errors vs read errors. + // Callback will see the read errors, but if the callback fails the outer try will see those errors + while (iter.hasNext()) + { + TokenKey pk = CommandsForKeyAccessor.getUserTableKey(tableId, iter.next(), partitioner); + consumer.accept(pk); + } + } + catch (IOException e) + { + throw new RuntimeException(e); + } + } + + /** + * Returns a DecoratedKey iterator for the given range. Skips reading data files for sstable formats with a partition index file + */ + private static CloseableIterator keyIterator(Memtable memtable, AbstractBounds range) + { + DataRange dataRange = new DataRange(range, new ClusteringIndexSliceFilter(Slices.ALL, false)); + UnfilteredPartitionIterator iter = memtable.partitionIterator(ColumnFilter.NONE, dataRange, SSTableReadsListener.NOOP_LISTENER); + + int rangeStartCmpMin = range.isStartInclusive() ? 0 : 1; + int rangeEndCmpMax = range.isEndInclusive() ? 0 : -1; + + return new AbstractIterator<>() + { + @Override + protected DecoratedKey computeNext() + { + while (iter.hasNext()) + { + DecoratedKey key = iter.next().partitionKey(); + if (key.compareTo(range.left) < rangeStartCmpMin) + continue; + + if (key.compareTo(range.right) > rangeEndCmpMax) + break; + + return key; + } + return endOfData(); + } + + @Override + public void close() + { + iter.close(); + } + }; + } + + private static CloseableIterator keyIterator(TableMetadata metadata, AbstractBounds range) throws IOException + { + ColumnFamilyStore cfs = Keyspace.openAndGetStore(metadata); + ColumnFamilyStore.ViewFragment view = cfs.select(View.selectLive(range)); + + List> closeableIterators = new ArrayList<>(); + List> iterators = new ArrayList<>(); + + try + { + for (Memtable memtable : view.memtables) + { + CloseableIterator iter = keyIterator(memtable, range); + iterators.add(iter); + closeableIterators.add(iter); + } + + for (SSTableReader sstable : view.sstables) + { + CloseableIterator iter = sstable.keyIterator(range); + iterators.add(iter); + closeableIterators.add(iter); + } + } + catch (Throwable e) + { + for (CloseableIterator iter: closeableIterators) + { + try + { + iter.close(); + } + catch (Throwable e2) + { + e.addSuppressed(e2); + } + } + throw e; + } + + return MergeIterator.get(iterators, DecoratedKey::compareTo, new MergeIterator.Reducer.Trivial<>()); + } + } + + public static final CommandsForKeyAccessor CommandsForKeysAccessor = new CommandsForKeyAccessor(CommandsForKeys); + + private static TableMetadata.Builder parse(String name, String description, String cql) + { + return CreateTableStatement.parse(format(cql, name), ACCORD_KEYSPACE_NAME) + .id(TableId.forSystemTable(ACCORD_KEYSPACE_NAME, name)) + .comment(description) + .gcGraceSeconds((int) TimeUnit.DAYS.toSeconds(90)); + } + + private static void flush(TableMetadata table) + { + Keyspace.open(table.keyspace).getColumnFamilyStore(table.id).forceBlockingFlush(ColumnFamilyStore.FlushReason.ACCORD); + } + + public static KeyspaceMetadata metadata() + { + return KeyspaceMetadata.create(ACCORD_KEYSPACE_NAME, KeyspaceParams.local(), tables(), Views.none(), Types.none(), UserFunctions.none()); + } + + public static Tables TABLES = Tables.of(CommandsForKeys, Journal); + public static Tables tables() + { + return TABLES; + } + + public static void truncateAllCaches() + { + Keyspace ks = Keyspace.open(ACCORD_KEYSPACE_NAME); + for (String table : new String[]{ CommandsForKeys.name }) + { + if (!ks.getColumnFamilyStore(table).isEmpty()) + ks.getColumnFamilyStore(table).truncateBlocking(); + } + } + + private static ByteBuffer cellValue(Cell cell) + { + return cell.accessor().toBuffer(cell.value()); + } + + // TODO (desired): convert to byte array + private static ByteBuffer cellValue(Row row, ColumnMetadata column) + { + Cell cell = row.getCell(column); + return (cell != null && !cell.isTombstone()) ? cellValue(cell) : null; + } + + public static class JournalColumns + { + public static final ColumnMetadata key = getColumn(Journal, "key"); + public static final ColumnMetadata record = getColumn(Journal, "record"); + public static final ColumnMetadata user_version = getColumn(Journal, "user_version"); + public static final RegularAndStaticColumns regular = new RegularAndStaticColumns(Columns.NONE, Columns.from(Arrays.asList(record, user_version))); + + public static DecoratedKey decorate(JournalKey key) + { + int commandStoreIdBytes = VIntCoding.computeUnsignedVIntSize(key.commandStoreId); + int length = commandStoreIdBytes + 1; + if (key.type == JournalKey.Type.COMMAND_DIFF) + length += CommandSerializers.txnId.serializedSize(key.id); + ByteBuffer pk = ByteBuffer.allocate(length); + ByteBufferAccessor.instance.putUnsignedVInt32(pk, 0, key.commandStoreId); + pk.put(commandStoreIdBytes, (byte)key.type.id); + if (key.type == JournalKey.Type.COMMAND_DIFF) + CommandSerializers.txnId.serializeComparable(key.id, pk, ByteBufferAccessor.instance, commandStoreIdBytes + 1); + return Journal.partitioner.decorateKey(pk); + } + + public static int getStoreId(DecoratedKey pk) + { + return VIntCoding.readUnsignedVInt32(pk.getKey(), 0); + } + + public static JournalKey getJournalKey(DecoratedKey key) + { + ByteBuffer bb = key.getKey(); + int storeId = ByteBufferAccessor.instance.getUnsignedVInt32(bb, 0); + int offset = VIntCoding.readLengthOfVInt(bb, 0); + JournalKey.Type type = JournalKey.Type.fromId(bb.get(offset)); + TxnId txnId = type != JournalKey.Type.COMMAND_DIFF ? TxnId.NONE : CommandSerializers.txnId.deserializeComparable(bb, ByteBufferAccessor.instance, offset + 1); + return new JournalKey(txnId, type, storeId); + } + } + + @VisibleForTesting + public static void unsafeClear() + { + for (ColumnFamilyStore store : Keyspace.open(SchemaConstants.ACCORD_KEYSPACE_NAME).getColumnFamilyStores()) + store.truncateBlockingWithoutSnapshot(); + } + + public static class AccordColumnFamilyStores + { + public static final ColumnFamilyStore journal = Schema.instance.getColumnFamilyStoreInstance(Journal.id); + public static final ColumnFamilyStore commandsForKey = Schema.instance.getColumnFamilyStoreInstance(CommandsForKeys.id); + } +} diff --git a/src/java/org/apache/cassandra/service/accord/AccordMessageSink.java b/src/java/org/apache/cassandra/service/accord/AccordMessageSink.java new file mode 100644 index 000000000000..cfa9db8101a6 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/AccordMessageSink.java @@ -0,0 +1,259 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import java.util.Collections; +import java.util.EnumSet; +import java.util.Map; +import java.util.Set; + +import com.google.common.base.Preconditions; +import com.google.common.collect.ImmutableMap; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import accord.api.MessageSink; +import accord.impl.RequestCallbacks; +import accord.local.AgentExecutor; +import accord.local.Node; +import accord.messages.Callback; +import accord.messages.MessageType; +import accord.messages.Reply; +import accord.messages.ReplyContext; +import accord.messages.Request; +import accord.primitives.TxnId; +import org.apache.cassandra.exceptions.RequestFailureReason; +import org.apache.cassandra.locator.InetAddressAndPort; +import org.apache.cassandra.net.Message; +import org.apache.cassandra.net.MessageDelivery; +import org.apache.cassandra.net.MessageFlag; +import org.apache.cassandra.net.MessagingService; +import org.apache.cassandra.net.ResponseContext; +import org.apache.cassandra.net.Verb; +import org.apache.cassandra.service.TimeoutStrategy; +import org.apache.cassandra.service.accord.api.AccordAgent; +import org.apache.cassandra.utils.Clock; + +import static accord.messages.MessageType.StandardMessage.*; +import static java.util.concurrent.TimeUnit.NANOSECONDS; +import static org.apache.cassandra.service.accord.api.AccordWaitStrategies.expire; +import static org.apache.cassandra.service.accord.api.AccordWaitStrategies.slowPreaccept; +import static org.apache.cassandra.service.accord.api.AccordWaitStrategies.slowRead; + +public class AccordMessageSink implements MessageSink +{ + private static final Logger logger = LoggerFactory.getLogger(AccordMessageSink.class); + + public enum AccordMessageType implements MessageType + { + INTEROP_READ_REQ(Verb.ACCORD_INTEROP_READ_REQ), + INTEROP_STABLE_THEN_READ_REQ(Verb.ACCORD_INTEROP_STABLE_THEN_READ_REQ), + INTEROP_READ_RSP(Verb.ACCORD_INTEROP_READ_RSP), + INTEROP_READ_REPAIR_REQ(Verb.ACCORD_INTEROP_READ_REPAIR_REQ), + INTEROP_READ_REPAIR_RSP(Verb.ACCORD_INTEROP_READ_REPAIR_RSP), + INTEROP_APPLY_REQ(Verb.ACCORD_INTEROP_APPLY_REQ); + final Verb verb; + + AccordMessageType(Verb verb) + { + this.verb = verb; + } + } + + private static class VerbMapping + { + private static final Map> overrideReplyVerbs = ImmutableMap.>builder() + // read takes Result | Nack + .put(Verb.ACCORD_FETCH_DATA_REQ, EnumSet.of(Verb.ACCORD_FETCH_DATA_RSP, Verb.ACCORD_READ_RSP /* nack */)) + .put(Verb.ACCORD_INTEROP_STABLE_THEN_READ_REQ, EnumSet.of(Verb.ACCORD_INTEROP_READ_RSP, Verb.ACCORD_READ_RSP)) + .put(Verb.ACCORD_INTEROP_READ_REPAIR_REQ, EnumSet.of(Verb.ACCORD_INTEROP_READ_REPAIR_RSP, Verb.ACCORD_READ_RSP)) + .build(); + + static + { + ImmutableMap.Builder builder = ImmutableMap.builder(); + builder.put(SIMPLE_RSP, Verb.ACCORD_SIMPLE_RSP); + builder.put(PRE_ACCEPT_REQ, Verb.ACCORD_PRE_ACCEPT_REQ); + builder.put(PRE_ACCEPT_RSP, Verb.ACCORD_PRE_ACCEPT_RSP); + builder.put(ACCEPT_REQ, Verb.ACCORD_ACCEPT_REQ); + builder.put(ACCEPT_RSP, Verb.ACCORD_ACCEPT_RSP); + builder.put(NOT_ACCEPT_REQ, Verb.ACCORD_NOT_ACCEPT_REQ); + builder.put(RECOVER_AWAIT_REQ, Verb.ACCORD_RECOVER_AWAIT_REQ); + builder.put(RECOVER_AWAIT_RSP, Verb.ACCORD_RECOVER_AWAIT_RSP); + builder.put(GET_LATEST_DEPS_REQ, Verb.ACCORD_GET_LATEST_DEPS_REQ); + builder.put(GET_LATEST_DEPS_RSP, Verb.ACCORD_GET_LATEST_DEPS_RSP); + builder.put(GET_EPHEMERAL_READ_DEPS_REQ, Verb.ACCORD_GET_EPHMRL_READ_DEPS_REQ); + builder.put(GET_EPHEMERAL_READ_DEPS_RSP, Verb.ACCORD_GET_EPHMRL_READ_DEPS_RSP); + builder.put(GET_MAX_CONFLICT_REQ, Verb.ACCORD_GET_MAX_CONFLICT_REQ); + builder.put(GET_MAX_CONFLICT_RSP, Verb.ACCORD_GET_MAX_CONFLICT_RSP); + builder.put(COMMIT_REQ, Verb.ACCORD_COMMIT_REQ); + builder.put(COMMIT_INVALIDATE_REQ, Verb.ACCORD_COMMIT_INVALIDATE_REQ); + builder.put(APPLY_REQ, Verb.ACCORD_APPLY_REQ); + builder.put(APPLY_RSP, Verb.ACCORD_APPLY_RSP); + builder.put(READ_REQ, Verb.ACCORD_READ_REQ); + builder.put(STABLE_THEN_READ_REQ, Verb.ACCORD_STABLE_THEN_READ_REQ); + builder.put(READ_EPHEMERAL_REQ, Verb.ACCORD_READ_REQ); + builder.put(READ_RSP, Verb.ACCORD_READ_RSP); + builder.put(BEGIN_RECOVER_REQ, Verb.ACCORD_BEGIN_RECOVER_REQ); + builder.put(BEGIN_RECOVER_RSP, Verb.ACCORD_BEGIN_RECOVER_RSP); + builder.put(BEGIN_INVALIDATE_REQ, Verb.ACCORD_BEGIN_INVALIDATE_REQ); + builder.put(BEGIN_INVALIDATE_RSP, Verb.ACCORD_BEGIN_INVALIDATE_RSP); + builder.put(AWAIT_REQ, Verb.ACCORD_AWAIT_REQ); + builder.put(AWAIT_RSP, Verb.ACCORD_AWAIT_RSP); + builder.put(ASYNC_AWAIT_COMPLETE_REQ, Verb.ACCORD_AWAIT_ASYNC_RSP_REQ); + builder.put(WAIT_UNTIL_APPLIED_REQ, Verb.ACCORD_WAIT_UNTIL_APPLIED_REQ); + builder.put(APPLY_THEN_WAIT_UNTIL_APPLIED_REQ, Verb.ACCORD_APPLY_AND_WAIT_REQ); + builder.put(INFORM_DURABLE_REQ, Verb.ACCORD_INFORM_DURABLE_REQ); + builder.put(CHECK_STATUS_REQ, Verb.ACCORD_CHECK_STATUS_REQ); + builder.put(CHECK_STATUS_RSP, Verb.ACCORD_CHECK_STATUS_RSP); + builder.put(FETCH_DATA_REQ, Verb.ACCORD_FETCH_DATA_REQ); + builder.put(FETCH_DATA_RSP, Verb.ACCORD_FETCH_DATA_RSP); + builder.put(SET_SHARD_DURABLE_REQ, Verb.ACCORD_SET_SHARD_DURABLE_REQ); + builder.put(SET_GLOBALLY_DURABLE_REQ, Verb.ACCORD_SET_GLOBALLY_DURABLE_REQ); + builder.put(GET_DURABLE_BEFORE_REQ, Verb.ACCORD_GET_DURABLE_BEFORE_REQ); + builder.put(GET_DURABLE_BEFORE_RSP, Verb.ACCORD_GET_DURABLE_BEFORE_RSP); + builder.put(FAILURE_RSP, Verb.FAILURE_RSP); + Map mapping = builder.build(); + StandardMessage.initialise(mapping); + } + + private static Verb getVerb(MessageType type) + { + if (type.getClass() == StandardMessage.class) + return (Verb) ((StandardMessage) type).mapToImplementation(); + return ((AccordMessageType)type).verb; + } + + private static Verb getVerb(Request request) + { + MessageType type = request.type(); + if (type != null) + return getVerb(type); + return null; + } + } + + private final AccordAgent agent; + private final MessageDelivery messaging; + private final AccordEndpointMapper endpointMapper; + private final RequestCallbacks callbacks; + + public AccordMessageSink(AccordAgent agent, MessageDelivery messaging, AccordEndpointMapper endpointMapper, RequestCallbacks callbacks) + { + this.agent = agent; + this.messaging = messaging; + this.endpointMapper = endpointMapper; + this.callbacks = callbacks; + } + + public AccordMessageSink(AccordAgent agent, AccordConfigurationService endpointMapper, RequestCallbacks callbacks) + { + this(agent, MessagingService.instance(), endpointMapper, callbacks); + } + + @Override + public void send(Node.Id to, Request request) + { + Verb verb = VerbMapping.getVerb(request); + Preconditions.checkNotNull(verb, "Verb is null for type %s", request.type()); + Message message = Message.out(verb, request); + InetAddressAndPort endpoint = endpointMapper.mappedEndpoint(to); + logger.trace("Sending {} {} to {}", verb, message.payload, endpoint); + messaging.send(message, endpoint); + } + + // TODO (expected): permit bulk send to save esp. on callback registration (and combine records) + @Override + public void send(Node.Id to, Request request, int attempt, AgentExecutor executor, Callback callback) + { + Verb verb = VerbMapping.getVerb(request); + Preconditions.checkNotNull(verb, "Verb is null for type %s", request.type()); + + long nowNanos = Clock.Global.nanoTime(); + TxnId txnId = request.primaryTxnId(); + long slowAtNanos = Long.MAX_VALUE; + long expiresAtNanos = nowNanos + expire(txnId, verb).computeWait(attempt, NANOSECONDS); + + switch (verb) + { + case ACCORD_READ_REQ: + case ACCORD_STABLE_THEN_READ_REQ: + case ACCORD_CHECK_STATUS_REQ: + { + TimeoutStrategy slow = slowRead(txnId); + if (slow != null) + slowAtNanos = nowNanos + slow.computeWait(attempt, NANOSECONDS); + break; + } + + case ACCORD_PRE_ACCEPT_REQ: + { + TimeoutStrategy slow = slowPreaccept(txnId); + if (slow != null) + slowAtNanos = nowNanos + slow.computeWait(attempt, NANOSECONDS); + break; + } + } + + Message message = Message.out(verb, request, expiresAtNanos); + InetAddressAndPort endpoint = endpointMapper.mappedEndpoint(to); + logger.trace("Sending {} {} to {}", verb, message.payload, endpoint); + callbacks.registerAt(message.id(), executor, callback, to, nowNanos, slowAtNanos, expiresAtNanos, NANOSECONDS); + messaging.send(message, endpoint); + } + + @Override + public void reply(Node.Id replyingToNode, ReplyContext replyContext, Reply reply) + { + ResponseContext respondTo = (ResponseContext) replyContext; + Message responseMsg = Message.responseWith(reply, respondTo); + if (!reply.isFinal()) + responseMsg = responseMsg.withFlag(MessageFlag.NOT_FINAL); + checkReplyType(reply, respondTo); + InetAddressAndPort endpoint = endpointMapper.mappedEndpoint(replyingToNode); + logger.trace("Replying {} {} to {}", responseMsg.verb(), responseMsg.payload, endpoint); + messaging.send(responseMsg, endpoint); + } + + @Override + public void replyWithUnknownFailure(Node.Id replyingToNode, ReplyContext replyContext, Throwable failure) + { + ResponseContext respondTo = (ResponseContext) replyContext; + Message responseMsg = Message.failureResponse(RequestFailureReason.UNKNOWN, failure, respondTo); + InetAddressAndPort endpoint = endpointMapper.mappedEndpoint(replyingToNode); + logger.trace("Replying with failure {} {} to {}", responseMsg.verb(), responseMsg.payload, endpoint); + messaging.send(responseMsg, endpoint); + } + + private static void checkReplyType(Reply reply, ResponseContext respondTo) + { + Verb verb = VerbMapping.getVerb(reply.type()); + Preconditions.checkNotNull(verb, "Verb is null for type %s", reply.type()); + Set allowedVerbs = expectedReplyTypes(respondTo.verb()); + Preconditions.checkArgument(allowedVerbs.contains(verb), "Expected reply message with verbs %s but got %s; reply type was %s, request verb was %s", allowedVerbs, verb, reply.type(), respondTo.verb()); + } + + private static Set expectedReplyTypes(Verb verb) + { + Set extra = VerbMapping.overrideReplyVerbs.get(verb); + if (extra != null) return extra; + Verb v = verb.responseVerb; + return v == null ? Collections.emptySet() : Collections.singleton(v); + } +} diff --git a/src/java/org/apache/cassandra/service/accord/AccordObjectSizes.java b/src/java/org/apache/cassandra/service/accord/AccordObjectSizes.java new file mode 100644 index 000000000000..c3b55718c59c --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/AccordObjectSizes.java @@ -0,0 +1,444 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import java.util.UUID; +import java.util.function.ToLongFunction; + +import accord.api.Key; +import accord.api.Result; +import accord.api.RoutingKey; +import accord.local.Command; +import accord.local.ICommand; +import accord.local.Node; +import accord.local.StoreParticipants; +import accord.local.cfk.CommandsForKey; +import accord.local.cfk.CommandsForKey.TxnInfo; +import accord.local.cfk.CommandsForKey.TxnInfoExtra; +import accord.primitives.AbstractKeys; +import accord.primitives.AbstractRanges; +import accord.primitives.Ballot; +import accord.primitives.Deps; +import accord.primitives.FullKeyRoute; +import accord.primitives.FullRangeRoute; +import accord.primitives.KeyDeps; +import accord.primitives.Keys; +import accord.primitives.PartialDeps; +import accord.primitives.PartialKeyRoute; +import accord.primitives.PartialRangeRoute; +import accord.primitives.PartialTxn; +import accord.primitives.Participants; +import accord.primitives.Range; +import accord.primitives.RangeDeps; +import accord.primitives.Ranges; +import accord.primitives.Routable.Domain; +import accord.primitives.RoutingKeys; +import accord.primitives.SaveStatus; +import accord.primitives.Seekable; +import accord.primitives.Seekables; +import accord.primitives.Timestamp; +import accord.primitives.Txn.Kind; +import accord.primitives.TxnId; +import accord.primitives.Unseekables; +import accord.primitives.Writes; +import accord.utils.ImmutableBitSet; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.dht.Murmur3Partitioner; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.service.accord.api.TokenKey; +import org.apache.cassandra.service.accord.api.PartitionKey; +import org.apache.cassandra.service.accord.serializers.ResultSerializers; +import org.apache.cassandra.service.accord.serializers.TableMetadatasAndKeys; +import org.apache.cassandra.service.accord.txn.AccordUpdate; +import org.apache.cassandra.service.accord.txn.TxnData; +import org.apache.cassandra.service.accord.txn.TxnQuery; +import org.apache.cassandra.service.accord.txn.TxnRead; +import org.apache.cassandra.service.accord.txn.TxnResult; +import org.apache.cassandra.service.accord.txn.TxnWrite; +import org.apache.cassandra.utils.ObjectSizes; + +import static accord.local.Command.Accepted.accepted; +import static accord.local.Command.Committed.committed; +import static accord.local.Command.Executed.executed; +import static accord.local.Command.NotAcceptedWithoutDefinition.notAccepted; +import static accord.local.Command.NotDefined.notDefined; +import static accord.local.Command.PreAccepted.preaccepted; +import static accord.local.Command.Truncated.*; +import static accord.local.cfk.CommandsForKey.InternalStatus.ACCEPTED; +import static accord.primitives.Status.Durability.NotDurable; +import static accord.primitives.TxnId.NO_TXNIDS; +import static org.apache.cassandra.utils.ObjectSizes.measure; + +public class AccordObjectSizes +{ + public static long key(Key key) + { + return ((PartitionKey) key).estimatedSizeOnHeap(); + } + + public static long key(RoutingKey key) + { + return ((TokenKey) key).estimatedSizeOnHeap(); + } + + private static final TableId EMPTY_ID = TableId.fromUUID(new UUID(0, 0)); + private static final long EMPTY_RANGE_SIZE = measure(TokenRange.fullRange(EMPTY_ID, Murmur3Partitioner.instance)); + public static long range(Range range) + { + return EMPTY_RANGE_SIZE + key(range.start()) + key(range.end()); + } + + public static long seekable(Seekable seekable) + { + switch (seekable.domain()) + { + default: throw new AssertionError(); + case Key: return key((Key) seekable); + case Range: return range((Range) seekable); + } + } + + private static final long EMPTY_RANGES_SIZE = measure(Ranges.of()); + public static long ranges(Ranges ranges) + { + long size = EMPTY_RANGES_SIZE; + int numberOfRanges = ranges.size(); + size += ObjectSizes.sizeOfReferenceArray(numberOfRanges); + if (numberOfRanges > 1 && DatabaseDescriptor.getPartitioner().isFixedLength()) + return size + numberOfRanges * range(ranges.get(0)); + + for (int i = 0 ; i < numberOfRanges ; i++) + size += range(ranges.get(i)); + return size; + } + + private static final long EMPTY_KEYS_SIZE = measure(Keys.of()); + public static long keys(Keys keys) + { + long size = EMPTY_KEYS_SIZE; + size += ObjectSizes.sizeOfReferenceArray(keys.size()); + for (int i=0, mi=keys.size(); i seekables) + { + switch (seekables.domain()) + { + default: throw new AssertionError(); + case Key: return keys((Keys) seekables); + case Range: return ranges((Ranges) seekables); + } + } + + private static long routingKeysOnly(AbstractKeys keys) + { + int numberOfKeys = keys.size(); + long size = ObjectSizes.sizeOfReferenceArray(numberOfKeys); + if (numberOfKeys > 1 && DatabaseDescriptor.getPartitioner().isFixedLength()) + return size + numberOfKeys * key(keys.get(0)); + + for (int i=0 ; i < numberOfKeys; i++) + size += key(keys.get(i)); + return size; + } + + private static final long EMPTY_ROUTING_KEYS_SIZE = measure(RoutingKeys.of()); + public static long routingKeys(RoutingKeys keys) + { + return EMPTY_ROUTING_KEYS_SIZE + routingKeysOnly(keys); + } + + private static final long EMPTY_FULL_KEY_ROUTE_SIZE = measure(new FullKeyRoute(new TokenKey(null, null), new RoutingKey[0])); + public static long fullKeyRoute(FullKeyRoute route) + { + return EMPTY_FULL_KEY_ROUTE_SIZE + + routingKeysOnly(route) + + key(route.homeKey()); // TODO (desired): we will probably dedup homeKey, serializer dependent, but perhaps this is an acceptable error + } + + private static final long EMPTY_PARTIAL_KEY_ROUTE_KEYS_SIZE = measure(new PartialKeyRoute(new TokenKey(null, null), new RoutingKey[0])); + public static long partialKeyRoute(PartialKeyRoute route) + { + return EMPTY_PARTIAL_KEY_ROUTE_KEYS_SIZE + + routingKeysOnly(route) + + key(route.homeKey()); + } + + public static long ranges(AbstractRanges ranges) + { + long size = ObjectSizes.sizeOfReferenceArray(ranges.size()); + for (int i=0, mi=ranges.size(); i unseekables) + { + switch (unseekables.kind()) + { + default: throw new AssertionError(); + case RoutingKeys: return routingKeys((RoutingKeys) unseekables); + case PartialKeyRoute: return partialKeyRoute((PartialKeyRoute) unseekables); + case FullKeyRoute: return fullKeyRoute((FullKeyRoute) unseekables); + case RoutingRanges: return ranges((Ranges) unseekables); + case PartialRangeRoute: return partialRangeRoute((PartialRangeRoute) unseekables); + case FullRangeRoute: return fullRangeRoute((FullRangeRoute) unseekables); + } + } + + private static final long EMPTY_TXN = measure(new PartialTxn.InMemory(null, null, null, null, null, TableMetadatasAndKeys.none(Domain.Key))); + public static long txn(PartialTxn txn) + { + long size = EMPTY_TXN; + size += seekables(txn.keys()); + size += ((TxnRead) txn.read()).estimatedSizeOnHeap(); + if (txn.update() != null) + size += ((AccordUpdate) txn.update()).estimatedSizeOnHeap(); + if (txn.query() != null) + size += ((TxnQuery) txn.query()).estimatedSizeOnHeap(); + return size; + } + + // don't count Id size, as should normally be shared + private static final long TIMESTAMP_SIZE = ObjectSizes.measure(Timestamp.fromBits(0, 0, new Node.Id(0))); + private static final long BALLOT_SIZE = ObjectSizes.measure(Ballot.ZERO); + + public static long timestamp() + { + return TIMESTAMP_SIZE; + } + + public static long timestamp(Timestamp timestamp) + { + return TIMESTAMP_SIZE; + } + + public static long ballot() + { + return BALLOT_SIZE; + } + + public static long ballot(Ballot ballot) + { + return ballot == Ballot.ZERO ? 0 : BALLOT_SIZE; + } + + private static final long EMPTY_DEPS_SIZE = ObjectSizes.measureDeep(Deps.NONE); + public static long dependencies(Deps dependencies) + { + // TODO (expected): this doesn't measure the backing arrays, is inefficient; + // doesn't account for txnIdToKeys, txnIdToRanges, and searchable fields; + // fix to accunt for, in case caching isn't redone + long size = EMPTY_DEPS_SIZE - EMPTY_KEYS_SIZE - ObjectSizes.sizeOfReferenceArray(0); + size += routingKeys(dependencies.keyDeps.keys()); + for (int i = 0 ; i < dependencies.rangeDeps.rangeCount() ; ++i) + size += range(dependencies.rangeDeps.range(i)); + size += ObjectSizes.sizeOfReferenceArray(dependencies.rangeDeps.rangeCount()); + + for (int i = 0 ; i < dependencies.keyDeps.txnIdCount() ; ++i) + size += timestamp(dependencies.keyDeps.txnId(i)); + for (int i = 0 ; i < dependencies.rangeDeps.txnIdCount() ; ++i) + size += timestamp(dependencies.rangeDeps.txnId(i)); + + size += KeyDeps.SerializerSupport.keysToTxnIdsCount(dependencies.keyDeps) * 4L; + size += RangeDeps.SerializerSupport.rangesToTxnIdsCount(dependencies.rangeDeps) * 4L; + return size; + } + + private static final long EMPTY_WRITES_SIZE = measure(new Writes(null, null, null, null)); + public static long writes(Writes writes) + { + long size = EMPTY_WRITES_SIZE; + size += timestamp(writes.executeAt); + size += seekables(writes.keys); + if (writes.write != null) + size += ((TxnWrite) writes.write).estimatedSizeOnHeap(); + return size; + } + + public static long results(Result result) + { + if (result == ResultSerializers.APPLIED) + return 0; + return ((TxnResult) result).estimatedSizeOnHeap(); + } + + private static class CommandEmptySizes + { + private final static TokenKey EMPTY_KEY = new TokenKey(EMPTY_ID, null); + private final static TxnId EMPTY_TXNID = new TxnId(42, 42, 0, Kind.Read, Domain.Key, new Node.Id(42)); + + private static ICommand attrs(boolean hasDeps, boolean hasTxn, boolean executes) + { + FullKeyRoute route = new FullKeyRoute(EMPTY_KEY, new RoutingKey[]{ EMPTY_KEY }); + Participants empty = route.slice(0, 0); + ICommand.Builder builder = new ICommand.Builder(EMPTY_TXNID) + .setParticipants(StoreParticipants.create(route, empty, executes ? empty : null, executes ? empty : null, empty, route)) + .durability(NotDurable) + .executeAt(EMPTY_TXNID) + .promised(Ballot.ZERO); + if (hasDeps) + builder.partialDeps(PartialDeps.NONE); + + if (hasTxn) + builder.partialTxn(new PartialTxn.InMemory(null, null, null, null, null, TableMetadatasAndKeys.none(Domain.Key))); + + if (executes) + { + builder.waitingOn(WaitingOn.empty(Domain.Key)); + builder.result(new TxnData()); + } + + return builder; + } + + final static long NOT_DEFINED = measure(notDefined(attrs(false, false, false))); + final static long PREACCEPTED = measure(preaccepted(attrs(false, true, false), SaveStatus.PreAccepted)); + final static long NOTACCEPTED = measure(notAccepted(attrs(false, false, false), SaveStatus.AcceptedInvalidate)); + final static long ACCEPTED = measure(accepted(attrs(true, false, false), SaveStatus.AcceptedMedium)); + final static long COMMITTED = measure(committed(attrs(true, true, false), SaveStatus.Committed)); + final static long EXECUTED = measure(executed(attrs(true, true, true), SaveStatus.Applied)); + // TODO (expected): TruncatedAwaitsOnlyDeps + final static long TRUNCATED = measure(vestigial(EMPTY_TXNID, attrs(false, false, false).participants())); + final static long INVALIDATED = measure(invalidated(EMPTY_TXNID, attrs(false, false, false).participants())); + + private static long emptySize(Command command) + { + switch (command.saveStatus()) + { + case Uninitialised: + case NotDefined: + return NOT_DEFINED; + case PreAccepted: + case PreAcceptedWithDeps: + case PreAcceptedWithVote: + return PREACCEPTED; + case AcceptedInvalidate: + return NOTACCEPTED; + case AcceptedInvalidateWithDefinition: + case AcceptedMedium: + case AcceptedMediumWithDefinition: + case AcceptedSlow: + case AcceptedSlowWithDefinition: + case PreCommitted: + case PreCommittedWithDeps: + case PreCommittedWithFixedDeps: + case PreCommittedWithDefinition: + case PreCommittedWithDefAndDeps: + case PreCommittedWithDefAndFixedDeps: + return ACCEPTED; + case Committed: + case ReadyToExecute: + case Stable: + return COMMITTED; + case PreApplied: + case Applying: + case Applied: + return EXECUTED; + case TruncatedApply: + case TruncatedUnapplied: + case TruncatedApplyWithOutcome: + case Vestigial: + case Erased: + return TRUNCATED; + case Invalidated: + return INVALIDATED; + default: + throw new IllegalStateException("Unhandled status " + command.status()); + } + } + } + + private static long sizeNullable(T value, ToLongFunction measure) + { + if (value == null) + return 0; + return measure.applyAsLong(value); + } + + public static long command(Command command) + { + long size = CommandEmptySizes.emptySize(command); + size += sizeNullable(command.route(), AccordObjectSizes::route); + size += sizeNullable(command.promised(), AccordObjectSizes::timestamp); + size += sizeNullable(command.executeAt(), AccordObjectSizes::timestamp); + size += sizeNullable(command.partialTxn(), AccordObjectSizes::txn); + size += sizeNullable(command.partialDeps(), AccordObjectSizes::dependencies); + size += sizeNullable(command.acceptedOrCommitted(), AccordObjectSizes::timestamp); + size += sizeNullable(command.writes(), AccordObjectSizes::writes); + size += sizeNullable(command.result(), AccordObjectSizes::results); + size += sizeNullable(command.waitingOn(), AccordObjectSizes::waitingOn); + return size; + } + + private static long EMPTY_WAITING_ON_SIZE = measure(new WaitingOn(null, null, null, null)); + private static long EMPTY_BIT_SET_SIZE = measure(new ImmutableBitSet(0)); + private static long waitingOn(WaitingOn waitingOn) + { + // TODO (desired): this doesn't correctly account for object padding of bitset arrays + long size = EMPTY_WAITING_ON_SIZE + EMPTY_BIT_SET_SIZE + (waitingOn.waitingOn.size() * 8L); + if (waitingOn.appliedOrInvalidated != null) + size += EMPTY_BIT_SET_SIZE + (waitingOn.appliedOrInvalidated.size() * 8L); + return size; + } + + private static long EMPTY_CFK_SIZE = measure(new CommandsForKey(null)); + private static long EMPTY_INFO_SIZE = measure(CommandsForKey.NO_INFO); + private static long EMPTY_INFO_EXTRA_ADDITIONAL_SIZE = measure(TxnInfo.create(TxnId.NONE, ACCEPTED, false, TxnId.NONE, NO_TXNIDS, Ballot.MAX)) - EMPTY_INFO_SIZE; + public static long commandsForKey(CommandsForKey cfk) + { + long size = EMPTY_CFK_SIZE; + size += key(cfk.key()); + size += ObjectSizes.sizeOfReferenceArray(cfk.size()); + size += cfk.size() * EMPTY_INFO_SIZE; + for (int i = 0 ; i < cfk.size() ; ++i) + { + TxnInfo info = cfk.get(i); + if (info.executeAt != info) size += TIMESTAMP_SIZE; + if (info.getClass() != TxnInfoExtra.class) continue; + TxnInfoExtra infoExtra = (TxnInfoExtra) info; + if (infoExtra.missing.length > 0) + { + size += EMPTY_INFO_EXTRA_ADDITIONAL_SIZE; + size += ObjectSizes.sizeOfReferenceArray(infoExtra.missing.length); + size += infoExtra.missing.length * TIMESTAMP_SIZE; + size += ballot(infoExtra.ballot); + } + } + return size; + } +} diff --git a/src/java/org/apache/cassandra/service/accord/AccordOperations.java b/src/java/org/apache/cassandra/service/accord/AccordOperations.java new file mode 100644 index 000000000000..e7820919e170 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/AccordOperations.java @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; + +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.tcm.ClusterMetadataService; +import org.apache.cassandra.tcm.membership.NodeId; +import org.apache.cassandra.tcm.transformations.AccordMarkStale; +import org.apache.cassandra.tcm.transformations.AccordMarkRejoining; +import org.apache.cassandra.utils.MBeanWrapper; + +public class AccordOperations implements AccordOperationsMBean +{ + public static final String MBEAN_OBJECT_NAME = "org.apache.cassandra.service.accord:type=AccordOperations"; + public static final AccordOperations instance = new AccordOperations(ClusterMetadataService.instance()); + + private final ClusterMetadataService cms; + + public static void initJmx() + { + MBeanWrapper.instance.registerMBean(instance, MBEAN_OBJECT_NAME); + } + + private AccordOperations(ClusterMetadataService cms) + { + this.cms = cms; + } + + @Override + public Map describe() + { + Map info = new HashMap<>(); + ClusterMetadata metadata = ClusterMetadata.current(); + + info.put("EPOCH", Long.toString(metadata.epoch.getEpoch())); + String staleReplicas = metadata.accordStaleReplicas.ids().stream().sorted().map(Object::toString).collect(Collectors.joining(",")); + info.put("STALE_REPLICAS", staleReplicas); + return info; + } + + @Override + public void accordMarkStale(List nodeIdStrings) + { + Set nodeIds = nodeIdStrings.stream().map(NodeId::fromString).collect(Collectors.toSet()); + cms.commit(new AccordMarkStale(nodeIds)); + } + + @Override + public void accordMarkRejoining(List nodeIdStrings) + { + Set nodeIds = nodeIdStrings.stream().map(NodeId::fromString).collect(Collectors.toSet()); + cms.commit(new AccordMarkRejoining(nodeIds)); + } +} diff --git a/src/java/org/apache/cassandra/service/accord/AccordOperationsMBean.java b/src/java/org/apache/cassandra/service/accord/AccordOperationsMBean.java new file mode 100644 index 000000000000..e0b0884733c3 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/AccordOperationsMBean.java @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import java.util.List; +import java.util.Map; + +public interface AccordOperationsMBean +{ + Map describe(); + + void accordMarkStale(List nodeIds); + + void accordMarkRejoining(List nodeIds); +} diff --git a/src/java/org/apache/cassandra/service/accord/AccordResponseVerbHandler.java b/src/java/org/apache/cassandra/service/accord/AccordResponseVerbHandler.java new file mode 100644 index 000000000000..330043d35157 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/AccordResponseVerbHandler.java @@ -0,0 +1,91 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.service.accord; + +import java.util.concurrent.TimeUnit; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import accord.coordinate.Timeout; +import accord.impl.RequestCallbacks; +import accord.local.Node; +import accord.messages.Reply; +import org.apache.cassandra.exceptions.RequestFailure; +import org.apache.cassandra.exceptions.RequestFailureReason; +import org.apache.cassandra.net.IVerbHandler; +import org.apache.cassandra.net.Message; +import org.apache.cassandra.net.MessagingService; +import org.apache.cassandra.tracing.Tracing; +import org.apache.cassandra.utils.NoSpamLogger; +import org.apache.cassandra.utils.NoSpamLogger.NoSpamLogStatement; + +import static java.util.concurrent.TimeUnit.NANOSECONDS; +import static org.apache.cassandra.utils.MonotonicClock.Global.approxTime; + +class AccordResponseVerbHandler implements IVerbHandler +{ + private static final Logger logger = LoggerFactory.getLogger(AccordResponseVerbHandler.class); + private static final NoSpamLogStatement dropping = NoSpamLogger.getStatement(logger, "Dropping response {} from {}", 1L, TimeUnit.SECONDS); + + private final RequestCallbacks callbacks; + private final AccordEndpointMapper endpointMapper; + + AccordResponseVerbHandler(RequestCallbacks callbacks, AccordEndpointMapper endpointMapper) + { + this.callbacks = callbacks; + this.endpointMapper = endpointMapper; + } + + @Override + public void doVerb(Message message) + { + if (!AccordService.instance().shouldAcceptMessages()) + { + dropping.debug(message.verb(), message.from()); + return; + } + + Node.Id from = endpointMapper.mappedId(message.from()); + logger.trace("Receiving {} from {}", message.payload, message.from()); + if (message.isFailureResponse()) + { + Tracing.trace("Processing failure response from {}", message.from()); + callbacks.onFailure(message.id(), from, convertFailureMessage((RequestFailure) message.payload)); + } + else + { + Tracing.trace("Processing response from {}", message.from()); + boolean remove = !(message.payload instanceof Reply) || ((Reply) message.payload).isFinal(); + RequestCallbacks.CallbackEntry cbe = callbacks.onSuccess(message.id(), from, message.payload, remove); + if (cbe == null) + return; + + long latencyNanos = approxTime.now() - cbe.registeredAt(NANOSECONDS); + MessagingService.instance().latencySubscribers.add(message.from(), latencyNanos, NANOSECONDS); + } + } + + private static Throwable convertFailureMessage(RequestFailure failure) + { + return failure.reason == RequestFailureReason.TIMEOUT ? + new Timeout(null, null) : + new RuntimeException(failure.failure); + } + +} diff --git a/src/java/org/apache/cassandra/service/accord/AccordResult.java b/src/java/org/apache/cassandra/service/accord/AccordResult.java new file mode 100644 index 000000000000..86315b6d6028 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/AccordResult.java @@ -0,0 +1,229 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import java.util.HashSet; +import java.util.Iterator; +import java.util.Set; +import java.util.function.BiConsumer; +import javax.annotation.Nullable; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import accord.coordinate.CoordinationFailed; +import accord.coordinate.Exhausted; +import accord.coordinate.Preempted; +import accord.coordinate.Timeout; +import accord.coordinate.TopologyMismatch; +import accord.primitives.Seekable; +import accord.primitives.Seekables; +import accord.primitives.TxnId; +import accord.utils.UnhandledEnum; +import org.apache.cassandra.exceptions.InvalidRequestException; +import org.apache.cassandra.exceptions.RequestExecutionException; +import org.apache.cassandra.schema.Schema; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.service.accord.api.AccordAgent; +import org.apache.cassandra.service.accord.api.PartitionKey; +import org.apache.cassandra.service.accord.txn.RetryWithNewProtocolResult; +import org.apache.cassandra.tracing.Tracing; +import org.apache.cassandra.utils.concurrent.AsyncFuture; + +import static org.apache.cassandra.utils.Clock.Global.nanoTime; + +public class AccordResult extends AsyncFuture implements BiConsumer, IAccordService.IAccordResult +{ + private static final Logger logger = LoggerFactory.getLogger(AccordResult.class); + + final @Nullable TxnId txnId; + final Seekables keysOrRanges; + final RequestBookkeeping bookkeeping; + final long startedAtNanos, deadlineAtNanos; + final boolean isTxnRequest; + + public AccordResult(@Nullable TxnId txnId, Seekables keysOrRanges, RequestBookkeeping bookkeeping, long startedAtNanos, long deadlineAtNanos, boolean isTxnRequest) + { + this.txnId = txnId; + this.keysOrRanges = keysOrRanges; + this.bookkeeping = bookkeeping; + this.startedAtNanos = startedAtNanos; + this.deadlineAtNanos = deadlineAtNanos; + this.isTxnRequest = isTxnRequest; + } + + @Override + public V awaitAndGet() throws RequestExecutionException + { + try + { + if (!awaitUntil(deadlineAtNanos)) + accept(null, new Timeout(txnId, null)); + } + catch (InterruptedException e) + { + accept(null, e); + } + + Throwable fail = fail(); + if (fail != null) + throw (RequestExecutionException) fail; + return success(); + } + + @Override + public void accept(V success, Throwable fail) + { + if (fail != null) + { + RequestExecutionException report; + CoordinationFailed coordinationFailed = findCoordinationFailed(fail); + TxnId txnId = this.txnId; + if (coordinationFailed != null) + { + if (txnId == null && coordinationFailed.txnId() != null) + txnId = coordinationFailed.txnId(); + + if (coordinationFailed instanceof Timeout) + { + report = bookkeeping.newTimeout(txnId, keysOrRanges); + } + else if (coordinationFailed instanceof Preempted) + { + report = bookkeeping.newPreempted(txnId, keysOrRanges); + } + else if (coordinationFailed instanceof Exhausted) + { + report = bookkeeping.newExhausted(txnId, keysOrRanges); + } + else if (isTxnRequest && coordinationFailed instanceof TopologyMismatch) + { + // Excluding bugs topology mismatch can occur because a table was dropped in between creating the txn + // and executing it. + // It could also race with the table stopping/starting being managed by Accord. + // The caller can retry if the table indeed exists and is managed by Accord. + Set txnDroppedTables = txnDroppedTables(keysOrRanges); + Tracing.trace("Accord returned topology mismatch: " + coordinationFailed.getMessage()); + logger.debug("Accord returned topology mismatch", coordinationFailed); + bookkeeping.markTopologyMismatch(); + // Throw IRE in case the caller fails to check if the table still exists + if (!txnDroppedTables.isEmpty()) + { + Tracing.trace("Accord txn uses dropped tables {}", txnDroppedTables); + logger.debug("Accord txn uses dropped tables {}", txnDroppedTables); + throw new InvalidRequestException("Accord transaction uses dropped tables"); + } + trySuccess((V) RetryWithNewProtocolResult.instance); + return; + } + else + { + report = bookkeeping.newFailed(txnId, keysOrRanges); + } + // this case happens when a non-timeout exception is seen, and we are unable to move forward + if (txnId != null && txnId.isSyncPoint()) + AccordAgent.onFailedBarrier(txnId, fail); + } + else + { + report = bookkeeping.newFailed(txnId, keysOrRanges); + } + report.addSuppressed(fail); + tryFailure(report); + } + else + { + if (success == RetryWithNewProtocolResult.instance) + { + bookkeeping.markRetryDifferentSystem(); + Tracing.trace("Got retry different system error from Accord, will retry"); + } + trySuccess(success); + } + bookkeeping.markElapsedNanos(nanoTime() - startedAtNanos); + } + + @Override + public boolean awaitUntil(long nanoTimeDeadline) throws InterruptedException + { + if (super.awaitUntil(nanoTimeDeadline)) + return true; + + accept(null, new Timeout(null, null)); + return false; + } + + public Throwable fail() + { + return cause(); + } + + public V success() + { + return getNow(); + } + + @Override + public AccordResult addCallback(BiConsumer callback) + { + super.addCallback(callback); + return this; + } + + private static CoordinationFailed findCoordinationFailed(Throwable fail) + { + while (fail != null) + { + if (fail instanceof CoordinationFailed) + return (CoordinationFailed) fail; + Throwable next = fail.getCause(); + if (next == fail) + return null; + fail = next; + } + return null; + } + + private static Set txnDroppedTables(Seekables keys) + { + Set tables = new HashSet<>(); + for (Seekable seekable : keys) + { + switch (seekable.domain()) + { + default: UnhandledEnum.unknown(seekable.domain()); + case Key: + tables.add(((PartitionKey) seekable).table()); + break; + case Range: + tables.add(((TokenRange) seekable).table()); + break; + } + } + + Iterator tablesIterator = tables.iterator(); + while (tablesIterator.hasNext()) + { + TableId table = tablesIterator.next(); + if (Schema.instance.getTableMetadata(table) != null) + tablesIterator.remove(); + } + return tables; + } +} diff --git a/src/java/org/apache/cassandra/service/accord/AccordSafeCommand.java b/src/java/org/apache/cassandra/service/accord/AccordSafeCommand.java new file mode 100644 index 000000000000..162a8d326bab --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/AccordSafeCommand.java @@ -0,0 +1,150 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import java.util.Objects; + +import com.google.common.annotations.VisibleForTesting; + +import accord.api.Journal; +import accord.local.Command; +import accord.local.SafeCommand; +import accord.primitives.TxnId; +import org.apache.cassandra.utils.concurrent.Ref; + +public class AccordSafeCommand extends SafeCommand implements AccordSafeState +{ + public static class DebugAccordSafeCommand extends AccordSafeCommand + { + final Ref selfRef; + public DebugAccordSafeCommand(AccordCacheEntry global) + { + super(global); + selfRef = new Ref<>(this, null); + selfRef.debug(global.key().toString()); + } + + @Override + public void invalidate() + { + super.invalidate(); + selfRef.release(); + } + + public static void trace(AccordSafeCommand safeCommand, String message) + { + ((DebugAccordSafeCommand)safeCommand).selfRef.debug(message); + } + } + + private boolean invalidated; + private final AccordCacheEntry global; + private Command original; + private Command current; + + public AccordSafeCommand(AccordCacheEntry global) + { + super(global.key()); + this.global = global; + this.original = null; + this.current = null; + } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + AccordSafeCommand that = (AccordSafeCommand) o; + return Objects.equals(original, that.original) && Objects.equals(current, that.current); + } + + @Override + public int hashCode() + { + throw new UnsupportedOperationException(); + } + + @Override + public String toString() + { + return "AccordSafeCommand{" + + "invalidated=" + invalidated + + ", global=" + global + + ", original=" + original + + ", current=" + current + + '}'; + } + + @Override + public AccordCacheEntry global() + { + checkNotInvalidated(); + return global; + } + + @Override + public Command current() + { + checkNotInvalidated(); + return current; + } + + @Override + @VisibleForTesting + public void set(Command command) + { + checkNotInvalidated(); + this.current = command; + } + + @Override + public Command original() + { + checkNotInvalidated(); + return original; + } + + public Journal.CommandUpdate update() + { + return new Journal.CommandUpdate(original, current); + } + + @Override + public void preExecute() + { + checkNotInvalidated(); + original = global.getExclusive(); + current = original; + if (isUnset()) + uninitialised(); + } + + @Override + public void invalidate() + { + invalidated = true; + } + + @Override + public boolean invalidated() + { + return invalidated; + } +} diff --git a/src/java/org/apache/cassandra/service/accord/AccordSafeCommandStore.java b/src/java/org/apache/cassandra/service/accord/AccordSafeCommandStore.java new file mode 100644 index 000000000000..bc7c7cf8418e --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/AccordSafeCommandStore.java @@ -0,0 +1,240 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import java.util.Collections; +import java.util.Map; +import java.util.Set; +import java.util.function.Predicate; +import javax.annotation.Nullable; + +import com.google.common.annotations.VisibleForTesting; + +import accord.api.Agent; +import accord.api.DataStore; +import accord.api.Journal.FieldUpdates; +import accord.api.ProgressLog; +import accord.api.RoutingKey; +import accord.impl.AbstractSafeCommandStore; +import accord.local.CommandStores; +import accord.local.NodeCommandStoreService; +import accord.local.cfk.CommandsForKey; +import accord.local.cfk.SafeCommandsForKey; +import accord.primitives.Timestamp; +import accord.primitives.Txn; +import accord.primitives.TxnId; +import accord.primitives.Unseekables; +import org.apache.cassandra.service.accord.AccordCommandStore.ExclusiveCaches; +import org.apache.cassandra.service.accord.AccordCommandStore.SafeRedundantBefore; + +import static accord.utils.Invariants.illegalState; + +public class AccordSafeCommandStore extends AbstractSafeCommandStore +{ + final AccordTask task; + private final @Nullable CommandsForRanges commandsForRanges; + private final AccordCommandStore commandStore; + + private AccordSafeCommandStore(AccordTask task, + @Nullable CommandsForRanges commandsForRanges, + AccordCommandStore commandStore) + { + super(task.preLoadContext(), commandStore); + this.task = task; + this.commandsForRanges = commandsForRanges; + this.commandStore = commandStore; + commandStore.updateRangesForEpoch(this); + } + + @Override + public CommandStores.RangesForEpoch ranges() + { + CommandStores.RangesForEpoch ranges = super.ranges(); + if (ranges != null) + return ranges; + + return commandStore.unsafeGetRangesForEpoch(); + } + + public static AccordSafeCommandStore create(AccordTask operation, + @Nullable CommandsForRanges commandsForRanges, + AccordCommandStore commandStore) + { + return new AccordSafeCommandStore(operation, commandsForRanges, commandStore); + } + + @VisibleForTesting + public Set commandsForKeysKeys() + { + if (task.commandsForKey() == null) + return Collections.emptySet(); + return task.commandsForKey().keySet(); + } + + @Override + protected AccordSafeCommand getInternal(TxnId txnId) + { + Map commands = task.commands(); + if (commands == null) + return null; + return commands.get(txnId); + } + + @Override + protected ExclusiveCaches tryGetCaches() + { + return commandStore.tryLockCaches(); + } + + protected AccordSafeCommand add(AccordSafeCommand safeCommand, ExclusiveCaches caches) + { + Object check = task.ensureCommands().putIfAbsent(safeCommand.txnId(), safeCommand); + if (check == null) + { + safeCommand.preExecute(); + return safeCommand; + } + else + { + caches.commands().release(safeCommand, task); + throw illegalState("Attempted to take a duplicate reference to %s", safeCommand.txnId()); + } + } + + @Override + protected void persistFieldUpdates() + { + super.persistFieldUpdates(); + } + + protected void persistFieldUpdatesInternal(Runnable onDone) + { + FieldUpdates updates = fieldUpdates(); + if (updates == null) + return; + + if (updates.newRedundantBefore != null) + { + long ticket = AccordCommandStore.nextSafeRedundantBeforeTicket.incrementAndGet(); + SafeRedundantBefore update = new SafeRedundantBefore(ticket, updates.newRedundantBefore); + Runnable reportRedundantBefore = () -> { + AccordCommandStore.safeRedundantBeforeUpdater.accumulateAndGet((AccordCommandStore)commandStore, update, SafeRedundantBefore::max); + }; + Runnable prevOnDone = onDone; + onDone = prevOnDone == null ? reportRedundantBefore : () -> { + try { reportRedundantBefore.run(); } + finally { prevOnDone.run(); } + }; + } + commandStore.persistFieldUpdates(updates, onDone); + } + + protected AccordSafeCommandsForKey add(AccordSafeCommandsForKey safeCfk, ExclusiveCaches caches) + { + Object check = task.ensureCommandsForKey().putIfAbsent(safeCfk.key(), safeCfk); + if (check == null) + { + safeCfk.preExecute(); + return safeCfk; + } + else + { + caches.commandsForKeys().release(safeCfk, task); + throw illegalState("Attempted to take a duplicate reference to CFK for %s", safeCfk.key()); + } + } + + @Override + protected AccordSafeCommandsForKey getInternal(RoutingKey key) + { + Map commandsForKey = task.commandsForKey(); + if (commandsForKey == null) + return null; + return commandsForKey.get(key); + } + + @Override + public AccordCommandStore commandStore() + { + return commandStore; + } + + @Override + public DataStore dataStore() + { + return commandStore().dataStore(); + } + + @Override + public Agent agent() + { + return commandStore.agent(); + } + + @Override + public ProgressLog progressLog() + { + return commandStore().progressLog(); + } + + @Override + public NodeCommandStoreService node() + { + return commandStore.node(); + } + + private boolean visitForKey(Unseekables keysOrRanges, Predicate forEach) + { + Map commandsForKey = task.commandsForKey; + if (commandsForKey == null) + return true; + + Unseekables skip = context.keys().without(keysOrRanges); + for (SafeCommandsForKey safeCfk : commandsForKey.values()) + { + if (skip.contains(safeCfk.key())) + continue; + + if (!forEach.test(safeCfk.current())) + return false; + } + return true; + } + + @Override + public void visit(Unseekables keysOrRanges, Timestamp startedBefore, Txn.Kind.Kinds testKind, ActiveCommandVisitor visitor, P1 p1, P2 p2) + { + visitForKey(keysOrRanges, cfk -> { cfk.visit(startedBefore, testKind, visitor, p1, p2); return true; }); + if (commandsForRanges != null) + commandsForRanges.visit(keysOrRanges, startedBefore, testKind, visitor, p1, p2); + } + + @Override + public boolean visit(Unseekables keysOrRanges, TxnId testTxnId, Txn.Kind.Kinds testKind, TestStartedAt testStartedAt, Timestamp testStartedAtTimestamp, ComputeIsDep computeIsDep, AllCommandVisitor visit) + { + return visitForKey(keysOrRanges, cfk -> cfk.visit(testTxnId, testKind, testStartedAt, testStartedAtTimestamp, computeIsDep, null, visit)) + && (commandsForRanges == null || commandsForRanges.visit(keysOrRanges, testTxnId, testKind, testStartedAt, testStartedAtTimestamp, computeIsDep, visit)); + } + + @Override + public String toString() + { + return "AccordSafeCommandStore(id=" + commandStore().id() + ")"; + } +} \ No newline at end of file diff --git a/src/java/org/apache/cassandra/service/accord/AccordSafeCommandsForKey.java b/src/java/org/apache/cassandra/service/accord/AccordSafeCommandsForKey.java new file mode 100644 index 000000000000..634abda68b18 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/AccordSafeCommandsForKey.java @@ -0,0 +1,133 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import java.util.Objects; + +import com.google.common.annotations.VisibleForTesting; + +import accord.api.RoutingKey; +import accord.local.cfk.CommandsForKey; +import accord.local.cfk.SafeCommandsForKey; + +public class AccordSafeCommandsForKey extends SafeCommandsForKey implements AccordSafeState +{ + private boolean invalidated; + private final AccordCacheEntry global; + private CommandsForKey original; + private CommandsForKey current; + + public AccordSafeCommandsForKey(AccordCacheEntry global) + { + super(global.key()); + this.global = global; + this.original = null; + this.current = null; + } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + AccordSafeCommandsForKey that = (AccordSafeCommandsForKey) o; + return Objects.equals(original, that.original) && Objects.equals(current, that.current); + } + + @Override + public int hashCode() + { + throw new UnsupportedOperationException(); + } + + @Override + public String toString() + { + return "AccordSafeCommandsForKey{" + + "invalidated=" + invalidated + + ", global=" + global + + ", original=" + original + + ", current=" + current + + '}'; + } + + @Override + public boolean hasUpdate() + { + boolean hasUpdate = AccordSafeState.super.hasUpdate(); + + // cfk initialization is legal, but doesn't need to be propagated to the cache (and would + // cause an exception to be thrown if it were). Making an exception on the cache side could + // throw away applied cfk updates as well, so it's special cased here + if (hasUpdate && original == null && current != null && current.size() == 0) + return false; + + return hasUpdate; + } + + @Override + public AccordCacheEntry global() + { + checkNotInvalidated(); + return global; + } + + @Override + public CommandsForKey current() + { + checkNotInvalidated(); + return current; + } + + @Override + @VisibleForTesting + public void set(CommandsForKey cfk) + { + checkNotInvalidated(); + this.current = cfk; + } + + public CommandsForKey original() + { + checkNotInvalidated(); + return original; + } + + @Override + public void preExecute() + { + checkNotInvalidated(); + original = global.getExclusive(); + current = original; + if (isUnset()) + initialize(); + } + + @Override + public void invalidate() + { + invalidated = true; + } + + @Override + public boolean invalidated() + { + return invalidated; + } +} diff --git a/src/java/org/apache/cassandra/service/accord/AccordSafeState.java b/src/java/org/apache/cassandra/service/accord/AccordSafeState.java new file mode 100644 index 000000000000..66b3eb1d1d99 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/AccordSafeState.java @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.service.accord; + +import accord.impl.SafeState; +import accord.utils.async.Cancellable; + +public interface AccordSafeState extends SafeState +{ + void set(V update); + V original(); + void invalidate(); + boolean invalidated(); + void preExecute(); + + AccordCacheEntry global(); + + default boolean hasUpdate() + { + return original() != current(); + } + + default void revert() + { + set(original()); + } + + default K key() + { + return global().key(); + } + + default Cancellable saving() + { + return global().saving(); + } + + default Throwable failure() + { + return global().failure(); + } + + default void checkNotInvalidated() + { + if (invalidated()) + throw new IllegalStateException("Cannot access invalidated " + this); + } +} diff --git a/src/java/org/apache/cassandra/service/accord/AccordSegmentCompactor.java b/src/java/org/apache/cassandra/service/accord/AccordSegmentCompactor.java new file mode 100644 index 000000000000..45633619a4c4 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/AccordSegmentCompactor.java @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.service.accord; + +import org.apache.cassandra.db.ColumnFamilyStore; +import org.apache.cassandra.db.SerializationHeader; +import org.apache.cassandra.db.rows.EncodingStats; +import org.apache.cassandra.io.sstable.Descriptor; +import org.apache.cassandra.io.sstable.SSTableTxnWriter; +import org.apache.cassandra.service.accord.serializers.Version; + +/** + * Segment compactor: takes static segments and compacts them into a single SSTable. + */ +public class AccordSegmentCompactor extends AbstractAccordSegmentCompactor +{ + private SSTableTxnWriter writer; + + public AccordSegmentCompactor(Version userVersion, ColumnFamilyStore cfs) + { + super(userVersion, cfs); + } + + @Override + void initializeWriter() + { + Descriptor descriptor = cfs.newSSTableDescriptor(cfs.getDirectories().getDirectoryForNewSSTables()); + SerializationHeader header = new SerializationHeader(true, cfs.metadata(), cfs.metadata().regularAndStaticColumns(), EncodingStats.NO_STATS); + + this.writer = SSTableTxnWriter.create(cfs, descriptor, 0, 0, null, false, header); + } + + @Override + SSTableTxnWriter writer() + { + return writer; + } + + @Override + void finishAndAddWriter() + { + cfs.addSSTables(writer.finish(true)); + writer.close(); + writer = null; + } + + @Override + Throwable cleanupWriter(Throwable t) + { + return writer.abort(t); + } +} + diff --git a/src/java/org/apache/cassandra/service/accord/AccordSerializers.java b/src/java/org/apache/cassandra/service/accord/AccordSerializers.java new file mode 100644 index 000000000000..2bae9c8a0c84 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/AccordSerializers.java @@ -0,0 +1,212 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import java.io.IOException; +import java.nio.ByteBuffer; + +import accord.utils.VIntCoding; +import org.apache.cassandra.cql3.terms.MultiElements; +import org.apache.cassandra.cql3.terms.Term; +import org.apache.cassandra.db.ArrayClustering; +import org.apache.cassandra.db.Clustering; +import org.apache.cassandra.db.ClusteringPrefix; +import org.apache.cassandra.db.ConsistencyLevel; +import org.apache.cassandra.db.marshal.AbstractType; +import org.apache.cassandra.db.marshal.CollectionType; +import org.apache.cassandra.db.marshal.ListType; +import org.apache.cassandra.db.marshal.MapType; +import org.apache.cassandra.db.marshal.SetType; +import org.apache.cassandra.db.marshal.ValueAccessor; +import org.apache.cassandra.io.AsymmetricVersionedSerializer; +import org.apache.cassandra.io.EmbeddedAsymmetricVersionedSerializer; +import org.apache.cassandra.io.ParameterisedUnversionedSerializer; +import org.apache.cassandra.io.UnversionedSerializer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.schema.ColumnMetadata; +import org.apache.cassandra.schema.Schema; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.service.accord.serializers.IVersionedSerializer; +import org.apache.cassandra.service.accord.serializers.Version; + +import static org.apache.cassandra.db.TypeSizes.sizeof; +import static org.apache.cassandra.db.TypeSizes.sizeofUnsignedVInt; +import static org.apache.cassandra.db.marshal.CollectionType.Kind.LIST; +import static org.apache.cassandra.db.marshal.CollectionType.Kind.MAP; +import static org.apache.cassandra.db.marshal.CollectionType.Kind.SET; + +public class AccordSerializers +{ + public static EmbeddedAsymmetricVersionedSerializer embedded(Version version, AsymmetricVersionedSerializer serializer) + { + return new EmbeddedAsymmetricVersionedSerializer<>(version, Version.Serializer.instance, serializer); + } + + public static Term.Terminal deserializeCqlCollectionAsTerm(ByteBuffer buffer, AbstractType type) + { + CollectionType collectionType = (CollectionType) type; + + if (collectionType.kind == SET) + return MultiElements.Value.fromSerialized(buffer, (SetType) type); + else if (collectionType.kind == LIST) + return MultiElements.Value.fromSerialized(buffer, (ListType) type); + else if (collectionType.kind == MAP) + return MultiElements.Value.fromSerialized(buffer, (MapType) type); + + throw new UnsupportedOperationException("Unsupported collection type: " + type); + } + + public static final ParameterisedUnversionedSerializer columnMetadataSerializer = new ParameterisedUnversionedSerializer<>() + { + @Override + public void serialize(ColumnMetadata column, TableMetadata table, DataOutputPlus out) throws IOException + { + out.writeUnsignedVInt32(column.uniqueId); + } + + @Override + public ColumnMetadata deserialize(TableMetadata table, DataInputPlus in) throws IOException + { + return table.getColumnById(in.readUnsignedVInt32()); + } + + @Override + public long serializedSize(ColumnMetadata column, TableMetadata table) + { + return VIntCoding.sizeOfUnsignedVInt(column.uniqueId); + } + }; + + public static final IVersionedSerializer tableMetadataSerializer = new IVersionedSerializer<>() + { + @Override + public void serialize(TableMetadata metadata, DataOutputPlus out, Version version) throws IOException + { + metadata.id.serializeCompact(out); + } + + @Override + public TableMetadata deserialize(DataInputPlus in, Version version) throws IOException + { + return Schema.instance.getTableMetadata(TableId.deserializeCompact(in)); + } + + @Override + public long serializedSize(TableMetadata metadata, Version version) + { + return metadata.id.serializedCompactSize(); + } + }; + + public static final UnversionedSerializer> clusteringSerializer = new UnversionedSerializer>() + { + @Override + public void serialize(Clustering clustering, DataOutputPlus out) throws IOException + { + doSerialize(clustering, out); + } + + private void doSerialize(Clustering clustering, DataOutputPlus out) throws IOException + { + if (clustering.kind() == ClusteringPrefix.Kind.STATIC_CLUSTERING) + { + out.writeBoolean(true); + } + else + { + out.writeBoolean(false); + out.writeUnsignedVInt32(clustering.size()); + ValueAccessor accessor = clustering.accessor(); + for (int i = 0; i < clustering.size(); i++) + { + accessor.writeWithVIntLength(clustering.get(i), out); + } + } + } + + @Override + public Clustering deserialize(DataInputPlus in) throws IOException + { + Clustering clustering; + if (in.readBoolean()) + { + clustering = Clustering.STATIC_CLUSTERING; + } + else + { + int numComponents = in.readUnsignedVInt32(); + byte[][] components = new byte[numComponents][]; + for (int ci = 0; ci < numComponents; ci++) + { + int componentLength = in.readUnsignedVInt32(); + components[ci] = new byte[componentLength]; + in.readFully(components[ci]); + } + clustering = new ArrayClustering(components); + } + return clustering; + } + + @Override + public long serializedSize(Clustering clustering) + { + return computeSerializedSize(clustering); + } + + private long computeSerializedSize(Clustering clustering) + { + int size = sizeof(true); + if (clustering.kind() != ClusteringPrefix.Kind.STATIC_CLUSTERING) + { + size += sizeofUnsignedVInt(clustering.size()); + ValueAccessor accessor = clustering.accessor(); + for (int i = 0; i < clustering.size(); i++) + { + int valueSize = accessor.size(clustering.get(i)); + size += valueSize; + size += sizeofUnsignedVInt(valueSize); + } + } + return size; + } + }; + + public static final UnversionedSerializer consistencyLevelSerializer = new UnversionedSerializer() + { + @Override + public void serialize(ConsistencyLevel t, DataOutputPlus out) throws IOException + { + out.writeByte(t.code); + } + + @Override + public ConsistencyLevel deserialize(DataInputPlus in) throws IOException + { + return ConsistencyLevel.fromCode(in.readByte()); + } + + @Override + public long serializedSize(ConsistencyLevel t) + { + return 1; + } + }; +} \ No newline at end of file diff --git a/src/java/org/apache/cassandra/service/accord/AccordService.java b/src/java/org/apache/cassandra/service/accord/AccordService.java new file mode 100644 index 000000000000..f85aee62afaf --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/AccordService.java @@ -0,0 +1,984 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Set; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; +import java.util.function.BiFunction; +import java.util.function.Function; +import javax.annotation.Nonnull; +import javax.annotation.Nullable; +import javax.annotation.concurrent.GuardedBy; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.primitives.Ints; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import accord.api.Journal; +import accord.coordinate.CoordinateMaxConflict; +import accord.coordinate.CoordinateTransaction; +import accord.coordinate.KeyBarriers; +import accord.impl.AbstractConfigurationService; +import accord.impl.DefaultLocalListeners; +import accord.impl.DefaultRemoteListeners; +import accord.impl.RequestCallbacks; +import accord.impl.SizeOfIntersectionSorter; +import accord.impl.progresslog.DefaultProgressLogs; +import accord.local.Command; +import accord.local.CommandStore; +import accord.local.CommandStores; +import accord.local.KeyHistory; +import accord.local.Node; +import accord.local.Node.Id; +import accord.local.PreLoadContext; +import accord.local.SafeCommand; +import accord.local.ShardDistributor.EvenSplit; +import accord.local.UniqueTimeService.AtomicUniqueTimeWithStaleReservation; +import accord.local.cfk.CommandsForKey; +import accord.local.cfk.SafeCommandsForKey; +import accord.local.durability.DurabilityService; +import accord.local.durability.ShardDurability; +import accord.messages.Reply; +import accord.messages.Request; +import accord.primitives.FullRoute; +import accord.primitives.Keys; +import accord.primitives.Ranges; +import accord.primitives.RoutingKeys; +import accord.primitives.SaveStatus; +import accord.primitives.Seekable; +import accord.primitives.Seekables; +import accord.primitives.Status; +import accord.primitives.Timestamp; +import accord.primitives.Txn; +import accord.primitives.TxnId; +import accord.topology.Topology; +import accord.topology.TopologyManager; +import accord.utils.DefaultRandom; +import accord.utils.Invariants; +import accord.utils.async.AsyncChain; +import accord.utils.async.AsyncChains; +import accord.utils.async.AsyncResult; +import org.apache.cassandra.concurrent.Shutdownable; +import org.apache.cassandra.config.CassandraRelevantProperties; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.db.ConsistencyLevel; +import org.apache.cassandra.exceptions.RequestExecutionException; +import org.apache.cassandra.journal.Params; +import org.apache.cassandra.locator.InetAddressAndPort; +import org.apache.cassandra.net.IVerbHandler; +import org.apache.cassandra.net.Message; +import org.apache.cassandra.net.MessageDelivery; +import org.apache.cassandra.net.MessagingService; +import org.apache.cassandra.repair.SharedContext; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.service.StorageService; +import org.apache.cassandra.service.accord.AccordSyncPropagator.Notification; +import org.apache.cassandra.service.accord.TimeOnlyRequestBookkeeping.LatencyRequestBookkeeping; +import org.apache.cassandra.service.accord.api.AccordAgent; +import org.apache.cassandra.service.accord.api.AccordRoutableKey; +import org.apache.cassandra.service.accord.api.AccordScheduler; +import org.apache.cassandra.service.accord.api.AccordTimeService; +import org.apache.cassandra.service.accord.api.AccordTopologySorter; +import org.apache.cassandra.service.accord.api.CompositeTopologySorter; +import org.apache.cassandra.service.accord.api.TokenKey; +import org.apache.cassandra.service.accord.api.TokenKey.KeyspaceSplitter; +import org.apache.cassandra.service.accord.interop.AccordInteropAdapter.AccordInteropFactory; +import org.apache.cassandra.service.accord.serializers.TableMetadatas; +import org.apache.cassandra.service.accord.serializers.TableMetadatasAndKeys; +import org.apache.cassandra.service.accord.txn.TxnQuery; +import org.apache.cassandra.service.accord.txn.TxnRead; +import org.apache.cassandra.service.accord.txn.TxnResult; +import org.apache.cassandra.service.accord.txn.TxnUpdate; +import org.apache.cassandra.service.consensus.TransactionalMode; +import org.apache.cassandra.service.consensus.migration.TableMigrationState; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.tcm.ClusterMetadataService; +import org.apache.cassandra.tcm.Epoch; +import org.apache.cassandra.tcm.membership.NodeId; +import org.apache.cassandra.transport.Dispatcher; +import org.apache.cassandra.utils.ExecutorUtils; +import org.apache.cassandra.utils.FBUtilities; +import org.apache.cassandra.utils.concurrent.AsyncPromise; +import org.apache.cassandra.utils.concurrent.Future; +import org.apache.cassandra.utils.concurrent.UncheckedInterruptedException; + +import static accord.local.durability.DurabilityService.SyncLocal.Self; +import static accord.local.durability.DurabilityService.SyncRemote.All; +import static accord.messages.SimpleReply.Ok; +import static accord.primitives.Routable.Domain.Key; +import static accord.primitives.Txn.Kind.Write; +import static accord.primitives.TxnId.Cardinality.cardinality; +import static accord.topology.TopologyManager.TopologyRange; +import static java.util.concurrent.TimeUnit.NANOSECONDS; +import static java.util.concurrent.TimeUnit.SECONDS; +import static org.apache.cassandra.config.DatabaseDescriptor.getAccordCommandStoreShardCount; +import static org.apache.cassandra.config.DatabaseDescriptor.getAccordGlobalDurabilityCycle; +import static org.apache.cassandra.config.DatabaseDescriptor.getAccordShardDurabilityCycle; +import static org.apache.cassandra.config.DatabaseDescriptor.getAccordShardDurabilityTargetSplits; +import static org.apache.cassandra.config.DatabaseDescriptor.getPartitioner; +import static org.apache.cassandra.metrics.ClientRequestsMetricsHolder.accordReadBookkeeping; +import static org.apache.cassandra.metrics.ClientRequestsMetricsHolder.accordWriteBookkeeping; +import static org.apache.cassandra.service.accord.journal.AccordTopologyUpdate.ImmutableTopoloyImage; +import static org.apache.cassandra.service.consensus.migration.ConsensusRequestRouter.getTableMetadata; +import static org.apache.cassandra.utils.Clock.Global.nanoTime; + +public class AccordService implements IAccordService, Shutdownable +{ + private static final Logger logger = LoggerFactory.getLogger(AccordService.class); + + private enum State {INIT, STARTED, SHUTTING_DOWN, SHUTDOWN} + + private final Node node; + private final Shutdownable nodeShutdown; + private final AccordMessageSink messageSink; + private final AccordConfigurationService configService; + private final AccordFastPathCoordinator fastPathCoordinator; + private final AccordScheduler scheduler; + private final AccordDataStore dataStore; + private final AccordJournal journal; + private final AccordVerbHandler requestHandler; + private final AccordResponseVerbHandler responseHandler; + + @GuardedBy("this") + private State state = State.INIT; + + private static final IAccordService NOOP_SERVICE = new NoOpAccordService(); + + private static volatile IAccordService instance = null; + + @VisibleForTesting + public static void unsafeSetNewAccordService(IAccordService service) + { + instance = service; + } + + @VisibleForTesting + public static void unsafeSetNoop() + { + instance = NOOP_SERVICE; + } + + public static boolean isSetup() + { + return instance != null; + } + + public static IVerbHandler watermarkHandlerOrNoop() + { + if (!isSetup()) return ignore -> {}; + AccordService i = (AccordService) instance(); + return i.configService().watermarkCollector.handler; + } + + public static IVerbHandler requestHandlerOrNoop() + { + if (!isSetup()) return ignore -> {}; + return instance().requestHandler(); + } + + public static IVerbHandler responseHandlerOrNoop() + { + if (!isSetup()) return ignore -> {}; + return instance().responseHandler(); + } + + public synchronized static void startup(NodeId tcmId) + { + if (!DatabaseDescriptor.getAccordTransactionsEnabled()) + { + instance = NOOP_SERVICE; + return; + } + + if (instance != null) + return; + + AccordService as = new AccordService(AccordTopology.tcmIdToAccord(tcmId)); + as.startup(); + if (StorageService.instance.isReplacingSameAddress()) + { + // when replacing another node but using the same ip the hostId will also match, this causes no TCM transactions + // to be committed... + // In order to bootup correctly, need to pull in the current epoch + ClusterMetadata current = ClusterMetadata.current(); + as.configService().listener.notifyPostCommit(current, current, false); + } + instance = as; + + replayJournal(as); + } + + @VisibleForTesting + public static void replayJournal(AccordService as) + { + logger.info("Starting journal replay."); + CommandsForKey.disableLinearizabilityViolationsReporting(); + try + { + AccordKeyspace.truncateAllCaches(); + as.journal().replay(as.node().commandStores()); + + logger.info("Waiting for command stores to quiesce."); + ((AccordCommandStores)as.node.commandStores()).waitForQuiescense(); + as.journal.unsafeSetStarted(); + } + finally + { + CommandsForKey.enableLinearizabilityViolationsReporting(); + } + + logger.info("Finished journal replay."); + } + + public static void shutdownServiceAndWait(long timeout, TimeUnit unit) throws InterruptedException, TimeoutException + { + IAccordService i = instance; + if (i == null) + return; + i.shutdownAndWait(timeout, unit); + } + + @Override + public boolean shouldAcceptMessages() + { + return state == State.STARTED && journal.started(); + } + + public static IAccordService instance() + { + if (!DatabaseDescriptor.getAccordTransactionsEnabled()) + return NOOP_SERVICE; + IAccordService i = instance; + Invariants.require(i != null, "AccordService was not started"); + return i; + } + + public static boolean started() + { + if (!DatabaseDescriptor.getAccordTransactionsEnabled()) + return false; + return instance != null; + } + + @VisibleForTesting + public AccordService(Id localId) + { + Invariants.require(localId != null, "static localId must be set before instantiating AccordService"); + logger.info("Starting accord with nodeId {}", localId); + AccordAgent agent = FBUtilities.construct(CassandraRelevantProperties.ACCORD_AGENT_CLASS.getString(AccordAgent.class.getName()), "AccordAgent"); + agent.setNodeId(localId); + AccordTimeService time = new AccordTimeService(); + final RequestCallbacks callbacks = new RequestCallbacks(time); + this.scheduler = new AccordScheduler(); + this.dataStore = new AccordDataStore(); + this.journal = new AccordJournal(DatabaseDescriptor.getAccord().journal); + this.configService = new AccordConfigurationService(localId); + this.fastPathCoordinator = AccordFastPathCoordinator.create(localId, configService); + this.messageSink = new AccordMessageSink(agent, configService, callbacks); + this.node = new Node(localId, + messageSink, + configService, + time, new AtomicUniqueTimeWithStaleReservation(time), + () -> dataStore, + new KeyspaceSplitter(new EvenSplit<>(getAccordCommandStoreShardCount(), getPartitioner().accordSplitter())), + agent, + new DefaultRandom(), + scheduler, + CompositeTopologySorter.create(SizeOfIntersectionSorter.SUPPLIER, + new AccordTopologySorter.Supplier(configService, DatabaseDescriptor.getNodeProximity())), + DefaultRemoteListeners::new, + ignore -> callbacks, + DefaultProgressLogs::new, + DefaultLocalListeners.Factory::new, + AccordCommandStores.factory(), + new AccordInteropFactory(agent, configService), + journal.durableBeforePersister(), + journal); + this.nodeShutdown = toShutdownable(node); + this.requestHandler = new AccordVerbHandler<>(node, configService); + this.responseHandler = new AccordResponseVerbHandler<>(callbacks, configService); + } + + @Override + public synchronized void startup() + { + unsafeStartupWithOverrides(null); + } + + @VisibleForTesting + public synchronized void unsafeStartupWithOverrides(@Nullable Journal.TopologyUpdate overrideNullTopologyUpdate) + { + if (state != State.INIT) + return; + journal.start(node); + node.load(); + + ClusterMetadata metadata = ClusterMetadata.current(); + configService.updateMapping(metadata); + + long highestKnown = -1; + List images = new ArrayList<>(); + + // Collect locally known topologies + Iterator iter = journal.replayTopologies(); + Journal.TopologyUpdate prev = null; + while (iter.hasNext()) + { + ImmutableTopoloyImage next = iter.next(); + // Due to partial compaction, we can clean up only some of the old epochs, creating gaps. We skip these epochs here. + if (prev != null && next.global.epoch() > prev.global.epoch() + 1) + images.clear(); + + images.add(next); + prev = next; + } + + if (prev == null) + prev = overrideNullTopologyUpdate; + + // Instantiate latest topology from the log, if known + if (prev != null) + { + node.commandStores().initializeTopologyUnsafe(prev); + highestKnown = prev.global.epoch(); + } + + try + { + TopologyRange remote = fetchTopologies(highestKnown + 1); + + // Replay local epochs + for (ImmutableTopoloyImage image : images) + configService.reportTopology(image.global); + + if (remote != null) + remote.forEach(configService::reportTopology, highestKnown + 1, Integer.MAX_VALUE); + else if (images.isEmpty()) // First boot, single-node cluster + configService.reportTopology(AccordTopology.createAccordTopology(metadata)); + + ClusterMetadataService.instance().log().addListener(configService.listener); + { + metadata = ClusterMetadata.current(); + highestKnown = configService.currentEpoch(); + if (metadata.epoch.getEpoch() > highestKnown) + { + remote = fetchTopologies(highestKnown + 1); + if (remote != null) + remote.forEach(configService::reportTopology, highestKnown + 1, Integer.MAX_VALUE); + } + } + + WatermarkCollector.fetchAndReportWatermarksAsync(configService()); + + int attempt = 0; + int waitSeconds = 5; + while (true) + { + Epoch await = Epoch.max(Epoch.create(configService.currentEpoch()), metadata.epoch); + try + { + epochReady(await).get(waitSeconds, SECONDS); + break; + } + catch (TimeoutException e) + { + logger.warn("Epoch {} is not ready after waiting for {} seconds", metadata.epoch, (++attempt) * waitSeconds); + } + } + } + catch (InterruptedException e) + { + Thread.currentThread().interrupt(); + throw new UncheckedInterruptedException(e); + } + catch (ExecutionException e) + { + throw new RuntimeException(e); + } + + configService.start(); + fastPathCoordinator.start(); + ClusterMetadataService.instance().log().addListener(fastPathCoordinator); + node.durability().shards().setTargetShardSplits(Ints.checkedCast(getAccordShardDurabilityTargetSplits())); + node.durability().shards().setShardCycleTime(Ints.checkedCast(getAccordShardDurabilityCycle(SECONDS)), SECONDS); + node.durability().global().setGlobalCycleTime(Ints.checkedCast(getAccordGlobalDurabilityCycle(SECONDS)), SECONDS); + node.durability().start(); + state = State.STARTED; + } + + /** + * Queries peers to discover min epoch, and then fetches all topologies between min and current epochs + */ + private TopologyRange fetchTopologies(long from) throws ExecutionException, InterruptedException + { + ClusterMetadata metadata = ClusterMetadata.current(); + + Set peers = new HashSet<>(); + peers.addAll(metadata.directory.allAddresses()); + peers.remove(FBUtilities.getBroadcastAddressAndPort()); + + // No peers: single node cluster or first node to boot + if (peers.isEmpty()) + return null; + + try + { + logger.info("Fetching topologies for epochs [{}, {}] from {}", from, metadata.epoch.getEpoch(), peers); + Invariants.require(from <= metadata.epoch.getEpoch(), + "Accord epochs should never be ahead of TCM ones, but %d was ahead of %d", from, metadata.epoch.getEpoch()); + + Future futures = FetchTopologies.fetch(SharedContext.Global.instance, + peers, + from, + Long.MAX_VALUE); + TopologyRange response = futures.get(); + logger.info("Fetched topologies {}", response); + + // We're behind and need to catch up CMS first. + if (response.current > ClusterMetadata.current().epoch.getEpoch()) + ClusterMetadataService.instance().fetchLogFromCMS(Epoch.create(response.current)); + + if (response.current >= from) + return response; + } + catch (Throwable e) + { + logger.info("Failed to fetch epochs [{}, {}] from {}", from, metadata.epoch.getEpoch(), peers); + } + + // After trying to contact all peers, and retrying according to retry spec on them, we give up. + // If there were no new known TCM epochs, we still allow Accord to start up, assuming there are no new epochs. + return null; + } + + @Override + public IVerbHandler requestHandler() + { + return requestHandler; + } + + @Override + public IVerbHandler responseHandler() + { + return responseHandler; + } + + public ShardDurability.ImmutableView shardDurability() + { + return node.durability().shards().immutableView(); + } + + @Override + public AsyncChain sync(Object requestedBy, Timestamp minBound, Ranges ranges, @Nullable Collection include, DurabilityService.SyncLocal syncLocal, DurabilityService.SyncRemote syncRemote) + { + return node.durability().sync(requestedBy, minBound, ranges, include, syncLocal, syncRemote); + } + + @Override + public AsyncChain sync(Timestamp minBound, Keys keys, DurabilityService.SyncLocal syncLocal, DurabilityService.SyncRemote syncRemote) + { + if (keys.size() != 1) + return syncInternal(minBound, keys, syncLocal, syncRemote); + + return KeyBarriers.find(node, minBound, keys.get(0).toUnseekable(), syncLocal, syncRemote) + .flatMap(found -> KeyBarriers.await(node, found, syncLocal, syncRemote)) + .flatMap(success -> { + if (success) + return null; + return syncInternal(minBound, keys, syncLocal, syncRemote); + }); + } + + private AsyncChain syncInternal(Timestamp minBound, Keys keys, DurabilityService.SyncLocal syncLocal, DurabilityService.SyncRemote syncRemote) + { + TxnId txnId = node.nextTxnId(minBound, Write, Key, cardinality(keys)); + FullRoute route = node.computeRoute(txnId, keys); + Txn txn = new Txn.InMemory(Write, keys, TxnRead.createNoOpRead(keys), TxnQuery.NONE, TxnUpdate.empty(), new TableMetadatasAndKeys(TableMetadatas.none(), keys)); + return CoordinateTransaction.coordinate(node, route, txnId, txn) + .map(ignore -> (Void)null).beginAsResult(); + } + + @Override + public AsyncChain maxConflict(Ranges ranges) + { + return node.commandStores().any().build(() -> CoordinateMaxConflict.maxConflict(node, ranges)).flatMap(i -> i); + } + + public static V getBlocking(AsyncChain async, Seekables keysOrRanges, RequestBookkeeping bookkeeping, long startedAt, long deadline, boolean isTxnRequest) + { + return getBlocking(async, null, keysOrRanges, bookkeeping, startedAt, deadline, isTxnRequest); + } + + public static V getBlocking(AsyncChain async, @Nullable TxnId txnId, Seekables keysOrRanges, RequestBookkeeping bookkeeping, long startedAt, long deadline, boolean isTxnRequest) + { + AccordResult result = new AccordResult<>(txnId, keysOrRanges, bookkeeping, startedAt, deadline, isTxnRequest); + async.begin(result); + return result.awaitAndGet(); + } + + public static void getBlocking(AsyncChain async, Seekables keysOrRanges, RequestBookkeeping bookkeeping, long startedAt, long deadline) + { + getBlocking(async, keysOrRanges, bookkeeping, startedAt, deadline, false); + } + + public static Keys intersecting(Keys keys) + { + if (keys.isEmpty()) + return keys; + + TableId tableId = tableId(keys, r -> ((AccordRoutableKey)r).table()); + return sliceToAccord(tableId, keys, Keys::slice); + } + + public static Ranges intersecting(Ranges ranges) + { + if (ranges.isEmpty()) + return ranges; + + TableId tableId = tableId(ranges, r -> ((TokenRange)r).table()); + return sliceToAccord(tableId, ranges, Ranges::slice); + } + + private static > C sliceToAccord(TableId tableId, C collection, BiFunction slice) + { + ClusterMetadata cm = ClusterMetadata.current(); + TableMetadata tm = getTableMetadata(cm, tableId); + + // Barriers can be needed just because it's an Accord managed range, but it could also be a migration back to Paxos + // in which case we do want to barrier the migrating/migrated ranges even though the target for the migration is not Accord + // In either case Accord should be aware of those ranges and not generate a topology mismatch + if (tm.params.transactionalMode != TransactionalMode.off || tm.params.transactionalMigrationFrom.migratingFromAccord()) + { + TableMigrationState tms = cm.consensusMigrationState.tableStates.get(tm.id); + // null is fine could be completely migrated or was always an Accord table on creation + if (tms == null) + return collection; + // Use migratingAndMigratedRanges (not accordSafeToReadRanges) because barriers are allowed even if Accord can't perform + // a read because they are only finishing/recovering existing Accord transactions + Ranges migratingAndMigratedRanges = AccordTopology.toAccordRanges(tms.tableId, tms.migratingAndMigratedRanges); + return slice.apply(collection, migratingAndMigratedRanges); + } + + return slice.apply(collection, Ranges.EMPTY); + } + + + private static > TableId tableId(C collection, Function getTableId) + { + TableId tableId = getTableId.apply(collection.get(0)); + for (int i = 1, maxi = collection.size() ; i < maxi ; ++i) + { + TableId check = getTableId.apply(collection.get(i)); + Invariants.require(tableId.equals(check), "Currently only one table is handled here."); + } + return tableId; + } + + @Override + public long currentEpoch() + { + return configService.currentEpoch(); + } + + + @Override + public TopologyManager topology() + { + return node.topology(); + } + + /** + * Consistency level is just echoed back in timeouts, in the future it may be used for interoperability + * with non-Accord operations. + */ + @Override + public @Nonnull TxnResult coordinate(long minEpoch, @Nonnull Txn txn, @Nonnull ConsistencyLevel consistencyLevel, @Nonnull Dispatcher.RequestTime requestTime) throws RequestExecutionException + { + return coordinateAsync(minEpoch, txn, consistencyLevel, requestTime).awaitAndGet(); + } + + @Override + public @Nonnull IAccordResult coordinateAsync(long minEpoch, @Nonnull Txn txn, @Nonnull ConsistencyLevel consistencyLevel, @Nonnull Dispatcher.RequestTime requestTime) + { + TxnId txnId = node.nextTxnId(txn.kind(), txn.keys().domain(), cardinality(txn.keys())); + long timeout = txnId.isWrite() ? DatabaseDescriptor.getWriteRpcTimeout(NANOSECONDS) : DatabaseDescriptor.getReadRpcTimeout(NANOSECONDS); + ClientRequestBookkeeping bookkeeping = txn.isWrite() ? accordWriteBookkeeping : accordReadBookkeeping; + bookkeeping.metrics.keySize.update(txn.keys().size()); + long deadlineNanos = requestTime.computeDeadline(timeout); + AccordResult result = new AccordResult<>(txnId, txn.keys(), bookkeeping, requestTime.startedAtNanos(), deadlineNanos, true); + ((AsyncResult)node.coordinate(txnId, txn, minEpoch, deadlineNanos)).begin(result); + return result; + } + + @Override + public void setCacheSize(long kb) + { + long bytes = kb << 10; + AccordCommandStores commandStores = (AccordCommandStores) node.commandStores(); + commandStores.setCapacity(bytes); + } + + @Override + public void setWorkingSetSize(long kb) + { + long bytes = kb << 10; + AccordCommandStores commandStores = (AccordCommandStores) node.commandStores(); + commandStores.setWorkingSetSize(bytes); + } + + @Override + public boolean isTerminated() + { + return scheduler.isTerminated(); + } + + @Override + public synchronized void shutdown() + { + if (state != State.STARTED) + return; + state = State.SHUTTING_DOWN; + shutdownAndWait(1, TimeUnit.MINUTES); + state = State.SHUTDOWN; + } + + @Override + public Object shutdownNow() + { + shutdown(); + return null; + } + + @Override + public boolean awaitTermination(long timeout, TimeUnit units) throws InterruptedException + { + try + { + ExecutorUtils.awaitTermination(timeout, units, shutdownableSubsystems()); + return true; + } + catch (TimeoutException e) + { + return false; + } + } + + private List shutdownableSubsystems() + { + return Arrays.asList(scheduler, nodeShutdown, journal, configService); + } + + @VisibleForTesting + @Override + public void shutdownAndWait(long timeout, TimeUnit unit) + { + if (!ExecutorUtils.shutdownSequentiallyAndWait(shutdownableSubsystems(), timeout, unit)) + logger.error("One or more subsystems did not shut down cleanly."); + } + + @Override + public AccordScheduler scheduler() + { + return scheduler; + } + + public Id nodeId() + { + return node.id(); + } + + @Override + public List debugTxnBlockedGraph(TxnId txnId) + { + AsyncChain> states = loadDebug(txnId); + try + { + return AsyncChains.getBlocking(states); + } + catch (InterruptedException e) + { + throw new UncheckedInterruptedException(e); + } + catch (ExecutionException e) + { + throw new RuntimeException(e.getCause()); + } + } + + public AsyncChain> loadDebug(TxnId original) + { + CommandStores commandStores = node.commandStores(); + if (commandStores.count() == 0) + return AsyncChains.success(Collections.emptyList()); + int[] ids = commandStores.ids(); + List> chains = new ArrayList<>(ids.length); + for (int id : ids) + chains.add(loadDebug(original, commandStores.forId(id))); + return AsyncChains.allOf(chains); + } + + private AsyncChain loadDebug(TxnId txnId, CommandStore store) + { + CommandStoreTxnBlockedGraph.Builder state = new CommandStoreTxnBlockedGraph.Builder(store.id()); + return populate(state, store, txnId).map(ignore -> state.build()); + } + + private static AsyncChain populate(CommandStoreTxnBlockedGraph.Builder state, CommandStore store, TxnId txnId) + { + AsyncChain> submit = store.submit(txnId, in -> { + AsyncChain chain = populate(state, (AccordSafeCommandStore) in, txnId); + return chain == null ? AsyncChains.success(null) : chain; + }); + return submit.flatMap(Function.identity()); + } + + private static AsyncChain populate(CommandStoreTxnBlockedGraph.Builder state, CommandStore commandStore, TokenKey blockedBy, TxnId txnId, Timestamp executeAt) + { + AsyncChain> submit = commandStore.submit(PreLoadContext.contextFor(txnId, RoutingKeys.of(blockedBy.toUnseekable()), KeyHistory.SYNC), in -> { + AsyncChain chain = populate(state, (AccordSafeCommandStore) in, blockedBy, txnId, executeAt); + return chain == null ? AsyncChains.success(null) : chain; + }); + return submit.flatMap(Function.identity()); + } + + @Nullable + private static AsyncChain populate(CommandStoreTxnBlockedGraph.Builder state, AccordSafeCommandStore safeStore, TxnId txnId) + { + SafeCommand safeCommand = safeStore.unsafeGet(txnId); + Invariants.nonNull(safeCommand, "Txn %s is not in the cache", txnId); + if (safeCommand.current() == null || safeCommand.current().saveStatus() == SaveStatus.Uninitialised) + return null; + CommandStoreTxnBlockedGraph.TxnState cmdTxnState = populate(state, safeCommand.current()); + if (cmdTxnState.notBlocked()) + return null; + //TODO (expected): check depth + List> chains = new ArrayList<>(); + for (TxnId blockedBy : cmdTxnState.blockedBy) + { + if (state.knows(blockedBy)) continue; + // need to fetch the state + if (safeStore.ifLoadedAndInitialised(blockedBy) != null) + { + AsyncChain chain = populate(state, safeStore, blockedBy); + if (chain != null) + chains.add(chain); + } + else + { + // go fetch it + chains.add(populate(state, safeStore.commandStore(), blockedBy)); + } + } + for (TokenKey blockedBy : cmdTxnState.blockedByKey) + { + if (state.keys.containsKey(blockedBy)) continue; + if (safeStore.ifLoadedAndInitialised(blockedBy) != null) + { + AsyncChain chain = populate(state, safeStore, blockedBy, txnId, safeCommand.current().executeAt()); + if (chain != null) + chains.add(chain); + } + else + { + // go fetch it + chains.add(populate(state, safeStore.commandStore(), blockedBy, txnId, safeCommand.current().executeAt())); + } + } + if (chains.isEmpty()) + return null; + return AsyncChains.allOf(chains).map(ignore -> null); + } + + private static AsyncChain populate(CommandStoreTxnBlockedGraph.Builder state, AccordSafeCommandStore safeStore, TokenKey pk, TxnId txnId, Timestamp executeAt) + { + SafeCommandsForKey commandsForKey = safeStore.ifLoadedAndInitialised(pk); + TxnId blocking = commandsForKey.current().blockedOnTxnId(txnId, executeAt); + if (blocking instanceof CommandsForKey.TxnInfo) + blocking = ((CommandsForKey.TxnInfo) blocking).plainTxnId(); + state.keys.put(pk, blocking); + if (state.txns.containsKey(blocking)) return null; + if (safeStore.ifLoadedAndInitialised(blocking) != null) return populate(state, safeStore, blocking); + return populate(state, safeStore.commandStore(), blocking); + } + + private static CommandStoreTxnBlockedGraph.TxnState populate(CommandStoreTxnBlockedGraph.Builder state, Command cmd) + { + CommandStoreTxnBlockedGraph.Builder.TxnBuilder cmdTxnState = state.txn(cmd.txnId(), cmd.executeAt(), cmd.saveStatus()); + if (!cmd.hasBeen(Status.Applied) && cmd.hasBeen(Status.Stable)) + { + // check blocking state + Command.WaitingOn waitingOn = cmd.asCommitted().waitingOn(); + waitingOn.waitingOn.reverseForEach(null, null, null, null, (i1, i2, i3, i4, i) -> { + if (i < waitingOn.txnIdCount()) + { + // blocked on txn + cmdTxnState.blockedBy.add(waitingOn.txnId(i)); + } + else + { + // blocked on key + cmdTxnState.blockedByKey.add((TokenKey) waitingOn.keys.get(i - waitingOn.txnIdCount())); + } + }); + } + return cmdTxnState.build(); + } + + @Nullable + @Override + public Long minEpoch() + { + return node.topology().minEpoch(); + } + + public Node node() + { + return node; + } + + public AccordJournal journal() + { + return journal; + } + + @Override + public Future epochReady(Epoch epoch) + { + AsyncPromise promise = new AsyncPromise<>(); + AsyncChain ready = configService.epochReady(epoch.getEpoch()); + ready.begin((result, failure) -> { + if (failure == null) promise.trySuccess(result); + else promise.tryFailure(failure); + }); + return promise; + } + + @Override + public void receive(Message message) + { + receive(MessagingService.instance(), configService, message); + } + + @VisibleForTesting + public static void receive(MessageDelivery sink, AbstractConfigurationService configService, Message message) + { + AccordSyncPropagator.Notification notification = message.payload; + notification.syncComplete.forEach(id -> configService.receiveRemoteSyncComplete(id, notification.epoch)); + if (!notification.closed.isEmpty()) + configService.receiveClosed(notification.closed, notification.epoch); + if (!notification.retired.isEmpty()) + configService.receiveRetired(notification.retired, notification.epoch); + sink.respond(Ok, message); + } + + private static Shutdownable toShutdownable(Node node) + { + return new Shutdownable() { + private volatile boolean isShutdown = false; + + @Override + public boolean isTerminated() + { + // we don't know about terminiated... so settle for shutdown! + return isShutdown; + } + + @Override + public void shutdown() + { + isShutdown = true; + node.shutdown(); + } + + @Override + public Object shutdownNow() + { + // node doesn't offer shutdownNow + shutdown(); + return null; + } + + @Override + public boolean awaitTermination(long timeout, TimeUnit units) + { + // TODO (required): expose awaitTermination in Node + // node doesn't offer + return true; + } + }; + } + + @VisibleForTesting + public AccordConfigurationService configService() + { + return configService; + } + + @Override + public AccordCompactionInfos getCompactionInfo() + { + AccordCompactionInfos compactionInfos = new AccordCompactionInfos(node.durableBefore(), node.topology().minEpoch()); + node.commandStores().forEachCommandStore(commandStore -> { + compactionInfos.put(commandStore.id(), ((AccordCommandStore)commandStore).getCompactionInfo()); + }); + return compactionInfos; + } + + @Override + public AccordAgent agent() + { + return (AccordAgent) node.agent(); + } + + @Override + public void awaitDone(TableId id, long epoch) + { + // Need to make sure no existing txn are still being processed for this table... this is only used by DROP TABLE so NEW txn are expected to be blocked, so just need to "wait" for existing ones to complete + Topology topology = node.topology().current(); + List rangeList = topology.reduce(new ArrayList<>(), + s -> ((TokenRange) s.range).table().equals(id), + (accum, s) -> { + accum.add((TokenRange) s.range); + return accum; + }); + if (rangeList.isEmpty()) return; // nothing to see here + + Ranges ranges = Ranges.of(rangeList.toArray(accord.primitives.Range[]::new)); + long startedAt = nanoTime(); + long deadline = startedAt + DatabaseDescriptor.getAccordRangeSyncPointTimeoutNanos(); + // TODO (required): relax this requirement - too expensive + getBlocking(node.durability().sync("Drop Keyspace/Table (Epoch " + epoch + ')', TxnId.minForEpoch(epoch), ranges, Self, All), ranges, new LatencyRequestBookkeeping(null), startedAt, deadline, false); + } + + public Params journalConfiguration() + { + return journal.configuration(); + } +} diff --git a/src/java/org/apache/cassandra/service/accord/AccordStaleReplicas.java b/src/java/org/apache/cassandra/service/accord/AccordStaleReplicas.java new file mode 100644 index 000000000000..9d0a01bad1f0 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/AccordStaleReplicas.java @@ -0,0 +1,137 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import java.io.IOException; +import java.util.Objects; +import java.util.Set; + +import javax.annotation.concurrent.Immutable; + +import com.google.common.collect.ImmutableSet; + +import accord.local.Node; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.service.accord.serializers.TopologySerializers; +import org.apache.cassandra.tcm.Epoch; +import org.apache.cassandra.tcm.MetadataValue; +import org.apache.cassandra.tcm.serialization.MetadataSerializer; +import org.apache.cassandra.tcm.serialization.Version; +import org.apache.cassandra.utils.CollectionSerializers; + +@Immutable +public class AccordStaleReplicas implements MetadataValue +{ + public static final AccordStaleReplicas EMPTY = new AccordStaleReplicas(ImmutableSet.of(), Epoch.EMPTY); + + private final Set staleIds; + private final Epoch lastModified; + + public AccordStaleReplicas(Set staleIds, Epoch lastModified) + { + this.staleIds = staleIds; + this.lastModified = lastModified; + } + + @Override + public AccordStaleReplicas withLastModified(Epoch epoch) + { + return new AccordStaleReplicas(staleIds, epoch); + } + + @Override + public Epoch lastModified() + { + return lastModified; + } + + public AccordStaleReplicas withNodeIds(Set ids) + { + ImmutableSet.Builder builder = new ImmutableSet.Builder<>(); + Set newIds = builder.addAll(staleIds).addAll(ids).build(); + return new AccordStaleReplicas(newIds, lastModified); + } + + public AccordStaleReplicas without(Set ids) + { + ImmutableSet.Builder builder = new ImmutableSet.Builder<>(); + + for (Node.Id staleId : staleIds) + if (!ids.contains(staleId)) + builder.add(staleId); + + return new AccordStaleReplicas(builder.build(), lastModified); + } + + public boolean contains(Node.Id nodeId) + { + return staleIds.contains(nodeId); + } + + public Set ids() + { + return staleIds; + } + + @Override + public String toString() + { + return "AccordStaleReplicas{staleIds=" + staleIds + ", lastModified=" + lastModified + '}'; + } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + AccordStaleReplicas that = (AccordStaleReplicas) o; + return Objects.equals(staleIds, that.staleIds) && Objects.equals(lastModified, that.lastModified); + } + + @Override + public int hashCode() + { + return Objects.hash(staleIds, lastModified); + } + + public static final MetadataSerializer serializer = new MetadataSerializer<>() + { + @Override + public void serialize(AccordStaleReplicas replicas, DataOutputPlus out, Version version) throws IOException + { + CollectionSerializers.serializeCollection(replicas.staleIds, out, TopologySerializers.nodeId); + Epoch.serializer.serialize(replicas.lastModified, out, version); + } + + @Override + public AccordStaleReplicas deserialize(DataInputPlus in, Version version) throws IOException + { + return new AccordStaleReplicas(CollectionSerializers.deserializeSet(in, TopologySerializers.nodeId), + Epoch.serializer.deserialize(in, version)); + } + + @Override + public long serializedSize(AccordStaleReplicas replicas, Version version) + { + return CollectionSerializers.serializedCollectionSize(replicas.staleIds, TopologySerializers.nodeId) + + Epoch.serializer.serializedSize(replicas.lastModified, version); + } + }; +} diff --git a/src/java/org/apache/cassandra/service/accord/AccordSyncPropagator.java b/src/java/org/apache/cassandra/service/accord/AccordSyncPropagator.java new file mode 100644 index 000000000000..66ae868231cb --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/AccordSyncPropagator.java @@ -0,0 +1,418 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.HashSet; +import java.util.Set; +import java.util.concurrent.TimeUnit; + +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Iterables; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import accord.local.Node; +import accord.messages.SimpleReply; +import accord.primitives.Ranges; +import accord.utils.Invariants; +import org.agrona.collections.Int2ObjectHashMap; +import org.agrona.collections.Long2ObjectHashMap; +import org.apache.cassandra.concurrent.ScheduledExecutorPlus; +import org.apache.cassandra.db.TypeSizes; +import org.apache.cassandra.exceptions.RequestFailure; +import org.apache.cassandra.gms.Gossiper; +import org.apache.cassandra.gms.IFailureDetector; +import org.apache.cassandra.io.UnversionedSerializer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.locator.InetAddressAndPort; +import org.apache.cassandra.net.IVerbHandler; +import org.apache.cassandra.net.Message; +import org.apache.cassandra.net.MessageDelivery; +import org.apache.cassandra.net.RequestCallback; +import org.apache.cassandra.net.Verb; +import org.apache.cassandra.service.accord.serializers.KeySerializers; +import org.apache.cassandra.service.accord.serializers.TopologySerializers; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.utils.CollectionSerializers; + +/** + * Receives information about closed, retired ranges, and about sync completion, and + * propagates this information to the peers. + * + * Notifies remote replicas that the local replica has synchronised coordination + * information for this epoch. + */ +public class AccordSyncPropagator +{ + private static final Logger logger = LoggerFactory.getLogger(AccordSyncPropagator.class); + + public static final IVerbHandler verbHandler = message -> { + if (!AccordService.isSetup()) + return; + AccordService.instance().receive(message); + }; + + interface Listener + { + void onEndpointAck(Node.Id id, long epoch); + void onComplete(long epoch); + } + + private interface ReportPending + { + Notification report(PendingEpoch epoch, T value); + } + + static class PendingEpoch + { + final long epoch; + ImmutableSet syncComplete = ImmutableSet.of(); // TODO (desired): propagate ack's for other nodes + Ranges closed = Ranges.EMPTY, retired = Ranges.EMPTY; + + PendingEpoch(long epoch) + { + this.epoch = epoch; + } + + Notification syncComplete(Node.Id newSyncComplete) + { + if (syncComplete.contains(newSyncComplete)) + return null; + + syncComplete = ImmutableSet.builder() + .addAll(syncComplete) + .add(newSyncComplete) + .build(); + + return new Notification(epoch, Collections.singleton(newSyncComplete), Ranges.EMPTY, Ranges.EMPTY); + } + + Notification closed(Ranges addClosed) + { + if (closed.containsAll(addClosed)) + return null; + + addClosed = addClosed.without(closed); + closed = closed.with(addClosed); + return new Notification(epoch, Collections.emptySet(), addClosed, Ranges.EMPTY); + } + + Notification retired(Ranges addRetired) + { + if (retired.containsAll(addRetired)) + return null; + + addRetired = addRetired.without(retired); + retired = retired.with(addRetired); + return new Notification(epoch, Collections.emptySet(), Ranges.EMPTY, addRetired); + } + + boolean isEmpty() + { + return syncComplete.isEmpty() && closed.isEmpty() && retired.isEmpty(); + } + + boolean ack(Notification notification) + { + if (!notification.syncComplete.isEmpty()) + { + if (notification.syncComplete.containsAll(syncComplete)) syncComplete = ImmutableSet.of(); + else syncComplete = ImmutableSet.copyOf(Iterables.filter(syncComplete, v -> !notification.syncComplete.contains(v))); + } + closed = closed.without(notification.closed); + retired = retired.without(notification.retired); + return syncComplete.isEmpty() && closed.isEmpty() && retired.isEmpty(); + } + + @Override + public String toString() + { + return "PendingEpoch{" + + "epoch=" + epoch + + ", syncComplete=" + syncComplete + + ", closed=" + closed + + ", retired=" + retired + + '}'; + } + } + + static class PendingEpochs extends Long2ObjectHashMap + { + boolean ack(Notification notification) + { + PendingEpoch epoch = get(notification.epoch); + if (epoch != null && epoch.ack(notification)) + remove(notification.epoch); + return isEmpty(); + } + } + + static class PendingNodes extends Int2ObjectHashMap + { + boolean ack(Node.Id id, Notification notifications) + { + PendingEpochs node = get(id.id); + if (node == null) + return true; + + if (!node.ack(notifications)) + return false; + + remove(id.id); + return true; + } + } + + private final PendingNodes pending = new PendingNodes(); + private final Node.Id localId; + private final AccordEndpointMapper endpointMapper; + private final MessageDelivery messagingService; + private final IFailureDetector failureDetector; + private final ScheduledExecutorPlus scheduler; + private final Listener listener; + + public AccordSyncPropagator(Node.Id localId, AccordEndpointMapper endpointMapper, + MessageDelivery messagingService, IFailureDetector failureDetector, ScheduledExecutorPlus scheduler, + Listener listener) + { + this.localId = localId; + this.endpointMapper = endpointMapper; + this.messagingService = messagingService; + this.failureDetector = failureDetector; + this.scheduler = scheduler; + this.listener = listener; + } + + boolean hasPending() + { + return !pending.isEmpty(); + } + + synchronized boolean hasPending(long epoch) + { + if (pending.isEmpty()) return false; + return pending.values().stream().allMatch(n -> { + PendingEpoch p = n.get(epoch); + return p != null && !p.isEmpty(); + }); + } + + @Override + public String toString() + { + return "AccordSyncPropagator{" + + "localId=" + localId + + ", pending=" + pending + + '}'; + } + + public void onNodesRemoved(Node.Id removed) + { + long[] toAck; + boolean[] syncCompletedFor; + + synchronized (AccordSyncPropagator.this) + { + PendingEpochs pendingEpochs = pending.remove(removed.id); + if (pendingEpochs == null) return; + toAck = new long[pendingEpochs.size()]; + syncCompletedFor = new boolean[pendingEpochs.size()]; + Long2ObjectHashMap.KeyIterator it = pendingEpochs.keySet().iterator(); + for (int i = 0; it.hasNext(); i++) + { + long epoch = it.nextLong(); + toAck[i] = epoch; + syncCompletedFor[i] = hasSyncCompletedFor(epoch); + } + Arrays.sort(toAck); + } + + for (int i = 0; i < toAck.length; i++) + { + long epoch = toAck[i]; + listener.onEndpointAck(removed, epoch); + if (syncCompletedFor[i]) + listener.onComplete(epoch); + } + } + + public void reportSyncComplete(long epoch, Collection notify, Node.Id syncCompleteId) + { + if (notify.isEmpty()) + { + listener.onComplete(epoch); + return; + } + report(epoch, notify, PendingEpoch::syncComplete, syncCompleteId); + } + + public void reportClosed(long epoch, Collection notify, Ranges closed) + { + report(epoch, notify, PendingEpoch::closed, closed); + } + + public void reportRetired(long epoch, Collection notify, Ranges retired) + { + report(epoch, notify, PendingEpoch::retired, retired); + } + + private synchronized void report(long epoch, Collection notify, ReportPending report, T param) + { + // TODO (efficiency, now): for larger clusters this can be a problem as we trigger 1 msg for each instance, so in a 1k cluster its 1k messages; this can cause a thundering herd problem + // this is mostly a problem for reportSyncComplete as we include every node in the cluster, for reportClosed/reportRetired these tend to use only the nodes that are replicas of the range, + // and there is currently an assumption that sub-ranges are done, so only impacting a handful of nodes. + // TODO (correctness, now): during a host replacement multiple epochs are generated (move the range, remove the node), so its possible that notify will never be able to send the notification as the node is leaving the cluster + notify.forEach(id -> { + PendingEpoch pendingEpoch = pending.computeIfAbsent(id.id, ignore -> new PendingEpochs()) + .computeIfAbsent(epoch, PendingEpoch::new); + Notification notification = report.report(pendingEpoch, param); + if (notification != null) + notify(id, notification); + }); + } + + private boolean hasSyncCompletedFor(long epoch) + { + return pending.values().stream().noneMatch(node -> { + PendingEpoch pending = node.get(epoch); + if (pending == null) + return false; + return !pending.syncComplete.isEmpty(); + }); + } + + private boolean notify(Node.Id to, Notification notification) + { + InetAddressAndPort toEp = endpointMapper.mappedEndpoint(to); + Message msg = Message.out(Verb.ACCORD_SYNC_NOTIFY_REQ, notification); + RequestCallback cb = new RequestCallback<>() + { + @Override + public void onResponse(Message msg) + { + Invariants.require(msg.payload == SimpleReply.Ok, "Unexpected message: %s", msg); + Set completedEpochs = new HashSet<>(); + synchronized (AccordSyncPropagator.this) + { + pending.ack(to, notification); + long epoch = notification.epoch; + if (notification.syncComplete.contains(localId)) + { + if (hasSyncCompletedFor(epoch)) + completedEpochs.add(epoch); + } + } + + long epoch = notification.epoch; + listener.onEndpointAck(to, epoch); + if (completedEpochs.contains(epoch)) + listener.onComplete(epoch); + } + + @Override + public void onFailure(InetAddressAndPort from, RequestFailure failure) + { + scheduler.schedule(() -> AccordSyncPropagator.this.notify(to, notification), 1, TimeUnit.SECONDS); + } + + @Override + public boolean invokeOnFailure() + { + return true; + } + }; + if (!failureDetector.isAlive(toEp)) + { + // was the endpoint removed from membership? + ClusterMetadata metadata = ClusterMetadata.current(); + if (Gossiper.instance.getEndpointStateForEndpoint(toEp) == null && !metadata.directory.allJoinedEndpoints().contains(toEp) && !metadata.fullCMSMembers().contains(toEp)) + { + // endpoint no longer exists... + cb.onResponse(msg.responseWith(SimpleReply.Ok)); + return true; + } + logger.warn("Node{} is not alive, unable to notify of {}", to, notification); + scheduler.schedule(() -> notify(to, notification), 1, TimeUnit.MINUTES); + return false; + } + messagingService.sendWithCallback(msg, toEp, cb); + return true; + } + + public static class Notification + { + public static final UnversionedSerializer serializer = new UnversionedSerializer() + { + @Override + public void serialize(Notification notification, DataOutputPlus out) throws IOException + { + out.writeLong(notification.epoch); + CollectionSerializers.serializeCollection(notification.syncComplete, out, TopologySerializers.nodeId); + KeySerializers.ranges.serialize(notification.closed, out); + KeySerializers.ranges.serialize(notification.retired, out); + } + + @Override + public Notification deserialize(DataInputPlus in) throws IOException + { + return new Notification(in.readLong(), + CollectionSerializers.deserializeList(in, TopologySerializers.nodeId), + KeySerializers.ranges.deserialize(in), + KeySerializers.ranges.deserialize(in)); + } + + @Override + public long serializedSize(Notification notification) + { + return TypeSizes.LONG_SIZE + + CollectionSerializers.serializedCollectionSize(notification.syncComplete, TopologySerializers.nodeId) + + KeySerializers.ranges.serializedSize(notification.closed) + + KeySerializers.ranges.serializedSize(notification.retired); + } + }; + + final long epoch; + final Collection syncComplete; + final Ranges closed, retired; + + public Notification(long epoch, Collection syncComplete, Ranges closed, Ranges retired) + { + this.epoch = epoch; + this.syncComplete = syncComplete; + this.closed = closed; + this.retired = retired; + } + + @Override + public String toString() + { + return "Notification{" + + "epoch=" + epoch + + ", syncComplete=" + syncComplete + + ", closed=" + closed + + ", retired=" + retired + + '}'; + } + } +} diff --git a/src/java/org/apache/cassandra/service/accord/AccordTask.java b/src/java/org/apache/cassandra/service/accord/AccordTask.java new file mode 100644 index 000000000000..9930d2093eac --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/AccordTask.java @@ -0,0 +1,1118 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.service.accord; + +import java.util.ArrayDeque; +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.CancellationException; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicLong; +import java.util.function.BiConsumer; +import java.util.function.BiFunction; +import java.util.function.Consumer; +import java.util.function.Function; +import javax.annotation.Nullable; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.slf4j.MDC; + +import accord.api.Journal; +import accord.api.RoutingKey; +import accord.local.Command; +import accord.local.CommandStore; +import accord.local.PreLoadContext; +import accord.local.SafeCommandStore; +import accord.local.cfk.CommandsForKey; +import accord.primitives.AbstractRanges; +import accord.primitives.AbstractUnseekableKeys; +import accord.primitives.Range; +import accord.primitives.Ranges; +import accord.primitives.TxnId; +import accord.primitives.Unseekables; +import accord.utils.Invariants; +import accord.utils.async.AsyncChain; +import accord.utils.async.AsyncChains; +import accord.utils.async.Cancellable; +import org.agrona.collections.Object2ObjectHashMap; +import org.agrona.collections.ObjectHashSet; +import org.apache.cassandra.service.accord.AccordCacheEntry.Status; +import org.apache.cassandra.service.accord.AccordCommandStore.Caches; +import org.apache.cassandra.service.accord.AccordExecutor.Task; +import org.apache.cassandra.service.accord.AccordExecutor.TaskQueue; +import org.apache.cassandra.service.accord.AccordKeyspace.CommandsForKeyAccessor; +import org.apache.cassandra.service.accord.api.TokenKey; +import org.apache.cassandra.utils.Clock; +import org.apache.cassandra.utils.NoSpamLogger; +import org.apache.cassandra.utils.concurrent.Condition; + +import static accord.primitives.Routable.Domain.Key; +import static accord.primitives.Txn.Kind.EphemeralRead; +import static accord.utils.Invariants.illegalState; +import static org.apache.cassandra.config.CassandraRelevantProperties.DTEST_ACCORD_JOURNAL_SANITY_CHECK_ENABLED; +import static org.apache.cassandra.config.DatabaseDescriptor.getPartitioner; +import static org.apache.cassandra.service.accord.AccordTask.State.CANCELLED; +import static org.apache.cassandra.service.accord.AccordTask.State.FAILED; +import static org.apache.cassandra.service.accord.AccordTask.State.FAILING; +import static org.apache.cassandra.service.accord.AccordTask.State.FINISHED; +import static org.apache.cassandra.service.accord.AccordTask.State.INITIALIZED; +import static org.apache.cassandra.service.accord.AccordTask.State.LOADING; +import static org.apache.cassandra.service.accord.AccordTask.State.PERSISTING; +import static org.apache.cassandra.service.accord.AccordTask.State.RUNNING; +import static org.apache.cassandra.service.accord.AccordTask.State.SCANNING_RANGES; +import static org.apache.cassandra.service.accord.AccordTask.State.WAITING_TO_LOAD; +import static org.apache.cassandra.service.accord.AccordTask.State.WAITING_TO_RUN; +import static org.apache.cassandra.service.accord.AccordTask.State.WAITING_TO_SCAN_RANGES; +import static org.apache.cassandra.utils.Clock.Global.nanoTime; + +public abstract class AccordTask extends Task implements Runnable, Function, Cancellable +{ + private static final Logger logger = LoggerFactory.getLogger(AccordTask.class); + private static final NoSpamLogger noSpamLogger = NoSpamLogger.getLogger(logger, 1, TimeUnit.MINUTES); + private static final boolean SANITY_CHECK = DTEST_ACCORD_JOURNAL_SANITY_CHECK_ENABLED.getBoolean(); + + private static class LoggingProps + { + private static final String COMMAND_STORE = "command_store"; + private static final String ACCORD_TASK = "accord_task"; + } + + static class ForFunction extends AccordTask + { + private final Function function; + + public ForFunction(AccordCommandStore commandStore, PreLoadContext loadCtx, Function function) + { + super(commandStore, loadCtx); + this.function = function; + } + + @Override + public R apply(SafeCommandStore commandStore) + { + return function.apply(commandStore); + } + } + + // TODO (desired): these anonymous ops are somewhat tricky to debug. We may want to at least give them names. + static class ForConsumer extends AccordTask + { + private final Consumer consumer; + + private ForConsumer(AccordCommandStore commandStore, PreLoadContext loadCtx, Consumer consumer) + { + super(commandStore, loadCtx); + this.consumer = consumer; + } + + @Override + public Void apply(SafeCommandStore commandStore) + { + consumer.accept(commandStore); + return null; + } + } + + public static AccordTask create(CommandStore commandStore, PreLoadContext ctx, Function function) + { + return new ForFunction<>((AccordCommandStore) commandStore, ctx, function); + } + + public static AccordTask create(CommandStore commandStore, PreLoadContext ctx, Consumer consumer) + { + return new ForConsumer((AccordCommandStore) commandStore, ctx, consumer); + } + + public enum State + { + INITIALIZED(), + WAITING_TO_SCAN_RANGES(INITIALIZED), + SCANNING_RANGES(WAITING_TO_SCAN_RANGES), + WAITING_TO_LOAD(INITIALIZED, SCANNING_RANGES), + LOADING(INITIALIZED, SCANNING_RANGES, WAITING_TO_LOAD), + WAITING_TO_RUN(INITIALIZED, SCANNING_RANGES, WAITING_TO_LOAD, LOADING), + RUNNING(WAITING_TO_RUN), + PERSISTING(RUNNING), + FAILING(WAITING_TO_SCAN_RANGES, SCANNING_RANGES, WAITING_TO_LOAD, LOADING, WAITING_TO_RUN, RUNNING, PERSISTING), + FINISHED(RUNNING, PERSISTING), + CANCELLED(WAITING_TO_SCAN_RANGES, SCANNING_RANGES, WAITING_TO_LOAD, LOADING, WAITING_TO_RUN), + FAILED(WAITING_TO_SCAN_RANGES, SCANNING_RANGES, WAITING_TO_LOAD, LOADING, WAITING_TO_RUN, RUNNING, PERSISTING, FAILING); + + private final int permittedFrom; + + State() + { + this.permittedFrom = 0; + } + + State(State ... permittedFroms) + { + int permittedFrom = 0; + for (State state : permittedFroms) + permittedFrom |= 1 << state.ordinal(); + this.permittedFrom = permittedFrom; + } + + boolean isPermittedFrom(State prev) + { + return (permittedFrom & (1 << prev.ordinal())) != 0; + } + + boolean isExecuted() + { + return this.compareTo(PERSISTING) >= 0; + } + + boolean isComplete() + { + return this.compareTo(FINISHED) >= 0; + } + } + + private State state = INITIALIZED; + private final PreLoadContext preLoadContext; + private final String loggingId; + private static final AtomicLong nextLoggingId = new AtomicLong(Clock.Global.currentTimeMillis()); + + // TODO (desired): merge all of these maps into one + @Nullable Object2ObjectHashMap commands; + @Nullable Object2ObjectHashMap commandsForKey; + @Nullable Object2ObjectHashMap> loading; + // TODO (desired): collection supporting faster deletes but still fast poll (e.g. some ordered collection) + @Nullable ArrayDeque> waitingToLoad; + @Nullable RangeTxnScanner rangeScanner; + boolean hasRanges; + @Nullable CommandsForRanges commandsForRanges; + @Nullable private TaskQueue queued; + + private BiConsumer callback; + private List sanityCheck; + public long createdAt = nanoTime(), loadedAt, runQueuedAt, runAt, completedAt; + + private void setLoggingIds() + { + MDC.put(LoggingProps.COMMAND_STORE, commandStore.loggingId); + MDC.put(LoggingProps.ACCORD_TASK, loggingId); + } + + private void clearLoggingIds() + { + MDC.remove(LoggingProps.COMMAND_STORE); + MDC.remove(LoggingProps.ACCORD_TASK); + } + + public AccordTask(AccordCommandStore commandStore, PreLoadContext preLoadContext) + { + super(commandStore); + this.loggingId = "0x" + Long.toHexString(nextLoggingId.incrementAndGet()); + this.preLoadContext = preLoadContext; + + if (logger.isTraceEnabled()) + { + setLoggingIds(); + logger.trace("Created {} on {}", this, commandStore); + clearLoggingIds(); + } + } + + @Override + public String toString() + { + return "AccordTask{" + state + "}-" + loggingId; + } + + public String toDescription() + { + return "AccordTask{" + state + "}-" + loggingId + ": " + + (queued == null ? "unqueued" : queued.kind) + + ", primaryTxnId: " + preLoadContext.primaryTxnId() + + ", waitingToLoad: " + summarise(waitingToLoad) + + ", loading:" + summarise(loading, AccordSafeState::global) + + ", cfks:" + summarise(commandsForKey, AccordSafeState::global) + + ", txns:" + summarise(commands, AccordSafeState::global); + + } + + private static String summarise(Map map, Function transform) + { + if (map == null) + return "null"; + + return summarise(map.values(), transform); + } + + private static String summarise(Collection collection) + { + return summarise(collection, Function.identity()); + } + + private static String summarise(Collection collection, Function transform) + { + if (collection == null) + return "null"; + + StringBuilder out = new StringBuilder("["); + int count = 0; + for (V v : collection) + { + if (count++ > 0) + { + out.append(','); + if (count >= 10) + { + out.append("...(*").append(collection.size() - 10).append(')'); + break; + } + } + out.append(transform.apply(v)); + } + out.append(']'); + return out.toString(); + } + + private void state(State state) + { + Invariants.require(state.isPermittedFrom(this.state), "%s forbidden from %s", state, this, AccordTask::toDescription); + this.state = state; + if (state == WAITING_TO_RUN) + { + Invariants.require(rangeScanner == null || rangeScanner.scanned); + Invariants.require(loading == null && waitingToLoad == null, "WAITING_TO_RUN => no loading or waiting; found %s", this, AccordTask::toDescription); + loadedAt = nanoTime(); + } + else if (state == RUNNING) + { + runAt = nanoTime(); + } + else if (state.isExecuted()) + { + completedAt = nanoTime(); + } + } + + Unseekables keys() + { + return preLoadContext.keys(); + } + + public AsyncChain chain() + { + return new AsyncChains.Head<>() + { + @Override + protected Cancellable start(BiConsumer callback) + { + Invariants.require(AccordTask.this.callback == null); + AccordTask.this.callback = callback; + commandStore.tryPreSetup(AccordTask.this); + commandStore.executor().submit(AccordTask.this); + return AccordTask.this; + } + }; + } + + // to be invoked only by the CommandStore owning thread, to take references to objects already in use by the current execution + public void presetup(AccordTask parent) + { + // note we use the caches "unsafely" here deliberately, as we only reference commands we already have references to + // so we do not mutate anything, except the atomic counter of references + if (parent.commands != null) + { + for (TxnId txnId : preLoadContext.txnIds()) + presetupExclusive(txnId, AccordTask::ensureCommands, parent.commands, commandStore.cachesUnsafe().commands()); + } + + if (parent.commandsForKey == null) return; + if (preLoadContext.keys().domain() != Key) return; + switch (preLoadContext.keyHistory()) + { + default: throw new AssertionError("Unhandled KeyHistory: " + preLoadContext.keyHistory()); + case NONE: + break; + + case ASYNC: + case RECOVER: + case INCR: + case SYNC: + for (RoutingKey key : (AbstractUnseekableKeys)preLoadContext.keys()) + presetupExclusive(key, AccordTask::ensureCommandsForKey, parent.commandsForKey, commandStore.cachesUnsafe().commandsForKeys()); + break; + } + } + + public void setupExclusive() + { + setupInternal(commandStore.cachesExclusive()); + state(rangeScanner != null ? WAITING_TO_SCAN_RANGES + : waitingToLoad != null ? State.WAITING_TO_LOAD + : loading != null ? LOADING : WAITING_TO_RUN); + } + + private void setupInternal(Caches caches) + { + { + boolean hasPreSetup = commands != null; + for (TxnId txnId : preLoadContext.txnIds()) + { + if (hasPreSetup && completePresetupExclusive(txnId, commands, caches.commands())) + continue; + setupExclusive(txnId, AccordTask::ensureCommands, caches.commands()); + } + } + + if (preLoadContext.keys().isEmpty()) + return; + + switch (preLoadContext.keys().domain()) + { + case Key: setupKeyLoadsExclusive(caches, (AbstractUnseekableKeys)preLoadContext.keys(), false); break; + case Range: setupRangeLoadsExclusive(caches); + } + } + + private void setupKeyLoadsExclusive(Caches caches, Iterable keys, boolean isToCompleteRangeScan) + { + switch (preLoadContext.keyHistory()) + { + default: throw new AssertionError("Unhandled KeyHistory: " + preLoadContext.keyHistory()); + case NONE: + break; + + case RECOVER: + if (!isToCompleteRangeScan) + { + Invariants.require(rangeScanner == null); + rangeScanner = new RangeTxnScanner(); + } + + case ASYNC: + case INCR: + case SYNC: + { + boolean hasPreSetup = commandsForKey != null; + for (RoutingKey key : keys) + { + if (hasPreSetup && completePresetupExclusive(key, commandsForKey, caches.commandsForKeys())) continue; + setupExclusive(key, AccordTask::ensureCommandsForKey, caches.commandsForKeys()); + } + break; + } + } + } + + private void setupRangeLoadsExclusive(Caches caches) + { + switch (preLoadContext.keyHistory()) + { + default: throw new AssertionError("Unhandled KeyHistory: " + preLoadContext.keyHistory()); + case NONE: + case ASYNC: + break; + + case INCR: + throw new AssertionError("Incremental mode should only be used with an explicit list of keys"); + + case RECOVER: + case SYNC: + hasRanges = true; + rangeScanner = new RangeTxnAndKeyScanner(caches.commandsForKeys()); + } + } + + // expects mutual exclusivity only on the command store + private > void presetupExclusive(K k, Function, Map> loaded, Map parentMap, AccordCache.Type.Instance cache) + { + AccordSafeState ref = parentMap.get(k); + if (ref == null) + return; + + AccordCacheEntry node = ref.global(); + int refs = node.increment(); + Invariants.require(refs > 1); + loaded.apply(this).put(k, cache.parent().adapter().safeRef(node)); + } + + // expects to hold lock + private > boolean completePresetupExclusive(K k, Map map, AccordCache.Type.Instance cache) + { + AccordSafeState preacquired = map.get(k); + if (preacquired != null) + { + cache.recordPreAcquired(preacquired); + return true; + } + return false; + } + + // expects to hold lock + private > void setupExclusive(K k, Function, Map> loaded, AccordCache.Type.Instance cache) + { + S safeRef = cache.acquire(k); + Status entryStatus = safeRef.global().status(); + Map map; + switch (entryStatus) + { + default: throw new IllegalStateException("Unhandled global state: " + entryStatus); + case WAITING_TO_LOAD: + case LOADING: + map = ensureLoading(); + break; + case SAVING: + case LOADED: + case MODIFIED: + case FAILED_TO_SAVE: + map = loaded.apply(this); + } + + Object prev = map.putIfAbsent(k, safeRef); + if (prev != null) + { + noSpamLogger.warn("PreLoadContext {} contained key {} more than once", map, k); + cache.release(safeRef, this); + } + else if (map == loading) + { + if (entryStatus == Status.WAITING_TO_LOAD) + ensureWaitingToLoad().add(safeRef.global()); + safeRef.global().loadingOrWaiting().add(this); + Invariants.paranoid(safeRef.global().loadingOrWaiting().waiters().size() == safeRef.global().references()); + } + } + + // expects to hold lock + public boolean onLoad(AccordCacheEntry state) + { + AccordSafeState safeRef = loading == null ? null : loading.remove(state.key()); + Invariants.require(safeRef != null && safeRef.global() == state, "Expected to find %s loading; found %s", state, this, AccordTask::toDescription); + if (safeRef.getClass() == AccordSafeCommand.class) + ensureCommands().put((TxnId)state.key(), (AccordSafeCommand) safeRef); + else + ensureCommandsForKey().put((RoutingKey) state.key(), (AccordSafeCommandsForKey) safeRef); + + if (!loading.isEmpty()) + return false; + + loading = null; + if (this.state.compareTo(State.WAITING_TO_LOAD) < 0) + return false; + + Invariants.require(waitingToLoad == null, "Invalid state: %s", this, AccordTask::toDescription); + state(WAITING_TO_RUN); + return true; + } + + // expects to hold lock + public boolean onLoading(AccordCacheEntry state) + { + boolean removed = waitingToLoad != null && waitingToLoad.remove(state); + Invariants.require(removed, "%s not found in waitingToLoad %s", state, this, AccordTask::toDescription); + if (!waitingToLoad.isEmpty()) + return false; + + return onEmptyWaitingToLoad(); + } + + private boolean onEmptyWaitingToLoad() + { + waitingToLoad = null; + if (this.state.compareTo(State.WAITING_TO_LOAD) < 0) + return false; + + state(loading == null ? WAITING_TO_RUN : LOADING); + return true; + } + + public PreLoadContext preLoadContext() + { + return preLoadContext; + } + + public Map commands() + { + return commands; + } + + public Map ensureCommands() + { + if (commands == null) + commands = new Object2ObjectHashMap<>(); + return commands; + } + + public Map commandsForKey() + { + return commandsForKey; + } + + public Map ensureCommandsForKey() + { + if (commandsForKey == null) + commandsForKey = new Object2ObjectHashMap<>(); + return commandsForKey; + } + + private Map> ensureLoading() + { + if (loading == null) + loading = new Object2ObjectHashMap<>(); + return loading; + } + + private ArrayDeque> ensureWaitingToLoad() + { + Invariants.require(state.compareTo(WAITING_TO_LOAD) <= 0, "Expected status to be on or before WAITING_TO_LOAD; found %s", this, AccordTask::toDescription); + if (waitingToLoad == null) + waitingToLoad = new ArrayDeque<>(); + return waitingToLoad; + } + + public AccordCacheEntry pollWaitingToLoad() + { + Invariants.require(state == State.WAITING_TO_LOAD, "Expected status to be WAITING_TO_LOAD; found %s", this, AccordTask::toDescription); + if (waitingToLoad == null) + return null; + + AccordCacheEntry next = waitingToLoad.poll(); + if (waitingToLoad.isEmpty()) + onEmptyWaitingToLoad(); + return next; + } + + public AccordCacheEntry peekWaitingToLoad() + { + return waitingToLoad == null ? null : waitingToLoad.peek(); + } + + private void maybeSanityCheck(AccordSafeCommand safeCommand) + { + if (SANITY_CHECK) + { + if (sanityCheck == null) + sanityCheck = new ArrayList<>(commands.size()); + sanityCheck.add(safeCommand.current()); + } + } + + private void save(List diffs, Runnable onFlush) + { + if (sanityCheck != null) + { + Invariants.require(SANITY_CHECK); + Condition condition = Condition.newOneTimeCondition(); + this.commandStore.appendCommands(diffs, condition::signal); + condition.awaitUninterruptibly(); + + for (Command check : sanityCheck) + this.commandStore.sanityCheckCommand(commandStore.unsafeGetRedundantBefore(), check); + + if (onFlush != null) onFlush.run(); + } + else + { + this.commandStore.appendCommands(diffs, onFlush); + } + } + + @Override + protected void preRunExclusive() + { + state(RUNNING); + if (rangeScanner != null) + { + commandsForRanges = rangeScanner.finish(commandStore.cachesExclusive()); + rangeScanner = null; + } + if (commands != null) + commands.forEach((k, v) -> v.preExecute()); + if (commandsForKey != null) + commandsForKey.forEach((k, v) -> v.preExecute()); + } + + @Override + public void run() + { + setLoggingIds(); + logger.trace("Running {} with state {}", this, state); + AccordSafeCommandStore safeStore = null; + try + { + if (state != RUNNING) + throw illegalState("Unexpected state " + toDescription()); + + safeStore = commandStore.begin(this, commandsForRanges); + R result = apply(safeStore); + + List changes = null; + if (commands != null) + { + for (AccordSafeCommand safeCommand : commands.values()) + { + if (safeCommand.txnId().is(EphemeralRead)) + continue; + + Journal.CommandUpdate diff = safeCommand.update(); + if (diff == null) + continue; + + if (changes == null) + changes = new ArrayList<>(commands.size()); + changes.add(diff); + + maybeSanityCheck(safeCommand); + } + } + + boolean flush = changes != null || safeStore.fieldUpdates() != null; + if (flush) + { + state(PERSISTING); + Runnable onFlush = () -> finish(result, null); + safeStore.persistFieldUpdatesInternal(changes == null ? onFlush : null); + if (changes != null) save(changes, onFlush); + } + + commandStore.complete(safeStore); + safeStore = null; + if (!flush) + finish(result, null); + } + catch (Throwable t) + { + if (safeStore != null) + { + revert(); + commandStore.abort(safeStore); + } + throw t; + } + finally + { + logger.trace("Exiting {}", this); + clearLoggingIds(); + } + } + + public void fail(Throwable throwable) + { + commandStore.agent().onUncaughtException(throwable); + if (state.isComplete()) + return; + + if (commandStore.hasSafeStore()) + commandStore.agent().onUncaughtException(new IllegalStateException(String.format("Failure to cleanup safe store for %s; status=%s", this, state), throwable)); + + state(FAILING); + if (callback != null) + callback.accept(null, throwable); + } + + public void failExclusive(Throwable throwable) + { + boolean newFailure = state != FAILING; + try + { + if (newFailure) + { + commandStore.agent().onUncaughtException(throwable); + if (state.isComplete()) + return; + + if (commandStore.hasSafeStore()) + commandStore.agent().onUncaughtException(new IllegalStateException(String.format("Failure to cleanup safe store for %s; status=%s", this, state), throwable)); + } + + state(FAILED); + } + finally + { + if (newFailure && callback != null) + callback.accept(null, throwable); + } + } + + protected void cleanupExclusive() + { + releaseResources(commandStore.cachesExclusive()); + if (state == FAILING) + state(FAILED); + } + + @Nullable + public RangeTxnScanner rangeScanner() + { + return rangeScanner; + } + + public boolean hasRanges() + { + return hasRanges; + } + + @Override + public void cancel() + { + commandStore.executor().cancel(this); + } + + public void cancelExclusive() + { + releaseResources(commandStore.cachesExclusive()); + state(CANCELLED); + if (callback != null) + callback.accept(null, new CancellationException()); + } + + public State state() + { + return state; + } + + private void finish(R result, Throwable failure) + { + state(failure == null ? FINISHED : FAILED); + if (callback != null) + callback.accept(result, failure); + } + + void releaseResources(Caches caches) + { + try + { + // TODO (expected): we should destructively iterate to avoid invoking second time in fail; or else read and set to null + if (rangeScanner != null) + { + rangeScanner.cleanup(caches); + rangeScanner = null; + } + if (commands != null) + { + commands.forEach((k, v) -> caches.commands().release(v, this)); + commands.clear(); + commands = null; + } + if (commandsForKey != null) + { + commandsForKey.forEach((k, v) -> caches.commandsForKeys().release(v, this)); + commandsForKey.clear(); + commandsForKey = null; + } + if (waitingToLoad != null) + { + while (!waitingToLoad.isEmpty()) + waitingToLoad.poll().loadingOrWaiting().remove(this); + waitingToLoad = null; + } + if (loading != null) + { + loading.forEach((k, v) -> caches.global().release(v, this)); + loading.clear(); + loading = null; + } + } + catch (Throwable t) + { + releaseResourcesSlow(caches, t); + throw t; + } + } + + private void releaseResourcesSlow(Caches caches, Throwable suppressedBy) + { + if (commands != null) + { + safeRelease(commands, caches.commands(), suppressedBy); + commands.clear(); + commands = null; + } + if (commandsForKey != null) + { + safeRelease(commandsForKey, caches.commandsForKeys(), suppressedBy); + commandsForKey.clear(); + commandsForKey = null; + } + if (waitingToLoad != null) + { + while (!waitingToLoad.isEmpty()) + { + try { waitingToLoad.poll().loadingOrWaiting().remove(this); } + catch (Throwable t) { suppressedBy.addSuppressed(t); } + } + waitingToLoad = null; + } + if (loading != null) + { + safeRelease(loading, caches.global(), suppressedBy); + loading.clear(); + loading = null; + } + } + + private void safeRelease(Map> map, AccordCache.Type.Instance cache, Throwable suppressedBy) + { + for (AccordSafeState safeState : map.values()) + { + if (safeState.invalidated()) continue; + try { cache.release(safeState, this); } + catch (Throwable t) { suppressedBy.addSuppressed(t); } + } + } + + private void safeRelease(Map> map, AccordCache cache, Throwable suppressedBy) + { + for (AccordSafeState safeState : map.values()) + { + if (safeState.invalidated()) continue; + try { cache.release(safeState, this); } + catch (Throwable t) { suppressedBy.addSuppressed(t); } + } + } + + void revert() + { + if (commands != null) + commands.forEach((k, v) -> v.revert()); + if (commandsForKey != null) + commandsForKey.forEach((k, v) -> v.revert()); + } + + protected void addToQueue(TaskQueue queue) + { + if (state == CANCELLED) + return; + + Invariants.require(queue.kind == state || (queue.kind == State.WAITING_TO_LOAD && state == WAITING_TO_SCAN_RANGES), "Invalid queue type: %s vs %s", queue.kind, this, AccordTask::toDescription); + Invariants.require(this.queued == null, "Already queued with state: %s", this, AccordTask::toDescription); + queued = queue; + queue.append(this); + } + + @Nullable + TaskQueue queued() + { + return queued; + } + + TaskQueue unqueue() + { + TaskQueue wasQueued = queued; + queued.remove(this); + queued = null; + return wasQueued; + } + + TaskQueue unqueueIfQueued() + { + if (queued == null) + return null; + return unqueue(); + } + + public class RangeTxnAndKeyScanner extends RangeTxnScanner + { + class KeyWatcher implements AccordCache.Listener + { + @Override + public void onUpdate(AccordCacheEntry state) + { + if (ranges.contains(state.key())) + reference(state); + } + } + + final Set intersectingKeys = new ObjectHashSet<>(); + final KeyWatcher keyWatcher = new KeyWatcher(); + final Ranges ranges = ((AbstractRanges) preLoadContext.keys()).toRanges(); + final AccordCache.Type.Instance commandsForKeyCache; + + public RangeTxnAndKeyScanner(AccordCache.Type.Instance commandsForKeyCache) + { + this.commandsForKeyCache = commandsForKeyCache; + } + + boolean scanned; + + void runInternal() + { + for (Range range : ranges) + { + CommandsForKeyAccessor.findAllKeysBetween(commandStore.id(), commandStore.tableId(), getPartitioner(), + (TokenKey) range.start(), range.startInclusive(), + (TokenKey) range.end(), range.endInclusive(), + intersectingKeys::add); + } + super.runInternal(); + } + + private void reference(AccordCacheEntry entry) + { + if (loading != null && loading.containsKey(entry.key())) + return; + + switch (entry.status()) + { + default: throw new AssertionError("Unhandled Status: " + entry.status()); + case WAITING_TO_LOAD: + case LOADING: + if (scanned) + // if we've finished scanning and not already taken a reference we shouldn't need to witness (unless modified) + return; + ensureLoading().put(entry.key(), commandsForKeyCache.acquire(entry)); + if (entry.status() == Status.WAITING_TO_LOAD) + ensureWaitingToLoad().add(entry); + entry.loadingOrWaiting().add(AccordTask.this); + return; + + case MODIFIED: + case SAVING: + case LOADED: + case FAILED_TO_SAVE: + if (commandsForKey != null && commandsForKey.containsKey(entry.key())) + return; + ensureCommandsForKey().putIfAbsent(entry.key(), commandsForKeyCache.acquire(entry)); + } + } + + void startInternal(Caches caches) + { + for (RoutingKey key : caches.commandsForKeys().keySet()) + { + if (ranges.contains(key)) + intersectingKeys.add((TokenKey) key); + } + caches.commandsForKeys().register(keyWatcher); + super.startInternal(caches); + } + + void scannedInternal() + { + if (commandsForKey != null) + intersectingKeys.removeAll(commandsForKey.keySet()); + if (loading != null) + intersectingKeys.removeAll(loading.keySet()); + setupKeyLoadsExclusive(commandStore.cachesExclusive(), intersectingKeys, true); + super.scannedInternal(); + } + + void cleanup(Caches caches) + { + caches.commandsForKeys().tryUnregister(keyWatcher); + super.cleanup(caches); + } + + CommandsForRanges finish(Caches caches) + { + caches.commandsForKeys().unregister(keyWatcher); + return super.finish(caches); + } + } + + public class RangeTxnScanner implements Runnable + { + class CommandWatcher implements AccordCache.Listener + { + @Override + public void onUpdate(AccordCacheEntry state) + { + CommandsForRanges.Summary summary = summaryLoader.ifRelevant(state); + if (summary != null) + summaries.put(summary.txnId, summary); + } + } + + final ConcurrentHashMap summaries = new ConcurrentHashMap<>(); + // TODO (expected): produce key summaries to avoid locking all in memory + final CommandWatcher commandWatcher = new CommandWatcher(); + final Unseekables keysOrRanges = preLoadContext.keys(); + + CommandsForRanges.Loader summaryLoader; + boolean scanned; + + @Override + public void run() + { + try + { + runInternal(); + } + catch (Throwable t) + { + commandStore.executor().onScannedRanges(AccordTask.this, t); + throw t; + } + commandStore.executor().onScannedRanges(AccordTask.this, null); + } + + void runInternal() + { + summaryLoader.intersects(txnId -> { + if (summaries.containsKey(txnId)) + return; + + CommandsForRanges.Summary summary = summaryLoader.load(txnId); + if (summary != null) + summaries.putIfAbsent(txnId, summary); + }); + } + + public void start(BiFunction executor) + { + Caches caches = commandStore.cachesExclusive(); + state(SCANNING_RANGES); + startInternal(caches); + executor.apply(AccordTask.this, this); + } + + void startInternal(Caches caches) + { + summaryLoader = commandStore.diskCommandsForRanges().loader(preLoadContext.primaryTxnId(), preLoadContext.keyHistory(), keysOrRanges); + summaryLoader.forEachInCache(summary -> summaries.put(summary.txnId, summary), caches); + caches.commands().register(commandWatcher); + } + + public void scannedExclusive() + { + Invariants.require(state == SCANNING_RANGES, "Expected SCANNING_RANGES; found %s", AccordTask.this, AccordTask::toDescription); + scanned = true; + scannedInternal(); + if (loading == null) state(WAITING_TO_RUN); + else if (waitingToLoad == null) state(LOADING); + else state(State.WAITING_TO_LOAD); + } + + void scannedInternal() + { + } + + void cleanup(Caches caches) + { + caches.commands().tryUnregister(commandWatcher); + } + + CommandsForRanges finish(Caches caches) + { + caches.commands().unregister(commandWatcher); + return new CommandsForRanges(summaries); + } + } + +} diff --git a/src/java/org/apache/cassandra/service/accord/AccordTopology.java b/src/java/org/apache/cassandra/service/accord/AccordTopology.java new file mode 100644 index 000000000000..ebfd99fd8a4b --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/AccordTopology.java @@ -0,0 +1,408 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Set; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeoutException; +import java.util.stream.Collectors; + +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.Iterables; +import com.google.common.collect.Sets; + +import accord.local.Node.Id; +import accord.primitives.Ranges; +import accord.topology.Shard; +import accord.topology.Topology; +import accord.utils.Invariants; +import accord.utils.SortedArrays.SortedArrayList; +import accord.utils.TinyEnumSet; +import org.apache.cassandra.db.Keyspace; +import org.apache.cassandra.dht.IPartitioner; +import org.apache.cassandra.dht.Range; +import org.apache.cassandra.dht.Token; +import org.apache.cassandra.locator.InetAddressAndPort; +import org.apache.cassandra.schema.Diff; +import org.apache.cassandra.schema.DistributedSchema; +import org.apache.cassandra.schema.KeyspaceMetadata; +import org.apache.cassandra.schema.Keyspaces; +import org.apache.cassandra.schema.ReplicationParams; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.service.accord.api.TokenKey; +import org.apache.cassandra.service.accord.fastpath.FastPathStrategy; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.tcm.ClusterMetadataService; +import org.apache.cassandra.tcm.Epoch; +import org.apache.cassandra.tcm.membership.Directory; +import org.apache.cassandra.tcm.membership.NodeId; +import org.apache.cassandra.tcm.ownership.DataPlacement; +import org.apache.cassandra.tcm.ownership.DataPlacements; +import org.apache.cassandra.tcm.ownership.VersionedEndpoints; +import org.apache.cassandra.utils.concurrent.UncheckedInterruptedException; + +import static java.util.concurrent.TimeUnit.MILLISECONDS; + +/** + * Deterministically computes accord topology from a ClusterMetadata instance + */ +public class AccordTopology +{ + public static Id tcmIdToAccord(NodeId nodeId) + { + return new Id(nodeId.id()); + } + + private static class ShardLookup extends HashMap + { + private Shard createOrReuse(TinyEnumSet flags, accord.primitives.Range range, SortedArrayList nodes, SortedArrayList fastPath, Set joining) + { + Shard prev = get(range); + if (prev != null + && prev.flags().bitset() == flags.bitset() + && prev.nodes.equals(nodes) + && prev.fastPathElectorateSize == fastPath.size() + && prev.nodes.without(prev.notInFastPath).equals(fastPath) + && joining.size() == prev.joining.size() && prev.joining.containsAll(joining)) + return prev; + + return Shard.create(range, nodes, fastPath, joining, flags); + } + } + + public static class KeyspaceShard + { + private final KeyspaceMetadata keyspace; + private final List> ranges; + private final SortedArrayList nodes; + private final Set pending; + + private KeyspaceShard(KeyspaceMetadata keyspace, List> ranges, SortedArrayList nodes, Set pending) + { + this.keyspace = keyspace; + this.ranges = ranges; + this.nodes = nodes; + this.pending = pending; + } + + // return the keyspace fast path strategy if the inherit keyspace strategy is used + private FastPathStrategy strategyFor(TableMetadata metadata) + { + FastPathStrategy tableStrategy = metadata.params.fastPath; + FastPathStrategy strategy = tableStrategy.kind() != FastPathStrategy.Kind.INHERIT_KEYSPACE + ? tableStrategy : keyspace.params.fastPath; + Invariants.require(strategy.kind() != FastPathStrategy.Kind.INHERIT_KEYSPACE); + return strategy; + } + + List createForTable(Epoch epoch, TableMetadata metadata, Set unavailable, Map dcMap, ShardLookup lookup) + { + Ranges ranges = this.ranges.stream() + .map(range -> Ranges.single(AccordTopology.range(metadata.id, range))) + .reduce(Ranges.EMPTY, Ranges::with) + .mergeTouching(); + + SortedArrayList electorate = strategyFor(metadata).calculateFastPath(nodes, unavailable, dcMap); + + List shards = new ArrayList<>(ranges.size()); + for (accord.primitives.Range range : ranges) + { + TinyEnumSet flags = Shard.NO_FLAGS; + if (metadata.params.pendingDrop) + flags = flags.with(Shard.Flag.PENDING_REMOVAL); + if (metadata.epoch.isEqualOrAfter(epoch)) + flags = flags.with(Shard.Flag.MUST_WITNESS); + shards.add(lookup.createOrReuse(flags, range, nodes, electorate, pending)); + } + return shards; + } + + private static KeyspaceShard forRange(KeyspaceMetadata keyspace, List> ranges, Directory directory, Set readEndpoints, Set writeEndpoints) + { + // TCM doesn't create wrap around ranges + for (Range range : ranges) + Invariants.requireArgument(!range.isWrapAround() || range.right.equals(range.right.minValue()), + "wrap around range %s found", range); + + Sets.SetView readOnly = Sets.difference(readEndpoints, writeEndpoints); + Invariants.require(readOnly.isEmpty(), "Read only replicas detected: %s", readOnly); + + SortedArrayList nodes = new SortedArrayList<>(writeEndpoints.stream() + .map(directory::peerId) + .map(AccordTopology::tcmIdToAccord) + .sorted().toArray(Id[]::new)); + + Set pending = readEndpoints.equals(writeEndpoints) ? + Collections.emptySet() : + writeEndpoints.stream() + .filter(e -> !readEndpoints.contains(e)) + .map(directory::peerId) + .map(AccordTopology::tcmIdToAccord) + .collect(Collectors.toSet()); + + return new KeyspaceShard(keyspace, ranges, nodes, pending); + } + + public static List forKeyspace(KeyspaceMetadata keyspace, DataPlacements placements, Directory directory) + { + class Group + { + private final Set reads, writes; + + Group(Set reads, Set writes) + { + this.reads = reads; + this.writes = writes; + } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + Group group = (Group) o; + return reads.equals(group.reads) && writes.equals(group.writes); + } + + @Override + public int hashCode() + { + return Objects.hash(reads, writes); + } + } + ReplicationParams replication = keyspace.params.replication; + DataPlacement placement = placements.get(replication); + + List> ranges = placement.reads.ranges(); + List shards = new ArrayList<>(ranges.size()); + Map>> groupRanges = new LinkedHashMap<>(); + for (Range range : ranges) + { + VersionedEndpoints.ForRange reads = placement.reads.forRange(range); + VersionedEndpoints.ForRange writes = placement.writes.forRange(range); + groupRanges.computeIfAbsent(new Group(reads.endpoints(), writes.endpoints()), i -> new ArrayList<>()).add(range); + } + for (Map.Entry>> e : groupRanges.entrySet()) + { + Group group = e.getKey(); + shards.add(forRange(keyspace, e.getValue(), directory, group.reads, group.writes)); + } + return shards; + } + + public List nodes() + { + return nodes; + } + + public List> ranges() + { + return ranges; + } + } + + static TokenRange minRange(TableId table, Token token) + { + return TokenRange.create(TokenKey.min(table, token.getPartitioner()), new TokenKey(table, token)); + } + + static TokenRange maxRange(TableId table, Token token) + { + return TokenRange.create(new TokenKey(table, token), TokenKey.max(table, token.getPartitioner())); + } + + static TokenRange fullRange(TableId table, IPartitioner partitioner) + { + return TokenRange.create(TokenKey.min(table, partitioner), TokenKey.max(table, partitioner)); + } + + static TokenRange range(TableId table, Range range) + { + Token minToken = range.left.minValue(); + return TokenRange.create(range.left.equals(minToken) ? TokenKey.min(table, minToken.getPartitioner()) : new TokenKey(table, range.left), + range.right.equals(minToken) ? TokenKey.max(table, minToken.getPartitioner()) : new TokenKey(table, range.right)); + } + + public static accord.primitives.Ranges toAccordRanges(TableId tableId, Collection> ranges) + { + List> normalizedRanges = Range.normalize(ranges); + TokenRange[] tokenRanges = new TokenRange[normalizedRanges.size()]; + for (int i = 0; i < normalizedRanges.size(); i++) + tokenRanges[i] = range(tableId, normalizedRanges.get(i)); + return Ranges.of(tokenRanges); + } + + public static accord.primitives.Ranges toAccordRanges(String keyspace, Collection> ranges) + { + Keyspace ks = Keyspace.open(keyspace); + Ranges accordRanges = Ranges.EMPTY; + if (ks == null) + return accordRanges; + + for (TableMetadata tbm : ks.getMetadata().tables) + { + accordRanges = accordRanges.with(toAccordRanges(tbm.id, ranges)); + } + + return accordRanges; + } + + private static Map createDCMap(Directory directory) + { + ImmutableMap.Builder builder = ImmutableMap.builder(); + directory.knownDatacenters().forEach(dc -> { + Set dcEndpoints = directory.datacenterEndpoints(dc); + // nodes aren't added to the endpointsToDCMap until they've joined + if (dcEndpoints == null) + return; + dcEndpoints.forEach(ep -> { + NodeId tid = directory.peerId(ep); + Id aid = tcmIdToAccord(tid); + builder.put(aid, dc); + }); + }); + return builder.build(); + } + + public static Topology createAccordTopology(Epoch epoch, DistributedSchema schema, DataPlacements placements, + Directory directory, AccordFastPath accordFastPath, ShardLookup lookup, + AccordStaleReplicas staleReplicas) + { + List res = new ArrayList<>(); + Set unavailable = accordFastPath.unavailableIds(); + Map dcMap = createDCMap(directory); + + for (KeyspaceMetadata keyspace : schema.getKeyspaces()) + { + List tables = keyspace.tables.stream().filter(TableMetadata::requiresAccordSupport).collect(Collectors.toList()); + if (tables.isEmpty()) + continue; + List ksShards = KeyspaceShard.forKeyspace(keyspace, placements, directory); + tables.forEach(table -> ksShards.forEach(shard -> res.addAll(shard.createForTable(epoch, table, unavailable, dcMap, lookup)))); + } + + res.sort((a, b) -> a.range.compare(b.range)); + return new Topology(epoch.getEpoch(), SortedArrayList.copyUnsorted(staleReplicas.ids(), Id[]::new), res.toArray(new Shard[0])); + } + + public static Topology createAccordTopology(ClusterMetadata metadata, ShardLookup lookup) + { + return createAccordTopology(metadata.epoch, metadata.schema, metadata.placements, metadata.directory, metadata.accordFastPath, lookup, metadata.accordStaleReplicas); + } + + public static Topology createAccordTopology(ClusterMetadata metadata, Topology current) + { + return createAccordTopology(metadata, createShardLookup(current)); + } + + public static Topology createAccordTopology(ClusterMetadata metadata) + { + return createAccordTopology(metadata, (Topology) null); + } + + public static EndpointMapping directoryToMapping(long epoch, Directory directory) + { + EndpointMapping.Builder builder = EndpointMapping.builder(epoch); + for (NodeId id : directory.peerIds()) + builder.add(directory.endpoint(id), tcmIdToAccord(id)); + + // There are cases where nodes are removed from the cluster (host replacement, decom, etc.), but inflight events + // may still be happening; keep the ids around so pending events do not fail with a mapping error + for (Directory.RemovedNode removedNode : directory.removedNodes()) + builder.add(removedNode.endpoint, tcmIdToAccord(removedNode.id)); + return builder.build(); + } + + private static ShardLookup createShardLookup(Topology topology) + { + ShardLookup map = new ShardLookup(); + + if (topology == null) + return map; + + topology.forEach(shard -> map.put(shard.range, shard)); + return map; + } + private static boolean hasAccordSchemaChange(TableMetadata before, TableMetadata after) + { + return after.requiresAccordSupport() && (before == null || !before.requiresAccordSupport()); + } + + private static boolean hasAccordSchemaChange(TableMetadata created) + { + return hasAccordSchemaChange(null, created); + } + + private static boolean hasAccordSchemaChange(Diff.Altered diff) + { + return hasAccordSchemaChange(diff.before, diff.after); + } + + private static boolean hasAccordSchemaChange(Keyspaces.KeyspacesDiff keyspacesDiff) + { + for (KeyspaceMetadata.KeyspaceDiff keyspaceDiff : keyspacesDiff.altered) + { + if (Iterables.any(keyspaceDiff.tables.created, AccordTopology::hasAccordSchemaChange)) + return true; + + if (Iterables.any(keyspaceDiff.tables.altered, AccordTopology::hasAccordSchemaChange)) + return true; + } + + return false; + } + + /** + * If an accord related schema change occurs, we need to wait until accord has processed them + * before unblocking the change + */ + public static void awaitTopologyReadiness(Keyspaces.KeyspacesDiff keyspacesDiff, Epoch epoch) + { + if (!AccordService.isSetup()) + return; + + if (!hasAccordSchemaChange(keyspacesDiff)) + return; + + try + { + ClusterMetadataService.instance().fetchLogFromCMS(epoch); + IAccordService service = AccordService.instance(); + service.epochReady(epoch).get(service.agent().expireEpochWait(MILLISECONDS), MILLISECONDS); + } + catch (InterruptedException e) + { + throw new UncheckedInterruptedException(e); + } + catch (ExecutionException | TimeoutException e) + { + throw new RuntimeException(e); + } + } + +} diff --git a/src/java/org/apache/cassandra/service/accord/AccordVerbHandler.java b/src/java/org/apache/cassandra/service/accord/AccordVerbHandler.java new file mode 100644 index 000000000000..9562aea86b7e --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/AccordVerbHandler.java @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.service.accord; + +import java.io.IOException; +import java.util.concurrent.TimeUnit; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import accord.local.Node; +import accord.messages.Request; +import org.apache.cassandra.net.IVerbHandler; +import org.apache.cassandra.net.Message; +import org.apache.cassandra.utils.NoSpamLogger; + +public class AccordVerbHandler implements IVerbHandler +{ + private static final Logger logger = LoggerFactory.getLogger(AccordVerbHandler.class); + private static final NoSpamLogger.NoSpamLogStatement dropping = NoSpamLogger.getStatement(logger, "Dropping message {} from {}", 1L, TimeUnit.SECONDS); + + private final Node node; + private final AccordEndpointMapper endpointMapper; + + public AccordVerbHandler(Node node, AccordEndpointMapper endpointMapper) + { + this.node = node; + this.endpointMapper = endpointMapper; + } + + @Override + public void doVerb(Message message) throws IOException + { + if (!AccordService.instance().shouldAcceptMessages()) + { + dropping.debug(message.verb(), message.from()); + return; + } + + logger.trace("Receiving {} from {}", message.payload, message.from()); + + T request = message.payload; + + /* + * TODO (desired): messages are retained on heap until the node catches up to waitForEpoch, + * which can be problematic in absense of proper Accord<->Messaging backpressure + */ + Node.Id fromNodeId = endpointMapper.mappedId(message.from()); + long waitForEpoch = request.waitForEpoch(); + if (node.topology().hasAtLeastEpoch(waitForEpoch)) + request.process(node, fromNodeId, message.header); + else + { + node.withEpoch(waitForEpoch, (ignored, withEpochFailure) -> { + if (withEpochFailure != null) + throw new RuntimeException("Timed out waiting for epoch when processing message from " + fromNodeId + " to " + node + " message " + message, withEpochFailure); + request.process(node, fromNodeId, message.header); + }); + } + } +} diff --git a/src/java/org/apache/cassandra/service/accord/ClientRequestBookkeeping.java b/src/java/org/apache/cassandra/service/accord/ClientRequestBookkeeping.java new file mode 100644 index 000000000000..0d960362eff1 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/ClientRequestBookkeeping.java @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import java.util.function.Function; + +import com.codahale.metrics.Meter; +import org.apache.cassandra.metrics.AccordClientRequestMetrics; +import org.apache.cassandra.metrics.ClientRequestMetrics; + +import static org.apache.cassandra.service.accord.RequestBookkeeping.ThrowsExceptionType.READ; +import static org.apache.cassandra.service.accord.RequestBookkeeping.ThrowsExceptionType.WRITE; + +// TODO (expected): merge with AccordClientRequestMetrics instead +public class ClientRequestBookkeeping extends RequestBookkeeping +{ + final AccordClientRequestMetrics metrics; + + public ClientRequestBookkeeping(boolean isWrite, AccordClientRequestMetrics metrics) + { + super(isWrite ? WRITE : READ); + this.metrics = metrics; + } + + @Override + final void markTimeout() + { + mark(metrics -> metrics.timeouts); + } + + @Override + final void markPreempted() + { + metrics.preempted.mark(); + } + + final void markFailure() + { + mark(metrics -> metrics.failures); + } + + @Override + final void markRetryDifferentSystem() + { + metrics.retryDifferentSystem.mark(); + } + + @Override + final void markTopologyMismatch() + { + metrics.topologyMismatches.mark(); + } + + private void mark(Function get) + { + get.apply(metrics).mark(); + if (metrics.shared != null) + get.apply(metrics.shared).mark(); + } + + public final void markElapsedNanos(long nanos) + { + metrics.addNano(nanos); + if (metrics.shared != null) + metrics.shared.addNano(nanos); + } +} diff --git a/src/java/org/apache/cassandra/service/accord/CommandStoreTxnBlockedGraph.java b/src/java/org/apache/cassandra/service/accord/CommandStoreTxnBlockedGraph.java new file mode 100644 index 000000000000..40f2c3703efc --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/CommandStoreTxnBlockedGraph.java @@ -0,0 +1,127 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import java.util.ArrayList; +import java.util.LinkedHashMap; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; + +import accord.primitives.SaveStatus; +import accord.primitives.Timestamp; +import accord.primitives.TxnId; +import org.apache.cassandra.service.accord.api.TokenKey; + +public class CommandStoreTxnBlockedGraph +{ + public final int commandStoreId; + public final Map txns; + public final Map keys; + + public CommandStoreTxnBlockedGraph(Builder builder) + { + commandStoreId = builder.storeId; + txns = ImmutableMap.copyOf(builder.txns); + keys = ImmutableMap.copyOf(builder.keys); + } + + public static class TxnState + { + public final TxnId txnId; + public final Timestamp executeAt; + public final SaveStatus saveStatus; + public final List blockedBy; + public final Set blockedByKey; + + public TxnState(Builder.TxnBuilder builder) + { + txnId = builder.txnId; + executeAt = builder.executeAt; + saveStatus = builder.saveStatus; + blockedBy = ImmutableList.copyOf(builder.blockedBy); + blockedByKey = ImmutableSet.copyOf(builder.blockedByKey); + } + + public boolean isBlocked() + { + return !notBlocked(); + } + + public boolean notBlocked() + { + return blockedBy.isEmpty() && blockedByKey.isEmpty(); + } + } + + public static class Builder + { + final int storeId; + final Map txns = new LinkedHashMap<>(); + final Map keys = new LinkedHashMap<>(); + + public Builder(int storeId) + { + this.storeId = storeId; + } + + boolean knows(TxnId id) + { + return txns.containsKey(id); + } + + public CommandStoreTxnBlockedGraph build() + { + return new CommandStoreTxnBlockedGraph(this); + } + + public TxnBuilder txn(TxnId txnId, Timestamp executeAt, SaveStatus saveStatus) + { + return new TxnBuilder(txnId, executeAt, saveStatus); + } + + public class TxnBuilder + { + final TxnId txnId; + final Timestamp executeAt; + final SaveStatus saveStatus; + List blockedBy = new ArrayList<>(); + Set blockedByKey = new LinkedHashSet<>(); + + public TxnBuilder(TxnId txnId, Timestamp executeAt, SaveStatus saveStatus) + { + this.txnId = txnId; + this.executeAt = executeAt; + this.saveStatus = saveStatus; + } + + public TxnState build() + { + TxnState state = new TxnState(this); + txns.put(txnId, state); + return state; + } + } + } +} diff --git a/src/java/org/apache/cassandra/service/accord/CommandsForRanges.java b/src/java/org/apache/cassandra/service/accord/CommandsForRanges.java new file mode 100644 index 000000000000..470822906f7a --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/CommandsForRanges.java @@ -0,0 +1,261 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import java.util.Iterator; +import java.util.Map; +import java.util.NavigableMap; +import java.util.TreeMap; +import java.util.concurrent.atomic.AtomicReference; +import java.util.function.BiFunction; +import java.util.function.Consumer; +import java.util.function.UnaryOperator; +import javax.annotation.Nullable; + +import accord.local.Command; +import accord.local.CommandSummaries; +import accord.local.CommandSummaries.Summary; +import accord.local.KeyHistory; +import accord.local.RedundantBefore; +import accord.primitives.Ranges; +import accord.primitives.Routable; +import accord.primitives.Timestamp; +import accord.primitives.Txn.Kind.Kinds; +import accord.primitives.TxnId; +import accord.primitives.Unseekable; +import accord.primitives.Unseekables; +import accord.utils.Invariants; +import org.agrona.collections.ObjectHashSet; +import org.apache.cassandra.service.accord.api.TokenKey; + +import static accord.local.CommandSummaries.SummaryStatus.NOT_DIRECTLY_WITNESSED; + +// TODO (expected): move to accord-core, merge with existing logic there +public class CommandsForRanges extends TreeMap implements CommandSummaries.ByTxnIdSnapshot +{ + public CommandsForRanges(Map m) + { + super(m); + } + + @Override + public NavigableMap byTxnId() + { + return this; + } + + public static class Manager implements AccordCache.Listener + { + private final AccordCommandStore commandStore; + private final RangeSearcher searcher; + private AtomicReference> transitive = new AtomicReference<>(new TreeMap<>()); + private final ObjectHashSet cachedRangeTxns = new ObjectHashSet<>(); + + public Manager(AccordCommandStore commandStore) + { + this.commandStore = commandStore; + try (AccordCommandStore.ExclusiveCaches caches = commandStore.lockCaches()) + { + caches.commands().register(this); + } + this.searcher = commandStore.rangeSearcher(); + } + + @Override + public void onAdd(AccordCacheEntry state) + { + TxnId txnId = state.key(); + if (txnId.is(Routable.Domain.Range)) + cachedRangeTxns.add(txnId); + } + + @Override + public void onEvict(AccordCacheEntry state) + { + TxnId txnId = state.key(); + if (txnId.is(Routable.Domain.Range)) + cachedRangeTxns.remove(txnId); + } + + public CommandsForRanges.Loader loader(@Nullable TxnId primaryTxnId, KeyHistory keyHistory, Unseekables keysOrRanges) + { + RedundantBefore redundantBefore = commandStore.unsafeGetRedundantBefore(); + return Loader.loader(redundantBefore, primaryTxnId, keyHistory, keysOrRanges, this::newLoader); + } + + private Loader newLoader(Unseekables searchKeysOrRanges, RedundantBefore redundantBefore, Kinds testKind, TxnId minTxnId, Timestamp maxTxnId, @Nullable TxnId findAsDep) + { + return new Loader(this, searchKeysOrRanges, redundantBefore, testKind, minTxnId, maxTxnId, findAsDep); + } + + private void updateTransitive(UnaryOperator> update) + { + while (true) + { + NavigableMap prev = transitive.get(); + NavigableMap next = update.apply(prev); + if (next == null || prev == next) + return; + if (transitive.compareAndSet(prev, next)) + return; + } + } + + public void mergeTransitive(TxnId txnId, Ranges ranges, BiFunction remappingFunction) + { + updateTransitive(transitive -> { + NavigableMap next = new TreeMap<>(transitive); + next.merge(txnId, ranges, remappingFunction); + return next; + }); + } + + public void gcBefore(TxnId gcBefore, Ranges ranges) + { + updateTransitive(transitive -> { + NavigableMap next = null; + Iterator> iterator = transitive.headMap(gcBefore).entrySet().iterator(); + while (iterator.hasNext()) + { + Map.Entry e = iterator.next(); + Ranges newRanges = e.getValue().without(ranges); + if (!newRanges.isEmpty()) + { + if (next == null) + next = new TreeMap<>(); + next.put(e.getKey(), newRanges); + } + } + return next; + }); + } + } + + public static class Loader extends Summary.Loader + { + private final Manager manager; + + public Loader(Manager manager, Unseekables searchKeysOrRanges, RedundantBefore redundantBefore, Kinds testKinds, TxnId minTxnId, Timestamp maxTxnId, @Nullable TxnId findAsDep) + { + super(searchKeysOrRanges, redundantBefore, testKinds, minTxnId, maxTxnId, findAsDep); + this.manager = manager; + } + + public void intersects(Consumer forEach) + { + switch (searchKeysOrRanges.domain()) + { + case Range: + for (Unseekable range : searchKeysOrRanges) + manager.searcher.search(manager.commandStore.id(), (TokenRange) range, minTxnId, maxTxnId).consume(forEach); + break; + case Key: + for (Unseekable key : searchKeysOrRanges) + manager.searcher.search(manager.commandStore.id(), (TokenKey) key, minTxnId, maxTxnId).consume(forEach); + } + + NavigableMap transitive = manager.transitive.get(); + if (!transitive.isEmpty()) + { + for (Map.Entry e : transitive.tailMap(minTxnId, true).entrySet()) + { + if (e.getValue().intersects(searchKeysOrRanges)) + forEach.accept(e.getKey()); + } + } + } + + public void forEachInCache(Consumer

    forEach, AccordCommandStore.Caches caches) + { + for (TxnId txnId : manager.cachedRangeTxns) + { + AccordCacheEntry state = caches.commands().getUnsafe(txnId); + Summary summary = ifRelevant(state); + if (summary != null) + forEach.accept(summary); + } + } + + public Summary load(TxnId txnId) + { + if (findAsDep == null) + { + Command.Minimal cmd = manager.commandStore.loadMinimal(txnId); + if (cmd != null) + return ifRelevant(cmd); + } + else + { + Command cmd = manager.commandStore.loadCommand(txnId); + if (cmd != null) + return ifRelevant(cmd); + } + + Ranges ranges = manager.transitive.get().get(txnId); + if (ranges == null) + return null; + + ranges = ranges.intersecting(searchKeysOrRanges); + if (ranges.isEmpty()) + return null; + + return new Summary(txnId, txnId, NOT_DIRECTLY_WITNESSED, ranges, null, null); + } + + public Summary ifRelevant(AccordCacheEntry state) + { + if (state.key().domain() != Routable.Domain.Range) + return null; + + switch (state.status()) + { + default: throw new AssertionError("Unhandled status: " + state.status()); + case LOADING: + case WAITING_TO_LOAD: + case UNINITIALIZED: + return null; + + case LOADED: + case MODIFIED: + case SAVING: + case FAILED_TO_SAVE: + } + + TxnId txnId = state.key(); + if (!txnId.isVisible() || txnId.compareTo(minTxnId) < 0 || txnId.compareTo(maxTxnId) >= 0) + return null; + + Command command = state.getExclusive(); + if (command == null) + return null; + return ifRelevant(command); + } + + public Summary ifRelevant(Command cmd) + { + return ifRelevant(cmd.txnId(), cmd.executeAt(), cmd.saveStatus(), cmd.participants(), cmd.partialDeps()); + } + + public Summary ifRelevant(Command.Minimal cmd) + { + Invariants.require(findAsDep == null); + return ifRelevant(cmd.txnId, cmd.executeAt, cmd.saveStatus, cmd.participants, null); + } + } +} diff --git a/src/java/org/apache/cassandra/service/accord/EndpointMapping.java b/src/java/org/apache/cassandra/service/accord/EndpointMapping.java new file mode 100644 index 000000000000..78ec8c45892c --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/EndpointMapping.java @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import java.util.Map; + +import com.google.common.collect.BiMap; +import com.google.common.collect.HashBiMap; +import com.google.common.collect.ImmutableBiMap; +import com.google.common.collect.ImmutableMap; + +import accord.local.Node; +import accord.utils.Invariants; +import org.apache.cassandra.locator.InetAddressAndPort; + +class EndpointMapping implements AccordEndpointMapper +{ + public static final EndpointMapping EMPTY = new EndpointMapping(0, ImmutableBiMap.of(), ImmutableMap.of()); + private final long epoch; + private final ImmutableBiMap mapping; + private final ImmutableMap removedNodes; + + private EndpointMapping(long epoch, + ImmutableBiMap mapping, + ImmutableMap removedNodes) + { + this.epoch = epoch; + this.mapping = mapping; + this.removedNodes = removedNodes; + } + + long epoch() + { + return epoch; + } + + public boolean containsId(Node.Id id) + { + return mapping.containsKey(id); + } + + public Map removedNodes() + { + return removedNodes; + } + + @Override + public Node.Id mappedIdOrNull(InetAddressAndPort endpoint) + { + return mapping.inverse().get(endpoint); + } + + @Override + public InetAddressAndPort mappedEndpointOrNull(Node.Id id) + { + return mapping.get(id); + } + + static class Builder + { + private final long epoch; + private final BiMap mapping = HashBiMap.create(); + private final ImmutableMap.Builder removed = new ImmutableMap.Builder<>(); + + public Builder(long epoch) + { + this.epoch = epoch; + } + + public Builder add(InetAddressAndPort endpoint, Node.Id id) + { + Invariants.requireArgument(!mapping.containsKey(id), "Mapping already exists for Node.Id %s", id); + Invariants.requireArgument(!mapping.containsValue(endpoint), "Mapping already exists for %s", endpoint); + mapping.put(id, endpoint); + return this; + } + + public Builder removed(InetAddressAndPort endpoint, Node.Id id, long epoch) + { + Invariants.requireArgument(!mapping.containsKey(id), "Mapping already exists for Node.Id %s", id); + Invariants.requireArgument(!mapping.containsValue(endpoint), "Mapping already exists for %s", endpoint); + mapping.put(id, endpoint); + removed.put(id, epoch); + return this; + } + + public EndpointMapping build() + { + return new EndpointMapping(epoch, ImmutableBiMap.copyOf(mapping), removed.build()); + } + } + + static Builder builder(long epoch) + { + return new Builder(epoch); + } +} diff --git a/src/java/org/apache/cassandra/service/accord/FetchTopologies.java b/src/java/org/apache/cassandra/service/accord/FetchTopologies.java new file mode 100644 index 000000000000..f9af39d2e287 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/FetchTopologies.java @@ -0,0 +1,158 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import accord.topology.Topology; +import org.apache.cassandra.db.TypeSizes; +import org.apache.cassandra.exceptions.RequestFailure; +import org.apache.cassandra.io.UnversionedSerializer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.locator.InetAddressAndPort; +import org.apache.cassandra.net.IVerbHandler; +import org.apache.cassandra.net.MessageDelivery; +import org.apache.cassandra.net.MessagingService; +import org.apache.cassandra.net.MessagingUtils; +import org.apache.cassandra.net.Verb; +import org.apache.cassandra.repair.SharedContext; +import org.apache.cassandra.service.accord.serializers.TopologySerializers; +import org.apache.cassandra.utils.concurrent.Future; + +import static accord.topology.TopologyManager.TopologyRange; +import static org.apache.cassandra.service.accord.api.AccordWaitStrategies.retryFetchTopology; + +/** + * Fetch Accord topologies form remote peer. + */ +public class FetchTopologies +{ + private static final Logger logger = LoggerFactory.getLogger(FetchTopologies.class); + public String toString() + { + return "FetchTopology{" + + "epoch=" + minEpoch + + '}'; + } + + private final long minEpoch; + private final long maxEpoch; + + public static final UnversionedSerializer serializer = new UnversionedSerializer<>() + { + @Override + public void serialize(FetchTopologies t, DataOutputPlus out) throws IOException + { + out.writeUnsignedVInt(t.minEpoch); + out.writeUnsignedVInt(t.maxEpoch); + } + + @Override + public FetchTopologies deserialize(DataInputPlus in) throws IOException + { + return new FetchTopologies(in.readUnsignedVInt(), in.readUnsignedVInt()); + } + + @Override + public long serializedSize(FetchTopologies t) + { + return TypeSizes.sizeofUnsignedVInt(t.minEpoch) + + TypeSizes.sizeofUnsignedVInt(t.maxEpoch); + } + }; + + public FetchTopologies(long minEpoch, long maxEpoch) + { + this.minEpoch = minEpoch; + this.maxEpoch = maxEpoch; + } + + public static final UnversionedSerializer responseSerializer = new UnversionedSerializer<>() + { + @Override + public void serialize(TopologyRange t, DataOutputPlus out) throws IOException + { + out.writeUnsignedVInt(t.min); + out.writeUnsignedVInt(t.current); + out.writeUnsignedVInt(t.firstNonEmpty); + out.writeUnsignedVInt32(t.topologies.size()); + + for (Topology topology : t.topologies) + TopologySerializers.topology.serialize(topology, out); + } + + @Override + public TopologyRange deserialize(DataInputPlus in) throws IOException + { + long min = in.readUnsignedVInt(); + long current = in.readUnsignedVInt(); + long firstNonEmpty = in.readUnsignedVInt(); + int count = in.readUnsignedVInt32(); + List topologies = new ArrayList<>(count); + for (int i = 0; i < count; ++i) + topologies.add(TopologySerializers.topology.deserialize(in)); + return new TopologyRange(min, current, firstNonEmpty, topologies); + } + + @Override + public long serializedSize(TopologyRange t) + { + long size = TypeSizes.sizeofUnsignedVInt(t.min); + size += TypeSizes.sizeofUnsignedVInt(t.current); + size += TypeSizes.sizeofUnsignedVInt(t.firstNonEmpty); + size += TypeSizes.sizeofUnsignedVInt(t.topologies.size()); + for (Topology topology : t.topologies) + size += TopologySerializers.topology.serializedSize(topology); + return size; + } + }; + + public static final IVerbHandler handler = message -> { + if (!AccordService.isSetup()) + { + logger.debug("Accord unitialized, responding with failure to {}", message.payload); + MessagingService.instance().respondWithFailure(RequestFailure.UNKNOWN, message); + return; + } + + TopologyRange topologies = AccordService.instance().topology().between(message.payload.minEpoch, message.payload.maxEpoch); + logger.debug("Responding with {} failure to {}", topologies, message.payload); + MessagingService.instance().respond(topologies, message); + }; + + public static Future fetch(SharedContext context, Collection peers, long minEpoch, long maxEpoch) + { + FetchTopologies request = new FetchTopologies(minEpoch, maxEpoch); + return context.messaging().sendWithRetries(retryFetchTopology(), + context.optionalTasks()::schedule, + Verb.ACCORD_FETCH_TOPOLOGY_REQ, + request, + MessagingUtils.tryAliveFirst(context, peers, Verb.ACCORD_FETCH_TOPOLOGY_REQ.name()), + (attempt, from, failure) -> true, + MessageDelivery.RetryErrorMessage.EMPTY) + .map(m -> m.payload); + } +} \ No newline at end of file diff --git a/src/java/org/apache/cassandra/service/accord/IAccordService.java b/src/java/org/apache/cassandra/service/accord/IAccordService.java new file mode 100644 index 000000000000..2530a080fac1 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/IAccordService.java @@ -0,0 +1,502 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import java.util.Collection; +import java.util.Collections; +import java.util.EnumSet; +import java.util.List; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; +import java.util.function.BiConsumer; +import javax.annotation.Nonnull; +import javax.annotation.Nullable; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import accord.api.Agent; +import accord.local.durability.DurabilityService.SyncLocal; +import accord.local.durability.DurabilityService.SyncRemote; +import accord.local.CommandStores.RangesForEpoch; +import accord.local.DurableBefore; +import accord.local.Node; +import accord.local.Node.Id; +import accord.local.RedundantBefore; +import accord.messages.Reply; +import accord.messages.Request; +import accord.primitives.Keys; +import accord.primitives.Ranges; +import accord.primitives.Timestamp; +import accord.primitives.Txn; +import accord.primitives.TxnId; +import accord.topology.TopologyManager; +import accord.utils.Invariants; +import accord.utils.async.AsyncChain; +import org.agrona.collections.Int2ObjectHashMap; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.db.ConsistencyLevel; +import org.apache.cassandra.exceptions.RequestExecutionException; +import org.apache.cassandra.journal.Params; +import org.apache.cassandra.net.IVerbHandler; +import org.apache.cassandra.net.Message; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.service.accord.AccordSyncPropagator.Notification; +import org.apache.cassandra.service.accord.api.AccordScheduler; +import org.apache.cassandra.service.accord.api.AccordTopologySorter; +import org.apache.cassandra.service.accord.txn.TxnResult; +import org.apache.cassandra.tcm.Epoch; +import org.apache.cassandra.transport.Dispatcher.RequestTime; +import org.apache.cassandra.utils.concurrent.Future; +import org.apache.cassandra.utils.concurrent.ImmediateFuture; + +// Avoid default methods that aren't just providing wrappers around other methods +// so it will be a compile error if DelegatingAccordService doesn't implement them +public interface IAccordService +{ + Logger logger = LoggerFactory.getLogger(IAccordService.class); + + EnumSet SUPPORTED_COMMIT_CONSISTENCY_LEVELS = EnumSet.of(ConsistencyLevel.ANY, ConsistencyLevel.ONE, ConsistencyLevel.QUORUM, ConsistencyLevel.SERIAL, ConsistencyLevel.ALL); + EnumSet SUPPORTED_READ_CONSISTENCY_LEVELS = EnumSet.of(ConsistencyLevel.ONE, ConsistencyLevel.QUORUM, ConsistencyLevel.SERIAL, ConsistencyLevel.ALL); + + IVerbHandler requestHandler(); + IVerbHandler responseHandler(); + + AsyncChain sync(Object requestedBy, @Nullable Timestamp minBound, Ranges ranges, @Nullable Collection include, SyncLocal syncLocal, SyncRemote syncRemote); + AsyncChain sync(@Nullable Timestamp minBound, Keys keys, SyncLocal syncLocal, SyncRemote syncRemote); + AsyncChain maxConflict(Ranges ranges); + + @Nonnull IAccordResult coordinateAsync(long minEpoch, @Nonnull Txn txn, @Nonnull ConsistencyLevel consistencyLevel, RequestTime requestTime); + @Nonnull TxnResult coordinate(long minEpoch, @Nonnull Txn txn, @Nonnull ConsistencyLevel consistencyLevel, RequestTime requestTime) throws RequestExecutionException; + + interface IAccordResult + { + V success(); + Throwable fail(); + V awaitAndGet() throws RequestExecutionException; + IAccordResult addCallback(BiConsumer callback); + } + + long currentEpoch(); + + void setCacheSize(long kb); + void setWorkingSetSize(long kb); + + TopologyManager topology(); + + void startup(); + + void shutdownAndWait(long timeout, TimeUnit unit) throws InterruptedException, TimeoutException; + + AccordScheduler scheduler(); + + /** + * Return a future that will complete once the accord has completed it's local bootstrap process + * for any ranges gained in the given epoch + */ + Future epochReady(Epoch epoch); + + void receive(Message message); + + class AccordCompactionInfo + { + public final int commandStoreId; + public final RedundantBefore redundantBefore; + public final RangesForEpoch ranges; + public final TableId tableId; + + public AccordCompactionInfo(int commandStoreId, RedundantBefore redundantBefore, RangesForEpoch ranges, TableId tableId) + { + this.commandStoreId = commandStoreId; + this.redundantBefore = Invariants.nonNull(redundantBefore); + this.ranges = Invariants.nonNull(ranges); + this.tableId = Invariants.nonNull(tableId); + } + } + + class AccordCompactionInfos extends Int2ObjectHashMap + { + public final DurableBefore durableBefore; + public final long minEpoch; + + public AccordCompactionInfos(DurableBefore durableBefore, long minEpoch) + { + this.durableBefore = durableBefore; + this.minEpoch = minEpoch; + } + + public AccordCompactionInfos(DurableBefore durableBefore, long minEpoch, AccordCompactionInfos copy) + { + super(copy); + this.durableBefore = durableBefore; + this.minEpoch = minEpoch; + } + } + + /** + * Fetch the redundnant befores for every command store + */ + AccordCompactionInfos getCompactionInfo(); + + Agent agent(); + + Id nodeId(); + + List debugTxnBlockedGraph(TxnId txnId); + @Nullable + Long minEpoch(); + + void awaitDone(TableId id, long epoch); + + AccordConfigurationService configService(); + + Params journalConfiguration(); + + boolean shouldAcceptMessages(); + + Node node(); + + // Implementation for the NO_OP service that also has what used to be the default implementations + // that had to be overridden by the real AccordService anyways + class NoOpAccordService implements IAccordService + { + private static final Future BOOTSTRAP_SUCCESS = ImmediateFuture.success(null); + + @Override + public IVerbHandler requestHandler() + { + return null; + } + + @Override + public IVerbHandler responseHandler() + { + return null; + } + + @Override + public AsyncChain sync(Object requestedBy, @Nullable Timestamp onOrAfter, Ranges ranges, @Nullable Collection include, SyncLocal syncLocal, SyncRemote syncRemote) + { + throw new UnsupportedOperationException("No accord transaction should be executed when accord.enabled = false in cassandra.yaml"); + } + + @Override + public AsyncChain sync(@Nullable Timestamp onOrAfter, Keys keys, SyncLocal syncLocal, SyncRemote syncRemote) + { + throw new UnsupportedOperationException("No accord transaction should be executed when accord.enabled = false in cassandra.yaml"); + } + + @Override + public AsyncChain maxConflict(Ranges ranges) + { + throw new UnsupportedOperationException("No accord transaction should be executed when accord.enabled = false in cassandra.yaml"); + } + + @Override + public @Nonnull TxnResult coordinate(long minEpoch, @Nonnull Txn txn, @Nonnull ConsistencyLevel consistencyLevel, @Nonnull RequestTime requestTime) + { + throw new UnsupportedOperationException("No accord transaction should be executed when accord.enabled = false in cassandra.yaml"); + } + + @Override + public @Nonnull IAccordResult coordinateAsync(long minEpoch, @Nonnull Txn txn, @Nonnull ConsistencyLevel consistencyLevel, RequestTime requestTime) + { + throw new UnsupportedOperationException("No accord transaction should be executed when accord.enabled = false in cassandra.yaml"); + } + + @Override + public long currentEpoch() + { + throw new UnsupportedOperationException("Cannot return epoch when accord.enabled = false in cassandra.yaml"); + } + + @Override + public void setCacheSize(long kb) { } + + @Override + public void setWorkingSetSize(long kb) {} + + @Override + public TopologyManager topology() + { + throw new UnsupportedOperationException("Cannot return topology when accord.enabled = false in cassandra.yaml"); + } + + @Override + public void startup() + { + try + { + AccordTopologySorter.checkSnitchSupported(DatabaseDescriptor.getNodeProximity()); + } + catch (Throwable t) + { + logger.warn("Current snitch is not compatable with Accord, make sure to fix the snitch before enabling Accord; {}", t.toString()); + } + } + + @Override + public void shutdownAndWait(long timeout, TimeUnit unit) { } + + @Override + public AccordScheduler scheduler() + { + return null; + } + + @Override + public Future epochReady(Epoch epoch) + { + return BOOTSTRAP_SUCCESS; + } + + @Override + public void receive(Message message) {} + + @Override + public AccordCompactionInfos getCompactionInfo() + { + return new AccordCompactionInfos(DurableBefore.EMPTY, 0); + } + + @Override + public Agent agent() + { + return null; + } + + @Override + public Id nodeId() + { + throw new UnsupportedOperationException(); + } + + @Override + public List debugTxnBlockedGraph(TxnId txnId) + { + return Collections.emptyList(); + } + + @Nullable + @Override + public Long minEpoch() + { + return null; + } + + @Override + public void awaitDone(TableId id, long epoch) + { + + } + + @Override + public AccordConfigurationService configService() + { + return null; + } + + @Override + public Params journalConfiguration() + { + throw new UnsupportedOperationException("Cannot return configuration when accord.enabled = false in cassandra.yaml"); + } + + @Override + public boolean shouldAcceptMessages() + { + return true; + } + + @Override + public Node node() + { + return null; + } + } + + class DelegatingAccordService implements IAccordService + { + protected final IAccordService delegate; + + public DelegatingAccordService(IAccordService delegate) + { + this.delegate = delegate; + } + + @Override + public IVerbHandler requestHandler() + { + return delegate.requestHandler(); + } + + @Override + public IVerbHandler responseHandler() + { + return delegate.responseHandler(); + } + + @Override + public AsyncChain sync(Object requestedBy, @Nullable Timestamp onOrAfter, Ranges ranges, @Nullable Collection include, SyncLocal syncLocal, SyncRemote syncRemote) + { + return delegate.sync(requestedBy, onOrAfter, ranges, include, syncLocal, syncRemote); + } + + @Override + public AsyncChain sync(@Nullable Timestamp onOrAfter, Keys keys, SyncLocal syncLocal, SyncRemote syncRemote) + { + return delegate.sync(onOrAfter, keys, syncLocal, syncRemote); + } + + @Override + public AsyncChain maxConflict(Ranges ranges) + { + return delegate.maxConflict(ranges); + } + + @Override + public AccordConfigurationService configService() + { + return delegate.configService(); + } + + @Nonnull + @Override + public TxnResult coordinate(long minEpoch, @Nonnull Txn txn, @Nonnull ConsistencyLevel consistencyLevel, RequestTime requestTime) + { + return delegate.coordinate(minEpoch, txn, consistencyLevel, requestTime); + } + + @Nonnull + @Override + public IAccordResult coordinateAsync(long minEpoch, @Nonnull Txn txn, @Nonnull ConsistencyLevel consistencyLevel, RequestTime requestTime) + { + return delegate.coordinateAsync(minEpoch, txn, consistencyLevel, requestTime); + } + + @Override + public long currentEpoch() + { + return delegate.currentEpoch(); + } + + @Override + public void setCacheSize(long kb) + { + delegate.setCacheSize(kb); + } + + @Override + public void setWorkingSetSize(long kb) + { + delegate.setWorkingSetSize(kb); + } + + @Override + public TopologyManager topology() + { + return delegate.topology(); + } + + @Override + public void startup() + { + delegate.startup(); + } + + @Override + public void shutdownAndWait(long timeout, TimeUnit unit) throws InterruptedException, TimeoutException + { + delegate.shutdownAndWait(timeout, unit); + } + + @Override + public AccordScheduler scheduler() + { + return delegate.scheduler(); + } + + @Override + public Future epochReady(Epoch epoch) + { + return delegate.epochReady(epoch); + } + + @Override + public void receive(Message message) + { + delegate.receive(message); + } + + @Override + public AccordCompactionInfos getCompactionInfo() + { + return delegate.getCompactionInfo(); + } + + @Override + public Agent agent() + { + return delegate.agent(); + } + + @Override + public Id nodeId() + { + return delegate.nodeId(); + } + + @Override + public List debugTxnBlockedGraph(TxnId txnId) + { + return delegate.debugTxnBlockedGraph(txnId); + } + + @Nullable + @Override + public Long minEpoch() + { + return delegate.minEpoch(); + } + + @Override + public void awaitDone(TableId id, long epoch) + { + delegate.awaitDone(id, epoch); + } + + @Override + public Params journalConfiguration() + { + return delegate.journalConfiguration(); + } + + @Override + public boolean shouldAcceptMessages() + { + return delegate.shouldAcceptMessages(); + } + + @Override + public Node node() + { + return delegate.node(); + } + } +} diff --git a/src/java/org/apache/cassandra/service/accord/ImmutableAccordSafeState.java b/src/java/org/apache/cassandra/service/accord/ImmutableAccordSafeState.java new file mode 100644 index 000000000000..8d9eb7c5ffdd --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/ImmutableAccordSafeState.java @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import javax.annotation.Nullable; + +public abstract class ImmutableAccordSafeState implements AccordSafeState +{ + protected final K key; + @Nullable + protected V original; + protected boolean invalidated; + + protected ImmutableAccordSafeState(K key) + { + this.key = key; + } + + @Override + public K key() + { + return key; + } + + @Override + public V original() + { + checkNotInvalidated(); + return original; + } + + @Override + public V current() + { + checkNotInvalidated(); + return original; + } + + @Override + public void invalidate() + { + invalidated = true; + } + + @Override + public boolean invalidated() + { + return invalidated; + } + + @Override + public void set(V update) + { + throw new UnsupportedOperationException(); + } + + @Override + public void revert() + { + checkNotInvalidated(); + } +} diff --git a/src/java/org/apache/cassandra/service/accord/IndexRange.java b/src/java/org/apache/cassandra/service/accord/IndexRange.java new file mode 100644 index 000000000000..426dde5a5b6e --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/IndexRange.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import org.apache.cassandra.cache.IMeasurableMemory; +import org.apache.cassandra.utils.ByteArrayUtil; +import org.apache.cassandra.utils.ObjectSizes; + +public class IndexRange implements Comparable, IMeasurableMemory +{ + private static final long EMPTY_SIZE = ObjectSizes.measure(new IndexRange(null, null)); + + public final byte[] start, end; + + public IndexRange(byte[] start, byte[] end) + { + this.start = start; + this.end = end; + } + + @Override + public int compareTo(IndexRange other) + { + int rc = ByteArrayUtil.compareUnsigned(start, 0, other.start, 0, start.length); + if (rc == 0) + rc = ByteArrayUtil.compareUnsigned(end, 0, other.end, 0, end.length); + return rc; + } + + @Override + public long unsharedHeapSize() + { + return EMPTY_SIZE + ObjectSizes.sizeOfArray(start) * 2; + } + + public boolean intersects(byte[] start, byte[] end) + { + if (ByteArrayUtil.compareUnsigned(this.start, 0, end, 0, end.length) >= 0) + return false; + if (ByteArrayUtil.compareUnsigned(this.end, 0, start, 0, start.length) <= 0) + return false; + return true; + } +} diff --git a/src/java/org/apache/cassandra/service/accord/JournalKey.java b/src/java/org/apache/cassandra/service/accord/JournalKey.java new file mode 100644 index 000000000000..b299f91613e9 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/JournalKey.java @@ -0,0 +1,307 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Objects; +import java.util.zip.Checksum; + +import accord.local.Node.Id; +import accord.primitives.Timestamp; +import accord.primitives.TxnId; +import accord.utils.Invariants; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.journal.KeySupport; +import org.apache.cassandra.service.accord.journal.AccordTopologyUpdate; +import org.apache.cassandra.utils.ByteArrayUtil; + +import static org.apache.cassandra.db.TypeSizes.BYTE_SIZE; +import static org.apache.cassandra.db.TypeSizes.INT_SIZE; +import static org.apache.cassandra.db.TypeSizes.LONG_SIZE; +import static org.apache.cassandra.service.accord.AccordJournalValueSerializers.*; + +public final class JournalKey +{ + public final Type type; + public final TxnId id; + public final int commandStoreId; + + public JournalKey(TxnId id, Type type, int commandStoreId) + { + Invariants.requireArgument(commandStoreId >= 0); + Invariants.require((id.lsb & (0xffff & ~TxnId.IDENTITY_FLAGS)) == 0); + Invariants.nonNull(type); + Invariants.nonNull(id); + this.type = type; + this.id = id; + this.commandStoreId = commandStoreId; + } + + /** + * Support for (de)serializing and comparing record keys. + *

    + * Implements its own serialization and comparison for {@link Timestamp} to satisty + * {@link KeySupport} contract - puts hybrid logical clock ahead of epoch + * when ordering timestamps. This is done for more precise elimination of candidate + * segments by min/max record key in segment. + */ + public static final JournalKeySupport SUPPORT = new JournalKeySupport(); + + public static final class JournalKeySupport implements KeySupport + { + private static final int CS_ID_OFFSET = 0; + private static final int TYPE_OFFSET = INT_SIZE; + private static final int MSB_OFFSET = TYPE_OFFSET + BYTE_SIZE; + private static final int LSB_OFFSET = MSB_OFFSET + LONG_SIZE; + private static final int NODE_OFFSET = LSB_OFFSET + LONG_SIZE; + // TODO (required): revisit commandStoreId - this can go arbitrarily high so may want to use vint + public static final int TOTAL_SIZE = NODE_OFFSET + INT_SIZE; + + @Override + public int serializedSize(int userVersion) + { + return TOTAL_SIZE; + } + + @Override + public void serialize(JournalKey key, DataOutputPlus out, int userVersion) throws IOException + { + out.writeInt(key.commandStoreId); + out.writeByte(key.type.id); + serializeTxnId(key.id, out); + } + + @Override + public void serialize(JournalKey key, ByteBuffer out, int userVersion) throws IOException + { + out.putInt(key.commandStoreId); + out.put((byte) key.type.id); + serializeTxnId(key.id, out); + } + + private void serialize(JournalKey key, byte[] out) + { + ByteArrayUtil.putInt(out, CS_ID_OFFSET, key.commandStoreId); + out[TYPE_OFFSET] = (byte) (key.type.id & 0xFF); + serializeTxnId(key.id, out); + } + + @Override + public JournalKey deserialize(DataInputPlus in, int userVersion) throws IOException + { + int commandStoreId = in.readInt(); + int type = in.readByte(); + TxnId txnId = deserializeTxnId(in); + return new JournalKey(txnId, Type.fromId(type), commandStoreId); + } + + @Override + public JournalKey deserialize(ByteBuffer buffer, int position, int userVersion) + { + int commandStoreId = buffer.getInt(position + CS_ID_OFFSET); + int type = buffer.get(position + TYPE_OFFSET); + TxnId txnId = deserializeTxnId(buffer, position); + return new JournalKey(txnId, Type.fromId(type), commandStoreId); + } + + @Override + public JournalKey deserialize(ByteBuffer buffer, int userVersion) + { + int commandStoreId = buffer.getInt(); + int type = buffer.get(); + TxnId txnId = deserializeTxnId(buffer); + return new JournalKey(txnId, Type.fromId(type), commandStoreId); + } + + private void serializeTxnId(TxnId txnId, DataOutputPlus out) throws IOException + { + out.writeLong(txnId.msb); + out.writeLong(txnId.lsb); + out.writeInt(txnId.node.id); + } + + private TxnId deserializeTxnId(DataInputPlus in) throws IOException + { + long msb = in.readLong(); + long lsb = in.readLong(); + int nodeId = in.readInt(); + return TxnId.fromBits(msb, lsb, new Id(nodeId)); + } + + private TxnId deserializeTxnId(ByteBuffer in) + { + long msb = in.getLong(); + long lsb = in.getLong(); + int nodeId = in.getInt(); + return TxnId.fromBits(msb, lsb, new Id(nodeId)); + } + + private void serializeTxnId(TxnId txnId, byte[] out) + { + ByteArrayUtil.putLong(out, MSB_OFFSET, txnId.msb); + ByteArrayUtil.putLong(out, LSB_OFFSET, txnId.lsb); + ByteArrayUtil.putInt(out, NODE_OFFSET, txnId.node.id); + } + + private void serializeTxnId(TxnId txnId, ByteBuffer out) + { + out.putLong(txnId.msb); + out.putLong(txnId.lsb); + out.putInt(txnId.node.id); + } + + private TxnId deserializeTxnId(ByteBuffer buffer, int position) + { + long msb = buffer.getLong(position + MSB_OFFSET); + long lsb = buffer.getLong(position + LSB_OFFSET); + int nodeId = buffer.getInt(position + NODE_OFFSET); + return TxnId.fromBits(msb, lsb, new Id(nodeId)); + } + + @Override + public void updateChecksum(Checksum crc, JournalKey key, int userVersion) + { + byte[] out = AccordJournal.keyCRCBytes.get(); + serialize(key, out); + crc.update(out, 0, out.length); + } + + @Override + public int compareWithKeyAt(JournalKey k, ByteBuffer buffer, int position, int userVersion) + { + int commandStoreId = buffer.getInt(position + CS_ID_OFFSET); + int cmp = Integer.compare(k.commandStoreId, commandStoreId); + if (cmp != 0) return cmp; + + byte type = buffer.get(position + TYPE_OFFSET); + cmp = Byte.compare((byte) k.type.id, type); + if (cmp != 0) return cmp; + + cmp = compareWithTxnIdAt(k.id, buffer, position); + return cmp; + } + + private int compareWithTxnIdAt(TxnId txnId, ByteBuffer buffer, int position) + { + long msb = buffer.getLong(position + MSB_OFFSET); + int cmp = Timestamp.compareMsb(txnId.msb, msb); + if (cmp != 0) return cmp; + + long lsb = buffer.getLong(position + LSB_OFFSET); + cmp = Timestamp.compareLsb(txnId.lsb, lsb); + if (cmp != 0) return cmp; + + int nodeId = buffer.getInt(position + NODE_OFFSET); + cmp = Integer.compare(txnId.node.id, nodeId); + return cmp; + } + + @Override + public int compare(JournalKey k1, JournalKey k2) + { + int cmp = Integer.compare(k1.commandStoreId, k2.commandStoreId); + if (cmp == 0) cmp = Byte.compare((byte) k1.type.id, (byte) k2.type.id); + if (cmp == 0) cmp = k1.id.compareTo(k2.id); + return cmp; + } + }; + + @Override + public boolean equals(Object other) + { + if (this == other) + return true; + return (other instanceof JournalKey) && equals((JournalKey) other); + } + + boolean equals(JournalKey other) + { + return this.id.equals(other.id) && + this.type == other.type && + this.commandStoreId == other.commandStoreId; + } + + @Override + public int hashCode() + { + return Objects.hash(id, type, commandStoreId); + } + + public String toString() + { + return "Key{" + + "id=" + id + + ", type=" + type + + ", commandStoreId=" + commandStoreId + + '}'; + } + + public enum Type + { + COMMAND_DIFF (0, new CommandDiffSerializer()), + REDUNDANT_BEFORE (1, new RedundantBeforeSerializer()), + DURABLE_BEFORE (2, new DurableBeforeSerializer()), + SAFE_TO_READ (3, new SafeToReadSerializer()), + BOOTSTRAP_BEGAN_AT (4, new BootstrapBeganAtSerializer()), + RANGES_FOR_EPOCH (5, new RangesForEpochSerializer()), + TOPOLOGY_UPDATE (6, AccordTopologyUpdate.AccumulatingSerializer.defaultInstance), + ; + + public final int id; + public final FlyweightSerializer serializer; + + Type(int id, FlyweightSerializer serializer) + { + this.id = id; + this.serializer = serializer; + } + + private static final Type[] idToTypeMapping; + + static + { + Type[] types = values(); + + int maxId = -1; + for (Type type : types) + maxId = Math.max(type.id, maxId); + + Type[] idToType = new Type[maxId + 1]; + for (Type type : types) + { + if (null != idToType[type.id]) + throw new IllegalStateException("Duplicate Type id " + type.id); + idToType[type.id] = type; + } + idToTypeMapping = idToType; + } + + static Type fromId(int id) + { + if (id < 0 || id >= idToTypeMapping.length) + throw new IllegalArgumentException("Out or range Type id " + id); + Type type = idToTypeMapping[id]; + if (null == type) + throw new IllegalArgumentException("Unknown Type id " + id); + return type; + } + } +} diff --git a/src/java/org/apache/cassandra/service/accord/RangeSearcher.java b/src/java/org/apache/cassandra/service/accord/RangeSearcher.java new file mode 100644 index 000000000000..48861015fee7 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/RangeSearcher.java @@ -0,0 +1,128 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import java.util.function.Consumer; + +import accord.primitives.Timestamp; +import accord.primitives.TxnId; +import org.apache.cassandra.service.accord.api.TokenKey; +import org.apache.cassandra.utils.CloseableIterator; + +public interface RangeSearcher +{ + Result search(int commandStoreId, TokenRange range, TxnId minTxnId, Timestamp maxTxnId); + Result search(int commandStoreId, TokenKey key, TxnId minTxnId, Timestamp maxTxnId); + + static RangeSearcher extractRangeSearcher(Object o) + { + if (o instanceof RangeSearcher.Supplier) + return ((RangeSearcher.Supplier) o).rangeSearcher(); + return NoopRangeSearcher.instance; + } + + interface Supplier + { + RangeSearcher rangeSearcher(); + } + + interface Result + { + void consume(Consumer forEach); + CloseableIterator results(); + } + + class DefaultResult implements Result + { + private final TxnId minTxnId; + private final Timestamp maxTxnId; + private final CloseableIterator results; + private boolean consumed = false; + + public DefaultResult(TxnId minTxnId, Timestamp maxTxnId, CloseableIterator results) + { + this.minTxnId = minTxnId; + this.maxTxnId = maxTxnId; + this.results = results; + } + + @Override + public CloseableIterator results() + { + consume(); + return results; + } + + @Override + public void consume(Consumer forEach) + { + consume(); + try (results) + { + while (results.hasNext()) + { + TxnId next = results.next(); + if (next.compareTo(minTxnId) >= 0 && next.compareTo(maxTxnId) < 0) + forEach.accept(next); + } + } + } + + private void consume() + { + if (consumed) + throw new IllegalStateException("Attempted to consume an already consumed result"); + consumed = true; + } + } + + enum NoopResult implements Result + { + instance; + + @Override + public void consume(Consumer forEach) + { + + } + + @Override + public CloseableIterator results() + { + return CloseableIterator.empty(); + } + } + + enum NoopRangeSearcher implements RangeSearcher + { + instance; + + @Override + public Result search(int commandStoreId, TokenRange range, TxnId minTxnId, Timestamp maxTxnId) + { + return NoopResult.instance; + } + + @Override + public Result search(int commandStoreId, TokenKey key, TxnId minTxnId, Timestamp maxTxnId) + { + return NoopResult.instance; + } + } +} diff --git a/src/java/org/apache/cassandra/service/accord/RangeTreeRangeAccessor.java b/src/java/org/apache/cassandra/service/accord/RangeTreeRangeAccessor.java new file mode 100644 index 000000000000..80eabf4b5a10 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/RangeTreeRangeAccessor.java @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import accord.api.RoutingKey; +import accord.primitives.Range; +import org.apache.cassandra.utils.RangeTree; + +public enum RangeTreeRangeAccessor implements RangeTree.Accessor +{ + instance; + + @Override + public RoutingKey start(Range range) + { + return range.start(); + } + + @Override + public RoutingKey end(Range range) + { + return range.end(); + } + + @Override + public boolean contains(Range range, RoutingKey routingKey) + { + return range.contains(routingKey); + } + + @Override + public boolean contains(RoutingKey start, RoutingKey end, RoutingKey routingKey) + { + if (routingKey.compareTo(start) <= 0) + return false; + if (routingKey.compareTo(end) > 0) + return false; + return true; + } + + @Override + public boolean intersects(Range range, RoutingKey start, RoutingKey end) + { + if (range.start().compareTo(end) >= 0) return false; + if (range.end().compareTo(start) <= 0) return false; + return true; + } + + @Override + public boolean intersects(Range left, Range right) + { + return left.compareIntersecting(right) == 0; + } +} diff --git a/src/java/org/apache/cassandra/service/accord/RequestBookkeeping.java b/src/java/org/apache/cassandra/service/accord/RequestBookkeeping.java new file mode 100644 index 000000000000..a4876e15a5cb --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/RequestBookkeeping.java @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import java.util.Collections; +import javax.annotation.Nullable; + +import accord.primitives.Seekables; +import accord.primitives.TxnId; +import org.apache.cassandra.db.WriteType; +import org.apache.cassandra.exceptions.ReadFailureException; +import org.apache.cassandra.exceptions.ReadTimeoutException; +import org.apache.cassandra.exceptions.RequestFailureException; +import org.apache.cassandra.exceptions.RequestTimeoutException; +import org.apache.cassandra.exceptions.WriteFailureException; +import org.apache.cassandra.exceptions.WriteTimeoutException; +import org.apache.cassandra.service.accord.exceptions.AccordReadExhaustedException; +import org.apache.cassandra.service.accord.exceptions.AccordReadPreemptedException; +import org.apache.cassandra.service.accord.exceptions.AccordWriteExhaustedException; +import org.apache.cassandra.service.accord.exceptions.AccordWritePreemptedException; + +import static org.apache.cassandra.db.ConsistencyLevel.SERIAL; +import static org.apache.cassandra.service.accord.RequestBookkeeping.ThrowsExceptionType.WRITE; + +public abstract class RequestBookkeeping +{ + public enum ThrowsExceptionType { READ, WRITE } + + final ThrowsExceptionType exceptionType; + + protected RequestBookkeeping(ThrowsExceptionType exceptionType) + { + this.exceptionType = exceptionType; + } + + abstract void markFailure(); + abstract void markTimeout(); + abstract void markPreempted(); + abstract void markRetryDifferentSystem(); + abstract void markTopologyMismatch(); + abstract void markElapsedNanos(long nanos); + + public RequestTimeoutException newTimeout(@Nullable TxnId txnId, Seekables keysOrRanges) + { + markTimeout(); + return newTimeout(txnId, exceptionType, keysOrRanges); + } + + public RequestTimeoutException newPreempted(@Nullable TxnId txnId, Seekables keysOrRanges) + { + markPreempted(); + return newPreempted(txnId, exceptionType, keysOrRanges); + } + + public RequestTimeoutException newExhausted(@Nullable TxnId txnId, Seekables keysOrRanges) + { + markFailure(); + return newExhausted(txnId, exceptionType, keysOrRanges); + } + + public RequestFailureException newFailed(@Nullable TxnId txnId, Seekables keysOrRanges) + { + markFailure(); + return newFailed(txnId, exceptionType, keysOrRanges); + } + + private static RequestTimeoutException newTimeout(TxnId txnId, ThrowsExceptionType type, Seekables keysOrRanges) + { + // Client protocol doesn't handle null consistency level so use ANY + return type == WRITE ? new WriteTimeoutException(WriteType.CAS, SERIAL, 0, 0, describe(txnId, keysOrRanges)) + : new ReadTimeoutException(SERIAL, 0, 0, false, describe(txnId, keysOrRanges)); + } + + private static RequestTimeoutException newPreempted(TxnId txnId, ThrowsExceptionType type, Seekables keysOrRanges) + { + return type == WRITE ? new AccordWritePreemptedException(0, 0, describe(txnId, keysOrRanges)) + : new AccordReadPreemptedException(0, 0, false, describe(txnId, keysOrRanges)); + } + + private static RequestTimeoutException newExhausted(TxnId txnId, ThrowsExceptionType type, Seekables keysOrRanges) + { + return type == WRITE ? new AccordWriteExhaustedException(0, 0, describe(txnId, keysOrRanges)) + : new AccordReadExhaustedException(0, 0, false, describe(txnId, keysOrRanges)); + } + + private static RequestFailureException newFailed(TxnId txnId, ThrowsExceptionType type, Seekables keysOrRanges) + { + // TODO (required): plumb in per-peer failure responses from Accord, and describe the txnId/keys + return type == WRITE ? new WriteFailureException(SERIAL, 0, 0, WriteType.CAS, Collections.emptyMap()) + : new ReadFailureException(SERIAL, 0, 0, false, Collections.emptyMap()); + } + + private static String describe(TxnId txnId, Seekables keysOrRanges) + { + return txnId + ": " + keysOrRanges; + } +} diff --git a/src/java/org/apache/cassandra/service/accord/RouteInMemoryIndex.java b/src/java/org/apache/cassandra/service/accord/RouteInMemoryIndex.java new file mode 100644 index 000000000000..3745ea4cb66b --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/RouteInMemoryIndex.java @@ -0,0 +1,234 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; +import java.util.NavigableSet; +import java.util.TreeSet; +import java.util.function.Consumer; + +import accord.primitives.Route; +import accord.primitives.Timestamp; +import accord.primitives.TxnId; +import accord.primitives.Unseekable; +import accord.utils.Invariants; +import org.agrona.collections.Int2ObjectHashMap; +import org.agrona.collections.Long2ObjectHashMap; +import org.apache.cassandra.index.accord.OrderedRouteSerializer; +import org.apache.cassandra.index.accord.RouteJournalIndex; +import org.apache.cassandra.journal.StaticSegment; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.service.accord.api.TokenKey; +import org.apache.cassandra.utils.ByteArrayUtil; +import org.apache.cassandra.utils.CloseableIterator; +import org.apache.cassandra.utils.FastByteOperations; +import org.apache.cassandra.utils.RTree; +import org.apache.cassandra.utils.RangeTree; + +import static accord.primitives.Routable.Domain.Range; + +public class RouteInMemoryIndex implements RangeSearcher +{ + private final Long2ObjectHashMap segmentIndexes = new Long2ObjectHashMap<>(); + + public synchronized void update(long segment, int commandStoreId, TxnId id, Route route) + { + if (!RouteJournalIndex.allowed(id)) + return; + Invariants.nonNull(route, "route"); + segmentIndexes.computeIfAbsent(segment, SegmentIndex::new).add(commandStoreId, id, route); + } + + public synchronized void remove(Collection> oldSegments) + { + // As of this writing compact in accord journal takes StaticSegments, writes them to a SSTable, and pushes to a table; + // it then stops managing those segments... for this reason compactedSegments is normally empty and none of the + // oldSegments are expected to be tracked anymore, so this index should remove the reference (there is normal table 2i to pick up the job) + oldSegments.forEach(s -> segmentIndexes.remove(s.id())); + } + + @Override + public RangeSearcher.Result search(int commandStoreId, TokenRange range, TxnId minTxnId, Timestamp maxTxnId) + { + NavigableSet result = search(commandStoreId, range.table(), + OrderedRouteSerializer.serializeTokenOnly(range.start()), + OrderedRouteSerializer.serializeTokenOnly(range.end())); + return new DefaultResult(minTxnId, maxTxnId, CloseableIterator.wrap(result.iterator())); + } + + private synchronized NavigableSet search(int storeId, TableId tableId, byte[] start, byte[] end) + { + TreeSet matches = new TreeSet<>(); + segmentIndexes.values().forEach(s -> s.search(storeId, tableId, start, end, e -> matches.add(e.getValue()))); + return matches.isEmpty() ? Collections.emptyNavigableSet() : matches; + } + + @Override + public RangeSearcher.Result search(int commandStoreId, TokenKey key, TxnId minTxnId, Timestamp maxTxnId) + { + NavigableSet result = search(commandStoreId, key.table(), OrderedRouteSerializer.serializeTokenOnly(key)); + return new DefaultResult(minTxnId, maxTxnId, CloseableIterator.wrap(result.iterator())); + } + + private synchronized NavigableSet search(int storeId, TableId tableId, byte[] key) + { + TreeSet matches = new TreeSet<>(); + segmentIndexes.values().forEach(s -> s.search(storeId, tableId, key, e -> matches.add(e.getValue()))); + return matches.isEmpty() ? Collections.emptyNavigableSet() : matches; + } + + public synchronized void truncateForTesting() + { + segmentIndexes.clear(); + } + + private static class SegmentIndex + { + private final Int2ObjectHashMap storeIndexes = new Int2ObjectHashMap<>(); + + private SegmentIndex(long segment) + { + } + + public void add(int commandStoreId, TxnId id, Route route) + { + storeIndexes.computeIfAbsent(commandStoreId, i -> new StoreIndex()).add(id, route); + } + + public void search(int storeId, TableId tableId, byte[] start, byte[] end, Consumer> fn) + { + StoreIndex idx = storeIndexes.get(storeId); + if (idx == null) return; + idx.search(tableId, start, end, fn); + } + + public void search(int storeId, TableId tableId, byte[] key, Consumer> fn) + { + StoreIndex idx = storeIndexes.get(storeId); + if (idx == null) return; + idx.search(tableId, key, fn); + } + } + + private static class StoreIndex + { + private final Map tableIndex = new HashMap<>(); + + private StoreIndex() + { + } + + public void add(TxnId id, Route route) + { + for (Unseekable keyOrRange : route) + add(id, keyOrRange); + } + + private void add(TxnId id, Unseekable keyOrRange) + { + Invariants.require(keyOrRange.domain() == Range); + TokenRange ts = (TokenRange) keyOrRange; + TableId tableId = ts.table(); + tableIndex.computeIfAbsent(tableId, i -> new TableIndex()).add(id, ts); + } + + public void search(TableId tableId, byte[] start, byte[] end, Consumer> fn) + { + TableIndex index = tableIndex.get(tableId); + if (index == null) return; + index.search(start, end, fn); + } + + public void search(TableId tableId, byte[] key, Consumer> fn) + { + TableIndex index = tableIndex.get(tableId); + if (index == null) return; + index.search(key, fn); + } + } + + private static class TableIndex + { + private final RangeTree index = createRangeTree(); + + private TableIndex() + { + } + + public void add(TxnId id, TokenRange ts) + { + byte[] start = OrderedRouteSerializer.serializeTokenOnly(ts.start()); + byte[] end = OrderedRouteSerializer.serializeTokenOnly(ts.end()); + IndexRange range = new IndexRange(start, end); + + index.add(range, id); + } + + public void search(byte[] start, byte[] end, Consumer> fn) + { + index.search(new IndexRange(start, end), fn); + } + + public void search(byte[] key, Consumer> fn) + { + index.searchToken(key, fn); + } + } + + private static RangeTree createRangeTree() + { + return new RTree<>((a, b) -> ByteArrayUtil.compareUnsigned(a, 0, b, 0, a.length), new RangeTree.Accessor<>() + { + @Override + public byte[] start(IndexRange range) + { + return range.start; + } + + @Override + public byte[] end(IndexRange range) + { + return range.end; + } + + @Override + public boolean contains(byte[] start, byte[] end, byte[] bytes) + { + // bytes are ordered, start is exclusive, end is inclusive + return FastByteOperations.compareUnsigned(start, bytes) < 0 + && FastByteOperations.compareUnsigned(end, bytes) >= 0; + } + + @Override + public boolean intersects(IndexRange range, byte[] start, byte[] end) + { + return range.intersects(start, end); + } + + @Override + public boolean intersects(IndexRange left, IndexRange right) + { + return left.intersects(right.start, right.end); + } + }); + } +} diff --git a/src/java/org/apache/cassandra/service/accord/TimeOnlyRequestBookkeeping.java b/src/java/org/apache/cassandra/service/accord/TimeOnlyRequestBookkeeping.java new file mode 100644 index 000000000000..1a6e25870316 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/TimeOnlyRequestBookkeeping.java @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import org.apache.cassandra.metrics.LatencyMetrics; +import static org.apache.cassandra.service.accord.RequestBookkeeping.ThrowsExceptionType.READ; + +public abstract class TimeOnlyRequestBookkeeping extends RequestBookkeeping +{ + public static class LatencyRequestBookkeeping extends TimeOnlyRequestBookkeeping + { + final LatencyMetrics latency; + + public LatencyRequestBookkeeping(LatencyMetrics latency) + { + this.latency = latency; + } + + public final void markElapsedNanos(long nanos) + { + if (latency != null) + latency.addNano(nanos); + } + } + + TimeOnlyRequestBookkeeping() + { + super(READ); + } + + @Override + final void markTimeout() + { + } + + @Override + final void markPreempted() + { + } + + final void markFailure() + { + } + + @Override + final void markRetryDifferentSystem() + { + throw new UnsupportedOperationException(); + } + + @Override + void markTopologyMismatch() + { + throw new UnsupportedOperationException(); + } +} diff --git a/src/java/org/apache/cassandra/service/accord/TokenRange.java b/src/java/org/apache/cassandra/service/accord/TokenRange.java new file mode 100644 index 000000000000..3b546a74de0d --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/TokenRange.java @@ -0,0 +1,152 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import java.io.IOException; + +import com.google.common.annotations.VisibleForTesting; + +import accord.api.RoutingKey; +import accord.primitives.Range; +import accord.utils.Invariants; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.dht.IPartitioner; +import org.apache.cassandra.dht.Murmur3Partitioner; +import org.apache.cassandra.dht.Token; +import org.apache.cassandra.io.UnversionedSerializer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.service.accord.api.TokenKey; +import org.apache.cassandra.utils.ObjectSizes; + +public class TokenRange extends Range.EndInclusive +{ + public static final long EMPTY_SIZE = ObjectSizes.measure(new TokenRange(TokenKey.min(TableId.fromLong(0), Murmur3Partitioner.instance), TokenKey.max(TableId.fromLong(0), Murmur3Partitioner.instance))); + + // Don't make this public use create or createUnsafe + private TokenRange(TokenKey start, TokenKey end) + { + super(start, end); + } + + public static TokenRange create(TokenKey start, TokenKey end) + { + Invariants.requireArgument(start.table().equals(end.table()), + "Token ranges cannot cover more than one keyspace start:%s, end:%s", + start, end); + return new TokenRange(start, end); + } + + public static TokenRange createUnsafe(TokenKey start, TokenKey end) + { + return new TokenRange(start, end); + } + + public TableId table() + { + return start().table(); + } + + @Override + public TokenKey start() + { + return (TokenKey) super.start(); + } + + @Override + public TokenKey end() + { + return (TokenKey) super.end(); + } + + public long estimatedSizeOnHeap() + { + return EMPTY_SIZE + start().estimatedSizeOnHeap() + end().estimatedSizeOnHeap(); + } + + public boolean isFullRange() + { + return start().isMin() && end().isMax(); + } + + @VisibleForTesting + public Range withTable(TableId table) + { + return new TokenRange(start().withTable(table), end().withTable(table)); + } + + public static TokenRange fullRange(TableId table, IPartitioner partitioner) + { + return new TokenRange(TokenKey.min(table, partitioner), TokenKey.max(table, partitioner)); + } + + @Override + public TokenRange newRange(RoutingKey start, RoutingKey end) + { + return new TokenRange((TokenKey) start, (TokenKey) end); + } + + /* + * This behaves quite incorrectly with MinTokenKey because it loses the inclusivity of MinTokenKey in the conversion. + * It's not a problem for cluster metadata and topology, but it's quite wrong for queries that convert from Bounds to + * Range. + */ + public org.apache.cassandra.dht.Range toKeyspaceRange() + { + IPartitioner partitioner = DatabaseDescriptor.getPartitioner(); + TokenKey start = start(); + TokenKey end = end(); + Token left = start.isMin() ? partitioner.getMinimumToken() : start.token(); + Token right = end.isMax() ? partitioner.getMinimumToken() : end.token(); + return new org.apache.cassandra.dht.Range<>(left, right); + } + + public static final Serializer serializer = new Serializer(); + + public static final class Serializer implements UnversionedSerializer + { + @Override + public void serialize(TokenRange range, DataOutputPlus out) throws IOException + { + TokenKey.serializer.serialize(range.start(), out); + TokenKey.serializer.serialize(range.end(), out); + } + + public void skip(DataInputPlus in) throws IOException + { + TokenKey.serializer.skip(in); + TokenKey.serializer.skip(in); + } + + @Override + public TokenRange deserialize(DataInputPlus in) throws IOException + { + return TokenRange.create(TokenKey.serializer.deserialize(in), + TokenKey.serializer.deserialize(in)); + } + + @Override + public long serializedSize(TokenRange range) + { + return TokenKey.serializer.serializedSize(range.start()) + + TokenKey.serializer.serializedSize(range.end()); + } + }; +} diff --git a/src/java/org/apache/cassandra/service/accord/WatermarkCollector.java b/src/java/org/apache/cassandra/service/accord/WatermarkCollector.java new file mode 100644 index 000000000000..c4c3cb22cfef --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/WatermarkCollector.java @@ -0,0 +1,271 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import java.io.IOException; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Objects; +import java.util.Set; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.Iterators; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import accord.api.ConfigurationService; +import accord.local.Node; +import accord.primitives.Range; +import accord.primitives.Ranges; +import accord.topology.Topology; +import accord.utils.Invariants; +import accord.utils.async.AsyncResult; +import org.agrona.collections.Int2ObjectHashMap; +import org.apache.cassandra.db.TypeSizes; +import org.apache.cassandra.io.UnversionedSerializer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.locator.InetAddressAndPort; +import org.apache.cassandra.net.IVerbHandler; +import org.apache.cassandra.net.Message; +import org.apache.cassandra.net.MessageDelivery; +import org.apache.cassandra.net.MessagingService; +import org.apache.cassandra.net.NoPayload; +import org.apache.cassandra.net.Verb; +import org.apache.cassandra.repair.SharedContext; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.utils.FBUtilities; + +import static org.apache.cassandra.service.accord.api.AccordWaitStrategies.retryFetchWatermarks; + +/** + * Collects watermarks of closed and retired epochs per range, and synced epochs per node. + */ +public class WatermarkCollector implements ConfigurationService.Listener +{ + private static final Logger logger = LoggerFactory.getLogger(WatermarkCollector.class); + + final Map closed; + final Map retired; + final Int2ObjectHashMap synced; + + WatermarkCollector() + { + closed = new HashMap<>(); + retired = new HashMap<>(); + synced = new Int2ObjectHashMap<>(); + } + + @Override public AsyncResult onTopologyUpdate(Topology topology, boolean isLoad, boolean startSync) + { + return null; + } + + @Override + public void onRemoteSyncComplete(Node.Id node, long epoch) + { + synced.compute(node.id, (k, prev) -> prev == null ? epoch : Long.max(prev, epoch)); + } + + @Override + public void onEpochClosed(Ranges ranges, long epoch) + { + synchronized (this) + { + for (Range range : ranges) + this.closed.compute(range, (k, prev) -> prev == null ? epoch : Long.max(prev, epoch)); + } + } + + @Override + public void onEpochRetired(Ranges ranges, long epoch) + { + synchronized (this) + { + for (Range range : ranges) + this.retired.compute(range, (k, prev) -> prev == null ? epoch : Long.max(prev, epoch)); + } + } + + public final IVerbHandler handler = new IVerbHandler() + { + public void doVerb(Message message) throws IOException + { + Invariants.require(AccordService.started()); + Snapshot snapshot; + synchronized (WatermarkCollector.this) + { + snapshot = new Snapshot(new HashMap<>(closed), new HashMap<>(retired), new Int2ObjectHashMap<>(synced)); + } + MessagingService.instance().respond(snapshot, message); + } + }; + + @VisibleForTesting + static void fetchAndReportWatermarksAsync(AccordConfigurationService configService) + { + SharedContext context = SharedContext.Global.instance; + Set peers = new HashSet<>(); + peers.addAll(ClusterMetadata.current().directory.allAddresses()); + peers.remove(FBUtilities.getBroadcastAddressAndPort()); + + context.messaging().sendWithRetries(retryFetchWatermarks(), + context.optionalTasks()::schedule, + Verb.ACCORD_FETCH_WATERMARKS_REQ, + NoPayload.noPayload, + Iterators.cycle(peers), + MessageDelivery.RetryPredicate.ALWAYS_RETRY, + MessageDelivery.RetryErrorMessage.EMPTY) + .addCallback((m, fail) -> { + if (fail != null) + { + return; + } + Snapshot snapshot = m.payload; + long minEpoch = configService.minEpoch(); + for (Map.Entry e : snapshot.closed.entrySet()) + { + Ranges r = Ranges.of(e.getKey()); + for (long epoch = minEpoch; epoch <= e.getValue(); epoch++) + configService.receiveClosed(r, e.getValue()); + } + for (Map.Entry e : snapshot.retired.entrySet()) + { + Ranges r = Ranges.of(e.getKey()); + for (long epoch = minEpoch; epoch <= e.getValue(); epoch++) + configService.receiveRetired(r, e.getValue()); + } + for (Map.Entry e : snapshot.synced.entrySet()) + { + Node.Id node = new Node.Id(e.getKey()); + for (long epoch = minEpoch; epoch <= e.getValue(); epoch++) + configService.receiveRemoteSyncComplete(node, epoch); + } + }); + } + + public static class Snapshot + { + public final Map closed; + public final Map retired; + public final Int2ObjectHashMap synced; + + public Snapshot(Map closed, Map retired, Int2ObjectHashMap synced) + { + this.closed = closed; + this.retired = retired; + this.synced = synced; + } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + Snapshot snapshot = (Snapshot) o; + return closed.equals(snapshot.closed) && retired.equals(snapshot.retired) && synced.equals(snapshot.synced); + } + + @Override + public int hashCode() + { + return Objects.hash(closed, retired, synced); + } + } + + public static final UnversionedSerializer serializer = new UnversionedSerializer<>() + { + @Override + public void serialize(Snapshot t, DataOutputPlus out) throws IOException + { + out.writeUnsignedVInt32(t.closed.size()); + for (Map.Entry e : t.closed.entrySet()) + { + TokenRange.serializer.serialize((TokenRange) e.getKey(), out); + out.writeUnsignedVInt(e.getValue()); + } + out.writeUnsignedVInt32(t.retired.size()); + for (Map.Entry e : t.retired.entrySet()) + { + TokenRange.serializer.serialize((TokenRange) e.getKey(), out); + out.writeUnsignedVInt(e.getValue()); + } + out.writeUnsignedVInt32(t.synced.size()); + for (Map.Entry e : t.synced.entrySet()) + { + out.writeUnsignedVInt32(e.getKey()); + out.writeUnsignedVInt(e.getValue()); + } + } + + // TODO (desired): we do not have to deserialize to report these values + @Override + public Snapshot deserialize(DataInputPlus in) throws IOException + { + int closedSize = in.readUnsignedVInt32(); + Map closed = new HashMap<>(); + for (int i = 0; i < closedSize; i++) + { + closed.put(TokenRange.serializer.deserialize(in), + in.readUnsignedVInt()); + } + int retiredSize = in.readUnsignedVInt32(); + Map retired = new HashMap<>(); + for (int i = 0; i < retiredSize; i++) + { + retired.put(TokenRange.serializer.deserialize(in), + in.readUnsignedVInt()); + } + int syncedSize = in.readUnsignedVInt32(); + Int2ObjectHashMap synced = new Int2ObjectHashMap<>(); + for (int i = 0; i < syncedSize; i++) + { + synced.put(in.readUnsignedVInt32(), + (Long) in.readUnsignedVInt()); + } + return new Snapshot(closed, retired, synced); + } + + @Override + public long serializedSize(Snapshot t) + { + int size = 0; + size += TypeSizes.sizeofUnsignedVInt(t.closed.size()); + for (Map.Entry e : t.closed.entrySet()) + { + size += TokenRange.serializer.serializedSize((TokenRange) e.getKey()); + size += TypeSizes.sizeofUnsignedVInt(e.getValue()); + } + size += TypeSizes.sizeofUnsignedVInt(t.retired.size()); + for (Map.Entry e : t.retired.entrySet()) + { + size += TokenRange.serializer.serializedSize((TokenRange) e.getKey()); + size += TypeSizes.sizeofUnsignedVInt(e.getValue()); + } + size += TypeSizes.sizeofUnsignedVInt(t.synced.size()); + for (Map.Entry e : t.synced.entrySet()) + { + size += TypeSizes.sizeofUnsignedVInt(e.getKey()); + size += TypeSizes.sizeofUnsignedVInt(e.getValue()); + } + return size; + } + }; +} diff --git a/src/java/org/apache/cassandra/service/accord/api/AccordAgent.java b/src/java/org/apache/cassandra/service/accord/api/AccordAgent.java new file mode 100644 index 000000000000..949f7bbf8602 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/api/AccordAgent.java @@ -0,0 +1,356 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.api; + +import java.util.concurrent.CancellationException; +import java.util.concurrent.TimeUnit; +import java.util.function.BiConsumer; + +import javax.annotation.Nullable; + +import com.google.common.annotations.VisibleForTesting; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import accord.api.Agent; +import accord.api.EventListener; +import accord.api.ProgressLog.BlockedUntil; +import accord.api.Result; +import accord.api.RoutingKey; +import accord.local.Command; +import accord.local.Node; +import accord.local.SafeCommand; +import accord.local.SafeCommandStore; +import accord.local.TimeService; +import accord.messages.ReplyContext; +import accord.primitives.Keys; +import accord.primitives.Participants; +import accord.primitives.Ranges; +import accord.primitives.Routable; +import accord.primitives.Status; +import accord.primitives.Timestamp; +import accord.primitives.Txn; +import accord.primitives.Txn.Kind; +import accord.primitives.TxnId; +import accord.topology.Shard; +import accord.topology.Topologies; +import accord.utils.DefaultRandom; +import accord.utils.Invariants; +import accord.utils.RandomSource; +import accord.utils.SortedList; +import accord.utils.UnhandledEnum; +import accord.utils.async.AsyncChain; +import accord.utils.async.AsyncChains; +import accord.utils.async.AsyncResult; +import accord.utils.async.AsyncResults; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.exceptions.RequestTimeoutException; +import org.apache.cassandra.metrics.AccordMetrics; +import org.apache.cassandra.net.ResponseContext; +import org.apache.cassandra.service.accord.AccordService; +import org.apache.cassandra.service.accord.serializers.TableMetadatasAndKeys; +import org.apache.cassandra.service.accord.txn.TxnQuery; +import org.apache.cassandra.service.accord.txn.TxnRead; +import org.apache.cassandra.utils.Clock; +import org.apache.cassandra.utils.JVMStabilityInspector; + +import static accord.primitives.Routable.Domain.Key; +import static accord.utils.SortedArrays.SortedArrayList.ofSorted; +import static java.util.concurrent.TimeUnit.MICROSECONDS; +import static java.util.concurrent.TimeUnit.MILLISECONDS; +import static java.util.concurrent.TimeUnit.NANOSECONDS; +import static java.util.concurrent.TimeUnit.SECONDS; +import static org.apache.cassandra.config.DatabaseDescriptor.getAccordScheduleDurabilityTxnIdLag; +import static org.apache.cassandra.config.DatabaseDescriptor.getReadRpcTimeout; +import static org.apache.cassandra.service.accord.api.AccordWaitStrategies.expireEpochWait; +import static org.apache.cassandra.service.accord.api.AccordWaitStrategies.fetch; +import static org.apache.cassandra.service.accord.api.AccordWaitStrategies.recover; +import static org.apache.cassandra.service.accord.api.AccordWaitStrategies.retryBootstrap; +import static org.apache.cassandra.service.accord.api.AccordWaitStrategies.retryDurability; +import static org.apache.cassandra.service.accord.api.AccordWaitStrategies.retrySyncPoint; +import static org.apache.cassandra.service.accord.api.AccordWaitStrategies.slowTxnPreaccept; +import static org.apache.cassandra.service.accord.api.AccordWaitStrategies.slowRead; +import static org.apache.cassandra.utils.Clock.Global.nanoTime; + +// TODO (expected): merge with AccordService +public class AccordAgent implements Agent +{ + private static final Logger logger = LoggerFactory.getLogger(AccordAgent.class); + + private static BiConsumer onFailedBarrier; + public static void setOnFailedBarrier(BiConsumer newOnFailedBarrier) { onFailedBarrier = newOnFailedBarrier; } + public static void onFailedBarrier(TxnId txnId, Throwable cause) + { + BiConsumer invoke = onFailedBarrier; + if (invoke != null) invoke.accept(txnId, cause); + } + + + private final RandomSource random = new DefaultRandom(); + protected Node.Id self; + + public AccordAgent() + { + } + + public void setNodeId(Node.Id id) + { + self = id; + } + + @Override + public void onRecover(Node node, Result success, Throwable fail) + { + } + + @Override + public void onInconsistentTimestamp(Command command, Timestamp prev, Timestamp next) + { + // TODO (expected): better reporting + AssertionError error = new AssertionError("Inconsistent execution timestamp detected for txnId " + command.txnId() + ": " + prev + " != " + next); + onUncaughtException(error); + throw error; + } + + @Override + public void onFailedBootstrap(int attempts, String phase, Ranges ranges, Runnable retry, Throwable failure) + { + logger.error("Failed bootstrap at {} for {}", phase, ranges, failure); + AccordService.instance().scheduler().once(retry, retryBootstrap.computeWait(attempts, MICROSECONDS), MICROSECONDS); + } + + @Override + public void onStale(Timestamp staleSince, Ranges ranges) + { + logger.error("This replica has become stale for {} as of {}", ranges, staleSince); + } + + @Override + public void onUncaughtException(Throwable t) + { + if (t instanceof RequestTimeoutException || t instanceof CancellationException) + return; + logger.error("Uncaught accord exception", t); + JVMStabilityInspector.uncaughtException(Thread.currentThread(), t); + } + + @Override + public void onCaughtException(Throwable t, String context) + { + logger.warn(context, t); + JVMStabilityInspector.uncaughtException(Thread.currentThread(), t); + } + + @Override + public Topologies selectPreferred(Node.Id from, Topologies to) + { + SortedList nodes = to.nodes(); + int i = nodes.indexOf(from); + Node.Id node = i <= 0 ? nodes.get(nodes.size() - 1) : to.nodes().get(i - 1); + return to.select(ofSorted(node)); + } + + @Override + public boolean rejectPreAccept(TimeService time, TxnId txnId) + { + return time.now() - getReadRpcTimeout(MICROSECONDS) > txnId.hlc(); + } + + // TODO (expected): we probably want additional configuration here so we can prune on shorter time horizons when we have a lot of transactions on a single key + @Override + public long cfkHlcPruneDelta() + { + return SECONDS.toMicros(10L); + } + + @Override + public int cfkPruneInterval() + { + return 32; + } + + // TODO (expected): we probably want additional configuration here + @Override + public long maxConflictsHlcPruneDelta() + { + return SECONDS.toMicros(1); + } + + @Override + public long maxConflictsPruneInterval() + { + return 100; + } + + /** + * Create an empty transaction that Accord can use for its internal transactions. This is not suitable + * for tests since it skips validation done by regular transactions. + */ + @Override + public Txn emptySystemTxn(Kind kind, Routable.Domain domain) + { + return new Txn.InMemory(kind, (domain == Key ? Keys.EMPTY : Ranges.EMPTY), TxnRead.empty(domain), TxnQuery.UNSAFE_EMPTY, null, TableMetadatasAndKeys.none(domain)); + } + + @Override + public EventListener eventListener() + { + return AccordMetrics.Listener.instance; + } + + @Override + public long slowCoordinatorDelay(Node node, SafeCommandStore safeStore, TxnId txnId, TimeUnit units, int retryCount) + { + SafeCommand safeCommand = safeStore.unsafeGetNoCleanup(txnId); + Invariants.nonNull(safeCommand); + + Command command = safeCommand.current(); + Invariants.nonNull(command); + + Timestamp mostRecentAttempt = Timestamp.max(command.txnId(), command.promised()); + RoutingKey homeKey = command.route().homeKey(); + Shard shard = node.topology().forEpochIfKnown(homeKey, command.txnId().epoch()); + + // TODO (expected): make this a configurable calculation on normal request latencies (like ContentionStrategy) + long oneSecond = SECONDS.toMicros(1L); + long startTime = mostRecentAttempt.hlc() + recover(txnId).computeWait(retryCount, MICROSECONDS); + + startTime = nonClashingStartTime(startTime, shard == null ? null : shard.nodes, node.id(), oneSecond, random); + long nowMicros = MILLISECONDS.toMicros(Clock.Global.currentTimeMillis()); + long delayMicros = Math.max(1, startTime - nowMicros); + Invariants.require(delayMicros < TimeUnit.HOURS.toMicros(1L)); + return units.convert(delayMicros, MICROSECONDS); + } + + @VisibleForTesting + public static long nonClashingStartTime(long startTime, SortedList nodes, Node.Id id, long granularity, RandomSource random) + { + long perSecondStartTime; + if (nodes != null) + { + int position = nodes.indexOf(id); + perSecondStartTime = position * (SECONDS.toMicros(1) / nodes.size()); + } + else + { + // we've raced with topology update, this should be rare so just pick a random start time + perSecondStartTime = random.nextLong(granularity); + } + + // TODO (expected): make this a configurable calculation on normal request latencies (like ContentionStrategy) + long subSecondRemainder = startTime % granularity; + long newStartTime = startTime - subSecondRemainder + perSecondStartTime; + if (newStartTime < startTime) + newStartTime += granularity; + return newStartTime; + } + + @Override + public long slowReplicaDelay(Node node, SafeCommandStore safeStore, TxnId txnId, int attempt, BlockedUntil blockedUntil, TimeUnit units) + { + return fetch(txnId).computeWait(attempt, units); + } + + @Override + public long slowAwaitDelay(Node node, SafeCommandStore safeStore, TxnId txnId, int attempt, BlockedUntil retrying, TimeUnit units) + { + // TODO (desired): separate config? + return fetch(txnId).computeWait(attempt, units); + } + + @Override + public long retrySyncPointDelay(Node node, int attempt, TimeUnit units) + { + return retrySyncPoint.computeWait(attempt, units); + } + + @Override + public long retryDurabilityDelay(Node node, int attempt, TimeUnit units) + { + return retryDurability.computeWait(attempt, units); + } + + @Override + public long expireEpochWait(TimeUnit units) + { + return expireEpochWait.computeWait(1, units); + } + + @Override + public long expiresAt(ReplyContext replyContext, TimeUnit unit) + { + return unit.convert(((ResponseContext)replyContext).expiresAtNanos(), NANOSECONDS); + } + + @Override + public long selfSlowAt(TxnId txnId, Status.Phase phase, TimeUnit unit) + { + switch (phase) + { + default: throw new UnhandledEnum(phase); + case PreAccept: return unit.convert(slowTxnPreaccept.computeWaitUntil(1), NANOSECONDS); + case Execute: return unit.convert(slowRead.computeWaitUntil(1), NANOSECONDS); + } + } + + @Override + public long selfExpiresAt(TxnId txnId, Status.Phase phase, TimeUnit unit) + { + long delayNanos; + switch (txnId.kind()) + { + default: throw new UnhandledEnum(txnId.kind()); + case Write: + delayNanos = DatabaseDescriptor.getWriteRpcTimeout(NANOSECONDS); + break; + case EphemeralRead: + case Read: + delayNanos = DatabaseDescriptor.getReadRpcTimeout(NANOSECONDS); + break; + case ExclusiveSyncPoint: + delayNanos = DatabaseDescriptor.getAccordRangeSyncPointTimeoutNanos(); + } + return unit.convert(nanoTime() + delayNanos, NANOSECONDS); + } + + @Override + public AsyncChain awaitStaleId(Node node, TxnId staleId, boolean isRequested) + { + long waitMicros = (staleId.hlc() + getAccordScheduleDurabilityTxnIdLag(MICROSECONDS)) - node.now(); + if (waitMicros <= 0) + return AsyncChains.success(staleId); + + logger.debug("Waiting {} micros for {} to be stale", waitMicros, staleId); + AsyncResult.Settable result = AsyncResults.settable(); + node.scheduler().selfRecurring(() -> result.setSuccess(staleId), waitMicros, MICROSECONDS); + return result; + } + + @Override + public long minStaleHlc(Node node, boolean requested) + { + return node.now() - (100 + getAccordScheduleDurabilityTxnIdLag(MICROSECONDS)); + } + + @Override + public void onViolation(String message, Participants participants, @Nullable TxnId notWitnessed, @Nullable Timestamp notWitnessedExecuteAt, @Nullable TxnId by, @Nullable Timestamp byEexecuteAt) + { + logger.error(message); + } +} diff --git a/src/java/org/apache/cassandra/service/accord/api/AccordRoutableKey.java b/src/java/org/apache/cassandra/service/accord/api/AccordRoutableKey.java new file mode 100644 index 000000000000..d70cf2beea5d --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/api/AccordRoutableKey.java @@ -0,0 +1,143 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.api; + +import java.io.IOException; + +import javax.annotation.Nonnull; + +import accord.primitives.RoutableKey; +import org.apache.cassandra.dht.Token; +import org.apache.cassandra.io.UnversionedSerializer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.schema.TableId; + +public abstract class AccordRoutableKey implements RoutableKey +{ + public interface AccordKeySerializer extends UnversionedSerializer + { + void skip(DataInputPlus in) throws IOException; + } + + public interface AccordSearchableKeySerializer extends AccordKeySerializer + { + // -1 means dynamic + int fixedKeyLengthForPrefix(Object prefix); + int serializedSizeOfPrefix(Object prefix); + int serializedSizeWithoutPrefix(K key); + void serializePrefix(Object prefix, DataOutputPlus out) throws IOException; + void serializeWithoutPrefixOrLength(K key, DataOutputPlus out) throws IOException; + Object deserializePrefix(DataInputPlus in) throws IOException; + K deserializeWithPrefix(Object prefix, int length, DataInputPlus in) throws IOException; + } + + static final byte MAX_TABLE_SENTINEL = 0x48; + static final byte NORMAL_SENTINEL = 0x28; + static final byte BEFORE_TOKEN_SENTINEL = 0x24; + static final byte AFTER_TOKEN_SENTINEL = 0x2c; + static final byte MIN_TABLE_SENTINEL = 0x18; + static final int PREFIX_MASK = 0xF0; + static final int SUFFIX_MASK = 0x0F; + + final TableId table; // TODO (desired): use a long id (TrM) + + protected AccordRoutableKey(TableId table) + { + this.table = table; + } + + public TableId table() + { + return table; + } + + public abstract Token token(); + abstract byte sentinel(); + + @Override + public Object prefix() + { + return table; + } + + @Override + public String toString() + { + return prefix() + ":" + suffix(); + } + + @Override + public int hashCode() + { + return table.hashCode() * 31 + token().tokenHash(); + } + + @Override + public final int compareTo(RoutableKey that) + { + return compareTo((AccordRoutableKey) that); + } + + @Override + public int compareAsRoutingKey(@Nonnull RoutableKey that) + { + return compareAsRoutingKey((AccordRoutableKey) that); + } + + public final int compareAsRoutingKey(@Nonnull AccordRoutableKey that) + { + int c = this.table.compareTo(that.table); + if (c != 0) return c; + int thisSentinel = this.sentinel(), thatSentinel = that.sentinel(); + c = (thisSentinel & PREFIX_MASK) - (thatSentinel & PREFIX_MASK); + if (c == 0) c = this.token().compareTo(that.token()); + if (c == 0) c = (thisSentinel & SUFFIX_MASK) - (thatSentinel & SUFFIX_MASK); + return c; + } + + public final int compareTo(AccordRoutableKey that) + { + if (this == that) return 0; + int c = compareAsRoutingKey(that); + if (c != 0) + return c; + + boolean thisIsRoutingKey = this.getClass() == TokenKey.class; + boolean thatIsRoutingKey = that.getClass() == TokenKey.class; + if (thisIsRoutingKey | thatIsRoutingKey) + { + if (thisIsRoutingKey & thatIsRoutingKey) + return 0; + + return thisIsRoutingKey ? 1 : -1; + } + + return ((PartitionKey)this).key.compareBytesOnly(((PartitionKey)that).key); + } + + @Override + public final boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + AccordRoutableKey that = (AccordRoutableKey) o; + return compareTo(that) == 0; + } +} diff --git a/src/java/org/apache/cassandra/service/accord/api/AccordScheduler.java b/src/java/org/apache/cassandra/service/accord/api/AccordScheduler.java new file mode 100644 index 000000000000..b8e0b8755f36 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/api/AccordScheduler.java @@ -0,0 +1,114 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.api; + +import java.util.List; +import java.util.concurrent.RejectedExecutionException; +import java.util.concurrent.ScheduledFuture; +import java.util.concurrent.TimeUnit; + +import accord.api.Scheduler; +import org.apache.cassandra.concurrent.ExecutorFactory; +import org.apache.cassandra.concurrent.ScheduledExecutorPlus; +import org.apache.cassandra.concurrent.Shutdownable; + +public class AccordScheduler implements Scheduler, Shutdownable +{ + private final ScheduledExecutorPlus scheduledExecutor = ExecutorFactory.Global.executorFactory().scheduled(false, "AccordScheduled"); + + private static class ScheduledFutureWrapper implements Scheduled + { + private final ScheduledFuture future; + + public ScheduledFutureWrapper(ScheduledFuture future) + { + this.future = future; + } + + @Override + public void cancel() + { + future.cancel(false); + } + + @Override + public boolean isDone() + { + return future.isDone(); + } + } + + @Override + public Scheduled recurring(Runnable run, long delay, TimeUnit units) + { + ScheduledFuture future = scheduledExecutor.scheduleAtFixedRate(run, delay, delay, units); + return new ScheduledFutureWrapper(future); + } + + @Override + public Scheduled once(Runnable run, long delay, TimeUnit units) + { + ScheduledFuture future = scheduledExecutor.schedule(run, delay, units); + return new ScheduledFutureWrapper(future); + } + + @Override + public Scheduled selfRecurring(Runnable run, long delay, TimeUnit units) + { + ScheduledFuture future = scheduledExecutor.scheduleSelfRecurring(run, delay, units); + return new ScheduledFutureWrapper(future); + } + + @Override + public void now(Runnable task) + { + // called from the mutation stage configured by the verb + if (scheduledExecutor.isShutdown()) + throw new RejectedExecutionException("Scheduler has shut down."); + scheduledExecutor.submit(task); + } + + @Override + public boolean isTerminated() + { + return scheduledExecutor.isTerminated(); + } + + @Override + public void shutdown() + { + for (Runnable c : shutdownNow()) + { + if (c instanceof java.util.concurrent.Future) + ((java.util.concurrent.Future) c).cancel(false); + } + } + + @Override + public List shutdownNow() + { + return scheduledExecutor.shutdownNow(); + } + + @Override + public boolean awaitTermination(long timeout, TimeUnit units) throws InterruptedException + { + return scheduledExecutor.awaitTermination(timeout, units); + } +} diff --git a/src/java/org/apache/cassandra/service/accord/api/AccordTimeService.java b/src/java/org/apache/cassandra/service/accord/api/AccordTimeService.java new file mode 100644 index 000000000000..2f13983ea8df --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/api/AccordTimeService.java @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.api; + +import java.util.concurrent.TimeUnit; + +import accord.local.TimeService; +import org.apache.cassandra.utils.Clock; + +import static java.util.concurrent.TimeUnit.NANOSECONDS; +import static org.apache.cassandra.utils.Clock.Global.nanoTime; + +public class AccordTimeService implements TimeService +{ + @Override + public long now() + { + return nowMicros(); + } + + public static long nowMicros() + { + return TimeUnit.MILLISECONDS.toMicros(Clock.Global.currentTimeMillis()); + } + + @Override + public long elapsed(TimeUnit unit) + { + return unit.convert(nanoTime(), NANOSECONDS); + } +} diff --git a/src/java/org/apache/cassandra/service/accord/api/AccordTopologySorter.java b/src/java/org/apache/cassandra/service/accord/api/AccordTopologySorter.java new file mode 100644 index 000000000000..a9f004b3503c --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/api/AccordTopologySorter.java @@ -0,0 +1,159 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.api; + +import java.util.ArrayList; +import java.util.Comparator; +import java.util.Set; + +import accord.api.TopologySorter; +import accord.local.Node; +import accord.topology.ShardSelection; +import accord.topology.Topologies; +import accord.topology.Topology; +import accord.utils.SortedList; +import org.apache.cassandra.gms.ApplicationState; +import org.apache.cassandra.gms.EndpointState; +import org.apache.cassandra.gms.Gossiper; +import org.apache.cassandra.gms.VersionedValue; +import org.apache.cassandra.locator.DynamicEndpointSnitch; +import org.apache.cassandra.locator.Endpoint; +import org.apache.cassandra.locator.InetAddressAndPort; +import org.apache.cassandra.locator.NodeProximity; +import org.apache.cassandra.service.accord.AccordEndpointMapper; +import org.apache.cassandra.utils.FBUtilities; +import org.apache.cassandra.utils.Sortable; + +public class AccordTopologySorter implements TopologySorter +{ + public static class Supplier implements TopologySorter.Supplier + { + private final AccordEndpointMapper mapper; + private final NodeProximity proximity; + + public Supplier(AccordEndpointMapper mapper, NodeProximity proximity) + { + checkSnitchSupported(proximity); + this.mapper = mapper; + this.proximity = proximity; + } + + @Override + public TopologySorter get(Topology topologies) + { + return create(topologies.nodes()); + } + + @Override + public TopologySorter get(Topologies topologies) + { + return create(topologies.nodes()); + } + + private AccordTopologySorter create(SortedList nodes) + { + SortableEndpoints endpoints = SortableEndpoints.from(nodes, mapper); + Comparator comparator = proximity.endpointComparator(FBUtilities.getBroadcastAddressAndPort(), endpoints); + return new AccordTopologySorter(mapper, comparator); + } + } + + private final AccordEndpointMapper mapper; + private final Comparator comparator; + + private AccordTopologySorter(AccordEndpointMapper mapper, Comparator comparator) + { + this.mapper = mapper; + this.comparator = comparator; + } + + public static void checkSnitchSupported(NodeProximity proximity) + { + if (!proximity.supportCompareByEndpoint()) + { + if (proximity instanceof DynamicEndpointSnitch) + proximity = ((DynamicEndpointSnitch) proximity).delegate; + throw new IllegalArgumentException("Unsupported snitch " + proximity.getClass() + "; supportCompareByEndpoint returned false"); + } + } + + @Override + public int compare(Node.Id node1, Node.Id node2, ShardSelection shards) + { + return comparator.compare(() -> mapper.mappedEndpoint(node1), () -> mapper.mappedEndpoint(node2)); + } + + @Override + public boolean isFaulty(Node.Id node) + { + InetAddressAndPort ep = mapper.mappedEndpointOrNull(node); + if (ep == null) + return true; + + EndpointState epState = Gossiper.instance.getEndpointStateForEndpoint(ep); + if (epState == null) + return true; + + if (!epState.isAlive()) + return true; + + VersionedValue event = epState.getApplicationState(ApplicationState.SEVERITY); + if (event == null) + return false; + + return Double.parseDouble(event.value) == 0.0; + } + + private static class EndpointTuple implements Endpoint + { + final InetAddressAndPort endpoint; + + private EndpointTuple(InetAddressAndPort endpoint) + { + this.endpoint = endpoint; + } + + @Override + public InetAddressAndPort endpoint() + { + return endpoint; + } + } + + private static class SortableEndpoints extends ArrayList implements Sortable + { + public SortableEndpoints(int initialCapacity) + { + super(initialCapacity); + } + + public SortableEndpoints sorted(Comparator comparator) + { + sort(comparator); + return this; + } + + static SortableEndpoints from(Set nodes, AccordEndpointMapper mapper) + { + SortableEndpoints result = new SortableEndpoints(nodes.size()); + nodes.forEach(id -> result.add(new EndpointTuple(mapper.mappedEndpoint(id)))); + return result; + } + } +} diff --git a/src/java/org/apache/cassandra/service/accord/api/AccordWaitStrategies.java b/src/java/org/apache/cassandra/service/accord/api/AccordWaitStrategies.java new file mode 100644 index 000000000000..6fcacee42ac9 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/api/AccordWaitStrategies.java @@ -0,0 +1,191 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.api; + +import javax.annotation.Nullable; + +import accord.primitives.TxnId; +import org.apache.cassandra.config.AccordSpec; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.config.StringRetryStrategy; +import org.apache.cassandra.net.Verb; +import org.apache.cassandra.service.RetryStrategy; +import org.apache.cassandra.service.TimeoutStrategy; + +import static accord.primitives.Txn.Kind.ExclusiveSyncPoint; +import static org.apache.cassandra.metrics.ClientRequestsMetricsHolder.accordReadMetrics; +import static org.apache.cassandra.metrics.ClientRequestsMetricsHolder.accordWriteMetrics; +import static org.apache.cassandra.service.TimeoutStrategy.LatencySourceFactory.none; +import static org.apache.cassandra.service.TimeoutStrategy.LatencySourceFactory.of; +import static org.apache.cassandra.service.TimeoutStrategy.LatencySourceFactory.rw; + +public class AccordWaitStrategies +{ + static TimeoutStrategy slowTxnPreaccept, slowSyncPointPreaccept, slowRead; + static TimeoutStrategy expireTxn, expireSyncPoint, expireDurability, expireEpochWait; + static TimeoutStrategy fetchTxn, fetchSyncPoint; + static RetryStrategy recoverTxn, recoverSyncPoint, retrySyncPoint, retryDurability, retryBootstrap; + static RetryStrategy retryFetchMinEpoch, retryFetchTopology; + + public static @Nullable TimeoutStrategy slowRead(@Nullable TxnId txnId) + { + if (txnId != null && txnId.isSyncPoint()) + return null; + return slowRead; + } + + public static TimeoutStrategy expire(@Nullable TxnId txnId, Verb verb) + { + if (txnId == null || !txnId.isSyncPoint()) + return expireTxn; + return verb == Verb.ACCORD_WAIT_UNTIL_APPLIED_REQ ? expireDurability : expireSyncPoint; + } + + public static TimeoutStrategy fetch(@Nullable TxnId txnId) + { + if (txnId == null || !txnId.isSyncPoint()) + return fetchTxn; + return fetchSyncPoint; + } + + public static TimeoutStrategy slowPreaccept(TxnId txnId) + { + if (txnId.isSyncPoint()) + return expireSyncPoint; + return slowTxnPreaccept; + } + + public static RetryStrategy retryFetchWatermarks() + { + return retryFetchMinEpoch; + } + + public static RetryStrategy retryFetchTopology() + { + return retryFetchTopology; + } + + static + { + AccordSpec config = DatabaseDescriptor.getAccord(); + setSlowRead(config.slow_read); + setSlowTxnPreaccept(config.slow_txn_preaccept); + setSlowSyncPointPreaccept(config.slow_syncpoint_preaccept); + setExpireTxn(config.expire_txn); + setExpireSyncPoint(config.expire_syncpoint); + setExpireDurability(config.expire_durability); + setExpireEpochWait(config.expire_epoch_wait); + setFetchTxn(config.fetch_txn); + setFetchSyncPoint(config.fetch_syncpoint); + setRecoverTxn(config.recover_txn); + setRecoverSyncPoint(config.recover_syncpoint); + setRetrySyncPoint(config.retry_syncpoint); + setRetryDurability(config.retry_durability); + setRetryBootstrap(config.retry_bootstrap); + setRetryFetchMinEpoch(config.retry_fetch_min_epoch); + setRetryFetchTopology(config.retry_fetch_topology); + } + + public static void setSlowRead(String spec) + { + slowRead = TimeoutStrategy.parse(spec, of(accordReadMetrics)); + } + + public static void setSlowTxnPreaccept(String spec) + { + slowTxnPreaccept = TimeoutStrategy.parse(spec, rw(accordReadMetrics, accordWriteMetrics)); + } + + public static void setSlowSyncPointPreaccept(String spec) + { + slowSyncPointPreaccept = TimeoutStrategy.parse(spec, none()); + } + + public static void setExpireTxn(String spec) + { + expireTxn = TimeoutStrategy.parse(spec, rw(accordReadMetrics, accordWriteMetrics)); + } + + public static void setExpireSyncPoint(String spec) + { + expireSyncPoint = TimeoutStrategy.parse(spec, none()); + } + + public static void setExpireDurability(String spec) + { + expireDurability = TimeoutStrategy.parse(spec, none()); + } + + public static void setExpireEpochWait(String spec) + { + expireEpochWait = TimeoutStrategy.parse(spec, none()); + } + + public static void setFetchTxn(String spec) + { + fetchTxn = TimeoutStrategy.parse(spec, rw(accordReadMetrics, accordWriteMetrics)); + } + + public static void setFetchSyncPoint(String spec) + { + fetchSyncPoint = TimeoutStrategy.parse(spec, none()); + } + + public static void setRecoverTxn(String spec) + { + recoverTxn = RetryStrategy.parse(spec, rw(accordReadMetrics, accordWriteMetrics)); + } + + public static void setRecoverSyncPoint(StringRetryStrategy spec) + { + recoverSyncPoint = spec.retry(); + } + + public static void setRetrySyncPoint(StringRetryStrategy spec) + { + retrySyncPoint = spec.retry(); + } + + public static void setRetryDurability(StringRetryStrategy spec) + { + retryDurability = spec.retry(); + } + + public static void setRetryBootstrap(StringRetryStrategy spec) + { + retryBootstrap = spec.retry(); + } + + public static void setRetryFetchMinEpoch(StringRetryStrategy spec) + { + retryFetchMinEpoch = spec.retry(); + } + + public static void setRetryFetchTopology(StringRetryStrategy spec) + { + retryFetchTopology = spec.retry(); + } + + static RetryStrategy recover(TxnId txnId) + { + if (txnId.is(ExclusiveSyncPoint)) + return recoverSyncPoint; + return recoverTxn; + } +} diff --git a/src/java/org/apache/cassandra/service/accord/api/CompositeTopologySorter.java b/src/java/org/apache/cassandra/service/accord/api/CompositeTopologySorter.java new file mode 100644 index 000000000000..597e4aad8667 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/api/CompositeTopologySorter.java @@ -0,0 +1,95 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.api; + +import accord.api.TopologySorter; +import accord.local.Node; +import accord.topology.ShardSelection; +import accord.topology.Topologies; +import accord.topology.Topology; + +public class CompositeTopologySorter implements TopologySorter +{ + public static class Supplier implements TopologySorter.Supplier + { + private final TopologySorter.Supplier[] delegates; + + private Supplier(TopologySorter.Supplier[] delegates) + { + this.delegates = delegates; + } + + @Override + public TopologySorter get(Topology topologies) + { + TopologySorter[] sorters = new TopologySorter[delegates.length]; + for (int i = 0; i < sorters.length; i++) + sorters[i] = delegates[i].get(topologies); + return new CompositeTopologySorter(sorters); + } + + @Override + public TopologySorter get(Topologies topologies) + { + TopologySorter[] sorters = new TopologySorter[delegates.length]; + for (int i = 0; i < sorters.length; i++) + sorters[i] = delegates[i].get(topologies); + return new CompositeTopologySorter(sorters); + } + } + + private final TopologySorter[] delegates; + + private CompositeTopologySorter(TopologySorter[] delegates) + { + this.delegates = delegates; + } + + public static TopologySorter.Supplier create(TopologySorter.Supplier... delegates) + { + switch (delegates.length) + { + case 0: throw new IllegalArgumentException("Can not create an empty sorter"); + case 1: return delegates[0]; + default: return new CompositeTopologySorter.Supplier(delegates); + } + } + + @Override + public int compare(Node.Id node1, Node.Id node2, ShardSelection shards) + { + for (int i = 0; i < delegates.length; i++) + { + int rc = delegates[i].compare(node1, node2, shards); + if (rc != 0) return rc; + } + return 0; + } + + @Override + public boolean isFaulty(Node.Id node) + { + for (int i = 0; i < delegates.length; i++) + { + if (delegates[i].isFaulty(node)) + return true; + } + return false; + } +} diff --git a/src/java/org/apache/cassandra/service/accord/api/PartitionKey.java b/src/java/org/apache/cassandra/service/accord/api/PartitionKey.java new file mode 100644 index 000000000000..c98ff3e29ab5 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/api/PartitionKey.java @@ -0,0 +1,173 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.api; + +import java.io.IOException; +import java.nio.ByteBuffer; + +import accord.api.Key; +import accord.utils.Invariants; +import org.apache.cassandra.db.DecoratedKey; +import org.apache.cassandra.db.SinglePartitionReadCommand; +import org.apache.cassandra.db.marshal.ByteBufferAccessor; +import org.apache.cassandra.db.marshal.ValueAccessor; +import org.apache.cassandra.db.partitions.Partition; +import org.apache.cassandra.db.partitions.PartitionUpdate; +import org.apache.cassandra.dht.Token; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.ObjectSizes; +import org.apache.cassandra.utils.vint.VIntCoding; + +import static org.apache.cassandra.config.DatabaseDescriptor.getPartitioner; + +// final in part because we refer to its class directly in AccordRoutableKey.compareTo +public final class PartitionKey extends AccordRoutableKey implements Key +{ + private static final long EMPTY_SIZE; + + static + { + DecoratedKey key = getPartitioner().decorateKey(ByteBufferUtil.EMPTY_BYTE_BUFFER); + EMPTY_SIZE = ObjectSizes.measureDeep(new PartitionKey(null, key)); + } + + final DecoratedKey key; + + public PartitionKey(TableId tableId, DecoratedKey key) + { + super(tableId); + this.key = key; + } + + public static PartitionKey of(Key key) + { + return (PartitionKey) key; + } + + public static PartitionKey of(PartitionUpdate update) + { + return new PartitionKey(update.metadata().id, update.partitionKey()); + } + + public static PartitionKey of(Partition partition) + { + return new PartitionKey(partition.metadata().id, partition.partitionKey()); + } + + public static PartitionKey of(SinglePartitionReadCommand command) + { + return new PartitionKey(command.metadata().id, command.partitionKey()); + } + + @Override + public Token token() + { + return partitionKey().getToken(); + } + + public DecoratedKey partitionKey() + { + return key; + } + + @Override + public org.apache.cassandra.service.accord.api.TokenKey toUnseekable() + { + return new org.apache.cassandra.service.accord.api.TokenKey(table, token()); + } + + @Override + byte sentinel() + { + return org.apache.cassandra.service.accord.api.TokenKey.NORMAL_SENTINEL; + } + + public long estimatedSizeOnHeap() + { + return EMPTY_SIZE + ByteBufferAccessor.instance.size(partitionKey().getKey()); + } + + @Override + public String suffix() + { + return partitionKey().toString(); + } + + public static final Serializer serializer = new Serializer(); + public static class Serializer implements AccordKeySerializer + { + private Serializer() {} + + @Override + public void serialize(PartitionKey key, DataOutputPlus out) throws IOException + { + key.table().serializeCompact(out); + ByteBufferUtil.writeWithVIntLength(key.partitionKey().getKey(), out); + } + + public int serialize(PartitionKey key, V dst, ValueAccessor accessor, int offset) + { + int position = offset; + position += key.table().serializeCompact(dst, accessor, position); + ByteBuffer bytes = key.partitionKey().getKey(); + Invariants.require(key.partitionKey().getPartitioner() == getPartitioner()); + int numBytes = bytes.remaining(); + position += accessor.putUnsignedVInt32(dst, position, numBytes); + position += accessor.copyByteBufferTo(bytes, 0, dst, position, numBytes); + return position - offset; + + } + + @Override + public void skip(DataInputPlus in) throws IOException + { + TableId.skipCompact(in); + ByteBufferUtil.skipWithVIntLength(in); + } + + @Override + public PartitionKey deserialize(DataInputPlus in) throws IOException + { + TableId tableId = TableId.deserializeCompact(in); + DecoratedKey key = getPartitioner().decorateKey(ByteBufferUtil.readWithVIntLength(in)); + return new PartitionKey(tableId, key); + } + + public PartitionKey deserialize(V src, ValueAccessor accessor, int offset) throws IOException + { + TableId tableId = TableId.deserializeCompact(src, accessor, offset); + offset += tableId.serializedCompactSize(); + int numBytes = accessor.getUnsignedVInt32(src, offset); + offset += VIntCoding.readLengthOfVInt(src, accessor, offset); + ByteBuffer bytes = ByteBuffer.allocate(numBytes); + accessor.copyTo(src, offset, bytes, ByteBufferAccessor.instance, 0, numBytes); + DecoratedKey key = getPartitioner().decorateKey(bytes); + return new PartitionKey(tableId, key); + } + + @Override + public long serializedSize(PartitionKey key) + { + return key.table().serializedCompactSize() + ByteBufferUtil.serializedSizeWithVIntLength(key.partitionKey().getKey()); + } + } +} diff --git a/src/java/org/apache/cassandra/service/accord/api/TokenKey.java b/src/java/org/apache/cassandra/service/accord/api/TokenKey.java new file mode 100644 index 000000000000..7511cb70f14b --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/api/TokenKey.java @@ -0,0 +1,616 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.api; + +import java.io.IOException; +import java.math.BigInteger; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.TreeMap; + +import com.google.common.annotations.VisibleForTesting; + +import accord.api.RoutingKey; +import accord.local.ShardDistributor; +import accord.primitives.Range; +import accord.primitives.RangeFactory; +import accord.primitives.Ranges; +import accord.utils.Invariants; +import accord.utils.VIntCoding; +import org.apache.cassandra.db.marshal.ByteBufferAccessor; +import org.apache.cassandra.db.marshal.ValueAccessor; +import org.apache.cassandra.dht.IPartitioner; +import org.apache.cassandra.dht.Token; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.service.accord.TokenRange; +import org.apache.cassandra.utils.ObjectSizes; + +import static org.apache.cassandra.config.DatabaseDescriptor.getPartitioner; + +public final class TokenKey extends AccordRoutableKey implements RoutingKey, RangeFactory +{ + private static final long EMPTY_SIZE = ObjectSizes.measure(new TokenKey(null, null)); + + @Override + public Range asRange() + { + return TokenRange.create(before(), this); + } + + // we use the first 2 bits as a prefix, and the last 6 bits as a postfix comparison + final byte sentinel; + final Token token; + private TokenKey(TableId tableId, byte sentinel, Token token) + { + super(tableId); + this.sentinel = sentinel; + this.token = token; + } + + public TokenKey(TableId tableId, Token token) + { + this(tableId, NORMAL_SENTINEL, token); + } + + public TokenKey withToken(Token token) + { + return new TokenKey(table, sentinel, token); + } + + @Override + public Token token() + { + return token; + } + + @Override + public byte sentinel() + { + return sentinel; + } + + public byte prefixSentinel() + { + return (byte) (sentinel & PREFIX_MASK); + } + + public byte suffixSentinel() + { + return (byte) (sentinel & SUFFIX_MASK); + } + + // this can be invoked to a depth of 3 from a real token + @VisibleForTesting + public TokenKey before() + { + Invariants.require(!isTokenSentinel(), "Unable to call .before() when already a token sentinel: %s", this); + int lowestBit = Integer.lowestOneBit(sentinel); + Invariants.require(lowestBit != 1); + byte newSentinel = (byte)((sentinel ^ lowestBit) | (lowestBit >>> 1)); + return new TokenKey(table, newSentinel, token); + } + + // this can be invoked to a depth of 2 from a real token + @VisibleForTesting + public TokenKey after() + { + Invariants.require(!isTokenSentinel(), "Unable to call .after() when already a token sentinel: %s", this); + int lowestBit = Integer.lowestOneBit(sentinel); + // we can't use 0xf as we would not be able to disambiguate with variable length byte encoding escape + Invariants.require((lowestBit != 1) && (sentinel & 0xf) != 0xe); + byte newSentinel = (byte)(sentinel | (lowestBit >>> 1)); + return new TokenKey(table, newSentinel, token); + } + + @Override + public Object suffix() + { + return token; + } + + public Object printableSuffix() + { + Object suffix = suffix(); + if (isSentinel()) + { + if (isTableSentinel()) suffix = isMin() ? "-Inf" : "+Inf"; + if (isTokenSentinel()) suffix = (isBefore() ? "before(" : "after(") + suffix + ')'; + } + return suffix; + } + + @Override + public String toString() + { + return prefix() + ":" + printableSuffix(); + } + + public long estimatedSizeOnHeap() + { + return EMPTY_SIZE + token().getHeapSize(); + } + + public TokenKey withTable(TableId table) + { + return new TokenKey(table, sentinel, token); + } + + @Override + public RangeFactory rangeFactory() + { + return this; + } + + @Override + public Range newRange(RoutingKey start, RoutingKey end) + { + return TokenRange.create((TokenKey) start, (TokenKey) end); + } + + @Override + public Range newAntiRange(RoutingKey start, RoutingKey end) + { + return TokenRange.createUnsafe((TokenKey) start, (TokenKey) end); + } + + @Override + public RoutingKey toUnseekable() + { + return this; + } + + public boolean isMin() + { + return (sentinel & PREFIX_MASK) == (MIN_TABLE_SENTINEL & PREFIX_MASK); + } + + public boolean isMax() + { + return (sentinel & PREFIX_MASK) == (MAX_TABLE_SENTINEL & PREFIX_MASK); + } + + public boolean isSentinel() + { + return sentinel != NORMAL_SENTINEL; + } + + public boolean isTableSentinel() + { + return (sentinel & PREFIX_MASK) != (NORMAL_SENTINEL & PREFIX_MASK); + } + + public boolean isTokenSentinel() + { + return (sentinel & SUFFIX_MASK) != (NORMAL_SENTINEL & SUFFIX_MASK); + } + + public boolean isBefore() + { + return (sentinel & SUFFIX_MASK) == (BEFORE_TOKEN_SENTINEL & SUFFIX_MASK); + } + + public boolean isAfter() + { + return (sentinel & SUFFIX_MASK) == (AFTER_TOKEN_SENTINEL & SUFFIX_MASK); + } + + public static TokenKey min(TableId table, IPartitioner partitioner) + { + return new TokenKey(table, MIN_TABLE_SENTINEL, partitioner.getMinimumToken()); + } + + public static TokenKey max(TableId table, IPartitioner partitioner) + { + return new TokenKey(table, MAX_TABLE_SENTINEL, partitioner.getMinimumToken()); + } + + public static TokenKey before(TableId table, Token token) + { + return new TokenKey(table, BEFORE_TOKEN_SENTINEL, token); + } + + public static final class Serializer implements AccordSearchableKeySerializer + { + private Serializer() {} + + // stream serialization methods - including a dynamic length for variable size tokens + // types are byte comparable only after any length component + + @Override + public long serializedSize(TokenKey key) + { + IPartitioner partitioner = key.token.getPartitioner(); + int size = 2 + key.table.serializedCompactComparableSize(); + int tokenSize = partitioner.accordFixedLength(); + if (tokenSize >= 0) + return size + tokenSize; + tokenSize = partitioner.accordSerializedSize(key.token); + return size + tokenSize + VIntCoding.sizeOfUnsignedVInt(tokenSize); + } + + @Override + public void serialize(TokenKey key, DataOutputPlus out) throws IOException + { + IPartitioner partitioner = key.token.getPartitioner(); + int fixedLength = partitioner.accordFixedLength(); + if (fixedLength < 0) + { + int len = partitioner.accordSerializedSize(key.token); + out.writeUnsignedVInt32(len); + } + key.table.serializeCompactComparable(out); + serializeWithoutPrefixOrLength(key, out); + } + + @Override + public TokenKey deserialize(DataInputPlus in) throws IOException + { + return deserialize(in, getPartitioner()); + } + + public TokenKey deserialize(DataInputPlus in, IPartitioner partitioner) throws IOException + { + int len = partitioner.accordFixedLength(); + if (len < 0) len = in.readUnsignedVInt32(); + TableId tableId = deserializePrefix(in); + return deserializeWithPrefix(tableId, len + 2, in, partitioner); + } + + @Override + public void skip(DataInputPlus in) throws IOException + { + skip(in, getPartitioner()); + } + + public void skip(DataInputPlus in, IPartitioner partitioner) throws IOException + { + int len = partitioner.accordFixedLength(); + if (len < 0) len = in.readUnsignedVInt32(); + TableId.skipCompact(in); + in.skipBytesFully(len + 2); + } + + // methods for encoding/decoding a single ByteBuffer value + + public ByteBuffer serialize(TokenKey key) + { + int size = key.table.serializedCompactComparableSize() + serializedSizeWithoutPrefix(key); + ByteBuffer result = ByteBuffer.allocate(size); + result.position(key.table.serializeCompactComparable(result, ByteBufferAccessor.instance, 0)); + serializeWithoutPrefixOrLength(key, result); + result.flip(); + return result; + } + + // WARNING: consumes buffer! + public TokenKey deserialize(ByteBuffer buffer) + { + return deserialize(buffer, getPartitioner()); + } + + public TokenKey deserialize(ByteBuffer buffer, IPartitioner partitioner) + { + TableId tableId = TableId.deserializeCompactComparable(buffer, ByteBufferAccessor.instance, 0); + int offset = tableId.serializedCompactComparableSize(); + return deserializeWithPrefix(tableId, buffer.remaining() - offset, buffer, ByteBufferAccessor.instance, offset, partitioner); + } + + // WARNING: consumes buffer! + public TokenKey deserializeAndConsume(ByteBuffer buffer, IPartitioner partitioner) + { + TableId tableId = TableId.deserializeCompactComparable(buffer, ByteBufferAccessor.instance, 0); + int offset = buffer.position(); + buffer.position(offset + tableId.serializedCompactComparableSize()); + return deserializeWithPrefix(tableId, buffer.remaining(), buffer, partitioner); + } + + // methods for encoding searchable tokens separately from tableIds + + @Override + public int fixedKeyLengthForPrefix(Object prefix) + { + int size = getPartitioner().accordFixedLength(); + if (size < 0) + return size; + return 2 + size; + } + + @Override + public int serializedSizeWithoutPrefix(TokenKey key) + { + return 2 + key.token.getPartitioner().accordSerializedSize(key.token); + } + + @Override + public int serializedSizeOfPrefix(Object prefix) + { + return ((TableId) prefix).serializedCompactComparableSize(); + } + + @Override + public void serializePrefix(Object prefix, DataOutputPlus out) throws IOException + { + ((TableId)prefix).serializeCompactComparable(out); + } + + @Override + public void serializeWithoutPrefixOrLength(TokenKey key, DataOutputPlus out) throws IOException + { + out.write(key.prefixSentinel()); + key.token.getPartitioner().accordSerialize(key.token, out); + out.write(key.suffixSentinel()); + } + + public ByteBuffer serializeWithoutPrefixOrLength(TokenKey key) + { + IPartitioner partitioner = key.token.getPartitioner(); + ByteBuffer result = ByteBuffer.allocate(serializedSizeWithoutPrefix(key)); + serializeWithoutPrefixOrLength(key, result, partitioner); + result.flip(); + return result; + } + + public void serializeWithoutPrefixOrLength(TokenKey key, ByteBuffer out) + { + serializeWithoutPrefixOrLength(key, out, key.token.getPartitioner()); + } + + private static void serializeWithoutPrefixOrLength(TokenKey key, ByteBuffer out, IPartitioner partitioner) + { + out.put(key.prefixSentinel()); + partitioner.accordSerialize(key.token, out); + out.put(key.suffixSentinel()); + } + + @Override + public TableId deserializePrefix(DataInputPlus in) throws IOException + { + return TableId.deserializeCompactComparable(in); + } + + @Override + public TokenKey deserializeWithPrefix(Object tableId, int length, DataInputPlus in) throws IOException + { + return deserializeWithPrefix(tableId, length, in, getPartitioner()); + } + + public TokenKey deserializeWithPrefix(Object tableId, int length, DataInputPlus in, IPartitioner partitioner) throws IOException + { + byte sentinel = in.readByte(); + Token token = partitioner.accordDeserialize(in, length - 2); + sentinel |= in.readByte(); + return new TokenKey((TableId) tableId, sentinel, token); + } + + public TokenKey deserializeWithPrefixAndImpliedLength(Object tableId, V src, ValueAccessor accessor, int offset) + { + return deserializeWithPrefixAndImpliedLength(tableId, src, accessor, offset, getPartitioner()); + } + + public TokenKey deserializeWithPrefixAndImpliedLength(Object tableId, V src, ValueAccessor accessor, int offset, IPartitioner partitioner) + { + return deserializeWithPrefix(tableId, accessor.remaining(src, offset), src, accessor, offset, partitioner); + } + + public TokenKey deserializeWithPrefix(Object tableId, int length, V src, ValueAccessor accessor, int offset, IPartitioner partitioner) + { + byte sentinel = accessor.getByte(src, offset++); + Token token = partitioner.accordDeserialize(src, accessor, offset, length - 2); + offset += partitioner.accordSerializedSize(token); + sentinel |= accessor.getByte(src, offset); + return new TokenKey((TableId) tableId, sentinel, token); + } + + // WARNING: consumes buffer! + public TokenKey deserializeWithPrefixAndImpliedLength(Object tableId, ByteBuffer buffer, IPartitioner partitioner) + { + return deserializeWithPrefix(tableId, buffer.remaining(), buffer, partitioner); + } + + // WARNING: consumes buffer! + public TokenKey deserializeWithPrefix(Object tableId, int length, ByteBuffer buffer, IPartitioner partitioner) + { + byte sentinel = buffer.get(); + Token token = partitioner.accordDeserialize(buffer, length - 2); + sentinel |= buffer.get(); + return new TokenKey((TableId) tableId, sentinel, token); + } + + public static final byte ESCAPE_BYTE = 0x0f; + private static final byte[] ESCAPE_BYTES = new byte[] { ESCAPE_BYTE }; + private static final int UNESCAPE = ESCAPE_BYTE; + private static final int UNESCAPE_MASK = 0xffff; + + public static int countEscapes(byte[] bytes) + { + int escapeLimit = escapeLimit(bytes); + int i = 0; + int count = 0; + while ((i = nextEscape(bytes, i, escapeLimit)) >= 0) + { + ++count; + ++i; + } + return count; + } + + public static int serializedSize(byte[] bytes) + { + return 1 + bytes.length + countEscapes(bytes); + } + + private static int escapeLimit(byte[] bytes) + { + return bytes.length - 1; + } + + private static int nextEscape(byte[] bytes, int index, int escapeLimit) + { + while (index <= escapeLimit) + { + if (bytes[index] == 0 && (index == escapeLimit || (bytes[index + 1] & 0xff) <= ESCAPE_BYTE)) + return index; + ++index; + } + return -1; + } + + public static void serializeWithEscapes(byte[] bytes, ByteBuffer out) + { + serializeWithEscapesInternal(bytes, out, ByteBuffer::put); + out.put(trailingByte(bytes)); + } + + public static void serializeWithEscapes(byte[] bytes, DataOutputPlus out) throws IOException + { + serializeWithEscapesInternal(bytes, out, DataOutputPlus::write); + out.writeByte(trailingByte(bytes)); + } + + interface WriteBytes + { + void write(V out, byte[] bytes, int offset, int length) throws T; + } + + private static byte trailingByte(byte[] bytes) + { + return 0; + } + + private static void serializeWithEscapesInternal(byte[] bytes, V out, WriteBytes write) throws T + { + int i = 0, escapeLimit = escapeLimit(bytes); + while (true) + { + int nexti = nextEscape(bytes, i, escapeLimit); + if (nexti < 0) + break; + write.write(out, bytes, i, 1 + nexti - i); + write.write(out, ESCAPE_BYTES, 0, 1); + i = nexti + 1; + } + write.write(out, bytes, i, bytes.length - i); + } + + public static byte[] deserializeWithEscapes(ByteBuffer in, int escapedLength) + { + Invariants.require(escapedLength >= 1); + --escapedLength; + byte[] bytes = new byte[escapedLength]; + in.get(bytes, 0, Math.min(escapedLength, in.remaining())); + byte[] result = removeEscapes(bytes); + byte trailingEscape = in.get(); + Invariants.require(trailingEscape == trailingByte(result)); + return result; + } + + public static byte[] deserializeWithEscapes(V src, ValueAccessor accessor, int offset, int escapedLength) + { + Invariants.require(--escapedLength >= 0); + byte[] result = removeEscapes(accessor.toArray(src, offset, Math.min(escapedLength, accessor.remaining(src, offset)))); + Invariants.require(trailingByte(result) == accessor.getByte(src, offset + escapedLength)); + return result; + } + + public static byte[] deserializeWithEscapes(DataInputPlus in, int escapedLength) throws IOException + { + byte[] result = new byte[escapedLength - 1]; + in.readFully(result); + result = removeEscapes(result); + byte trailingEscape = in.readByte(); + Invariants.require(trailingEscape == trailingByte(result)); + return result; + } + + private static byte[] removeEscapes(byte[] bytes) + { + if (bytes.length == 0) + return bytes; + + int count = 1; + int escapeMatcher = bytes[0]; + for (int i = 1; i < bytes.length ; ++i) + { + byte next = bytes[i]; + escapeMatcher = (escapeMatcher << 8) | next; + if ((escapeMatcher & UNESCAPE_MASK) != UNESCAPE) + { + if (count != i) + bytes[count] = next; + count++; + } + } + + if (bytes.length != count) + bytes = Arrays.copyOf(bytes, count); + return bytes; + } + + } + + public static final Serializer serializer = new Serializer(); + + public static class KeyspaceSplitter implements ShardDistributor + { + final EvenSplit subSplitter; + public KeyspaceSplitter(EvenSplit subSplitter) + { + this.subSplitter = subSplitter; + } + + @Override + public List split(Ranges ranges) + { + Map> byTable = new TreeMap<>(); + for (Range range : ranges) + { + byTable.computeIfAbsent(((AccordRoutableKey)range.start()).table, ignore -> new ArrayList<>()) + .add(range); + } + + List results = new ArrayList<>(); + for (List keyspaceRanges : byTable.values()) + results.addAll(subSplitter.split(Ranges.ofSortedAndDeoverlapped(keyspaceRanges.toArray(new Range[0])))); + return results; + } + + @Override + public Range splitRange(Range range, int from, int to, int numSplits) + { + return subSplitter.splitRange(range, from, to, numSplits); + } + + @Override + public Ranges selectFirstSubRanges(Range range, Ranges subRanges, int totalSplits) + { + return subSplitter.selectFirstSubRanges(range, subRanges, totalSplits); + } + + @Override + public int numberOfSplitsPossible(Range range) + { + return subSplitter.numberOfSplitsPossible(range); + } + } +} diff --git a/src/java/org/apache/cassandra/service/accord/events/CacheEvents.java b/src/java/org/apache/cassandra/service/accord/events/CacheEvents.java new file mode 100644 index 000000000000..b37e4185098a --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/events/CacheEvents.java @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.events; + +import jdk.jfr.Category; +import jdk.jfr.DataAmount; +import jdk.jfr.Event; +import jdk.jfr.Label; +import jdk.jfr.Name; +import jdk.jfr.Percentage; +import jdk.jfr.StackTrace; + +@Category({"Accord", "Accord Cache"}) +@StackTrace(false) +public abstract class CacheEvents extends Event +{ + public int shard; + public String instance; + public String key; + public String status; + @DataAmount(DataAmount.BYTES) + public int lastQueriedEstimatedSizeOnHeap; + + // instance + @DataAmount(DataAmount.BYTES) + public long instanceAllocated; + public long instanceStatsQueries, instanceStatsHits, instanceStatsMisses; + + @Percentage + public double instanceStatsHitRate; + + // cache + @DataAmount(DataAmount.BYTES) + public long globalCapacity, globalAllocated; + public int globalSize, globalReferenced, globalUnreferenced; + + public long globalStatsQueries, globalStatsHits, globalStatsMisses; + + @Percentage + public double globalStatsHitRate; + + @Percentage + public double globalFree; + public void update() + { + instanceStatsHitRate = 1D - (instanceStatsHits / (double) instanceStatsQueries); + globalStatsHitRate = 1D - (globalStatsHits / (double) globalStatsQueries); + globalFree = 1.0D - (globalAllocated / (double) globalCapacity); + } + + @Name("cassandra.accord.cache.Add") + @Label("Accord Cache Add") + public static class Add extends CacheEvents { } + + @Name("cassandra.accord.cache.Release") + @Label("Accord Cache Release") + public static class Release extends CacheEvents { } + + @Name("cassandra.accord.cache.Evict") + @Label("Accord Cache Evict") + public static class Evict extends CacheEvents { } +} diff --git a/src/java/org/apache/cassandra/service/accord/exceptions/AccordReadExhaustedException.java b/src/java/org/apache/cassandra/service/accord/exceptions/AccordReadExhaustedException.java new file mode 100644 index 000000000000..697b31d6e339 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/exceptions/AccordReadExhaustedException.java @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.exceptions; + +import org.apache.cassandra.exceptions.ReadTimeoutException; + +import static org.apache.cassandra.db.ConsistencyLevel.SERIAL; + +public class AccordReadExhaustedException extends ReadTimeoutException +{ + public AccordReadExhaustedException(int received, int blockFor, boolean dataPresent) + { + super(SERIAL, received, blockFor, dataPresent); + } + + public AccordReadExhaustedException(int received, int blockFor, boolean dataPresent, String msg) + { + super(SERIAL, received, blockFor, dataPresent, msg); + } +} diff --git a/src/java/org/apache/cassandra/service/accord/exceptions/AccordReadPreemptedException.java b/src/java/org/apache/cassandra/service/accord/exceptions/AccordReadPreemptedException.java new file mode 100644 index 000000000000..53b37b7c89e9 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/exceptions/AccordReadPreemptedException.java @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.exceptions; + +import org.apache.cassandra.exceptions.ReadTimeoutException; + +import static org.apache.cassandra.db.ConsistencyLevel.SERIAL; + +// shim to allow tests to tell the difference between preemption and other protocol timeouts +public class AccordReadPreemptedException extends ReadTimeoutException +{ + public AccordReadPreemptedException(int received, int blockFor, boolean dataPresent) + { + super(SERIAL, received, blockFor, dataPresent); + } + + public AccordReadPreemptedException(int received, int blockFor, boolean dataPresent, String msg) + { + super(SERIAL, received, blockFor, dataPresent, msg); + } +} diff --git a/src/java/org/apache/cassandra/service/accord/exceptions/AccordWriteExhaustedException.java b/src/java/org/apache/cassandra/service/accord/exceptions/AccordWriteExhaustedException.java new file mode 100644 index 000000000000..6aca227240e8 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/exceptions/AccordWriteExhaustedException.java @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.exceptions; + +import org.apache.cassandra.db.WriteType; +import org.apache.cassandra.exceptions.WriteTimeoutException; + +import static org.apache.cassandra.db.ConsistencyLevel.SERIAL; + +public class AccordWriteExhaustedException extends WriteTimeoutException +{ + public AccordWriteExhaustedException(int received, int blockFor) + { + super(WriteType.CAS, SERIAL, received, blockFor); + } + + public AccordWriteExhaustedException(int received, int blockFor, String msg) + { + super(WriteType.CAS, SERIAL, received, blockFor, msg); + } +} diff --git a/src/java/org/apache/cassandra/service/accord/exceptions/AccordWritePreemptedException.java b/src/java/org/apache/cassandra/service/accord/exceptions/AccordWritePreemptedException.java new file mode 100644 index 000000000000..ee02e6dfad19 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/exceptions/AccordWritePreemptedException.java @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.exceptions; + +import org.apache.cassandra.db.WriteType; +import org.apache.cassandra.exceptions.WriteTimeoutException; + +import static org.apache.cassandra.db.ConsistencyLevel.SERIAL; + +// quick hack to allow tests to tell the difference between preemption and other protocol timeouts +public class AccordWritePreemptedException extends WriteTimeoutException +{ + public AccordWritePreemptedException(int received, int blockFor) + { + super(WriteType.CAS, SERIAL, received, blockFor); + } + + public AccordWritePreemptedException(int received, int blockFor, String msg) + { + super(WriteType.CAS, SERIAL, received, blockFor, msg); + } +} diff --git a/src/java/org/apache/cassandra/service/accord/fastpath/FastPathStrategy.java b/src/java/org/apache/cassandra/service/accord/fastpath/FastPathStrategy.java new file mode 100644 index 000000000000..fac1d4a6a130 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/fastpath/FastPathStrategy.java @@ -0,0 +1,186 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.fastpath; + +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; +import java.util.Set; + +import javax.annotation.Nullable; + +import com.google.common.collect.ImmutableMap; + +import accord.local.Node; +import accord.utils.SortedArrays.SortedArrayList; +import org.apache.cassandra.db.TypeSizes; +import org.apache.cassandra.exceptions.ConfigurationException; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.tcm.serialization.MetadataSerializer; +import org.apache.cassandra.tcm.serialization.Version; + +import static org.apache.cassandra.utils.LocalizeString.toLowerCaseLocalized; +import static org.apache.cassandra.utils.LocalizeString.toUpperCaseLocalized; + +public interface FastPathStrategy +{ + enum Kind + { + SIMPLE, PARAMETERIZED, INHERIT_KEYSPACE; + + static final String KEY = "kind"; + private static final Map LOOKUP; + static + { + ImmutableMap.Builder builder = ImmutableMap.builder(); + builder.put(SIMPLE.name(), SIMPLE); + builder.put(PARAMETERIZED.name(), PARAMETERIZED); + builder.put(INHERIT_KEYSPACE.name(), INHERIT_KEYSPACE); + LOOKUP = builder.build(); + } + + public byte asByte() + { + return (byte) ordinal(); + } + + public static Kind fromByte(byte i) + { + return values()[i]; + } + + @Nullable + public static Kind fromString(String s) + { + return LOOKUP.get(toUpperCaseLocalized(s)); + } + + @Nullable + private static Kind fromMap(Map map) + { + String name = map.remove(KEY); + return name != null ? fromString(name) : null; + } + } + + /** + * @param nodes expected to be sorted deterministically + * @param unavailable + * @param dcMap + * @return + */ + SortedArrayList calculateFastPath(SortedArrayList nodes, Set unavailable, Map dcMap); + + Kind kind(); + + Map asMap(); + + String asCQL(); + + static FastPathStrategy fromMap(Map map) + { + if (map == null || map.isEmpty()) + return SimpleFastPathStrategy.instance; + + map = new HashMap<>(map); + Kind kind = Kind.fromMap(map); + if (kind == null) + return map.isEmpty() + ? simple() + : ParameterizedFastPathStrategy.fromMap(map); + + switch (kind) + { + case SIMPLE: + return simple(); + case PARAMETERIZED: + return ParameterizedFastPathStrategy.fromMap(map); + case INHERIT_KEYSPACE: + return inheritKeyspace(); + default: + throw new IllegalArgumentException("Unhandled strategy kind: " + kind); + } + } + + static FastPathStrategy tableStrategyFromString(String s) + { + s = toLowerCaseLocalized(s).trim(); + if (s.equals("keyspace")) + return InheritKeyspaceFastPathStrategy.instance; + if (s.equals("simple")) + return SimpleFastPathStrategy.instance; + + throw new ConfigurationException("Fast path strategy must either be 'keyspace', `default` or a map size and optional dcs {'size':n, 'dcs': dc0,dc1..."); + } + + static FastPathStrategy keyspaceStrategyFromString(String s) + { + s = toLowerCaseLocalized(s).trim(); + if (s.equals("simple")) + return SimpleFastPathStrategy.instance; + + throw new ConfigurationException("Fast path strategy must either be `default` or a map size and optional dcs {'size':n, 'dcs': dc0,dc1..."); + } + + static FastPathStrategy simple() + { + return SimpleFastPathStrategy.instance; + } + + static FastPathStrategy inheritKeyspace() + { + return InheritKeyspaceFastPathStrategy.instance; + } + + MetadataSerializer serializer = new MetadataSerializer() + { + public void serialize(FastPathStrategy strategy, DataOutputPlus out, Version version) throws IOException + { + Kind type = strategy.kind(); + out.write(type.asByte()); + if (type == Kind.PARAMETERIZED) + ParameterizedFastPathStrategy.serializer.serialize((ParameterizedFastPathStrategy) strategy, out, version); + } + + public FastPathStrategy deserialize(DataInputPlus in, Version version) throws IOException + { + Kind type = Kind.fromByte(in.readByte()); + switch (type) + { + case SIMPLE: + return simple(); + case PARAMETERIZED: + return ParameterizedFastPathStrategy.serializer.deserialize(in, version); + case INHERIT_KEYSPACE: + return inheritKeyspace(); + default: + throw new IllegalArgumentException("Unhandled type: " + type); + } + } + + public long serializedSize(FastPathStrategy strategy, Version version) + { + long size = TypeSizes.BYTE_SIZE; + if (strategy.kind() == Kind.PARAMETERIZED) + size += ParameterizedFastPathStrategy.serializer.serializedSize((ParameterizedFastPathStrategy) strategy, version); + return size; + } + }; +} diff --git a/src/java/org/apache/cassandra/service/accord/fastpath/InheritKeyspaceFastPathStrategy.java b/src/java/org/apache/cassandra/service/accord/fastpath/InheritKeyspaceFastPathStrategy.java new file mode 100644 index 000000000000..39fae55d929b --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/fastpath/InheritKeyspaceFastPathStrategy.java @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.fastpath; + +import java.util.Map; +import java.util.Set; + +import com.google.common.collect.ImmutableMap; + +import accord.local.Node; +import accord.utils.SortedArrays.SortedArrayList; + +public class InheritKeyspaceFastPathStrategy implements FastPathStrategy +{ + public static final FastPathStrategy instance = new InheritKeyspaceFastPathStrategy(); + + private static final Map SCHEMA_PARAMS = ImmutableMap.of(Kind.KEY, Kind.INHERIT_KEYSPACE.name()); + + private InheritKeyspaceFastPathStrategy() {} + + @Override + public SortedArrayList calculateFastPath(SortedArrayList nodes, Set unavailable, Map dcMap) + { + throw new IllegalStateException("InheritKeyspaceFastPathStrategy should be replaced before calculateFastPath is called"); + } + + @Override + public Kind kind() + { + return Kind.INHERIT_KEYSPACE; + } + + @Override + public String toString() + { + return "keyspace"; + } + + public Map asMap() + { + return SCHEMA_PARAMS; + } + + @Override + public String asCQL() + { + return "'keyspace'"; + } +} diff --git a/src/java/org/apache/cassandra/service/accord/fastpath/ParameterizedFastPathStrategy.java b/src/java/org/apache/cassandra/service/accord/fastpath/ParameterizedFastPathStrategy.java new file mode 100644 index 000000000000..858c15b64ddc --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/fastpath/ParameterizedFastPathStrategy.java @@ -0,0 +1,376 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.fastpath; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Comparator; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Set; +import java.util.regex.Pattern; +import java.util.stream.Collectors; + +import com.google.common.base.Joiner; +import com.google.common.collect.ImmutableMap; + +import accord.api.VisibleForImplementation; +import accord.local.Node; +import accord.topology.Shard; +import accord.utils.Invariants; +import accord.utils.SortedArrays.SortedArrayList; +import org.apache.cassandra.db.TypeSizes; +import org.apache.cassandra.exceptions.ConfigurationException; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.tcm.serialization.MetadataSerializer; +import org.apache.cassandra.tcm.serialization.Version; + +import javax.annotation.Nonnull; + +public class ParameterizedFastPathStrategy implements FastPathStrategy +{ + public static final String SIZE = "size"; + public static final String DCS = "dcs"; + private static final Joiner DC_JOINER = Joiner.on(','); + private static final Pattern COMMA_SEPARATOR = Pattern.compile(","); + private static final Pattern COLON_SEPARATOR = Pattern.compile(":"); + + static class WeightedDc implements Comparable + { + private static final WeightedDc UNSPECIFIED = new WeightedDc("", Integer.MAX_VALUE, true); + private static final MetadataSerializer serializer = new MetadataSerializer() + { + public void serialize(WeightedDc dc, DataOutputPlus out, Version version) throws IOException + { + out.writeUTF(dc.name); + out.writeUnsignedVInt32(dc.weight); + out.writeBoolean(dc.autoWeight); + } + + public WeightedDc deserialize(DataInputPlus in, Version version) throws IOException + { + return new WeightedDc(in.readUTF(), + in.readUnsignedVInt32(), + in.readBoolean()); + } + + public long serializedSize(WeightedDc dc, Version version) + { + return TypeSizes.sizeof(dc.name) + TypeSizes.sizeofUnsignedVInt(dc.weight) + TypeSizes.BOOL_SIZE; + } + }; + + final String name; + final int weight; + final boolean autoWeight; + + public WeightedDc(String name, int weight, boolean autoWeight) + { + this.name = name; + this.weight = weight; + this.autoWeight = autoWeight; + } + + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + WeightedDc that = (WeightedDc) o; + return weight == that.weight && autoWeight == that.autoWeight && Objects.equals(name, that.name); + } + + public int hashCode() + { + return Objects.hash(name, weight, autoWeight); + } + + @Override + public int compareTo(@Nonnull WeightedDc that) + { + int cmp = Integer.compare(this.weight, that.weight); + if (cmp != 0) return cmp; + return this.name.compareTo(that.name); + } + + public String toString() + { + return autoWeight ? name : name + ':' + weight; + } + + static String validateDC(String dc) + { + dc = dc.trim(); + if (dc.isEmpty()) + throw cfe("dc name must not be empty", DCS); + return dc; + } + + static int validateWeight(String w) + { + int weight = Integer.parseInt(w); + if (weight < 0) + throw cfe("DC weights must be zero or positive"); + return weight; + } + + static WeightedDc fromString(String s, int idx) + { + s = s.trim(); + if (s.isEmpty()) + throw cfe("%s entries must not be empty", DCS); + + String[] parts = COLON_SEPARATOR.split(s); + if (parts.length == 1) + return new WeightedDc(validateDC(parts[0]), idx, true); + else if (parts.length == 2) + return new WeightedDc(validateDC(parts[0]), validateWeight(parts[1]), false); + else + throw cfe("Invalid dc weighting syntax %s, use :", s); + } + } + + public final int size; + private final ImmutableMap dcs; + + ParameterizedFastPathStrategy(int size, ImmutableMap dcs) + { + this.size = size; + this.dcs = dcs; + } + + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + ParameterizedFastPathStrategy that = (ParameterizedFastPathStrategy) o; + return size == that.size && Objects.equals(dcs, that.dcs); + } + + public int hashCode() + { + return Objects.hash(size, dcs); + } + + private static class NodeSorter implements Comparable + { + private final Node.Id id; + private final int sortPos; + private final int dcIndex; + private final int health; + + public NodeSorter(Node.Id id, int sortPos, int dcIndex, int health) + { + this.id = id; + this.sortPos = sortPos; + this.dcIndex = dcIndex; + this.health = health; + } + + @Override + public int compareTo(@Nonnull NodeSorter that) + { + int cmp = this.health - that.health; + if (cmp != 0) return cmp; + + cmp = this.dcIndex - that.dcIndex; + if (cmp != 0) return cmp; + + cmp = this.sortPos - that.sortPos; + if (cmp != 0) return cmp; + + Invariants.require(this.id.equals(that.id)); + return 0; + } + } + + @Override + public SortedArrayList calculateFastPath(SortedArrayList nodes, Set unavailable, Map dcMap) + { + List sorters = new ArrayList<>(nodes.size()); + + for (int i = 0, mi = nodes.size(); i < mi; i++) + { + Node.Id node = nodes.get(i); + String dc = dcMap.get(node); + int dcScore = dcs.getOrDefault(dc, WeightedDc.UNSPECIFIED).weight; + NodeSorter sorter = new NodeSorter(node, i, dcScore, unavailable.contains(node) ? 1 : 0); + sorters.add(sorter); + } + + sorters.sort(Comparator.naturalOrder()); + + int slowQuorum = Shard.slowQuorumSize(nodes.size()); + int fpSize = Math.max(size, slowQuorum); + Node.Id[] array = new Node.Id[fpSize]; + for (int i=0; i electorate = new SortedArrayList<>(array); + Invariants.require(electorate.size() >= slowQuorum); + return electorate; + } + + private static ConfigurationException cfe(String fmt, Object... args) + { + return new ConfigurationException(String.format(fmt, args)); + } + + public static ParameterizedFastPathStrategy fromMap(Map map) + { + if (!map.containsKey(SIZE)) + throw cfe("fast_path must be set to 'keyspace' or 'default' or a map defining '%s' and optionally '%s'", SIZE, DCS); + + int size; + try + { + size = Integer.parseInt(map.get(SIZE)); + } + catch (NumberFormatException e) + { + throw cfe("%s must be a positive number, got %s", SIZE, map.get(SIZE)); + } + + if (size < 1) + throw cfe("%s must be greater than zero", SIZE); + + ImmutableMap dcMap; + if (map.containsKey(DCS)) + { + + Map mutableDcs = new HashMap<>(); + String dcsString = map.get(DCS); + if (dcsString.trim().isEmpty()) + throw cfe("%s must specify at least one DC", DCS); + + int autoIdx = 0; + boolean hasAuto = false; + boolean hasManual = false; + for (String dcString : COMMA_SEPARATOR.split(dcsString)) + { + WeightedDc dc = WeightedDc.fromString(dcString, autoIdx++); + if (mutableDcs.containsKey(dc.name)) + throw cfe("Multiple entries for DC %s", dc.name); + + if (dc.autoWeight) + { + if (hasManual) throw cfe("Cannot mix auto and manual DC weights"); + hasAuto = true; + } + else + { + if (hasAuto) throw cfe("Cannot mix auto and manual DC weights"); + hasManual = true; + } + + mutableDcs.put(dc.name, dc); + } + dcMap = ImmutableMap.copyOf(mutableDcs); + } + else + { + dcMap = ImmutableMap.of(); + } + + Set keys = new HashSet<>(map.keySet()); + keys.remove(SIZE); + keys.remove(DCS); + if (!keys.isEmpty()) + throw cfe("Unrecognized fast path options provided: ", keys); + + return new ParameterizedFastPathStrategy(size, dcMap); + } + + @Override + public Kind kind() + { + return Kind.PARAMETERIZED; + } + + @VisibleForImplementation + public Iterable dcStrings() + { + return dcs.values().stream().sorted().map(Object::toString).collect(Collectors.toList()); + } + + @Override + public String toString() + { + StringBuilder sb = new StringBuilder("{"); + sb.append('\'').append(SIZE).append("':").append(size); + if (!dcs.isEmpty()) + sb.append(", ").append(DCS).append(':').append('\'').append(DC_JOINER.join(dcStrings())).append('\''); + + return sb.append('}').toString(); + } + + public Map asMap() + { + Map params = new HashMap<>(); + params.put(Kind.KEY, kind().name()); + params.put(SIZE, Integer.toString(size)); + params.put(DCS, DC_JOINER.join(dcStrings())); + return params; + } + + @Override + public String asCQL() + { + return toString(); + } + + public static final MetadataSerializer serializer = new MetadataSerializer() + { + public void serialize(ParameterizedFastPathStrategy strategy, DataOutputPlus out, Version version) throws IOException + { + out.writeUnsignedVInt32(strategy.size); + out.writeUnsignedVInt32(strategy.dcs.size()); + for (WeightedDc dc : strategy.dcs.values()) + WeightedDc.serializer.serialize(dc, out, version); + } + + public ParameterizedFastPathStrategy deserialize(DataInputPlus in, Version version) throws IOException + { + int size = in.readUnsignedVInt32(); + int numDCs = in.readUnsignedVInt32(); + ImmutableMap.Builder builder = ImmutableMap.builder(); + for (int i=0; i SCHEMA_PARAMS = ImmutableMap.of(Kind.KEY, Kind.SIMPLE.name()); + + private SimpleFastPathStrategy() {} + + @Override + public SortedArrayList calculateFastPath(SortedArrayList nodes, Set unavailable, Map dcMap) + { + int maxFailures = Shard.maxToleratedFailures(nodes.size()); + int discarded = 0; + + if (unavailable.isEmpty()) + return nodes; + + Object[] tmp = ArrayBuffers.cachedAny().get(nodes.size()); + for (int i=0,mi=nodes.size(); i fastPath = new SortedArrayList<>(array); + Invariants.require(fastPath.size() >= Shard.slowQuorumSize(nodes.size())); + return fastPath; + } + + @Override + public Kind kind() + { + return Kind.SIMPLE; + } + + @Override + public String toString() + { + return "simple"; + } + + public Map asMap() + { + return SCHEMA_PARAMS; + } + + @Override + public String asCQL() + { + return "'simple'"; + } +} diff --git a/src/java/org/apache/cassandra/service/accord/interop/AccordInteropAdapter.java b/src/java/org/apache/cassandra/service/accord/interop/AccordInteropAdapter.java new file mode 100644 index 000000000000..ca466105eb8b --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/interop/AccordInteropAdapter.java @@ -0,0 +1,131 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.interop; + +import java.util.function.BiConsumer; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import accord.api.Result; +import accord.api.Update; +import accord.coordinate.CoordinationAdapter; +import accord.coordinate.CoordinationAdapter.Adapters.TxnAdapter; +import accord.coordinate.ExecuteFlag.ExecuteFlags; +import accord.coordinate.ExecutePath; +import accord.local.Node; +import accord.messages.Apply; +import accord.primitives.Ballot; +import accord.primitives.Deps; +import accord.primitives.FullRoute; +import accord.primitives.Route; +import accord.primitives.Timestamp; +import accord.primitives.Txn; +import accord.primitives.TxnId; +import accord.primitives.Writes; +import accord.topology.Topologies; +import accord.topology.Topologies.SelectNodeOwnership; +import org.apache.cassandra.db.ConsistencyLevel; +import org.apache.cassandra.service.accord.AccordEndpointMapper; +import org.apache.cassandra.service.accord.api.AccordAgent; +import org.apache.cassandra.service.accord.interop.AccordInteropExecution.InteropExecutor; +import org.apache.cassandra.service.accord.txn.AccordUpdate; +import org.apache.cassandra.service.accord.txn.TxnRead; + +import static accord.messages.Apply.Kind.Maximal; +import static accord.messages.Apply.Kind.Minimal; + +public class AccordInteropAdapter extends TxnAdapter +{ + private static final Logger logger = LoggerFactory.getLogger(AccordInteropAdapter.class); + public static final class AccordInteropFactory extends DefaultFactory + { + final AccordInteropAdapter standard, recovery; + + public AccordInteropFactory(AccordAgent agent, AccordEndpointMapper endpointMapper) + { + final InteropExecutor executor = new InteropExecutor(agent); + standard = new AccordInteropAdapter(executor, endpointMapper, Minimal); + recovery = new AccordInteropAdapter(executor, endpointMapper, Maximal); + } + + @Override + public CoordinationAdapter get(TxnId txnId, Kind step) + { + if (txnId.isSyncPoint()) + return super.get(txnId, step); + return (CoordinationAdapter) (step == Kind.Recovery ? recovery : standard); + } + }; + + private final InteropExecutor executor; + private final AccordEndpointMapper endpointMapper; + private final Apply.Kind applyKind; + + private AccordInteropAdapter(InteropExecutor executor, AccordEndpointMapper endpointMapper, Apply.Kind applyKind) + { + super(Minimal); + this.executor = executor; + this.endpointMapper = endpointMapper; + this.applyKind = applyKind; + } + + @Override + public void execute(Node node, Topologies any, FullRoute route, Ballot ballot, ExecutePath path, ExecuteFlags executeFlags, TxnId txnId, Txn txn, Timestamp executeAt, Deps stableDeps, Deps sendDeps, BiConsumer callback) + { + if (!doInteropExecute(node, route, ballot, txnId, txn, executeAt, stableDeps, callback)) + super.execute(node, any, route, ballot, path, executeFlags, txnId, txn, executeAt, stableDeps, sendDeps, callback); + } + + @Override + public void persist(Node node, Topologies any, Route require, Route sendTo, SelectNodeOwnership selectSendTo, FullRoute route, Ballot ballot, TxnId txnId, Txn txn, Timestamp executeAt, Deps deps, Writes writes, Result result, BiConsumer callback) + { + if (applyKind == Minimal && doInteropPersist(node, any, require, sendTo, selectSendTo, ballot, txnId, txn, executeAt, deps, writes, result, route, callback)) + return; + + super.persist(node, any, require, sendTo, selectSendTo, route, ballot, txnId, txn, executeAt, deps, writes, result, callback); + } + + + private boolean doInteropExecute(Node node, FullRoute route, Ballot ballot, TxnId txnId, Txn txn, Timestamp executeAt, Deps deps, BiConsumer callback) + { + // Unrecoverable repair always needs to be run by AccordInteropExecution + AccordUpdate.Kind updateKind = AccordUpdate.kind(txn.update()); + ConsistencyLevel consistencyLevel = txn.read() instanceof TxnRead ? ((TxnRead) txn.read()).cassandraConsistencyLevel() : null; + if (updateKind != AccordUpdate.Kind.UNRECOVERABLE_REPAIR && (consistencyLevel == null || consistencyLevel == ConsistencyLevel.ONE || txn.read().keys().isEmpty())) + return false; + + new AccordInteropExecution(node, txnId, txn, updateKind, route, ballot, executeAt, deps, callback, executor, consistencyLevel, endpointMapper) + .start(); + return true; + } + + private boolean doInteropPersist(Node node, Topologies any, Route require, Route sendTo, SelectNodeOwnership selectSendTo, Ballot ballot, TxnId txnId, Txn txn, Timestamp executeAt, Deps deps, Writes writes, Result result, FullRoute fullRoute, BiConsumer callback) + { + Update update = txn.update(); + ConsistencyLevel consistencyLevel = update instanceof AccordUpdate ? ((AccordUpdate) update).cassandraCommitCL() : null; + if (consistencyLevel == null || consistencyLevel == ConsistencyLevel.ANY || writes.isEmpty()) + return false; + + Topologies all = execution(node, any, sendTo, selectSendTo, fullRoute, txnId, executeAt); + new AccordInteropPersist(node, all, txnId, require, ballot, txn, executeAt, deps, writes, result, fullRoute, consistencyLevel, callback) + .start(Minimal, any, writes, result); + return true; + } +} diff --git a/src/java/org/apache/cassandra/service/accord/interop/AccordInteropApply.java b/src/java/org/apache/cassandra/service/accord/interop/AccordInteropApply.java new file mode 100644 index 000000000000..3a5671c97066 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/interop/AccordInteropApply.java @@ -0,0 +1,273 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.interop; + +import javax.annotation.Nullable; + +import accord.api.LocalListeners; +import accord.api.Result; +import accord.local.Command; +import accord.local.Node.Id; +import accord.local.SafeCommand; +import accord.local.SafeCommandStore; +import accord.local.StoreParticipants; +import accord.messages.Apply; +import accord.messages.MessageType; +import accord.primitives.Ballot; +import accord.primitives.Deps; +import accord.primitives.FullRoute; +import accord.primitives.PartialDeps; +import accord.primitives.PartialTxn; +import accord.primitives.Route; +import accord.primitives.Status; +import accord.primitives.Timestamp; +import accord.primitives.Txn; +import accord.primitives.TxnId; +import accord.primitives.Unseekables; +import accord.primitives.Writes; +import accord.topology.Topologies; +import accord.utils.UnhandledEnum; +import org.agrona.collections.Int2ObjectHashMap; +import org.apache.cassandra.db.ConsistencyLevel; +import org.apache.cassandra.service.accord.AccordMessageSink.AccordMessageType; +import org.apache.cassandra.service.accord.serializers.ApplySerializers.ApplySerializer; +import org.apache.cassandra.service.accord.serializers.IVersionedSerializer; +import org.apache.cassandra.service.accord.txn.AccordUpdate; +import org.apache.cassandra.tcm.ClusterMetadata; + +import static accord.utils.Invariants.requireArgument; +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkState; + +/** + * Apply that waits until the transaction is actually applied before sending a response + * // TODO (desired): At this point there are a plethora of do X to Command, then wait until state Y before maybe doing Z and returning a response, potentially returning insufficient along the way + * // and these all are a bit copy pasta in terms of managing things like waiting on, obsoletion, cancellation/listeners, insufficient etc. and it would be less fragile + * // in the long run to not duplicate these kind of difficult to get right mechanism and have a single pluggable framework to request each specific behavior + */ +public class AccordInteropApply extends Apply implements LocalListeners.ComplexListener +{ + public static final Apply.Factory FACTORY = new Apply.Factory() + { + @Override + public Apply create(Kind kind, Id to, Topologies participates, TxnId txnId, Ballot ballot, Route route, Txn txn, Timestamp executeAt, Deps deps, Writes writes, Result result, FullRoute fullRoute) + { + checkArgument(kind != Kind.Maximal, "Shouldn't need to send a maximal commit with interop support"); + ConsistencyLevel commitCL = txn.update() instanceof AccordUpdate ? ((AccordUpdate) txn.update()).cassandraCommitCL() : null; + // Any asynchronous apply option should use the regular Apply that doesn't wait for writes to complete + if (commitCL == null || commitCL == ConsistencyLevel.ANY) + return Apply.FACTORY.create(kind, to, participates, txnId, ballot, route, txn, executeAt, deps, writes, result, fullRoute); + return new AccordInteropApply(kind, to, participates, txnId, ballot, route, txn, executeAt, deps, writes, result, fullRoute); + } + }; + + public static final IVersionedSerializer serializer = new ApplySerializer() + { + @Override + protected AccordInteropApply deserializeApply(TxnId txnId, Ballot ballot, Route scope, long minEpoch, long waitForEpoch, long maxEpoch, Apply.Kind kind, Timestamp executeAt, PartialDeps deps, PartialTxn txn, @Nullable FullRoute fullRoute, Writes writes, Result result) + { + return new AccordInteropApply(kind, txnId, ballot, scope, minEpoch, waitForEpoch, maxEpoch, executeAt, deps, txn, fullRoute, writes, result); + } + }; + + transient int waitingOnCount; + transient Int2ObjectHashMap listeners; + boolean failed; + + private AccordInteropApply(Kind kind, TxnId txnId, Ballot ballot, Route route, long minEpoch, long waitForEpoch, long maxEpoch, Timestamp executeAt, PartialDeps deps, @Nullable PartialTxn txn, @Nullable FullRoute fullRoute, Writes writes, Result result) + { + super(kind, txnId, ballot, route, minEpoch, waitForEpoch, maxEpoch, executeAt, deps, txn, fullRoute, writes, result); + } + + private AccordInteropApply(Kind kind, Id to, Topologies participates, TxnId txnId, Ballot ballot, Route route, Txn txn, Timestamp executeAt, Deps deps, Writes writes, Result result, FullRoute fullRoute) + { + super(kind, to, participates, txnId, ballot, route, txn, executeAt, deps, writes, result, fullRoute); + } + + @Override + public ApplyReply apply(SafeCommandStore safeStore, StoreParticipants participants) + { + ClusterMetadata cm = ClusterMetadata.current(); + checkState(cm.epoch.getEpoch() >= minEpoch, "TCM epoch %d is < minEpoch %d", cm.epoch.getEpoch(), minEpoch); + ApplyReply reply = super.apply(safeStore, participants); + requireArgument(reply == ApplyReply.Redundant || reply == ApplyReply.Applied || reply == ApplyReply.Insufficient, "Unexpected ApplyReply"); + + // Hasn't necessarily finished applying yet so need to check and maybe add a listener + // Redundant means we are competing with a recovery coordinator which is fine + // we don't need to return an error we can wait for the Apply + // Insufficient means it is safe to install the listener and wait for Apply to happen + // once the coordinator sends a maximal commit + // Applied doesn't actually mean the command is in the Applied state so we still need to check and maybe install + // the listener + SafeCommand safeCommand = safeStore.get(txnId, participants); + Command current = safeCommand.current(); + // Don't actually think it is possible for this to reach applied while we are stll running, but just to be safe + // check anyways + Status status = current.status(); + switch (status) + { + default: throw new AssertionError(); + case NotDefined: + case PreAccepted: + case AcceptedInvalidate: + case AcceptedMedium: + case AcceptedSlow: + case PreCommitted: + case Committed: + case Stable: + case PreApplied: + LocalListeners.Registered listener = safeStore.register(txnId, this); + synchronized (this) + { + if (!failed) + { + if (listeners == null) + listeners = new Int2ObjectHashMap<>(); + listeners.put(safeStore.commandStore().id(), listener); + ++waitingOnCount; + listener = null; + } + } + if (listener != null) + listener.cancel(); + break; + + case Applied: + case Invalidated: + case Truncated: + } + + return reply; + } + + private synchronized void ack() + { + // wait for -1 to ensure the setup phase has also completed. Setup calls ack in its callback + // and prevents races where we respond before dispatching all the required reads (if the reads are + // completing faster than the reads can be setup on all required shards) + if (-1 == --waitingOnCount) + node.reply(replyTo, replyContext, ApplyReply.Applied, null); + } + + @Override + public ApplyReply reduce(ApplyReply r1, ApplyReply r2) + { + return r1 == null || r2 == null + ? r1 == null ? r2 : r1 + : r1.compareTo(r2) >= 0 ? r1 : r2; + } + + @Override + protected void acceptInternal(ApplyReply reply, Throwable failure) + { + if (reply == ApplyReply.Insufficient) + { + // Respond with insufficient which should make the coordinator send us the commit + // we need to respond + node.reply(replyTo, replyContext, reply, failure); + } + else if (failure != null) + { + node.reply(replyTo, replyContext, null, failure); + node.agent().onUncaughtException(failure); + fail(); + } + + // Unless failed always ack to indicate setup has completed otherwise the counter never gets to -1 + if (failure == null) + ack(); + } + + private void fail() + { + Int2ObjectHashMap listeners; + synchronized (this) + { + failed = true; + listeners = this.listeners; + this.listeners = null; + } + if (listeners != null) + listeners.forEach((i, v) -> v.cancel()); + } + + @Override + public TxnId primaryTxnId() + { + return txnId; + } + + @Override + public Unseekables keys() + { + return scope; + } + + @Override + public MessageType type() + { + return AccordMessageType.INTEROP_APPLY_REQ; + } + + @Override + public String toString() + { + return "AccordInteropApply{" + + "txnId:" + txnId + + ", deps:" + deps + + ", executeAt:" + executeAt + + ", writes:" + writes + + ", result:" + result + + '}'; + } + + @Override + public boolean notify(SafeCommandStore safeStore, SafeCommand safeCommand) + { + Command command = safeCommand.current(); + switch (command.status()) + { + default: throw new UnhandledEnum(command.status()); + case NotDefined: + case PreAccepted: + case AcceptedInvalidate: + case AcceptedMedium: + case AcceptedSlow: + case PreCommitted: + case Committed: + case PreApplied: + case Stable: + return true; + + case Applied: + case Invalidated: + case Truncated: + } + + synchronized (this) + { + if (failed) + return false; + + listeners.remove(safeStore.commandStore().id()); + ack(); + } + return false; + } +} diff --git a/src/java/org/apache/cassandra/service/accord/interop/AccordInteropExecution.java b/src/java/org/apache/cassandra/service/accord/interop/AccordInteropExecution.java new file mode 100644 index 000000000000..ffd42f0c4890 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/interop/AccordInteropExecution.java @@ -0,0 +1,456 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.interop; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Set; +import java.util.concurrent.Callable; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.function.BiConsumer; + +import accord.api.Agent; +import accord.api.Data; +import accord.api.Result; +import accord.coordinate.CoordinationAdapter; +import accord.local.AgentExecutor; +import accord.local.CommandStore; +import accord.local.Node; +import accord.local.Node.Id; +import accord.messages.Commit; +import accord.messages.Commit.Kind; +import accord.primitives.AbstractRanges; +import accord.primitives.Ballot; +import accord.primitives.Deps; +import accord.primitives.FullRoute; +import accord.primitives.Keys; +import accord.primitives.Participants; +import accord.primitives.Timestamp; +import accord.primitives.Txn; +import accord.primitives.TxnId; +import accord.topology.Shard; +import accord.topology.Topologies; +import accord.topology.Topology; +import accord.utils.UnhandledEnum; +import accord.utils.async.AsyncChain; +import accord.utils.async.AsyncChains; +import org.apache.cassandra.concurrent.Stage; +import org.apache.cassandra.db.ConsistencyLevel; +import org.apache.cassandra.db.Mutation; +import org.apache.cassandra.db.PartitionRangeReadCommand; +import org.apache.cassandra.db.ReadCommand; +import org.apache.cassandra.db.ReadResponse; +import org.apache.cassandra.db.SinglePartitionReadCommand; +import org.apache.cassandra.db.SinglePartitionReadCommand.Group; +import org.apache.cassandra.db.partitions.FilteredPartition; +import org.apache.cassandra.db.partitions.PartitionIterator; +import org.apache.cassandra.db.rows.RowIterator; +import org.apache.cassandra.dht.Range; +import org.apache.cassandra.dht.Token; +import org.apache.cassandra.locator.EndpointsForToken; +import org.apache.cassandra.locator.InetAddressAndPort; +import org.apache.cassandra.locator.Replica; +import org.apache.cassandra.metrics.AccordClientRequestMetrics; +import org.apache.cassandra.net.Message; +import org.apache.cassandra.net.RequestCallback; +import org.apache.cassandra.schema.KeyspaceMetadata; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.service.StorageProxy; +import org.apache.cassandra.service.accord.AccordEndpointMapper; +import org.apache.cassandra.service.accord.TokenRange; +import org.apache.cassandra.service.accord.api.AccordAgent; +import org.apache.cassandra.service.accord.api.TokenKey; +import org.apache.cassandra.service.accord.interop.AccordInteropReadCallback.MaximalCommitSender; +import org.apache.cassandra.service.accord.serializers.TableMetadatasAndKeys; +import org.apache.cassandra.service.accord.txn.AccordUpdate; +import org.apache.cassandra.service.accord.txn.TxnData; +import org.apache.cassandra.service.accord.txn.TxnDataKeyValue; +import org.apache.cassandra.service.accord.txn.TxnDataRangeValue; +import org.apache.cassandra.service.accord.txn.TxnRead; +import org.apache.cassandra.service.accord.txn.UnrecoverableRepairUpdate; +import org.apache.cassandra.service.consensus.migration.ConsensusRequestRouter; +import org.apache.cassandra.service.reads.ReadCoordinator; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.transport.Dispatcher; + +import static accord.coordinate.CoordinationAdapter.Factory.Kind.Standard; +import static accord.primitives.Txn.Kind.Write; +import static accord.topology.Topologies.SelectNodeOwnership.SHARE; +import static accord.utils.Invariants.requireArgument; +import static org.apache.cassandra.metrics.ClientRequestsMetricsHolder.accordReadMetrics; +import static org.apache.cassandra.metrics.ClientRequestsMetricsHolder.accordWriteMetrics; + +/* + * The core interoperability problem between Accord and C* writes (regular, and read repair) + * is that when the writes don't go through Accord then Accord can read data that is not yet committed + * because Accord replicas can lag behind and multiple coordinators can be attempting to compute the result of a + * transaction and they can compute different results depending on what they consider to be the inputs to the Accord + * transaction. + * + * We generally solve this by forcing non-Accord writes through Accord as well as by having Accord perform read repair + * on its inputs. + * + */ +public class AccordInteropExecution implements ReadCoordinator, MaximalCommitSender +{ + static class InteropExecutor implements AgentExecutor + { + private final AccordAgent agent; + + public InteropExecutor(AccordAgent agent) + { + this.agent = agent; + } + + @Override + public Agent agent() + { + return agent; + } + + @Override + public AsyncChain build(Callable task) + { + try + { + return AsyncChains.success(task.call()); + } + catch (Throwable e) + { + return AsyncChains.failure(e); + } + } + } + + private final Node node; + private final TxnId txnId; + private final Txn txn; + private final FullRoute route; + private final Ballot ballot; + private final Timestamp executeAt; + private final Deps deps; + private final BiConsumer callback; + private final AgentExecutor executor; + private final ConsistencyLevel consistencyLevel; + private final AccordEndpointMapper endpointMapper; + + private final Topologies executes; + private final Topologies allTopologies; + private final Topology executeTopology; + private final Topology coordinateTopology; + + private final AtomicInteger readsCurrentlyUnderConstruction; + + private final Set contacted; + private final AccordUpdate.Kind updateKind; + + public AccordInteropExecution(Node node, TxnId txnId, Txn txn, AccordUpdate.Kind updateKind, FullRoute route, Ballot ballot, Timestamp executeAt, Deps deps, BiConsumer callback, + AgentExecutor executor, ConsistencyLevel consistencyLevel, AccordEndpointMapper endpointMapper) + { + requireArgument(!txn.read().keys().isEmpty() || updateKind == AccordUpdate.Kind.UNRECOVERABLE_REPAIR); + this.node = node; + this.txnId = txnId; + this.txn = txn; + this.route = route; + this.ballot = ballot; + this.executeAt = executeAt; + this.deps = deps; + this.callback = callback; + this.executor = executor; + + requireArgument(updateKind == AccordUpdate.Kind.UNRECOVERABLE_REPAIR || consistencyLevel == ConsistencyLevel.QUORUM || consistencyLevel == ConsistencyLevel.ALL || consistencyLevel == ConsistencyLevel.SERIAL); + this.consistencyLevel = consistencyLevel; + this.endpointMapper = endpointMapper; + + // TODO (required): compare this to latest logic in Accord, make sure it makes sense + this.executes = node.topology().forEpoch(route, executeAt.epoch(), SHARE); + this.allTopologies = txnId.epoch() != executeAt.epoch() + ? node.topology().preciseEpochs(route, txnId.epoch(), executeAt.epoch(), SHARE) + : executes; + this.executeTopology = executes.getEpoch(executeAt.epoch()); + this.coordinateTopology = allTopologies.getEpoch(txnId.epoch()); + if (consistencyLevel != ConsistencyLevel.ALL) + { + readsCurrentlyUnderConstruction = new AtomicInteger(txn.read().keys().size()); + contacted = Collections.newSetFromMap(new ConcurrentHashMap<>()); + } + else + { + readsCurrentlyUnderConstruction = null; + contacted = null; + } + this.updateKind = updateKind; + } + + @Override + public boolean localReadSupported() + { + return false; + } + + @Override + public EndpointsForToken forNonLocalStrategyTokenRead(ClusterMetadata doNotUse, KeyspaceMetadata keyspace, TableId tableId, Token token) + { + TokenKey key = new TokenKey(tableId, token); + Shard shard = executeTopology.forKey(key); + Range range = ((TokenRange) shard.range).toKeyspaceRange(); + + Replica[] replicas = new Replica[shard.nodes.size()]; + for (int i=0; i message, InetAddressAndPort to, RequestCallback callback) + { + Node.Id id = endpointMapper.mappedId(to); + // TODO (desired): It would be better to use the re-use the command from the transaction but it's fragile + // to try and figure out exactly what changed for things like read repair and short read protection + // Also this read scope doesn't reflect the contents of this particular read and is larger than it needs to be + // TODO (required): understand interop and whether StableFastPath is appropriate + AccordInteropStableThenRead commit = new AccordInteropStableThenRead(id, allTopologies, txnId, Kind.StableFastPath, executeAt, txn, deps, route, message.payload); + node.send(id, commit, executor, new AccordInteropRead.ReadCallback(id, to, message, callback, this)); + } + + @Override + public void sendReadRepairMutation(Message message, InetAddressAndPort to, RequestCallback callback) + { + requireArgument(message.payload.potentialTxnConflicts().allowed); + requireArgument(message.payload.getTableIds().size() == 1); + Node.Id id = endpointMapper.mappedId(to); + Participants readScope = Participants.singleton(txn.read().keys().domain(), new TokenKey(message.payload.getTableIds().iterator().next(), message.payload.key().getToken())); + AccordInteropReadRepair readRepair = new AccordInteropReadRepair(id, executes, txnId, readScope, executeAt.epoch(), message.payload); + node.send(id, readRepair, executor, new AccordInteropReadRepair.ReadRepairCallback(id, to, message, callback, this)); + } + + private List> readChains(Dispatcher.RequestTime requestTime) + { + switch (txnId.domain()) + { + case Key: + return keyReadChains((Txn.InMemory)txn, requestTime); + case Range: + return rangeReadChains((Txn.InMemory)txn, requestTime); + default: + throw UnhandledEnum.unknown(txnId.domain()); + } + } + + private List> keyReadChains(Txn.InMemory txn, Dispatcher.RequestTime requestTime) + { + TxnRead read = (TxnRead) txn.read(); + Keys keys = (Keys) read.keys(); + TableMetadatasAndKeys tablesAndKeys = (TableMetadatasAndKeys) txn.implementationDefined; + ClusterMetadata cm = ClusterMetadata.current(); + List> results = new ArrayList<>(); + keys.forEach(key -> { + read.forEachWithKey(key, fragment -> { + SinglePartitionReadCommand command = (SinglePartitionReadCommand) fragment.command(tablesAndKeys.tables); + + // This should only rarely occur when coordinators start a transaction in a migrating range + // because they haven't yet updated their cluster metadata. + // It would be harmless to do the read, because it will be rejected in `TxnQuery` anyways, + // but it's faster to skip the read + AccordClientRequestMetrics metrics = txnId.isWrite() ? accordWriteMetrics : accordReadMetrics; + // TODO (required): This doesn't use the metadata from the correct epoch + if (!ConsensusRequestRouter.instance.isKeyManagedByAccordForReadAndWrite(cm, command.metadata().id, command.partitionKey())) + { + metrics.migrationSkippedReads.mark(); + results.add(AsyncChains.success(TxnData.emptyPartition(fragment.txnDataName(), command))); + return; + } + + Group group = Group.one(command); + results.add(AsyncChains.ofCallable(Stage.ACCORD_MIGRATION.executor(), () -> { + TxnData result = new TxnData(); + // Enforcing limits is redundant since we only have a group of size 1, but checking anyways + // documents the requirement here + try (PartitionIterator iterator = StorageProxy.maybeEnforceLimits(StorageProxy.fetchRows(group.queries, consistencyLevel, this, requestTime), group)) + { + if (iterator.hasNext()) + { + try (RowIterator partition = iterator.next()) + { + TxnDataKeyValue value = new TxnDataKeyValue(partition); + if (value.hasRows() || command.selectsFullPartition()) + result.put(fragment.txnDataName(), value); + } + } + } + return result; + })); + }); + + }); + return results; + } + + private List> rangeReadChains(Txn.InMemory txn, Dispatcher.RequestTime requestTime) + { + TxnRead read = (TxnRead) txn.read(); + AbstractRanges ranges = (AbstractRanges) read.keys(); + TableMetadatasAndKeys tablesAndKeys = (TableMetadatasAndKeys) txn.implementationDefined; + List> results = new ArrayList<>(); + ranges.forEach(range -> { + read.forEachWithKey(range, fragment -> { + PartitionRangeReadCommand command = ((PartitionRangeReadCommand) fragment.command(tablesAndKeys.tables)).withTxnReadName(fragment.txnDataName()); + + // TODO (required): To make migration work we need to validate that the range is all on Accord + + results.add(AsyncChains.ofCallable(Stage.ACCORD_MIGRATION.executor(), () -> { + TxnData result = new TxnData(); + try (PartitionIterator iterator = StorageProxy.getRangeSlice(command, consistencyLevel, this, requestTime)) + { + TxnDataRangeValue value = new TxnDataRangeValue(); + while (iterator.hasNext()) + { + try (RowIterator partition = iterator.next()) + { + FilteredPartition filtered = FilteredPartition.create(partition); + if (filtered.hasRows() || command.selectsFullPartition()) + value.add(filtered); + } + } + result.put(fragment.txnDataName(), value); + } + return result; + })); + }); + + }); + return results; + } + + private AsyncChain readChains() + { + // TODO (expected): use normal query nano time + Dispatcher.RequestTime requestTime = Dispatcher.RequestTime.forImmediateExecution(); + + List> results = readChains(requestTime); + if (results.isEmpty()) + return AsyncChains.success(new TxnData()); + + if (results.size() == 1) + return results.get(0); + + return AsyncChains.reduce(results, Data::merge); + } + + /* + * Any nodes not contacted for read need to be sent commits + */ + @Override + public void notifyOfInitialContacts(EndpointsForToken fullDataRequests, EndpointsForToken transientRequests, EndpointsForToken digestRequests) + { + if (readsCurrentlyUnderConstruction == null) + return; + + for (int i = 0; i < fullDataRequests.size(); i++) + contacted.add(fullDataRequests.endpoint(i)); + for (int i = 0; i < transientRequests.size(); i++) + contacted.add(transientRequests.endpoint(i)); + for (int i = 0; i < digestRequests.size(); i++) + contacted.add(digestRequests.endpoint(i)); + if (readsCurrentlyUnderConstruction.decrementAndGet() == 0) + sendStableToUncontacted(); + } + + private void sendStableToUncontacted() + { + for (Node.Id to : executeTopology.nodes()) + if (!contacted.contains(endpointMapper.mappedEndpoint(to))) + node.send(to, new Commit(Kind.StableFastPath, to, allTopologies, txnId, txn, route, Ballot.ZERO, executeAt, deps)); + } + + public void start() + { + if (coordinateTopology != executeTopology) + { + for (Node.Id to : allTopologies.nodes()) + { + if (!executeTopology.contains(to)) + node.send(to, new Commit(Kind.StableFastPath, to, allTopologies, txnId, txn, route, Ballot.ZERO, executeAt, deps)); + } + } + AsyncChain result; + if (updateKind == AccordUpdate.Kind.UNRECOVERABLE_REPAIR) + result = executeUnrecoverableRepairUpdate(); + else + result = readChains(); + + CommandStore cs = node.commandStores().select(route.homeKey()); + result.beginAsResult().withExecutor(cs).begin((data, failure) -> { + if (failure == null) + ((CoordinationAdapter)node.coordinationAdapter(txnId, Standard)).persist(node, executes, route, ballot, txnId, txn, executeAt, deps, txnId.is(Write) ? txn.execute(txnId, executeAt, data) : null, txn.result(txnId, executeAt, data), callback); + else + callback.accept(null, failure); + }); + } + + private AsyncChain executeUnrecoverableRepairUpdate() + { + return AsyncChains.ofCallable(Stage.ACCORD_MIGRATION.executor(), () -> { + UnrecoverableRepairUpdate repairUpdate = (UnrecoverableRepairUpdate)txn.update(); + // TODO (expected): We should send the read in the same message as the commit. This requires refactor ReadData.Kind so that it doesn't specify the ordinal encoding + // and can be extended similar to MessageType which allows additional types not from Accord to be added + // This commit won't necessarily execute before the interop read repair message so there could be an insufficient which is fine + for (Node.Id to : executeTopology.nodes()) + node.send(to, new Commit(Kind.StableFastPath, to, allTopologies, txnId, txn, route, Ballot.ZERO, executeAt, deps)); + repairUpdate.runBRR(AccordInteropExecution.this); + return new TxnData(); + }); + } + + @Override + public boolean isEventuallyConsistent() + { + return false; + } + + @Override + public ReadCommand maybeAllowOutOfRangeReads(ReadCommand readCommand, ConsistencyLevel cl) + { + // Reading from a single coordinator so there is no reconciliation at the coordinator and filtering/limits + // need to be pushed down to query execution + boolean withoutReconciliation = cl == null || cl == ConsistencyLevel.ONE; + // Really just want to enable allowPotentialTxnConflicts without changing anything else + // but didn't want to add another method for constructing a modified read command + if (readCommand instanceof SinglePartitionReadCommand) + return ((SinglePartitionReadCommand)readCommand).withTransactionalSettings(withoutReconciliation, readCommand.nowInSec()); + else + { + PartitionRangeReadCommand rangeCommand = ((PartitionRangeReadCommand)readCommand); + return rangeCommand.withTransactionalSettings(readCommand.nowInSec(), rangeCommand.dataRange().keyRange(), true, withoutReconciliation); + } + } + + // Provide request callbacks with a way to send maximal commits on Insufficient responses + @Override + public void sendMaximalCommit(Id to) + { + Commit.stableMaximal(node, to, txn, txnId, executeAt, route, deps); + } +} diff --git a/src/java/org/apache/cassandra/service/accord/interop/AccordInteropPersist.java b/src/java/org/apache/cassandra/service/accord/interop/AccordInteropPersist.java new file mode 100644 index 000000000000..967438ea56f0 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/interop/AccordInteropPersist.java @@ -0,0 +1,167 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.interop; + +import java.util.function.BiConsumer; + +import accord.api.Result; +import accord.coordinate.Persist; +import accord.coordinate.tracking.AllTracker; +import accord.coordinate.tracking.QuorumTracker; +import accord.coordinate.tracking.RequestStatus; +import accord.coordinate.tracking.ResponseTracker; +import accord.local.Node; +import accord.messages.Apply; +import accord.primitives.Ballot; +import accord.primitives.Deps; +import accord.primitives.FullRoute; +import accord.primitives.Route; +import accord.primitives.Timestamp; +import accord.primitives.Txn; +import accord.primitives.TxnId; +import accord.primitives.Writes; +import accord.topology.Topologies; +import accord.utils.Invariants; +import org.apache.cassandra.db.ConsistencyLevel; +import org.apache.cassandra.utils.Throwables; + +/** + * Similar to Accord persist, but can wait on a configurable number of responses and sends AccordInteropApply messages + * that only return a response when the Apply has actually occurred. Regular Apply messages only get the transaction + * to PreApplied. + */ +public class AccordInteropPersist extends Persist +{ + private static class CallbackHolder + { + boolean isDone = false; + private final ResponseTracker tracker; + private final Result result; + private final BiConsumer clientCallback; + private Throwable failure = null; + + public CallbackHolder(ResponseTracker tracker, Result result, BiConsumer clientCallback) + { + this.tracker = tracker; + this.result = result; + this.clientCallback = clientCallback; + } + + private void handleStatus(RequestStatus status) + { + if (isDone) + return; + + switch (status) + { + default: throw new IllegalStateException("Unhandled request status " + status); + case Success: + isDone = true; + clientCallback.accept(result, null); + return; + case Failed: + isDone = true; + clientCallback.accept(null, failure); + return; + case NoChange: + // noop + } + } + + public void recordSuccess(Node.Id node) + { + handleStatus(tracker.recordSuccess(node)); + } + + public void recordFailure(Node.Id node, Throwable throwable) + { + failure = Throwables.merge(failure, throwable); + handleStatus(tracker.recordFailure(node)); + } + + boolean recordCallbackFailure(Throwable throwable) + { + if (isDone) + return false; + isDone = true; + failure = Throwables.merge(failure, throwable); + clientCallback.accept(null, failure); + return true; + } + } + + private final ConsistencyLevel consistencyLevel; + private CallbackHolder callback; + + public AccordInteropPersist(Node node, Topologies topologies, TxnId txnId, Route sendTo, Ballot ballot, Txn txn, Timestamp executeAt, Deps deps, Writes writes, Result result, FullRoute fullRoute, ConsistencyLevel consistencyLevel, BiConsumer clientCallback) + { + super(node, topologies, txnId, ballot, sendTo, txn, executeAt, deps, writes, result, fullRoute, AccordInteropApply.FACTORY); + Invariants.requireArgument(consistencyLevel == ConsistencyLevel.QUORUM || consistencyLevel == ConsistencyLevel.ALL || consistencyLevel == ConsistencyLevel.SERIAL || consistencyLevel == ConsistencyLevel.ONE); + this.consistencyLevel = consistencyLevel; + registerClientCallback(result, clientCallback); + } + + public void registerClientCallback(Result result, BiConsumer clientCallback) + { + Invariants.require(callback == null); + switch (consistencyLevel) + { + case ONE: // Can safely upgrade ONE to QUORUM/SERIAL to get a synchronous commit + case SERIAL: + case QUORUM: + callback = new CallbackHolder(new QuorumTracker(topologies), result, clientCallback); + break; + case ALL: + callback = new CallbackHolder(new AllTracker(topologies), result, clientCallback); + break; + default: + throw new IllegalArgumentException("Unhandled consistency level: " + consistencyLevel); + } + } + + @Override + public void onSuccess(Node.Id from, Apply.ApplyReply reply) + { + super.onSuccess(from, reply); + switch (reply) + { + case Redundant: + case Applied: + callback.recordSuccess(from); + return; + case Insufficient: + // On insufficient Persist will send a commit with the missing information + // which will allow a final response to be returned later that could be successful + return; + default: throw new IllegalArgumentException("Unhandled apply response " + reply); + } + } + + @Override + public void onFailure(Node.Id from, Throwable failure) + { + callback.recordFailure(from, failure); + } + + @Override + public boolean onCallbackFailure(Node.Id from, Throwable failure) + { + return callback.recordCallbackFailure(failure); + } +} diff --git a/src/java/org/apache/cassandra/service/accord/interop/AccordInteropRead.java b/src/java/org/apache/cassandra/service/accord/interop/AccordInteropRead.java new file mode 100644 index 000000000000..c7e5d36af897 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/interop/AccordInteropRead.java @@ -0,0 +1,321 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.interop; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.List; +import javax.annotation.Nonnull; +import javax.annotation.Nullable; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.Lists; + +import accord.api.Data; +import accord.local.Node; +import accord.local.SafeCommandStore; +import accord.messages.MessageType; +import accord.messages.ReadData; +import accord.primitives.AbstractRanges; +import accord.primitives.PartialTxn; +import accord.primitives.Participants; +import accord.primitives.Range; +import accord.primitives.Ranges; +import accord.primitives.Timestamp; +import accord.primitives.TxnId; +import accord.topology.Topologies; +import accord.utils.async.AsyncChain; +import accord.utils.async.AsyncChains; +import org.apache.cassandra.concurrent.Stage; +import org.apache.cassandra.db.PartitionRangeReadCommand; +import org.apache.cassandra.db.ReadCommand; +import org.apache.cassandra.db.ReadCommandVerbHandler; +import org.apache.cassandra.db.ReadResponse; +import org.apache.cassandra.db.SinglePartitionReadCommand; +import org.apache.cassandra.db.TypeSizes; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.locator.InetAddressAndPort; +import org.apache.cassandra.net.Message; +import org.apache.cassandra.net.RequestCallback; +import org.apache.cassandra.service.accord.AccordMessageSink.AccordMessageType; +import org.apache.cassandra.service.accord.TokenRange; +import org.apache.cassandra.service.accord.api.TokenKey; +import org.apache.cassandra.service.accord.serializers.CommandSerializers; +import org.apache.cassandra.service.accord.serializers.IVersionedSerializer; +import org.apache.cassandra.service.accord.serializers.KeySerializers; +import org.apache.cassandra.service.accord.serializers.ReadDataSerializers; +import org.apache.cassandra.service.accord.serializers.ReadDataSerializers.ReadDataSerializer; +import org.apache.cassandra.service.accord.serializers.Version; +import org.apache.cassandra.service.accord.txn.TxnNamedRead; +import org.apache.cassandra.service.accord.txn.TxnRead; +import org.apache.cassandra.utils.Pair; + +import static accord.primitives.SaveStatus.PreApplied; +import static accord.primitives.SaveStatus.ReadyToExecute; +import static com.google.common.base.Preconditions.checkNotNull; +import static com.google.common.base.Preconditions.checkState; + +public class AccordInteropRead extends ReadData +{ + + public static final IVersionedSerializer requestSerializer = new ReadDataSerializer() + { + @Override + public void serialize(AccordInteropRead read, DataOutputPlus out, Version version) throws IOException + { + CommandSerializers.txnId.serialize(read.txnId, out); + KeySerializers.participants.serialize(read.scope, out); + out.writeUnsignedVInt(read.executeAtEpoch); + ReadCommand.serializer.serialize(read.command, out, version.messageVersion()); + } + + @Override + public AccordInteropRead deserialize(DataInputPlus in, Version version) throws IOException + { + TxnId txnId = CommandSerializers.txnId.deserialize(in); + Participants scope = KeySerializers.participants.deserialize(in); + long executeAtEpoch = in.readUnsignedVInt(); + ReadCommand command = ReadCommand.serializer.deserialize(in, version.messageVersion()); + return new AccordInteropRead(txnId, scope, executeAtEpoch, command); + } + + @Override + public long serializedSize(AccordInteropRead read, Version version) + { + return CommandSerializers.txnId.serializedSize(read.txnId) + + KeySerializers.participants.serializedSize(read.scope) + + TypeSizes.sizeofUnsignedVInt(read.executeAtEpoch) + + ReadCommand.serializer.serializedSize(read.command, version.messageVersion()); + } + }; + + public static final IVersionedSerializer replySerializer = new ReadDataSerializers.ReplySerializer<>(LocalReadData.serializer); + + private static class LocalReadData implements Data + { + private static final Comparator> RESPONSE_COMPARATOR = Comparator.comparing(Pair::left); + + static final IVersionedSerializer serializer = new IVersionedSerializer<>() + { + @Override + public void serialize(LocalReadData data, DataOutputPlus out, Version version) throws IOException + { + data.ensureRemoteResponse(); + ReadResponse.serializer.serialize(data.remoteResponse, out, version.messageVersion()); + } + + @Override + public LocalReadData deserialize(DataInputPlus in, Version version) throws IOException + { + return new LocalReadData(ReadResponse.serializer.deserialize(in, version.messageVersion())); + } + + @Override + public long serializedSize(LocalReadData data, Version version) + { + data.ensureRemoteResponse(); + return ReadResponse.serializer.serializedSize(data.remoteResponse, version.messageVersion()); + } + }; + + // Will be null at coordinator + List> localResponses; + // Will be null at coordinator + final ReadCommand readCommand; + // Will be not null at coordinator, but null at the node creating the response until it serialized + ReadResponse remoteResponse; + + public LocalReadData(@Nullable TokenKey start, @Nonnull ReadResponse response, @Nonnull ReadCommand readCommand) + { + checkNotNull(response, "response is null"); + checkNotNull(readCommand, "readCommand is null"); + localResponses = ImmutableList.of(Pair.create(start, response)); + this.readCommand = readCommand; + this.remoteResponse = null; + } + + public LocalReadData(@Nonnull ReadResponse remoteResponse) + { + checkNotNull(remoteResponse); + this.remoteResponse = remoteResponse; + readCommand = null; + } + + @Override + public String toString() + { + if (localResponses != null) + return "LocalReadData{" + localResponses + '}'; + else + return "LocalReadData{" + remoteResponse + '}'; + } + + @Override + public Data merge(Data data) + { + checkState(remoteResponse == null, "Already serialized"); + checkState(readCommand.isRangeRequest(), "Should only ever be a single partition"); + LocalReadData other = (LocalReadData)data; + checkState(readCommand == other.readCommand, "Should share the same ReadCommand"); + if (localResponses.size() == 1) + { + List> merged = new ArrayList<>(); + merged.add(localResponses.get(0)); + localResponses = merged; + } + localResponses.addAll(other.localResponses); + return this; + } + + private void ensureRemoteResponse() + { + if (remoteResponse != null) + return; + // Range reads will be spread across command stores and need to be merged in token order + List> responses = localResponses; + if (responses.size() == 1) + { + remoteResponse = responses.get(0).right; + } + else + { + responses = new ArrayList(responses); + Collections.sort(responses, RESPONSE_COMPARATOR); + remoteResponse = ReadResponse.merge(Lists.transform(responses, Pair::right), readCommand); + } + } + } + + static class ReadCallback extends AccordInteropReadCallback + { + public ReadCallback(Node.Id id, InetAddressAndPort endpoint, Message message, RequestCallback wrapped, MaximalCommitSender maximalCommitSender) + { + super(id, endpoint, message, wrapped, maximalCommitSender); + } + + @Override + ReadResponse convertResponse(ReadOk ok) + { + return ((LocalReadData)ok.data).remoteResponse; + } + } + + private static final ExecuteOn EXECUTE_ON = new ExecuteOn(ReadyToExecute, PreApplied); + + protected final ReadCommand command; + + public AccordInteropRead(Node.Id to, Topologies topologies, TxnId txnId, Participants scope, long executeAtEpoch, ReadCommand command) + { + super(to, topologies, txnId, scope, executeAtEpoch); + this.command = command; + } + + public AccordInteropRead(TxnId txnId, Participants scope, long executeAtEpoch, ReadCommand command) + { + super(txnId, scope, executeAtEpoch); + this.command = command; + } + + @Override + public ReadType kind() + { + return ReadType.readTxnData; + } + + @Override + protected AsyncChain beginRead(SafeCommandStore safeStore, Timestamp executeAt, PartialTxn txn, Participants execute) + { + TxnRead txnRead = (TxnRead)txn.read(); + long nowInSeconds = TxnNamedRead.nowInSeconds(executeAt); + if (!command.isRangeRequest()) + { + SinglePartitionReadCommand readCommand = ((SinglePartitionReadCommand)command); + TokenKey key = new TokenKey(readCommand.metadata().id, readCommand.partitionKey().getToken()); + if (!execute.contains(key)) + return AsyncChains.success(null); + + ReadCommand submit = readCommand.withTransactionalSettings(TxnNamedRead.readsWithoutReconciliation(txnRead.cassandraConsistencyLevel()), nowInSeconds); + return AsyncChains.ofCallable(Stage.READ.executor(), () -> new LocalReadData(key, ReadCommandVerbHandler.instance.doRead(submit, false), command)); + } + + // This path can have a subrange we have never seen before provided by short read protection or read repair so we need to + // calculate the intersection with this instance of the command store and the actual command if it is not empty we + // will need to execute it + TokenRange commandRange = TxnNamedRead.boundsAsAccordRange(command.dataRange().keyRange(), command.metadata().id); + List> chains = new ArrayList<>(execute.size()); + for (Range r : (AbstractRanges)execute) + { + Range intersection = commandRange.intersection(r); + if (intersection == null) + continue; + ReadCommand submit = TxnNamedRead.commandForSubrange((PartitionRangeReadCommand) command, intersection, txnRead.cassandraConsistencyLevel(), nowInSeconds); + TokenKey routingKey = ((TokenRange)r).start(); + chains.add(AsyncChains.ofCallable(Stage.READ.executor(), () -> new LocalReadData(routingKey, ReadCommandVerbHandler.instance.doRead(submit, false), command))); + } + + if (chains.isEmpty()) + return AsyncChains.success(null); + + return AsyncChains.reduce(chains, Data::merge); + } + + @Override + protected ExecuteOn executeOn() + { + return EXECUTE_ON; + } + + @Override + protected ReadOk constructReadOk(Ranges unavailable, Data data, long uniqueHlc) + { + return new InteropReadOk(unavailable, data, uniqueHlc); + } + + @Override + public MessageType type() + { + return AccordMessageType.INTEROP_READ_REQ; + } + + @Override + public String toString() + { + return "AccordInteropRead{" + + "txnId=" + txnId + + "command=" + command + + '}'; + } + + private static class InteropReadOk extends ReadOk + { + public InteropReadOk(@Nullable Ranges unavailable, @Nullable Data data, long uniqueHlc) + { + super(unavailable, data, uniqueHlc); + } + + @Override + public MessageType type() + { + return AccordMessageType.INTEROP_READ_RSP; + } + } +} diff --git a/src/java/org/apache/cassandra/service/accord/interop/AccordInteropReadCallback.java b/src/java/org/apache/cassandra/service/accord/interop/AccordInteropReadCallback.java new file mode 100644 index 000000000000..ea9ffabe11d4 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/interop/AccordInteropReadCallback.java @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.interop; + +import javax.annotation.Nonnull; + +import accord.coordinate.Timeout; +import accord.local.Node; +import accord.messages.Callback; +import accord.messages.ReadData.ReadOk; +import accord.messages.ReadData.ReadReply; +import accord.utils.Invariants; +import org.apache.cassandra.exceptions.RequestFailure; +import org.apache.cassandra.exceptions.RequestFailureReason; +import org.apache.cassandra.locator.InetAddressAndPort; +import org.apache.cassandra.net.Message; +import org.apache.cassandra.net.RequestCallback; + +import static accord.messages.ReadData.CommitOrReadNack.Insufficient; + +public abstract class AccordInteropReadCallback implements Callback +{ + interface MaximalCommitSender + { + void sendMaximalCommit(@Nonnull Node.Id to); + } + + private final Node.Id id; + private final InetAddressAndPort endpoint; + private final Message message; + private final RequestCallback wrapped; + private final MaximalCommitSender maximalCommitSender; + + public AccordInteropReadCallback(Node.Id id, InetAddressAndPort endpoint, Message message, RequestCallback wrapped, MaximalCommitSender maximalCommitSender) + { + this.id = id; + this.message = message; + this.endpoint = endpoint; + this.wrapped = wrapped; + this.maximalCommitSender = maximalCommitSender; + } + + abstract T convertResponse(ReadOk ok); + + @Override + public void onSuccess(Node.Id from, ReadReply reply) + { + Invariants.requireArgument(from.equals(id)); + if (reply.isOk()) + { + wrapped.onResponse(message.responseWith(convertResponse((ReadOk) reply)).withFrom(endpoint)); + } + else if (reply == Insufficient) + { + // Might still send a response if we send a maximal commit. Accord would tryAlternative and send + // both the commit and an additional repair, but Cassandra doesn't have tryAlternative unless we add + // it and instead opts to trigger additional repair messages based on time. + maximalCommitSender.sendMaximalCommit(id); + } + else + { + wrapped.onFailure(endpoint, RequestFailure.UNKNOWN); + } + } + + @Override + public void onFailure(Node.Id from, Throwable failure) + { + RequestFailure requestFailure; + // Convert from Accord's timeout exception to our failure reason because timeout is something + // that is useful for metrics and can be handled differently + if (failure instanceof Timeout) + requestFailure = new RequestFailure(RequestFailureReason.TIMEOUT, failure); + else + requestFailure = new RequestFailure(RequestFailureReason.UNKNOWN, failure); + wrapped.onFailure(endpoint, requestFailure); + } + + @Override + public boolean onCallbackFailure(Node.Id from, Throwable failure) + { + wrapped.onFailure(endpoint, RequestFailure.UNKNOWN); + return true; + } +} diff --git a/src/java/org/apache/cassandra/service/accord/interop/AccordInteropReadRepair.java b/src/java/org/apache/cassandra/service/accord/interop/AccordInteropReadRepair.java new file mode 100644 index 000000000000..995d93b6b064 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/interop/AccordInteropReadRepair.java @@ -0,0 +1,183 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.interop; + +import java.io.IOException; +import javax.annotation.Nullable; + +import accord.api.Data; +import accord.local.Node; +import accord.local.SafeCommandStore; +import accord.messages.ReadData; +import accord.messages.MessageType; +import accord.primitives.PartialTxn; +import accord.primitives.Participants; +import accord.primitives.Ranges; +import accord.primitives.SaveStatus; +import accord.primitives.Timestamp; +import accord.primitives.TxnId; +import accord.topology.Topologies; +import accord.utils.async.AsyncChain; +import accord.utils.async.AsyncChains; +import org.apache.cassandra.db.Mutation; +import org.apache.cassandra.db.ReadRepairVerbHandler; +import org.apache.cassandra.db.TypeSizes; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.locator.InetAddressAndPort; +import org.apache.cassandra.net.Message; +import org.apache.cassandra.net.NoPayload; +import org.apache.cassandra.net.RequestCallback; +import org.apache.cassandra.net.Verb; +import org.apache.cassandra.service.accord.AccordMessageSink.AccordMessageType; +import org.apache.cassandra.service.accord.serializers.CommandSerializers; +import org.apache.cassandra.service.accord.serializers.IVersionedSerializer; +import org.apache.cassandra.service.accord.serializers.KeySerializers; +import org.apache.cassandra.service.accord.serializers.ReadDataSerializers; +import org.apache.cassandra.service.accord.serializers.ReadDataSerializers.ReadDataSerializer; +import org.apache.cassandra.service.accord.serializers.Version; + +/** + * Applies a read repair mutation from inside the context of a CommandStore via AbstractExecute + * ensuring that the contents of the read repair consist of data that isn't from transactions that + * haven't been committed yet at this command store. + */ +public class AccordInteropReadRepair extends ReadData +{ + public static final IVersionedSerializer requestSerializer = new ReadDataSerializer() + { + @Override + public void serialize(AccordInteropReadRepair repair, DataOutputPlus out, Version version) throws IOException + { + CommandSerializers.txnId.serialize(repair.txnId, out); + KeySerializers.participants.serialize(repair.scope, out); + out.writeUnsignedVInt(repair.executeAtEpoch); + Mutation.serializer.serialize(repair.mutation, out, version.messageVersion()); + } + + @Override + public AccordInteropReadRepair deserialize(DataInputPlus in, Version version) throws IOException + { + TxnId txnId = CommandSerializers.txnId.deserialize(in); + Participants scope = KeySerializers.participants.deserialize(in); + long executeAtEpoch = in.readUnsignedVInt(); + Mutation mutation = Mutation.serializer.deserialize(in, version.messageVersion()); + return new AccordInteropReadRepair(txnId, scope, executeAtEpoch, mutation); + } + + @Override + public long serializedSize(AccordInteropReadRepair repair, Version version) + { + return CommandSerializers.txnId.serializedSize(repair.txnId) + + KeySerializers.participants.serializedSize(repair.scope) + + TypeSizes.sizeofUnsignedVInt(repair.executeAtEpoch) + + Mutation.serializer.serializedSize(repair.mutation, version.messageVersion()); + } + }; + + static class ReadRepairCallback extends AccordInteropReadCallback + { + public ReadRepairCallback(Node.Id id, InetAddressAndPort endpoint, Message message, RequestCallback wrapped, MaximalCommitSender maximalCommitSender) + { + super(id, endpoint, message, wrapped, maximalCommitSender); + } + + @Override + Object convertResponse(ReadOk ok) + { + return NoPayload.noPayload; + } + } + + private static final ExecuteOn EXECUTE_ON = new ExecuteOn(SaveStatus.ReadyToExecute, SaveStatus.Applied); + + private final Mutation mutation; + + private static final IVersionedSerializer noop_data_serializer = new IVersionedSerializer<>() + { + @Override + public void serialize(Data t, DataOutputPlus out, Version version) throws IOException {} + @Override + public Data deserialize(DataInputPlus in, Version version) throws IOException { return Data.NOOP_DATA; } + + public long serializedSize(Data t, Version version) { return 0; } + }; + + public static final IVersionedSerializer replySerializer = new ReadDataSerializers.ReplySerializer<>(noop_data_serializer); + + public AccordInteropReadRepair(Node.Id to, Topologies topologies, TxnId txnId, Participants scope, long executeAtEpoch, Mutation mutation) + { + super(to, topologies, txnId, scope, executeAtEpoch); + this.mutation = mutation; + } + + public AccordInteropReadRepair(TxnId txnId, Participants scope, long executeAtEpoch, Mutation mutation) + { + super(txnId, scope, executeAtEpoch); + this.mutation = mutation; + } + + @Override + protected ExecuteOn executeOn() + { + return EXECUTE_ON; + } + + @Override + public ReadType kind() + { + return ReadType.readTxnData; + } + + @Override + protected AsyncChain beginRead(SafeCommandStore safeStore, Timestamp executeAt, PartialTxn txn, Participants execute) + { + // TODO (required): subtract unavailable ranges, either from read or from response (or on coordinator) + return AsyncChains.ofCallable(Verb.READ_REPAIR_REQ.stage.executor(), () -> { + ReadRepairVerbHandler.instance.applyMutation(mutation); + return Data.NOOP_DATA; + }); + } + + @Override + protected ReadOk constructReadOk(Ranges unavailable, Data data, long uniqueHlc) + { + return new InteropReadRepairOk(unavailable, data, uniqueHlc); + } + + @Override + public MessageType type() + { + return AccordMessageType.INTEROP_READ_REPAIR_REQ; + } + + private static class InteropReadRepairOk extends ReadOk + { + public InteropReadRepairOk(@Nullable Ranges unavailable, @Nullable Data data, long uniqueHlc) + { + super(unavailable, data, uniqueHlc); + } + + @Override + public MessageType type() + { + return AccordMessageType.INTEROP_READ_REPAIR_RSP; + } + } +} diff --git a/src/java/org/apache/cassandra/service/accord/interop/AccordInteropStableThenRead.java b/src/java/org/apache/cassandra/service/accord/interop/AccordInteropStableThenRead.java new file mode 100644 index 000000000000..4f282467b79d --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/interop/AccordInteropStableThenRead.java @@ -0,0 +1,183 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.interop; + +import java.io.IOException; +import javax.annotation.Nullable; + +import accord.local.Commands; +import accord.local.Node; +import accord.local.SafeCommand; +import accord.local.SafeCommandStore; +import accord.local.StoreParticipants; +import accord.messages.Commit; +import accord.messages.MessageType; +import accord.primitives.Ballot; +import accord.primitives.Deps; +import accord.primitives.FullRoute; +import accord.primitives.PartialDeps; +import accord.primitives.PartialTxn; +import accord.primitives.Participants; +import accord.primitives.Route; +import accord.primitives.Timestamp; +import accord.primitives.Txn; +import accord.primitives.TxnId; +import accord.topology.Topologies; +import org.apache.cassandra.db.ReadCommand; +import org.apache.cassandra.db.TypeSizes; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.service.accord.AccordMessageSink.AccordMessageType; +import org.apache.cassandra.service.accord.serializers.CommandSerializers; +import org.apache.cassandra.service.accord.serializers.CommandSerializers.ExecuteAtSerializer; +import org.apache.cassandra.service.accord.serializers.CommitSerializers; +import org.apache.cassandra.service.accord.serializers.DepsSerializers; +import org.apache.cassandra.service.accord.serializers.IVersionedSerializer; +import org.apache.cassandra.service.accord.serializers.KeySerializers; +import org.apache.cassandra.service.accord.serializers.ReadDataSerializers.ReadDataSerializer; +import org.apache.cassandra.service.accord.serializers.Version; + +import static accord.messages.Commit.WithDeps.HasDeps; +import static accord.messages.Commit.WithDeps.NoDeps; +import static accord.messages.Commit.WithTxn.HasTxn; +import static accord.messages.Commit.WithTxn.NoTxn; +import static accord.primitives.SaveStatus.PreApplied; +import static accord.primitives.SaveStatus.ReadyToExecute; + +public class AccordInteropStableThenRead extends AccordInteropRead +{ + // TODO (desired): duplicates a lot of StableThenReadSerializer + public static final IVersionedSerializer requestSerializer = new ReadDataSerializer<>() + { + @Override + public void serialize(AccordInteropStableThenRead read, DataOutputPlus out, Version version) throws IOException + { + CommandSerializers.txnId.serialize(read.txnId, out); + KeySerializers.participants.serialize(read.scope, out); + CommitSerializers.kind.serialize(read.kind, out); + out.writeUnsignedVInt(read.minEpoch); + ExecuteAtSerializer.serialize(read.txnId, read.executeAt, out); + if (read.kind.withTxn != NoTxn) + CommandSerializers.nullablePartialTxn.serialize(read.partialTxn, out, version); + if (read.kind.withDeps == HasDeps) + DepsSerializers.partialDeps.serialize(read.partialDeps, out); + if (read.kind.withTxn == HasTxn) + KeySerializers.fullRoute.serialize(read.route, out); + ReadCommand.serializer.serialize(read.command, out, version.messageVersion()); + } + + @Override + public AccordInteropStableThenRead deserialize(DataInputPlus in, Version version) throws IOException + { + TxnId txnId = CommandSerializers.txnId.deserialize(in); + Participants scope = KeySerializers.participants.deserialize(in); + Commit.Kind kind = CommitSerializers.kind.deserialize(in); + long minEpoch = in.readUnsignedVInt(); + Timestamp executeAt = ExecuteAtSerializer.deserialize(txnId, in); + PartialTxn partialTxn = kind.withTxn == NoTxn ? null : CommandSerializers.nullablePartialTxn.deserialize(in, version); + PartialDeps partialDeps = kind.withDeps == NoDeps ? null : DepsSerializers.partialDeps.deserialize(in); + FullRoute < ?> route = kind.withTxn == HasTxn ? KeySerializers.fullRoute.deserialize(in) : null; + ReadCommand command = ReadCommand.serializer.deserialize(in, version.messageVersion()); + return new AccordInteropStableThenRead(txnId, scope, kind, minEpoch, executeAt, partialTxn, partialDeps, route, command); + } + + @Override + public long serializedSize(AccordInteropStableThenRead read, Version version) + { + return CommandSerializers.txnId.serializedSize(read.txnId) + + KeySerializers.participants.serializedSize(read.scope) + + CommitSerializers.kind.serializedSize(read.kind) + + TypeSizes.sizeofUnsignedVInt(read.minEpoch) + + ExecuteAtSerializer.serializedSize(read.txnId, read.executeAt) + + (read.kind.withTxn == NoTxn ? 0 : CommandSerializers.nullablePartialTxn.serializedSize(read.partialTxn, version)) + + (read.kind.withDeps != HasDeps ? 0 : DepsSerializers.partialDeps.serializedSize(read.partialDeps)) + + (read.kind.withTxn != HasTxn ? 0 : KeySerializers.fullRoute.serializedSize(read.route)) + + ReadCommand.serializer.serializedSize(read.command, version.messageVersion()); + } + }; + + // TODO (required): why is this safe to execute at PreApplied? Document. + private static final ExecuteOn EXECUTE_ON = new ExecuteOn(ReadyToExecute, PreApplied); + + public final long minEpoch; + public final Commit.Kind kind; + public final Timestamp executeAt; + public final @Nullable PartialTxn partialTxn; + public final @Nullable PartialDeps partialDeps; + public final @Nullable FullRoute route; + + public AccordInteropStableThenRead(Node.Id to, Topologies topologies, TxnId txnId, Commit.Kind kind, Timestamp executeAt, Txn txn, Deps deps, FullRoute route, ReadCommand command) + { + super(to, topologies, txnId, route, executeAt.epoch(), command); + this.kind = kind; + this.minEpoch = topologies.oldestEpoch(); + this.executeAt = executeAt; + this.partialTxn = kind.withTxn.select(txn, route, topologies, txnId, to); + this.partialDeps = kind.withDeps.select(deps, route); + this.route = kind.withTxn.select(route); + } + + public AccordInteropStableThenRead(TxnId txnId, Participants scope, Commit.Kind kind, long minEpoch, Timestamp executeAt, @Nullable PartialTxn partialTxn, @Nullable PartialDeps partialDeps, @Nullable FullRoute route, ReadCommand command) + { + super(txnId, scope, executeAt.epoch(), command); + this.minEpoch = minEpoch; + this.kind = kind; + this.executeAt = executeAt; + this.partialTxn = partialTxn; + this.partialDeps = partialDeps; + this.route = route; + } + + @Override + public CommitOrReadNack apply(SafeCommandStore safeStore) + { + Route route = this.route == null ? (Route)scope : this.route; + StoreParticipants participants = StoreParticipants.execute(safeStore, route, txnId, minEpoch(), executeAtEpoch); + SafeCommand safeCommand = safeStore.get(txnId, participants); + Commands.commit(safeStore, safeCommand, participants, kind.saveStatus, Ballot.ZERO, txnId, route, partialTxn, executeAt, partialDeps, kind); + return super.apply(safeStore, safeCommand, participants); + } + + @Override + public ReadType kind() + { + return ReadType.stableThenRead; + } + + @Override + protected ExecuteOn executeOn() + { + return EXECUTE_ON; + } + + @Override + public MessageType type() + { + return AccordMessageType.INTEROP_STABLE_THEN_READ_REQ; + } + + @Override + public String toString() + { + return "AccordInteropStableThenRead{" + + "txnId=" + txnId + + "command=" + command + + '}'; + } +} diff --git a/src/java/org/apache/cassandra/service/accord/journal/AccordTopologyUpdate.java b/src/java/org/apache/cassandra/service/accord/journal/AccordTopologyUpdate.java new file mode 100644 index 000000000000..a01580363f79 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/journal/AccordTopologyUpdate.java @@ -0,0 +1,456 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.journal; + +import java.io.IOException; +import java.util.Iterator; +import java.util.Map; +import java.util.NavigableMap; +import java.util.Objects; +import java.util.TreeMap; +import java.util.function.Function; + +import accord.api.Journal; +import accord.local.CommandStores; +import accord.primitives.EpochSupplier; +import accord.primitives.Ranges; +import accord.topology.Topology; +import accord.utils.Invariants; +import accord.utils.UnhandledEnum; +import org.agrona.collections.Int2ObjectHashMap; +import org.apache.cassandra.db.TypeSizes; +import org.apache.cassandra.io.UnversionedSerializer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.service.accord.AccordConfigurationService; +import org.apache.cassandra.service.accord.AccordJournalValueSerializers; +import org.apache.cassandra.service.accord.JournalKey; +import org.apache.cassandra.service.accord.serializers.KeySerializers; +import org.apache.cassandra.service.accord.serializers.TopologySerializers; +import org.apache.cassandra.service.accord.serializers.Version; + +public interface AccordTopologyUpdate +{ + Kind kind(); + void applyTo(TopologyImage accumulator); + long epoch(); + + static AccordTopologyUpdate newTopology(Journal.TopologyUpdate update) + { + return new NewTopology(update); + } + class RangesForEpochSerializer implements UnversionedSerializer + { + public static final RangesForEpochSerializer instance = new RangesForEpochSerializer(); + + @Override + public void serialize(CommandStores.RangesForEpoch from, DataOutputPlus out) throws IOException + { + out.writeUnsignedVInt32(from.size()); + for (int i = 0; i < from.size(); i++) + { + out.writeLong(from.epochAtIndex(i)); + KeySerializers.ranges.serialize(from.rangesAtIndex(i), out); + } + } + + @Override + public CommandStores.RangesForEpoch deserialize(DataInputPlus in) throws IOException + { + int size = in.readUnsignedVInt32(); + Ranges[] ranges = new Ranges[size]; + long[] epochs = new long[size]; + for (int i = 0; i < ranges.length; i++) + { + epochs[i] = in.readLong(); + ranges[i] = KeySerializers.ranges.deserialize(in); + } + Invariants.require(ranges.length == epochs.length); + return new CommandStores.RangesForEpoch(epochs, ranges); + } + + @Override + public long serializedSize(CommandStores.RangesForEpoch from) + { + long size = TypeSizes.sizeofUnsignedVInt(from.size()); + for (int i = 0; i < from.size(); i++) + { + size += TypeSizes.sizeof(from.epochAtIndex(i)); + size += KeySerializers.ranges.serializedSize(from.rangesAtIndex(i)); + } + return size; + } + } + + class TopologyUpdateSerializer implements UnversionedSerializer + { + public static final TopologyUpdateSerializer instance = new TopologyUpdateSerializer(); + + @Override + public void serialize(Journal.TopologyUpdate from, DataOutputPlus out) throws IOException + { + out.writeUnsignedVInt32(from.commandStores.size()); + for (Map.Entry e : from.commandStores.entrySet()) + { + out.writeUnsignedVInt32(e.getKey()); + RangesForEpochSerializer.instance.serialize(e.getValue(), out); + } + //TODO (desired): local to what? Rather than serializing local we can serialize the node its relative too? that why when we deserialize we do globa.forNode(node) + // this also decreases the size as we don't have redundent shards + TopologySerializers.topology.serialize(from.local, out); + TopologySerializers.topology.serialize(from.global, out); + } + + @Override + public Journal.TopologyUpdate deserialize(DataInputPlus in) throws IOException + { + int commandStoresSize = in.readUnsignedVInt32(); + Int2ObjectHashMap commandStores = new Int2ObjectHashMap<>(); + for (int j = 0; j < commandStoresSize; j++) + { + int commandStoreId = in.readUnsignedVInt32(); + CommandStores.RangesForEpoch rangesForEpoch = RangesForEpochSerializer.instance.deserialize(in); + commandStores.put(commandStoreId, rangesForEpoch); + } + Topology local = TopologySerializers.topology.deserialize(in); + Topology global = TopologySerializers.topology.deserialize(in); + return new Journal.TopologyUpdate(commandStores, local, global); + } + + @Override + public long serializedSize(Journal.TopologyUpdate from) + { + long size = TypeSizes.sizeofUnsignedVInt(from.commandStores.size()); + for (Map.Entry e : from.commandStores.entrySet()) + { + size += TypeSizes.sizeofUnsignedVInt(e.getKey()); + size += RangesForEpochSerializer.instance.serializedSize(e.getValue()); + } + + size += TopologySerializers.topology.serializedSize(from.local); + size += TopologySerializers.topology.serializedSize(from.global); + return size; + } + } + + class Serializer implements UnversionedSerializer + { + public static Serializer instance = new Serializer(); + + @Override + public void serialize(AccordTopologyUpdate t, DataOutputPlus out) throws IOException + { + out.writeUnsignedVInt(t.epoch()); + out.writeUnsignedVInt32(t.kind().ordinal()); + switch (t.kind()) + { + case NewTopology: + TopologyUpdateSerializer.instance.serialize(((NewTopology) t).update, out); + break; + case Topologies: + TopologyImage image = (TopologyImage) t; + + out.writeBoolean(image.update != null); + if (image.update != null) + TopologyUpdateSerializer.instance.serialize(image.update, out); + if (image.syncStatus == null) + out.writeByte(Byte.MAX_VALUE); + else + out.writeByte(image.syncStatus.ordinal()); + + KeySerializers.ranges.serialize(image.closed, out); + KeySerializers.ranges.serialize(image.retired, out); + break; + default: + throw new UnhandledEnum(t.kind()); + } + } + + @Override + public AccordTopologyUpdate deserialize(DataInputPlus in) throws IOException + { + long epoch = in.readUnsignedVInt(); + Kind kind = Kind.values()[in.readUnsignedVInt32()]; + switch (kind) + { + case NewTopology: + return new NewTopology(TopologyUpdateSerializer.instance.deserialize(in)); + case Topologies: + { + TopologyImage image = new TopologyImage(epoch); + if (in.readBoolean()) + image.update = TopologyUpdateSerializer.instance.deserialize(in); + + byte syncStateByte = in.readByte(); + if (syncStateByte != Byte.MAX_VALUE) + image.syncStatus = AccordConfigurationService.SyncStatus.values()[syncStateByte]; + + image.closed = KeySerializers.ranges.deserialize(in); + image.retired = KeySerializers.ranges.deserialize(in); + return image; + } + default: + throw new UnhandledEnum(kind); + } + } + + @Override + public long serializedSize(AccordTopologyUpdate t) + { + long size = TypeSizes.sizeofUnsignedVInt(t.epoch()); + size += TypeSizes.sizeofUnsignedVInt(t.kind().ordinal()); + + switch (t.kind()) + { + case NewTopology: + size += TopologyUpdateSerializer.instance.serializedSize(((NewTopology) t).update); + break; + case Topologies: + TopologyImage image = (TopologyImage) t; + + size += TypeSizes.sizeof(image.update != null); + if (image.update != null) + size += TopologyUpdateSerializer.instance.serializedSize(image.update); + + size += Byte.BYTES; + + size += KeySerializers.ranges.serializedSize(image.closed); + size += KeySerializers.ranges.serializedSize(image.retired); + break; + default: + throw new UnhandledEnum(t.kind()); + } + return size; + } + } + + enum Kind + { + NewTopology, + Topologies + } + + class ImmutableTopoloyImage extends Journal.TopologyUpdate + { + public ImmutableTopoloyImage(TopologyImage image) + { + super(image.update.commandStores, image.update.local, image.update.global); + } + } + + class TopologyImage implements AccordTopologyUpdate + { + private Journal.TopologyUpdate update; + private AccordConfigurationService.SyncStatus syncStatus = null; + + private Ranges closed = Ranges.EMPTY; + private Ranges retired = Ranges.EMPTY; + + private final long epoch; + + public TopologyImage(long epoch) + { + this.epoch = epoch; + } + + @Override + public long epoch() + { + return this.epoch; + } + + @Override + public Kind kind() + { + return Kind.Topologies; + } + + @Override + public void applyTo(TopologyImage accumulator) + { + Invariants.require(accumulator.epoch == epoch); + Invariants.require(accumulator.update == null || accumulator.update.equals(update)); + accumulator.update = update; + // We're iterating in _reverse_ order + if (accumulator.syncStatus == null) + accumulator.syncStatus = syncStatus; + accumulator.closed = accumulator.closed.with(closed); + accumulator.retired = accumulator.retired.with(retired); + } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + TopologyImage that = (TopologyImage) o; + return epoch == that.epoch && Objects.equals(update, that.update) && syncStatus == that.syncStatus && closed.equals(that.closed) && retired.equals(that.retired); + } + + @Override + public int hashCode() + { + return Objects.hash(update, syncStatus, closed, retired, epoch); + } + } + + class NewTopology implements AccordTopologyUpdate + { + public final Journal.TopologyUpdate update; + private final long epoch; + + public NewTopology(Journal.TopologyUpdate update) + { + this.epoch = update.global.epoch(); + this.update = update; + } + + @Override + public long epoch() + { + return this.epoch; + } + + @Override + public Kind kind() + { + return Kind.NewTopology; + } + + @Override + public void applyTo(TopologyImage accumulator) + { + Invariants.require(accumulator.epoch == epoch); + Invariants.require(accumulator.update == null); + accumulator.update = update; + } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + NewTopology that = (NewTopology) o; + return epoch == that.epoch && update.equals(that.update); + } + + @Override + public int hashCode() + { + return Objects.hash(update, epoch); + } + } + + class Accumulator + extends AccordJournalValueSerializers.Accumulator, AccordTopologyUpdate> + { + public Accumulator() + { + super(new TreeMap<>()); + } + + @Override + public void reset(JournalKey key) + { + accumulated = new TreeMap<>(); + } + + @Override + public void update(AccordTopologyUpdate newValue) + { + super.update(newValue); + } + + public Iterator images() + { + return map(get().values().iterator(), ImmutableTopoloyImage::new); + } + + @Override + protected NavigableMap accumulate(NavigableMap allEpochs, AccordTopologyUpdate update) + { + update.applyTo(allEpochs.computeIfAbsent(update.epoch(), v -> new TopologyImage(update.epoch()))); + return allEpochs; + } + } + + static Iterator map(Iterator iter, Function fn) + { + return new Iterator() + { + public boolean hasNext() + { + return iter.hasNext(); + } + + public TO next() + { + return fn.apply(iter.next()); + } + }; + } + + class AccumulatingSerializer + implements AccordJournalValueSerializers.FlyweightSerializer + { + public static final AccumulatingSerializer defaultInstance = new AccumulatingSerializer(() -> 0); + + private final EpochSupplier minEpoch; + public AccumulatingSerializer(EpochSupplier minEpoch) + { + this.minEpoch = minEpoch; + } + + @Override + public Accumulator mergerFor() + { + return new Accumulator(); + } + + @Override + public void serialize(JournalKey key, AccordTopologyUpdate from, DataOutputPlus out, Version version) throws IOException + { + out.writeUnsignedVInt32(1); + Serializer.instance.serialize(from, out); + } + + @Override + public void reserialize(JournalKey key, Accumulator from, DataOutputPlus out, Version version) throws IOException + { + out.writeUnsignedVInt32(from.get().size()); + for (TopologyImage value : from.get().values()) + Serializer.instance.serialize(value, out); + } + + @Override + public void deserialize(JournalKey key, Accumulator into, DataInputPlus in, Version version) throws IOException + { + long minEpoch = this.minEpoch.epoch(); + int count = in.readUnsignedVInt32(); + while (--count >= 0) + { + AccordTopologyUpdate update = Serializer.instance.deserialize(in); + if (update.epoch() >= minEpoch) + into.update(update); + else + return; + } + } + } +} \ No newline at end of file diff --git a/src/java/org/apache/cassandra/service/accord/repair/AccordRepair.java b/src/java/org/apache/cassandra/service/accord/repair/AccordRepair.java new file mode 100644 index 000000000000..d3a68eb20ab8 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/repair/AccordRepair.java @@ -0,0 +1,182 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.repair; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; +import java.util.concurrent.Executor; +import java.util.stream.Collectors; +import javax.annotation.Nullable; + +import accord.local.durability.DurabilityService; +import accord.local.Node; +import accord.primitives.Ranges; +import org.apache.cassandra.db.ColumnFamilyStore; +import org.apache.cassandra.db.Keyspace; +import org.apache.cassandra.dht.Range; +import org.apache.cassandra.dht.Token; +import org.apache.cassandra.locator.InetAddressAndPort; +import org.apache.cassandra.metrics.LatencyMetrics; +import org.apache.cassandra.repair.SharedContext; +import org.apache.cassandra.schema.Schema; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.service.accord.AccordService; +import org.apache.cassandra.service.accord.AccordTopology; +import org.apache.cassandra.service.accord.IAccordService; +import org.apache.cassandra.service.accord.RequestBookkeeping; +import org.apache.cassandra.service.accord.TimeOnlyRequestBookkeeping.LatencyRequestBookkeeping; +import org.apache.cassandra.service.accord.TokenRange; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.tcm.Epoch; +import org.apache.cassandra.utils.TimeUUID; +import org.apache.cassandra.utils.concurrent.AsyncPromise; +import org.apache.cassandra.utils.concurrent.Future; + +import static accord.local.durability.DurabilityService.SyncLocal.NoLocal; +import static accord.local.durability.DurabilityService.SyncRemote.All; +import static accord.local.durability.DurabilityService.SyncRemote.Quorum; +import static accord.primitives.Timestamp.mergeMax; +import static accord.primitives.Timestamp.minForEpoch; +import static org.apache.cassandra.config.DatabaseDescriptor.getAccordRepairTimeoutNanos; + +/* + * Accord repair consists of creating a barrier transaction for all the ranges which ensure that all Accord transactions + * before the Epoch and point in time at which the repair started have their side effects visible to Paxos and regular quorum reads. + */ +public class AccordRepair +{ + private final SharedContext ctx; + private final ColumnFamilyStore cfs; + private final TimeUUID repairId; + + private final Ranges ranges; + + private final boolean requireAllEndpoints; + private final List endpoints; + + private final Epoch minEpoch = ClusterMetadata.current().epoch; + + private volatile Throwable shouldAbort = null; + private volatile Thread waiting; + + public AccordRepair(SharedContext ctx, ColumnFamilyStore cfs, TimeUUID repairId, String keyspace, Collection> ranges, boolean requireAllEndpoints, List endpoints) + { + this.ctx = ctx; + this.cfs = cfs; + this.repairId = repairId; + this.requireAllEndpoints = requireAllEndpoints; + this.endpoints = endpoints; + this.ranges = AccordTopology.toAccordRanges(keyspace, ranges); + } + + public Epoch minEpoch() + { + return minEpoch; + } + + public Ranges repair() throws Throwable + { + List repairedRanges = new ArrayList<>(); + for (accord.primitives.Range range : ranges) + repairedRanges.addAll(repairRange((TokenRange)range)); + return Ranges.of(repairedRanges.toArray(new accord.primitives.Range[0])); + } + + public Future repair(Executor executor) + { + AsyncPromise future = new AsyncPromise<>(); + executor.execute(() -> { + try + { + future.trySuccess(repair()); + } + catch (Throwable e) + { + future.tryFailure(e); + } + }); + return future; + } + + protected void abort(@Nullable Throwable reason) + { + shouldAbort = reason == null ? new RuntimeException("Abort") : reason; + Thread thread = waiting; + if (thread != null) + thread.interrupt(); + } + + private List repairRange(TokenRange range) throws Throwable + { + List repairedRanges = new ArrayList<>(); + List ids = endpoints == null ? null : endpoints.stream().map(AccordService.instance().configService()::mappedId).collect(Collectors.toList()); + DurabilityService.SyncRemote syncRemote = requireAllEndpoints ? All : Quorum; + + if (shouldAbort != null) + throw shouldAbort; + + LatencyMetrics latency = null; + { + TableMetadata metadata = Schema.instance.getTableMetadata(range.table()); + if (metadata != null) + { + ColumnFamilyStore cfs = Keyspace.openAndGetStore(metadata); + if (cfs != null) + latency = cfs.metric.accordRepair; + } + } + long start = ctx.clock().nanoTime(); + try + { + IAccordService service = AccordService.instance(); + Ranges ranges = AccordService.intersecting(Ranges.of(range)); + waiting = Thread.currentThread(); + RequestBookkeeping bookkeeping = new LatencyRequestBookkeeping(latency); + AccordService.getBlocking(service.maxConflict(ranges).flatMap(conflict -> { + conflict = mergeMax(conflict, minForEpoch(this.minEpoch.getEpoch())); + return service.sync("[repairId #" + repairId + ']', conflict, Ranges.of(range), ids, NoLocal, syncRemote); + }), ranges, bookkeeping, start, start + getAccordRepairTimeoutNanos()); + waiting = null; + + if (shouldAbort != null) + throw shouldAbort; + + for (accord.primitives.Range r : ranges) + repairedRanges.add(r); + } + catch (Throwable t) + { + cfs.metric.accordRepairUnexpectedFailures.mark(); + if (shouldAbort != null) + { + shouldAbort.addSuppressed(t); + throw shouldAbort; + } + throw t; + } + finally + { + long end = ctx.clock().nanoTime(); + cfs.metric.accordRepair.addNano(end - start); + } + + return repairedRanges; + } +} diff --git a/src/java/org/apache/cassandra/service/accord/repair/RequiredResponseTracker.java b/src/java/org/apache/cassandra/service/accord/repair/RequiredResponseTracker.java new file mode 100644 index 000000000000..ac2651dcd33c --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/repair/RequiredResponseTracker.java @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.repair; + +import java.util.HashSet; +import java.util.Set; + +import accord.coordinate.tracking.RequestStatus; +import accord.coordinate.tracking.ShardTracker; +import accord.coordinate.tracking.SimpleTracker; +import accord.local.Node; +import accord.topology.Shard; +import accord.topology.Topologies; + +import static accord.coordinate.tracking.AbstractTracker.ShardOutcomes.Fail; +import static accord.coordinate.tracking.AbstractTracker.ShardOutcomes.NoChange; +import static accord.coordinate.tracking.AbstractTracker.ShardOutcomes.Success; + +public class RequiredResponseTracker extends SimpleTracker +{ + public static class RequiredResponseShardTracker extends ShardTracker + { + private final Set outstandingResponses; + + public RequiredResponseShardTracker(Set requiredResponses, Shard shard) + { + super(shard); + this.outstandingResponses = new HashSet<>(); + for (Node.Id id : shard.nodes) + { + if (requiredResponses.contains(id)) + outstandingResponses.add(id); + } + } + + public ShardOutcomes onSuccess(Node.Id node) + { + return outstandingResponses.remove(node) && outstandingResponses.isEmpty() ? Success : NoChange; + } + + public ShardOutcomes onFailure(Object ignore) + { + return !outstandingResponses.isEmpty() ? Fail : NoChange; + } + } + + public RequiredResponseTracker(Set requiredResponses, Topologies topologies) + { + super(topologies, RequiredResponseShardTracker[]::new, shard -> new RequiredResponseShardTracker(requiredResponses, shard)); + } + + @Override + public RequestStatus recordSuccess(Node.Id node) + { + return recordResponse(this, node, RequiredResponseShardTracker::onSuccess, node); + } + + @Override + public RequestStatus recordFailure(Node.Id node) + { + return recordResponse(this, node, RequiredResponseShardTracker::onFailure, null); + } +} diff --git a/src/java/org/apache/cassandra/service/accord/serializers/AbstractSortedCollector.java b/src/java/org/apache/cassandra/service/accord/serializers/AbstractSortedCollector.java new file mode 100644 index 000000000000..d979dab3fa97 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/serializers/AbstractSortedCollector.java @@ -0,0 +1,137 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.serializers; + +import java.util.AbstractList; +import java.util.Arrays; +import java.util.Comparator; + +import org.apache.cassandra.utils.BulkIterator; +import org.apache.cassandra.utils.btree.BTree; +import org.apache.cassandra.utils.btree.UpdateFunction; + +import static accord.utils.ArrayBuffers.cachedAny; + +public abstract class AbstractSortedCollector extends AbstractList +{ + private static final int BTREE_THRESHOLD = 16; + + Object buffer; + int count = 0; + + abstract Comparator comparator(); + abstract C empty(); + abstract C of(T one); + abstract C copy(Object[] array, int count); + abstract C copyBtree(Object[] btree, int count); + + public AbstractSortedCollector() + { + } + + public boolean add(T add) + { + return add == collect(add); + } + + protected T collect(T add) + { + if (count == 0) + { + buffer = add; + count = 1; + return add; + } + if (count == 1) + { + if (add.equals(buffer)) + return (T)buffer; + Object[] newBuffer = cachedAny().get(8); + boolean addIsLower = comparator().compare(add, buffer) < 0; + newBuffer[0] = addIsLower ? add : buffer; + newBuffer[1] = addIsLower ? buffer : add; + buffer = newBuffer; + count = 2; + return add; + } + Object[] buffer = (Object[]) this.buffer; + if (count < BTREE_THRESHOLD) + { + int i = Arrays.binarySearch(buffer, 0, count, add, comparator()); + if (i >= 0) + return (T) buffer[i]; + i = -1 - i; + if (count == buffer.length) + this.buffer = buffer = cachedAny().resize(buffer, count, count + 1); + System.arraycopy(buffer, i, buffer, i + 1, count - i); + buffer[i] = add; + if (++count == BTREE_THRESHOLD) + { + Object[] btree = BTree.build(BulkIterator.of(buffer), count, UpdateFunction.noOp()); + cachedAny().forceDiscard(buffer, count); + this.buffer = btree; + } + return add; + } + Object existing = BTree.find(buffer, comparator(), add); + if (existing != null) + return (T)existing; + this.buffer = BTree.update(buffer, BTree.singleton(add), comparator()); + ++count; + return add; + } + + public C build() + { + if (count == 0) + { + return empty(); + } + else if (count == 1) + { + return of((T)buffer); + } + else if (count < BTREE_THRESHOLD) + { + C result = copy((Object[])buffer, count); + cachedAny().forceDiscard((Object[])buffer, count); + return result; + } + else + { + return copyBtree((Object[])buffer, count); + } + } + + @Override + public T get(int index) + { + if (index < 0 || index >= count) throw new IndexOutOfBoundsException(); + if (count == 1) return (T) buffer; + if (count < BTREE_THRESHOLD) + return (T) ((Object[])buffer)[index]; + return BTree.findByIndex((Object[])buffer, index); + } + + @Override + public int size() + { + return count; + } +} diff --git a/src/java/org/apache/cassandra/service/accord/serializers/AcceptSerializers.java b/src/java/org/apache/cassandra/service/accord/serializers/AcceptSerializers.java new file mode 100644 index 000000000000..d7603b69818e --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/serializers/AcceptSerializers.java @@ -0,0 +1,176 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.serializers; + +import java.io.IOException; + +import accord.coordinate.ExecuteFlag.ExecuteFlags; +import accord.local.Commands.AcceptOutcome; +import accord.messages.Accept; +import accord.messages.Accept.AcceptReply; +import accord.primitives.Ballot; +import accord.primitives.Deps; +import accord.primitives.Participants; +import accord.primitives.Route; +import accord.primitives.Timestamp; +import accord.primitives.TxnId; +import org.apache.cassandra.db.TypeSizes; +import org.apache.cassandra.io.UnversionedSerializer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.service.accord.serializers.CommandSerializers.ExecuteAtSerializer; + +import static accord.messages.Accept.SerializerSupport.create; + +public class AcceptSerializers +{ + private AcceptSerializers() {} + + public static final IVersionedSerializer request = new RequestSerializer(); + public static class RequestSerializer extends TxnRequestSerializer.WithUnsyncedSerializer + { + private static final Accept.Kind[] kinds = Accept.Kind.values(); + private static final int IS_PARTIAL = 1; + + @Override + public void serializeBody(Accept accept, DataOutputPlus out, Version version) throws IOException + { + out.writeByte((accept.kind.ordinal() << 1) | (accept.isPartialAccept ? IS_PARTIAL : 0)); + CommandSerializers.ballot.serialize(accept.ballot, out); + ExecuteAtSerializer.serialize(accept.txnId, accept.executeAt, out); + DepsSerializers.partialDeps.serialize(accept.partialDeps, out); + } + + @Override + public Accept deserializeBody(DataInputPlus in, Version version, TxnId txnId, Route scope, long waitForEpoch, long minEpoch) throws IOException + { + int flags = in.readByte(); + Accept.Kind kind = kinds[(flags >>> 1) & 1]; + return create(txnId, scope, waitForEpoch, minEpoch, + kind, + CommandSerializers.ballot.deserialize(in), + ExecuteAtSerializer.deserialize(txnId, in), + DepsSerializers.partialDeps.deserialize(in), + (flags & IS_PARTIAL) != 0); + } + + @Override + public long serializedBodySize(Accept accept, Version version) + { + return 1 + + CommandSerializers.ballot.serializedSize(accept.ballot) + + ExecuteAtSerializer.serializedSize(accept.txnId, accept.executeAt) + + DepsSerializers.partialDeps.serializedSize(accept.partialDeps); + } + } + + public static final UnversionedSerializer notAccept = new UnversionedSerializer<>() + { + @Override + public void serialize(Accept.NotAccept invalidate, DataOutputPlus out) throws IOException + { + CommandSerializers.status.serialize(invalidate.status, out); + CommandSerializers.ballot.serialize(invalidate.ballot, out); + CommandSerializers.txnId.serialize(invalidate.txnId, out); + KeySerializers.participants.serialize(invalidate.participants, out); + } + + @Override + public Accept.NotAccept deserialize(DataInputPlus in) throws IOException + { + return new Accept.NotAccept(CommandSerializers.status.deserialize(in), + CommandSerializers.ballot.deserialize(in), + CommandSerializers.txnId.deserialize(in), + KeySerializers.participants.deserialize(in)); + } + + @Override + public long serializedSize(Accept.NotAccept invalidate) + { + return CommandSerializers.status.serializedSize(invalidate.status) + + CommandSerializers.ballot.serializedSize(invalidate.ballot) + + CommandSerializers.txnId.serializedSize(invalidate.txnId) + + KeySerializers.participants.serializedSize(invalidate.participants); + } + }; + + public static final UnversionedSerializer reply = new ReplySerializer(); + public static class ReplySerializer implements UnversionedSerializer + { + // we have one spare bit at 0x04 for either another flag or more AcceptOutcome variants + private static final int SUPERSEDED_BY = 0x08; + private static final int COMMITTED_EXECUTE_AT = 0x10; + private static final int SUCCESSFUL = 0x20; + private static final int DEPS = 0x40; + private static final int FLAGS = 0x80; + @Override + public void serialize(AcceptReply reply, DataOutputPlus out) throws IOException + { + int flags = reply.outcome.ordinal() + | (reply.supersededBy != null ? SUPERSEDED_BY : 0) + | (reply.committedExecuteAt != null ? COMMITTED_EXECUTE_AT : 0) + | (reply.successful != null ? SUCCESSFUL : 0) + | (reply.deps != null ? DEPS : 0) + | (!reply.flags.isEmpty() ? FLAGS : 0); + + out.writeByte(flags); + if (reply.supersededBy != null) + CommandSerializers.ballot.serialize(reply.supersededBy, out); + if (reply.committedExecuteAt != null) + ExecuteAtSerializer.serialize(reply.committedExecuteAt, out); + if (reply.successful != null) + KeySerializers.participants.serialize(reply.successful, out); + if (reply.deps != null) + DepsSerializers.deps.serialize(reply.deps, out); + if (!reply.flags.isEmpty()) + out.writeUnsignedVInt32(reply.flags.bits()); + } + + private final AcceptOutcome[] outcomes = AcceptOutcome.values(); + @Override + public AcceptReply deserialize(DataInputPlus in) throws IOException + { + int flags = in.readByte(); + AcceptOutcome outcome = outcomes[flags & 3]; + Ballot supersededBy = (flags & SUPERSEDED_BY) == 0 ? null : CommandSerializers.ballot.deserialize(in); + Timestamp committedExecuteAt = (flags & COMMITTED_EXECUTE_AT) == 0 ? null : ExecuteAtSerializer.deserialize(in); + Participants successful = (flags & SUCCESSFUL) == 0 ? null : KeySerializers.participants.deserialize(in); + Deps deps = (flags & DEPS) == 0 ? null : DepsSerializers.deps.deserialize(in); + ExecuteFlags executeFlags = (flags & FLAGS) == 0 ? ExecuteFlags.none() : ExecuteFlags.get(in.readUnsignedVInt32()); + return new AcceptReply(outcome, supersededBy, successful, deps, committedExecuteAt, executeFlags); + } + + @Override + public long serializedSize(AcceptReply reply) + { + long size = TypeSizes.BYTE_SIZE; + if (reply.supersededBy != null) + size += CommandSerializers.ballot.serializedSize(reply.supersededBy); + if (reply.committedExecuteAt != null) + size += ExecuteAtSerializer.serializedSize(reply.committedExecuteAt); + if (reply.successful != null) + size += KeySerializers.participants.serializedSize(reply.successful); + if (reply.deps != null) + size += DepsSerializers.deps.serializedSize(reply.deps); + if (!reply.flags.isEmpty()) + size += TypeSizes.sizeofUnsignedVInt(reply.flags.bits()); + return size; + } + } +} diff --git a/src/java/org/apache/cassandra/service/accord/serializers/ApplySerializers.java b/src/java/org/apache/cassandra/service/accord/serializers/ApplySerializers.java new file mode 100644 index 000000000000..181634cc508c --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/serializers/ApplySerializers.java @@ -0,0 +1,147 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.serializers; + +import java.io.IOException; + +import accord.api.Result; +import accord.messages.Apply; +import accord.primitives.Ballot; +import accord.primitives.FullRoute; +import accord.primitives.PartialDeps; +import accord.primitives.PartialTxn; +import accord.primitives.Route; +import accord.primitives.Timestamp; +import accord.primitives.TxnId; +import accord.primitives.Writes; +import accord.utils.Invariants; +import org.apache.cassandra.db.TypeSizes; +import org.apache.cassandra.io.UnversionedSerializer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.service.accord.serializers.CommandSerializers.ExecuteAtSerializer; + +import static accord.primitives.Txn.Kind.Write; + + +public class ApplySerializers +{ + private static final UnversionedSerializer kind = new UnversionedSerializer<>() + { + public void serialize(Apply.Kind kind, DataOutputPlus out) throws IOException + { + Invariants.requireArgument(kind == Apply.Kind.Maximal || kind == Apply.Kind.Minimal); + out.writeBoolean(kind == Apply.Kind.Maximal); + } + + public Apply.Kind deserialize(DataInputPlus in) throws IOException + { + return in.readBoolean() ? Apply.Kind.Maximal : Apply.Kind.Minimal; + } + + public long serializedSize(Apply.Kind t) + { + return TypeSizes.BOOL_SIZE; + } + }; + + public abstract static class ApplySerializer extends TxnRequestSerializer + { + @Override + public void serializeBody(A apply, DataOutputPlus out, Version version) throws IOException + { + CommandSerializers.ballot.serialize(apply.ballot, out); + out.writeVInt(apply.minEpoch - apply.waitForEpoch); + out.writeUnsignedVInt(apply.maxEpoch - apply.minEpoch); + kind.serialize(apply.kind, out); + ExecuteAtSerializer.serialize(apply.txnId, apply.executeAt, out); + DepsSerializers.partialDeps.serialize(apply.deps, out); + CommandSerializers.nullablePartialTxn.serialize(apply.txn, out, version); + KeySerializers.nullableFullRoute.serialize(apply.fullRoute, out); + if (apply.txnId.is(Write)) + CommandSerializers.writes.serialize(apply.writes, out, version); + } + + protected abstract A deserializeApply(TxnId txnId, Ballot ballot, Route scope, long minEpoch, long waitForEpoch, long maxEpoch, Apply.Kind kind, + Timestamp executeAt, PartialDeps deps, PartialTxn txn, FullRoute fullRoute, Writes writes, Result result); + + @Override + public A deserializeBody(DataInputPlus in, Version version, TxnId txnId, Route scope, long waitForEpoch) throws IOException + { + Ballot ballot = CommandSerializers.ballot.deserialize(in); + long minEpoch = waitForEpoch + in.readVInt(); + long maxEpoch = minEpoch + in.readUnsignedVInt(); + return deserializeApply(txnId, ballot, scope, minEpoch, waitForEpoch, maxEpoch, + kind.deserialize(in), + ExecuteAtSerializer.deserialize(txnId, in), + DepsSerializers.partialDeps.deserialize(in), + CommandSerializers.nullablePartialTxn.deserialize(in, version), + KeySerializers.nullableFullRoute.deserialize(in), + (txnId.is(Write) ? CommandSerializers.writes.deserialize(in, version) : null), + ResultSerializers.APPLIED); + } + + @Override + public long serializedBodySize(A apply, Version version) + { + return CommandSerializers.ballot.serializedSize(apply.ballot) + + TypeSizes.sizeofVInt(apply.minEpoch - apply.waitForEpoch) + + TypeSizes.sizeofUnsignedVInt(apply.maxEpoch - apply.minEpoch) + + kind.serializedSize(apply.kind) + + ExecuteAtSerializer.serializedSize(apply.txnId, apply.executeAt) + + DepsSerializers.partialDeps.serializedSize(apply.deps) + + CommandSerializers.nullablePartialTxn.serializedSize(apply.txn, version) + + KeySerializers.nullableFullRoute.serializedSize(apply.fullRoute) + + (apply.txnId.is(Write) ? CommandSerializers.writes.serializedSize(apply.writes, version) : 0); + } + } + + public static final IVersionedSerializer request = new ApplySerializer<>() + { + @Override + protected Apply deserializeApply(TxnId txnId, Ballot ballot, Route scope, long minEpoch, long waitForEpoch, long maxEpoch, Apply.Kind kind, + Timestamp executeAt, PartialDeps deps, PartialTxn txn, FullRoute fullRoute, Writes writes, Result result) + { + return Apply.SerializationSupport.create(txnId, ballot, scope, minEpoch, waitForEpoch, maxEpoch, kind, executeAt, deps, txn, fullRoute, writes, result); + } + }; + + public static final UnversionedSerializer reply = new UnversionedSerializer<>() + { + private final Apply.ApplyReply[] replies = Apply.ApplyReply.values(); + + @Override + public void serialize(Apply.ApplyReply t, DataOutputPlus out) throws IOException + { + out.writeByte(t.ordinal()); + } + + @Override + public Apply.ApplyReply deserialize(DataInputPlus in) throws IOException + { + return replies[in.readByte()]; + } + + @Override + public long serializedSize(Apply.ApplyReply t) + { + return 1; + } + }; +} diff --git a/src/java/org/apache/cassandra/service/accord/serializers/AsymmetricVersionedSerializer.java b/src/java/org/apache/cassandra/service/accord/serializers/AsymmetricVersionedSerializer.java new file mode 100644 index 000000000000..987762bd0341 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/serializers/AsymmetricVersionedSerializer.java @@ -0,0 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.serializers; + +public interface AsymmetricVersionedSerializer extends org.apache.cassandra.io.AsymmetricVersionedSerializer +{ +} diff --git a/src/java/org/apache/cassandra/service/accord/serializers/AwaitSerializers.java b/src/java/org/apache/cassandra/service/accord/serializers/AwaitSerializers.java new file mode 100644 index 000000000000..d08d1bb822fc --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/serializers/AwaitSerializers.java @@ -0,0 +1,150 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.serializers; + +import java.io.IOException; + +import accord.api.ProgressLog.BlockedUntil; +import accord.messages.Await; +import accord.messages.Await.AsyncAwaitComplete; +import accord.messages.Await.AwaitOk; +import accord.messages.RecoverAwait; +import accord.messages.RecoverAwait.RecoverAwaitOk; +import accord.primitives.Participants; +import accord.primitives.Route; +import accord.primitives.SaveStatus; +import accord.primitives.TxnId; +import accord.utils.Invariants; +import org.apache.cassandra.db.TypeSizes; +import org.apache.cassandra.io.UnversionedSerializer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.utils.vint.VIntCoding; + +public class AwaitSerializers +{ + public static final UnversionedSerializer request = new RequestSerializer<>() + { + @Override + public Await deserialize(TxnId txnId, Participants scope, BlockedUntil blockedUntil, boolean notifyProgressLog, long minAwaitEpoch, long maxAwaitEpoch, int callbackId, DataInputPlus in) + { + return Await.SerializerSupport.create(txnId, scope, blockedUntil, notifyProgressLog, minAwaitEpoch, maxAwaitEpoch, callbackId); + } + }; + + public static final UnversionedSerializer recoverRequest = new RequestSerializer<>() + { + @Override + public RecoverAwait deserialize(TxnId txnId, Participants scope, BlockedUntil blockedUntil, boolean notifyProgressLog, long minAwaitEpoch, long maxAwaitEpoch, int callbackId, DataInputPlus in) throws IOException + { + TxnId recoverId = CommandSerializers.txnId.deserialize(in); + return RecoverAwait.SerializerSupport.create(txnId, scope, blockedUntil, notifyProgressLog, minAwaitEpoch, maxAwaitEpoch, callbackId, recoverId); + } + + @Override + public void serialize(RecoverAwait await, DataOutputPlus out) throws IOException + { + super.serialize(await, out); + CommandSerializers.txnId.serialize(await.recoverId, out); + } + + @Override + public long serializedSize(RecoverAwait await) + { + return super.serializedSize(await) + CommandSerializers.txnId.serializedSize(await.recoverId); + } + }; + + static abstract class RequestSerializer implements UnversionedSerializer + { + abstract A deserialize(TxnId txnId, Participants scope, BlockedUntil blockedUntil, boolean notifyProgressLog, long minAwaitEpoch, long maxAwaitEpoch, int callbackId, DataInputPlus in) throws IOException; + + @Override + public void serialize(A await, DataOutputPlus out) throws IOException + { + CommandSerializers.txnId.serialize(await.txnId, out); + KeySerializers.participants.serialize(await.scope, out); + out.writeByte((await.blockedUntil.ordinal() << 1) | (await.notifyProgressLog ? 1 : 0)); + out.writeUnsignedVInt(await.maxAwaitEpoch - await.txnId.epoch()); + out.writeUnsignedVInt(await.maxAwaitEpoch - await.minAwaitEpoch); + out.writeUnsignedVInt32(await.callbackId + 1); + Invariants.require(await.callbackId >= -1); + } + + @Override + public A deserialize(DataInputPlus in) throws IOException + { + TxnId txnId = CommandSerializers.txnId.deserialize(in); + Participants scope = KeySerializers.participants.deserialize(in); + int blockedAndNotify = in.readByte(); + BlockedUntil blockedUntil = BlockedUntil.forOrdinal(blockedAndNotify >>> 1); + boolean notifyProgressLog = (blockedAndNotify & 1) == 1; + long maxAwaitEpoch = in.readUnsignedVInt() + txnId.epoch(); + long minAwaitEpoch = maxAwaitEpoch - in.readUnsignedVInt(); + int callbackId = in.readUnsignedVInt32() - 1; + Invariants.require(callbackId >= -1); + return deserialize(txnId, scope, blockedUntil, notifyProgressLog, minAwaitEpoch, maxAwaitEpoch, callbackId, in); + } + + @Override + public long serializedSize(A await) + { + return CommandSerializers.txnId.serializedSize(await.txnId) + + KeySerializers.participants.serializedSize(await.scope) + + TypeSizes.BYTE_SIZE + + VIntCoding.computeUnsignedVIntSize(await.maxAwaitEpoch - await.txnId.epoch()) + + VIntCoding.computeUnsignedVIntSize(await.maxAwaitEpoch - await.minAwaitEpoch) + + VIntCoding.computeUnsignedVIntSize(await.callbackId + 1); + } + } + + public static final UnversionedSerializer syncReply = EncodeAsVInt32.of(AwaitOk.class); + public static final UnversionedSerializer recoverReply = EncodeAsVInt32.of(RecoverAwaitOk.class); + + public static final UnversionedSerializer asyncReply = new UnversionedSerializer<>() + { + @Override + public void serialize(AsyncAwaitComplete ok, DataOutputPlus out) throws IOException + { + CommandSerializers.txnId.serialize(ok.txnId, out); + KeySerializers.route.serialize(ok.route, out); + out.writeByte(ok.newStatus.ordinal()); + out.writeUnsignedVInt32(ok.callbackId); + } + + @Override + public AsyncAwaitComplete deserialize(DataInputPlus in) throws IOException + { + TxnId txnId = CommandSerializers.txnId.deserialize(in); + Route scope = KeySerializers.route.deserialize(in); + SaveStatus newStatus = SaveStatus.forOrdinal(in.readByte()); + int callbackId = in.readUnsignedVInt32(); + return new AsyncAwaitComplete(txnId, scope, newStatus, callbackId); + } + + @Override + public long serializedSize(AsyncAwaitComplete ok) + { + return CommandSerializers.txnId.serializedSize(ok.txnId) + + KeySerializers.route.serializedSize(ok.route) + + TypeSizes.BYTE_SIZE + + VIntCoding.computeVIntSize(ok.callbackId); + } + }; +} diff --git a/src/java/org/apache/cassandra/service/accord/serializers/BeginInvalidationSerializers.java b/src/java/org/apache/cassandra/service/accord/serializers/BeginInvalidationSerializers.java new file mode 100644 index 000000000000..25a40a6a418c --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/serializers/BeginInvalidationSerializers.java @@ -0,0 +1,115 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.serializers; + +import java.io.IOException; + +import accord.api.RoutingKey; +import accord.messages.BeginInvalidation; +import accord.messages.BeginInvalidation.InvalidateReply; +import accord.primitives.Ballot; +import accord.primitives.Participants; +import accord.primitives.Route; +import accord.primitives.SaveStatus; +import org.apache.cassandra.io.UnversionedSerializer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; + +public class BeginInvalidationSerializers +{ + public static final UnversionedSerializer request = new UnversionedSerializer<>() + { + @Override + public void serialize(BeginInvalidation begin, DataOutputPlus out) throws IOException + { + CommandSerializers.txnId.serialize(begin.txnId, out); + KeySerializers.participants.serialize(begin.participants, out); + CommandSerializers.ballot.serialize(begin.ballot, out); + } + + @Override + public BeginInvalidation deserialize(DataInputPlus in) throws IOException + { + return new BeginInvalidation(CommandSerializers.txnId.deserialize(in), + KeySerializers.participants.deserialize(in), + CommandSerializers.ballot.deserialize(in)); + } + + @Override + public long serializedSize(BeginInvalidation begin) + { + return CommandSerializers.txnId.serializedSize(begin.txnId) + + KeySerializers.participants.serializedSize(begin.participants) + + CommandSerializers.ballot.serializedSize(begin.ballot); + } + }; + + public static final UnversionedSerializer reply = new UnversionedSerializer<>() + { + private static final int ACCEPTED_FAST_PATH = 0x1; + private static final int HAS_TRUNCATED = 0x2; + private static final int HAS_ROUTE = 0x4; + private static final int HAS_HOME_KEY = 0x8; + + @Override + public void serialize(InvalidateReply reply, DataOutputPlus out) throws IOException + { + CommandSerializers.ballot.serialize(reply.supersededBy, out); + CommandSerializers.ballot.serialize(reply.accepted, out); + CommandSerializers.saveStatus.serialize(reply.maxStatus, out); + CommandSerializers.saveStatus.serialize(reply.maxKnowledgeStatus, out); + int flags = (reply.acceptedFastPath ? ACCEPTED_FAST_PATH : 0) + | (reply.truncated != null ? HAS_TRUNCATED : 0) + | (reply.route != null ? HAS_ROUTE : 0) + | (reply.homeKey != null && reply.route == null ? HAS_HOME_KEY : 0); + out.writeByte(flags); + if (reply.truncated != null) KeySerializers.participants.serialize(reply.truncated, out); + if (reply.route != null) KeySerializers.route.serialize(reply.route, out); + else if (reply.homeKey != null) KeySerializers.routingKey.serialize(reply.homeKey, out); + } + + @Override + public InvalidateReply deserialize(DataInputPlus in) throws IOException + { + Ballot supersededBy = CommandSerializers.ballot.deserialize(in); + Ballot accepted = CommandSerializers.ballot.deserialize(in); + SaveStatus maxStatus = CommandSerializers.saveStatus.deserialize(in); + SaveStatus maxKnowledgeStatus = CommandSerializers.saveStatus.deserialize(in); + byte flags = in.readByte(); + boolean acceptedFastPath = (flags & ACCEPTED_FAST_PATH) != 0; + Participants truncated = (flags & HAS_TRUNCATED) != 0 ? KeySerializers.participants.deserialize(in) : null; + Route route = (flags & HAS_ROUTE) != 0 ? KeySerializers.route.deserialize(in) : null; + RoutingKey homeKey = (flags & HAS_HOME_KEY) != 0 ? KeySerializers.routingKey.deserialize(in) : route != null ? route.homeKey() : null; + return new InvalidateReply(supersededBy, accepted, maxStatus, maxKnowledgeStatus, acceptedFastPath, truncated, route, homeKey); + } + + @Override + public long serializedSize(InvalidateReply reply) + { + return CommandSerializers.ballot.serializedSize(reply.supersededBy) + + CommandSerializers.ballot.serializedSize(reply.accepted) + + CommandSerializers.saveStatus.serializedSize(reply.maxStatus) + + CommandSerializers.saveStatus.serializedSize(reply.maxKnowledgeStatus) + + 1 + + (reply.truncated != null ? KeySerializers.participants.serializedSize(reply.truncated) : 0) + + (reply.route != null ? KeySerializers.route.serializedSize(reply.route) : 0) + + (reply.homeKey != null && reply.route == null ? KeySerializers.routingKey.serializedSize(reply.homeKey) : 0); + } + }; +} diff --git a/src/java/org/apache/cassandra/service/accord/serializers/CheckStatusSerializers.java b/src/java/org/apache/cassandra/service/accord/serializers/CheckStatusSerializers.java new file mode 100644 index 000000000000..6981761612fb --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/serializers/CheckStatusSerializers.java @@ -0,0 +1,270 @@ +/* + * Licensed to the Apache Software ation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.serializers; + +import java.io.IOException; + +import accord.api.Result; +import accord.api.RoutingKey; +import accord.coordinate.Infer; +import accord.messages.CheckStatus; +import accord.messages.CheckStatus.CheckStatusNack; +import accord.messages.CheckStatus.CheckStatusOk; +import accord.messages.CheckStatus.CheckStatusOkFull; +import accord.messages.CheckStatus.CheckStatusReply; +import accord.primitives.Ballot; +import accord.primitives.Known; +import accord.primitives.KnownMap; +import accord.primitives.KnownMap.MinMax; +import accord.primitives.PartialDeps; +import accord.primitives.PartialTxn; +import accord.primitives.Participants; +import accord.primitives.Route; +import accord.primitives.SaveStatus; +import accord.primitives.Status.Durability; +import accord.primitives.Timestamp; +import accord.primitives.TxnId; +import accord.primitives.Writes; +import org.apache.cassandra.db.TypeSizes; +import org.apache.cassandra.io.UnversionedSerializer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.service.accord.serializers.CommandSerializers.ExecuteAtSerializer; + +import static accord.messages.CheckStatus.SerializationSupport.createOk; +import static org.apache.cassandra.service.accord.serializers.CommandSerializers.known; + +public class CheckStatusSerializers +{ + public static final UnversionedSerializer knownMap = new UnversionedSerializer<>() + { + @Override + public void serialize(KnownMap knownMap, DataOutputPlus out) throws IOException + { + int size = knownMap.size(); + out.writeUnsignedVInt32(size); + for (int i = 0 ; i <= size ; ++i) + KeySerializers.routingKey.serialize(knownMap.startAt(i), out); + for (int i = 0 ; i < size ; ++i) + { + KnownMap.MinMax minMax = knownMap.valueAt(i); + if (minMax == null) + { + out.writeByte(0); + continue; + } + boolean equal = minMax.min.equals(minMax); + out.writeByte(equal ? 1 : 2); + known.serialize(minMax.min, out); + if (!equal) + known.serialize(minMax, out); + } + } + + @Override + public KnownMap deserialize(DataInputPlus in) throws IOException + { + int size = in.readUnsignedVInt32(); + RoutingKey[] starts = new RoutingKey[size + 1]; + for (int i = 0 ; i <= size ; ++i) + starts[i] = KeySerializers.routingKey.deserialize(in); + MinMax[] values = new MinMax[size]; + for (int i = 0 ; i < size ; ++i) + { + int kind = in.readByte(); + if (kind == 0) + continue; + Known min = known.deserialize(in); + Known max = kind == 1 ? min : known.deserialize(in); + values[i] = new KnownMap.MinMax(min, max); + } + return KnownMap.SerializerSupport.create(true, starts, values); + } + + @Override + public long serializedSize(KnownMap knownMap) + { + int size = knownMap.size(); + long result = TypeSizes.sizeofUnsignedVInt(size); + for (int i = 0 ; i <= size ; ++i) + result += KeySerializers.routingKey.serializedSize(knownMap.startAt(i)); + for (int i = 0 ; i < size ; ++i) + { + KnownMap.MinMax minMax = knownMap.valueAt(i); + result += TypeSizes.BYTE_SIZE; + if (minMax == null) + continue; + boolean equal = minMax.min.equals(minMax); + result += known.serializedSize(minMax.min); + if (!equal) + result += known.serializedSize(minMax); + } + return result; + } + }; + + public static final UnversionedSerializer request = new UnversionedSerializer<>() + { + final CheckStatus.IncludeInfo[] infos = CheckStatus.IncludeInfo.values(); + + @Override + public void serialize(CheckStatus check, DataOutputPlus out) throws IOException + { + CommandSerializers.txnId.serialize(check.txnId, out); + KeySerializers.participants.serialize(check.query, out); + out.writeUnsignedVInt(check.sourceEpoch); + out.writeByte(check.includeInfo.ordinal()); + CommandSerializers.ballot.serialize(check.bumpBallot, out); + } + + @Override + public CheckStatus deserialize(DataInputPlus in) throws IOException + { + TxnId txnId = CommandSerializers.txnId.deserialize(in); + Participants query = KeySerializers.participants.deserialize(in); + long sourceEpoch = in.readUnsignedVInt(); + CheckStatus.IncludeInfo info = infos[in.readByte()]; + Ballot ballot = CommandSerializers.ballot.deserialize(in); + return new CheckStatus(txnId, query, sourceEpoch, info, ballot); + } + + @Override + public long serializedSize(CheckStatus check) + { + return CommandSerializers.txnId.serializedSize(check.txnId) + + KeySerializers.participants.serializedSize(check.query) + + TypeSizes.sizeofUnsignedVInt(check.sourceEpoch) + + TypeSizes.BYTE_SIZE + + CommandSerializers.ballot.serializedSize(check.bumpBallot); + } + }; + + public static final IVersionedSerializer reply = new IVersionedSerializer<>() + { + private static final byte OK = 0x00; + private static final byte FULL = 0x01; + private static final byte NACK = 0x02; + + @Override + public void serialize(CheckStatusReply reply, DataOutputPlus out, Version version) throws IOException + { + if (!reply.isOk()) + { + out.write(NACK); + return; + } + + CheckStatusOk ok = (CheckStatusOk) reply; + out.write(reply instanceof CheckStatusOkFull ? FULL : OK); + knownMap.serialize(ok.map, out); + CommandSerializers.saveStatus.serialize(ok.maxKnowledgeSaveStatus, out); + CommandSerializers.saveStatus.serialize(ok.maxSaveStatus, out); + CommandSerializers.ballot.serialize(ok.maxPromised, out); + CommandSerializers.ballot.serialize(ok.maxAcceptedOrCommitted, out); + CommandSerializers.ballot.serialize(ok.acceptedOrCommitted, out); + ExecuteAtSerializer.serializeNullable(ok.executeAt, out); + out.writeBoolean(ok.isCoordinating); + CommandSerializers.durability.serialize(ok.durability, out); + KeySerializers.nullableRoute.serialize(ok.route, out); + KeySerializers.nullableRoutingKey.serialize(ok.homeKey, out); + CommandSerializers.invalidIf.serialize(ok.invalidIf, out); + + if (!(reply instanceof CheckStatusOkFull)) + return; + + CheckStatusOkFull okFull = (CheckStatusOkFull) ok; + CommandSerializers.nullablePartialTxn.serialize(okFull.partialTxn, out, version); + DepsSerializers.nullablePartialDeps.serialize(okFull.stableDeps, out); + CommandSerializers.nullableWrites.serialize(okFull.writes, out, version); + } + + @Override + public CheckStatusReply deserialize(DataInputPlus in, Version version) throws IOException + { + byte kind = in.readByte(); + switch (kind) + { + default: throw new IOException("Unhandled CheckStatusReply kind: " + Integer.toHexString(Byte.toUnsignedInt(kind))); + case NACK: + return CheckStatusNack.NotOwned; + case OK: + case FULL: + KnownMap map = knownMap.deserialize(in); + SaveStatus maxKnowledgeStatus = CommandSerializers.saveStatus.deserialize(in); + SaveStatus maxStatus = CommandSerializers.saveStatus.deserialize(in); + Ballot maxPromised = CommandSerializers.ballot.deserialize(in); + Ballot maxAcceptedOrCommitted = CommandSerializers.ballot.deserialize(in); + Ballot acceptedOrCommitted = CommandSerializers.ballot.deserialize(in); + Timestamp executeAt = ExecuteAtSerializer.deserializeNullable(in); + boolean isCoordinating = in.readBoolean(); + Durability durability = CommandSerializers.durability.deserialize(in); + Route route = KeySerializers.nullableRoute.deserialize(in); + RoutingKey homeKey = KeySerializers.nullableRoutingKey.deserialize(in); + Infer.InvalidIf invalidIf = CommandSerializers.invalidIf.deserialize(in); + + if (kind == OK) + return createOk(map, maxKnowledgeStatus, maxStatus, maxPromised, maxAcceptedOrCommitted, acceptedOrCommitted, executeAt, + isCoordinating, durability, route, homeKey, invalidIf); + + PartialTxn partialTxn = CommandSerializers.nullablePartialTxn.deserialize(in, version); + PartialDeps committedDeps = DepsSerializers.nullablePartialDeps.deserialize(in); + Writes writes = CommandSerializers.nullableWrites.deserialize(in, version); + + Result result = null; + if (maxKnowledgeStatus.known.outcome().isOrWasApply()) + result = ResultSerializers.APPLIED; + + return createOk(map, maxKnowledgeStatus, maxStatus, maxPromised, maxAcceptedOrCommitted, acceptedOrCommitted, executeAt, + isCoordinating, durability, route, homeKey, invalidIf, partialTxn, committedDeps, writes, result); + + } + } + + @Override + public long serializedSize(CheckStatusReply reply, Version version) + { + long size = TypeSizes.BYTE_SIZE; + if (!reply.isOk()) + return size; + + CheckStatusOk ok = (CheckStatusOk) reply; + size += knownMap.serializedSize(ok.map); + size += CommandSerializers.saveStatus.serializedSize(ok.maxKnowledgeSaveStatus); + size += CommandSerializers.saveStatus.serializedSize(ok.maxSaveStatus); + size += CommandSerializers.ballot.serializedSize(ok.maxPromised); + size += CommandSerializers.ballot.serializedSize(ok.maxAcceptedOrCommitted); + size += CommandSerializers.ballot.serializedSize(ok.acceptedOrCommitted); + size += ExecuteAtSerializer.serializedNullableSize(ok.executeAt); + size += TypeSizes.BOOL_SIZE; + size += CommandSerializers.durability.serializedSize(ok.durability); + size += KeySerializers.nullableRoute.serializedSize(ok.route); + size += KeySerializers.nullableRoutingKey.serializedSize(ok.homeKey); + size += CommandSerializers.invalidIf.serializedSize(ok.invalidIf); + + if (!(reply instanceof CheckStatusOkFull)) + return size; + + CheckStatusOkFull okFull = (CheckStatusOkFull) ok; + size += CommandSerializers.nullablePartialTxn.serializedSize(okFull.partialTxn, version); + size += DepsSerializers.nullablePartialDeps.serializedSize(okFull.stableDeps); + size += CommandSerializers.nullableWrites.serializedSize(okFull.writes, version); + return size; + } + }; +} diff --git a/src/java/org/apache/cassandra/service/accord/serializers/CommandSerializers.java b/src/java/org/apache/cassandra/service/accord/serializers/CommandSerializers.java new file mode 100644 index 000000000000..1057afe13e09 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/serializers/CommandSerializers.java @@ -0,0 +1,962 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.serializers; + +import java.io.IOException; +import java.nio.ByteBuffer; + +import com.google.common.annotations.VisibleForTesting; + +import accord.api.Query; +import accord.api.Read; +import accord.api.Update; +import accord.api.Write; +import accord.coordinate.Infer; +import accord.local.Node; +import accord.local.StoreParticipants; +import accord.primitives.Ballot; +import accord.primitives.Known; +import accord.primitives.Known.KnownDeps; +import accord.primitives.PartialTxn; +import accord.primitives.Participants; +import accord.primitives.Routable; +import accord.primitives.Route; +import accord.primitives.SaveStatus; +import accord.primitives.Seekables; +import accord.primitives.Status; +import accord.primitives.Status.Durability; +import accord.primitives.Timestamp; +import accord.primitives.TimestampWithUniqueHlc; +import accord.primitives.Txn; +import accord.primitives.TxnId; +import accord.primitives.Unseekables; +import accord.primitives.Writes; +import accord.utils.Invariants; +import org.apache.cassandra.db.TypeSizes; +import org.apache.cassandra.db.marshal.ByteBufferAccessor; +import org.apache.cassandra.db.marshal.ValueAccessor; +import org.apache.cassandra.io.ParameterisedVersionedSerializer; +import org.apache.cassandra.io.UnversionedSerializer; +import org.apache.cassandra.io.VersionedSerializer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.service.accord.txn.AccordUpdate; +import org.apache.cassandra.service.accord.txn.TxnQuery; +import org.apache.cassandra.service.accord.txn.TxnRead; +import org.apache.cassandra.service.accord.txn.TxnWrite; +import org.apache.cassandra.utils.NullableSerializer; + +public class CommandSerializers +{ + private CommandSerializers() + { + } + + public static final VariableWidthTimestampSerializer txnId = new VariableWidthTimestampSerializer<>(TxnId::fromValues); + public static final VariableWidthTimestampSerializer timestamp = new VariableWidthTimestampSerializer<>(Timestamp::fromValues); + public static final BallotSerializer ballot = new BallotSerializer(); // permits null + public static final UnversionedSerializer kind = EncodeAsVInt32.of(Txn.Kind.class); + public static final StoreParticipantsSerializer participants = new StoreParticipantsSerializer(); + + public static class ExecuteAtSerializer + { + private static final int IS_TIMESTAMP = 1; + private static final int HAS_UNIQUE_HLC = 2; + private static final int HAS_EPOCH = 4; + + public static Timestamp deserialize(TxnId txnId, DataInputPlus in) throws IOException + { + int flags = in.readUnsignedVInt32(); + if ((flags & 1) == 0) + return txnId.addFlags(flags >>> 1); + + long epoch = txnId.epoch(); + if((flags & HAS_EPOCH) != 0) + { + long delta = in.readUnsignedVInt(); + if (delta == 0) + return Timestamp.NONE; + epoch += delta - 1; + } + + long hlc = txnId.hlc() + in.readUnsignedVInt(); + Node.Id node = new Node.Id(in.readUnsignedVInt32()); + if ((flags & HAS_UNIQUE_HLC) == 0) + return Timestamp.fromValues(epoch, hlc, flags >>> 3, node); + return new TimestampWithUniqueHlc(epoch, hlc, hlc + in.readUnsignedVInt(), flags >>> 3, node); + } + + public static void skip(TxnId txnId, DataInputPlus in) throws IOException + { + int flags = in.readUnsignedVInt32(); + if ((flags & 1) != 0) + { + if ((flags & HAS_EPOCH) != 0 && in.readUnsignedVInt() == 0) + return; + in.readUnsignedVInt(); + in.readUnsignedVInt32(); + if ((flags & HAS_UNIQUE_HLC) != 0) + in.readUnsignedVInt(); + } + } + + public static void serialize(TxnId txnId, Timestamp executeAt, DataOutputPlus out) throws IOException + { + int flags = flags(txnId, executeAt); + out.writeUnsignedVInt32(flags); + if ((flags & 1) != 0) + { + if ((flags & HAS_EPOCH) != 0) + { + if (executeAt.equals(Timestamp.NONE)) + { + out.writeUnsignedVInt(0L); + return; + } + out.writeUnsignedVInt(1 + executeAt.epoch() - txnId.epoch()); + } + out.writeUnsignedVInt(executeAt.hlc() - txnId.hlc()); + out.writeUnsignedVInt32(executeAt.node.id); + if ((flags & HAS_UNIQUE_HLC) != 0) + out.writeUnsignedVInt(executeAt.uniqueHlc() - executeAt.hlc()); + } + } + + private static int flags(TxnId txnId, Timestamp executeAt) + { + if (executeAt.getClass() == TxnId.class) + return (executeAt.flags() ^ txnId.flags()) << 1; + + int flags = executeAt.flags() << 3; + if (executeAt.epoch() != txnId.epoch()) + flags |= HAS_EPOCH; + if (executeAt.hasDistinctHlcAndUniqueHlc()) + flags |= HAS_UNIQUE_HLC; + return flags | 1; + } + + public static long serializedSize(TxnId txnId, Timestamp executeAt) + { + int flags = flags(txnId, executeAt); + long size = TypeSizes.sizeofUnsignedVInt(flags); + if ((flags & 1) != 0) + { + if ((flags & HAS_EPOCH) != 0) + { + if (executeAt.equals(Timestamp.NONE)) + return size + TypeSizes.sizeofUnsignedVInt(0L); + + size += TypeSizes.sizeofUnsignedVInt(executeAt.epoch() - txnId.epoch()); + } + size += TypeSizes.sizeofUnsignedVInt(executeAt.hlc() - txnId.hlc()); + size += TypeSizes.sizeofUnsignedVInt(executeAt.node.id); + if ((flags & HAS_UNIQUE_HLC) != 0) + size += TypeSizes.sizeofUnsignedVInt(executeAt.uniqueHlc() - executeAt.hlc()); + } + return size; + } + + public static Timestamp deserialize(DataInputPlus in) throws IOException + { + return deserialize(in, false); + } + + public static Timestamp deserializeNullable(DataInputPlus in) throws IOException + { + return deserialize(in, true); + } + + private static Timestamp deserialize(DataInputPlus in, boolean nullable) throws IOException + { + int flags = in.readUnsignedVInt32(); + if (nullable) + { + if (flags == 0) return null; + flags--; + } + long epoch = in.readUnsignedVInt(); + if (epoch-- == 0) + return Timestamp.NONE; + + long hlc = in.readUnsignedVInt(); + Node.Id node = new Node.Id(in.readUnsignedVInt32()); + if ((flags & HAS_UNIQUE_HLC) == 0) + { + if ((flags & IS_TIMESTAMP) == 0) + return TxnId.fromValues(epoch, hlc, flags >>> 2, node); + return Timestamp.fromValues(epoch, hlc, flags >>> 2, node); + } + return new TimestampWithUniqueHlc(epoch, hlc, hlc + in.readUnsignedVInt(), flags >>> 2, node); + } + + public static void skip(DataInputPlus in) throws IOException + { + skip(in, false); + } + + public static void skipNullable(DataInputPlus in) throws IOException + { + skip(in, true); + } + + private static void skip(DataInputPlus in, boolean nullable) throws IOException + { + int flags = in.readUnsignedVInt32(); + if (nullable) + { + if (flags == 0) + return; + flags--; + } + if (0 == in.readUnsignedVInt()) + return; + in.readUnsignedVInt(); + in.readUnsignedVInt32(); + if ((flags & HAS_UNIQUE_HLC) != 0) + in.readUnsignedVInt(); + } + + public static void serialize(Timestamp executeAt, DataOutputPlus out) throws IOException + { + serialize(executeAt, out, false); + } + + public static void serializeNullable(Timestamp executeAt, DataOutputPlus out) throws IOException + { + serialize(executeAt, out, true); + } + + private static void serialize(Timestamp executeAt, DataOutputPlus out, boolean nullable) throws IOException + { + int flags = flags(executeAt, nullable); + out.writeUnsignedVInt32(flags); + if (executeAt == null) + { + Invariants.require(nullable); + } + else if (executeAt.equals(Timestamp.NONE)) + { + out.writeUnsignedVInt(0L); + } + else + { + out.writeUnsignedVInt(1 + executeAt.epoch()); + out.writeUnsignedVInt(executeAt.hlc()); + out.writeUnsignedVInt32(executeAt.node.id); + if (executeAt.hasDistinctHlcAndUniqueHlc()) + out.writeUnsignedVInt(executeAt.uniqueHlc() - executeAt.hlc()); + } + } + + public static long serializedSize(Timestamp executeAt) + { + return serializedSize(executeAt, false); + } + + public static long serializedNullableSize(Timestamp executeAt) + { + return serializedSize(executeAt, true); + } + + private static long serializedSize(Timestamp executeAt, boolean nullable) + { + int flags = flags(executeAt, nullable); + long size = TypeSizes.sizeofUnsignedVInt(flags); + if (executeAt == null) + { + Invariants.require(nullable); + return size; + } + if (executeAt.equals(Timestamp.NONE)) size += TypeSizes.sizeofUnsignedVInt(0); + else + { + size += TypeSizes.sizeofUnsignedVInt(1 + executeAt.epoch()); + size += TypeSizes.sizeofUnsignedVInt(executeAt.hlc()); + size += TypeSizes.sizeofUnsignedVInt(executeAt.node.id); + if (executeAt.hasDistinctHlcAndUniqueHlc()) + size += TypeSizes.sizeofUnsignedVInt(executeAt.uniqueHlc() - executeAt.hlc()); + } + return size; + } + + private static int flags(Timestamp executeAt, boolean nullable) + { + if (executeAt == null) + { + Invariants.require(nullable); + return 0; + } + + int flags = executeAt.flags() << 2; + // for compatibility with other serialized form + flags |= (executeAt.getClass() == TxnId.class) ? 0 : 1; + if (executeAt.hasDistinctHlcAndUniqueHlc()) + flags |= HAS_UNIQUE_HLC; + if (nullable) + flags++; + return flags; + } + } + + public static class StoreParticipantsSerializer implements UnversionedSerializer + { + static final int HAS_ROUTE = 0x1; + static final int ROUTE_EQUALS_SUPERSET = 0x2; + static final int HAS_TOUCHED_EQUALS_SUPERSET = 0x4; + static final int TOUCHES_EQUALS_HAS_TOUCHED = 0x8; + static final int OWNS_EQUALS_TOUCHES = 0x10; + static final int EXECUTES_IS_NULL = 0x20; + static final int EXECUTES_IS_OWNS = 0x40; + static final int WAITSON_IS_OWNS = 0x80; + + @Override + public void serialize(StoreParticipants t, DataOutputPlus out) throws IOException + { + Participants hasTouched = t.hasTouched(); + Route route = t.route(); + Participants owns = t.owns(); + Participants executes = t.executes(); + Participants touches = t.touches(); + boolean hasRoute = route != null; + boolean touchesEqualsHasTouched = touches == hasTouched; + boolean ownsEqualsTouches = owns == touches; + boolean executesIsNull = executes == null; + boolean executesIsOwns = !executesIsNull && executes == owns; + boolean waitsOnIsOwns = !executesIsNull && t.waitsOn() == owns; + boolean encodeSubsets = hasTouched.domain() == Routable.Domain.Key; + Participants superset = !hasRoute ? hasTouched : encodeSubsets ? route.with((Participants)hasTouched) : route; + boolean routeEqualsSuperset = route == superset; + boolean hasTouchedEqualsSuperset = hasTouched == superset; + out.writeByte((hasRoute ? HAS_ROUTE : 0) + | (routeEqualsSuperset ? ROUTE_EQUALS_SUPERSET : 0) + | (hasTouchedEqualsSuperset ? HAS_TOUCHED_EQUALS_SUPERSET : 0) + | (touchesEqualsHasTouched ? TOUCHES_EQUALS_HAS_TOUCHED : 0) + | (ownsEqualsTouches ? OWNS_EQUALS_TOUCHES : 0) + | (executesIsNull ? EXECUTES_IS_NULL : 0) + | (executesIsOwns ? EXECUTES_IS_OWNS : 0) + | (waitsOnIsOwns ? WAITSON_IS_OWNS : 0) + ); + + KeySerializers.participants.serialize(superset, out); + if (encodeSubsets) + { + if (hasRoute && !routeEqualsSuperset) KeySerializers.route.serializeSubset(route, superset, out); + if (!hasTouchedEqualsSuperset) KeySerializers.participants.serializeSubset(hasTouched, superset, out); + if (!touchesEqualsHasTouched) KeySerializers.participants.serializeSubset(touches, superset, out); + if (!ownsEqualsTouches) KeySerializers.participants.serializeSubset(owns, superset, out); + if (!executesIsNull && !executesIsOwns) KeySerializers.participants.serializeSubset(executes, superset, out); + if (!executesIsNull && !waitsOnIsOwns) KeySerializers.participants.serializeSubset(t.waitsOn(), superset, out); + } + else + { + if (hasRoute && !routeEqualsSuperset) KeySerializers.route.serialize(route, out); + if (!hasTouchedEqualsSuperset) KeySerializers.participants.serialize(hasTouched, out); + if (!touchesEqualsHasTouched) KeySerializers.participants.serialize(touches, out); + if (!ownsEqualsTouches) KeySerializers.participants.serialize(owns, out); + if (!executesIsNull && !executesIsOwns) KeySerializers.participants.serialize(executes, out); + if (!executesIsNull && !waitsOnIsOwns) KeySerializers.participants.serialize(t.waitsOn(), out); + } + } + + public void skip(DataInputPlus in) throws IOException + { + int flags = in.readByte(); + Unseekables.UnseekablesKind kind = KeySerializers.participants.readKind(in); + int supersetCount = KeySerializers.participants.countAndSkip(kind, in); + boolean skipSubset = kind.domain() == Routable.Domain.Key; + if (skipSubset) + { + if (0 != (flags & HAS_ROUTE) && 0 == (flags & ROUTE_EQUALS_SUPERSET)) KeySerializers.route.skipSubset(supersetCount, in); + if (0 == (flags & HAS_TOUCHED_EQUALS_SUPERSET)) KeySerializers.participants.skipSubset(supersetCount, in); + if (0 == (flags & TOUCHES_EQUALS_HAS_TOUCHED)) KeySerializers.participants.skipSubset(supersetCount, in); + if (0 == (flags & OWNS_EQUALS_TOUCHES)) KeySerializers.participants.skipSubset(supersetCount, in); + if (0 == (flags & (EXECUTES_IS_OWNS | EXECUTES_IS_NULL))) KeySerializers.participants.skipSubset(supersetCount, in); + if (0 == (flags & (WAITSON_IS_OWNS | EXECUTES_IS_NULL))) KeySerializers.participants.skipSubset(supersetCount, in); + } + else + { + if (0 != (flags & HAS_ROUTE) && 0 == (flags & ROUTE_EQUALS_SUPERSET)) KeySerializers.route.skip(in); + if (0 == (flags & HAS_TOUCHED_EQUALS_SUPERSET)) KeySerializers.participants.skip(in); + if (0 == (flags & TOUCHES_EQUALS_HAS_TOUCHED)) KeySerializers.participants.skip(in); + if (0 == (flags & OWNS_EQUALS_TOUCHES)) KeySerializers.participants.skip(in); + if (0 == (flags & (EXECUTES_IS_OWNS | EXECUTES_IS_NULL))) KeySerializers.participants.skip(in); + if (0 == (flags & (WAITSON_IS_OWNS | EXECUTES_IS_NULL))) KeySerializers.participants.skip(in); + } + } + + @Override + public StoreParticipants deserialize(DataInputPlus in) throws IOException + { + int flags = in.readByte(); + Participants superset = KeySerializers.participants.deserialize(in); + boolean decodeSubset = superset.domain() == Routable.Domain.Key; + if (decodeSubset) + { + Route route = 0 == (flags & HAS_ROUTE) ? null : 0 != (flags & ROUTE_EQUALS_SUPERSET) ? (Route)superset : KeySerializers.route.deserializeSubset(superset, in); + Participants hasTouched = 0 != (flags & HAS_TOUCHED_EQUALS_SUPERSET) ? superset : KeySerializers.participants.deserializeSubset(superset, in); + Participants touches = 0 != (flags & TOUCHES_EQUALS_HAS_TOUCHED) ? hasTouched : KeySerializers.participants.deserializeSubset(superset, in); + Participants owns = 0 != (flags & OWNS_EQUALS_TOUCHES) ? touches : KeySerializers.participants.deserializeSubset(superset, in); + Participants executes = 0 != (flags & EXECUTES_IS_NULL) ? null : 0 != (flags & EXECUTES_IS_OWNS) ? owns : KeySerializers.participants.deserializeSubset(superset, in); + Participants waitsOn = 0 != (flags & EXECUTES_IS_NULL) ? null : 0 != (flags & WAITSON_IS_OWNS) ? owns : KeySerializers.participants.deserializeSubset(superset, in); + return StoreParticipants.create(route, owns, executes, waitsOn, touches, hasTouched); + } + else + { + Route route = 0 == (flags & HAS_ROUTE) ? null : 0 != (flags & ROUTE_EQUALS_SUPERSET) ? (Route)superset : KeySerializers.route.deserialize(in); + Participants hasTouched = 0 != (flags & HAS_TOUCHED_EQUALS_SUPERSET) ? superset : KeySerializers.participants.deserialize(in); + Participants touches = 0 != (flags & TOUCHES_EQUALS_HAS_TOUCHED) ? hasTouched : KeySerializers.participants.deserialize(in); + Participants owns = 0 != (flags & OWNS_EQUALS_TOUCHES) ? touches : KeySerializers.participants.deserialize(in); + Participants executes = 0 != (flags & EXECUTES_IS_NULL) ? null : 0 != (flags & EXECUTES_IS_OWNS) ? owns : KeySerializers.participants.deserialize(in); + Participants waitsOn = 0 != (flags & EXECUTES_IS_NULL) ? null : 0 != (flags & WAITSON_IS_OWNS) ? owns : KeySerializers.participants.deserialize(in); + return StoreParticipants.create(route, owns, executes, waitsOn, touches, hasTouched); + } + } + + @Override + public long serializedSize(StoreParticipants t) + { + Participants hasTouched = t.hasTouched(); + Route route = t.route(); + Participants owns = t.owns(); + Participants executes = t.executes(); + Participants touches = t.touches(); + boolean hasRoute = route != null; + boolean touchesEqualsHasTouched = touches == hasTouched; + boolean ownsEqualsTouches = owns == touches; + boolean executesIsNull = executes == null; + boolean executesIsOwns = !executesIsNull && executes == owns; + boolean waitsOnIsOwns = !executesIsNull && t.waitsOn() == owns; + boolean encodeSubsets = hasTouched.domain() == Routable.Domain.Key; + Participants superset = !hasRoute ? hasTouched : encodeSubsets ? route.with((Participants)hasTouched) : route; + boolean routeEqualsSuperset = route == superset; + boolean hasTouchedEqualsSuperset = hasTouched == superset; + long size = 1 + KeySerializers.participants.serializedSize(superset); + if (encodeSubsets) + { + if (hasRoute && !routeEqualsSuperset) size += KeySerializers.route.serializedSubsetSize(route, superset); + if (!hasTouchedEqualsSuperset) size += KeySerializers.participants.serializedSubsetSize(hasTouched, superset); + if (!touchesEqualsHasTouched) size += KeySerializers.participants.serializedSubsetSize(touches, superset); + if (!ownsEqualsTouches) size += KeySerializers.participants.serializedSubsetSize(owns, superset); + if (!executesIsNull && !executesIsOwns) size += KeySerializers.participants.serializedSubsetSize(executes, superset); + if (!executesIsNull && !waitsOnIsOwns) size += KeySerializers.participants.serializedSubsetSize(t.waitsOn(), superset); + } + else + { + if (hasRoute && !routeEqualsSuperset) size += KeySerializers.route.serializedSize(route); + if (!hasTouchedEqualsSuperset) size += KeySerializers.participants.serializedSize(hasTouched); + if (!touchesEqualsHasTouched) size += KeySerializers.participants.serializedSize(touches); + if (!ownsEqualsTouches) size += KeySerializers.participants.serializedSize(owns); + if (!executesIsNull && !executesIsOwns) size += KeySerializers.participants.serializedSize(executes); + if (!executesIsNull && !waitsOnIsOwns) size += KeySerializers.participants.serializedSize(t.waitsOn()); + } + return size; + } + } + + public static class VariableWidthTimestampSerializer implements UnversionedSerializer + { + private static final int NODE_SHIFT = 0; + private static final int NODE_MASK = 0x3; + private static final int NODE_MIN_LENGTH = 1; + private static final int FLAGS_SHIFT = NODE_SHIFT + Integer.bitCount(NODE_MASK); + private static final int FLAGS_MASK = 0x1; + private static final int FLAGS_MIN_LENGTH = 1; + private static final int HLC_SHIFT = FLAGS_SHIFT + Integer.bitCount(FLAGS_MASK); + private static final int HLC_MASK = 0x3; + private static final int HLC_MIN_LENGTH = 5; + private static final int EPOCH_SHIFT = HLC_SHIFT + Integer.bitCount(HLC_MASK); + private static final int EPOCH_MASK = 0x3; + private static final int EPOCH_MIN_LENGTH = 3; + static final byte NULL_BYTE = (byte) 0x80; + static + { + Invariants.require(EPOCH_MASK << EPOCH_SHIFT >= 0); + } + + interface Factory + { + T create(long epoch, long hlc, int flags, Node.Id node); + } + + private final VariableWidthTimestampSerializer.Factory factory; + + T decodeSpecial(int encodingFlags) + { + Invariants.require(encodingFlags == NULL_BYTE); + return null; + } + + byte encodeSpecial(T value) + { + if (value != null) + return 0; + return NULL_BYTE; + } + + private VariableWidthTimestampSerializer(VariableWidthTimestampSerializer.Factory factory) + { + this.factory = factory; + } + + @Override + public void serialize(T ts, DataOutputPlus out) throws IOException + { + { + byte specialByte = encodeSpecial(ts); + if (specialByte != 0) + { + Invariants.require(specialByte < 0); + out.writeByte(specialByte); + return; + } + } + long epoch = ts.epoch(); + long hlc = ts.hlc(); + int flags = ts.flags(); + int epochLength = length(epoch, EPOCH_MIN_LENGTH); + int hlcLength = length(hlc, HLC_MIN_LENGTH); + int flagsLength = length(flags, FLAGS_MIN_LENGTH); + int nodeLength = length(ts.node.id, NODE_MIN_LENGTH); + int encodingFlags = encodeLength(epochLength, EPOCH_SHIFT, EPOCH_MIN_LENGTH, EPOCH_MASK) + | encodeLength(hlcLength, HLC_SHIFT, HLC_MIN_LENGTH, HLC_MASK) + | encodeLength(flagsLength, FLAGS_SHIFT, FLAGS_MIN_LENGTH, FLAGS_MASK) + | encodeLength(nodeLength, NODE_SHIFT, NODE_MIN_LENGTH, NODE_MASK); + out.writeByte(encodingFlags); + out.writeLeastSignificantBytes(epoch, epochLength); + out.writeLeastSignificantBytes(hlc, hlcLength); + out.writeLeastSignificantBytes(flags, flagsLength); + out.writeLeastSignificantBytes(ts.node.id, nodeLength); + } + + // exactly the same fundamental format as serialize(), only we interleave the length bits with the values, maintaining ordering + public int serializeComparable(T ts, V dst, ValueAccessor accessor, int offset) + { + int position = offset; + Invariants.require(encodeSpecial(ts) == 0); + long epoch = ts.epoch(); + long hlc = ts.hlc(); + int flags = ts.flags(); + int epochLength = length(epoch, EPOCH_MIN_LENGTH); + int hlcLength = length(hlc, HLC_MIN_LENGTH); + int flagsLength = length(flags, FLAGS_MIN_LENGTH); + int nodeLength = length(ts.node.id, NODE_MIN_LENGTH); + + long pack = packLength(epochLength, epochLength * 8, EPOCH_MIN_LENGTH, EPOCH_MASK); + pack |= epoch; + pack <<= 5; + pack |= packLength(hlcLength, 3, HLC_MIN_LENGTH, HLC_MASK); + pack |= hlc >>> ((hlcLength*8)-3); + accessor.putLeastSignificantBytes(dst, position, pack, epochLength + 1); + position += epochLength + 1; + + hlc <<= 3; + hlc |= packLength(flagsLength, 2, FLAGS_MIN_LENGTH, FLAGS_MASK); + hlc |= flags >>> ((flagsLength * 8) - 2); + accessor.putLeastSignificantBytes(dst, position, hlc, hlcLength); + position += hlcLength; + + pack = (long)flags << (2 + nodeLength * 8); + pack |= packLength(nodeLength, nodeLength * 8, NODE_MIN_LENGTH, NODE_MASK); + pack |= ts.node.id & 0xffffffffL; + accessor.putLeastSignificantBytes(dst, position, pack, flagsLength + nodeLength); + position += flagsLength + nodeLength; + return position - offset; + } + + public int serialize(T ts, V dst, ValueAccessor accessor, int offset) + { + { + byte specialByte = encodeSpecial(ts); + if (specialByte != 0) + { + Invariants.require(specialByte < 0); + accessor.putByte(dst, offset, specialByte); + return 1; + } + } + + long epoch = ts.epoch(); + long hlc = ts.hlc(); + int flags = ts.flags(); + int epochLength = length(epoch, EPOCH_MIN_LENGTH); + int hlcLength = length(hlc, HLC_MIN_LENGTH); + int flagsLength = length(flags, FLAGS_MIN_LENGTH); + int nodeLength = length(ts.node.id, NODE_MIN_LENGTH); + int encodingFlags = encodeLength(epochLength, EPOCH_SHIFT, EPOCH_MIN_LENGTH, EPOCH_MASK) + | encodeLength(hlcLength, HLC_SHIFT, HLC_MIN_LENGTH, HLC_MASK) + | encodeLength(flagsLength, FLAGS_SHIFT, FLAGS_MIN_LENGTH, FLAGS_MASK) + | encodeLength(nodeLength, NODE_SHIFT, NODE_MIN_LENGTH, NODE_MASK); + + int position = offset; + position += accessor.putByte(dst, position, (byte)encodingFlags); + position += accessor.putLeastSignificantBytes(dst, position, epoch, epochLength); + position += accessor.putLeastSignificantBytes(dst, position, hlc, hlcLength); + position += accessor.putLeastSignificantBytes(dst, position, flags, flagsLength); + position += accessor.putLeastSignificantBytes(dst, position, ts.node.id, nodeLength); + return position - offset; + } + + public ByteBuffer serialize(T ts) + { + int size = Math.toIntExact(serializedSize(ts)); + ByteBuffer result = ByteBuffer.allocate(size); + serialize(ts, result, ByteBufferAccessor.instance, 0); + return result; + } + + public void serialize(T ts, ByteBuffer out) + { + int position = out.position(); + position += serialize(ts, out, ByteBufferAccessor.instance, 0); + out.position(position); + } + + public void skip(DataInputPlus in) throws IOException + { + int encodingFlags = in.readByte(); + if (encodingFlags < 0) + return; + int epochLength = decodeLength(encodingFlags, EPOCH_SHIFT, EPOCH_MIN_LENGTH, EPOCH_MASK); + int hlcLength = decodeLength(encodingFlags, HLC_SHIFT, HLC_MIN_LENGTH, HLC_MASK); + int flagsLength = decodeLength(encodingFlags, FLAGS_SHIFT, FLAGS_MIN_LENGTH, FLAGS_MASK); + int nodeLength = decodeLength(encodingFlags, NODE_SHIFT, NODE_MIN_LENGTH, NODE_MASK); + in.skipBytesFully(epochLength + hlcLength + flagsLength + nodeLength); + } + + @Override + public T deserialize(DataInputPlus in) throws IOException + { + int encodingFlags = in.readByte(); + if (encodingFlags < 0) + return decodeSpecial(encodingFlags); + int epochLength = decodeLength(encodingFlags, EPOCH_SHIFT, EPOCH_MIN_LENGTH, EPOCH_MASK); + int hlcLength = decodeLength(encodingFlags, HLC_SHIFT, HLC_MIN_LENGTH, HLC_MASK); + int flagsLength = decodeLength(encodingFlags, FLAGS_SHIFT, FLAGS_MIN_LENGTH, FLAGS_MASK); + int nodeLength = decodeLength(encodingFlags, NODE_SHIFT, NODE_MIN_LENGTH, NODE_MASK); + long epoch = in.readLeastSignificantBytes(epochLength); + long hlc = in.readLeastSignificantBytes(hlcLength); + int flags = Math.toIntExact(in.readLeastSignificantBytes(flagsLength)); + int nodeId = (int)in.readLeastSignificantBytes(nodeLength); + return factory.create(epoch, hlc, flags, new Node.Id(nodeId)); + } + + public T deserialize(V src, ValueAccessor accessor, int offset) + { + int encodingFlags = accessor.getByte(src, offset); + if (encodingFlags < 0) + return decodeSpecial(encodingFlags); + ++offset; + int epochLength = decodeLength(encodingFlags, EPOCH_SHIFT, EPOCH_MIN_LENGTH, EPOCH_MASK); + int hlcLength = decodeLength(encodingFlags, HLC_SHIFT, HLC_MIN_LENGTH, HLC_MASK); + int flagsLength = decodeLength(encodingFlags, FLAGS_SHIFT, FLAGS_MIN_LENGTH, FLAGS_MASK); + int nodeLength = decodeLength(encodingFlags, NODE_SHIFT, NODE_MIN_LENGTH, NODE_MASK); + long epoch = accessor.getLeastSignificantBytes(src, offset, epochLength); + offset += epochLength; + long hlc = accessor.getLeastSignificantBytes(src, offset, hlcLength); + offset += hlcLength; + int flags = Math.toIntExact(accessor.getLeastSignificantBytes(src, offset, flagsLength)); + offset += flagsLength; + int nodeId = (int)accessor.getLeastSignificantBytes(src, offset, nodeLength); + return factory.create(epoch, hlc, flags, new Node.Id(nodeId)); + } + + public T deserialize(ByteBuffer buffer, int position) + { + return deserialize(buffer, ByteBufferAccessor.instance, position); + } + + public T deserialize(ByteBuffer buffer) + { + return deserialize(buffer, ByteBufferAccessor.instance, 0); + } + + // exactly the same fundamental format as deserialize(), only we interleave the length bits with the values, maintaining ordering + public T deserializeComparable(V src, ValueAccessor accessor, int offset) + { + int b = accessor.getByte(src, offset++); + int epochLength = decodeLength(b, 5, EPOCH_MIN_LENGTH, EPOCH_MASK); + long bits64 = accessor.getLeastSignificantBytes(src, offset, epochLength); + offset += epochLength; + long epoch = (b&0x1fL) << (epochLength*8 - 5); + epoch |= bits64 >>> 5; + + int hlcLength = decodeLength((int)bits64, 3, HLC_MIN_LENGTH, HLC_MASK); + long hlc = (bits64 & 0x7L) << (hlcLength*8 - 3); + bits64 = accessor.getLeastSignificantBytes(src, offset, hlcLength); + offset += hlcLength; + hlc |= bits64 >>> 3; + + int flagsLength = decodeLength((int)bits64, 2, FLAGS_MIN_LENGTH, FLAGS_MASK); + int flags = ((int)bits64 & 0x3) << (flagsLength*8-2); + int bits32 = (int) accessor.getLeastSignificantBytes(src, offset, flagsLength); + offset += flagsLength; + flags |= bits32 >>> 2; + + int nodeLength = decodeLength(bits32, 0, NODE_MIN_LENGTH, NODE_MASK); + int node = (int) accessor.getLeastSignificantBytes(src, offset, nodeLength); + return factory.create(epoch, hlc, flags, new Node.Id(node)); + } + + @Override + public long serializedSize(T ts) + { + if (encodeSpecial(ts) != 0) + return 1; + int epochLength = length(ts.epoch(), EPOCH_MIN_LENGTH); + int hlcLength = length(ts.hlc(), HLC_MIN_LENGTH); + int flagsLength = length(ts.flags(), FLAGS_MIN_LENGTH); + int nodeLength = length(ts.node.id, NODE_MIN_LENGTH); + return 1 + epochLength + hlcLength + flagsLength + nodeLength; + } + + private static int length(long value, int minLength) + { + int length = ((64 + 7) - Long.numberOfLeadingZeros(value))/8; + return Math.max(length, minLength); + } + + private static int length(int value, int minLength) + { + int length = ((32 + 7) - Integer.numberOfLeadingZeros(value))/8; + return Math.max(length, minLength); + } + + private static int encodeLength(int length, int shift, int minLength, int mask) + { + int encoded = length - minLength; + Invariants.require(encoded <= mask); + return encoded << shift; + } + + private static long packLength(int length, int shift, int minLength, int mask) + { + int encoded = length - minLength; + Invariants.require(encoded <= mask); + return (long)encoded << shift; + } + + private static int decodeLength(int encodingFlags, int shift, int minLength, int mask) + { + return minLength + ((encodingFlags >>> shift) & mask); + } + } + + public static class BallotSerializer extends VariableWidthTimestampSerializer + { + private static final byte ZERO_BYTE = (byte) 0x81; + private static final byte MAX_BYTE = (byte) 0x82; + private BallotSerializer() + { + super(Ballot::fromValues); + } + + @Override + byte encodeSpecial(Ballot value) + { + if (value == null) return NULL_BYTE; + if (value == Ballot.ZERO) return ZERO_BYTE; + if (value == Ballot.MAX) return MAX_BYTE; + return 0; + } + + @Override + Ballot decodeSpecial(int specialByte) + { + if (specialByte == NULL_BYTE) return null; + if (specialByte == ZERO_BYTE) return Ballot.ZERO; + if (specialByte == MAX_BYTE) return Ballot.MAX; + throw new IllegalArgumentException("Unexpected specialByte: " + specialByte); + } + } + + public static class PartialTxnSerializer + implements IVersionedSerializer + { + private final ParameterisedVersionedSerializer readSerializer; + private final UnversionedSerializer querySerializer; + private final ParameterisedVersionedSerializer updateSerializer; + private final UnversionedSerializer tablesAndKeysSerializer; + + public PartialTxnSerializer(ParameterisedVersionedSerializer readSerializer, + UnversionedSerializer querySerializer, + ParameterisedVersionedSerializer updateSerializer, + UnversionedSerializer tablesAndKeysSerializer) + { + this.readSerializer = readSerializer; + this.querySerializer = querySerializer; + this.updateSerializer = updateSerializer; + this.tablesAndKeysSerializer = tablesAndKeysSerializer; + } + + @Override + public void serialize(PartialTxn txn, DataOutputPlus out, Version version) throws IOException + { + PartialTxn.InMemory cast = (PartialTxn.InMemory)txn; + CommandSerializers.kind.serialize(txn.kind(), out); + TableMetadatasAndKeys tablesAndKeys = (TableMetadatasAndKeys) cast.implementationDefined; + if (tablesAndKeys != null) tablesAndKeysSerializer.serialize(tablesAndKeys, out); + else KeySerializers.seekables.serialize(txn.keys(), out); + readSerializer.serialize(txn.read(), tablesAndKeys, out, version); + querySerializer.serialize(txn.query(), out); + out.writeBoolean(txn.update() != null); + if (txn.update() != null) + updateSerializer.serialize(txn.update(), tablesAndKeys, out, version); + } + + @Override + public PartialTxn deserialize(DataInputPlus in, Version version) throws IOException + { + Txn.Kind kind = CommandSerializers.kind.deserialize(in); + TableMetadatasAndKeys tablesAndKeys = tablesAndKeysSerializer.deserialize(in); + Seekables keys = tablesAndKeys != null ? tablesAndKeys.keys : KeySerializers.seekables.deserialize(in); + Read read = readSerializer.deserialize(tablesAndKeys, in, version); + Query query = querySerializer.deserialize(in); + Update update = in.readBoolean() ? updateSerializer.deserialize(tablesAndKeys, in, version) : null; + return new PartialTxn.InMemory(kind, keys, read, query, update, tablesAndKeys); + } + + @Override + public long serializedSize(PartialTxn txn, Version version) + { + long size = CommandSerializers.kind.serializedSize(txn.kind()); + TableMetadatasAndKeys tablesAndKeys = (TableMetadatasAndKeys) ((PartialTxn.InMemory)txn).implementationDefined; + if (tablesAndKeys != null) size += tablesAndKeysSerializer.serializedSize(tablesAndKeys); + else size += KeySerializers.seekables.serializedSize(txn.keys()); + size += readSerializer.serializedSize(txn.read(), tablesAndKeys, version); + size += querySerializer.serializedSize(txn.query()); + size += TypeSizes.sizeof(txn.update() != null); + if (txn.update() != null) + size += updateSerializer.serializedSize(txn.update(), tablesAndKeys, version); + return size; + } + } + + public static final ParameterisedVersionedSerializer read; + public static final UnversionedSerializer query; + public static final ParameterisedVersionedSerializer update; + public static final ParameterisedVersionedSerializer write; + public static final UnversionedSerializer tablesAndKeys; + + public static final VersionedSerializer partialTxn; + public static final VersionedSerializer nullablePartialTxn; + + static + { + // We use a separate class for initialization to make it easier for BurnTest to plug its own serializers. + QuerySerializers querySerializers = new QuerySerializers(); + read = querySerializers.read; + query = querySerializers.query; + update = querySerializers.update; + write = querySerializers.write; + tablesAndKeys = querySerializers.tablesAndKeys; + + partialTxn = querySerializers.partialTxn; + nullablePartialTxn = querySerializers.nullablePartialTxn; + } + + @VisibleForTesting + public static class QuerySerializers + { + public final ParameterisedVersionedSerializer read; + public final UnversionedSerializer query; + public final ParameterisedVersionedSerializer update; + public final ParameterisedVersionedSerializer write; + public final UnversionedSerializer tablesAndKeys; + + public final VersionedSerializer partialTxn; + public final VersionedSerializer nullablePartialTxn; + + private QuerySerializers() + { + this((ParameterisedVersionedSerializer) TxnRead.serializer, + (UnversionedSerializer) TxnQuery.serializer, + (ParameterisedVersionedSerializer) AccordUpdate.serializer, + (ParameterisedVersionedSerializer) TxnWrite.serializer, + TableMetadatasAndKeys.serializer); + } + + public QuerySerializers(ParameterisedVersionedSerializer read, + UnversionedSerializer query, + ParameterisedVersionedSerializer update, + ParameterisedVersionedSerializer write, + UnversionedSerializer tablesAndKeys) + { + this.read = read; + this.query = query; + this.update = update; + this.write = write; + this.tablesAndKeys = tablesAndKeys; + + this.partialTxn = new PartialTxnSerializer(read, query, update, tablesAndKeys); + this.nullablePartialTxn = NullableSerializer.wrap(partialTxn); + } + } + + public static final UnversionedSerializer saveStatus = EncodeAsVInt32.of(SaveStatus.class); + public static final UnversionedSerializer status = EncodeAsVInt32.of(Status.class); + public static final UnversionedSerializer durability = EncodeAsVInt32.of(Durability.class); + + public static final IVersionedSerializer writes = new IVersionedSerializer<>() + { + @Override + public void serialize(Writes writes, DataOutputPlus out, Version version) throws IOException + { + txnId.serialize(writes.txnId, out); + ExecuteAtSerializer.serialize(writes.txnId, writes.executeAt, out); + KeySerializers.seekables.serialize(writes.keys, out); + boolean hasWrite = writes.write != null; + out.writeBoolean(hasWrite); + if (hasWrite) + CommandSerializers.write.serialize(writes.write, writes.keys, out, version); + } + + @Override + public Writes deserialize(DataInputPlus in, Version version) throws IOException + { + TxnId id = txnId.deserialize(in); + Timestamp executeAt = ExecuteAtSerializer.deserialize(id, in); + Seekables seekables = KeySerializers.seekables.deserialize(in); + boolean hasWrite = in.readBoolean(); + Write write = null; + if (hasWrite) + write = CommandSerializers.write.deserialize(seekables, in, version); + return new Writes(id, executeAt, seekables, write); + } + + @Override + public long serializedSize(Writes writes, Version version) + { + long size = txnId.serializedSize(writes.txnId); + size += ExecuteAtSerializer.serializedSize(writes.txnId, writes.executeAt); + boolean hasWrites = writes.write != null; + size += KeySerializers.seekables.serializedSize(writes.keys); + size += TypeSizes.sizeof(hasWrites); + if (hasWrites) + size += CommandSerializers.write.serializedSize(writes.write, writes.keys, version); + return size; + } + }; + + public static final VersionedSerializer nullableWrites = NullableSerializer.wrap(writes); + public static final UnversionedSerializer knownDeps = EncodeAsVInt32.of(KnownDeps.class); + public static final UnversionedSerializer invalidIf = EncodeAsVInt32.of(Infer.InvalidIf.class); + + public static final UnversionedSerializer known = EncodeAsVInt32.withNulls(known -> known.encoded, Known::new); + +} \ No newline at end of file diff --git a/src/java/org/apache/cassandra/service/accord/serializers/CommandStoreSerializers.java b/src/java/org/apache/cassandra/service/accord/serializers/CommandStoreSerializers.java new file mode 100644 index 000000000000..cf28b5a4dc20 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/serializers/CommandStoreSerializers.java @@ -0,0 +1,230 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.serializers; + +import java.io.IOException; +import java.util.NavigableMap; +import java.util.TreeMap; +import java.util.function.IntFunction; + +import accord.api.RoutingKey; +import accord.local.DurableBefore; +import accord.local.RedundantBefore; +import accord.primitives.Range; +import accord.primitives.Ranges; +import accord.primitives.Timestamp; +import accord.primitives.TxnId; +import accord.utils.Invariants; +import accord.utils.ReducingRangeMap; +import accord.utils.TriFunction; +import org.apache.cassandra.db.TypeSizes; +import org.apache.cassandra.io.UnversionedSerializer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.utils.CollectionSerializers; +import org.apache.cassandra.utils.NullableSerializer; + +import static org.apache.cassandra.service.accord.serializers.CommandSerializers.ExecuteAtSerializer.deserializeNullable; +import static org.apache.cassandra.service.accord.serializers.CommandSerializers.ExecuteAtSerializer.serializeNullable; +import static org.apache.cassandra.service.accord.serializers.CommandSerializers.ExecuteAtSerializer.serializedNullableSize; + +public class CommandStoreSerializers +{ + private CommandStoreSerializers() {} + + public static class ReducingRangeMapSerializer> implements UnversionedSerializer + { + final UnversionedSerializer valueSerializer; + final IntFunction newValueArray; + final TriFunction constructor; + + public ReducingRangeMapSerializer(UnversionedSerializer valueSerializer, IntFunction newValueArray, TriFunction constructor) + { + this.valueSerializer = valueSerializer; + this.newValueArray = newValueArray; + this.constructor = constructor; + } + + @Override + public void serialize(R map, DataOutputPlus out) throws IOException + { + out.writeBoolean(map.inclusiveEnds()); + int mapSize = map.size(); + out.writeUnsignedVInt32(mapSize); + + for (int i=0; i 0) + KeySerializers.routingKey.serialize(map.startAt(mapSize), out); + } + + @Override + public R deserialize(DataInputPlus in) throws IOException + { + boolean inclusiveEnds = in.readBoolean(); + int mapSize = in.readUnsignedVInt32(); + RoutingKey[] keys = new RoutingKey[mapSize + 1]; + T[] values = newValueArray.apply(mapSize); + for (int i=0; i 0) + keys[mapSize] = KeySerializers.routingKey.deserialize(in); + return constructor.apply(inclusiveEnds, keys, values); + } + + @Override + public long serializedSize(R map) + { + long size = TypeSizes.BOOL_SIZE; + int mapSize = map.size(); + size += TypeSizes.sizeofUnsignedVInt(mapSize); + for (int i=0; i 0) + size += KeySerializers.routingKey.serializedSize(map.startAt(mapSize)); + + return size; + } + } + + public static UnversionedSerializer durableBefore = new ReducingRangeMapSerializer<>(NullableSerializer.wrap(new UnversionedSerializer<>() + { + @Override + public void serialize(DurableBefore.Entry t, DataOutputPlus out) throws IOException + { + CommandSerializers.txnId.serialize(t.majorityBefore, out); + CommandSerializers.txnId.serialize(t.universalBefore, out); + } + + @Override + public DurableBefore.Entry deserialize(DataInputPlus in) throws IOException + { + TxnId majorityBefore = CommandSerializers.txnId.deserialize(in); + TxnId universalBefore = CommandSerializers.txnId.deserialize(in); + return new DurableBefore.Entry(majorityBefore, universalBefore); + } + + @Override + public long serializedSize(DurableBefore.Entry t) + { + return CommandSerializers.txnId.serializedSize(t.majorityBefore) + + CommandSerializers.txnId.serializedSize(t.universalBefore); + } + }), DurableBefore.Entry[]::new, DurableBefore.SerializerSupport::create); + + public static final UnversionedSerializer redundantBeforeEntry = new UnversionedSerializer<>() + { + @Override + public void serialize(RedundantBefore.Bounds b, DataOutputPlus out) throws IOException + { + KeySerializers.range.serialize(b.range, out); + Invariants.require(b.startEpoch <= b.endEpoch); + out.writeUnsignedVInt(b.startEpoch); + if (b.endEpoch == Long.MAX_VALUE) out.writeUnsignedVInt(0L); + else out.writeUnsignedVInt(1 + b.endEpoch - b.startEpoch); + serializeNullable(b.staleUntilAtLeast, out); + out.writeUnsignedVInt32(b.bounds.length); + for (TxnId bound : b.bounds) + { + CommandSerializers.txnId.serialize(bound, out); + } + for (int status : b.statuses) + out.writeShort(status); + } + + @Override + public RedundantBefore.Bounds deserialize(DataInputPlus in) throws IOException + { + Range range = KeySerializers.range.deserialize(in); + long startEpoch = in.readUnsignedVInt(); + long endEpoch = in.readUnsignedVInt(); + if (endEpoch == 0) endEpoch = Long.MAX_VALUE; + else endEpoch = endEpoch - 1 + startEpoch; + Timestamp staleUntilAtLeast = deserializeNullable(in); + int count = in.readUnsignedVInt32(); + + TxnId[] bounds = new TxnId[count]; + for (int i = 0 ; i < bounds.length ; ++i) + bounds[i] = CommandSerializers.txnId.deserialize(in); + short[] statuses = new short[count * 2]; + for (int i = 0 ; i < statuses.length ; ++i) + statuses[i] = in.readShort(); + + return new RedundantBefore.Bounds(range, startEpoch, endEpoch, bounds, statuses, staleUntilAtLeast); + } + + @Override + public long serializedSize(RedundantBefore.Bounds b) + { + long size = KeySerializers.range.serializedSize(b.range); + size += TypeSizes.sizeofUnsignedVInt(b.startEpoch); + size += TypeSizes.sizeofUnsignedVInt(b.endEpoch == Long.MAX_VALUE ? 0 : 1 + b.endEpoch - b.startEpoch); + size += serializedNullableSize(b.staleUntilAtLeast); + size += TypeSizes.sizeofUnsignedVInt(b.bounds.length); + for (TxnId bound : b.bounds) + { + size += CommandSerializers.txnId.serializedSize(bound); + } + size += 2L * b.statuses.length; + return size; + } + }; + public static UnversionedSerializer redundantBefore = new ReducingRangeMapSerializer<>(NullableSerializer.wrap(redundantBeforeEntry), RedundantBefore.Bounds[]::new, RedundantBefore.SerializerSupport::create); + + private static class TimestampToRangesSerializer implements UnversionedSerializer> + { + private final UnversionedSerializer timestampSerializer; + + public TimestampToRangesSerializer(UnversionedSerializer timestampSerializer) + { + this.timestampSerializer = timestampSerializer; + } + + @Override + public void serialize(NavigableMap map, DataOutputPlus out) throws IOException + { + CollectionSerializers.serializeMap(map, out, timestampSerializer, KeySerializers.ranges); + } + + @Override + public NavigableMap deserialize(DataInputPlus in) throws IOException + { + return CollectionSerializers.deserializeMap(in, timestampSerializer, KeySerializers.ranges, i -> new TreeMap<>()); + + } + + @Override + public long serializedSize(NavigableMap map) + { + return CollectionSerializers.serializedMapSize(map, timestampSerializer, KeySerializers.ranges); + } + } + + public static final UnversionedSerializer> bootstrapBeganAt = new TimestampToRangesSerializer<>(CommandSerializers.txnId); + public static final UnversionedSerializer> safeToRead = new TimestampToRangesSerializer<>(CommandSerializers.timestamp); +} diff --git a/src/java/org/apache/cassandra/service/accord/serializers/CommitSerializers.java b/src/java/org/apache/cassandra/service/accord/serializers/CommitSerializers.java new file mode 100644 index 000000000000..31d879ce7c87 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/serializers/CommitSerializers.java @@ -0,0 +1,121 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.serializers; + +import java.io.IOException; + +import accord.messages.Commit; +import accord.primitives.Ballot; +import accord.primitives.FullRoute; +import accord.primitives.PartialDeps; +import accord.primitives.PartialTxn; +import accord.primitives.Participants; +import accord.primitives.Route; +import accord.primitives.Timestamp; +import accord.primitives.TxnId; +import org.apache.cassandra.db.TypeSizes; +import org.apache.cassandra.io.UnversionedSerializer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.service.accord.serializers.CommandSerializers.ExecuteAtSerializer; + +import static org.apache.cassandra.utils.NullableSerializer.deserializeNullable; +import static org.apache.cassandra.utils.NullableSerializer.serializeNullable; +import static org.apache.cassandra.utils.NullableSerializer.serializedNullableSize; + +public class CommitSerializers +{ + public static final UnversionedSerializer kind = EncodeAsVInt32.of(Commit.Kind.class); + + public static final CommitSerializer request = new CommitSerializer(); + public static class CommitSerializer extends TxnRequestSerializer.WithUnsyncedSerializer + { + @Override + public void serializeBody(Commit msg, DataOutputPlus out, Version version) throws IOException + { + kind.serialize(msg.kind, out); + CommandSerializers.ballot.serialize(msg.ballot, out); + ExecuteAtSerializer.serialize(msg.txnId, msg.executeAt, out); + CommandSerializers.nullablePartialTxn.serialize(msg.partialTxn, out, version); + if (msg.kind.withDeps == Commit.WithDeps.HasDeps) + DepsSerializers.partialDeps.serialize(msg.partialDeps, out); + serializeNullable(msg.route, out, KeySerializers.fullRoute); + } + + @Override + public Commit deserializeBody(DataInputPlus in, Version version, TxnId txnId, Route scope, long waitForEpoch, long minEpoch) throws IOException + { + Commit.Kind kind = CommitSerializers.kind.deserialize(in); + Ballot ballot = CommandSerializers.ballot.deserialize(in); + Timestamp executeAt = ExecuteAtSerializer.deserialize(txnId, in); + PartialTxn partialTxn = CommandSerializers.nullablePartialTxn.deserialize(in, version); + PartialDeps partialDeps = null; + if (kind.withDeps == Commit.WithDeps.HasDeps) + partialDeps = DepsSerializers.partialDeps.deserialize(in); + FullRoute route = deserializeNullable(in, KeySerializers.fullRoute); + return Commit.SerializerSupport.create(txnId, scope, waitForEpoch, minEpoch, kind, ballot, executeAt, partialTxn, partialDeps, route); + } + + @Override + public long serializedBodySize(Commit msg, Version version) + { + long size = kind.serializedSize(msg.kind) + + CommandSerializers.ballot.serializedSize(msg.ballot) + + ExecuteAtSerializer.serializedSize(msg.txnId, msg.executeAt) + + CommandSerializers.nullablePartialTxn.serializedSize(msg.partialTxn, version); + + if (msg.kind.withDeps == Commit.WithDeps.HasDeps) + size += DepsSerializers.partialDeps.serializedSize(msg.partialDeps); + + size += serializedNullableSize(msg.route, KeySerializers.fullRoute); + return size; + } + } + + public static final UnversionedSerializer invalidate = new UnversionedSerializer<>() + { + @Override + public void serialize(Commit.Invalidate invalidate, DataOutputPlus out) throws IOException + { + CommandSerializers.txnId.serialize(invalidate.txnId, out); + KeySerializers.participants.serialize(invalidate.scope, out); + out.writeUnsignedVInt(invalidate.waitForEpoch); + out.writeUnsignedVInt(invalidate.invalidateUntilEpoch - invalidate.waitForEpoch); + } + + @Override + public Commit.Invalidate deserialize(DataInputPlus in) throws IOException + { + TxnId txnId = CommandSerializers.txnId.deserialize(in); + Participants scope = KeySerializers.participants.deserialize(in); + long waitForEpoch = in.readUnsignedVInt(); + long invalidateUntilEpoch = in.readUnsignedVInt() + waitForEpoch; + return Commit.Invalidate.SerializerSupport.create(txnId, scope, waitForEpoch, invalidateUntilEpoch); + } + + @Override + public long serializedSize(Commit.Invalidate invalidate) + { + return CommandSerializers.txnId.serializedSize(invalidate.txnId) + + KeySerializers.participants.serializedSize(invalidate.scope) + + TypeSizes.sizeofUnsignedVInt(invalidate.waitForEpoch) + + TypeSizes.sizeofUnsignedVInt(invalidate.invalidateUntilEpoch - invalidate.waitForEpoch); + } + }; +} diff --git a/src/java/org/apache/cassandra/service/accord/serializers/DepsSerializers.java b/src/java/org/apache/cassandra/service/accord/serializers/DepsSerializers.java new file mode 100644 index 000000000000..daf6f923d22d --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/serializers/DepsSerializers.java @@ -0,0 +1,237 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.service.accord.serializers; + +import java.io.IOException; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.primitives.Ints; + +import accord.primitives.Deps; +import accord.primitives.KeyDeps; +import accord.primitives.PartialDeps; +import accord.primitives.Participants; +import accord.primitives.Range; +import accord.primitives.RangeDeps; +import accord.primitives.RoutingKeys; +import accord.primitives.TxnId; +import org.apache.cassandra.io.UnversionedSerializer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.service.accord.TokenRange; +import org.apache.cassandra.utils.NullableSerializer; + +import static accord.primitives.KeyDeps.SerializerSupport.keysToTxnIds; +import static accord.primitives.KeyDeps.SerializerSupport.keysToTxnIdsCount; +import static accord.primitives.RangeDeps.SerializerSupport.rangesToTxnIds; +import static accord.primitives.RangeDeps.SerializerSupport.rangesToTxnIdsCount; +import static org.apache.cassandra.db.TypeSizes.sizeofUnsignedVInt; + +public class DepsSerializers +{ + public static final UnversionedSerializer tokenRange; + public static final DepsSerializer deps; + public static final UnversionedSerializer nullableDeps; + public static final DepsSerializer partialDeps; + public static final UnversionedSerializer nullablePartialDeps; + + static + { + // We use a separate class for initialization to make it easier for BurnTest to plug its own serializers. + Impl serializers = new Impl((UnversionedSerializer) (UnversionedSerializer) TokenRange.serializer); + tokenRange = serializers.tokenRange; + deps = serializers.deps; + nullableDeps = serializers.nullableDeps; + partialDeps = serializers.partialDeps; + nullablePartialDeps = serializers.nullablePartialDeps; + } + + public static abstract class DepsSerializer implements UnversionedSerializer + { + protected UnversionedSerializer tokenRange; + public DepsSerializer(UnversionedSerializer tokenRange) + { + this.tokenRange = tokenRange; + } + + abstract D deserialize(KeyDeps keyDeps, RangeDeps rangeDeps, DataInputPlus in) throws IOException; + + @Override + public void serialize(D deps, DataOutputPlus out) throws IOException + { + { + KeyDeps keyDeps = deps.keyDeps; + KeySerializers.routingKeys.serialize(keyDeps.keys(), out); + int txnIdCount = keyDeps.txnIdCount(); + out.writeUnsignedVInt32(txnIdCount); + for (int i = 0; i < txnIdCount; i++) + CommandSerializers.txnId.serialize(keyDeps.txnId(i), out); + + int keysToTxnIdsCount = keysToTxnIdsCount(keyDeps); + out.writeUnsignedVInt32(keysToTxnIdsCount); + for (int i = 0; i < keysToTxnIdsCount; i++) + out.writeUnsignedVInt32(keysToTxnIds(keyDeps, i)); + } + { + RangeDeps rangeDeps = deps.rangeDeps; + int rangeCount = rangeDeps.rangeCount(); + out.writeUnsignedVInt32(rangeCount); + for (int i = 0; i < rangeCount; i++) + tokenRange.serialize(rangeDeps.range(i), out); + + int txnIdCount = rangeDeps.txnIdCount(); + out.writeUnsignedVInt32(txnIdCount); + for (int i = 0; i < txnIdCount; i++) + CommandSerializers.txnId.serialize(rangeDeps.txnId(i), out); + + int rangesToTxnIdsCount = rangesToTxnIdsCount(rangeDeps); + out.writeUnsignedVInt32(rangesToTxnIdsCount); + for (int i = 0; i < rangesToTxnIdsCount; i++) + out.writeUnsignedVInt32(rangesToTxnIds(rangeDeps, i)); + } + } + + @Override + public D deserialize(DataInputPlus in) throws IOException + { + KeyDeps keyDeps; + { + RoutingKeys keys = KeySerializers.routingKeys.deserialize(in); + int txnIdCount = in.readUnsignedVInt32(); + TxnId[] txnIds = new TxnId[txnIdCount]; + for (int i = 0; i < txnIdCount; i++) + txnIds[i] = CommandSerializers.txnId.deserialize(in); + + int keysToTxnIdsCount = in.readUnsignedVInt32(); + int[] keysToTxnIds = new int[keysToTxnIdsCount]; + for (int i = 0; i < keysToTxnIdsCount; i++) + keysToTxnIds[i] = in.readUnsignedVInt32(); + + keyDeps = KeyDeps.SerializerSupport.create(keys, txnIds, keysToTxnIds); + } + + RangeDeps rangeDeps; + { + int rangeCount = Ints.checkedCast(in.readUnsignedVInt32()); + Range[] ranges = new Range[rangeCount]; + for (int i = 0; i < rangeCount; i++) + ranges[i] = tokenRange.deserialize(in); + + int txnIdCount = in.readUnsignedVInt32(); + TxnId[] txnIds = new TxnId[txnIdCount]; + for (int i = 0; i < txnIdCount; i++) + txnIds[i] = CommandSerializers.txnId.deserialize(in); + + int rangesToTxnIdsCount = in.readUnsignedVInt32(); + int[] rangesToTxnIds = new int[rangesToTxnIdsCount]; + for (int i = 0; i < rangesToTxnIdsCount; i++) + rangesToTxnIds[i] = in.readUnsignedVInt32(); + + rangeDeps = RangeDeps.SerializerSupport.create(ranges, txnIds, rangesToTxnIds); + } + return deserialize(keyDeps, rangeDeps, in); + } + + @Override + public long serializedSize(D deps) + { + long size; + { + KeyDeps keyDeps = deps.keyDeps; + size = KeySerializers.routingKeys.serializedSize(deps.keyDeps.keys()); + int txnIdCount = keyDeps.txnIdCount(); + size += sizeofUnsignedVInt(txnIdCount); + for (int i = 0; i < txnIdCount; i++) + size += CommandSerializers.txnId.serializedSize(keyDeps.txnId(i)); + + int keysToTxnIdsCount = keysToTxnIdsCount(keyDeps); + size += sizeofUnsignedVInt(keysToTxnIdsCount); + for (int i = 0; i < keysToTxnIdsCount; i++) + size += sizeofUnsignedVInt(keysToTxnIds(keyDeps, i)); + } + + { + RangeDeps rangeDeps = deps.rangeDeps; + int rangeCount = rangeDeps.rangeCount(); + size += sizeofUnsignedVInt(rangeCount); + for (int i = 0; i < rangeCount; ++i) + size += tokenRange.serializedSize(rangeDeps.range(i)); + + int txnIdCount = rangeDeps.txnIdCount(); + size += sizeofUnsignedVInt(txnIdCount); + for (int i = 0; i < txnIdCount; i++) + size += CommandSerializers.txnId.serializedSize(rangeDeps.txnId(i)); + + int rangesToTxnIdsCount = rangesToTxnIdsCount(rangeDeps); + size += sizeofUnsignedVInt(rangesToTxnIdsCount); + for (int i = 0; i < rangesToTxnIdsCount; i++) + size += sizeofUnsignedVInt(rangesToTxnIds(rangeDeps, i)); + } + return size; + } + } + + @VisibleForTesting + public static class Impl + { + final UnversionedSerializer tokenRange; + final DepsSerializer deps; + final UnversionedSerializer nullableDeps; + final DepsSerializer partialDeps; + final UnversionedSerializer nullablePartialDeps; + + public Impl(UnversionedSerializer tokenRange) + { + this.tokenRange = tokenRange; + this.deps = new DepsSerializer<>(tokenRange) + { + @Override + Deps deserialize(KeyDeps keyDeps, RangeDeps rangeDeps, DataInputPlus in) + { + return new Deps(keyDeps, rangeDeps); + } + }; + this.nullableDeps = NullableSerializer.wrap(deps); + this.partialDeps = new DepsSerializer<>(tokenRange) + { + @Override + PartialDeps deserialize(KeyDeps keyDeps, RangeDeps rangeDeps, DataInputPlus in) throws IOException + { + Participants covering = KeySerializers.participants.deserialize(in); + return new PartialDeps(covering, keyDeps, rangeDeps); + } + + @Override + public void serialize(PartialDeps partialDeps, DataOutputPlus out) throws IOException + { + super.serialize(partialDeps, out); + KeySerializers.participants.serialize(partialDeps.covering, out); + } + + @Override + public long serializedSize(PartialDeps partialDeps) + { + return super.serializedSize(partialDeps) + + KeySerializers.participants.serializedSize(partialDeps.covering); + } + }; + + this.nullablePartialDeps = NullableSerializer.wrap(partialDeps); + } + } +} \ No newline at end of file diff --git a/src/java/org/apache/cassandra/service/accord/serializers/EncodeAsVInt32.java b/src/java/org/apache/cassandra/service/accord/serializers/EncodeAsVInt32.java new file mode 100644 index 000000000000..81cae8885f2f --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/serializers/EncodeAsVInt32.java @@ -0,0 +1,127 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.serializers; + +import java.io.IOException; +import java.util.function.IntFunction; +import java.util.function.ToIntFunction; + +import javax.annotation.Nullable; + +import com.google.common.primitives.Ints; + +import org.apache.cassandra.io.UnversionedSerializer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.utils.vint.VIntCoding; + +public abstract class EncodeAsVInt32 implements UnversionedSerializer +{ + /** + * Creates a serializer that uses vint to store the encoded value. + * + * Negative ints cause undefined behavior and are unsafe to use; this logic is only safe for 0 and posotive values + */ + public static EncodeAsVInt32 withNulls(ToIntFunction encode, IntFunction decode) + { + return new WithNulls<>(encode, decode); + } + + public static EncodeAsVInt32 withoutNulls(ToIntFunction encode, IntFunction decode) + { + return new WithoutNulls<>(encode, decode); + } + + public static > EncodeAsVInt32 of(Class clazz) + { + E[] values = clazz.getEnumConstants(); + return withNulls(Enum::ordinal, i -> values[i]); + } + + static class WithNulls extends EncodeAsVInt32 + { + private WithNulls(ToIntFunction encode, IntFunction decode) + { + super(encode, decode); + } + + @Override + int encode(@Nullable T t) + { + return t == null ? 0 : (1 + encode.applyAsInt(t)); + } + + @Override + T decode(long i) + { + return i == 0 ? null : decode.apply(Ints.checkedCast(i - 1)); + } + } + + static class WithoutNulls extends EncodeAsVInt32 + { + private WithoutNulls(ToIntFunction encode, IntFunction decode) + { + super(encode, decode); + } + + @Override + int encode(@Nullable T t) + { + return encode.applyAsInt(t); + } + + @Override + T decode(long i) + { + return decode.apply(Ints.checkedCast(i)); + } + } + + final ToIntFunction encode; + final IntFunction decode; + + abstract int encode(T t); + abstract T decode(long i); + + private EncodeAsVInt32(ToIntFunction encode, IntFunction decode) + { + this.encode = encode; + this.decode = decode; + } + + @Override + public void serialize(T t, DataOutputPlus out) throws IOException + { + out.writeUnsignedVInt32(encode(t)); + } + + @Override + public T deserialize(DataInputPlus in) throws IOException + { + // we read a long to ensure we are correct even if the underlying conversion may return -1 + return decode(in.readUnsignedVInt()); + } + + @Override + public long serializedSize(T t) + { + return VIntCoding.computeUnsignedVIntSize(encode(t)); + } +} diff --git a/src/java/org/apache/cassandra/service/accord/serializers/EnumSerializer.java b/src/java/org/apache/cassandra/service/accord/serializers/EnumSerializer.java new file mode 100644 index 000000000000..b8c77d3ecf3e --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/serializers/EnumSerializer.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.serializers; + +import java.io.IOException; + +import accord.messages.SimpleReply; +import org.apache.cassandra.db.TypeSizes; +import org.apache.cassandra.io.UnversionedSerializer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; + +public class EnumSerializer> implements UnversionedSerializer +{ + public static final EnumSerializer simpleReply = new EnumSerializer<>(SimpleReply.class); + + final E[] values; + + public EnumSerializer(Class clazz) + { + this.values = clazz.getEnumConstants(); + } + + public E forOrdinal(int ordinal) + { + return values[ordinal]; + } + + @Override + public void serialize(E t, DataOutputPlus out) throws IOException + { + out.writeUnsignedVInt32(t.ordinal()); + } + + @Override + public E deserialize(DataInputPlus in) throws IOException + { + return values[in.readUnsignedVInt32()]; + } + + @Override + public long serializedSize(E t) + { + return TypeSizes.sizeofUnsignedVInt(t.ordinal()); + } +} diff --git a/src/java/org/apache/cassandra/service/accord/serializers/FetchSerializers.java b/src/java/org/apache/cassandra/service/accord/serializers/FetchSerializers.java new file mode 100644 index 000000000000..7d45a6006de3 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/serializers/FetchSerializers.java @@ -0,0 +1,124 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.serializers; + +import java.io.IOException; + +import accord.api.Data; +import accord.impl.AbstractFetchCoordinator.FetchRequest; +import accord.impl.AbstractFetchCoordinator.FetchResponse; +import accord.messages.ReadData.CommitOrReadNack; +import accord.messages.ReadData.ReadReply; +import accord.primitives.Ranges; +import org.apache.cassandra.db.TypeSizes; +import org.apache.cassandra.io.UnversionedSerializer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.service.accord.AccordFetchCoordinator.AccordFetchRequest; +import org.apache.cassandra.service.accord.AccordFetchCoordinator.StreamData; +import org.apache.cassandra.service.accord.AccordFetchCoordinator.StreamingTxn; +import org.apache.cassandra.utils.CastingSerializer; + +import static org.apache.cassandra.utils.NullableSerializer.deserializeNullable; +import static org.apache.cassandra.utils.NullableSerializer.serializeNullable; +import static org.apache.cassandra.utils.NullableSerializer.serializedNullableSize; + +public class FetchSerializers +{ + public static final IVersionedSerializer request = new IVersionedSerializer<>() + { + @Override + public void serialize(FetchRequest request, DataOutputPlus out, Version version) throws IOException + { + out.writeUnsignedVInt(request.executeAtEpoch); + CommandSerializers.txnId.serialize(request.txnId, out); + KeySerializers.ranges.serialize((Ranges) request.scope, out); + DepsSerializers.partialDeps.serialize(request.partialDeps, out); + StreamingTxn.serializer.serialize(request.read, out, version); + } + + @Override + public FetchRequest deserialize(DataInputPlus in, Version version) throws IOException + { + return new AccordFetchRequest(in.readUnsignedVInt(), + CommandSerializers.txnId.deserialize(in), + KeySerializers.ranges.deserialize(in), + DepsSerializers.partialDeps.deserialize(in), + StreamingTxn.serializer.deserialize(in, version)); + } + + @Override + public long serializedSize(FetchRequest request, Version version) + { + return TypeSizes.sizeofUnsignedVInt(request.executeAtEpoch) + + CommandSerializers.txnId.serializedSize(request.txnId) + + KeySerializers.ranges.serializedSize((Ranges) request.scope) + + DepsSerializers.partialDeps.serializedSize(request.partialDeps) + + StreamingTxn.serializer.serializedSize(request.read, version); + } + }; + + public static final UnversionedSerializer reply = new UnversionedSerializer<>() + { + final CommitOrReadNack[] nacks = CommitOrReadNack.values(); + final UnversionedSerializer streamDataSerializer = CastingSerializer.create(StreamData.class, StreamData.serializer); + + @Override + public void serialize(ReadReply reply, DataOutputPlus out) throws IOException + { + if (!reply.isOk()) + { + out.writeByte(1 + ((CommitOrReadNack) reply).ordinal()); + return; + } + + out.writeByte(0); + FetchResponse response = (FetchResponse) reply; + serializeNullable(response.unavailable, out, KeySerializers.ranges); + serializeNullable(response.data, out, streamDataSerializer); + CommandSerializers.timestamp.serialize(response.safeToReadAfter, out); + } + + @Override + public ReadReply deserialize(DataInputPlus in) throws IOException + { + int id = in.readByte(); + if (id != 0) + return nacks[id - 1]; + + return new FetchResponse(deserializeNullable(in, KeySerializers.ranges), + deserializeNullable(in, streamDataSerializer), + CommandSerializers.timestamp.deserialize(in)); + } + + @Override + public long serializedSize(ReadReply reply) + { + if (!reply.isOk()) + return TypeSizes.BYTE_SIZE; + + FetchResponse response = (FetchResponse) reply; + return TypeSizes.BYTE_SIZE + + serializedNullableSize(response.unavailable, KeySerializers.ranges) + + serializedNullableSize(response.data, streamDataSerializer) + + CommandSerializers.timestamp.serializedSize(response.safeToReadAfter); + } + }; + +} diff --git a/src/java/org/apache/cassandra/service/accord/serializers/GetDurableBeforeSerializers.java b/src/java/org/apache/cassandra/service/accord/serializers/GetDurableBeforeSerializers.java new file mode 100644 index 000000000000..c314e51a792b --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/serializers/GetDurableBeforeSerializers.java @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.service.accord.serializers; + +import java.io.IOException; + +import accord.messages.GetDurableBefore; +import accord.messages.GetDurableBefore.DurableBeforeReply; +import org.apache.cassandra.io.UnversionedSerializer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; + +public class GetDurableBeforeSerializers +{ + public static final UnversionedSerializer request = new UnversionedSerializer() + { + @Override + public void serialize(GetDurableBefore msg, DataOutputPlus out) throws IOException + { + } + + @Override + public GetDurableBefore deserialize(DataInputPlus in) throws IOException + { + return new GetDurableBefore(); + } + + @Override + public long serializedSize(GetDurableBefore msg) + { + return 0; + } + }; + + public static final UnversionedSerializer reply = new UnversionedSerializer() + { + @Override + public void serialize(DurableBeforeReply msg, DataOutputPlus out) throws IOException + { + CommandStoreSerializers.durableBefore.serialize(msg.durableBeforeMap, out); + } + + @Override + public DurableBeforeReply deserialize(DataInputPlus in) throws IOException + { + return new DurableBeforeReply(CommandStoreSerializers.durableBefore.deserialize(in)); + } + + @Override + public long serializedSize(DurableBeforeReply msg) + { + return CommandStoreSerializers.durableBefore.serializedSize(msg.durableBeforeMap); + } + }; +} diff --git a/src/java/org/apache/cassandra/service/accord/serializers/GetEphmrlReadDepsSerializers.java b/src/java/org/apache/cassandra/service/accord/serializers/GetEphmrlReadDepsSerializers.java new file mode 100644 index 000000000000..9f90274a94f6 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/serializers/GetEphmrlReadDepsSerializers.java @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.serializers; + +import java.io.IOException; + +import accord.coordinate.ExecuteFlag.ExecuteFlags; +import accord.messages.GetEphemeralReadDeps; +import accord.messages.GetEphemeralReadDeps.GetEphemeralReadDepsOk; +import accord.primitives.Deps; +import accord.primitives.Route; +import accord.primitives.TxnId; +import org.apache.cassandra.db.TypeSizes; +import org.apache.cassandra.io.UnversionedSerializer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; + +public class GetEphmrlReadDepsSerializers +{ + public static final IVersionedSerializer request = new TxnRequestSerializer.WithUnsyncedSerializer() + { + @Override + public void serializeBody(GetEphemeralReadDeps msg, DataOutputPlus out, Version version) throws IOException + { + out.writeUnsignedVInt(msg.executionEpoch); + } + + @Override + public GetEphemeralReadDeps deserializeBody(DataInputPlus in, Version version, TxnId txnId, Route scope, long waitForEpoch, long minEpoch) throws IOException + { + long executionEpoch = in.readUnsignedVInt(); + return GetEphemeralReadDeps.SerializationSupport.create(txnId, scope, waitForEpoch, minEpoch, executionEpoch); + } + + @Override + public long serializedBodySize(GetEphemeralReadDeps msg, Version version) + { + return TypeSizes.sizeofUnsignedVInt(msg.executionEpoch); + } + }; + + public static final UnversionedSerializer reply = new UnversionedSerializer() + { + @Override + public void serialize(GetEphemeralReadDepsOk reply, DataOutputPlus out) throws IOException + { + DepsSerializers.deps.serialize(reply.deps, out); + out.writeUnsignedVInt(reply.latestEpoch); + out.writeUnsignedVInt32(reply.flags.bits()); + } + + @Override + public GetEphemeralReadDepsOk deserialize(DataInputPlus in) throws IOException + { + Deps deps = DepsSerializers.deps.deserialize(in); + long latestEpoch = in.readUnsignedVInt(); + ExecuteFlags flags = ExecuteFlags.get(in.readUnsignedVInt32()); + return new GetEphemeralReadDepsOk(deps, latestEpoch, flags); + } + + @Override + public long serializedSize(GetEphemeralReadDepsOk reply) + { + return DepsSerializers.deps.serializedSize(reply.deps) + + TypeSizes.sizeofUnsignedVInt(reply.latestEpoch) + + TypeSizes.sizeofUnsignedVInt(reply.flags.bits()); + } + }; +} diff --git a/src/java/org/apache/cassandra/service/accord/serializers/GetMaxConflictSerializers.java b/src/java/org/apache/cassandra/service/accord/serializers/GetMaxConflictSerializers.java new file mode 100644 index 000000000000..eed742e24778 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/serializers/GetMaxConflictSerializers.java @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.serializers; + +import java.io.IOException; + +import accord.messages.GetMaxConflict; +import accord.messages.GetMaxConflict.GetMaxConflictOk; +import accord.primitives.Route; +import accord.primitives.Timestamp; +import org.apache.cassandra.db.TypeSizes; +import org.apache.cassandra.io.UnversionedSerializer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; + +public class GetMaxConflictSerializers +{ + public static final UnversionedSerializer request = new UnversionedSerializer<>() + { + @Override + public void serialize(GetMaxConflict msg, DataOutputPlus out) throws IOException + { + KeySerializers.route.serialize(msg.scope, out); + out.writeUnsignedVInt(msg.waitForEpoch); + out.writeUnsignedVInt(msg.minEpoch); + out.writeUnsignedVInt(msg.executionEpoch); + } + + @Override + public GetMaxConflict deserialize(DataInputPlus in) throws IOException + { + Route scope = KeySerializers.route.deserialize(in); + long waitForEpoch = in.readUnsignedVInt(); + long minEpoch = in.readUnsignedVInt(); + long executionEpoch = in.readUnsignedVInt(); + return GetMaxConflict.SerializationSupport.create(scope, waitForEpoch, minEpoch, executionEpoch); + } + + @Override + public long serializedSize(GetMaxConflict msg) + { + return KeySerializers.route.serializedSize(msg.scope()) + + TypeSizes.sizeofUnsignedVInt(msg.waitForEpoch) + + TypeSizes.sizeofUnsignedVInt(msg.minEpoch) + + TypeSizes.sizeofUnsignedVInt(msg.executionEpoch); + } + }; + + public static final UnversionedSerializer reply = new UnversionedSerializer<>() + { + @Override + public void serialize(GetMaxConflictOk reply, DataOutputPlus out) throws IOException + { + CommandSerializers.timestamp.serialize(reply.maxConflict, out); + out.writeUnsignedVInt(reply.latestEpoch); + } + + @Override + public GetMaxConflictOk deserialize(DataInputPlus in) throws IOException + { + Timestamp maxConflict = CommandSerializers.timestamp.deserialize(in); + long latestEpoch = in.readUnsignedVInt(); + return new GetMaxConflictOk(maxConflict, latestEpoch); + } + + @Override + public long serializedSize(GetMaxConflictOk reply) + { + return CommandSerializers.timestamp.serializedSize(reply.maxConflict) + + TypeSizes.sizeofUnsignedVInt(reply.latestEpoch); + } + }; +} diff --git a/src/java/org/apache/cassandra/service/accord/serializers/IVersionedSerializer.java b/src/java/org/apache/cassandra/service/accord/serializers/IVersionedSerializer.java new file mode 100644 index 000000000000..58f775e636f1 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/serializers/IVersionedSerializer.java @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.serializers; + +import java.io.IOException; + +import org.apache.cassandra.io.UnversionedSerializer; +import org.apache.cassandra.io.VersionedSerializer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; + +public interface IVersionedSerializer extends VersionedSerializer +{ + static IVersionedSerializer fromMessaging(org.apache.cassandra.io.IVersionedSerializer delegate) + { + return new IVersionedSerializer() + { + @Override + public void serialize(T t, DataOutputPlus out, Version version) throws IOException + { + delegate.serialize(t, out, version.messageVersion()); + } + + @Override + public T deserialize(DataInputPlus in, Version version) throws IOException + { + return delegate.deserialize(in, version.messageVersion()); + } + + @Override + public long serializedSize(T t, Version version) + { + return delegate.serializedSize(t, version.messageVersion()); + } + }; + } + + static IVersionedSerializer fromSerializer(UnversionedSerializer delegate) + { + return new IVersionedSerializer() + { + @Override + public void serialize(T t, DataOutputPlus out, Version version) throws IOException + { + delegate.serialize(t, out); + } + + @Override + public T deserialize(DataInputPlus in, Version version) throws IOException + { + return delegate.deserialize(in); + } + + @Override + public long serializedSize(T t, Version version) + { + return delegate.serializedSize(t); + } + }; + } +} diff --git a/src/java/org/apache/cassandra/service/accord/serializers/IVersionedWithKeysSerializer.java b/src/java/org/apache/cassandra/service/accord/serializers/IVersionedWithKeysSerializer.java new file mode 100644 index 000000000000..d9b9f18005b7 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/serializers/IVersionedWithKeysSerializer.java @@ -0,0 +1,334 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.serializers; + +import java.io.IOException; +import java.util.function.BiFunction; +import java.util.function.IntFunction; + +import accord.api.Key; +import accord.api.RoutingKey; +import accord.primitives.AbstractKeys; +import accord.primitives.AbstractRanges; +import accord.primitives.AbstractUnseekableKeys; +import accord.primitives.Keys; +import accord.primitives.Range; +import accord.primitives.Ranges; +import accord.primitives.Routable; +import accord.primitives.RoutableKey; +import accord.primitives.Routables; +import accord.primitives.RoutingKeys; +import accord.utils.UnhandledEnum; +import net.nicoulaj.compilecommand.annotations.DontInline; +import net.nicoulaj.compilecommand.annotations.Inline; +import org.apache.cassandra.db.TypeSizes; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; + +import static accord.utils.SortedArrays.Search.FAST; + +/** + * De/serialize a structure that can refer to a known superset of RoutingKeys/Keys/Ranges... + */ +public interface IVersionedWithKeysSerializer, T> extends IVersionedSerializer +{ + /** + * Serialize the specified type into the specified DataOutputStream instance. + * + * @param t type that needs to be serialized + * @param out DataOutput into which serialization needs to happen. + * @param version protocol version + * @throws IOException if serialization fails + */ + void serialize(K keys, T t, DataOutputPlus out, Version version) throws IOException; + + /** + * Deserialize into the specified DataInputStream instance. + * @param in DataInput from which deserialization needs to happen. + * @param version protocol version + * @return the type that was deserialized + * @throws IOException if deserialization fails + */ + T deserialize(K keys, DataInputPlus in, Version version) throws IOException; + + /** + * Calculate serialized size of object without actually serializing. + * @param t object to calculate serialized size + * @param version protocol version + * @return serialized size of object t + */ + long serializedSize(K keys, T t, Version version); + + abstract class AbstractWithKeysSerializer + { + /** + * If both ends have a pre-shared superset of the columns we are serializing, we can send them much + * more efficiently. Both ends must provide the identically same set of columns. + */ + protected void serializeSubsetInternal(Routables serialize, Routables superset, DataOutputPlus out) throws IOException + { + /** + * We weight this towards small sets, and sets where the majority of items are present, since + * we expect this to mostly be used for serializing result sets. + * + * For supersets with fewer than 64 columns, we encode a bitmap of *missing* columns, + * which equates to a zero (single byte) when all columns are present, and otherwise + * a positive integer that can typically be vint encoded efficiently. + * + * If we have 64 or more columns, we cannot neatly perform a bitmap encoding, so we just switch + * to a vint encoded set of deltas, either adding or subtracting (whichever is most efficient). + * We indicate this switch by sending our bitmap with every bit set, i.e. -1L + */ + int serializeCount = serialize.size(); + int supersetCount = superset.size(); + if (serializeCount == supersetCount) + { + out.writeUnsignedVInt(0L); + } + else if (supersetCount < 64) + { + switch (serialize.domainKind()) + { + default: throw UnhandledEnum.unknown(serialize.domainKind()); + case SeekableKey: + out.writeUnsignedVInt(encodeBitmap((Keys)serialize, (Keys)superset, supersetCount)); + break; + case UnseekableKey: + out.writeUnsignedVInt(encodeBitmap((AbstractUnseekableKeys)serialize, (AbstractUnseekableKeys)superset, supersetCount)); + break; + case Range: + out.writeUnsignedVInt(encodeBitmap((AbstractRanges)serialize, (AbstractRanges)superset, supersetCount)); + break; + } + } + else + { + switch (serialize.domainKind()) + { + default: throw UnhandledEnum.unknown(serialize.domainKind()); + case SeekableKey: + serializeLargeSubset((Keys)serialize, serializeCount, (Keys)superset, supersetCount, out); + break; + case UnseekableKey: + serializeLargeSubset((AbstractUnseekableKeys)serialize, serializeCount, (AbstractUnseekableKeys)superset, supersetCount, out); + break; + case Range: + serializeLargeSubset((AbstractRanges)serialize, serializeCount, (AbstractRanges)superset, supersetCount, out); + break; + } + } + } + + protected long serializedSubsetSizeInternal(Routables serialize, Routables superset) + { + int columnCount = serialize.size(); + int supersetCount = superset.size(); + if (columnCount == supersetCount) + { + return TypeSizes.sizeofUnsignedVInt(0); + } + else if (supersetCount < 64) + { + switch (serialize.domainKind()) + { + default: throw UnhandledEnum.unknown(serialize.domainKind()); + case SeekableKey: + return TypeSizes.sizeofUnsignedVInt(encodeBitmap((Keys)serialize, (Keys)superset, supersetCount)); + case UnseekableKey: + return TypeSizes.sizeofUnsignedVInt(encodeBitmap((AbstractUnseekableKeys)serialize, (AbstractUnseekableKeys)superset, supersetCount)); + case Range: + return TypeSizes.sizeofUnsignedVInt(encodeBitmap((AbstractRanges)serialize, (AbstractRanges)superset, supersetCount)); + } + } + else + { + switch (serialize.domainKind()) + { + default: throw UnhandledEnum.unknown(serialize.domainKind()); + case SeekableKey: + return serializeLargeSubsetSize((Keys)serialize, columnCount, (Keys)superset, supersetCount); + case UnseekableKey: + return serializeLargeSubsetSize((AbstractUnseekableKeys)serialize, columnCount, (AbstractUnseekableKeys)superset, supersetCount); + case Range: + return serializeLargeSubsetSize((AbstractRanges)serialize, columnCount, (AbstractRanges)superset, supersetCount); + } + } + } + + @DontInline + private > long serializeLargeSubsetSize(R serialize, int serializeCount, R superset, int supersetCount) + { + long size = TypeSizes.sizeofUnsignedVInt(supersetCount - serializeCount); + if (serializeCount == 0) return size; + int prevSupersetIndex = 0; + int supersetIndex = 0; + int take = 0; + for (int i = 0; i < serializeCount; i++) + { + int offset = supersetIndex + take; + int nextIndex = superset.findNext(offset, serialize.get(i), FAST); + if (nextIndex == offset) + { + take++; + continue; + } + if (take != 0) // since this is dealing with subsets, the only time take=0 is when i=0 and the first superset offset isn't included + { + size += TypeSizes.sizeofUnsignedVInt(take); + size += TypeSizes.sizeofUnsignedVInt(supersetIndex - prevSupersetIndex); + prevSupersetIndex = supersetIndex; + } + + supersetIndex = nextIndex; + take = 1; + } + size += TypeSizes.sizeofUnsignedVInt(take); + size += TypeSizes.sizeofUnsignedVInt(supersetIndex - prevSupersetIndex); + return size; + } + + // encodes a 1 bit for every *missing* column, on the assumption presence is more common, + // and because this is consistent with encoding 0 to represent all present + private static long encodeBitmap(AbstractKeys serialize, AbstractKeys superset, int supersetCount) + { + // the index we would encounter next if all columns are present + long bitmap = superset.foldl(serialize, (k, p1, v, i) -> { + return v | (1L << i); + }, 0L, 0L, -1L); + bitmap ^= -1L >>> (64 - supersetCount); + return bitmap; + } + + private static long encodeBitmap(AbstractRanges serialize, AbstractRanges superset, int supersetCount) + { + // the index we would encounter next if all columns are present + long bitmap = superset.foldl(serialize, (k, p1, v, i) -> { + return v | (1L << i); + }, 0L, 0L, -1L); + bitmap ^= -1L >>> (64 - supersetCount); + return bitmap; + } + + @DontInline + private > void serializeLargeSubset(R serialize, int serializeCount, + R superset, int supersetCount, + DataOutputPlus out) throws IOException + { + out.writeUnsignedVInt32(supersetCount - serializeCount); + if (serializeCount == 0) return; + int prevSupersetIndex = 0; + int supersetIndex = 0; + int take = 0; + for (int i = 0; i < serializeCount; i++) + { + int offset = supersetIndex + take; + int nextIndex = superset.findNext(offset, serialize.get(i), FAST); + if (nextIndex == offset) + { + take++; + continue; + } + if (take != 0) + { + out.writeUnsignedVInt32(take); + out.writeUnsignedVInt32(supersetIndex - prevSupersetIndex); + prevSupersetIndex = supersetIndex; + } + + supersetIndex = nextIndex; + take = 1; + } + out.writeUnsignedVInt32(take); + out.writeUnsignedVInt32(supersetIndex - prevSupersetIndex); + } + + public Routables deserializeSubsetInternal(Routables superset, DataInputPlus in) throws IOException + { + switch (superset.domainKind()) + { + default: throw UnhandledEnum.unknown(superset.domainKind()); + case SeekableKey: return deserializeSubset((Keys) superset, in, (ks, s) -> ks == null ? s : Keys.of(ks), Key[]::new); + case UnseekableKey: return deserializeSubset((AbstractUnseekableKeys) superset, in, (ks, s) -> ks == null ? s : RoutingKeys.of(ks), RoutingKey[]::new); + case Range: return deserializeSubset((AbstractRanges) superset, in, (rs, s) -> rs == null ? s : Ranges.of(rs), Range[]::new); + } + } + + public , T> T deserializeSubset(R superset, DataInputPlus in, BiFunction result, IntFunction allocator) throws IOException + { + long encoded = in.readUnsignedVInt(); + int supersetCount = superset.size(); + if (encoded == 0L) + return result.apply(null, superset); + else if (supersetCount >= 64) + return result.apply(deserializeLargeSubset(in, superset, supersetCount, (int) encoded, allocator), superset); + else + return result.apply(deserializeSmallSubsetArray(encoded, superset, supersetCount, allocator), superset); + } + + @Inline + private T[] deserializeLargeSubset(DataInputPlus in, Routables superset, int supersetCount, int delta, IntFunction allocator) throws IOException + { + int deserializeCount = supersetCount - delta; + T[] out = allocator.apply(deserializeCount); + int count = 0; + int prevSupersetIndex = 0; + while (count < deserializeCount) + { + int take = in.readUnsignedVInt32(); + int supersetIndex = in.readUnsignedVInt32() + prevSupersetIndex; + prevSupersetIndex = supersetIndex; + for (int i = 0; i < take; i++) + out[count++] = superset.get(supersetIndex + i); + } + return out; + } + + private K[] deserializeSmallSubsetArray(long encoded, Routables superset, int supersetCount, IntFunction allocator) + { + encoded ^= -1L >>> (64 - supersetCount); + int deserializeCount = Long.bitCount(encoded); + K[] out = allocator.apply(deserializeCount); + int count = 0; + while (encoded != 0) + { + long lowestBit = Long.lowestOneBit(encoded); + out[count++] = superset.get(Long.numberOfTrailingZeros(lowestBit)); + encoded ^= lowestBit; + } + return out; + } + + public void skipSubsetInternal(int supersetCount, DataInputPlus in) throws IOException + { + long encoded = in.readUnsignedVInt(); + if (encoded == 0 || supersetCount < 64) return; + // large + int deserializeCount = supersetCount - ((int) encoded); + int count = 0; + while (count < deserializeCount) + { + int take = in.readUnsignedVInt32(); + in.readUnsignedVInt32(); + for (int i = 0; i < take; i++) + count++; + } + } + } + +} diff --git a/src/java/org/apache/cassandra/service/accord/serializers/InformDurableSerializers.java b/src/java/org/apache/cassandra/service/accord/serializers/InformDurableSerializers.java new file mode 100644 index 000000000000..a7d181e70558 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/serializers/InformDurableSerializers.java @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.serializers; + +import java.io.IOException; + +import accord.messages.InformDurable; +import accord.primitives.Route; +import accord.primitives.Status; +import accord.primitives.Timestamp; +import accord.primitives.TxnId; +import org.apache.cassandra.db.TypeSizes; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; + +public class InformDurableSerializers +{ + public static final IVersionedSerializer request = new TxnRequestSerializer() + { + @Override + public void serializeBody(InformDurable msg, DataOutputPlus out, Version version) throws IOException + { + out.writeVInt(msg.minEpoch - msg.waitForEpoch); + out.writeVInt(msg.maxEpoch - msg.waitForEpoch); + CommandSerializers.timestamp.serialize(msg.executeAt, out); + CommandSerializers.durability.serialize(msg.durability, out); + } + + @Override + public InformDurable deserializeBody(DataInputPlus in, Version version, TxnId txnId, Route scope, long waitForEpoch) throws IOException + { + long minEpoch = waitForEpoch + in.readVInt(); + long maxEpoch = waitForEpoch + in.readVInt(); + Timestamp executeAt = CommandSerializers.timestamp.deserialize(in); + Status.Durability durability = CommandSerializers.durability.deserialize(in); + return InformDurable.SerializationSupport.create(txnId, scope, executeAt, minEpoch, waitForEpoch, maxEpoch, durability); + } + + @Override + public long serializedBodySize(InformDurable msg, Version version) + { + return TypeSizes.sizeofVInt(msg.minEpoch - msg.waitForEpoch) + + TypeSizes.sizeofVInt(msg.maxEpoch - msg.waitForEpoch) + + CommandSerializers.timestamp.serializedSize(msg.executeAt) + + CommandSerializers.durability.serializedSize(msg.durability); + } + }; +} diff --git a/src/java/org/apache/cassandra/service/accord/serializers/KeySerializers.java b/src/java/org/apache/cassandra/service/accord/serializers/KeySerializers.java new file mode 100644 index 000000000000..3ab41a74df57 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/serializers/KeySerializers.java @@ -0,0 +1,1137 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.serializers; + +import java.io.IOException; +import java.util.Objects; +import java.util.function.Function; +import java.util.function.IntFunction; + +import com.google.common.annotations.VisibleForTesting; + +import accord.api.Key; +import accord.api.RoutingKey; +import accord.primitives.AbstractKeys; +import accord.primitives.AbstractRanges; +import accord.primitives.AbstractUnseekableKeys; +import accord.primitives.FullKeyRoute; +import accord.primitives.FullRangeRoute; +import accord.primitives.FullRoute; +import accord.primitives.KeyRoute; +import accord.primitives.Keys; +import accord.primitives.PartialKeyRoute; +import accord.primitives.PartialRangeRoute; +import accord.primitives.PartialRoute; +import accord.primitives.Participants; +import accord.primitives.Range; +import accord.primitives.RangeRoute; +import accord.primitives.Ranges; +import accord.primitives.Routable; +import accord.primitives.RoutableKey; +import accord.primitives.Routables; +import accord.primitives.Route; +import accord.primitives.RoutingKeys; +import accord.primitives.Seekable; +import accord.primitives.Seekables; +import accord.primitives.Unseekables; +import accord.primitives.Unseekables.UnseekablesKind; +import accord.utils.Invariants; +import accord.utils.TinyEnumSet; +import accord.utils.UnhandledEnum; +import org.apache.cassandra.db.TypeSizes; +import org.apache.cassandra.io.UnversionedSerializer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.service.accord.TokenRange; +import org.apache.cassandra.service.accord.api.AccordRoutableKey.AccordKeySerializer; +import org.apache.cassandra.service.accord.api.AccordRoutableKey.AccordSearchableKeySerializer; +import org.apache.cassandra.service.accord.api.TokenKey; +import org.apache.cassandra.service.accord.api.PartitionKey; +import org.apache.cassandra.utils.NullableSerializer; + +import static accord.utils.ArrayBuffers.cachedInts; + +public class KeySerializers +{ + public static final AccordKeySerializer key; + public static final AccordSearchableKeySerializer routingKey; + + public static final UnversionedSerializer nullableRoutingKey; + public static final AbstractSearchableRoutingKeysSerializer routingKeys; + public static final UnversionedSerializer keys; + + public static final AbstractSearchableRoutingKeysSerializer partialKeyRoute; + public static final AbstractSearchableRoutingKeysSerializer fullKeyRoute; + + public static final UnversionedSerializer range; + public static final AbstractRangesSerializer ranges; + public static final AbstractRangesSerializer partialRangeRoute; + public static final AbstractRangesSerializer fullRangeRoute; + + public static final AbstractRoutablesSerializer> route; + public static final UnversionedSerializer> nullableRoute; + public static final UnversionedSerializer> partialRoute; + + public static final AbstractRoutablesSerializer> fullRoute; + public static final UnversionedSerializer> seekables; + public static final UnversionedSerializer> nullableFullRoute; + public static final AbstractRoutablesSerializer> unseekables; + public static final AbstractRoutablesSerializer> participants; + public static final UnversionedSerializer> nullableParticipants; + + static + { + Impl impl = new Impl(); + key = impl.key; + routingKey = impl.routingKey; + + nullableRoutingKey = impl.nullableRoutingKey; + routingKeys = impl.routingKeys; + keys = impl.keys; + + partialKeyRoute = impl.partialKeyRoute; + fullKeyRoute = impl.fullKeyRoute; + + range = impl.range; + ranges = impl.ranges; + partialRangeRoute = impl.partialRangeRoute; + fullRangeRoute = impl.fullRangeRoute; + + route = impl.route; + nullableRoute = impl.nullableRoute; + partialRoute = impl.partialRoute; + + fullRoute = impl.fullRoute; + seekables = impl.seekables; + nullableFullRoute = impl.nullableFullRoute; + unseekables = impl.unseekables; + participants = impl.participants; + nullableParticipants = impl.nullableParticipants; + } + + public static class Impl + { + final AccordKeySerializer key; + final AccordSearchableKeySerializer routingKey; + + final UnversionedSerializer nullableRoutingKey; + final AbstractSearchableRoutingKeysSerializer routingKeys; + final UnversionedSerializer keys; + + final AbstractSearchableRoutingKeysSerializer partialKeyRoute; + final AbstractSearchableRoutingKeysSerializer fullKeyRoute; + + final UnversionedSerializer range; + final AbstractRangesSerializer ranges; + final AbstractRangesSerializer partialRangeRoute; + final AbstractRangesSerializer fullRangeRoute; + + final AbstractRoutablesSerializer> route; + final UnversionedSerializer> nullableRoute; + final UnversionedSerializer> partialRoute; + + final AbstractRoutablesSerializer> fullRoute; + final AbstractSeekablesSerializer seekables; + final UnversionedSerializer> nullableFullRoute; + final AbstractRoutablesSerializer> unseekables; + final AbstractRoutablesSerializer> participants; + final UnversionedSerializer> nullableParticipants; + private Impl() + { + this((AccordKeySerializer) (AccordKeySerializer) PartitionKey.serializer, + (AccordSearchableKeySerializer) (AccordSearchableKeySerializer) TokenKey.serializer, + (UnversionedSerializer) (UnversionedSerializer) TokenRange.serializer); + } + + @VisibleForTesting + public Impl(AccordKeySerializer key, + AccordSearchableKeySerializer routingKey, + UnversionedSerializer range) + { + this.key = key; + this.routingKey = routingKey; + this.range = range; + + this.nullableRoutingKey = NullableSerializer.wrap(routingKey); + this.routingKeys = new AbstractSearchableRoutingKeysSerializer<>(routingKey) + { + @Override RoutingKeys deserialize(DataInputPlus in, RoutingKey[] keys) + { + return RoutingKeys.SerializationSupport.create(keys); + } + }; + + this.keys = new AbstractKeysSerializer<>(key, Key[]::new) + { + @Override Keys deserialize(DataInputPlus in, Key[] keys) + { + return Keys.SerializationSupport.create(keys); + } + }; + + this.partialKeyRoute = new AbstractKeyRouteSerializer<>(routingKey) + { + @Override + PartialKeyRoute construct(RoutingKey homeKey, RoutingKey[] keys) + { + return PartialKeyRoute.SerializationSupport.create(homeKey, keys); + } + }; + + this.fullKeyRoute = new AbstractKeyRouteSerializer<>(routingKey) + { + @Override + FullKeyRoute construct(RoutingKey homeKey, RoutingKey[] keys) + { + return FullKeyRoute.SerializationSupport.create(homeKey, keys); + } + }; + + this.ranges = new AbstractRangesSerializer<>() + { + @Override + public Ranges deserialize(DataInputPlus in, Range[] ranges) + { + return Ranges.ofSortedAndDeoverlapped(ranges); + } + }; + + this.partialRangeRoute = new AbstractRangeRouteSerializer<>() + { + @Override + PartialRangeRoute construct(RoutingKey homeKey, Range[] rs) + { + return PartialRangeRoute.SerializationSupport.create(homeKey, rs); + } + }; + + this.fullRangeRoute = new AbstractRangeRouteSerializer<>() + { + @Override + FullRangeRoute construct(RoutingKey homeKey, Range[] Ranges) + { + return FullRangeRoute.SerializationSupport.create(homeKey, Ranges); + } + }; + + Function, AbstractRoutablesSerializer> factory = (a) -> new AbstractRoutablesSerializer<>(a, routingKeys, partialKeyRoute, fullKeyRoute, ranges, partialRangeRoute, fullRangeRoute); + + this.route = (AbstractRoutablesSerializer>) factory.apply(TinyEnumSet.of(UnseekablesKind.PartialKeyRoute, UnseekablesKind.FullKeyRoute, UnseekablesKind.PartialRangeRoute, UnseekablesKind.FullRangeRoute)); + this.nullableRoute = NullableSerializer.wrap(route); + + this.partialRoute = (AbstractRoutablesSerializer>) factory.apply(TinyEnumSet.of(UnseekablesKind.PartialKeyRoute, UnseekablesKind.PartialRangeRoute)); + this.fullRoute = (AbstractRoutablesSerializer>) factory.apply(TinyEnumSet.of(UnseekablesKind.FullKeyRoute, UnseekablesKind.FullRangeRoute)); + this.nullableFullRoute = NullableSerializer.wrap(fullRoute); + + this.unseekables = (AbstractRoutablesSerializer>) factory.apply(TinyEnumSet.allOf(UnseekablesKind.class)); + this.participants = (AbstractRoutablesSerializer>) factory.apply(TinyEnumSet.allOf(UnseekablesKind.class)); + + this.nullableParticipants = NullableSerializer.wrap(participants); + this.seekables = new AbstractSeekablesSerializer(keys, ranges); + } + } + + public static class AbstractRoutablesSerializer> implements UnversionedSerializer + { + final TinyEnumSet permitted; + final AbstractSearchableRoutingKeysSerializer routingKeys; + final AbstractSearchableRoutingKeysSerializer partialKeyRoute; + final AbstractSearchableRoutingKeysSerializer fullKeyRoute; + final AbstractRangesSerializer ranges; + final AbstractRangesSerializer partialRangeRoute; + final AbstractRangesSerializer fullRangeRoute; + + protected AbstractRoutablesSerializer(TinyEnumSet permitted, + AbstractSearchableRoutingKeysSerializer routingKeys, + AbstractSearchableRoutingKeysSerializer partialKeyRoute, + AbstractSearchableRoutingKeysSerializer fullKeyRoute, + AbstractRangesSerializer ranges, + AbstractRangesSerializer partialRangeRoute, + AbstractRangesSerializer fullRangeRoute) + { + this.permitted = permitted; + this.routingKeys = routingKeys; + this.partialKeyRoute = partialKeyRoute; + this.fullKeyRoute = fullKeyRoute; + this.ranges = ranges; + this.partialRangeRoute = partialRangeRoute; + this.fullRangeRoute = fullRangeRoute; + } + + @Override + public void serialize(RS t, DataOutputPlus out) throws IOException + { + UnseekablesKind kind = t.kind(); + Invariants.requireArgument(permitted.contains(kind)); + + switch (kind) + { + default: throw new AssertionError(); + case RoutingKeys: + out.writeByte(1); + routingKeys.serialize((RoutingKeys)t, out); + break; + case PartialKeyRoute: + out.writeByte(2); + partialKeyRoute.serialize((PartialKeyRoute)t, out); + break; + case FullKeyRoute: + out.writeByte(3); + fullKeyRoute.serialize((FullKeyRoute)t, out); + break; + case RoutingRanges: + out.writeByte(4); + ranges.serialize((Ranges)t, out); + break; + case PartialRangeRoute: + out.writeByte(5); + partialRangeRoute.serialize((PartialRangeRoute)t, out); + break; + case FullRangeRoute: + out.writeByte(6); + fullRangeRoute.serialize((FullRangeRoute)t, out); + break; + } + } + + public void serializeSubset(RS t, Unseekables superset, DataOutputPlus out) throws IOException + { + UnseekablesKind kind = t.kind(); + Invariants.requireArgument(permitted.contains(kind)); + + switch (kind) + { + default: throw new AssertionError(); + case RoutingKeys: + out.writeByte(1); + routingKeys.serializeSubset((RoutingKeys)t, superset, out); + break; + case PartialKeyRoute: + out.writeByte(2); + partialKeyRoute.serializeSubset((PartialKeyRoute)t, superset, out); + break; + case FullKeyRoute: + out.writeByte(3); + fullKeyRoute.serializeSubset((FullKeyRoute)t, superset, out); + break; + case RoutingRanges: + out.writeByte(4); + ranges.serializeSubset((Ranges)t, superset, out); + break; + case PartialRangeRoute: + out.writeByte(5); + partialRangeRoute.serializeSubset((PartialRangeRoute)t, superset, out); + break; + case FullRangeRoute: + out.writeByte(6); + fullRangeRoute.serializeSubset((FullRangeRoute)t, superset, out); + break; + } + } + + @Override + public RS deserialize(DataInputPlus in) throws IOException + { + byte b = in.readByte(); + UnseekablesKind kind; + RS result; + switch (b) + { + default: throw new IOException("Corrupted input: expected byte 1, 2, 3, 4, 5 or 6; received " + b); + case 1: kind = UnseekablesKind.RoutingKeys; result = (RS)routingKeys.deserialize(in); break; + case 2: kind = UnseekablesKind.PartialKeyRoute; result = (RS)partialKeyRoute.deserialize(in); break; + case 3: kind = UnseekablesKind.FullKeyRoute; result = (RS)fullKeyRoute.deserialize(in); break; + case 4: kind = UnseekablesKind.RoutingRanges; result = (RS)ranges.deserialize(in); break; + case 5: kind = UnseekablesKind.PartialRangeRoute; result = (RS)partialRangeRoute.deserialize(in); break; + case 6: kind = UnseekablesKind.FullRangeRoute; result = (RS)fullRangeRoute.deserialize(in); break; + } + Invariants.require(permitted.contains(kind)); + return result; + } + + public RS deserializeSubset(Unseekables superset, DataInputPlus in) throws IOException + { + byte b = in.readByte(); + UnseekablesKind kind; + RS result; + switch (b) + { + default: throw new IOException("Corrupted input: expected byte 1, 2, 3, 4 or 5; received " + b); + case 1: kind = UnseekablesKind.RoutingKeys; result = (RS)routingKeys.deserializeSubset((AbstractUnseekableKeys) superset, in); break; + case 2: kind = UnseekablesKind.PartialKeyRoute; result = (RS)partialKeyRoute.deserializeSubset((AbstractUnseekableKeys) superset, in); break; + case 3: kind = UnseekablesKind.FullKeyRoute; result = (RS)fullKeyRoute.deserializeSubset((AbstractUnseekableKeys) superset, in); break; + case 4: kind = UnseekablesKind.RoutingRanges; result = (RS)ranges.deserializeSubset((AbstractRanges) superset, in); break; + case 5: kind = UnseekablesKind.PartialRangeRoute; result = (RS)partialRangeRoute.deserializeSubset((AbstractRanges) superset, in); break; + case 6: kind = UnseekablesKind.FullRangeRoute; result = (RS)fullRangeRoute.deserializeSubset((AbstractRanges) superset, in); break; + } + Invariants.require(permitted.contains(kind)); + return result; + } + + public void skip(DataInputPlus in) throws IOException + { + countAndSkip(in); + } + + public void skip(UnseekablesKind kind, DataInputPlus in) throws IOException + { + countAndSkip(kind, in); + } + + // return number of elements skipped + public int countAndSkip(DataInputPlus in) throws IOException + { + byte b = in.readByte(); + switch (b) + { + default: throw new IOException("Corrupted input: expected byte 1, 2, 3, 4 or 5; received " + b); + case 1: return routingKeys.countAndSkip(in); + case 2: return partialKeyRoute.countAndSkip(in); + case 3: return fullKeyRoute.countAndSkip(in); + case 4: return ranges.countAndSkip(in); + case 5: return partialRangeRoute.countAndSkip(in); + case 6: return fullRangeRoute.countAndSkip(in); + } + } + + public int countAndSkip(UnseekablesKind kind, DataInputPlus in) throws IOException + { + switch (kind) + { + default: throw UnhandledEnum.unknown(kind); + case RoutingKeys: return routingKeys.countAndSkip(in); + case PartialKeyRoute: return partialKeyRoute.countAndSkip(in); + case FullKeyRoute: return fullKeyRoute.countAndSkip(in); + case RoutingRanges: return ranges.countAndSkip(in); + case PartialRangeRoute: return partialRangeRoute.countAndSkip(in); + case FullRangeRoute: return fullRangeRoute.countAndSkip(in); + } + } + + public Unseekables.UnseekablesKind readKind(DataInputPlus in) throws IOException + { + byte b = in.readByte(); + switch (b) + { + default: throw new IOException("Corrupted input: expected byte 1, 2, 3, 4 or 5; received " + b); + case 1: return UnseekablesKind.RoutingKeys; + case 2: return UnseekablesKind.PartialKeyRoute; + case 3: return UnseekablesKind.FullKeyRoute; + case 4: return UnseekablesKind.RoutingRanges; + case 5: return UnseekablesKind.PartialRangeRoute; + case 6: return UnseekablesKind.FullRangeRoute; + } + } + + public void skipSubset(int supersetCount, DataInputPlus in) throws IOException + { + byte b = in.readByte(); + switch (b) + { + default: throw new IOException("Corrupted input: expected byte 1, 2, 3, 4 or 5; received " + b); + case 1: routingKeys.skipSubset(supersetCount, in); break; + case 2: partialKeyRoute.skipSubset(supersetCount, in); break; + case 3: fullKeyRoute.skipSubset(supersetCount, in); break; + case 4: ranges.skipSubset(supersetCount, in); break; + case 5: partialRangeRoute.skipSubset(supersetCount, in); break; + case 6: fullRangeRoute.skipSubset(supersetCount, in); break; + } + } + + @Override + public long serializedSize(RS t) + { + switch (t.kind()) + { + default: throw new AssertionError(); + case RoutingKeys: + return 1 + routingKeys.serializedSize((RoutingKeys)t); + case PartialKeyRoute: + return 1 + partialKeyRoute.serializedSize((PartialKeyRoute)t); + case FullKeyRoute: + return 1 + fullKeyRoute.serializedSize((FullKeyRoute)t); + case RoutingRanges: + return 1 + ranges.serializedSize((Ranges)t); + case PartialRangeRoute: + return 1 + partialRangeRoute.serializedSize((PartialRangeRoute)t); + case FullRangeRoute: + return 1 + fullRangeRoute.serializedSize((FullRangeRoute)t); + } + } + + public long serializedSubsetSize(RS t, Unseekables superset) + { + switch (t.kind()) + { + default: throw new AssertionError(); + case RoutingKeys: + return 1 + routingKeys.serializedSubsetSize((RoutingKeys)t, superset); + case PartialKeyRoute: + return 1 + partialKeyRoute.serializedSubsetSize((PartialKeyRoute)t, superset); + case FullKeyRoute: + return 1 + fullKeyRoute.serializedSubsetSize((FullKeyRoute)t, superset); + case RoutingRanges: + return 1 + ranges.serializedSubsetSize((Ranges)t, superset); + case PartialRangeRoute: + return 1 + partialRangeRoute.serializedSubsetSize((PartialRangeRoute)t, superset); + case FullRangeRoute: + return 1 + fullRangeRoute.serializedSubsetSize((FullRangeRoute)t, superset); + } + } + } + + public static final UnversionedSerializer seekable = new UnversionedSerializer<>() + { + @Override + public void serialize(Seekable seekable, DataOutputPlus out) throws IOException + { + switch (seekable.domain()) + { + default: throw new AssertionError(); + case Key: + out.writeByte(0); + PartitionKey.serializer.serialize((PartitionKey) seekable, out); + break; + case Range: + out.writeByte(1); + TokenRange.serializer.serialize((TokenRange) seekable, out); + break; + } + } + + @Override + public Seekable deserialize(DataInputPlus in) throws IOException + { + byte b = in.readByte(); + switch (b) + { + default: throw new IOException("Corrupted input: expected byte 1 or 2, received " + b); + case 0: return PartitionKey.serializer.deserialize(in); + case 1: return TokenRange.serializer.deserialize(in); + } + } + + @Override + public long serializedSize(Seekable seekable) + { + switch (seekable.domain()) + { + default: throw new AssertionError(); + case Key: + return 1 + PartitionKey.serializer.serializedSize((PartitionKey) seekable); + case Range: + return 1 + TokenRange.serializer.serializedSize((TokenRange) seekable); + } + } + }; + + public static class AbstractSeekablesSerializer implements UnversionedSerializer> + { + final UnversionedSerializer keys; + final AbstractRangesSerializer ranges; + + public AbstractSeekablesSerializer(UnversionedSerializer keys, AbstractRangesSerializer ranges) + { + this.keys = keys; + this.ranges = ranges; + } + + @Override + public void serialize(Seekables t, DataOutputPlus out) throws IOException + { + switch (t.domain()) + { + default: throw new AssertionError(); + case Key: + out.writeByte(1); + keys.serialize((Keys)t, out); + break; + case Range: + out.writeByte(2); + ranges.serialize((Ranges)t, out); + break; + } + } + + @Override + public Seekables deserialize(DataInputPlus in) throws IOException + { + byte b = in.readByte(); + switch (b) + { + default: throw new IOException("Corrupted input: expected byte 1 or 2, received " + b); + case 1: return keys.deserialize(in); + case 2: return ranges.deserialize(in); + } + } + + @Override + public long serializedSize(Seekables t) + { + switch (t.domain()) + { + default: throw new AssertionError(); + case Key: + return 1 + keys.serializedSize((Keys)t); + case Range: + return 1 + ranges.serializedSize((Ranges)t); + } + } + } + + // this serializer is designed to permits using the collection in its serialized form with minimal in-memory state. + // it also saves some memory by avoiding duplicating prefixes (which happens to also assist faster lookups) + public abstract static class AbstractKeysSerializer> implements UnversionedSerializer + { + final AccordKeySerializer keySerializer; + final IntFunction allocate; + + public AbstractKeysSerializer(AccordKeySerializer keySerializer, IntFunction allocate) + { + this.keySerializer = keySerializer; + this.allocate = allocate; + } + + @Override + public void serialize(KS keys, DataOutputPlus out) throws IOException + { + out.writeUnsignedVInt32(keys.size()); + for (int i=0, mi=keys.size(); i> extends IVersionedWithKeysSerializer.AbstractWithKeysSerializer implements UnversionedSerializer + { + final IntFunction allocate; + + public AbstractSearchableSerializer(IntFunction allocate) + { + this.allocate = allocate; + } + + private int serializedSizeOfPrefix(Object prefix) + { + return routingKey.serializedSizeOfPrefix(prefix); + } + + private void serializePrefix(Object prefix, DataOutputPlus out) throws IOException + { + routingKey.serializePrefix(prefix, out); + } + + private Object deserializePrefix(DataInputPlus in) throws IOException + { + return routingKey.deserializePrefix(in); + } + + // if we store Ranges, we have twice as many indexes + abstract int recordCountToLengthCount(int recordCount); + abstract int fixedKeyLengthForPrefix(Object prefix); + abstract int serializedSizeWithoutPrefix(R routable); + abstract void serializeWithoutPrefixOrLength(R routable, DataOutputPlus out) throws IOException; + abstract void serializeOffsets(RS unseekables, int start, int end, DataOutputPlus out) throws IOException; + + abstract R deserializeWithPrefix(Object prefix, int length, DataInputPlus in) throws IOException; + abstract R deserializeWithPrefix(Object prefix, int lengthIndex, int[] lengths, DataInputPlus in) throws IOException; + + abstract RS deserialize(DataInputPlus in, R[] keys) throws IOException; + + @Override + public long serializedSize(RS routables) + { + int count = routables.size(); + long size = TypeSizes.sizeofUnsignedVInt(count); + if (count == 0) + return size; + + Object prefix = routables.get(0).prefix(); + int prefixStart = 0; + for (int i = 1 ; i <= count ; ++i) + { + Object nextPrefix = null; + if (i < count) + { + nextPrefix = routables.get(i).prefix(); + if (Objects.equals(prefix, nextPrefix)) + continue; + } + + size += TypeSizes.sizeofUnsignedVInt(count - i); + size += serializedSizeOfPrefix(prefix); + int fixedLength = fixedKeyLengthForPrefix(prefix); + if (fixedLength < 0) + { + size += 4L * recordCountToLengthCount(i - prefixStart); + size += serializedSizeOfKeysWithoutPrefix(routables, prefixStart, i); + } + else + { + size += fixedLength * (long)(i - prefixStart); + } + prefixStart = i; + prefix = nextPrefix; + } + + return size; + } + + public long serializedSubsetSize(RS keys, Routables superset) + { + return serializedSubsetSizeInternal(keys, superset); + } + + @Override + public void serialize(RS keys, DataOutputPlus out) throws IOException + { + int size = keys.size(); + out.writeUnsignedVInt32(size); + if (size == 0) + return; + + Object prefix = keys.get(0).prefix(); + int prefixStart = 0; + for (int i = 1 ; i <= size ; ++i) + { + Object nextPrefix = null; + if (i < size) + { + nextPrefix = keys.get(i).prefix(); + if (Objects.equals(prefix, nextPrefix)) + continue; + } + + out.writeUnsignedVInt32(size - i); + serializePrefix(prefix, out); + int fixedLength = fixedKeyLengthForPrefix(prefix); + if (fixedLength < 0) + serializeOffsets(keys, prefixStart, i, out); + serializeKeysWithoutPrefix(keys, prefixStart, i, out); + prefixStart = i; + prefix = nextPrefix; + } + } + + private long serializedSizeOfKeysWithoutPrefix(RS keys, int start, int end) + { + long size = 0; + for (int i = start; i < end; ++i) + size += serializedSizeWithoutPrefix(keys.get(i)); + return size; + } + + private void serializeKeysWithoutPrefix(RS keys, int start, int end, DataOutputPlus out) throws IOException + { + for (int i = start; i < end; ++i) + serializeWithoutPrefixOrLength(keys.get(i), out); + } + + public void serializeSubset(RS keys, Routables superset, DataOutputPlus out) throws IOException + { + serializeSubsetInternal(keys, superset, out); + } + + public void skip(DataInputPlus in) throws IOException + { + countAndSkip(in); + } + + // return number of elements skipped + public int countAndSkip(DataInputPlus in) throws IOException + { + int remaining = in.readUnsignedVInt32(); + if (remaining == 0) + return 0; + + int total = 0; + while (remaining > 0) + { + int count = remaining - in.readUnsignedVInt32(); + remaining -= count; + Object prefix = deserializePrefix(in); + int fixedLength = fixedKeyLengthForPrefix(prefix); + if (fixedLength >= 0) + { + in.skipBytesFully(count * fixedLength); + } + else + { + in.skipBytesFully(4 * (recordCountToLengthCount(count) - 1)); + int end = in.readInt(); + in.skipBytesFully(end); + } + total += count; + } + return total; + } + + public void skipSubset(int supersetCount, DataInputPlus in) throws IOException + { + skipSubsetInternal(supersetCount, in); + } + + @Override + public RS deserialize(DataInputPlus in) throws IOException + { + int remaining = in.readUnsignedVInt32(); + R[] out = allocate.apply(remaining); + int outCount = 0; + while (remaining > 0) + { + int count = remaining - in.readUnsignedVInt32(); + remaining -= count; + Object prefix = deserializePrefix(in); + int fixedLength = fixedKeyLengthForPrefix(prefix); + if (fixedLength >= 0) + { + for (int i = 0 ; i < count ; ++i) + out[outCount++] = deserializeWithPrefix(prefix, fixedLength, in); + } + else + { + int lengthCount = recordCountToLengthCount(count); + if (lengthCount == 1) + { + int end = in.readInt(); + out[outCount++] = deserializeWithPrefix(prefix, end, in); + } + else + { + int[] lengths = cachedInts().getInts(lengthCount); + int prev = 0; + for (int i = 0 ; i < lengthCount ; ++i) + { + int end = in.readInt(); + lengths[i] = end - prev; + prev = end; + } + for (int i = 0 ; i < count ; ++i) + out[outCount++] = deserializeWithPrefix(prefix, i, lengths, in); + cachedInts().forceDiscard(lengths); + } + } + } + + return deserialize(in, out); + } + } + + // this serializer is designed to permits using the collection in its serialized form with minimal in-memory state. + // it also saves some memory by avoiding duplicating prefixes (which happens to also assist faster lookups) + public abstract static class AbstractSearchableRoutingKeysSerializer extends AbstractSearchableSerializer implements UnversionedSerializer + { + public AbstractSearchableRoutingKeysSerializer(AccordSearchableKeySerializer serializer) + { + super(RoutingKey[]::new); + } + + @Override + final int fixedKeyLengthForPrefix(Object prefix) + { + return routingKey.fixedKeyLengthForPrefix(prefix); + } + + @Override + final int recordCountToLengthCount(int recordCount) + { + return recordCount; + } + + @Override + final int serializedSizeWithoutPrefix(RoutingKey routable) + { + return routingKey.serializedSizeWithoutPrefix(routable); + } + + @Override + final void serializeWithoutPrefixOrLength(RoutingKey routable, DataOutputPlus out) throws IOException + { + routingKey.serializeWithoutPrefixOrLength(routable, out); + } + + @Override + final void serializeOffsets(KS keys, int startIndex, int endIndex, DataOutputPlus out) throws IOException + { + int endOffset = 0; + for (int i = startIndex; i < endIndex; ++i) + { + endOffset += serializedSizeWithoutPrefix(keys.get(i)); + out.writeInt(endOffset); + } + } + + @Override + final RoutingKey deserializeWithPrefix(Object prefix, int length, DataInputPlus in) throws IOException + { + return routingKey.deserializeWithPrefix(prefix, length, in); + } + + @Override + final RoutingKey deserializeWithPrefix(Object prefix, int lengthIndex, int[] lengths, DataInputPlus in) throws IOException + { + return routingKey.deserializeWithPrefix(prefix, lengths[lengthIndex], in); + } + + public KS deserializeSubset(AbstractUnseekableKeys superset, DataInputPlus in) throws IOException + { + RoutingKey[] keys = deserializeSubset(superset, in, (ks, s) -> ks == null ? s.unsafeKeys() : ks, RoutingKey[]::new); + return deserialize(in, keys); + } + } + + public abstract static class AbstractKeyRouteSerializer extends AbstractSearchableRoutingKeysSerializer + { + public AbstractKeyRouteSerializer(AccordSearchableKeySerializer serializer) + { + super(serializer); + } + + abstract KS construct(RoutingKey homeKey, RoutingKey[] keys); + + @Override + KS deserialize(DataInputPlus in, RoutingKey[] keys) throws IOException + { + int i = in.readUnsignedVInt32(); + RoutingKey homeKey = i == 0 ? routingKey.deserialize(in) : keys[i - 1]; + return construct(homeKey, keys); + } + + @Override + public int countAndSkip(DataInputPlus in) throws IOException + { + int count = super.countAndSkip(in); + completeSkip(in); + return count; + } + + @Override + public void skipSubset(int supersetCount, DataInputPlus in) throws IOException + { + skipSubsetInternal(supersetCount, in); + completeSkip(in); + } + + @Override + public void serialize(KS route, DataOutputPlus out) throws IOException + { + super.serialize(route, out); + completeSerialize(route, out); + } + + @Override + public void serializeSubset(KS route, Routables superset, DataOutputPlus out) throws IOException + { + super.serializeSubset(route, superset, out); + completeSerialize(route, out); + } + + @Override + public long serializedSize(KS route) + { + return super.serializedSize(route) + + completeSerializedSize(route); + } + + @Override + public long serializedSubsetSize(KS route, Routables superset) + { + return super.serializedSubsetSize(route, superset) + + completeSerializedSize(route); + } + + private void completeSerialize(KS route, DataOutputPlus out) throws IOException + { + int i = route.indexOf(route.homeKey()); + out.writeUnsignedVInt32(Math.max(0, 1 + i)); + if (i < 0) routingKey.serialize(route.homeKey, out); + } + + private void completeSkip(DataInputPlus in) throws IOException + { + int i = in.readUnsignedVInt32(); + if (i == 0) routingKey.skip(in); + } + + private long completeSerializedSize(KS route) + { + int i = route.indexOf(route.homeKey()); + long size = TypeSizes.sizeofUnsignedVInt(Math.max(0, 1 + i)); + if (i < 0) size += routingKey.serializedSize(route.homeKey); + return size; + } + } + + public abstract static class AbstractRangesSerializer extends AbstractSearchableSerializer implements UnversionedSerializer + { + public AbstractRangesSerializer() + { + super(Range[]::new); + } + + @Override + int fixedKeyLengthForPrefix(Object prefix) + { + return routingKey.fixedKeyLengthForPrefix(prefix) * 2; + } + + @Override + int recordCountToLengthCount(int recordCount) + { + return recordCount * 2; + } + + @Override + final int serializedSizeWithoutPrefix(Range range) + { + return routingKey.serializedSizeWithoutPrefix(range.start()) + + routingKey.serializedSizeWithoutPrefix(range.end()); + } + + @Override + final void serializeWithoutPrefixOrLength(Range key, DataOutputPlus out) throws IOException + { + routingKey.serializeWithoutPrefixOrLength(key.start(), out); + routingKey.serializeWithoutPrefixOrLength(key.end(), out); + } + + @Override + final void serializeOffsets(RS ranges, int startIndex, int endIndex, DataOutputPlus out) throws IOException + { + int endOffset = 0; + for (int i = startIndex; i < endIndex; ++i) + { + Range r = ranges.get(i); + endOffset += routingKey.serializedSizeWithoutPrefix(r.start()); + out.writeInt(endOffset); + endOffset += routingKey.serializedSizeWithoutPrefix(r.end()); + out.writeInt(endOffset); + } + } + + @Override + final Range deserializeWithPrefix(Object prefix, int length, DataInputPlus in) throws IOException + { + RoutingKey start = routingKey.deserializeWithPrefix(prefix, length/2, in); + RoutingKey end = routingKey.deserializeWithPrefix(prefix, length/2, in); + return start.rangeFactory().newRange(start, end); + } + + @Override + final Range deserializeWithPrefix(Object prefix, int lengthIndex, int[] lengths, DataInputPlus in) throws IOException + { + RoutingKey start = routingKey.deserializeWithPrefix(prefix, lengths[lengthIndex * 2], in); + RoutingKey end = routingKey.deserializeWithPrefix(prefix, lengths[lengthIndex * 2 + 1], in); + return start.rangeFactory().newRange(start, end); + } + + public RS deserializeSubset(AbstractRanges superset, DataInputPlus in) throws IOException + { + Range[] ranges = deserializeSubset(superset, in, (rs, s) -> rs == null ? s.unsafeRanges() : rs, Range[]::new); + return deserialize(in, ranges); + } + } + + public abstract static class AbstractRangeRouteSerializer extends AbstractRangesSerializer + { + public AbstractRangeRouteSerializer() + { + super(); + } + + abstract RS construct(RoutingKey homeKey, Range[] ranges); + + @Override + RS deserialize(DataInputPlus in, Range[] ranges) throws IOException + { + RoutingKey homeKey = routingKey.deserialize(in); + return construct(homeKey, ranges); + } + + @Override + public int countAndSkip(DataInputPlus in) throws IOException + { + int count = super.countAndSkip(in); + routingKey.skip(in); + return count; + } + + @Override + public void skipSubset(int supersetCount, DataInputPlus in) throws IOException + { + super.skipSubset(supersetCount, in); + routingKey.skip(in); + } + + @Override + public void serialize(RS route, DataOutputPlus out) throws IOException + { + super.serialize(route, out); + routingKey.serialize(route.homeKey, out); + } + + @Override + public void serializeSubset(RS route, Routables superset, DataOutputPlus out) throws IOException + { + super.serializeSubset(route, superset, out); + routingKey.serialize(route.homeKey, out); + } + + @Override + public long serializedSize(RS route) + { + return super.serializedSize(route) + + routingKey.serializedSize(route.homeKey); + } + + @Override + public long serializedSubsetSize(RS route, Routables superset) + { + return super.serializedSubsetSize(route, superset) + routingKey.serializedSize(route.homeKey); + } + } +} diff --git a/src/java/org/apache/cassandra/service/accord/serializers/LatestDepsSerializers.java b/src/java/org/apache/cassandra/service/accord/serializers/LatestDepsSerializers.java new file mode 100644 index 000000000000..0dda1592599b --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/serializers/LatestDepsSerializers.java @@ -0,0 +1,168 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.serializers; + +import java.io.IOException; + +import accord.api.RoutingKey; +import accord.messages.GetLatestDeps; +import accord.messages.GetLatestDeps.GetLatestDepsOk; +import accord.primitives.Ballot; +import accord.primitives.Deps; +import accord.primitives.Known; +import accord.primitives.LatestDeps; +import accord.primitives.Route; +import accord.primitives.Timestamp; +import accord.primitives.TxnId; +import org.apache.cassandra.db.TypeSizes; +import org.apache.cassandra.io.UnversionedSerializer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.service.accord.serializers.CommandSerializers.ExecuteAtSerializer; + +public class LatestDepsSerializers +{ + public static final UnversionedSerializer latestDeps = new UnversionedSerializer<>() + { + @Override + public void serialize(LatestDeps t, DataOutputPlus out) throws IOException + { + out.writeUnsignedVInt32(t.size()); + if (t.size() == 0) + return; + + for (int i = 0 ; i < t.size() ; ++i) + { + RoutingKey start = t.startAt(i); + KeySerializers.routingKey.serialize(start, out); + LatestDeps.LatestEntry e = t.valueAt(i); + if (e == null) + { + CommandSerializers.knownDeps.serialize(null, out); + } + else + { + CommandSerializers.knownDeps.serialize(e.known, out); + CommandSerializers.ballot.serialize(e.ballot, out); + DepsSerializers.nullableDeps.serialize(e.coordinatedDeps, out); + DepsSerializers.nullableDeps.serialize(e.localDeps, out); + } + } + KeySerializers.routingKey.serialize(t.startAt(t.size()), out); + } + + @Override + public LatestDeps deserialize(DataInputPlus in) throws IOException + { + int size = in.readUnsignedVInt32(); + if (size == 0) + return LatestDeps.EMPTY; + + RoutingKey[] starts = new RoutingKey[size + 1]; + LatestDeps.LatestEntry[] values = new LatestDeps.LatestEntry[size]; + for (int i = 0 ; i < size ; ++i) + { + starts[i] = KeySerializers.routingKey.deserialize(in); + Known.KnownDeps knownDeps = CommandSerializers.knownDeps.deserialize(in); + if (knownDeps == null) + continue; + + Ballot ballot = CommandSerializers.ballot.deserialize(in); + Deps coordinatedDeps = DepsSerializers.nullableDeps.deserialize(in); + Deps localDeps = DepsSerializers.nullableDeps.deserialize(in); + values[i] = new LatestDeps.LatestEntry(knownDeps, ballot, coordinatedDeps, localDeps); + } + starts[size] = KeySerializers.routingKey.deserialize(in); + + return LatestDeps.SerializerSupport.create(true, starts, values); + } + + @Override + public long serializedSize(LatestDeps t) + { + long size = 0; + size += TypeSizes.sizeofUnsignedVInt(t.size()); + if (t.size() == 0) + return size; + for (int i = 0 ; i < t.size() ; ++i) + { + RoutingKey start = t.startAt(i); + size += KeySerializers.routingKey.serializedSize(start); + LatestDeps.LatestEntry e = t.valueAt(i); + if (e == null) + { + size += CommandSerializers.knownDeps.serializedSize(null); + } + else + { + size += CommandSerializers.knownDeps.serializedSize(e.known); + size += CommandSerializers.ballot.serializedSize(e.ballot); + size += DepsSerializers.nullableDeps.serializedSize(e.coordinatedDeps); + size += DepsSerializers.nullableDeps.serializedSize(e.localDeps); + } + } + size += KeySerializers.routingKey.serializedSize(t.startAt(t.size())); + return size; + } + }; + + public static final IVersionedSerializer request = new TxnRequestSerializer.WithUnsyncedSerializer<>() + { + @Override + public void serializeBody(GetLatestDeps msg, DataOutputPlus out, Version version) throws IOException + { + ExecuteAtSerializer.serialize(msg.executeAt, out); + } + + @Override + public GetLatestDeps deserializeBody(DataInputPlus in, Version version, TxnId txnId, Route scope, long waitForEpoch, long minEpoch) throws IOException + { + Ballot ballot = CommandSerializers.ballot.deserialize(in); + Timestamp executeAt = ExecuteAtSerializer.deserialize(in); + return GetLatestDeps.SerializationSupport.create(txnId, scope, waitForEpoch, minEpoch, ballot, executeAt); + } + + @Override + public long serializedBodySize(GetLatestDeps msg, Version version) + { + return ExecuteAtSerializer.serializedSize(msg.executeAt); + } + }; + + public static final UnversionedSerializer reply = new UnversionedSerializer<>() + { + @Override + public void serialize(GetLatestDepsOk reply, DataOutputPlus out) throws IOException + { + latestDeps.serialize(reply.deps, out); + } + + @Override + public GetLatestDepsOk deserialize(DataInputPlus in) throws IOException + { + return new GetLatestDepsOk(latestDeps.deserialize(in)); + } + + @Override + public long serializedSize(GetLatestDepsOk reply) + { + return latestDeps.serializedSize(reply.deps); + } + }; +} diff --git a/src/java/org/apache/cassandra/service/accord/serializers/PreacceptSerializers.java b/src/java/org/apache/cassandra/service/accord/serializers/PreacceptSerializers.java new file mode 100644 index 000000000000..ae07f347c401 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/serializers/PreacceptSerializers.java @@ -0,0 +1,131 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.serializers; + +import java.io.IOException; +import javax.annotation.Nullable; + +import accord.coordinate.ExecuteFlag.ExecuteFlags; +import accord.messages.PreAccept; +import accord.messages.PreAccept.PreAcceptOk; +import accord.messages.PreAccept.PreAcceptReply; +import accord.primitives.FullRoute; +import accord.primitives.PartialDeps; +import accord.primitives.PartialTxn; +import accord.primitives.Route; +import accord.primitives.TxnId; +import org.apache.cassandra.db.TypeSizes; +import org.apache.cassandra.io.UnversionedSerializer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.service.accord.serializers.CommandSerializers.ExecuteAtSerializer; +import org.apache.cassandra.service.accord.serializers.TxnRequestSerializer.WithUnsyncedSerializer; + + +public class PreacceptSerializers +{ + private PreacceptSerializers() {} + + public static final IVersionedSerializer request = new WithUnsyncedSerializer<>() + { + @Override + public void serializeBody(PreAccept msg, DataOutputPlus out, Version version) throws IOException + { + int flags = (msg.partialDeps == null ? 0 : 1) + | (msg.route == null ? 0 : 2) + | (msg.hasCoordinatorVote ? 4 : 0) + | (msg.acceptEpoch == msg.minEpoch ? 0 : 8); + out.writeByte(flags); + CommandSerializers.partialTxn.serialize(msg.partialTxn, out, version); + if (msg.partialDeps != null) + DepsSerializers.partialDeps.serialize(msg.partialDeps, out); + if (msg.route != null) + KeySerializers.fullRoute.serialize(msg.route, out); + if (msg.acceptEpoch != msg.minEpoch) + out.writeUnsignedVInt(msg.acceptEpoch - msg.minEpoch); + } + + @Override + public PreAccept deserializeBody(DataInputPlus in, Version version, TxnId txnId, Route scope, long waitForEpoch, long minEpoch) throws IOException + { + byte flags = in.readByte(); + PartialTxn partialTxn = CommandSerializers.partialTxn.deserialize(in, version); + @Nullable PartialDeps partialDeps = (flags & 1) == 0 ? null : DepsSerializers.partialDeps.deserialize(in); + @Nullable FullRoute fullRoute = (flags & 2) == 0 ? null : KeySerializers.fullRoute.deserialize(in); + boolean hasCoordinatorVote = (flags & 4) != 0; + long acceptEpoch = (flags & 8) == 0 ? minEpoch : in.readUnsignedVInt() + minEpoch; + return PreAccept.SerializerSupport.create(txnId, scope, waitForEpoch, minEpoch, acceptEpoch, partialTxn, partialDeps, hasCoordinatorVote, fullRoute); + } + + @Override + public long serializedBodySize(PreAccept msg, Version version) + { + return TypeSizes.BYTE_SIZE + + CommandSerializers.partialTxn.serializedSize(msg.partialTxn, version) + + (msg.partialDeps == null ? 0 : DepsSerializers.partialDeps.serializedSize(msg.partialDeps)) + + (msg.route == null ? 0 : KeySerializers.fullRoute.serializedSize(msg.route)) + + (msg.acceptEpoch == msg.minEpoch ? 0 : TypeSizes.sizeofUnsignedVInt(msg.acceptEpoch - msg.minEpoch)); + } + }; + + public static final UnversionedSerializer reply = new UnversionedSerializer<>() + { + @Override + public void serialize(PreAcceptReply reply, DataOutputPlus out) throws IOException + { + out.writeBoolean(reply.isOk()); + if (!reply.isOk()) + return; + + PreAcceptOk preAcceptOk = (PreAcceptOk) reply; + CommandSerializers.txnId.serialize(preAcceptOk.txnId, out); + ExecuteAtSerializer.serialize(preAcceptOk.txnId, preAcceptOk.witnessedAt, out); + DepsSerializers.deps.serialize(preAcceptOk.deps, out); + out.writeUnsignedVInt32(preAcceptOk.flags.bits()); + } + + @Override + public PreAcceptReply deserialize(DataInputPlus in) throws IOException + { + if (!in.readBoolean()) + return PreAccept.PreAcceptNack.INSTANCE; + + TxnId txnId = CommandSerializers.txnId.deserialize(in); + return new PreAcceptOk(txnId, + ExecuteAtSerializer.deserialize(txnId, in), + DepsSerializers.deps.deserialize(in), + ExecuteFlags.get(in.readUnsignedVInt32())); + } + + @Override + public long serializedSize(PreAcceptReply reply) + { + long size = TypeSizes.sizeof(reply.isOk()); + if (!reply.isOk()) + return size; + + PreAcceptOk preAcceptOk = (PreAcceptOk) reply; + size += CommandSerializers.txnId.serializedSize(preAcceptOk.txnId); + size += ExecuteAtSerializer.serializedSize(preAcceptOk.txnId, preAcceptOk.witnessedAt); + size += DepsSerializers.deps.serializedSize(preAcceptOk.deps); + size += TypeSizes.sizeofUnsignedVInt(preAcceptOk.flags.bits()); + return size; + } + }; +} diff --git a/src/java/org/apache/cassandra/service/accord/serializers/ReadDataSerializers.java b/src/java/org/apache/cassandra/service/accord/serializers/ReadDataSerializers.java new file mode 100644 index 000000000000..82e702b5c223 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/serializers/ReadDataSerializers.java @@ -0,0 +1,369 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.serializers; + +import java.io.IOException; + +import accord.api.Data; +import accord.messages.ApplyThenWaitUntilApplied; +import accord.messages.Commit; +import accord.messages.ReadData; +import accord.messages.ReadData.CommitOrReadNack; +import accord.messages.ReadData.ReadOk; +import accord.messages.ReadData.ReadOkWithFutureEpoch; +import accord.messages.ReadData.ReadReply; +import accord.messages.ReadData.ReadType; +import accord.messages.ReadEphemeralTxnData; +import accord.messages.ReadTxnData; +import accord.messages.StableThenRead; +import accord.messages.WaitUntilApplied; +import accord.primitives.FullRoute; +import accord.primitives.PartialDeps; +import accord.primitives.PartialTxn; +import accord.primitives.Participants; +import accord.primitives.Ranges; +import accord.primitives.Timestamp; +import accord.primitives.TxnId; +import org.apache.cassandra.db.TypeSizes; +import org.apache.cassandra.io.VersionedSerializer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.service.accord.serializers.CommandSerializers.ExecuteAtSerializer; +import org.apache.cassandra.service.accord.txn.TxnData; + +import static accord.messages.Commit.WithDeps.HasDeps; +import static accord.messages.Commit.WithDeps.NoDeps; +import static accord.messages.Commit.WithTxn.HasTxn; +import static accord.messages.Commit.WithTxn.NoTxn; +import static org.apache.cassandra.db.TypeSizes.sizeof; +import static org.apache.cassandra.utils.NullableSerializer.deserializeNullable; +import static org.apache.cassandra.utils.NullableSerializer.serializeNullable; +import static org.apache.cassandra.utils.NullableSerializer.serializedNullableSize; + +public class ReadDataSerializers +{ + public static final IVersionedSerializer readData = new IVersionedSerializer() + { + @Override + public void serialize(ReadData t, DataOutputPlus out, Version version) throws IOException + { + out.writeByte(t.kind().val); + serializerFor(t).serialize(t, out, version); + } + + @Override + public ReadData deserialize(DataInputPlus in, Version version) throws IOException + { + return serializerFor(ReadType.valueOf(in.readByte())).deserialize(in, version); + } + + @Override + public long serializedSize(ReadData t, Version version) + { + return sizeof(t.kind().val) + serializerFor(t).serializedSize(t, version); + } + }; + + public static final ApplyThenWaitUntilAppliedSerializer applyThenWaitUntilApplied = new ApplyThenWaitUntilAppliedSerializer(); + + public static class ApplyThenWaitUntilAppliedSerializer implements ReadDataSerializer + { + @Override + public void serialize(ApplyThenWaitUntilApplied msg, DataOutputPlus out, Version version) throws IOException + { + CommandSerializers.txnId.serialize(msg.txnId, out); + KeySerializers.participants.serialize(msg.scope, out); + out.writeUnsignedVInt(msg.minEpoch()); + ExecuteAtSerializer.serialize(msg.txnId, msg.executeAt, out); + KeySerializers.fullRoute.serialize(msg.route, out); + CommandSerializers.partialTxn.serialize(msg.txn, out, version); + DepsSerializers.partialDeps.serialize(msg.deps, out); + CommandSerializers.nullableWrites.serialize(msg.writes, out, version); + } + + @Override + public ApplyThenWaitUntilApplied deserialize(DataInputPlus in, Version version) throws IOException + { + TxnId txnId = CommandSerializers.txnId.deserialize(in); + return ApplyThenWaitUntilApplied.SerializerSupport.create( + txnId, + KeySerializers.participants.deserialize(in), + in.readUnsignedVInt(), + ExecuteAtSerializer.deserialize(txnId, in), + KeySerializers.fullRoute.deserialize(in), + CommandSerializers.partialTxn.deserialize(in, version), + DepsSerializers.partialDeps.deserialize(in), + CommandSerializers.nullableWrites.deserialize(in, version), + ResultSerializers.APPLIED); + } + + @Override + public long serializedSize(ApplyThenWaitUntilApplied msg, Version version) + { + return CommandSerializers.txnId.serializedSize(msg.txnId) + + KeySerializers.participants.serializedSize(msg.scope) + + TypeSizes.sizeofUnsignedVInt(msg.minEpoch()) + + ExecuteAtSerializer.serializedSize(msg.txnId, msg.executeAt) + + KeySerializers.fullRoute.serializedSize(msg.route) + + CommandSerializers.partialTxn.serializedSize(msg.txn, version) + + DepsSerializers.partialDeps.serializedSize(msg.deps) + + CommandSerializers.nullableWrites.serializedSize(msg.writes, version); + } + } + + private static final ReadDataSerializer readTxnData = new ReadDataSerializer() + { + @Override + public void serialize(ReadTxnData read, DataOutputPlus out, Version version) throws IOException + { + CommandSerializers.txnId.serialize(read.txnId, out); + KeySerializers.participants.serialize(read.scope, out); + out.writeUnsignedVInt(read.executeAtEpoch); + } + + @Override + public ReadTxnData deserialize(DataInputPlus in, Version version) throws IOException + { + TxnId txnId = CommandSerializers.txnId.deserialize(in); + Participants scope = KeySerializers.participants.deserialize(in); + long executeAtEpoch = in.readUnsignedVInt(); + return ReadTxnData.SerializerSupport.create(txnId, scope, executeAtEpoch); + } + + @Override + public long serializedSize(ReadTxnData read, Version version) + { + return CommandSerializers.txnId.serializedSize(read.txnId) + + KeySerializers.participants.serializedSize(read.scope) + + TypeSizes.sizeofUnsignedVInt(read.executeAtEpoch); + } + }; + + public static final ReadDataSerializer readEphemeralTxnData = new ReadDataSerializer<>() + { + @Override + public void serialize(ReadEphemeralTxnData read, DataOutputPlus out, Version version) throws IOException + { + CommandSerializers.txnId.serialize(read.txnId, out); + KeySerializers.participants.serialize(read.scope, out); + out.writeUnsignedVInt(read.executeAtEpoch); + CommandSerializers.partialTxn.serialize(read.partialTxn(), out, version); + DepsSerializers.partialDeps.serialize(read.partialDeps(), out); + KeySerializers.fullRoute.serialize(read.route(), out); + } + + @Override + public ReadEphemeralTxnData deserialize(DataInputPlus in, Version version) throws IOException + { + TxnId txnId = CommandSerializers.txnId.deserialize(in); + Participants scope = KeySerializers.participants.deserialize(in); + long executeAtEpoch = in.readUnsignedVInt(); + PartialTxn partialTxn = CommandSerializers.partialTxn.deserialize(in, version); + PartialDeps partialDeps = DepsSerializers.partialDeps.deserialize(in); + FullRoute route = KeySerializers.fullRoute.deserialize(in); + return ReadEphemeralTxnData.SerializerSupport.create(txnId, scope, executeAtEpoch, partialTxn, partialDeps, route); + } + + @Override + public long serializedSize(ReadEphemeralTxnData read, Version version) + { + return CommandSerializers.txnId.serializedSize(read.txnId) + + KeySerializers.participants.serializedSize(read.scope) + + TypeSizes.sizeofUnsignedVInt(read.executeAtEpoch) + + CommandSerializers.partialTxn.serializedSize(read.partialTxn(), version) + + DepsSerializers.partialDeps.serializedSize(read.partialDeps()) + + KeySerializers.fullRoute.serializedSize(read.route()); + } + }; + + public interface ReadDataSerializer extends IVersionedSerializer + { + void serialize(T bound, DataOutputPlus out, Version version) throws IOException; + T deserialize(DataInputPlus in, Version version) throws IOException; + long serializedSize(T condition, Version version); + } + + private static ReadDataSerializer serializerFor(ReadData toSerialize) + { + return serializerFor(toSerialize.kind()); + } + + private static ReadDataSerializer serializerFor(ReadType type) + { + switch (type) + { + case readTxnData: + return readTxnData; + case readDataWithoutTimestamp: + return readEphemeralTxnData; + case applyThenWaitUntilApplied: + return applyThenWaitUntilApplied; + case waitUntilApplied: + return waitUntilApplied; + default: + throw new IllegalStateException("Unsupported ExecuteType " + type); + } + } + + public static final class ReplySerializer implements IVersionedSerializer + { + final CommitOrReadNack[] nacks = CommitOrReadNack.values(); + private final VersionedSerializer dataSerializer; + + public ReplySerializer(VersionedSerializer dataSerializer) + { + this.dataSerializer = dataSerializer; + } + + @Override + public void serialize(ReadReply reply, DataOutputPlus out, Version version) throws IOException + { + if (!reply.isOk()) + { + out.writeByte(3 + ((CommitOrReadNack) reply).ordinal()); + return; + } + + ReadOk readOk = (ReadOk) reply; + int flags = readOk.getClass() == ReadOkWithFutureEpoch.class ? 2 : readOk.uniqueHlc != 0 ? 1 : 0; + out.writeByte(flags); + serializeNullable(readOk.unavailable, out, KeySerializers.ranges); + dataSerializer.serialize((D) readOk.data, out, version); + switch (flags) + { + case 2: out.writeUnsignedVInt(((ReadOkWithFutureEpoch) reply).futureEpoch); break; + case 1: out.writeUnsignedVInt(readOk.uniqueHlc); + } + } + + @Override + public ReadReply deserialize(DataInputPlus in, Version version) throws IOException + { + int flags = in.readByte(); + if (flags > 2) + return nacks[flags - 3]; + + Ranges unavailable = deserializeNullable(in, KeySerializers.ranges); + D data = dataSerializer.deserialize(in, version); + + long extraLong = flags == 0 ? 0 : in.readUnsignedVInt(); + if (flags <= 1) + return new ReadOk(unavailable, data, extraLong); + return new ReadOkWithFutureEpoch(unavailable, data, extraLong); + } + + @Override + public long serializedSize(ReadReply reply, Version version) + { + if (!reply.isOk()) + return TypeSizes.BYTE_SIZE; + + ReadOk readOk = (ReadOk) reply; + long size = TypeSizes.BYTE_SIZE + + serializedNullableSize(readOk.unavailable, KeySerializers.ranges) + + dataSerializer.serializedSize((D) readOk.data, version); + if (readOk.uniqueHlc != 0) + size += TypeSizes.sizeofUnsignedVInt(readOk.uniqueHlc); + else if (readOk instanceof ReadOkWithFutureEpoch) + size += TypeSizes.sizeofUnsignedVInt(((ReadOkWithFutureEpoch) readOk).futureEpoch); + return size; + } + } + + public static final IVersionedSerializer reply = new ReplySerializer<>(TxnData.nullableSerializer); + + // TODO (desired): duplicates ReadTxnData ser/de logic; conside deduplicating if another instance of this is added + public static final ReadDataSerializer waitUntilApplied = new ReadDataSerializer() + { + @Override + public void serialize(WaitUntilApplied waitUntilApplied, DataOutputPlus out, Version version) throws IOException + { + CommandSerializers.txnId.serialize(waitUntilApplied.txnId, out); + KeySerializers.participants.serialize(waitUntilApplied.scope, out); + out.writeUnsignedVInt(waitUntilApplied.minEpoch()); + out.writeUnsignedVInt(waitUntilApplied.executeAtEpoch - waitUntilApplied.minEpoch()); + } + + @Override + public WaitUntilApplied deserialize(DataInputPlus in, Version version) throws IOException + { + TxnId txnId = CommandSerializers.txnId.deserialize(in); + Participants scope = KeySerializers.participants.deserialize(in); + long minEpoch = in.readUnsignedVInt(); + long executeAtEpoch = minEpoch + in.readUnsignedVInt(); + return WaitUntilApplied.SerializerSupport.create(txnId, scope, minEpoch, executeAtEpoch); + } + + @Override + public long serializedSize(WaitUntilApplied waitUntilApplied, Version version) + { + return CommandSerializers.txnId.serializedSize(waitUntilApplied.txnId) + + KeySerializers.participants.serializedSize(waitUntilApplied.scope) + + TypeSizes.sizeofUnsignedVInt(waitUntilApplied.minEpoch()) + + TypeSizes.sizeofUnsignedVInt(waitUntilApplied.executeAtEpoch - waitUntilApplied.minEpoch()); + } + }; + + // TODO (desired): duplicates a lot of Commit serializer + public static final ReadDataSerializer stableThenRead = new ReadDataSerializer<>() + { + @Override + public void serialize(StableThenRead read, DataOutputPlus out, Version version) throws IOException + { + CommandSerializers.txnId.serialize(read.txnId, out); + KeySerializers.participants.serialize(read.scope, out); + CommitSerializers.kind.serialize(read.kind, out); + out.writeUnsignedVInt(read.minEpoch); + ExecuteAtSerializer.serialize(read.txnId, read.executeAt, out); + if (read.kind.withTxn != NoTxn) + CommandSerializers.nullablePartialTxn.serialize(read.partialTxn, out, version); + if (read.kind.withDeps == HasDeps) + DepsSerializers.partialDeps.serialize(read.partialDeps, out); + if (read.kind.withTxn == HasTxn) + KeySerializers.fullRoute.serialize(read.route, out); + } + + @Override + public StableThenRead deserialize(DataInputPlus in, Version version) throws IOException + { + TxnId txnId = CommandSerializers.txnId.deserialize(in); + Participants scope = KeySerializers.participants.deserialize(in); + Commit.Kind kind = CommitSerializers.kind.deserialize(in); + long minEpoch = in.readUnsignedVInt(); + Timestamp executeAt = ExecuteAtSerializer.deserialize(txnId, in); + PartialTxn partialTxn = kind.withTxn == NoTxn ? null : CommandSerializers.nullablePartialTxn.deserialize(in, version); + PartialDeps partialDeps = kind.withDeps == NoDeps ? null : DepsSerializers.partialDeps.deserialize(in); + FullRoute < ?> route = kind.withTxn == HasTxn ? KeySerializers.fullRoute.deserialize(in) : null; + return StableThenRead.SerializerSupport.create(txnId, scope, kind, minEpoch, executeAt, partialTxn, partialDeps, route); + } + + @Override + public long serializedSize(StableThenRead read, Version version) + { + return CommandSerializers.txnId.serializedSize(read.txnId) + + KeySerializers.participants.serializedSize(read.scope) + + CommitSerializers.kind.serializedSize(read.kind) + + TypeSizes.sizeofUnsignedVInt(read.minEpoch) + + ExecuteAtSerializer.serializedSize(read.txnId, read.executeAt) + + (read.kind.withTxn == NoTxn ? 0 : CommandSerializers.nullablePartialTxn.serializedSize(read.partialTxn, version)) + + (read.kind.withDeps != HasDeps ? 0 : DepsSerializers.partialDeps.serializedSize(read.partialDeps)) + + (read.kind.withTxn != HasTxn ? 0 : KeySerializers.fullRoute.serializedSize(read.route)); + } + }; +} diff --git a/src/java/org/apache/cassandra/service/accord/serializers/RecoverySerializers.java b/src/java/org/apache/cassandra/service/accord/serializers/RecoverySerializers.java new file mode 100644 index 000000000000..fcb680326ddc --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/serializers/RecoverySerializers.java @@ -0,0 +1,266 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.serializers; + +import java.io.IOException; +import javax.annotation.Nonnull; +import javax.annotation.Nullable; + +import accord.api.Result; +import accord.api.RoutingKey; +import accord.messages.BeginRecovery; +import accord.messages.BeginRecovery.RecoverNack; +import accord.messages.BeginRecovery.RecoverOk; +import accord.messages.BeginRecovery.RecoverReply; +import accord.primitives.Ballot; +import accord.primitives.Deps; +import accord.primitives.FullRoute; +import accord.primitives.Known.KnownDeps; +import accord.primitives.LatestDeps; +import accord.primitives.PartialTxn; +import accord.primitives.Participants; +import accord.primitives.Route; +import accord.primitives.Status; +import accord.primitives.Timestamp; +import accord.primitives.TxnId; +import accord.primitives.Writes; +import org.apache.cassandra.db.TypeSizes; +import org.apache.cassandra.io.UnversionedSerializer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.service.accord.serializers.CommandSerializers.ExecuteAtSerializer; +import org.apache.cassandra.service.accord.serializers.TxnRequestSerializer.WithUnsyncedSerializer; + +import static accord.messages.BeginRecovery.RecoverReply.Kind.Ok; +import static org.apache.cassandra.utils.NullableSerializer.deserializeNullable; +import static org.apache.cassandra.utils.NullableSerializer.serializeNullable; +import static org.apache.cassandra.utils.NullableSerializer.serializedNullableSize; + +public class RecoverySerializers +{ + public static final IVersionedSerializer request = new WithUnsyncedSerializer() + { + @Override + public void serializeBody(BeginRecovery recover, DataOutputPlus out, Version version) throws IOException + { + CommandSerializers.partialTxn.serialize(recover.partialTxn, out, version); + CommandSerializers.ballot.serialize(recover.ballot, out); + serializeNullable(recover.route, out, KeySerializers.fullRoute); + out.writeUnsignedVInt(recover.executeAtOrTxnIdEpoch - recover.txnId.epoch()); + } + + @Override + public BeginRecovery deserializeBody(DataInputPlus in, Version version, TxnId txnId, Route scope, long waitForEpoch, long minEpoch) throws IOException + { + PartialTxn partialTxn = CommandSerializers.partialTxn.deserialize(in, version); + Ballot ballot = CommandSerializers.ballot.deserialize(in); + @Nullable FullRoute route = deserializeNullable(in, KeySerializers.fullRoute); + long executeAtOrTxnIdEpoch = in.readUnsignedVInt32() + txnId.epoch(); + return BeginRecovery.SerializationSupport.create(txnId, scope, waitForEpoch, minEpoch, partialTxn, ballot, route, executeAtOrTxnIdEpoch); + } + + @Override + public long serializedBodySize(BeginRecovery recover, Version version) + { + return CommandSerializers.partialTxn.serializedSize(recover.partialTxn, version) + + CommandSerializers.ballot.serializedSize(recover.ballot) + + serializedNullableSize(recover.route, KeySerializers.fullRoute) + + TypeSizes.sizeofUnsignedVInt(recover.executeAtOrTxnIdEpoch - recover.txnId.epoch()); + } + }; + + public static final IVersionedSerializer reply = new IVersionedSerializer() + { + final RecoverReply.Kind[] kinds = RecoverReply.Kind.values(); + void serializeNack(RecoverNack recoverNack, DataOutputPlus out, Version version) throws IOException + { + CommandSerializers.ballot.serialize(recoverNack.supersededBy, out); + } + + void serializeOk(RecoverOk recoverOk, DataOutputPlus out, Version version) throws IOException + { + CommandSerializers.txnId.serialize(recoverOk.txnId, out); + CommandSerializers.status.serialize(recoverOk.status, out); + CommandSerializers.ballot.serialize(recoverOk.accepted, out); + ExecuteAtSerializer.serializeNullable(recoverOk.executeAt, out); + latestDeps.serialize(recoverOk.deps, out); + DepsSerializers.deps.serialize(recoverOk.earlierWait, out); + DepsSerializers.deps.serialize(recoverOk.earlierNoWait, out); + DepsSerializers.deps.serialize(recoverOk.laterCoordRejects, out); + out.writeBoolean(recoverOk.selfAcceptsFastPath); + KeySerializers.nullableParticipants.serialize(recoverOk.coordinatorAcceptsFastPath, out); + out.writeBoolean(recoverOk.supersedingRejects); + CommandSerializers.nullableWrites.serialize(recoverOk.writes, out, version); + } + + @Override + public void serialize(RecoverReply reply, DataOutputPlus out, Version version) throws IOException + { + out.writeByte(reply.kind().ordinal()); + if (reply.kind() == Ok) serializeOk((RecoverOk) reply, out, version); + else serializeNack((RecoverNack) reply, out, version); + } + + RecoverNack deserializeNack(RecoverReply.Kind kind, Ballot supersededBy, DataInputPlus in, Version version) + { + return new RecoverNack(kind, supersededBy); + } + + RecoverOk deserializeOk(TxnId txnId, Status status, Ballot accepted, Timestamp executeAt, @Nonnull LatestDeps deps, Deps earlierWait, Deps earlierNoWait, Deps laterCoordRejects, boolean acceptsFastPath, @Nullable Participants coordinatorAcceptsFastPath, boolean rejectsFastPath, Writes writes, Result result, DataInputPlus in, Version version) + { + return new RecoverOk(txnId, status, accepted, executeAt, deps, earlierWait, earlierNoWait, laterCoordRejects, acceptsFastPath, coordinatorAcceptsFastPath, rejectsFastPath, writes, result); + } + + @Override + public RecoverReply deserialize(DataInputPlus in, Version version) throws IOException + { + RecoverReply.Kind kind = kinds[in.readByte()]; + if (kind != Ok) + return deserializeNack(kind, CommandSerializers.ballot.deserialize(in), in, version); + + TxnId id = CommandSerializers.txnId.deserialize(in); + Status status = CommandSerializers.status.deserialize(in); + + Result result = null; + if (status == Status.PreApplied || status == Status.Applied || status == Status.Truncated) + result = ResultSerializers.APPLIED; + + return deserializeOk(id, + status, + CommandSerializers.ballot.deserialize(in), + ExecuteAtSerializer.deserializeNullable(in), + latestDeps.deserialize(in), + DepsSerializers.deps.deserialize(in), + DepsSerializers.deps.deserialize(in), + DepsSerializers.deps.deserialize(in), + in.readBoolean(), + KeySerializers.nullableParticipants.deserialize(in), + in.readBoolean(), + CommandSerializers.nullableWrites.deserialize(in, version), + result, + in, + version); + } + + long serializedNackSize(RecoverNack recoverNack, Version version) + { + return CommandSerializers.ballot.serializedSize(recoverNack.supersededBy); + } + + long serializedOkSize(RecoverOk recoverOk, Version version) + { + long size = CommandSerializers.txnId.serializedSize(recoverOk.txnId); + size += CommandSerializers.status.serializedSize(recoverOk.status); + size += CommandSerializers.ballot.serializedSize(recoverOk.accepted); + size += ExecuteAtSerializer.serializedNullableSize(recoverOk.executeAt); + size += latestDeps.serializedSize(recoverOk.deps); + size += DepsSerializers.deps.serializedSize(recoverOk.earlierWait); + size += DepsSerializers.deps.serializedSize(recoverOk.earlierNoWait); + size += DepsSerializers.deps.serializedSize(recoverOk.laterCoordRejects); + size += TypeSizes.sizeof(recoverOk.selfAcceptsFastPath); + size += KeySerializers.nullableParticipants.serializedSize(recoverOk.coordinatorAcceptsFastPath); + size += TypeSizes.sizeof(recoverOk.supersedingRejects); + size += CommandSerializers.nullableWrites.serializedSize(recoverOk.writes, version); + return size; + } + + @Override + public long serializedSize(RecoverReply reply, Version version) + { + return TypeSizes.BYTE_SIZE + + (reply.kind() == Ok ? serializedOkSize((RecoverOk) reply, version) : serializedNackSize((RecoverNack) reply, version)); + } + }; + + public static final UnversionedSerializer latestDeps = new UnversionedSerializer<>() + { + @Override + public void serialize(LatestDeps t, DataOutputPlus out) throws IOException + { + out.writeUnsignedVInt32(t.size()); + for (int i = 0 ; i < t.size() ; ++i) + { + RoutingKey start = t.startAt(i); + KeySerializers.routingKey.serialize(start, out); + LatestDeps.LatestEntry e = t.valueAt(i); + if (e == null) + { + CommandSerializers.knownDeps.serialize(null, out); + } + else + { + CommandSerializers.knownDeps.serialize(e.known, out); + CommandSerializers.ballot.serialize(e.ballot, out); + DepsSerializers.nullableDeps.serialize(e.coordinatedDeps, out); + DepsSerializers.nullableDeps.serialize(e.localDeps, out); + } + } + KeySerializers.routingKey.serialize(t.startAt(t.size()), out); + } + + @Override + public LatestDeps deserialize(DataInputPlus in) throws IOException + { + int size = in.readUnsignedVInt32(); + RoutingKey[] starts = new RoutingKey[size + 1]; + LatestDeps.LatestEntry[] values = new LatestDeps.LatestEntry[size]; + for (int i = 0 ; i < size ; ++i) + { + starts[i] = KeySerializers.routingKey.deserialize(in); + KnownDeps knownDeps = CommandSerializers.knownDeps.deserialize(in); + if (knownDeps == null) + continue; + + Ballot ballot = CommandSerializers.ballot.deserialize(in); + Deps coordinatedDeps = DepsSerializers.nullableDeps.deserialize(in); + Deps localDeps = DepsSerializers.nullableDeps.deserialize(in); + values[i] = new LatestDeps.LatestEntry(knownDeps, ballot, coordinatedDeps, localDeps); + } + starts[size] = KeySerializers.routingKey.deserialize(in); + + return LatestDeps.SerializerSupport.create(true, starts, values); + } + + @Override + public long serializedSize(LatestDeps t) + { + long size = 0; + size += TypeSizes.sizeofUnsignedVInt(t.size()); + for (int i = 0 ; i < t.size() ; ++i) + { + RoutingKey start = t.startAt(i); + size += KeySerializers.routingKey.serializedSize(start); + LatestDeps.LatestEntry e = t.valueAt(i); + if (e == null) + { + size += CommandSerializers.knownDeps.serializedSize(null); + } + else + { + size += CommandSerializers.knownDeps.serializedSize(e.known); + size += CommandSerializers.ballot.serializedSize(e.ballot); + size += DepsSerializers.nullableDeps.serializedSize(e.coordinatedDeps); + size += DepsSerializers.nullableDeps.serializedSize(e.localDeps); + } + } + size += KeySerializers.routingKey.serializedSize(t.startAt(t.size())); + return size; + } + }; +} diff --git a/src/java/org/apache/cassandra/service/accord/serializers/ResultSerializers.java b/src/java/org/apache/cassandra/service/accord/serializers/ResultSerializers.java new file mode 100644 index 000000000000..5d2d5efa4603 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/serializers/ResultSerializers.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.serializers; + +import accord.api.Result; +import accord.primitives.ProgressToken; +import org.apache.cassandra.io.UnversionedSerializer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; + +public class ResultSerializers +{ + // TODO (desired): this is meant to encode e.g. whether the transaction's condition met or not for clients to later query + public static final Result APPLIED = new Result() + { + @Override + public ProgressToken asProgressToken() + { + return ProgressToken.APPLIED; + } + }; + + public static final UnversionedSerializer result = new UnversionedSerializer<>() + { + public void serialize(Result t, DataOutputPlus out) { } + public Result deserialize(DataInputPlus in) + { + return APPLIED; + } + + public long serializedSize(Result t) + { + return 0; + } + }; +} diff --git a/src/java/org/apache/cassandra/service/accord/serializers/SetDurableSerializers.java b/src/java/org/apache/cassandra/service/accord/serializers/SetDurableSerializers.java new file mode 100644 index 000000000000..60dbbc3c88ff --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/serializers/SetDurableSerializers.java @@ -0,0 +1,111 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.service.accord.serializers; + +import java.io.IOException; + +import accord.messages.SetGloballyDurable; +import accord.messages.SetShardDurable; +import accord.primitives.Deps; +import accord.primitives.FullRoute; +import accord.primitives.SyncPoint; +import accord.primitives.Timestamp; +import accord.primitives.TxnId; +import org.apache.cassandra.io.UnversionedSerializer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.service.accord.serializers.CommandSerializers.ExecuteAtSerializer; + +public class SetDurableSerializers +{ + public static final UnversionedSerializer shardDurable = new UnversionedSerializer<>() + { + @Override + public void serialize(SetShardDurable msg, DataOutputPlus out) throws IOException + { + syncPoint.serialize(msg.exclusiveSyncPoint, out); + CommandSerializers.durability.serialize(msg.durability, out); + } + + @Override + public SetShardDurable deserialize(DataInputPlus in) throws IOException + { + return new SetShardDurable(syncPoint.deserialize(in), + CommandSerializers.durability.deserialize(in)); + } + + @Override + public long serializedSize(SetShardDurable msg) + { + return syncPoint.serializedSize(msg.exclusiveSyncPoint) + + CommandSerializers.durability.serializedSize(msg.durability); + } + }; + + public static final UnversionedSerializer globallyDurable = new UnversionedSerializer<>() + { + @Override + public void serialize(SetGloballyDurable msg, DataOutputPlus out) throws IOException + { + CommandStoreSerializers.durableBefore.serialize(msg.durableBefore, out); + } + + @Override + public SetGloballyDurable deserialize(DataInputPlus in) throws IOException + { + return new SetGloballyDurable(CommandStoreSerializers.durableBefore.deserialize(in)); + } + + @Override + public long serializedSize(SetGloballyDurable msg) + { + return CommandStoreSerializers.durableBefore.serializedSize(msg.durableBefore); + } + }; + + public static final UnversionedSerializer syncPoint = new UnversionedSerializer<>() + { + @Override + public void serialize(SyncPoint sp, DataOutputPlus out) throws IOException + { + CommandSerializers.txnId.serialize(sp.syncId, out); + ExecuteAtSerializer.serialize(sp.syncId, sp.executeAt, out); + DepsSerializers.deps.serialize(sp.waitFor, out); + KeySerializers.fullRoute.serialize(sp.route, out); + } + + @Override + public SyncPoint deserialize(DataInputPlus in) throws IOException + { + TxnId syncId = CommandSerializers.txnId.deserialize(in); + Timestamp executeAt = ExecuteAtSerializer.deserialize(syncId, in); + Deps waitFor = DepsSerializers.deps.deserialize(in); + FullRoute route = KeySerializers.fullRoute.deserialize(in); + return SyncPoint.SerializationSupport.construct(syncId, executeAt, waitFor, route); + } + + @Override + public long serializedSize(SyncPoint sp) + { + return CommandSerializers.txnId.serializedSize(sp.syncId) + + ExecuteAtSerializer.serializedSize(sp.syncId, sp.executeAt) + + DepsSerializers.deps.serializedSize(sp.waitFor) + + KeySerializers.fullRoute.serializedSize(sp.route); + } + }; +} diff --git a/src/java/org/apache/cassandra/service/accord/serializers/TableMetadatas.java b/src/java/org/apache/cassandra/service/accord/serializers/TableMetadatas.java new file mode 100644 index 000000000000..6e24758961e8 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/serializers/TableMetadatas.java @@ -0,0 +1,415 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.serializers; + +import java.io.IOException; +import java.util.AbstractList; +import java.util.Arrays; +import java.util.Comparator; + +import accord.utils.Invariants; +import accord.utils.SortedArrays; +import org.apache.cassandra.db.TypeSizes; +import org.apache.cassandra.exceptions.UnknownTableException; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.schema.Schema; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.utils.btree.BTree; + +import static accord.utils.SortedArrays.Search.FAST; + +public abstract class TableMetadatas extends AbstractList +{ + private static final Comparator comparingId = Comparator.comparing(v -> ((TableMetadata) v).id); + public static class Collector extends AbstractSortedCollector + { + @Override + Comparator comparator() + { + return comparingId; + } + + @Override + Complete empty() + { + return TableMetadatas.none(); + } + + @Override + Complete of(TableMetadata one) + { + return TableMetadatas.of(one); + } + + @Override + Complete copy(Object[] array, int count) + { + TableMetadata[] result = new TableMetadata[count]; + System.arraycopy(array, 0, result, 0, count); + return TableMetadatas.ofSortedUnique(result); + } + + @Override + Complete copyBtree(Object[] btree, int count) + { + TableMetadata[] result = new TableMetadata[count]; + int i = 0; + for (TableMetadata v : BTree.iterable(btree)) + result[i++] = v; + return TableMetadatas.ofSortedUnique(result); + } + } + + public abstract int indexOf(TableMetadata find); + public abstract int indexOf(TableId find); + public abstract TableId get(TableId tableId); + + public abstract void serialize(TableMetadata table, DataOutputPlus out) throws IOException; + public abstract TableMetadata deserialize(DataInputPlus in) throws IOException; + public abstract long serializedSize(TableMetadata table); + + public abstract void serializeSelf(DataOutputPlus out) throws IOException; + public abstract long serializedSelfSize(); + + public static Complete none() + { + return Multi.NONE; + } + + public static Complete of(TableMetadata metadata) + { + return new One(metadata); + } + + public static Complete ofSortedUnique(TableMetadata ... metadatas) + { + if (metadatas.length == 0) + return none(); + if (metadatas.length == 1) + return new One(metadatas[0]); + Invariants.requireStrictlyOrdered(comparingId, metadatas); + return new Multi(metadatas); + } + + public static abstract class Complete extends TableMetadatas + { + public abstract TableMetadata getMetadata(TableId tableId); + } + + static class One extends Complete + { + final TableMetadata table; + + One(TableMetadata table) + { + this.table = table; + } + + @Override + public TableId get(int index) + { + Invariants.require(index == 0); + return table.id; + } + + @Override + public int size() + { + return 1; + } + + @Override + public int indexOf(TableMetadata find) + { + int c = find.id == table.id ? 0 : find.id.compareTo(table.id); + if (c == 0) return 0; + else if (c < 0) return -1; + else return -2; + } + + @Override + public void serialize(TableMetadata table, DataOutputPlus out) throws IOException + { + } + + @Override + public void serializeSelf(DataOutputPlus out) throws IOException + { + out.writeUnsignedVInt32(1); + table.id.serializeCompactComparable(out); + } + + @Override + public TableMetadata deserialize(DataInputPlus in) throws IOException + { + return table; + } + + @Override + public long serializedSize(TableMetadata table) + { + return 0; + } + + @Override + public long serializedSelfSize() + { + return TypeSizes.sizeofUnsignedVInt(1) + table.id.serializedCompactComparableSize(); + } + + @Override + public int indexOf(TableId tableId) + { + if (tableId.equals(table.id)) + return 0; + return -1; + } + + @Override + public TableId get(TableId tableId) + { + if (tableId.equals(table.id)) + return table.id; + return null; + } + + @Override + public TableMetadata getMetadata(TableId tableId) + { + if (tableId.equals(table.id)) + return table; + return null; + } + } + + static class Multi extends Complete + { + static final Complete NONE = new Multi(); + + final TableMetadata[] tables; + + Multi(TableMetadata ... tables) + { + this.tables = tables; + } + + @Override + public TableId get(int index) + { + return tables[index].id; + } + + @Override + public int size() + { + return tables.length; + } + + @Override + public int indexOf(TableMetadata find) + { + return Arrays.binarySearch(tables, find, comparingId); + } + + @Override + public int indexOf(TableId find) + { + return SortedArrays.binarySearch(tables, 0, tables.length, find, (id, metadata) -> id.compareTo(metadata.id), FAST); + } + + @Override + public void serialize(TableMetadata table, DataOutputPlus out) throws IOException + { + int i = indexOf(table); + if (i < 0) + throw new IllegalStateException("TableMetadata for " + table + " not found in " + this); + out.writeUnsignedVInt32(i); + } + + @Override + public void serializeSelf(DataOutputPlus out) throws IOException + { + out.writeUnsignedVInt32(tables.length); + for (TableMetadata table : tables) + table.id.serializeCompactComparable(out); + } + + @Override + public TableMetadata deserialize(DataInputPlus in) throws IOException + { + return tables[in.readUnsignedVInt32()]; + } + + @Override + public long serializedSize(TableMetadata table) + { + int i = indexOf(table); + if (i < 0) + throw new IllegalStateException("TableMetadata for " + table + " not found in " + this); + return TypeSizes.sizeofUnsignedVInt(indexOf(table)); + } + + @Override + public long serializedSelfSize() + { + long size = TypeSizes.sizeofUnsignedVInt(tables.length); + for (TableMetadata table : tables) + size += table.id.serializedCompactComparableSize(); + return size; + } + + @Override + public TableId get(TableId tableId) + { + int i = indexOf(tableId); + return i >= 0 ? tables[i].id : null; + } + + @Override + public TableMetadata getMetadata(TableId tableId) + { + int i = indexOf(tableId); + return i >= 0 ? tables[i] : null; + } + } + + static class WithUnknown extends TableMetadatas + { + final TableId[] ids; + final TableMetadata[] metadatas; + + WithUnknown(TableId[] ids, TableMetadata[] metadatas) + { + this.ids = ids; + this.metadatas = metadatas; + } + + @Override + public TableId get(int index) + { + return ids[index]; + } + + @Override + public int size() + { + return ids.length; + } + + @Override + public int indexOf(TableMetadata find) + { + return indexOf(find.id); + } + + @Override + public int indexOf(TableId find) + { + return Arrays.binarySearch(ids, find); + } + + @Override + public void serialize(TableMetadata table, DataOutputPlus out) throws IOException + { + int i = indexOf(table); + if (i < 0) + throw new IllegalStateException("TableMetadata for " + table + " not found in " + this); + out.writeUnsignedVInt32(i); + } + + @Override + public void serializeSelf(DataOutputPlus out) throws IOException + { + out.writeUnsignedVInt32(ids.length); + for (TableId id : ids) + id.serializeCompactComparable(out); + } + + @Override + public TableMetadata deserialize(DataInputPlus in) throws IOException + { + int index = in.readUnsignedVInt32(); + TableMetadata metadata = metadatas[index]; + if (metadata == null) + throw new UnknownTableException("Unknown table", ids[index]); + return metadata; + } + + @Override + public long serializedSize(TableMetadata table) + { + int i = indexOf(table); + if (i < 0) + throw new IllegalStateException("TableMetadata for " + table + " not found in " + this); + return TypeSizes.sizeofUnsignedVInt(indexOf(table)); + } + + @Override + public long serializedSelfSize() + { + long size = TypeSizes.sizeofUnsignedVInt(ids.length); + for (TableId id : ids) + size += id.serializedCompactComparableSize(); + return size; + } + + @Override + public TableId get(TableId tableId) + { + int index = indexOf(tableId); + return get(index); + } + } + + public static TableMetadatas deserializeSelf(DataInputPlus in) throws IOException + { + int count = in.readUnsignedVInt32(); + if (count == 0) + return none(); + if (count == 1) + { + TableId id = TableId.deserializeCompactComparable(in); + TableMetadata metadata = Schema.instance.getTableMetadata(id); + if (metadata == null) + return new WithUnknown(new TableId[] { id}, new TableMetadata[] { null }); + return new One(metadata); + } + TableId[] ids = null; + TableMetadata[] metadatas = new TableMetadata[count]; + int i; + for (i = 0 ; i < count ; ++i) + { + TableId id = TableId.deserializeCompactComparable(in); + TableMetadata metadata = Schema.instance.getTableMetadata(id); + metadatas[i] = metadata; + if (ids != null) ids[i] = id; + else if (metadata == null) + { + ids = new TableId[count]; + for (int j = 0 ; j < i ; ++j) + ids[j] = metadatas[j].id; + } + } + if (ids == null) + return new Multi(metadatas); + return new WithUnknown(ids, metadatas); + } +} diff --git a/src/java/org/apache/cassandra/service/accord/serializers/TableMetadatasAndKeys.java b/src/java/org/apache/cassandra/service/accord/serializers/TableMetadatasAndKeys.java new file mode 100644 index 000000000000..293264bfa511 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/serializers/TableMetadatasAndKeys.java @@ -0,0 +1,253 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.serializers; + +import java.io.IOException; +import java.util.Comparator; + +import accord.api.Key; +import accord.api.Sliceable; +import accord.primitives.Keys; +import accord.primitives.Participants; +import accord.primitives.Ranges; +import accord.primitives.Routable; +import accord.primitives.Seekable; +import accord.primitives.Seekables; +import accord.utils.Invariants; +import accord.utils.VIntCoding; +import org.apache.cassandra.db.DecoratedKey; +import org.apache.cassandra.io.UnversionedSerializer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.service.accord.api.PartitionKey; +import org.apache.cassandra.service.accord.serializers.TableMetadatas.Multi; +import org.apache.cassandra.utils.btree.BTreeSet; + +import static accord.primitives.Routable.Domain.Range; +import static accord.primitives.Routables.Slice.Minimal; + +public class TableMetadatasAndKeys extends IVersionedWithKeysSerializer.AbstractWithKeysSerializer implements Sliceable +{ + public static class KeyCollector extends AbstractSortedCollector + { + private static final Comparator comparator = Comparator.comparing(v -> ((PartitionKey) v)); + + public final TableMetadatas tables; + + public KeyCollector(TableMetadatas tables) + { + this.tables = tables; + } + + public TableMetadatasAndKeys buildTablesAndKeys() + { + return new TableMetadatasAndKeys(tables, build()); + } + + @Override + Comparator comparator() + { + return comparator; + } + + public PartitionKey collect(TableMetadata table, DecoratedKey key) + { + TableId tableId = tables.get(table.id); + if (count == 1) + { + PartitionKey one = (PartitionKey) buffer; + if (one.prefix() == table && one.partitionKey().equals(key)) + return one; + } + return collect(new PartitionKey(tableId, key)); + } + + @Override + Keys empty() + { + return Keys.EMPTY; + } + + @Override + Keys of(PartitionKey one) + { + return Keys.of(one); + } + + @Override + Keys copy(Object[] array, int count) + { + Key[] result = new Key[count]; + System.arraycopy(array, 0, result, 0, count); + return Keys.ofSortedUnique(result); + } + + @Override + Keys copyBtree(Object[] btree, int count) + { + return Keys.ofSortedUnique(new BTreeSet<>(btree, comparator())); + } + } + + private static final TableMetadatasAndKeys NO_KEYS = new TableMetadatasAndKeys(Multi.NONE, Keys.EMPTY); + private static final TableMetadatasAndKeys NO_RANGES = new TableMetadatasAndKeys(Multi.NONE, Ranges.EMPTY); + + public static TableMetadatasAndKeys none(Routable.Domain domain) + { + return domain.isKey() ? NO_KEYS : NO_RANGES; + } + + public final TableMetadatas tables; + public final Seekables keys; + + public TableMetadatasAndKeys(TableMetadatas tables, Seekables keys) + { + this.tables = tables; + this.keys = keys; + } + + public void serializeKeys(Keys keys, DataOutputPlus out) throws IOException + { + serializeSubsetInternal(keys, this.keys, out); + } + + public Keys deserializeKeys(DataInputPlus in) throws IOException + { + return (Keys)deserializeSubsetInternal(this.keys, in); + } + + public void serializeSeekable(Seekable seekable, DataOutputPlus out) throws IOException + { + int index = keys.indexOf(seekable); + if (index >= 0) out.writeUnsignedVInt32(1 + index); + else + { + Invariants.require(seekable.domain() == Range); + out.writeUnsignedVInt32(0); + KeySerializers.seekable.serialize(seekable, out); + } + } + + public void serializeKey(PartitionKey key, DataOutputPlus out) throws IOException + { + int index = keys.indexOf(key); + Invariants.require(index >= 0); + out.writeUnsignedVInt32(index); + } + + public Seekable deserializeSeekable(DataInputPlus in) throws IOException + { + int offset = in.readUnsignedVInt32(); + Seekable key; + if (offset > 0) key = (Seekable) keys.get(offset - 1); + else key = KeySerializers.seekable.deserialize(in); + return key; + } + + public PartitionKey deserializeKey(DataInputPlus in) throws IOException + { + int offset = in.readUnsignedVInt32(); + return (PartitionKey) keys.get(offset); + } + + public long serializedKeysSize(Keys keys) + { + return serializedSubsetSizeInternal(keys, this.keys); + } + + public long serializedSeekableSize(Seekable seekable) + { + int i = keys.indexOf(seekable); + Invariants.require(i >= 0 || seekable.domain() == Range); + return VIntCoding.sizeOfUnsignedVInt(1 + i); + } + + public long serializedKeySize(PartitionKey key) + { + int i = keys.indexOf(key); + Invariants.require(i >= 0); + return VIntCoding.sizeOfUnsignedVInt(i); + } + + public TableMetadatasAndKeys slice(Ranges ranges) + { + return new TableMetadatasAndKeys(tables, keys.slice(ranges, Minimal)); + } + + @Override + public TableMetadatasAndKeys intersecting(Participants participants) + { + return new TableMetadatasAndKeys(tables, keys.intersecting(participants, Minimal)); + } + + @Override + public TableMetadatasAndKeys merge(TableMetadatasAndKeys merge) + { + Invariants.require(tables.equals(merge.tables)); + return new TableMetadatasAndKeys(tables, keys.with(merge.keys)); + } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + TableMetadatasAndKeys that = (TableMetadatasAndKeys) o; + return tables.equals(that.tables) && keys.equals(that.keys); + } + + @Override + public int hashCode() + { + throw new UnsupportedOperationException(); + } + + @Override + public String toString() + { + return "{tables=" + tables + ",keys=" + keys + '}'; + } + + public static final UnversionedSerializer serializer = new UnversionedSerializer<>() + { + @Override + public void serialize(TableMetadatasAndKeys tablesAndKeys, DataOutputPlus out) throws IOException + { + tablesAndKeys.tables.serializeSelf(out); + KeySerializers.seekables.serialize(tablesAndKeys.keys, out); + } + + @Override + public TableMetadatasAndKeys deserialize(DataInputPlus in) throws IOException + { + TableMetadatas tables = TableMetadatas.deserializeSelf(in); + Seekables keys = KeySerializers.seekables.deserialize(in); + return new TableMetadatasAndKeys(tables, keys); + } + + @Override + public long serializedSize(TableMetadatasAndKeys tablesAndKeys) + { + return tablesAndKeys.tables.serializedSelfSize() + + KeySerializers.seekables.serializedSize(tablesAndKeys.keys); + } + }; +} diff --git a/src/java/org/apache/cassandra/service/accord/serializers/TopologySerializers.java b/src/java/org/apache/cassandra/service/accord/serializers/TopologySerializers.java new file mode 100644 index 000000000000..0fe912335cbd --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/serializers/TopologySerializers.java @@ -0,0 +1,162 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.serializers; + +import java.io.IOException; +import java.nio.ByteBuffer; + +import accord.local.Node; +import accord.primitives.Range; +import accord.topology.Shard; +import accord.topology.Topology; +import accord.utils.SortedArrays.SortedArrayList; +import accord.utils.TinyEnumSet; +import org.apache.cassandra.db.TypeSizes; +import org.apache.cassandra.db.marshal.ValueAccessor; +import org.apache.cassandra.io.UnversionedSerializer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.service.accord.TokenRange; +import org.apache.cassandra.utils.ArraySerializers; +import org.apache.cassandra.utils.CollectionSerializers; + +public class TopologySerializers +{ + private TopologySerializers() {} + + public static final NodeIdSerializer nodeId = new NodeIdSerializer(); + public static class NodeIdSerializer implements UnversionedSerializer + { + private NodeIdSerializer() {} + + @Override + public void serialize(Node.Id id, DataOutputPlus out) throws IOException + { + out.writeInt(id.id); + } + + public int serialize(Node.Id id, V dst, ValueAccessor accessor, int offset) + { + return accessor.putInt(dst, offset, id.id); + } + + public void serialize(Node.Id id, ByteBuffer out) + { + out.putInt(id.id); + } + + @Override + public Node.Id deserialize(DataInputPlus in) throws IOException + { + return new Node.Id(in.readInt()); + } + + public Node.Id deserialize(V src, ValueAccessor accessor, int offset) + { + return new Node.Id(accessor.getInt(src, offset)); + } + + public Node.Id deserialize(ByteBuffer src, int position) + { + return new Node.Id(src.getInt(position)); + } + + @Override + public long serializedSize(Node.Id id) + { + return TypeSizes.INT_SIZE; // id.id + } + } + + public static final UnversionedSerializer shard = new ShardSerializer((UnversionedSerializer) + (UnversionedSerializer) + TokenRange.serializer); + + public static class ShardSerializer implements UnversionedSerializer + { + protected UnversionedSerializer range; + + public ShardSerializer(UnversionedSerializer range) + { + this.range = range; + } + + @Override + public void serialize(Shard shard, DataOutputPlus out) throws IOException + { + range.serialize(shard.range, out); + CollectionSerializers.serializeList(shard.nodes, out, nodeId); + CollectionSerializers.serializeList(shard.notInFastPath, out, nodeId); + CollectionSerializers.serializeList(shard.joining, out, nodeId); + out.writeUnsignedVInt32(shard.flags().bitset()); + } + + @Override + public Shard deserialize(DataInputPlus in) throws IOException + { + Range range = ShardSerializer.this.range.deserialize(in); + SortedArrayList nodes = CollectionSerializers.deserializeSortedArrayList(in, nodeId, Node.Id[]::new); + SortedArrayList notInFastPath = CollectionSerializers.deserializeSortedArrayList(in, nodeId, Node.Id[]::new); + SortedArrayList joining = CollectionSerializers.deserializeSortedArrayList(in, nodeId, Node.Id[]::new); + int flags = in.readUnsignedVInt32(); + return Shard.SerializerSupport.create(range, nodes, notInFastPath, joining, new TinyEnumSet<>(flags)); + } + + @Override + public long serializedSize(Shard shard) + { + long size = range.serializedSize(shard.range); + size += CollectionSerializers.serializedListSize(shard.nodes, nodeId); + size += CollectionSerializers.serializedListSize(shard.notInFastPath, nodeId); + size += CollectionSerializers.serializedListSize(shard.joining, nodeId); + size += TypeSizes.sizeofUnsignedVInt(shard.flags().bitset()); + return size; + } + } + + public static final UnversionedSerializer topology = new UnversionedSerializer<>() + { + @Override + public void serialize(Topology topology, DataOutputPlus out) throws IOException + { + out.writeLong(topology.epoch()); + CollectionSerializers.serializeList(topology.shards(), out, shard); + CollectionSerializers.serializeCollection(topology.staleIds(), out, TopologySerializers.nodeId); + } + + @Override + public Topology deserialize(DataInputPlus in) throws IOException + { + long epoch = in.readLong(); + Shard[] shards = ArraySerializers.deserializeArray(in, shard, Shard[]::new); + SortedArrayList staleIds = CollectionSerializers.deserializeSortedArrayList(in, TopologySerializers.nodeId, Node.Id[]::new); + return new Topology(epoch, staleIds, shards); + } + + @Override + public long serializedSize(Topology topology) + { + long size = 0; + size += TypeSizes.LONG_SIZE; // epoch + size += CollectionSerializers.serializedListSize(topology.shards(), shard); + size += CollectionSerializers.serializedCollectionSize(topology.staleIds(), TopologySerializers.nodeId); + return size; + } + }; +} diff --git a/src/java/org/apache/cassandra/service/accord/serializers/TxnRequestSerializer.java b/src/java/org/apache/cassandra/service/accord/serializers/TxnRequestSerializer.java new file mode 100644 index 000000000000..fe2cbe26136b --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/serializers/TxnRequestSerializer.java @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.serializers; + +import java.io.IOException; + +import accord.messages.TxnRequest; +import accord.primitives.Route; +import accord.primitives.TxnId; +import org.apache.cassandra.db.TypeSizes; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; + +public abstract class TxnRequestSerializer> implements IVersionedSerializer +{ + void serializeHeader(T msg, DataOutputPlus out, Version version) throws IOException + { + CommandSerializers.txnId.serialize(msg.txnId, out); + KeySerializers.route.serialize(msg.scope, out); + out.writeUnsignedVInt(msg.waitForEpoch); + } + + public abstract void serializeBody(T msg, DataOutputPlus out, Version version) throws IOException; + + @Override + public final void serialize(T msg, DataOutputPlus out, Version version) throws IOException + { + serializeHeader(msg, out, version); + serializeBody(msg, out, version); + } + + public abstract T deserializeBody(DataInputPlus in, Version version, TxnId txnId, Route scope, long waitForEpoch) throws IOException; + + @Override + public final T deserialize(DataInputPlus in, Version version) throws IOException + { + TxnId txnId = CommandSerializers.txnId.deserialize(in); + Route scope = KeySerializers.route.deserialize(in); + // TODO (desired): there should be a base epoch + long waitForEpoch = in.readUnsignedVInt(); + return deserializeBody(in, version, txnId, scope, waitForEpoch); + } + + long serializedHeaderSize(T msg, Version version) + { + return CommandSerializers.txnId.serializedSize(msg.txnId) + + KeySerializers.route.serializedSize(msg.scope()) + + TypeSizes.sizeofUnsignedVInt(msg.waitForEpoch); + } + + public abstract long serializedBodySize(T msg, Version version); + + @Override + public final long serializedSize(T msg, Version version) + { + return serializedHeaderSize(msg, version) + serializedBodySize(msg, version); + } + + public static abstract class WithUnsyncedSerializer> extends TxnRequestSerializer + { + @Override + void serializeHeader(T msg, DataOutputPlus out, Version version) throws IOException + { + super.serializeHeader(msg, out, version); + out.writeUnsignedVInt(msg.minEpoch); + } + + public abstract T deserializeBody(DataInputPlus in, Version version, TxnId txnId, Route scope, long waitForEpoch, long minEpoch) throws IOException; + + @Override + public final T deserializeBody(DataInputPlus in, Version version, TxnId txnId, Route scope, long waitForEpoch) throws IOException + { + long minEpoch = in.readUnsignedVInt(); + return deserializeBody(in, version, txnId, scope, waitForEpoch, minEpoch); + } + + @Override + long serializedHeaderSize(T msg, Version version) + { + long size = super.serializedHeaderSize(msg, version); + size += TypeSizes.sizeofUnsignedVInt(msg.minEpoch); + return size; + } + } +} diff --git a/src/java/org/apache/cassandra/service/accord/serializers/TxnSerializer.java b/src/java/org/apache/cassandra/service/accord/serializers/TxnSerializer.java new file mode 100644 index 000000000000..cd3819350c16 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/serializers/TxnSerializer.java @@ -0,0 +1,25 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.serializers; + +import org.apache.cassandra.io.ParameterisedVersionedSerializer; + +public interface TxnSerializer extends ParameterisedVersionedSerializer +{ +} diff --git a/src/java/org/apache/cassandra/service/accord/serializers/Version.java b/src/java/org/apache/cassandra/service/accord/serializers/Version.java new file mode 100644 index 000000000000..45dfa09412f5 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/serializers/Version.java @@ -0,0 +1,134 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.serializers; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +import org.apache.cassandra.db.TypeSizes; +import org.apache.cassandra.io.MessageVersionProvider; +import org.apache.cassandra.io.UnversionedSerializer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.net.MessagingService; + +public enum Version implements MessageVersionProvider +{ + // If MessagingService version bumps, this mapping does not need to be updated; only updates needed are those that + // include accord serializer changes. + V1(1, MessagingService.Version.VERSION_51); + + public static final Version LATEST = Version.V1; + /** + * Version that should be used for disk serialization where downgrade may be possible. + * + * As of this writing only 1 version exists, so this is the same as LATEST... Once v2 comes into the picture we need this version to be the oldest version needed for downgrade... If you upgrade from 5.1 to 5.2 (assuming this adds a v2) you need a version that works with 5.1 here. + */ + public static final Version DOWNGRADE_SAFE_VERSION = Version.V1; + + /** + * Version that should be used for messaging serialization where mixed versions may be possible. + * + * As of this writing only 1 version exists, so this is the same as LATEST... Once v2 comes into the picture we need this version to be the oldest version needed for downgrade... If you upgrade from 5.1 to 5.2 (assuming this adds a v2) you need a version that works with 5.1 here. + */ + public static final Version CLUSTER_SAFE_VERSION = Version.V1; + + /** + * Version number used in the serialization protocol. This is not the same as the messaging version, and is localized to this class. + */ + public final int version; + /** + * For the accord versioned serializers they sometimes need to access existing messaging serializers, in these cases an agreed messaging version is required and can not be plumbed directly from the messaging layer. + * + * @see #messageVersion() + */ + private final MessagingService.Version messagingVersion; + + Version(int version, MessagingService.Version messagingVersion) + { + this.version = version; + this.messagingVersion = messagingVersion; + } + + public static Version fromVersion(int version) + { + switch (version) + { + case 1: return V1; + default: + throw new IllegalArgumentException("Unknown version: " + version); + } + } + + public static Version findBestMatchForMessagingVersion(int messagingVersion) + { + Version[] versions = values(); + for (int i = versions.length - 1; i >= 0; i--) + { + Version v = versions[i]; + // If network version bumped (12 to 13), the accord serializers may not have been changed; use the largest + // version smaller than or equal to this version + if (v.messageVersion() <= messagingVersion) + return v; + } + throw new IllegalArgumentException("Attempted to use message version " + messagingVersion + " which is smaller than " + versions[0] + " can handle (" + versions[0].messageVersion() + ")"); + } + + @Override + public int messageVersion() + { + return messagingVersion.value; + } + + public List greaterThanOrEqual() + { + Version[] all = Version.values(); + if (ordinal() == all.length - 1) + return Collections.singletonList(this); + List values = new ArrayList<>(all.length - ordinal()); + for (int i = ordinal(); i < all.length; i++) + values.add(all[i]); + return values; + } + + public enum Serializer implements UnversionedSerializer + { + instance; + + @Override + public void serialize(Version t, DataOutputPlus out) throws IOException + { + out.writeUnsignedVInt32(t.version); + } + + @Override + public Version deserialize(DataInputPlus in) throws IOException + { + return Version.fromVersion(in.readUnsignedVInt32()); + } + + @Override + public long serializedSize(Version t) + { + return TypeSizes.sizeofUnsignedVInt(t.version); + } + } +} diff --git a/src/java/org/apache/cassandra/service/accord/serializers/WaitingOnSerializer.java b/src/java/org/apache/cassandra/service/accord/serializers/WaitingOnSerializer.java new file mode 100644 index 000000000000..af7e5d66bb41 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/serializers/WaitingOnSerializer.java @@ -0,0 +1,149 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.serializers; + +import java.io.IOException; + +import accord.impl.CommandChange.WaitingOnProvider; +import accord.local.Command; +import accord.local.Command.WaitingOn; +import accord.primitives.PartialDeps; +import accord.primitives.RangeDeps; +import accord.primitives.RoutingKeys; +import accord.primitives.Timestamp; +import accord.primitives.TxnId; +import accord.utils.ImmutableBitSet; +import accord.utils.Invariants; +import accord.utils.SimpleBitSet; +import org.apache.cassandra.db.TypeSizes; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; + +import static accord.primitives.Routable.Domain.Key; +import static accord.primitives.Routable.Domain.Range; + +public class WaitingOnSerializer +{ + public static void serializeBitSetsOnly(TxnId txnId, WaitingOn waitingOn, DataOutputPlus out) throws IOException + { + Invariants.require(txnId.is(Key) == (waitingOn.appliedOrInvalidated == null)); + int keyCount = waitingOn.keys.size(); + int txnIdCount = waitingOn.txnIdCount(); + int waitingOnLength = (txnIdCount + keyCount + 63) / 64; + out.writeUnsignedVInt32(waitingOnLength); + serialize(waitingOnLength, waitingOn.waitingOn, out); + + if (txnId.is(Range)) + { + int appliedOrInvalidatedLength = (txnIdCount + 63) / 64; + out.writeUnsignedVInt32(waitingOnLength - appliedOrInvalidatedLength); + serialize(appliedOrInvalidatedLength, waitingOn.appliedOrInvalidated, out); + } + } + + public static final class Provider implements WaitingOnProvider + { + final ImmutableBitSet waitingOn, appliedOrInvalidated; + final int waitingOnLength, appliedOrInvalidatedLength; + + public Provider(ImmutableBitSet waitingOn, ImmutableBitSet appliedOrInvalidated, int waitingOnLength, int appliedOrInvalidatedLength) + { + this.waitingOn = waitingOn; + this.appliedOrInvalidated = appliedOrInvalidated; + this.waitingOnLength = waitingOnLength; + this.appliedOrInvalidatedLength = appliedOrInvalidatedLength; + } + + @Override + public WaitingOn provide(TxnId txnId, PartialDeps deps, Timestamp executeAtLeast, long uniqueHlc) + { + Invariants.nonNull(deps); + RoutingKeys keys = deps.keyDeps.keys(); + RangeDeps directRangeDeps = deps.rangeDeps; + int txnIdCount = directRangeDeps.txnIdCount(); + Invariants.require(waitingOn.size()/64 == (txnIdCount + keys.size() + 63) / 64); + Invariants.require(appliedOrInvalidated == null || (appliedOrInvalidated.size()/64 == (txnIdCount + 63)/64)); + + WaitingOn result = new WaitingOn(keys, directRangeDeps, waitingOn, appliedOrInvalidated); + if (executeAtLeast != null) return new Command.WaitingOnWithExecuteAt(result, executeAtLeast); + else if (uniqueHlc != 0) return new Command.WaitingOnWithMinUniqueHlc(result, uniqueHlc); + return result; + } + + public void reserialize(DataOutputPlus out) throws IOException + { + out.writeUnsignedVInt32(waitingOnLength); + serialize(waitingOnLength, waitingOn, out); + if (appliedOrInvalidated != null) + { + out.writeUnsignedVInt32(waitingOnLength - appliedOrInvalidatedLength); + serialize(appliedOrInvalidatedLength, appliedOrInvalidated, out); + } + } + } + + public static WaitingOnProvider deserializeProvider(TxnId txnId, DataInputPlus in) throws IOException + { + ImmutableBitSet waitingOn, appliedOrInvalidated = null; + int waitingOnLength, appliedOrInvalidatedLength = 0; + waitingOnLength = in.readUnsignedVInt32(); + waitingOn = deserialize(waitingOnLength, in); + if (txnId.is(Range)) + { + appliedOrInvalidatedLength = waitingOnLength - in.readUnsignedVInt32(); + appliedOrInvalidated = deserialize(appliedOrInvalidatedLength, in); + } + + return new Provider(waitingOn, appliedOrInvalidated, waitingOnLength, appliedOrInvalidatedLength); + } + + public static void skip(TxnId txnId, DataInputPlus in) throws IOException + { + int waitingOnLength = in.readUnsignedVInt32(); + in.skipBytesFully(waitingOnLength * 8); + if (txnId.is(Range)) + { + int delta = in.readUnsignedVInt32(); + in.skipBytesFully((waitingOnLength - delta) * 8); + } + } + + private static void serialize(int length, SimpleBitSet write, DataOutputPlus out) throws IOException + { + long[] bits = SimpleBitSet.SerializationSupport.getArray(write); + Invariants.require(length == bits.length); + for (int i = 0; i < length; i++) + out.writeLong(bits[i]); + } + + private static ImmutableBitSet deserialize(int length, DataInputPlus in) throws IOException + { + long[] bits = new long[length]; + for (int i = 0 ; i < length ; ++i) + bits[i] = in.readLong(); + return ImmutableBitSet.SerializationSupport.construct(bits); + } + + public static long serializedSize(int length, SimpleBitSet write) + { + long[] bits = SimpleBitSet.SerializationSupport.getArray(write); + Invariants.require(length == bits.length, "Expected length %d != %d", length, bits.length); + return (long) TypeSizes.LONG_SIZE * length; + } +} diff --git a/src/java/org/apache/cassandra/service/accord/txn/AbstractKeySorted.java b/src/java/org/apache/cassandra/service/accord/txn/AbstractKeySorted.java new file mode 100644 index 000000000000..ac605b33ab08 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/txn/AbstractKeySorted.java @@ -0,0 +1,251 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.txn; + +import java.util.Arrays; +import java.util.Iterator; +import java.util.List; +import java.util.Objects; +import java.util.function.Consumer; +import java.util.stream.Collectors; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.Iterators; + +import accord.api.Key; +import accord.primitives.Keys; +import accord.primitives.Range; +import accord.primitives.Ranges; +import accord.primitives.Routable.Domain; +import accord.primitives.Seekable; +import accord.primitives.Seekables; +import org.apache.cassandra.service.accord.TokenRange; +import org.apache.cassandra.service.accord.api.PartitionKey; + +/** + * Immutable collection of items, sorted first by their partition key + */ +public abstract class AbstractKeySorted implements Iterable +{ + public static final String ITEMS_OUT_OF_ORDER_MESSAGE = "Items are out of order ([%s] %s >= [%s] %s)"; + + protected final Seekables itemKeys; + protected final T[] items; + + public AbstractKeySorted(T[] items, Domain domain) + { + this.items = items; + this.itemKeys = extractItemKeys(domain); + } + + public AbstractKeySorted(List items, Domain domain) + { + T[] arr = newArray(items.size()); + items.toArray(arr); + this.items = arr; + switch (domain) + { + case Key: + Arrays.sort(arr, this::compareKey); + break; + case Range: + Arrays.sort(arr, this::compareRange); + break; + default: + throw new IllegalStateException("Unhandled domain " + domain); + } + this.itemKeys = extractItemKeys(domain); + } + + private Seekables extractItemKeys(Domain domain) + { + switch (domain) + { + case Key: + if (items.length == 0) + return Keys.EMPTY; + PartitionKey[] keys = new PartitionKey[items.length]; + for (int i = 0 ; i < keys.length; i++) + keys[i] = (PartitionKey)getKey(items[i]); + return Keys.ofSorted(keys); + case Range: + if (items.length == 0) + return Ranges.EMPTY; + TokenRange[] ranges = new TokenRange[items.length]; + for (int i = 0 ; i < ranges.length; i++) + ranges[i] = (TokenRange)getKey(items[i]); + return Ranges.ofSortedAndDeoverlapped(ranges); + default: + throw new IllegalStateException("Unhandled domain " + domain); + } + } + + @Override + public Iterator iterator() + { + return Iterators.forArray(items); + } + + @Override + public String toString() + { + return getClass().getSimpleName() + Arrays.stream(items) + .map(Objects::toString) + .collect(Collectors.joining(", ", "{", "}")); + } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + AbstractKeySorted that = (AbstractKeySorted) o; + return Arrays.equals(items, that.items); + } + + @Override + public int hashCode() + { + return Arrays.hashCode(items); + } + + @VisibleForTesting + public Seekables keys() + { + return itemKeys; + } + + public T get(int index) + { + return items[index]; + } + + /** + * Compare the non-key component of items (since this class handles sorting by key) + */ + abstract int compareNonKeyFields(T left, T right); + + abstract Seekable getKey(T item); + abstract T[] newArray(int size); + + public int compareKey(T left, T right) + { + int cmp = ((PartitionKey)getKey(left)).compareTo(((PartitionKey)getKey(right))); + return cmp != 0 ? cmp : compareNonKeyFields(left, right); + } + + public int compareRange(T left, T right) + { + int cmp = ((TokenRange)getKey(left)).compareTo(((TokenRange)getKey(right))); + return cmp != 0 ? cmp : compareNonKeyFields(left, right); + } + + @VisibleForTesting + void validateOrder() + { + Domain domain = getKey(items[0]).domain(); + switch (domain) + { + case Key: + for (int i = 1; i < items.length; i++) + { + T prev = items[i-1]; + T next = items[i]; + + if (compareKey(prev, next) >= 0) + throw new IllegalStateException(String.format(ITEMS_OUT_OF_ORDER_MESSAGE, i - 1, prev, i, next)); + } + break; + case Range: + for (int i = 1; i < items.length; i++) + { + T prev = items[i-1]; + T next = items[i]; + + if (compareRange(prev, next) >= 0) + throw new IllegalStateException(String.format(ITEMS_OUT_OF_ORDER_MESSAGE, i - 1, prev, i, next)); + } + break; + default: + throw new IllegalStateException("Unhandled domain " + domain); + } + } + + public int size() + { + return items.length; + } + + public void forEachWithKey(Seekable key, Consumer consumer) + { + switch (key.domain()) + { + case Key: + for (int i = firstPossibleKeyIdx((PartitionKey) key); i < items.length; i++) + { + Key itemKey = (Key)getKey(items[i]); + if (key.equals(itemKey)) + consumer.accept(items[i]); + else + break; + } + break; + case Range: + TokenRange range = (TokenRange) key; + for (int i = firstPossibleRangeIdx(range); i < items.length; i++) + { + Range itemRange = (Range) getKey(items[i]); + if (range.compareIntersecting(itemRange) == 0) + consumer.accept(items[i]); + else + break; + } + break; + default: + throw new IllegalStateException("Unhandled domain " + key.domain()); + } + } + + private int firstPossibleRangeIdx(TokenRange range) + { + int idx = Arrays.binarySearch(items, range, (l, r) -> { + Range itemRange = (Range)getKey((T) l); + if (itemRange.compareIntersecting((TokenRange)r) == 0) + return 1; + if (((TokenRange) r).end().compareTo(itemRange.end()) > 0) + return 1; + else + return -1; + }); + + return -1 - idx; + } + + private int firstPossibleKeyIdx(PartitionKey key) + { + int idx = Arrays.binarySearch(items, key, (l, r) -> { + PartitionKey lk = (PartitionKey) getKey((T) l); + PartitionKey rk = (PartitionKey) r; + int cmp = lk.compareTo(rk); + return cmp != 0 ? cmp : 1; + }); + + return -1 - idx; + } +} diff --git a/src/java/org/apache/cassandra/service/accord/txn/AbstractSerialized.java b/src/java/org/apache/cassandra/service/accord/txn/AbstractSerialized.java new file mode 100644 index 000000000000..ac172a221bbb --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/txn/AbstractSerialized.java @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.txn; + +import java.nio.ByteBuffer; +import java.util.Objects; +import javax.annotation.Nonnull; +import javax.annotation.Nullable; +import javax.annotation.concurrent.NotThreadSafe; + +import accord.utils.Invariants; +import org.apache.cassandra.service.accord.serializers.Version; + +/** + * Item that is serialized by default + */ +@NotThreadSafe +public abstract class AbstractSerialized +{ + private @Nullable final ByteBuffer latestVersionBytes; + private transient @Nullable T memoized = null; + + protected AbstractSerialized(@Nullable ByteBuffer latestVersionBytes) + { + this.latestVersionBytes = latestVersionBytes; + } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || (o.getClass() != getClass())) return false; + + AbstractSerialized that = (AbstractSerialized) o; + return Objects.equals(latestVersionBytes, that.latestVersionBytes); + } + + @Override + public int hashCode() + { + return latestVersionBytes != null ? latestVersionBytes.hashCode() : 0; + } + + public abstract long estimatedSizeOnHeap(); + protected abstract ByteBuffer serialize(T value, P param, Version version); + protected abstract ByteBuffer reserialize(ByteBuffer bytes, P param, Version srcVersion, Version trgVersion); + protected abstract T deserialize(P param, ByteBuffer bytes, Version version); + + protected boolean isNull() + { + return latestVersionBytes == null; + } + + @Nullable + protected T deserialize(P param) + { + T result = memoized; + if (result == null && latestVersionBytes != null) + memoized = result = deserialize(param, latestVersionBytes, Version.LATEST); + return result; + } + + public void unmemoize() + { + memoized = null; + } + + @Nullable + protected ByteBuffer unsafeBytes() + { + return latestVersionBytes; + } + + @Nonnull + protected ByteBuffer bytes(P param, Version target) + { + Invariants.nonNull(latestVersionBytes); + if (Version.LATEST == target) + return latestVersionBytes; + return reserialize(latestVersionBytes, param, Version.LATEST, target); + } +} diff --git a/src/java/org/apache/cassandra/service/accord/txn/AccordUpdate.java b/src/java/org/apache/cassandra/service/accord/txn/AccordUpdate.java new file mode 100644 index 000000000000..3a9d1bbe7b31 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/txn/AccordUpdate.java @@ -0,0 +1,125 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.txn; + +import java.io.IOException; +import javax.annotation.Nullable; + +import accord.api.Data; +import accord.api.Update; +import org.apache.cassandra.db.ConsistencyLevel; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.service.accord.serializers.TableMetadatasAndKeys; +import org.apache.cassandra.service.accord.serializers.TxnSerializer; +import org.apache.cassandra.service.accord.serializers.Version; + +public abstract class AccordUpdate implements Update +{ + public enum Kind + { + TXN(0), + UNRECOVERABLE_REPAIR(1), + NONE(2), + ; + + int val; + + Kind(int val) + { + this.val = val; + } + + public static Kind valueOf(int val) + { + switch(val) + { + case 0: + return TXN; + case 1: + return UNRECOVERABLE_REPAIR; + default: + throw new IllegalArgumentException("Unrecognized AccordUpdate.Kind value " + val); + } + } + } + + public static Kind kind(@Nullable Update update) + { + if (update == null) + return Kind.NONE; + return ((AccordUpdate)update).kind(); + } + + public boolean checkCondition(Data data) + { + throw new UnsupportedOperationException(); + } + + public abstract ConsistencyLevel cassandraCommitCL(); + + public abstract Kind kind(); + + public abstract long estimatedSizeOnHeap(); + + public interface AccordUpdateSerializer extends TxnSerializer + { + } + + private static AccordUpdateSerializer serializerFor(AccordUpdate toSerialize) + { + return serializerFor(toSerialize.kind()); + } + + private static AccordUpdateSerializer serializerFor(Kind kind) + { + switch (kind) + { + case TXN: + return TxnUpdate.serializer; + case UNRECOVERABLE_REPAIR: + return UnrecoverableRepairUpdate.serializer; + default: + throw new IllegalStateException("Unsupported AccordUpdate Kind " + kind); + } + } + + public static final AccordUpdateSerializer serializer = new AccordUpdateSerializer<>() + { + @Override + public void serialize(AccordUpdate update, TableMetadatasAndKeys tablesAndKeys, DataOutputPlus out, Version version) throws IOException + { + out.writeByte(update.kind().val); + serializerFor(update).serialize(update, tablesAndKeys, out, version); + } + + @Override + public AccordUpdate deserialize(TableMetadatasAndKeys tablesAndKeys, DataInputPlus in, Version version) throws IOException + { + Kind kind = Kind.valueOf(in.readByte()); + return (AccordUpdate) serializerFor(kind).deserialize(tablesAndKeys, in, version); + } + + @Override + public long serializedSize(AccordUpdate update, TableMetadatasAndKeys tablesAndKeys, Version version) + { + return 1 + serializerFor(update).serializedSize(update, tablesAndKeys, version); + } + }; +} \ No newline at end of file diff --git a/src/java/org/apache/cassandra/service/accord/txn/AccordUpdateParameters.java b/src/java/org/apache/cassandra/service/accord/txn/AccordUpdateParameters.java new file mode 100644 index 000000000000..efb222b6557f --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/txn/AccordUpdateParameters.java @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.txn; + +import java.util.Collections; +import java.util.Map; + +import com.google.common.collect.ImmutableMap; + +import org.apache.cassandra.cql3.QueryOptions; +import org.apache.cassandra.cql3.UpdateParameters; +import org.apache.cassandra.db.DecoratedKey; +import org.apache.cassandra.db.marshal.TimeUUIDType; +import org.apache.cassandra.db.partitions.Partition; +import org.apache.cassandra.exceptions.InvalidRequestException; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.service.ClientState; +import org.apache.cassandra.service.paxos.Ballot; +import org.apache.cassandra.utils.TimeUUID; + +import static com.google.common.base.Preconditions.checkState; +import static java.util.concurrent.TimeUnit.MICROSECONDS; + +public class AccordUpdateParameters +{ + private final TxnData data; + private final QueryOptions options; + private final long timestamp; + + public AccordUpdateParameters(TxnData data, QueryOptions options, long timestamp) + { + this.data = data; + this.options = options; + this.timestamp = timestamp; + } + + static class RowUpdateParameters extends UpdateParameters + { + private long timeUuidNanos; + + public RowUpdateParameters(TableMetadata metadata, ClientState clientState, QueryOptions options, long timestamp, long nowInSec, int ttl, Map prefetchedRows) throws InvalidRequestException + { + super(metadata, clientState, options, timestamp, nowInSec, ttl, prefetchedRows); + } + + @Override + public byte[] nextTimeUUIDAsBytes() + { + return TimeUUID.toBytes(Ballot.unixMicrosToMsb(timestamp), TimeUUIDType.signedBytesToNativeLong(timeUuidNanos++)); + } + } + + public TxnData getData() + { + return data; + } + + public UpdateParameters updateParameters(TableMetadata metadata, DecoratedKey dk, int rowIndex) + { + // This is currently only used by Guardrails, but this logically have issues with Accord as drifts in config + // values could cause unexpected issues in Accord. (ex. some nodes reject writes while others accept) + // For the time being, guardrails are disabled for Accord queries. + ClientState disabledGuardrails = null; + + int ttl = metadata.params.defaultTimeToLive; + return new RowUpdateParameters(metadata, + disabledGuardrails, + options, + timestamp, + MICROSECONDS.toSeconds(timestamp), + ttl, + prefetchRow(metadata, dk, rowIndex)); + } + + private Map prefetchRow(TableMetadata metadata, DecoratedKey dk, int index) + { + if (data != null) + { + for (Map.Entry e : data.entrySet()) + { + int name = e.getKey(); + TxnDataKeyValue value = (TxnDataKeyValue)e.getValue(); + switch (TxnData.txnDataNameKind(name)) + { + case CAS_READ: + checkState(data.entrySet().size() == 1, "CAS read should only have one entry"); + return ImmutableMap.of(dk, value); + case AUTO_READ: + if (TxnData.txnDataNameIndex(name) == index) + return ImmutableMap.of(dk, value); + default: + } + } + } + return Collections.emptyMap(); + } +} diff --git a/src/java/org/apache/cassandra/service/accord/txn/RetryWithNewProtocolResult.java b/src/java/org/apache/cassandra/service/accord/txn/RetryWithNewProtocolResult.java new file mode 100644 index 000000000000..5f80e304e403 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/txn/RetryWithNewProtocolResult.java @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.txn; + +/** + * Potentially returned by any transaction that tries to execute in an Epoch + * where the range has migrated away from Accord + */ +public class RetryWithNewProtocolResult implements TxnResult +{ + public static final RetryWithNewProtocolResult instance = new RetryWithNewProtocolResult(); + + private RetryWithNewProtocolResult() + { + } + + @Override + public Kind kind() + { + return Kind.retry_new_protocol; + } + + @Override + public long estimatedSizeOnHeap() + { + return 0; + } +} diff --git a/src/java/org/apache/cassandra/service/accord/txn/TxnCondition.java b/src/java/org/apache/cassandra/service/accord/txn/TxnCondition.java new file mode 100644 index 000000000000..881a3321621d --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/txn/TxnCondition.java @@ -0,0 +1,673 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.txn; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Collection; +import java.util.List; +import java.util.Objects; +import java.util.Set; +import javax.annotation.Nonnull; +import javax.annotation.Nullable; + +import com.google.common.base.Preconditions; +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Iterables; + +import accord.utils.Invariants; +import org.apache.cassandra.cql3.Operator; +import org.apache.cassandra.cql3.conditions.ColumnCondition; +import org.apache.cassandra.cql3.conditions.ColumnCondition.Bound; +import org.apache.cassandra.db.Clustering; +import org.apache.cassandra.db.TypeSizes; +import org.apache.cassandra.db.marshal.AbstractType; +import org.apache.cassandra.db.marshal.UserType; +import org.apache.cassandra.db.partitions.FilteredPartition; +import org.apache.cassandra.db.rows.Cell; +import org.apache.cassandra.db.rows.ColumnData; +import org.apache.cassandra.db.rows.ComplexColumnData; +import org.apache.cassandra.db.rows.Row; +import org.apache.cassandra.io.ParameterisedVersionedSerializer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.schema.ColumnMetadata; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.service.accord.serializers.TableMetadatas; +import org.apache.cassandra.service.accord.serializers.Version; +import org.apache.cassandra.transport.ProtocolVersion; +import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.ObjectSizes; + +import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.cassandra.service.accord.AccordSerializers.clusteringSerializer; +import static org.apache.cassandra.service.accord.txn.TxnData.TxnDataNameKind.CAS_READ; +import static org.apache.cassandra.service.accord.txn.TxnData.txnDataName; +import static org.apache.cassandra.utils.CollectionSerializers.deserializeList; +import static org.apache.cassandra.utils.CollectionSerializers.serializeCollection; +import static org.apache.cassandra.utils.CollectionSerializers.serializeList; +import static org.apache.cassandra.utils.CollectionSerializers.serializedCollectionSize; +import static org.apache.cassandra.utils.CollectionSerializers.serializedListSize; + +public abstract class TxnCondition +{ + public static class SerializedTxnCondition extends AbstractSerialized + { + private static final long EMPTY_SIZE = ObjectSizes.measure(new SerializedTxnCondition(null)); + + protected SerializedTxnCondition(@Nullable ByteBuffer latestVersionBytes) + { + super(latestVersionBytes); + } + + protected SerializedTxnCondition(TxnCondition condition, TableMetadatas param) + { + this(serializer.serializeUnchecked(condition, param, Version.LATEST)); + } + + @Override + public long estimatedSizeOnHeap() + { + return EMPTY_SIZE + ObjectSizes.sizeOnHeapOf(unsafeBytes()); + } + + @Override + protected ByteBuffer serialize(TxnCondition value, TableMetadatas param, Version version) + { + return serializer.serializeUnchecked(value, param, version); + } + + @Override + protected ByteBuffer reserialize(ByteBuffer bytes, TableMetadatas param, Version srcVersion, Version trgVersion) + { + return bytes; + } + + @Override + protected TxnCondition deserialize(TableMetadatas param, ByteBuffer bytes, Version version) + { + return serializer.deserializeUnchecked(param, bytes, version); + } + } + + private interface ConditionSerializer + { + void serialize(T condition, TableMetadatas tables, DataOutputPlus out, Version version) throws IOException; + T deserialize(TableMetadatas tables, DataInputPlus in, Version version, Kind kind) throws IOException; + long serializedSize(T condition, TableMetadatas tables, Version version); + } + + public enum Kind + { + NONE("n/a", null), + AND("AND", null), + OR("OR", null), + IS_NOT_NULL("IS NOT NULL", null), + IS_NULL("IS NULL", null), + EQUAL("=", Operator.EQ), + NOT_EQUAL("!=", Operator.NEQ), + GREATER_THAN(">", Operator.GT), + GREATER_THAN_OR_EQUAL(">=", Operator.GTE), + LESS_THAN("<", Operator.LT), + LESS_THAN_OR_EQUAL("<=", Operator.LTE), + COLUMN_CONDITIONS("COLUMN_CONDITIONS", null); + + @Nonnull + private final String symbol; + @Nullable + private final Operator operator; + + Kind(String symbol, Operator operator) + { + this.symbol = symbol; + this.operator = operator; + } + + @SuppressWarnings("rawtypes") + private ConditionSerializer serializer() + { + switch (this) + { + case IS_NOT_NULL: + case IS_NULL: + return Exists.serializer; + case EQUAL: + case NOT_EQUAL: + case LESS_THAN: + case LESS_THAN_OR_EQUAL: + case GREATER_THAN: + case GREATER_THAN_OR_EQUAL: + return Value.serializer; + case AND: + case OR: + return BooleanGroup.serializer; + case NONE: + return None.serializer; + case COLUMN_CONDITIONS: + return ColumnConditionsAdapter.serializer; + default: + throw new IllegalArgumentException("No serializer exists for kind " + this); + } + } + } + + protected final Kind kind; + + public TxnCondition(Kind kind) + { + this.kind = kind; + } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + TxnCondition condition = (TxnCondition) o; + return kind == condition.kind; + } + + public abstract void collect(TableMetadatas.Collector collector); + + @Override + public int hashCode() + { + return Objects.hash(kind); + } + + public Kind kind() + { + return kind; + } + + public abstract boolean applies(TxnData data); + + private static class None extends TxnCondition + { + private static final None instance = new None(); + + private None() + { + super(Kind.NONE); + } + + @Override + public String toString() + { + return kind.toString(); + } + + @Override + public void collect(TableMetadatas.Collector collector) + { + } + + @Override + public boolean applies(TxnData data) + { + return true; + } + + private static final ConditionSerializer serializer = new ConditionSerializer<>() + { + @Override + public void serialize(None condition, TableMetadatas tables, DataOutputPlus out, Version version) {} + @Override + public None deserialize(TableMetadatas tables, DataInputPlus in, Version version, Kind kind) { return instance; } + @Override + public long serializedSize(None condition, TableMetadatas tables, Version version) { return 0; } + }; + } + + public static TxnCondition none() + { + return None.instance; + } + + public static class Exists extends TxnCondition + { + private static final Set KINDS = ImmutableSet.of(Kind.IS_NOT_NULL, Kind.IS_NULL); + + public final TxnReference reference; + + public Exists(TxnReference reference, Kind kind) + { + super(kind); + Preconditions.checkArgument(KINDS.contains(kind), "Kind " + kind + " cannot be used with an existence condition"); + this.reference = reference; + } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + if (!super.equals(o)) return false; + Exists exists = (Exists) o; + return reference.equals(exists.reference); + } + + @Override + public void collect(TableMetadatas.Collector collector) + { + TableMetadata table = reference.table(); + if (table != null) + collector.add(table); + } + + @Override + public int hashCode() + { + return Objects.hash(super.hashCode(), reference); + } + + @Override + public String toString() + { + return reference.toString() + ' ' + kind.toString(); + } + + @Override + public boolean applies(TxnData data) + { + FilteredPartition partition = reference.getPartition(data); + boolean exists = partition != null && !partition.isEmpty(); + + Row row = null; + if (exists) + { + row = reference.getRow(partition); + exists = row != null && !row.isEmpty(); + } + + if (exists && reference.selectsColumn()) + { + ColumnData columnData = reference.getColumnData(row); + + if (columnData == null) + { + exists = false; + } + else if (columnData.column().isComplex()) + { + if (reference.isElementSelection() || reference.isFieldSelection()) + { + Cell cell = (Cell) columnData; + exists = !cell.isTombstone(); + } + else + { + // TODO: Is this even necessary, given the partition is already filtered? + if (!((ComplexColumnData) columnData).complexDeletion().isLive()) + exists = false; + } + } + else if (reference.isElementSelection()) + { + // This is frozen, so check if the Cell is a tombstone and that the element is present. + Cell cell = (Cell) columnData; + ByteBuffer element = reference.getFrozenCollectionElement(cell); + exists = element != null && !cell.isTombstone(); + } + else if (reference.isFieldSelection()) + { + // This is frozen, so check if the Cell is a tombstone and that the field is present. + Cell cell = (Cell) columnData; + ByteBuffer fieldValue = reference.getFrozenFieldValue(cell); + exists = fieldValue != null && !cell.isTombstone(); + } + else + { + Cell cell = (Cell) columnData; + exists = !cell.isTombstone(); + } + } + + switch (kind()) + { + case IS_NOT_NULL: + return exists; + case IS_NULL: + return !exists; + default: + throw new IllegalStateException(); + } + } + + private static final ConditionSerializer serializer = new ConditionSerializer() + { + @Override + public void serialize(Exists condition, TableMetadatas tables, DataOutputPlus out, Version version) throws IOException + { + TxnReference.serializer.serialize(condition.reference, tables, out, version); + } + + @Override + public Exists deserialize(TableMetadatas tables, DataInputPlus in, Version version, Kind kind) throws IOException + { + return new Exists(TxnReference.serializer.deserialize(tables, in, version), kind); + } + + @Override + public long serializedSize(Exists condition, TableMetadatas tables, Version version) + { + return TxnReference.serializer.serializedSize(condition.reference, tables, version); + } + }; + } + + public static class ColumnConditionsAdapter extends TxnCondition + { + @Nonnull + public final Collection bounds; + + @Nonnull + public final Clustering clustering; + + public ColumnConditionsAdapter(Clustering clustering, Collection bounds) + { + super(Kind.COLUMN_CONDITIONS); + checkNotNull(bounds); + checkNotNull(clustering); + this.bounds = bounds; + this.clustering = clustering; + } + + @Override + public void collect(TableMetadatas.Collector collector) + { + for (Bound bound : bounds) + { + TableMetadata table = bound.table; + if (table != null) + collector.add(table); + } + } + + @Override + public boolean applies(@Nonnull TxnData data) + { + checkNotNull(data); + TxnDataKeyValue value = (TxnDataKeyValue)data.get(txnDataName(CAS_READ)); + Row row = value != null ? value.getRow(clustering) : null; + for (Bound bound : bounds) + { + if (!bound.appliesTo(row)) + return false; + } + return true; + } + + private static final ConditionSerializer serializer = new ConditionSerializer() + { + @Override + public void serialize(ColumnConditionsAdapter condition, TableMetadatas tables, DataOutputPlus out, Version version) throws IOException + { + clusteringSerializer.serialize(condition.clustering, out); + serializeCollection(condition.bounds, tables, out, Bound.serializer); + } + + @Override + public ColumnConditionsAdapter deserialize(TableMetadatas tables, DataInputPlus in, Version version, Kind ignored) throws IOException + { + Clustering clustering = clusteringSerializer.deserialize(in); + List bounds = deserializeList(tables, in, Bound.serializer); + return new ColumnConditionsAdapter(clustering, bounds); + } + + @Override + public long serializedSize(ColumnConditionsAdapter condition, TableMetadatas tables, Version version) + { + return clusteringSerializer.serializedSize(condition.clustering) + + serializedCollectionSize(condition.bounds, tables, Bound.serializer); + } + }; + } + + public static class Value extends TxnCondition + { + private static final Set KINDS = ImmutableSet.of(Kind.EQUAL, Kind.NOT_EQUAL, + Kind.GREATER_THAN, Kind.GREATER_THAN_OR_EQUAL, + Kind.LESS_THAN, Kind.LESS_THAN_OR_EQUAL); + + private final TxnReference reference; + private final ByteBuffer value; + private final ProtocolVersion version; + + public Value(TxnReference reference, Kind kind, ByteBuffer value, ProtocolVersion version) + { + super(kind); + Invariants.requireArgument(KINDS.contains(kind), "Kind " + kind + " cannot be used with a value condition"); + Invariants.requireArgument(reference.selectsColumn(), "Reference " + reference + " does not select a column"); + this.reference = reference; + this.value = value; + this.version = version; + } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + if (!super.equals(o)) return false; + Value value1 = (Value) o; + return reference.equals(value1.reference) && value.equals(value1.value); + } + + @Override + public void collect(TableMetadatas.Collector collector) + { + TableMetadata table = reference.table(); + if (table != null) + collector.add(table); + } + + @Override + public int hashCode() + { + return Objects.hash(super.hashCode(), reference, value); + } + + @Override + public String toString() + { + return reference.toString() + ' ' + kind.symbol + " 0x" + ByteBufferUtil.bytesToHex(value); + } + + private Bound getBounds(TxnData data) + { + ColumnMetadata column = reference.column(); + TableMetadata table = reference.table(); + if (column.isPartitionKey()) + { + ByteBuffer bb = reference.getPartitionKey(data); + return new ColumnCondition.SimpleBound(column, table, kind.operator, value) + { + @Override + protected ByteBuffer rowValue(Row row) + { + return bb; + } + }; + } + else if (column.isClusteringColumn()) + return new ColumnCondition.SimpleClusteringBound(column, table, kind.operator, value); + AbstractType type = column.type; + if (type.isCollection()) + { + if (reference.selectsPath()) + return new ColumnCondition.ElementOrFieldAccessBound(column, table, reference.path().get(0), kind.operator, value); + if (type.isMultiCell()) + return new ColumnCondition.MultiCellBound(column, table, kind.operator, value); + } + else if (type.isUDT()) + { + if (reference.isFieldSelection()) + { + UserType ut = (UserType) type; + return new ColumnCondition.ElementOrFieldAccessBound(column, table, ut.fieldName(reference.path()).bytes, kind.operator, value); + } + if (type.isMultiCell()) + return new ColumnCondition.MultiCellBound(column, table, kind.operator, value); + } + return new ColumnCondition.SimpleBound(column, table, kind.operator, value); + } + + @Override + public boolean applies(TxnData data) + { + return getBounds(data).appliesTo(reference.getRow(data)); + } + + private static final ConditionSerializer serializer = new ConditionSerializer<>() + { + @Override + public void serialize(Value condition, TableMetadatas tables, DataOutputPlus out, Version version) throws IOException + { + TxnReference.serializer.serialize(condition.reference, tables, out, version); + ByteBufferUtil.writeWithVIntLength(condition.value, out); + out.writeUTF(condition.version.name()); + } + + @Override + public Value deserialize(TableMetadatas tables, DataInputPlus in, Version version, Kind kind) throws IOException + { + TxnReference reference = TxnReference.serializer.deserialize(tables, in, version); + ByteBuffer value = ByteBufferUtil.readWithVIntLength(in); + ProtocolVersion protocolVersion = ProtocolVersion.valueOf(in.readUTF()); + return new Value(reference, kind, value, protocolVersion); + } + + @Override + public long serializedSize(Value condition, TableMetadatas tables, Version version) + { + long size = 0; + size += TxnReference.serializer.serializedSize(condition.reference, tables, version); + size += ByteBufferUtil.serializedSizeWithVIntLength(condition.value); + size += TypeSizes.sizeof(condition.version.name()); + return size; + } + }; + } + + public static class BooleanGroup extends TxnCondition + { + private static final Set KINDS = ImmutableSet.of(Kind.AND, Kind.OR); + + public final List conditions; + + public BooleanGroup(Kind kind, List conditions) + { + super(kind); + Preconditions.checkArgument(KINDS.contains(kind), "Kind " + kind + " cannot be used at the root of a boolean condition"); + this.conditions = conditions; + } + + @Override + public String toString() + { + return '(' + conditions.stream().map(Objects::toString).reduce((a, b) -> a + ' ' + kind.symbol + ' ' + b).orElse("") + ')'; + } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + if (!super.equals(o)) return false; + BooleanGroup that = (BooleanGroup) o; + return Objects.equals(conditions, that.conditions); + } + + @Override + public void collect(TableMetadatas.Collector collector) + { + for (TxnCondition condition : conditions) + condition.collect(collector); + } + + @Override + public int hashCode() + { + return Objects.hash(super.hashCode(), conditions); + } + + @Override + public boolean applies(TxnData data) + { + switch (kind()) + { + case AND: + return Iterables.all(conditions, c -> c.applies(data)); + case OR: + return Iterables.any(conditions, c -> c.applies(data)); + default: + throw new IllegalStateException(); + } + } + + private static final ConditionSerializer serializer = new ConditionSerializer<>() + { + @Override + public void serialize(BooleanGroup condition, TableMetadatas tables, DataOutputPlus out, Version version) throws IOException + { + serializeList(condition.conditions, tables, out, version, TxnCondition.serializer); + } + + @Override + public BooleanGroup deserialize(TableMetadatas tables, DataInputPlus in, Version version, Kind kind) throws IOException + { + return new BooleanGroup(kind, deserializeList(tables, in, version, TxnCondition.serializer)); + } + + @Override + public long serializedSize(BooleanGroup condition, TableMetadatas tables, Version version) + { + return serializedListSize(condition.conditions, tables, version, TxnCondition.serializer); + } + }; + } + + public static final ParameterisedVersionedSerializer serializer = new ParameterisedVersionedSerializer<>() + { + @SuppressWarnings("unchecked") + @Override + public void serialize(TxnCondition condition, TableMetadatas tables, DataOutputPlus out, Version version) throws IOException + { + out.writeUnsignedVInt32(condition.kind.ordinal()); + condition.kind.serializer().serialize(condition, tables, out, version); + } + + @Override + public TxnCondition deserialize(TableMetadatas tables, DataInputPlus in, Version version) throws IOException + { + Kind kind = Kind.values()[in.readUnsignedVInt32()]; + return kind.serializer().deserialize(tables, in, version, kind); + } + + @SuppressWarnings("unchecked") + @Override + public long serializedSize(TxnCondition condition, TableMetadatas tables, Version version) + { + long size = TypeSizes.sizeofUnsignedVInt(condition.kind.ordinal()); + size += condition.kind.serializer().serializedSize(condition, tables, version); + return size; + } + }; +} diff --git a/src/java/org/apache/cassandra/service/accord/txn/TxnData.java b/src/java/org/apache/cassandra/service/accord/txn/TxnData.java new file mode 100644 index 000000000000..27369dcaef0b --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/txn/TxnData.java @@ -0,0 +1,198 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.txn; + +import java.io.IOException; +import java.util.Map; + +import accord.api.Data; +import org.agrona.collections.Int2ObjectHashMap; +import org.apache.cassandra.db.EmptyIterators; +import org.apache.cassandra.db.SinglePartitionReadCommand; +import org.apache.cassandra.db.TypeSizes; +import org.apache.cassandra.db.partitions.PartitionIterators; +import org.apache.cassandra.io.VersionedSerializer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.service.accord.serializers.IVersionedSerializer; +import org.apache.cassandra.service.accord.serializers.Version; +import org.apache.cassandra.utils.CollectionSerializers; +import org.apache.cassandra.utils.Int32Serializer; +import org.apache.cassandra.utils.NullableSerializer; +import org.apache.cassandra.utils.ObjectSizes; + +import static accord.utils.Invariants.requireArgument; +import static org.apache.cassandra.service.accord.txn.TxnResult.Kind.txn_data; + +/** + * Fairly generic holder for result values for Accord txns as well as data exchange during Accord txn execution + * when read results are returned to the coordinator to compute query results and writes. + */ +public class TxnData extends Int2ObjectHashMap implements TxnResult, Data +{ + private static final long EMPTY_SIZE = ObjectSizes.measure(new TxnData()); + + private static final int TXN_DATA_NAME_INDEX_BITS = 32 - 6; + private static final int TXN_DATA_NAME_INDEX_MASK = ~(~0 << TXN_DATA_NAME_INDEX_BITS); + public static final int TXN_DATA_NAME_INDEX_MAX = ((1 << TXN_DATA_NAME_INDEX_BITS) - 1); + + public enum TxnDataNameKind + { + USER((byte) 0), + RETURNING((byte) 1), + AUTO_READ((byte) 2), + CAS_READ((byte) 3); + + private final byte value; + + TxnDataNameKind(byte value) + { + this.value = value; + } + + public static TxnDataNameKind from(byte b) + { + switch (b) + { + case 0: + return USER; + case 1: + return RETURNING; + case 2: + return AUTO_READ; + case 3: + return CAS_READ; + default: + throw new IllegalArgumentException("Unknown kind: " + b); + } + } + } + + public static int txnDataName(TxnDataNameKind kind, int index) + { + requireArgument(index >= 0 && index <= TXN_DATA_NAME_INDEX_MAX); + int kindInt = (int)(((long)kind.value) << TXN_DATA_NAME_INDEX_BITS); + return kindInt | index; + } + + public static int txnDataName(TxnDataNameKind kind) + { + return txnDataName(kind, 0); + } + + public static TxnDataNameKind txnDataNameKind(int txnDataName) + { + int kind = txnDataName >>> TXN_DATA_NAME_INDEX_BITS; + return TxnDataNameKind.from((byte)kind); + } + + public static int txnDataNameIndex(int txnDataName) + { + return txnDataName & TXN_DATA_NAME_INDEX_MASK; + } + + public TxnData() {} + + private TxnData(int size) + { + super(size, 0.65f); + } + + public static TxnData of(int key, TxnDataValue value) + { + TxnData result = newWithExpectedSize(1); + result.put(key, value); + return result; + } + + public static TxnData newWithExpectedSize(int size) + { + requireArgument(size >= 0, "size can't be negative"); + size = Math.max(4, size); + return new TxnData(size < 1073741824 ? (int)((float)size / 0.75F + 1.0F) : Integer.MAX_VALUE); + } + + @Override + public TxnData merge(Data data) + { + TxnData that = (TxnData) data; + TxnData merged = new TxnData(); + this.forEach(merged::put); + for (Map.Entry e : that.entrySet()) + merged.merge(e.getKey(), e.getValue(), TxnDataValue::merge); + return merged; + } + + public static Data merge(Data left, Data right) + { + if (left == null) + return right; + if (right == null) + return null; + + return left.merge(right); + } + + @Override + public long estimatedSizeOnHeap() + { + long size = EMPTY_SIZE + (size() * TypeSizes.INT_SIZE); + for (TxnDataValue value : values()) + size += value.estimatedSizeOnHeap(); + return size; + } + + public static TxnData emptyPartition(int name, SinglePartitionReadCommand command) + { + TxnData result = new TxnData(); + TxnDataKeyValue empty = new TxnDataKeyValue(PartitionIterators.getOnlyElement(EmptyIterators.partition(), command)); + result.put(name, empty); + return result; + } + + @Override + public Kind kind() + { + return txn_data; + } + + private static final IVersionedSerializer INT32_SERIALIZER = IVersionedSerializer.fromSerializer(Int32Serializer.serializer); + public static final IVersionedSerializer serializer = new IVersionedSerializer() + { + @Override + public void serialize(TxnData data, DataOutputPlus out, Version version) throws IOException + { + CollectionSerializers.serializeMap(data, out, version, INT32_SERIALIZER, TxnDataValue.serializer); + } + + @Override + public TxnData deserialize(DataInputPlus in, Version version) throws IOException + { + return CollectionSerializers.deserializeMap(in, version, INT32_SERIALIZER, TxnDataValue.serializer, TxnData::newWithExpectedSize); + } + + @Override + public long serializedSize(TxnData data, Version version) + { + return CollectionSerializers.serializedMapSize(data, version, INT32_SERIALIZER, TxnDataValue.serializer); + } + }; + + public static final VersionedSerializer nullableSerializer = NullableSerializer.wrap(serializer); +} diff --git a/src/java/org/apache/cassandra/service/accord/txn/TxnDataKeyValue.java b/src/java/org/apache/cassandra/service/accord/txn/TxnDataKeyValue.java new file mode 100644 index 000000000000..ea8d01475771 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/txn/TxnDataKeyValue.java @@ -0,0 +1,107 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.txn; + +import java.io.IOException; + +import accord.utils.Invariants; +import org.apache.cassandra.db.partitions.FilteredPartition; +import org.apache.cassandra.db.rows.Row; +import org.apache.cassandra.db.rows.RowIterator; +import org.apache.cassandra.db.rows.UnfilteredRowIterator; +import org.apache.cassandra.db.rows.UnfilteredRowIteratorSerializer; +import org.apache.cassandra.db.rows.UnfilteredRowIterators; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.schema.Schema; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.service.accord.serializers.Version; + +import static org.apache.cassandra.db.SerializationHeader.StableHeaderSerializer.STABLE; +import static org.apache.cassandra.db.rows.DeserializationHelper.Flag.FROM_REMOTE; + +public class TxnDataKeyValue extends FilteredPartition implements TxnDataValue +{ + public TxnDataKeyValue(RowIterator rows) + { + super(rows); + } + + @Override + public TxnDataValue.Kind kind() + { + return Kind.key; + } + + @Override + public TxnDataValue merge(TxnDataValue that) + { + Invariants.require(this.equals(that)); + return this; + } + + @Override + public long estimatedSizeOnHeap() + { + long size = 0; + Row staticRow = staticRow(); + if (staticRow != null) + size += staticRow.unsharedHeapSize(); + for (Row row : this) + size += row.unsharedHeapSize(); + // TODO: Include the other parts of FilteredPartition after we rebase to pull in BTreePartitionData? + return size; + } + + public static final TxnDataValueSerializer serializer = new TxnDataValueSerializer<>() + { + @Override + public void serialize(TxnDataKeyValue value, DataOutputPlus out, Version version) throws IOException + { + value.metadata().id.serializeCompact(out); + try (UnfilteredRowIterator iterator = value.unfilteredIterator()) + { + UnfilteredRowIteratorSerializer.serializer.serialize(iterator, out, version.messageVersion(), value.rowCount(), STABLE, null); + } + } + + @Override + public TxnDataKeyValue deserialize(DataInputPlus in, Version version) throws IOException + { + TableMetadata metadata = Schema.instance.getExistingTableMetadata(TableId.deserializeCompact(in)); + UnfilteredRowIteratorSerializer.Header header = UnfilteredRowIteratorSerializer.serializer.deserializeHeader(metadata, in, version.messageVersion(), FROM_REMOTE, STABLE, null); + try (UnfilteredRowIterator partition = UnfilteredRowIteratorSerializer.serializer.deserialize(in, version.messageVersion(), metadata, FROM_REMOTE, header)) + { + return new TxnDataKeyValue(UnfilteredRowIterators.filter(partition, 0)); + } + } + + @Override + public long serializedSize(TxnDataKeyValue value, Version version) + { + TableId tableId = value.metadata().id; + long size = tableId.serializedCompactSize(); + try (UnfilteredRowIterator iterator = value.unfilteredIterator()) + { + return size + UnfilteredRowIteratorSerializer.serializer.serializedSize(iterator, version.messageVersion(), value.rowCount(), STABLE, null); + } + } + }; +} diff --git a/src/java/org/apache/cassandra/service/accord/txn/TxnDataRangeValue.java b/src/java/org/apache/cassandra/service/accord/txn/TxnDataRangeValue.java new file mode 100644 index 000000000000..edb9ac752850 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/txn/TxnDataRangeValue.java @@ -0,0 +1,178 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.txn; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.RandomAccess; +import java.util.function.Supplier; +import javax.annotation.Nullable; + +import com.google.common.collect.Lists; + +import org.apache.cassandra.db.TypeSizes; +import org.apache.cassandra.db.partitions.FilteredPartition; +import org.apache.cassandra.db.partitions.PartitionIterator; +import org.apache.cassandra.db.partitions.PartitionIterators; +import org.apache.cassandra.db.rows.Row; +import org.apache.cassandra.db.rows.UnfilteredRowIterator; +import org.apache.cassandra.db.rows.UnfilteredRowIteratorSerializer; +import org.apache.cassandra.db.rows.UnfilteredRowIterators; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.schema.Schema; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.service.accord.serializers.Version; +import org.apache.cassandra.utils.ObjectSizes; + +import static com.google.common.base.Preconditions.checkState; +import static org.apache.cassandra.db.SerializationHeader.StableHeaderSerializer.STABLE; +import static org.apache.cassandra.db.rows.DeserializationHelper.Flag.FROM_REMOTE; +import static org.apache.cassandra.utils.ObjectSizes.sizeOfReferenceArray; + +public class TxnDataRangeValue extends ArrayList implements TxnDataValue, RandomAccess +{ + private static final long EMPTY_SIZE = ObjectSizes.measure(new TxnDataRangeValue(0)) - sizeOfReferenceArray(0); + + public TxnDataRangeValue() {} + + public TxnDataRangeValue(int size) + { + super(size); + } + + @Override + public Kind kind() + { + return Kind.range; + } + + @Override + public TxnDataRangeValue merge(TxnDataValue other) + { + if (isEmpty()) + return (TxnDataRangeValue)other; + + TxnDataRangeValue otherRange = (TxnDataRangeValue)other; + if (otherRange.isEmpty()) + return this; + + TableId tableId = tableId(); + for (FilteredPartition partition : otherRange) + checkState(partition.metadata().id.equals(tableId), "All values should be for the same table"); + + addAll(((TxnDataRangeValue)other)); + return this; + } + + Supplier toPartitionIterator(boolean reversed) + { + // Sorting isn't preserved when merging TxnDataRangeValues together so sort here + sort(); + return () -> PartitionIterators.concat(Lists.transform(this, v -> PartitionIterators.singletonIterator(v.rowIterator(reversed)))); + } + + private void sort() + { + sort(Comparator.comparing(FilteredPartition::partitionKey)); + } + + private @Nullable TableId tableId() + { + if (isEmpty()) + return null; + return get(0).metadata().id; + } + + @Override + public long estimatedSizeOnHeap() + { + long size = EMPTY_SIZE + sizeOfReferenceArray(size()); + for (FilteredPartition partition : this) + { + Row staticRow = partition.staticRow(); + if (staticRow != null) + size += staticRow.unsharedHeapSize(); + for (Row row : partition) + size += row.unsharedHeapSize(); + } + + // TODO: Include the other parts of FilteredPartition after we rebase to pull in BTreePartitionData? + return size; + } + + public static final TxnDataValueSerializer serializer = new TxnDataValueSerializer<>() + { + @Override + public void serialize(TxnDataRangeValue value, DataOutputPlus out, Version version) throws IOException + { + out.writeUnsignedVInt32(value.size()); + + if (value.isEmpty()) + return; + + TableId.serializer.serialize(value.tableId(), out, version.messageVersion()); + for (FilteredPartition partition : value) + { + try (UnfilteredRowIterator iterator = partition.unfilteredIterator()) + { + UnfilteredRowIteratorSerializer.serializer.serialize(iterator, out, version.messageVersion(), partition.rowCount(), STABLE, null); + } + } + } + + @Override + public TxnDataRangeValue deserialize(DataInputPlus in, Version version) throws IOException + { + int numPartitions = in.readUnsignedVInt32(); + TxnDataRangeValue value = new TxnDataRangeValue(numPartitions); + if (numPartitions == 0) + return value; + TableMetadata metadata = Schema.instance.getExistingTableMetadata(TableId.deserialize(in)); + for (int i = 0; i < numPartitions; i++) + { + UnfilteredRowIteratorSerializer.Header header = UnfilteredRowIteratorSerializer.serializer.deserializeHeader(metadata, in, version.messageVersion(), FROM_REMOTE, STABLE, null); + try (UnfilteredRowIterator partition = UnfilteredRowIteratorSerializer.serializer.deserialize(in, version.messageVersion(), metadata, FROM_REMOTE, header)) + { + value.add(new FilteredPartition(UnfilteredRowIterators.filter(partition, 0))); + } + } + return value; + } + + @Override + public long serializedSize(TxnDataRangeValue value, Version version) + { + long size = TypeSizes.sizeofUnsignedVInt(value.size()); + if (value.size() == 0) + return size; + size += TableId.serializer.serializedSize(value.tableId(), version.messageVersion()); + for (FilteredPartition partition : value) + { + try (UnfilteredRowIterator iterator = partition.unfilteredIterator()) + { + size += UnfilteredRowIteratorSerializer.serializer.serializedSize(iterator, version.messageVersion(), partition.rowCount(), STABLE, null); + } + } + return size; + } + }; +} diff --git a/src/java/org/apache/cassandra/service/accord/txn/TxnDataValue.java b/src/java/org/apache/cassandra/service/accord/txn/TxnDataValue.java new file mode 100644 index 000000000000..bee1340854ba --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/txn/TxnDataValue.java @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.txn; + +import java.io.IOException; + +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.service.accord.serializers.IVersionedSerializer; +import org.apache.cassandra.service.accord.serializers.Version; + +import static org.apache.cassandra.db.TypeSizes.sizeof; + +/** + * The result of either a key or range read during Accord transaction execution or a transaction result + */ +public interface TxnDataValue +{ + interface TxnDataValueSerializer extends IVersionedSerializer + {} + + enum Kind + { + key(0), + range(1); + int id; + + Kind(int id) + { + this.id = id; + } + + public TxnDataValueSerializer serializer() + { + switch (this) + { + case key: + return TxnDataKeyValue.serializer; + case range: + return TxnDataRangeValue.serializer; + default: + throw new IllegalStateException("Unrecognized kind " + this); + } + } + } + + TxnDataValue.Kind kind(); + + TxnDataValue merge(TxnDataValue other); + + long estimatedSizeOnHeap(); + + IVersionedSerializer serializer = new IVersionedSerializer() + { + @SuppressWarnings("unchecked") + @Override + public void serialize(TxnDataValue txnDataValue, DataOutputPlus out, Version version) throws IOException + { + out.writeByte(txnDataValue.kind().ordinal()); + txnDataValue.kind().serializer().serialize(txnDataValue, out, version); + } + + @Override + public TxnDataValue deserialize(DataInputPlus in, Version version) throws IOException + { + TxnDataValue.Kind kind = TxnDataValue.Kind.values()[in.readByte()]; + return (TxnDataValue)kind.serializer().deserialize(in, version); + } + + @SuppressWarnings("unchecked") + @Override + public long serializedSize(TxnDataValue txnDataValue, Version version) + { + return sizeof((byte)txnDataValue.kind().ordinal()) + txnDataValue.kind().serializer().serializedSize(txnDataValue, version); + } + }; +} diff --git a/src/java/org/apache/cassandra/service/accord/txn/TxnNamedRead.java b/src/java/org/apache/cassandra/service/accord/txn/TxnNamedRead.java new file mode 100644 index 000000000000..ea27a398c2e8 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/txn/TxnNamedRead.java @@ -0,0 +1,507 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.txn; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.nio.ByteBuffer; +import java.util.Objects; +import java.util.concurrent.Callable; +import java.util.concurrent.TimeUnit; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import accord.api.Data; +import accord.api.RoutingKey; +import accord.primitives.Range; +import accord.primitives.Seekable; +import accord.primitives.Timestamp; +import accord.utils.Invariants; +import accord.utils.async.AsyncChain; +import accord.utils.async.AsyncChains; +import accord.utils.async.AsyncResults; +import org.apache.cassandra.concurrent.DebuggableTask; +import org.apache.cassandra.concurrent.Stage; +import org.apache.cassandra.db.ConsistencyLevel; +import org.apache.cassandra.db.PartitionPosition; +import org.apache.cassandra.db.PartitionRangeReadCommand; +import org.apache.cassandra.db.ReadCommand; +import org.apache.cassandra.db.ReadExecutionController; +import org.apache.cassandra.db.SinglePartitionReadCommand; +import org.apache.cassandra.db.TypeSizes; +import org.apache.cassandra.db.partitions.FilteredPartition; +import org.apache.cassandra.db.partitions.PartitionIterator; +import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator; +import org.apache.cassandra.db.partitions.UnfilteredPartitionIterators; +import org.apache.cassandra.db.rows.RowIterator; +import org.apache.cassandra.dht.AbstractBounds; +import org.apache.cassandra.dht.Token; +import org.apache.cassandra.dht.Token.KeyBound; +import org.apache.cassandra.io.ParameterisedVersionedSerializer; +import org.apache.cassandra.io.util.DataInputBuffer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputBuffer; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.service.accord.TokenRange; +import org.apache.cassandra.service.accord.api.TokenKey; +import org.apache.cassandra.service.accord.api.PartitionKey; +import org.apache.cassandra.service.accord.serializers.TableMetadatas; +import org.apache.cassandra.service.accord.serializers.TableMetadatasAndKeys; +import org.apache.cassandra.service.accord.serializers.Version; +import org.apache.cassandra.service.accord.txn.TxnData.TxnDataNameKind; +import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.Comparables; +import org.apache.cassandra.utils.MonotonicClock; +import org.apache.cassandra.utils.ObjectSizes; + +import static com.google.common.base.Preconditions.checkState; +import static org.apache.cassandra.io.util.DataOutputBuffer.scratchBuffer; +import static org.apache.cassandra.utils.ByteBufferUtil.readWithVIntLength; +import static org.apache.cassandra.utils.ByteBufferUtil.serializedSizeWithVIntLength; +import static org.apache.cassandra.utils.ByteBufferUtil.writeWithVIntLength; + +public class TxnNamedRead extends AbstractSerialized +{ + @SuppressWarnings("unused") + private static final Logger logger = LoggerFactory.getLogger(TxnNamedRead.class); + + private static final long EMPTY_SIZE = ObjectSizes.measure(new TxnNamedRead(0, null, null)); + + private final int name; + private final Seekable key; + + public TxnNamedRead(int name, PartitionKey key, SinglePartitionReadCommand value, TableMetadatas tables) + { + super(serializeInternal(value, tables, Version.LATEST)); + this.name = name; + this.key = key; + } + + public TxnNamedRead(int name, AbstractBounds range, PartitionRangeReadCommand value, TableMetadatas tables) + { + super(serializeInternal(value, tables, Version.LATEST)); + TableId tableId = value.metadata().id; + this.name = name; + this.key = boundsAsAccordRange(range, tableId); + } + + TxnNamedRead(int name, Seekable key, ByteBuffer bytes) + { + super(bytes); + this.name = name; + this.key = key; + } + + public static TokenRange boundsAsAccordRange(AbstractBounds range, TableId tableId) + { + // Should already have been unwrapped + checkState(!AbstractBounds.strictlyWrapsAround(range.left, range.right)); + + // Read commands can contain a mix of different kinds of bounds to facilitate paging + // and we need to communicate that to Accord as its own ranges. This uses + // TokenKey, SentinelKey, and MinTokenKey and sticks exclusively with left exclusive/right inclusive + // ranges rather add more types of ranges to the mix + // MinTokenKey allows emulating inclusive left and exclusive right with Range + boolean inclusiveLeft = range.inclusiveLeft(); + PartitionPosition startPP = range.left; + boolean startIsMinKeyBound = startPP.getClass() == KeyBound.class ? ((KeyBound)startPP).isMinimumBound : false; + Token startToken = startPP.getToken(); + Token stopToken = range.right.getToken(); + TokenKey startTokenKey; + if (startToken.isMinimum() && inclusiveLeft) + startTokenKey = TokenKey.min(tableId, startToken.getPartitioner()); + else if (inclusiveLeft || startIsMinKeyBound || startToken.equals(stopToken)) + startTokenKey = TokenKey.before(tableId, startToken); + else + startTokenKey = new TokenKey(tableId, startToken); + + boolean inclusiveRight = range.inclusiveRight(); + PartitionPosition endPP = range.right; + boolean endIsMinKeyBound = endPP.getClass() == KeyBound.class ? ((KeyBound)endPP).isMinimumBound : false; + TokenKey stopTokenKey; + if (stopToken.isMinimum()) + stopTokenKey = TokenKey.max(tableId, startToken.getPartitioner()); + else if (inclusiveRight && !endIsMinKeyBound) + stopTokenKey = new TokenKey(tableId, stopToken); + else + stopTokenKey = TokenKey.before(tableId, stopToken); + return TokenRange.create(startTokenKey, stopTokenKey); + } + + public long estimatedSizeOnHeap() + { + long size = EMPTY_SIZE; + // we don't measure the key, as this is shared + size += (unsafeBytes() != null ? ByteBufferUtil.estimatedSizeOnHeap(unsafeBytes()) : 0); + return size; + } + + @Override + protected ByteBuffer serialize(ReadCommand value, TableMetadatas param, Version version) + { + return serializeInternal(value, param, version); + } + + private static ByteBuffer serializeInternal(ReadCommand value, TableMetadatas param, Version version) + { + try (DataOutputBuffer buffer = scratchBuffer.get()) + { + ReadCommand.serializer.serializeForAccord(value, param, buffer, version.messageVersion()); + return buffer.asNewBuffer(); + } + catch (IOException e) + { + throw new UncheckedIOException(e); + } + } + + @Override + protected ByteBuffer reserialize(ByteBuffer buffer, TableMetadatas param, Version srcVersion, Version trgVersion) + { + return buffer; + } + + @Override + protected ReadCommand deserialize(TableMetadatas param, ByteBuffer bytes, Version version) + { + try (DataInputBuffer buffer = new DataInputBuffer(bytes, true)) + { + return ReadCommand.serializer.deserializeForAccord(key, param, buffer, version.messageVersion()); + } + catch (IOException e) + { + throw new UncheckedIOException(e); + } + } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + if (!super.equals(o)) return false; + TxnNamedRead namedRead = (TxnNamedRead) o; + return name == namedRead.name && key.equals(namedRead.key); + } + + @Override + public int hashCode() + { + return Objects.hash(super.hashCode(), name, key); + } + + @Override + public String toString() + { + return "TxnNamedRead{name='" + name + '\'' + ", keys=" + key + '}'; + } + + public int txnDataName() + { + return name; + } + + public Seekable key() + { + return key; + } + + public static long nowInSeconds(Timestamp executeAt) + { + return TimeUnit.MICROSECONDS.toSeconds(executeAt.hlc()); + } + + public AsyncChain read(TableMetadatas tables, ConsistencyLevel consistencyLevel, Seekable key, Timestamp executeAt) + { + ReadCommand command = deserialize(tables); + if (command == null) + return AsyncResults.success(TxnData.NOOP_DATA); + + // It's fine for our nowInSeconds to lag slightly our insertion timestamp, as to the user + // this simply looks like the transaction witnessed TTL'd data and the data then expired + // immediately after the transaction executed, and this simplifies things a great deal + long nowInSeconds = nowInSeconds(executeAt); + + boolean withoutReconciliation = readsWithoutReconciliation(consistencyLevel); + switch (key.domain()) + { + case Key: + return performLocalKeyRead(((SinglePartitionReadCommand) command).withTransactionalSettings(withoutReconciliation, nowInSeconds)); + case Range: + return performLocalRangeRead(((PartitionRangeReadCommand) command), key.asRange(), consistencyLevel, nowInSeconds); + default: + throw new IllegalStateException("Unhandled domain " + key.domain()); + } + } + + public TxnNamedRead slice(Range range) + { + Invariants.require(key.domain().isRange()); + if (key.equals(range)) + return this; + + Invariants.require(((Range)key).contains(range)); + return new TxnNamedRead(txnDataName(), range, unsafeBytes()); + } + + public TxnNamedRead merge(TxnNamedRead with) + { + Invariants.require(key.domain().isRange()); + if (key.equals(with.key)) + return this; + + Range thisRange = key.asRange(); + Range thatRange = with.key.asRange(); + Invariants.require(thisRange.compareTouching(thatRange) == 0); + RoutingKey start = Comparables.min(thisRange.start(), thatRange.start()); + RoutingKey end = Comparables.max(thisRange.end(), thatRange.end()); + Range range = thisRange.newRange(start, end); + return new TxnNamedRead(txnDataName(), range, unsafeBytes()); + } + + public static boolean readsWithoutReconciliation(ConsistencyLevel consistencyLevel) + { + boolean withoutReconciliation = consistencyLevel == null || consistencyLevel == ConsistencyLevel.ONE; + return withoutReconciliation; + } + + + public ReadCommand command(TableMetadatas tables) + { + return deserialize(tables); + } + + private AsyncChain performLocalKeyRead(SinglePartitionReadCommand read) + { + Callable readCallable = () -> + { + try (ReadExecutionController controller = read.executionController(); + PartitionIterator iterator = UnfilteredPartitionIterators.filter(read.executeLocally(controller), read.nowInSec())) + { + TxnData result = new TxnData(); + if (iterator.hasNext()) + { + TxnDataKeyValue value = new TxnDataKeyValue(iterator.next()); + if (value.hasRows() || read.selectsFullPartition()) + result.put(name, value); + } + return result; + } + }; + + return AsyncChains.ofCallable(Stage.READ.executor(), readCallable, (callable, receiver) -> + new DebuggableTask.RunnableDebuggableTask() + { + private final long approxCreationTimeNanos = MonotonicClock.Global.approxTime.now(); + private volatile long approxStartTimeNanos; + + @Override + public void run() + { + approxStartTimeNanos = MonotonicClock.Global.approxTime.now(); + + try + { + Data call = callable.call(); + receiver.accept(call, null); + } + catch (Throwable t) + { + logger.debug("AsyncChain Callable threw an Exception", t); + receiver.accept(null, t); + } + } + + @Override + public long creationTimeNanos() + { + return approxCreationTimeNanos; + } + + @Override + public long startTimeNanos() + { + return approxStartTimeNanos; + } + + @Override + public String description() + { + return read.toCQLString(); + } + } + ); + } + + public static PartitionRangeReadCommand commandForSubrange(PartitionRangeReadCommand command, Range r, ConsistencyLevel consistencyLevel, long nowInSeconds) + { + AbstractBounds bounds = command.dataRange().keyRange(); + PartitionPosition startPP = bounds.left; + PartitionPosition endPP = bounds.right; + TokenKey startRoutingKey = ((TokenKey)r.start()); + TokenKey endRoutingKey = ((TokenKey)r.end()); + Token subRangeStartToken = startRoutingKey.isMin() ? startPP.getToken() : startRoutingKey.token(); + Token subRangeEndToken = endRoutingKey.isMax() ? endPP.getToken() : endRoutingKey.token(); + + /* + * The way ranges/bounds work for range queries is that the beginning and ending bounds from the command + * could be tokens (and min/max key bounds) or actual keys depending on the bounds of the top level query we + * are running and where we are in paging. We need to preserve whatever is in the command in case it is a + * key and not a token, or it's a token but might be a min/max key bound. + * + * Then Accord will further subdivide the range in the command so need to inject additional bounds in the middle + * that match the range ownership of Accord. + * + * The command still contains the original bound and then the Accord range passed in determines what subset of + * that bound we want. We have to make sure to use the bounds from the command if it is the start or end instead + * of a key bound created from the Accord range since it could be a real key or min/max bound. + * + * When we are dealing with a bound created by Accord's further subdivision we use a maxKeyBound (exclusive) + * for both beginning and end because Bounds is left and right inclusive while Range is only left inclusive. + * We only use TokenRange with Accord which matches the left/right inclusivity of Cassandra's Range. + * + * That means the Range we get from Accord overlaps the previous Range on the left which when converted to a Bound + * would potentially read the same Token twice. So the left needs to be a maxKeyBound to exclude the data that isn't + * owned here and to avoid potentially reading the same data twice. The right bound also needs to be a maxKeyBound since Range + * is right inclusive so every partition we find needs to be < the right bound. + */ + boolean isFirstSubrange = startPP.getToken().equals(subRangeStartToken); + PartitionPosition subRangeStartPP = isFirstSubrange ? startPP : subRangeStartToken.maxKeyBound(); + PartitionPosition subRangeEndPP = endPP.getToken().equals(subRangeEndToken) ? endPP : subRangeEndToken.maxKeyBound(); + // Need to preserve the fact it is a bounds for paging to work, a range is not left inclusive and will not start from where we left off + AbstractBounds subRange = isFirstSubrange ? bounds.withNewRight(subRangeEndPP) : new org.apache.cassandra.dht.Range(subRangeStartPP, subRangeEndPP); + boolean isRangeContinuation = startPP.getToken().equals(subRangeStartToken); + return command.withTransactionalSettings(nowInSeconds, subRange, isRangeContinuation, readsWithoutReconciliation(consistencyLevel)); + } + + private AsyncChain performLocalRangeRead(PartitionRangeReadCommand command, Range r, ConsistencyLevel consistencyLevel, long nowInSeconds) + { + PartitionRangeReadCommand read = commandForSubrange(command, r, consistencyLevel, nowInSeconds); + Callable readCallable = () -> + { + try (ReadExecutionController controller = read.executionController(); + UnfilteredPartitionIterator partition = read.executeLocally(controller); + PartitionIterator iterator = UnfilteredPartitionIterators.filter(partition, read.nowInSec())) + { + TxnData result = new TxnData(); + TxnDataRangeValue value = new TxnDataRangeValue(); + while (iterator.hasNext()) + { + try (RowIterator rows = iterator.next()) + { + FilteredPartition filtered = FilteredPartition.create(rows); + if (filtered.hasRows() || read.selectsFullPartition()) + { + value.add(filtered); + } + } + } + result.put(TxnData.txnDataName(TxnDataNameKind.USER), value); + return result; + } + }; + + return AsyncChains.ofCallable(Stage.READ.executor(), readCallable, (callable, receiver) -> + new DebuggableTask.RunnableDebuggableTask() + { + private final long approxCreationTimeNanos = MonotonicClock.Global.approxTime.now(); + private volatile long approxStartTimeNanos; + + @Override + public void run() + { + approxStartTimeNanos = MonotonicClock.Global.approxTime.now(); + + try + { + Data call = callable.call(); + receiver.accept(call, null); + } + catch (Throwable t) + { + logger.debug("AsyncChain Callable threw an Exception", t); + receiver.accept(null, t); + } + } + + @Override + public long creationTimeNanos() + { + return approxCreationTimeNanos; + } + + @Override + public long startTimeNanos() + { + return approxStartTimeNanos; + } + + @Override + public String description() + { + return command.toCQLString(); + } + } + ); + } + + static final ParameterisedVersionedSerializer serializer = new ParameterisedVersionedSerializer<>() + { + @Override + public void serialize(TxnNamedRead read, TableMetadatasAndKeys tablesAndKeys, DataOutputPlus out, Version version) throws IOException + { + out.writeInt(read.name); + tablesAndKeys.serializeSeekable(read.key, out); + if (!read.isNull()) + { + out.write(0); + writeWithVIntLength(read.bytes(tablesAndKeys.tables, version), out); + } + else + { + out.write(1); + } + } + + @Override + public TxnNamedRead deserialize(TableMetadatasAndKeys tablesAndKeys, DataInputPlus in, Version version) throws IOException + { + int name = in.readInt(); + Seekable key = tablesAndKeys.deserializeSeekable(in); + ByteBuffer bytes = in.readByte() == 1 ? null : readWithVIntLength(in); + if (version != Version.LATEST) + bytes = serializeUnchecked(deserializeUnchecked(tablesAndKeys, bytes, version), tablesAndKeys, Version.LATEST); + return new TxnNamedRead(name, key, bytes); + } + + @Override + public long serializedSize(TxnNamedRead read, TableMetadatasAndKeys tablesAndKeys, Version version) + { + long size = 0; + size += TypeSizes.sizeof(read.name); + size += tablesAndKeys.serializedSeekableSize(read.key); + size += TypeSizes.BYTE_SIZE; // is null + if (!read.isNull()) + size += serializedSizeWithVIntLength(read.bytes(tablesAndKeys.tables, version)); + return size; + } + }; +} diff --git a/src/java/org/apache/cassandra/service/accord/txn/TxnQuery.java b/src/java/org/apache/cassandra/service/accord/txn/TxnQuery.java new file mode 100644 index 000000000000..c78401ffa2f3 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/txn/TxnQuery.java @@ -0,0 +1,317 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.txn; + +import java.io.IOException; +import java.util.function.Supplier; +import javax.annotation.Nullable; + +import com.google.common.base.Preconditions; + +import accord.api.Data; +import accord.api.Query; +import accord.api.Read; +import accord.api.Result; +import accord.api.Update; +import accord.primitives.Ranges; +import accord.primitives.Seekables; +import accord.primitives.Timestamp; +import accord.primitives.TxnId; +import org.apache.cassandra.db.EmptyIterators; +import org.apache.cassandra.db.PartitionRangeReadCommand; +import org.apache.cassandra.db.SinglePartitionReadCommand; +import org.apache.cassandra.db.TypeSizes; +import org.apache.cassandra.db.partitions.PartitionIterator; +import org.apache.cassandra.io.UnversionedSerializer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.metrics.ClientRequestsMetricsHolder; +import org.apache.cassandra.service.accord.TokenRange; +import org.apache.cassandra.service.accord.api.PartitionKey; +import org.apache.cassandra.service.accord.txn.TxnData.TxnDataNameKind; +import org.apache.cassandra.service.consensus.migration.ConsensusRequestRouter; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.utils.ObjectSizes; + +import static accord.api.Data.NOOP_DATA; +import static com.google.common.base.Preconditions.checkNotNull; +import static com.google.common.base.Preconditions.checkState; +import static org.apache.cassandra.service.accord.txn.TxnData.TxnDataNameKind.CAS_READ; +import static org.apache.cassandra.service.accord.txn.TxnData.txnDataName; + +public abstract class TxnQuery implements Query +{ + + /** + * Used by transaction statements which will have Accord pass back to the C* coordinator code all the data that is + * read even if it is not returned as part of the result to the client. TxnDataName.returning() will fetch the data + * that is returned from TxnData. + * + * Also used by SERIAL key reads, and non-SERIAL key reads when they are executed on Accord. + */ + public static final TxnQuery ALL = new TxnQuery() + { + @Override + protected byte type() + { + return 1; + } + + @Override + public Result doCompute(TxnId txnId, Timestamp executeAt, Seekables keys, @Nullable Data data, @Nullable Read read, @Nullable Update update) + { + return data != null ? (TxnData) data : new TxnData(); + } + }; + + /** + * For transactions that return no results but do still care that they don't apply if the tokens/ranges + * are not owned/managed by Accord + */ + public static final TxnQuery NONE = new TxnQuery() + { + @Override + protected byte type() + { + return 2; + } + + @Override + public Result doCompute(TxnId txnId, Timestamp executeAt, Seekables keys, @Nullable Data data, @Nullable Read read, @Nullable Update update) + { + return new TxnData(); + } + }; + + /** + * For supporting CQL CAS compatible transactions + */ + public static final TxnQuery CONDITION = new TxnQuery() + { + @Override + protected byte type() + { + return 3; + } + + @Override + public Result doCompute(TxnId txnId, Timestamp executeAt, Seekables keys, @Nullable Data data, @Nullable Read read, Update update) + { + checkNotNull(txnId, "txnId should not be null"); + checkNotNull(data, "data should not be null"); + checkNotNull(update, "update should not be null"); + + AccordUpdate accordUpdate = (AccordUpdate)update; + TxnData txnData = (TxnData)data; + boolean conditionCheck = accordUpdate.checkCondition(data); + // If the condition applied an empty result indicates success + if (conditionCheck) + return new TxnData(); + else if (txnData.isEmpty()) + { + TxnRead txnKeyRead = (TxnRead)read; + SinglePartitionReadCommand command = (SinglePartitionReadCommand) txnKeyRead.deserialize(0); + // For CAS must return a non-empty result to indicate error even if there was no partition found + return TxnData.of(txnDataName(CAS_READ), new TxnDataKeyValue(EmptyIterators.row(command.metadata(), command.partitionKey(), command.isReversed()))); + } + else + // If it failed to apply the partition contents are returned and it indicates failure + return ((TxnData)data); + } + }; + + /** + * UNSAFE_EMPTY doesn't validate that the range is owned by Accord so you want to be careful and use NONE + * if your transaction simply doesn't have results because that will validate that Accord owns the range + * for things like blind writes. Empty is used by Accord for things like sync points which may need to execute + * for ranges Accord used to manage, but no longer does. + */ + public static final TxnQuery UNSAFE_EMPTY = new TxnQuery() + { + + @Override + protected byte type() + { + return 4; + } + + @Override + public Result compute(TxnId txnId, Timestamp executeAt, Seekables keys, @Nullable Data data, @Nullable Read read, @Nullable Update update) + { + // Skip the migration checks in the base class for empty transactions, we don't + // want/need the RetryWithNewProtocolResult + return new TxnData(); + } + + @Override + protected Result doCompute(TxnId txnId, Timestamp executeAt, Seekables keys, @Nullable Data data, @Nullable Read read, @Nullable Update update) + { + throw new UnsupportedOperationException(); + } + }; + + public static final TxnQuery RANGE_QUERY = new TxnQuery() + { + @Override + protected byte type() + { + return 5; + } + + @Override + public Result doCompute(TxnId txnId, Timestamp executeAt, Seekables keys, @Nullable Data data, @Nullable Read read, @Nullable Update update) + { + return data != null ? concat((TxnData) data, read) : new TxnData(); + } + + private Result concat(TxnData data, Read read) + { + TxnRead txnRead = (TxnRead) read; + PartitionRangeReadCommand command = (PartitionRangeReadCommand) txnRead.deserialize(0); + TxnDataRangeValue value = (TxnDataRangeValue) data.get(txnDataName(TxnDataNameKind.USER)); + Supplier source = value.toPartitionIterator(command.isReversed()); + // Because the query was split across multiple command stores the pushed down limit won't be sufficient + // to return correct results and has to be applied again here + Supplier sourceWithLimits = () -> command.limits().filter(source.get(), + 0, + command.selectsFullPartition(), + command.metadata().enforceStrictLiveness()); + return new TxnRangeReadResult(sourceWithLimits); + } + }; + + private static final long SIZE = ObjectSizes.measure(ALL); + + private TxnQuery() {} + + abstract protected byte type(); + + abstract protected Result doCompute(TxnId txnId, Timestamp executeAt, Seekables keys, @Nullable Data data, @Nullable Read read, @Nullable Update update); + + @Override + public Result compute(TxnId txnId, Timestamp executeAt, Seekables keys, @Nullable Data data, @Nullable Read read, @Nullable Update update) + { + // TODO (required): This is not the cluster metadata of the current transaction + ClusterMetadata clusterMetadata = ClusterMetadata.current(); + checkState(clusterMetadata.epoch.getEpoch() >= executeAt.epoch(), "TCM epoch %d is < executeAt epoch %d", clusterMetadata.epoch.getEpoch(), executeAt.epoch()); + boolean reads = read != null && !read.keys().isEmpty(); + if (transactionShouldBeBlocked(clusterMetadata, reads, keys, data, update)) + { + if (txnId.isWrite()) + ClientRequestsMetricsHolder.accordWriteMetrics.accordMigrationRejects.mark(); + else + ClientRequestsMetricsHolder.accordReadMetrics.accordMigrationRejects.mark(); + return RetryWithNewProtocolResult.instance; + } + return doCompute(txnId, executeAt, keys, data, read, update); + } + + public long estimatedSizeOnHeap() + { + return SIZE; + } + + public static final UnversionedSerializer serializer = new UnversionedSerializer() + { + @Override + public void serialize(TxnQuery query, DataOutputPlus out) throws IOException + { + Preconditions.checkArgument(query == null | query == ALL | query == NONE | query == CONDITION | query == UNSAFE_EMPTY | query == RANGE_QUERY); + out.writeByte(query == null ? 0 : query.type()); + } + + @Override + public TxnQuery deserialize(DataInputPlus in) throws IOException + { + switch (in.readByte()) + { + default: throw new AssertionError(); + case 0: return null; + case 1: return ALL; + case 2: return NONE; + case 3: return CONDITION; + case 4: return UNSAFE_EMPTY; + case 5: return RANGE_QUERY; + } + } + + @Override + public long serializedSize(TxnQuery query) + { + Preconditions.checkArgument(query == null | query == ALL | query == NONE | query == CONDITION | query == UNSAFE_EMPTY | query == RANGE_QUERY); + return TypeSizes.sizeof((byte)2); + } + }; + + private static boolean transactionShouldBeBlocked(ClusterMetadata clusterMetadata, boolean reads, Seekables keys, Data data, Update update) + { + if (data == NOOP_DATA && (update == null || update.keys().isEmpty())) + return false; + + // TxnQuery needs to be smart enough to allow blind writes through for the non-transactional use cases during migration + // This also allows blind write TransactionStatement to run before TransactionStatemetns with reads can run, + // but this is harmless since we only promise that TransactionStatement works when migrated to Accord. + // TODO (lowpri): This could look at read keys vs write keys to see if it can run in more cases + if (reads) + return !transactionIsSafeToReadAndWrite(clusterMetadata, keys); + else + return !transactionIsSafeToWrite(clusterMetadata, keys); + } + + private static boolean transactionIsSafeToReadAndWrite(ClusterMetadata clusterMetadata, Seekables keys) + { + switch (keys.domain()) + { + case Key: + for (PartitionKey partitionKey : (Seekables)keys) + { + // TODO (required): This is looking at ClusterMetadata, but not the ClusterMetadata for the specified epoch, just that epoch or later. Need to store ConsensusMigrationState in the global Topologies Accord stores for itself. + if (!ConsensusRequestRouter.instance.isKeyManagedByAccordForReadAndWrite(clusterMetadata, partitionKey.table(), partitionKey.partitionKey())) + return false; + } + break; + case Range: + for (accord.primitives.Range range : (Ranges)keys) + { + TokenRange tokenRange = (TokenRange)range; + // TODO (required): This is looking at ClusterMetadata, but not the ClusterMetadata for the specified epoch, just that epoch or later. Need to store ConsensusMigrationState in the global Topologies Accord stores for itself. + if (!ConsensusRequestRouter.instance.isRangeManagedByAccordForReadAndWrite(clusterMetadata, tokenRange.table(), tokenRange)) + return false; + } + break; + default: + throw new IllegalStateException("Unsupported domain " + keys.domain()); + } + + return true; + } + + private static boolean transactionIsSafeToWrite(ClusterMetadata clusterMetadata, Seekables keys) + { + checkState(keys.domain().isKey(), "Only key transactions are supported for writes"); + + for (PartitionKey partitionKey : (Seekables)keys) + { + // TODO (required): This is looking at ClusterMetadata, but not the ClusterMetadata for the specified epoch, just that epoch or later. Need to store ConsensusMigrationState in the global Topologies Accord stores for itself. + if (!ConsensusRequestRouter.instance.isKeyManagedByAccordForWrite(clusterMetadata, partitionKey.table(), partitionKey.partitionKey())) + return false; + } + return true; + } +} diff --git a/src/java/org/apache/cassandra/service/accord/txn/TxnRangeReadResult.java b/src/java/org/apache/cassandra/service/accord/txn/TxnRangeReadResult.java new file mode 100644 index 000000000000..a70709cee9cd --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/txn/TxnRangeReadResult.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.txn; + +import java.util.function.Supplier; + +import org.apache.cassandra.db.partitions.PartitionIterator; +import org.apache.cassandra.db.rows.Row; +import org.apache.cassandra.db.rows.RowIterator; +import org.apache.cassandra.utils.ObjectSizes; + +public class TxnRangeReadResult implements TxnResult +{ + private static final long EMPTY_SIZE = ObjectSizes.measure(new TxnRangeReadResult(null)); + + public final Supplier partitions; + + public TxnRangeReadResult(Supplier partitions) + { + this.partitions = partitions; + } + + @Override + public Kind kind() + { + return Kind.range_read; + } + + @Override + public long estimatedSizeOnHeap() + { + long size = EMPTY_SIZE; + PartitionIterator iterator = partitions.get(); + while (iterator.hasNext()) + { + RowIterator rowIterator = iterator.next(); + Row staticRow = rowIterator.staticRow(); + if (staticRow != null) + size += staticRow.unsharedHeapSize(); + while (rowIterator.hasNext()) + size += rowIterator.next().unsharedHeapSize(); + } + // TODO: Include the other parts of FilteredPartition after we rebase to pull in BTreePartitionData? + return size; + } +} diff --git a/src/java/org/apache/cassandra/service/accord/txn/TxnRead.java b/src/java/org/apache/cassandra/service/accord/txn/TxnRead.java new file mode 100644 index 000000000000..d19c241614bf --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/txn/TxnRead.java @@ -0,0 +1,451 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.txn; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.List; +import javax.annotation.Nonnull; +import javax.annotation.Nullable; + +import com.google.common.collect.ImmutableList; + +import accord.api.Data; +import accord.api.DataStore; +import accord.api.Key; +import accord.api.Read; +import accord.local.SafeCommandStore; +import accord.primitives.Keys; +import accord.primitives.Participants; +import accord.primitives.Range; +import accord.primitives.Ranges; +import accord.primitives.Routable.Domain; +import accord.primitives.Seekable; +import accord.primitives.Seekables; +import accord.primitives.Timestamp; +import accord.utils.Invariants; +import accord.utils.UnhandledEnum; +import accord.utils.async.AsyncChain; +import accord.utils.async.AsyncChains; +import org.apache.cassandra.db.ConsistencyLevel; +import org.apache.cassandra.db.PartitionPosition; +import org.apache.cassandra.db.PartitionRangeReadCommand; +import org.apache.cassandra.db.ReadCommand; +import org.apache.cassandra.db.SinglePartitionReadCommand; +import org.apache.cassandra.dht.AbstractBounds; +import org.apache.cassandra.io.ParameterisedVersionedSerializer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.service.accord.api.PartitionKey; +import org.apache.cassandra.service.accord.serializers.TableMetadatas; +import org.apache.cassandra.service.accord.serializers.TableMetadatasAndKeys; +import org.apache.cassandra.service.accord.serializers.Version; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.utils.ObjectSizes; + +import static accord.primitives.Routables.Slice.Minimal; +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkState; +import static org.apache.cassandra.service.accord.AccordSerializers.consistencyLevelSerializer; +import static org.apache.cassandra.service.accord.IAccordService.SUPPORTED_READ_CONSISTENCY_LEVELS; +import static org.apache.cassandra.service.accord.txn.TxnData.TxnDataNameKind.CAS_READ; +import static org.apache.cassandra.service.accord.txn.TxnData.TxnDataNameKind.USER; +import static org.apache.cassandra.service.accord.txn.TxnData.txnDataName; +import static org.apache.cassandra.utils.ArraySerializers.deserializeArray; +import static org.apache.cassandra.utils.ArraySerializers.serializeArray; +import static org.apache.cassandra.utils.ArraySerializers.serializedArraySize; +import static org.apache.cassandra.utils.NullableSerializer.deserializeNullable; +import static org.apache.cassandra.utils.NullableSerializer.serializeNullable; +import static org.apache.cassandra.utils.NullableSerializer.serializedNullableSize; + +public class TxnRead extends AbstractKeySorted implements Read +{ + private static final TxnRead EMPTY_KEY = new TxnRead(TableMetadatas.none(), Domain.Key); + private static final TxnRead EMPTY_RANGE = new TxnRead(TableMetadatas.none(), Domain.Range); + private static final long EMPTY_SIZE = ObjectSizes.measure(EMPTY_KEY); + private static final Comparator TXN_NAMED_READ_KEY_COMPARATOR = Comparator.comparing(a -> ((PartitionKey) a.key())); + private static final byte TYPE_EMPTY_KEY = 0; + private static final byte TYPE_EMPTY_RANGE = 1; + private static final byte TYPE_NOT_EMPTY = 2; + + public static TxnRead empty(Domain domain) + { + switch (domain) + { + default: + throw new IllegalStateException("Unhandled domain " + domain); + case Key: + return EMPTY_KEY; + case Range: + return EMPTY_RANGE; + } + } + + final TableMetadatas tables; + // Cassandra's consistency level used by Accord to safely read data written outside of Accord + @Nullable + private final ConsistencyLevel cassandraConsistencyLevel; + + // Specifies the domain in case the TxnRead is empty and it can't be inferred + private final Domain domain; + + private TxnRead(TableMetadatas tables, Domain domain) + { + super(new TxnNamedRead[0], domain); + this.tables = tables; + this.domain = domain; + this.cassandraConsistencyLevel = null; + } + + private TxnRead(TableMetadatas tables, @Nonnull TxnNamedRead[] items, @Nullable ConsistencyLevel cassandraConsistencyLevel) + { + super(items, items[0].key().domain()); + this.tables = tables; + checkArgument(cassandraConsistencyLevel == null || SUPPORTED_READ_CONSISTENCY_LEVELS.contains(cassandraConsistencyLevel), "Unsupported consistency level for read: %s", cassandraConsistencyLevel); + this.cassandraConsistencyLevel = cassandraConsistencyLevel; + this.domain = items[0].key().domain(); + // TODO (expected): relax this condition, require only that it holds for each equal byte[] + // right now this means we don't permit two different range queries in the same transaction touching adjacent ranges + // this is a pretty weak restriction and doesn't interfere with current CQL capabilities, but should be addressed eventually + Invariants.require(domain == Domain.Key || ((Ranges)keys()).mergeTouching() == keys()); + } + + private TxnRead(TableMetadatas tables, @Nonnull List items, @Nullable ConsistencyLevel cassandraConsistencyLevel) + { + super(items, items.get(0).key().domain()); + this.tables = tables; + checkArgument(cassandraConsistencyLevel == null || SUPPORTED_READ_CONSISTENCY_LEVELS.contains(cassandraConsistencyLevel), "Unsupported consistency level for read: %s", cassandraConsistencyLevel); + this.cassandraConsistencyLevel = cassandraConsistencyLevel; + this.domain = items.get(0).key().domain(); + Invariants.require(domain == Domain.Key || ((Ranges)keys()).mergeTouching() == keys()); + } + + private static void sortReads(List reads) + { + if (reads.size() == 0) + return; + reads.sort(TXN_NAMED_READ_KEY_COMPARATOR); + } + + public static TxnRead createTxnRead(TableMetadatas tables, @Nonnull List items, @Nullable ConsistencyLevel consistencyLevel, Domain domain) + { + if (items.isEmpty()) + return empty(domain); + sortReads(items); + return new TxnRead(tables, items, consistencyLevel); + } + + public static TxnRead createSerialRead(List readCommands, ConsistencyLevel consistencyLevel, TableMetadatasAndKeys.KeyCollector keyCollector) + { + List reads = new ArrayList<>(readCommands.size()); + for (int i = 0; i < readCommands.size(); i++) + { + SinglePartitionReadCommand readCommand = readCommands.get(i); + reads.add(new TxnNamedRead(txnDataName(USER, i), keyCollector.collect(readCommand.metadata(), readCommand.partitionKey()), readCommand, keyCollector.tables)); + } + sortReads(reads); + return new TxnRead(keyCollector.tables, reads, consistencyLevel); + } + + public static TxnRead createCasRead(SinglePartitionReadCommand readCommand, ConsistencyLevel consistencyLevel, TableMetadatasAndKeys tablesAndKeys) + { + TxnNamedRead read = new TxnNamedRead(txnDataName(CAS_READ), (PartitionKey) tablesAndKeys.keys.get(0), readCommand, tablesAndKeys.tables); + return new TxnRead(tablesAndKeys.tables, ImmutableList.of(read), consistencyLevel); + } + + // A read that declares it will read from keys but doesn't actually read any data so dependent transactions will + // still be applied first + public static TxnRead createNoOpRead(Keys keys) + { + List reads = new ArrayList<>(keys.size()); + for (int i = 0; i < keys.size(); i++) + reads.add(new TxnNamedRead(txnDataName(USER, i), keys.get(i), null)); + return new TxnRead(TableMetadatas.none(), reads, null); + } + + public static TxnRead createRangeRead(TableMetadatas tables, PartitionRangeReadCommand command, AbstractBounds range, ConsistencyLevel consistencyLevel) + { + return new TxnRead(tables, ImmutableList.of(new TxnNamedRead(txnDataName(USER), range, command, tables)), consistencyLevel); + } + + public long estimatedSizeOnHeap() + { + long size = EMPTY_SIZE; + for (TxnNamedRead read : items) + size += read.estimatedSizeOnHeap(); + return size; + } + + @Override + int compareNonKeyFields(TxnNamedRead left, TxnNamedRead right) + { + return Integer.compare(left.txnDataName(), right.txnDataName()); + } + + ReadCommand deserialize(int i) + { + return get(i).deserialize(tables); + } + + @Override + Seekable getKey(TxnNamedRead read) + { + return read.key(); + } + + @Override + TxnNamedRead[] newArray(int size) + { + return new TxnNamedRead[size]; + } + + @Override + public Seekables keys() + { + return itemKeys; + } + + public ConsistencyLevel cassandraConsistencyLevel() + { + return cassandraConsistencyLevel; + } + + @Override + public Read slice(Ranges ranges) + { + return select(itemKeys.slice(ranges, Minimal)); + } + + @Override + public Read intersecting(Participants participants) + { + return select(itemKeys.intersecting(participants, Minimal)); + } + + private Read select(Seekables select) + { + if (select == keys()) + return this; + + List reads = new ArrayList<>(select.size()); + switch (select.domain()) + { + case Key: + { + Keys keys = (Keys) select; + int i = 0, j = 0; + while (i < select.size() && j < items.length) + { + Key key = keys.get(i); + TxnNamedRead read = items[j]; + int c = key.compareTo((Key)read.key()); + if (c < 0) ++i; + else if (c > 0) ++j; + else + { + reads.add(read); + ++j; + } + } + break; + } + case Range: + { + Ranges ranges = (Ranges) select; + int i = 0, j = 0; + while (i < select.size() && j < items.length) + { + Range range = ranges.get(i); + TxnNamedRead read = items[j]; + int c = range.compareIntersecting((Range) read.key()); + if (c < 0) ++i; + else if (c > 0) ++j; + else + { + reads.add(read.slice(range)); + ++j; + } + } + break; + } + default: + throw new UnhandledEnum(select.domain()); + } + + return createTxnRead(tables, reads, cassandraConsistencyLevel, select.domain()); + } + + @Override + public Read merge(Read read) + { + TxnRead that = (TxnRead)read; + List reads = new ArrayList<>(items.length); + + switch (domain) + { + default: throw new UnhandledEnum(domain); + case Key: + { + int i = 0, j = 0; + while (i < items.length && j < that.items.length) + { + TxnNamedRead r1 = this.items[i], r2 = that.items[j]; + int c = compareKey(r1, r2); + if (c <= 0) + { + reads.add(r1); + ++i; + if (c == 0) + ++j; + } + else + { + reads.add(r2); + ++j; + } + } + break; + } + case Range: + { + int i = 0, j = 0; + TxnNamedRead pending = null; + while (i < items.length && j < that.items.length) + { + TxnNamedRead r1 = this.items[i], r2 = that.items[j]; + int c = compareRange(r1, r2); + TxnNamedRead add; + if (c == 0) + { + add = r1.merge(r2); + ++i; + ++j; + } + else if (c < 0) + { + add = r1; + ++i; + } + else + { + add = r2; + ++j; + } + + if (pending == null) pending = add; + else + { + c = compareRange(pending, add); + if (c < 0) + { + reads.add(pending); + pending = add; + } + else + { + Invariants.require(c == 0); + pending = pending.merge(add); + } + } + } + if (pending != null) + reads.add(pending); + break; + } + } + return createTxnRead(tables, reads, cassandraConsistencyLevel, that.domain); + } + + public void unmemoize() + { + for (TxnNamedRead read : items) + read.unmemoize(); + } + + @Override + public AsyncChain read(Seekable key, SafeCommandStore safeStore, Timestamp executeAt, DataStore store) + { + // Set to null since we don't need it and interop can pass in null + safeStore = null; + ClusterMetadata cm = ClusterMetadata.current(); + checkState(cm.epoch.getEpoch() >= executeAt.epoch(), "TCM epoch %d is < executeAt epoch %d", cm.epoch.getEpoch(), executeAt.epoch()); + + List> results = new ArrayList<>(); + forEachWithKey(key, read -> results.add(read.read(tables, cassandraConsistencyLevel, key, executeAt))); + + if (results.isEmpty()) + // Result type must match everywhere + return AsyncChains.success(new TxnData()); + + if (results.size() == 1) + return results.get(0); + + return AsyncChains.reduce(results, Data::merge); + } + + public static final ParameterisedVersionedSerializer serializer = new ParameterisedVersionedSerializer<>() + { + @Override + public void serialize(TxnRead read, TableMetadatasAndKeys tablesAndKeys, DataOutputPlus out, Version version) throws IOException + { + if (read.items.length > 0) + { + out.write(TYPE_NOT_EMPTY); + serializeArray(read.items, tablesAndKeys, out, version, TxnNamedRead.serializer); + serializeNullable(read.cassandraConsistencyLevel, out, consistencyLevelSerializer); + } + else + { + out.write(read.domain == Domain.Key ? TYPE_EMPTY_KEY : TYPE_EMPTY_RANGE); + } + } + + @Override + public TxnRead deserialize(TableMetadatasAndKeys tablesAndKeys, DataInputPlus in, Version version) throws IOException + { + byte type = in.readByte(); + switch (type) + { + default: + throw new IllegalStateException("Unhandled type " + type); + case TYPE_EMPTY_KEY: + return EMPTY_KEY; + case TYPE_EMPTY_RANGE: + return EMPTY_RANGE; + case TYPE_NOT_EMPTY: + TxnNamedRead[] items = deserializeArray(tablesAndKeys, in, version, TxnNamedRead.serializer, TxnNamedRead[]::new); + ConsistencyLevel consistencyLevel = deserializeNullable(in, consistencyLevelSerializer); + return new TxnRead(tablesAndKeys.tables, items, consistencyLevel); + } + } + + @Override + public long serializedSize(TxnRead read, TableMetadatasAndKeys tablesAndKeys, Version version) + { + long size = 1; // type + if (read.items.length > 0) + { + size += serializedArraySize(read.items, tablesAndKeys, version, TxnNamedRead.serializer); + size += serializedNullableSize(read.cassandraConsistencyLevel, consistencyLevelSerializer); + } + return size; + } + }; +} diff --git a/src/java/org/apache/cassandra/service/accord/txn/TxnReference.java b/src/java/org/apache/cassandra/service/accord/txn/TxnReference.java new file mode 100644 index 000000000000..59a1c5afb6d3 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/txn/TxnReference.java @@ -0,0 +1,354 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.txn; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Objects; + +import accord.utils.VIntCoding; +import org.apache.cassandra.db.TypeSizes; +import org.apache.cassandra.db.marshal.AbstractType; +import org.apache.cassandra.db.marshal.CollectionType; +import org.apache.cassandra.db.marshal.CompositeType; +import org.apache.cassandra.db.marshal.SetType; +import org.apache.cassandra.db.marshal.UserType; +import org.apache.cassandra.db.partitions.FilteredPartition; +import org.apache.cassandra.db.rows.Cell; +import org.apache.cassandra.db.rows.CellPath; +import org.apache.cassandra.db.rows.ColumnData; +import org.apache.cassandra.db.rows.ComplexColumnData; +import org.apache.cassandra.db.rows.Row; +import org.apache.cassandra.io.ParameterisedVersionedSerializer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.schema.ColumnMetadata; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.service.accord.serializers.TableMetadatas; +import org.apache.cassandra.service.accord.serializers.Version; +import org.apache.cassandra.utils.ByteBufferUtil; + +import static org.apache.cassandra.db.marshal.CollectionType.Kind.SET; +import static org.apache.cassandra.service.accord.AccordSerializers.columnMetadataSerializer; + +public class TxnReference +{ + private final int tuple; + private final TableMetadata table; + private final ColumnMetadata column; + private final CellPath path; + + public TxnReference(int tuple, TableMetadata table, ColumnMetadata column, CellPath path) + { + this.tuple = tuple; + this.table = table; + this.column = column; + this.path = path; + } + + public TxnReference(int tuple, ColumnMetadata column, TableMetadata table) + { + this(tuple, table, column, null); + } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + TxnReference reference = (TxnReference) o; + return tuple == reference.tuple && Objects.equals(column, reference.column) && Objects.equals(path, reference.path); + } + + @Override + public int hashCode() + { + return Objects.hash(tuple, column, path); + } + + @Override + public String toString() + { + StringBuilder sb = new StringBuilder().append(tuple); + if (column != null) + sb.append(':').append(column.ksName).append('.').append(column.cfName).append('.').append(column.name.toString()); + if (path != null) + sb.append(path); + return sb.toString(); + } + + public ColumnMetadata column() + { + return column; + } + + public TableMetadata table() + { + return table; + } + + public void collect(TableMetadatas.Collector collector) + { + collector.add(table); + } + + public CellPath path() + { + return path; + } + + public boolean selectsColumn() + { + return column != null; + } + + public boolean selectsPath() + { + return selectsColumn() && path != null; + } + + public boolean isElementSelection() + { + return selectsPath() && column.type.isCollection(); + } + + public boolean isFieldSelection() + { + return selectsPath() && column.type.isUDT(); + } + + public ByteBuffer getPartitionKey(TxnData data) + { + FilteredPartition partition = getPartition(data); + if (partition == null) return null; + return partition.metadata().partitionKeyColumns().size() == 1 + ? partition.partitionKey().getKey() + : ((CompositeType) partition.metadata().partitionKeyType).split(partition.partitionKey().getKey())[column.position()]; + } + + public ByteBuffer getClusteringKey(TxnData data) + { + Row row = getRow(data); + if (row == null) + return null; + return row.clustering().bufferAt(column.position()); + } + + public TxnDataKeyValue getPartition(TxnData data) + { + return (TxnDataKeyValue)data.get(tuple); + } + + public Row getRow(TxnData data) + { + FilteredPartition partition = getPartition(data); + return partition != null ? getRow(partition) : null; + } + + public Row getRow(FilteredPartition partition) + { + if (column != null && column.isStatic()) + return partition.staticRow(); + assert partition.rowCount() <= 1 : "Multi-row references are not allowed"; + if (partition.rowCount() == 0) + return null; + return partition.getAtIdx(0); + } + + public ColumnData getColumnData(Row row) + { + if (column.isComplex() && path == null) + return row.getComplexColumnData(column); + + if (path != null && column.type.isMultiCell()) + { + if (column.type.isCollection()) + { + CollectionType collectionType = (CollectionType) column.type; + + if (collectionType.kind == CollectionType.Kind.LIST) + return row.getComplexColumnData(column).getCellByIndex(ByteBufferUtil.toInt(path.get(0))); + } + + return row.getCell(column, path); + } + + return row.getCell(column); + } + + public ColumnData getColumnData(TxnData data) + { + Row row = getRow(data); + return row != null ? getColumnData(row) : null; + } + + public ByteBuffer getFrozenCollectionElement(Cell collection) + { + CollectionType collectionType = (CollectionType) column.type; + return collectionType.getSerializer().getSerializedValue(collection.buffer(), path.get(0), collectionType.nameComparator()); + } + + public ByteBuffer getFrozenFieldValue(Cell udt) + { + UserType userType = (UserType) column.type; + int field = ByteBufferUtil.getUnsignedShort(path.get(0), 0); + return userType.unpack(udt.buffer()).get(field); + } + + public AbstractType getFieldSelectionType() + { + assert isFieldSelection() : "No field selection type exists"; + UserType userType = (UserType) column.type; + int field = ByteBufferUtil.getUnsignedShort(path.get(0), 0); + return userType.fieldType(field); + } + + public ByteBuffer toByteBuffer(TxnData data, AbstractType receiver) + { + // TODO: confirm all references can be satisfied as part of the txn condition + AbstractType type = column().type; + + // Modify the type we'll check if the reference is to a collection element. + if (selectsPath()) + { + if (type.isCollection()) + { + CollectionType collectionType = (CollectionType) type; + type = collectionType.kind == SET ? collectionType.nameComparator() : collectionType.valueComparator(); + } + else if (type.isUDT()) + type = getFieldSelectionType(); + } + + // Account for frozen collection and reversed clustering key references: + AbstractType receiveType = type.isFrozenCollection() ? receiver.freeze().unwrap() : receiver.unwrap(); + if (!(receiveType == type.unwrap())) + throw new IllegalArgumentException("Receiving type " + receiveType + " does not match " + type.unwrap()); + + if (column().isPartitionKey()) + return getPartitionKey(data); + else if (column().isClusteringColumn()) + return getClusteringKey(data); + + ColumnData columnData = getColumnData(data); + + if (columnData == null) + return null; + + if (selectsComplex()) + { + ComplexColumnData complex = (ComplexColumnData) columnData; + + if (type instanceof CollectionType) + { + CollectionType col = (CollectionType) type; + return col.serializeForNativeProtocol(complex.iterator()); + } + else if (type instanceof UserType) + { + UserType udt = (UserType) type; + return udt.serializeForNativeProtocol(complex.iterator()); + } + + throw new UnsupportedOperationException("Unsupported complex type: " + type); + } + else if (selectsFrozenCollectionElement()) + { + // If a path is selected for a non-frozen collection, the element will already be materialized. + return getFrozenCollectionElement((Cell) columnData); + } + else if (selectsFrozenUDTField()) + { + return getFrozenFieldValue((Cell) columnData); + } + + Cell cell = (Cell) columnData; + return selectsSetElement() ? cell.path().get(0) : cell.buffer(); + } + + private boolean selectsComplex() + { + return column.isComplex() && path == null; + } + + private boolean selectsSetElement() + { + return selectsPath() && column.type instanceof SetType; + } + + private boolean selectsFrozenCollectionElement() + { + return selectsPath() && column.type.isFrozenCollection(); + } + + private boolean selectsFrozenUDTField() + { + return selectsPath() && column.type.isUDT() && !column.type.isMultiCell(); + } + + static final ParameterisedVersionedSerializer serializer = new ParameterisedVersionedSerializer<>() + { + @Override + public void serialize(TxnReference reference, TableMetadatas tables, DataOutputPlus out, Version version) throws IOException + { + out.writeUnsignedVInt32(reference.tuple); + out.writeBoolean(reference.column != null); + if (reference.column != null) + { + tables.serialize(reference.table, out); + columnMetadataSerializer.serialize(reference.column, reference.table, out); + } + out.writeBoolean(reference.path != null); + if (reference.path != null) + CollectionType.cellPathSerializer.serialize(reference.path, out); + } + + @Override + public TxnReference deserialize(TableMetadatas tables, DataInputPlus in, Version version) throws IOException + { + int name = in.readUnsignedVInt32(); + TableMetadata table = null; + ColumnMetadata column = null; + if (in.readBoolean()) + { + table = tables.deserialize(in); + column = columnMetadataSerializer.deserialize(table, in); + } + CellPath path = in.readBoolean() ? CollectionType.cellPathSerializer.deserialize(in) : null; + return new TxnReference(name, table, column, path); + } + + @Override + public long serializedSize(TxnReference reference, TableMetadatas tables, Version version) + { + long size = 0; + size += VIntCoding.sizeOfUnsignedVInt(reference.tuple); + size += TypeSizes.BOOL_SIZE; + if (reference.column != null) + { + size += tables.serializedSize(reference.table); + size += columnMetadataSerializer.serializedSize(reference.column, reference.table); + } + size += TypeSizes.BOOL_SIZE; + if (reference.path != null) + size += CollectionType.cellPathSerializer.serializedSize(reference.path); + return size; + } + }; +} diff --git a/src/java/org/apache/cassandra/service/accord/txn/TxnReferenceOperation.java b/src/java/org/apache/cassandra/service/accord/txn/TxnReferenceOperation.java new file mode 100644 index 000000000000..21c22b5a57d1 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/txn/TxnReferenceOperation.java @@ -0,0 +1,316 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.txn; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.HashMap; +import java.util.Map; +import java.util.Objects; + +import org.apache.cassandra.cql3.FieldIdentifier; +import org.apache.cassandra.cql3.Operation; +import org.apache.cassandra.cql3.UpdateParameters; +import org.apache.cassandra.cql3.terms.Constants; +import org.apache.cassandra.cql3.terms.Lists; +import org.apache.cassandra.cql3.terms.Maps; +import org.apache.cassandra.cql3.terms.MultiElements; +import org.apache.cassandra.cql3.terms.Sets; +import org.apache.cassandra.cql3.terms.Term; +import org.apache.cassandra.cql3.terms.UserTypes; +import org.apache.cassandra.db.DecoratedKey; +import org.apache.cassandra.db.marshal.AbstractType; +import org.apache.cassandra.db.marshal.CollectionType; +import org.apache.cassandra.db.marshal.ListType; +import org.apache.cassandra.db.marshal.MapType; +import org.apache.cassandra.db.marshal.SetType; +import org.apache.cassandra.db.marshal.TupleType; +import org.apache.cassandra.db.marshal.UserType; +import org.apache.cassandra.db.rows.CellPath; +import org.apache.cassandra.io.ParameterisedVersionedSerializer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.schema.ColumnMetadata; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.service.accord.AccordSerializers; +import org.apache.cassandra.service.accord.serializers.TableMetadatas; +import org.apache.cassandra.service.accord.serializers.Version; +import org.apache.cassandra.utils.ByteBufferUtil; + +import static org.apache.cassandra.db.marshal.CollectionType.Kind.MAP; +import static org.apache.cassandra.service.accord.AccordSerializers.columnMetadataSerializer; + +public class TxnReferenceOperation +{ + private static final Map, Kind> operationKindMap = initOperationKindMap(); + + private static Map, Kind> initOperationKindMap() + { + Map, Kind> temp = new HashMap<>(); + temp.put(Sets.Adder.class, Kind.SetAdder); + temp.put(Constants.Adder.class, Kind.ConstantAdder); + temp.put(Lists.Appender.class, Kind.ListAppender); + temp.put(Sets.Discarder.class, Kind.SetDiscarder); + temp.put(Lists.Discarder.class, Kind.ListDiscarder); + temp.put(Lists.Prepender.class, Kind.ListPrepender); + temp.put(Maps.Putter.class, Kind.MapPutter); + temp.put(Lists.Setter.class, Kind.ListSetter); + temp.put(Sets.Setter.class, Kind.SetSetter); + temp.put(Maps.Setter.class, Kind.MapSetter); + temp.put(UserTypes.Setter.class, Kind.UserTypeSetter); + temp.put(Constants.Setter.class, Kind.ConstantSetter); + temp.put(Constants.Substracter.class, Kind.ConstantSubtracter); + temp.put(Maps.SetterByKey.class, Kind.MapSetterByKey); + temp.put(Lists.SetterByIndex.class, Kind.ListSetterByIndex); + temp.put(UserTypes.SetterByField.class, Kind.UserTypeSetterByField); + return temp; + } + + private interface ToOperation + { + Operation apply(ColumnMetadata column, Term keyOrIndex, FieldIdentifier field, Term value); + } + + public enum Kind + { + SetAdder((byte) 1, (column, keyOrIndex, field, value) -> new Sets.Adder(column, value)), + ConstantAdder((byte) 2, (column, keyOrIndex, field, value) -> new Constants.Adder(column, value)), + ListAppender((byte) 3, (column, keyOrIndex, field, value) -> new Lists.Appender(column, value)), + SetDiscarder((byte) 4, (column, keyOrIndex, field, value) -> new Sets.Discarder(column, value)), + ListDiscarder((byte) 5, (column, keyOrIndex, field, value) -> new Lists.Discarder(column, value)), + ListPrepender((byte) 6, (column, keyOrIndex, field, value) -> new Lists.Prepender(column, value)), + MapPutter((byte) 7, (column, keyOrIndex, field, value) -> new Maps.Putter(column, value)), + ListSetter((byte) 8, (column, keyOrIndex, field, value) -> new Lists.Setter(column, value)), + SetSetter((byte) 9, (column, keyOrIndex, field, value) -> new Sets.Setter(column, value)), + MapSetter((byte) 10, (column, keyOrIndex, field, value) -> new Maps.Setter(column, value)), + UserTypeSetter((byte) 11, (column, keyOrIndex, field, value) -> new UserTypes.Setter(column, value)), + ConstantSetter((byte) 12, (column, keyOrIndex, field, value) -> new Constants.Setter(column, value)), + ConstantSubtracter((byte) 13, (column, keyOrIndex, field, value) -> new Constants.Substracter(column, value)), + MapSetterByKey((byte) 14, (column, keyOrIndex, field, value) -> new Maps.SetterByKey(column, keyOrIndex, value)), + ListSetterByIndex((byte) 15, (column, keyOrIndex, field, value) -> new Lists.SetterByIndex(column, keyOrIndex, value)), + UserTypeSetterByField((byte) 16, (column, keyOrIndex, field, value) -> new UserTypes.SetterByField(column, field, value)); + + private final byte id; + private final ToOperation toOperation; + + Kind(byte id, ToOperation toOperation) + { + this.id = id; + this.toOperation = toOperation; + } + + public static Kind from(byte b) + { + for (Kind k : values()) + if (k.id == b) + return k; + + throw new IllegalArgumentException("There is no kind with id: " + b); + } + + public static Kind from(Operation operation) + { + Class clazz = operation.getClass(); + Kind kind = operationKindMap.get(clazz); + if (kind == null) + throw new IllegalArgumentException("There is no Kind associated with operation: " + clazz); + return kind; + } + + public static Kind setterFor(ColumnMetadata column) + { + if (column.type instanceof ListType) + return ListSetter; + else if (column.type instanceof SetType) + return SetSetter; + else if (column.type instanceof MapType) + return MapSetter; + else if (column.type instanceof UserType) + return UserTypeSetter; + + return ConstantSetter; + } + + public Operation toOperation(ColumnMetadata column, Term keyOrIndex, FieldIdentifier field, Term value) + { + return toOperation.apply(column, keyOrIndex, field, value); + } + } + + private final Kind kind; + private final ColumnMetadata receiver; + private final TableMetadata table; + private final ByteBuffer key; + private final ByteBuffer field; + private final TxnReferenceValue value; + private final AbstractType valueType; + + public TxnReferenceOperation(Kind kind, ColumnMetadata receiver, TableMetadata table, ByteBuffer key, ByteBuffer field, TxnReferenceValue value) + { + this.kind = kind; + this.receiver = receiver; + this.table = table; + this.key = key; + this.field = field; + + // We don't expect operators on clustering keys, but unwrap just in case. + AbstractType receiverType = receiver.type.unwrap(); + + if (kind == TxnReferenceOperation.Kind.SetDiscarder && receiverType.isCollection() && (((CollectionType) receiverType).kind == MAP)) + { + // The value for a map subtraction is actually a set (see Operation.Substraction) + this.valueType = SetType.getInstance(((MapType) receiverType).getKeysType(), true); + } + else if (kind == Kind.MapSetterByKey || kind == Kind.ListSetterByIndex) + { + this.valueType = ((CollectionType) receiverType).valueComparator(); + } + else if (kind == Kind.UserTypeSetterByField) + { + UserType userType = (UserType) receiverType; + CellPath fieldPath = userType.cellPathForField(new FieldIdentifier(field)); + this.valueType = userType.fieldType(fieldPath); + } + else + { + this.valueType = receiverType; + } + + this.value = value; + } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + TxnReferenceOperation that = (TxnReferenceOperation) o; + return Objects.equals(receiver, that.receiver) + && kind == that.kind + && Objects.equals(key, that.key) + && Objects.equals(field, that.field) + && Objects.equals(value, that.value); + } + + public void collect(TableMetadatas.Collector collector) + { + collector.add(table); + value.collect(collector); + } + + @Override + public int hashCode() + { + return Objects.hash(receiver, kind, key, field, value); + } + + @Override + public String toString() + { + return receiver + " = " + value; + } + + public ColumnMetadata receiver() + { + return receiver; + } + + public void apply(TxnData data, DecoratedKey key, UpdateParameters up) + { + Operation operation = toOperation(data); + operation.execute(key, up); + } + + private Operation toOperation(TxnData data) + { + FieldIdentifier fieldIdentifier = field == null ? null : new FieldIdentifier(field); + Term valueTerm = toTerm(data, valueType); + Term keyorIndexTerm = key == null ? null : toTerm(key, valueType); + return kind.toOperation(receiver, keyorIndexTerm, fieldIdentifier, valueTerm); + } + + private Term toTerm(TxnData data, AbstractType receivingType) + { + ByteBuffer bytes = value.compute(data, receivingType); + if (bytes == null) + return Constants.NULL_VALUE; + return toTerm(bytes, receivingType); + } + + private Term toTerm(ByteBuffer bytes, AbstractType receivingType) + { + if (receivingType.isCollection()) + return AccordSerializers.deserializeCqlCollectionAsTerm(bytes, receivingType); + else if (receivingType.isUDT()) + return MultiElements.Value.fromSerialized(bytes, (UserType) receivingType); + else if (receivingType.isTuple()) + return MultiElements.Value.fromSerialized(bytes, (TupleType) receivingType); + + return new Constants.Value(bytes); + } + + static final ParameterisedVersionedSerializer serializer = new ParameterisedVersionedSerializer<>() + { + @Override + public void serialize(TxnReferenceOperation operation, TableMetadatas tables, DataOutputPlus out, Version version) throws IOException + { + out.writeByte(operation.kind.id); + tables.serialize(operation.table, out); + columnMetadataSerializer.serialize(operation.receiver, operation.table, out); + TxnReferenceValue.serializer.serialize(operation.value, tables, out, version); + + out.writeBoolean(operation.key != null); + if (operation.key != null) + ByteBufferUtil.writeWithVIntLength(operation.key, out); + + out.writeBoolean(operation.field != null); + if (operation.field != null) + ByteBufferUtil.writeWithVIntLength(operation.field, out); + } + + @Override + public TxnReferenceOperation deserialize(TableMetadatas tables, DataInputPlus in, Version version) throws IOException + { + Kind kind = Kind.from(in.readByte()); + TableMetadata table = tables.deserialize(in); + ColumnMetadata receiver = columnMetadataSerializer.deserialize(table, in); + TxnReferenceValue value = TxnReferenceValue.serializer.deserialize(tables, in, version); + ByteBuffer key = in.readBoolean() ? ByteBufferUtil.readWithVIntLength(in) : null; + ByteBuffer field = in.readBoolean() ? ByteBufferUtil.readWithVIntLength(in) : null; + return new TxnReferenceOperation(kind, receiver, table, key, field, value); + } + + @Override + public long serializedSize(TxnReferenceOperation operation, TableMetadatas tables, Version version) + { + long size = Byte.BYTES; + size += tables.serializedSize(operation.table); + size += columnMetadataSerializer.serializedSize(operation.receiver, operation.table); + size += TxnReferenceValue.serializer.serializedSize(operation.value, tables, version); + + if (operation.key != null) + size += ByteBufferUtil.serializedSizeWithVIntLength(operation.key); + + if (operation.field != null) + size += ByteBufferUtil.serializedSizeWithVIntLength(operation.field); + + return size; + } + }; +} diff --git a/src/java/org/apache/cassandra/service/accord/txn/TxnReferenceOperations.java b/src/java/org/apache/cassandra/service/accord/txn/TxnReferenceOperations.java new file mode 100644 index 000000000000..679106be1172 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/txn/TxnReferenceOperations.java @@ -0,0 +1,145 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.txn; + +import java.io.IOException; +import java.util.Collections; +import java.util.List; +import java.util.Objects; + +import com.google.common.base.Preconditions; + +import org.apache.cassandra.db.Clustering; +import org.apache.cassandra.db.TypeSizes; +import org.apache.cassandra.io.ParameterisedVersionedSerializer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.service.accord.serializers.TableMetadatas; +import org.apache.cassandra.service.accord.serializers.Version; + +import static org.apache.cassandra.utils.CollectionSerializers.deserializeList; +import static org.apache.cassandra.utils.CollectionSerializers.serializeList; +import static org.apache.cassandra.utils.CollectionSerializers.serializedListSize; + +public class TxnReferenceOperations +{ + private static final TxnReferenceOperations EMPTY = new TxnReferenceOperations(null, null, Collections.emptyList(), Collections.emptyList()); + + private final TableMetadata metadata; + final Clustering clustering; + final List regulars; + final List statics; + + public TxnReferenceOperations(TableMetadata metadata, Clustering clustering, List regulars, List statics) + { + this.metadata = metadata; + Preconditions.checkArgument(clustering != null || regulars.isEmpty()); + this.clustering = clustering; + this.regulars = regulars; + this.statics = statics; + } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + TxnReferenceOperations that = (TxnReferenceOperations) o; + return metadata.equals(that.metadata) && Objects.equals(clustering, that.clustering) && regulars.equals(that.regulars) && statics.equals(that.statics); + } + + @Override + public int hashCode() + { + return Objects.hash(metadata, clustering, regulars, statics); + } + + @Override + public String toString() + { + return "TxnReferenceOperations{metadata=" + metadata + ", clustering=" + clustering + ", regulars=" + regulars + ", statics=" + statics + '}'; + } + + public static TxnReferenceOperations empty() + { + return EMPTY; + } + + public boolean isEmpty() + { + return regulars.isEmpty() && statics.isEmpty(); + } + + static final ParameterisedVersionedSerializer serializer = new ParameterisedVersionedSerializer<>() + { + @Override + public void serialize(TxnReferenceOperations operations, TableMetadatas tables, DataOutputPlus out, Version version) throws IOException + { + out.writeBoolean(!operations.isEmpty()); + if (operations.isEmpty()) + return; + + tables.serialize(operations.metadata, out); + out.writeBoolean(operations.clustering != null); + if (operations.clustering != null) + Clustering.serializer.serialize(operations.clustering, out, version.messageVersion(), operations.metadata.comparator.subtypes()); + serializeList(operations.regulars, tables, out, version, TxnReferenceOperation.serializer); + serializeList(operations.statics, tables, out, version, TxnReferenceOperation.serializer); + } + + @Override + public TxnReferenceOperations deserialize(TableMetadatas tables, DataInputPlus in, Version version) throws IOException + { + if (!in.readBoolean()) + return TxnReferenceOperations.empty(); + + TableMetadata metadata = tables.deserialize(in); + Clustering clustering = in.readBoolean() ? Clustering.serializer.deserialize(in, version.messageVersion(), metadata.comparator.subtypes()) : null; + return new TxnReferenceOperations(metadata, clustering, deserializeList(tables, in, version, TxnReferenceOperation.serializer), + deserializeList(tables, in, version, TxnReferenceOperation.serializer)); + } + + @Override + public long serializedSize(TxnReferenceOperations operations, TableMetadatas tables, Version version) + { + long size = TypeSizes.BOOL_SIZE; + if (operations.isEmpty()) + return size; + size += tables.serializedSize(operations.metadata); + size += TypeSizes.BOOL_SIZE; + if (operations.clustering != null) + size += Clustering.serializer.serializedSize(operations.clustering, version.messageVersion(), operations.metadata.comparator.subtypes()); + size += serializedListSize(operations.regulars, tables, version, TxnReferenceOperation.serializer); + size += serializedListSize(operations.statics, tables, version, TxnReferenceOperation.serializer); + return size; + } + + private TableMetadatas tables(TxnReferenceOperations operations) + { + TableMetadatas.Collector collector = new TableMetadatas.Collector(); + collector.add(operations.metadata); + for (TxnReferenceOperation op : operations.regulars) + op.collect(collector); + for (TxnReferenceOperation op : operations.statics) + op.collect(collector); + return collector.build(); + } + }; +} diff --git a/src/java/org/apache/cassandra/service/accord/txn/TxnReferenceValue.java b/src/java/org/apache/cassandra/service/accord/txn/TxnReferenceValue.java new file mode 100644 index 000000000000..7dbcea1c9372 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/txn/TxnReferenceValue.java @@ -0,0 +1,229 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.txn; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Objects; + +import org.apache.cassandra.db.TypeSizes; +import org.apache.cassandra.db.marshal.AbstractType; +import org.apache.cassandra.io.ParameterisedVersionedSerializer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.service.accord.serializers.TableMetadatas; +import org.apache.cassandra.service.accord.serializers.Version; +import org.apache.cassandra.utils.ByteBufferUtil; + +public abstract class TxnReferenceValue +{ + private interface Serializer + { + void serialize(T t, TableMetadatas tables, DataOutputPlus out, Version version) throws IOException; + T deserialize(TableMetadatas tables, DataInputPlus in, Version version, Kind kind) throws IOException; + long serializedSize(T t, TableMetadatas tables, Version version); + } + + enum Kind + { + CONSTANT(Constant.serializer), + SUBSTITUTION(Substitution.serializer); + + @SuppressWarnings("rawtypes") + final Serializer serializer; + + Kind(Serializer serializer) + { + this.serializer = serializer; + } + } + + protected abstract Kind kind(); + abstract ByteBuffer compute(TxnData data, AbstractType receiver); + abstract void collect(TableMetadatas.Collector collector); + + public static class Constant extends TxnReferenceValue + { + private final ByteBuffer value; + + public Constant(ByteBuffer value) + { + this.value = value; + } + + public ByteBuffer getValue() + { + return value; + } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + Constant constant = (Constant) o; + return value.equals(constant.value); + } + + @Override + public int hashCode() + { + return Objects.hash(value); + } + + @Override + public String toString() + { + return "Constant=" + ByteBufferUtil.bytesToHex(value); + } + + @Override + public Kind kind() + { + return Kind.CONSTANT; + } + + @Override + public ByteBuffer compute(TxnData data, AbstractType receiver) + { + return value; + } + + @Override + void collect(TableMetadatas.Collector collector) + { + } + + private static final Serializer serializer = new Serializer() + { + @Override + public void serialize(Constant constant, TableMetadatas tables, DataOutputPlus out, Version version) throws IOException + { + ByteBufferUtil.writeWithVIntLength(constant.value, out); + } + + @Override + public Constant deserialize(TableMetadatas tables, DataInputPlus in, Version version, Kind kind) throws IOException + { + return new Constant(ByteBufferUtil.readWithVIntLength(in)); + } + + @Override + public long serializedSize(Constant constant, TableMetadatas tables, Version version) + { + return ByteBufferUtil.serializedSizeWithVIntLength(constant.value); + } + }; + } + + public static class Substitution extends TxnReferenceValue + { + private final TxnReference reference; + + public Substitution(TxnReference reference) + { + this.reference = reference; + } + + @Override + public String toString() + { + return reference.toString(); + } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + Substitution that = (Substitution) o; + return reference.equals(that.reference); + } + + @Override + public int hashCode() + { + return Objects.hash(reference); + } + + @Override + public Kind kind() + { + return Kind.SUBSTITUTION; + } + + @Override + public ByteBuffer compute(TxnData data, AbstractType receiver) + { + return reference.toByteBuffer(data, receiver); + } + + @Override + void collect(TableMetadatas.Collector collector) + { + reference.collect(collector); + } + + private static final Serializer serializer = new Serializer<>() + { + @Override + public void serialize(Substitution substitution, TableMetadatas tables, DataOutputPlus out, Version version) throws IOException + { + TxnReference.serializer.serialize(substitution.reference, tables, out, version); + } + + @Override + public Substitution deserialize(TableMetadatas tables, DataInputPlus in, Version version, Kind kind) throws IOException + { + return new Substitution(TxnReference.serializer.deserialize(tables, in, version)); + } + + @Override + public long serializedSize(Substitution substitution, TableMetadatas tables, Version version) + { + return TxnReference.serializer.serializedSize(substitution.reference, tables, version); + } + }; + } + + static final ParameterisedVersionedSerializer serializer = new ParameterisedVersionedSerializer<>() + { + @SuppressWarnings("unchecked") + @Override + public void serialize(TxnReferenceValue value, TableMetadatas tables, DataOutputPlus out, Version version) throws IOException + { + out.writeUnsignedVInt32(value.kind().ordinal()); + value.kind().serializer.serialize(value, tables, out, version); + } + + @Override + public TxnReferenceValue deserialize(TableMetadatas tables, DataInputPlus in, Version version) throws IOException + { + Kind kind = Kind.values()[in.readUnsignedVInt32()]; + return kind.serializer.deserialize(tables, in, version, kind); + } + + @SuppressWarnings("unchecked") + @Override + public long serializedSize(TxnReferenceValue value, TableMetadatas tables, Version version) + { + return TypeSizes.sizeofUnsignedVInt(value.kind().ordinal()) + value.kind().serializer.serializedSize(value, tables, version); + } + }; +} diff --git a/src/java/org/apache/cassandra/service/accord/txn/TxnResult.java b/src/java/org/apache/cassandra/service/accord/txn/TxnResult.java new file mode 100644 index 000000000000..2b5af7c08b27 --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/txn/TxnResult.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.txn; + +import accord.api.Result; + +public interface TxnResult extends Result +{ + enum Kind + { + txn_data(0), + retry_new_protocol(1), + range_read(2); + + int id; + + Kind(int id) + { + this.id = id; + } + } + + Kind kind(); + + long estimatedSizeOnHeap(); +} diff --git a/src/java/org/apache/cassandra/service/accord/txn/TxnUpdate.java b/src/java/org/apache/cassandra/service/accord/txn/TxnUpdate.java new file mode 100644 index 000000000000..1fdc0e54f9ee --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/txn/TxnUpdate.java @@ -0,0 +1,399 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.txn; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.Objects; +import javax.annotation.Nullable; + +import accord.api.Data; +import accord.api.Update; +import accord.primitives.Keys; +import accord.primitives.Participants; +import accord.primitives.Ranges; +import accord.primitives.RoutableKey; +import accord.primitives.Timestamp; +import accord.utils.Invariants; +import org.apache.cassandra.cql3.QueryOptions; +import org.apache.cassandra.db.ConsistencyLevel; +import org.apache.cassandra.db.TypeSizes; +import org.apache.cassandra.io.util.DataInputBuffer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputBuffer; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.service.accord.AccordObjectSizes; +import org.apache.cassandra.service.accord.IAccordService; +import org.apache.cassandra.service.accord.api.PartitionKey; +import org.apache.cassandra.service.accord.serializers.TableMetadatas; +import org.apache.cassandra.service.accord.serializers.TableMetadatasAndKeys; +import org.apache.cassandra.service.accord.serializers.Version; +import org.apache.cassandra.service.accord.txn.TxnCondition.SerializedTxnCondition; +import org.apache.cassandra.service.accord.txn.TxnWrite.Fragment; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.transport.ProtocolVersion; +import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.ObjectSizes; + +import static accord.utils.Invariants.requireArgument; +import static accord.utils.SortedArrays.Search.CEIL; +import static com.google.common.base.Preconditions.checkState; +import static org.apache.cassandra.service.accord.AccordSerializers.consistencyLevelSerializer; +import static org.apache.cassandra.utils.ArraySerializers.deserializeArray; +import static org.apache.cassandra.utils.ArraySerializers.serializeArray; +import static org.apache.cassandra.utils.ArraySerializers.serializedArraySize; +import static org.apache.cassandra.utils.ByteBufferUtil.readWithVIntLength; +import static org.apache.cassandra.utils.ByteBufferUtil.serializedSizeWithVIntLength; +import static org.apache.cassandra.utils.ByteBufferUtil.writeWithVIntLength; +import static org.apache.cassandra.utils.NullableSerializer.deserializeNullable; +import static org.apache.cassandra.utils.NullableSerializer.serializeNullable; +import static org.apache.cassandra.utils.NullableSerializer.serializedNullableSize; + +public class TxnUpdate extends AccordUpdate +{ + private static final long EMPTY_SIZE = ObjectSizes.measure(new TxnUpdate(TableMetadatas.none(), null, new ByteBuffer[0], null, null, false)); + private static final int FLAG_PRESERVE_TIMESTAMPS = 0x1; + + final TableMetadatas tables; + private final Keys keys; + private final ByteBuffer[] fragments; + private final AbstractSerialized condition; + + @Nullable + private final ConsistencyLevel cassandraCommitCL; + + // Hints and batchlog want to write with the lower timestamp they generated when applying their writes via Accord + // so they don't resurrect data if they are applied at a later time. Accord should be fine with this because + // the writes are still deterministic from the perspective of coordinators/recovery coordinators. + private final boolean preserveTimestamps; + + // Memoize computation of condition + private Boolean conditionResult; + + public TxnUpdate(TableMetadatas tables, List fragments, TxnCondition condition, @Nullable ConsistencyLevel cassandraCommitCL, boolean preserveTimestamps) + { + requireArgument(cassandraCommitCL == null || IAccordService.SUPPORTED_COMMIT_CONSISTENCY_LEVELS.contains(cassandraCommitCL)); + this.tables = tables; + this.keys = Keys.of(fragments, fragment -> fragment.key); + fragments.sort(Fragment::compareKeys); + // TODO (required): this node could be on version N while the peers are on N-1, which would have issues as the peers wouldn't know about N yet. + // Can not eagerly serialize until we know the "correct" version, else we need a way to fallback on mismatch. + this.fragments = toSerializedValuesArray(keys, fragments, tables, Version.LATEST); + // TODO (desired): slice TxnCondition, or pick a single shard to persist it + this.condition = new SerializedTxnCondition(condition, tables); + this.condition.unmemoize(); + this.condition.deserialize(tables); + this.cassandraCommitCL = cassandraCommitCL; + this.preserveTimestamps = preserveTimestamps; + } + + private TxnUpdate(TableMetadatas tables, Keys keys, ByteBuffer[] fragments, AbstractSerialized condition, ConsistencyLevel cassandraCommitCL, boolean preserveTimestamps) + { + this.tables = tables; + this.keys = keys; + this.fragments = fragments; + this.condition = condition; + this.cassandraCommitCL = cassandraCommitCL; + this.preserveTimestamps = preserveTimestamps; + } + + public static TxnUpdate empty() + { + return new TxnUpdate(TableMetadatas.none(), Collections.emptyList(), TxnCondition.none(), null, false); + } + + @Override + public long estimatedSizeOnHeap() + { + long size = EMPTY_SIZE + condition.estimatedSizeOnHeap(); + for (ByteBuffer update : fragments) + size += ByteBufferUtil.estimatedSizeOnHeap(update); + size += AccordObjectSizes.keys(keys); + return size; + } + + @Override + public String toString() + { + return "TxnUpdate{updates=" + deserialize(keys, tables, fragments) + + ", condition=" + condition.deserialize(tables) + '}'; + } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + TxnUpdate txnUpdate = (TxnUpdate) o; + return Arrays.equals(fragments, txnUpdate.fragments) && Objects.equals(condition, txnUpdate.condition); + } + + @Override + public int hashCode() + { + int result = Objects.hash(condition); + result = 31 * result + Arrays.hashCode(fragments); + return result; + } + + @Override + public Keys keys() + { + // TODO: It doesn't seem to affect correctness, but should we return the union of the fragment + condition keys? + return keys; + } + + // Batch log and hints want to keep their lower timestamp for the applied writes to avoid resurrecting old data + // when they are applied later, possibly after further updates have already been acknowledged. + public boolean preserveTimestamps() + { + return preserveTimestamps; + } + + @Override + public Update slice(Ranges ranges) + { + Keys keys = this.keys.slice(ranges); + // TODO (desired): Slice the condition. + return new TxnUpdate(tables, keys, select(this.keys, keys, fragments), condition, cassandraCommitCL, preserveTimestamps); + } + + @Override + public Update intersecting(Participants participants) + { + Keys keys = this.keys.intersecting(participants); + // TODO (desired): Slice the condition. + return new TxnUpdate(tables, keys, select(this.keys, keys, fragments), condition, cassandraCommitCL, preserveTimestamps); + } + + private static ByteBuffer[] select(Keys in, Keys out, ByteBuffer[] from) + { + ByteBuffer[] result = new ByteBuffer[out.size()]; + int j = 0; + for (int i = 0 ; i < out.size() ; ++i) + { + j = in.findNext(j, out.get(i), CEIL); + result[i] = from[j]; + } + return result; + } + + @Override + public Update merge(Update update) + { + TxnUpdate that = (TxnUpdate) update; + Keys mergedKeys = this.keys.with(that.keys); + // TODO (desired): special method for linear merging keyed and non-keyed lists simultaneously + ByteBuffer[] mergedFragments = merge(this.keys, that.keys, this.fragments, that.fragments, mergedKeys.size()); + return new TxnUpdate(tables, mergedKeys, mergedFragments, condition, cassandraCommitCL, preserveTimestamps); + } + + private static ByteBuffer[] merge(Keys leftKeys, Keys rightKeys, ByteBuffer[] left, ByteBuffer[] right, int outputSize) + { + ByteBuffer[] out = new ByteBuffer[outputSize]; + int l = 0, r = 0, o = 0; + while (l < leftKeys.size() && r < rightKeys.size()) + { + int c = leftKeys.get(l).compareTo(rightKeys.get(r)); + if (c < 0) { out[o++] = left[l++]; } + else if (c > 0) { out[o++] = right[r++]; } + else if (ByteBufferUtil.compareUnsigned(left[l], right[r]) != 0) { throw new IllegalStateException("The same keys have different values in each input"); } + else { out[o++] = left[l++]; r++; } + } + while (l < leftKeys.size()) { out[o++] = left[l++]; } + while (r < rightKeys.size()) { out[o++] = right[r++]; } + return out; + } + + @Override + public TxnWrite apply(Timestamp executeAt, Data data) + { + ClusterMetadata cm = ClusterMetadata.current(); + checkState(cm.epoch.getEpoch() >= executeAt.epoch(), "TCM epoch %d is < executeAt epoch %d", cm.epoch.getEpoch(), executeAt.epoch()); + if (!checkCondition(data)) + return TxnWrite.EMPTY_CONDITION_FAILED; + + if (keys.isEmpty()) + return new TxnWrite(TableMetadatas.none(), Collections.emptyList(), true); + + List fragments = deserialize(keys, tables, this.fragments); + List updates = new ArrayList<>(fragments.size()); + QueryOptions options = QueryOptions.forProtocolVersion(ProtocolVersion.CURRENT); + AccordUpdateParameters parameters = new AccordUpdateParameters((TxnData) data, options, executeAt.uniqueHlc()); + + for (Fragment fragment : fragments) + // Filter out fragments that already constitute complete updates to avoid persisting them via TxnWrite: + if (!fragment.isComplete()) + updates.add(fragment.complete(parameters, tables)); + + return new TxnWrite(tables, updates, true); + } + + public List completeUpdatesForKey(RoutableKey key) + { + List fragments = deserialize(keys, tables, this.fragments); + List updates = new ArrayList<>(fragments.size()); + + for (Fragment fragment : fragments) + if (fragment.isComplete() && fragment.key.equals(key)) + updates.add(fragment.toUpdate(tables)); + + return updates; + } + + public static final AccordUpdateSerializer serializer = new AccordUpdateSerializer() + { + @Override + public void serialize(TxnUpdate update, TableMetadatasAndKeys tablesAndKeys, DataOutputPlus out, Version version) throws IOException + { + out.writeByte(update.preserveTimestamps ? FLAG_PRESERVE_TIMESTAMPS : 0); + tablesAndKeys.serializeKeys(update.keys, out); + writeWithVIntLength(update.condition.bytes(tablesAndKeys.tables, version), out); + serializeArray(update.fragments, out, ByteBufferUtil.byteBufferSerializer); + serializeNullable(update.cassandraCommitCL, out, consistencyLevelSerializer); + } + + @Override + public TxnUpdate deserialize(TableMetadatasAndKeys tablesAndKeys, DataInputPlus in, Version version) throws IOException + { + int flags = in.readByte(); + boolean preserveTimestamps = (FLAG_PRESERVE_TIMESTAMPS & flags) == 1; + Keys keys = tablesAndKeys.deserializeKeys(in); + ByteBuffer condition = readWithVIntLength(in); + ByteBuffer[] fragments = deserializeArray(in, ByteBufferUtil.byteBufferSerializer, ByteBuffer[]::new); + ConsistencyLevel consistencyLevel = deserializeNullable(in, consistencyLevelSerializer); + return new TxnUpdate(tablesAndKeys.tables, keys, fragments, new SerializedTxnCondition(condition), consistencyLevel, preserveTimestamps); + } + + @Override + public long serializedSize(TxnUpdate update, TableMetadatasAndKeys tablesAndKeys, Version version) + { + long size = 1; // flags + size += tablesAndKeys.serializedKeysSize(update.keys); + size += serializedSizeWithVIntLength(update.condition.bytes(tablesAndKeys.tables, version)); + size += serializedArraySize(update.fragments, ByteBufferUtil.byteBufferSerializer); + size += serializedNullableSize(update.cassandraCommitCL, consistencyLevelSerializer); + return size; + } + }; + + private static ByteBuffer[] toSerializedValuesArray(Keys keys, List items, TableMetadatas tables, Version version) + { + ByteBuffer[] result = new ByteBuffer[keys.size()]; + int i = 0, mi = items.size(), ki = 0; + while (i < mi) + { + PartitionKey key = items.get(i).key; + int j = i + 1; + while (j < mi && items.get(j).key.equals(key)) + ++j; + + int nextki = keys.findNext(ki, key, CEIL); + Arrays.fill(result, ki, nextki, ByteBufferUtil.EMPTY_BYTE_BUFFER); + ki = nextki; + result[ki++] = toSerializedValues(items, tables, i, j, version); + i = j; + } + Arrays.fill(result, ki, result.length, ByteBufferUtil.EMPTY_BYTE_BUFFER); + return result; + } + + private static ByteBuffer toSerializedValues(List items, TableMetadatas tables, int start, int end, Version version) + { + long size = TypeSizes.sizeofUnsignedVInt(version.version) + TypeSizes.sizeofUnsignedVInt(end - start); + for (int i = start ; i < end ; ++i) + size += Fragment.serializer.serializedSize(items.get(i), tables, version); + + try (DataOutputBuffer out = new DataOutputBuffer((int) size)) + { + out.writeUnsignedVInt32(version.version); + out.writeUnsignedVInt32(end - start); + for (int i = start ; i < end ; ++i) + Fragment.serializer.serialize(items.get(i), tables, out, version); + return out.buffer(false); + } + catch (IOException e) + { + throw new RuntimeException(e); + } + } + + private static List deserialize(PartitionKey key, TableMetadatas tables, ByteBuffer bytes) + { + if (!bytes.hasRemaining()) + return Collections.emptyList(); + + try (DataInputBuffer in = new DataInputBuffer(bytes, true)) + { + Version version = Version.fromVersion(in.readUnsignedVInt32()); + int count = in.readUnsignedVInt32(); + switch (count) + { + case 0: throw new IllegalStateException(); + case 1: return Collections.singletonList(Fragment.serializer.deserialize(key, tables, in, version)); + default: + List result = new ArrayList<>(); + for (int i = 0 ; i < count ; ++i) + result.add(Fragment.serializer.deserialize(key, tables, in, version)); + return result; + } + } + catch (IOException e) + { + throw new RuntimeException(e); + } + } + + private static List deserialize(Keys keys, TableMetadatas tables, ByteBuffer[] buffers) + { + Invariants.require(keys.size() == buffers.length); + List result = new ArrayList<>(buffers.length); + for (int i = 0 ; i < keys.size() ; ++i) + result.addAll(deserialize((PartitionKey) keys.get(i), tables, buffers[i])); + return result; + } + + @Override + public boolean checkCondition(Data data) + { + // Assert data that was memoized is same as data that is provided? + if (conditionResult != null) + return conditionResult; + TxnCondition condition = this.condition.deserialize(tables); + if (condition == TxnCondition.none()) + return conditionResult = true; + return conditionResult = condition.applies((TxnData) data); + } + + @Override + public Kind kind() + { + return Kind.TXN; + } + + @Override + public ConsistencyLevel cassandraCommitCL() + { + return cassandraCommitCL; + } +} diff --git a/src/java/org/apache/cassandra/service/accord/txn/TxnWrite.java b/src/java/org/apache/cassandra/service/accord/txn/TxnWrite.java new file mode 100644 index 000000000000..abd27ae588be --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/txn/TxnWrite.java @@ -0,0 +1,495 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.txn; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Objects; +import java.util.Set; + +import com.google.common.collect.Iterables; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import accord.api.DataStore; +import accord.api.Write; +import accord.local.SafeCommandStore; +import accord.primitives.PartialTxn; +import accord.primitives.Routable.Domain; +import accord.primitives.RoutableKey; +import accord.primitives.Seekable; +import accord.primitives.Seekables; +import accord.primitives.Timestamp; +import accord.primitives.TxnId; +import accord.primitives.Writes; +import accord.utils.async.AsyncChain; +import accord.utils.async.AsyncChains; +import org.apache.cassandra.concurrent.Stage; +import org.apache.cassandra.cql3.UpdateParameters; +import org.apache.cassandra.db.Clustering; +import org.apache.cassandra.db.Columns; +import org.apache.cassandra.db.DecoratedKey; +import org.apache.cassandra.db.Mutation; +import org.apache.cassandra.db.ReadCommand.PotentialTxnConflicts; +import org.apache.cassandra.db.RegularAndStaticColumns; +import org.apache.cassandra.db.TypeSizes; +import org.apache.cassandra.db.partitions.PartitionUpdate; +import org.apache.cassandra.db.rows.Row; +import org.apache.cassandra.io.ParameterisedVersionedSerializer; +import org.apache.cassandra.io.util.DataInputBuffer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputBuffer; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.schema.ColumnMetadata; +import org.apache.cassandra.service.accord.api.PartitionKey; +import org.apache.cassandra.service.accord.serializers.TableMetadatas; +import org.apache.cassandra.service.accord.serializers.TableMetadatasAndKeys; +import org.apache.cassandra.service.accord.serializers.Version; +import org.apache.cassandra.utils.BooleanSerializer; +import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.ObjectSizes; + +import static com.google.common.base.Preconditions.checkState; +import static org.apache.cassandra.db.rows.DeserializationHelper.Flag.FROM_REMOTE; +import static org.apache.cassandra.utils.ArraySerializers.deserializeArray; +import static org.apache.cassandra.utils.ArraySerializers.serializeArray; +import static org.apache.cassandra.utils.ArraySerializers.serializedArraySize; + +public class TxnWrite extends AbstractKeySorted implements Write +{ + @SuppressWarnings("unused") + private static final Logger logger = LoggerFactory.getLogger(TxnWrite.class); + + public static final TxnWrite EMPTY_CONDITION_FAILED = new TxnWrite(TableMetadatas.none(), Collections.emptyList(), false); + + private static final long EMPTY_SIZE = ObjectSizes.measure(EMPTY_CONDITION_FAILED); + + public static class Update extends AbstractSerialized + { + private static final long EMPTY_SIZE = ObjectSizes.measure(new Update(null, 0, ByteBufferUtil.EMPTY_BYTE_BUFFER)); + public final PartitionKey key; + public final int index; + + public Update(PartitionKey key, int index, PartitionUpdate update, TableMetadatas tables) + { + this(key, index, serializeInternal(update, tables, Version.LATEST)); + } + + private Update(PartitionKey key, int index, ByteBuffer latestVersionBytes) + { + super(latestVersionBytes); + this.key = key; + this.index = index; + } + + @Override + public long estimatedSizeOnHeap() + { + // we don't measure the key, as this is shared + return EMPTY_SIZE + ByteBufferUtil.estimatedSizeOnHeap(unsafeBytes()); + } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + if (!super.equals(o)) return false; + Update update = (Update) o; + return index == update.index && key.equals(update.key); + } + + @Override + public int hashCode() + { + return Objects.hash(super.hashCode(), key, index); + } + + @Override + public String toString() + { + return "Complete{" + + "key=" + key + + ", index=" + index + + '}'; + } + + public AsyncChain write(TableMetadatas tables, boolean preserveTimestamps, long timestamp) + { + PartitionUpdate update = deserialize(tables); + if (!preserveTimestamps) + update = new PartitionUpdate.Builder(update, 0).updateAllTimestamp(timestamp).build(); + Mutation mutation = new Mutation(update, PotentialTxnConflicts.ALLOW); + return AsyncChains.ofRunnable(Stage.MUTATION.executor(), mutation::applyUnsafe); + } + + @Override + protected ByteBuffer serialize(PartitionUpdate value, TableMetadatas tables, Version version) + { + return serializeInternal(value, tables, version); + } + + @Override + protected ByteBuffer reserialize(ByteBuffer bytes, TableMetadatas param, Version srcVersion, Version trgVersion) + { + return bytes; + } + + @Override + protected PartitionUpdate deserialize(TableMetadatas tables, ByteBuffer bytes, Version version) + { + return deserialize(key, tables, bytes, version); + } + + private static ByteBuffer serializeInternal(PartitionUpdate value, TableMetadatas tables, Version version) + { + try (DataOutputBuffer out = DataOutputBuffer.scratchBuffer.get()) + { + PartitionUpdate.serializer.serializeWithoutKey(value, tables, out, version.messageVersion()); + return out.asNewBuffer(); + } + catch (IOException e) + { + throw new UncheckedIOException(e); + } + } + + private static PartitionUpdate deserialize(PartitionKey key, TableMetadatas tables, ByteBuffer bytes, Version version) + { + try (DataInputBuffer in = new DataInputBuffer(bytes, true)) + { + return PartitionUpdate.serializer.deserialize(key, tables, in, version.messageVersion(), FROM_REMOTE); + } + catch (IOException e) + { + throw new UncheckedIOException(e); + } + } + + public static final ParameterisedVersionedSerializer serializer = new ParameterisedVersionedSerializer() + { + @Override + public void serialize(Update write, TableMetadatasAndKeys tablesAndKeys, DataOutputPlus out, Version version) throws IOException + { + tablesAndKeys.serializeKey(write.key, out); + out.writeInt(write.index); + ByteBufferUtil.writeWithVIntLength(write.bytes(tablesAndKeys.tables, version), out); + } + + ByteBuffer reserialize(ByteBuffer buffer, TableMetadatasAndKeys tablesAndKeys, Version srcVersion, Version trgVersion) + { + return buffer; + } + + @Override + public Update deserialize(TableMetadatasAndKeys tablesAndKeys, DataInputPlus in, Version version) throws IOException + { + PartitionKey key = tablesAndKeys.deserializeKey(in); + int index = in.readInt(); + ByteBuffer bytes = ByteBufferUtil.readWithVIntLength(in); + if (version != Version.LATEST) + bytes = reserialize(bytes, tablesAndKeys, version, Version.LATEST); + return new Update(key, index, bytes); + } + + @Override + public long serializedSize(Update write, TableMetadatasAndKeys tablesAndKeys, Version version) + { + long size = 0; + size += tablesAndKeys.serializedKeySize(write.key); + size += TypeSizes.INT_SIZE; + size += ByteBufferUtil.serializedSizeWithVIntLength(write.bytes(tablesAndKeys.tables, version)); + return size; + } + }; + } + + /** + * Partition update that can later be supplemented with data from the read phase + */ + public static class Fragment + { + public final PartitionKey key; + public final int index; + public final PartitionUpdate baseUpdate; + public final TxnReferenceOperations referenceOps; + + public Fragment(PartitionKey key, int index, PartitionUpdate baseUpdate, TxnReferenceOperations referenceOps) + { + this.key = key; + this.index = index; + this.baseUpdate = baseUpdate; + this.referenceOps = referenceOps; + } + + public static int compareKeys(Fragment left, Fragment right) + { + return left.key.compareTo(right.key); + } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + Fragment fragment = (Fragment) o; + return index == fragment.index && key.equals(fragment.key) && baseUpdate.equals(fragment.baseUpdate) && referenceOps.equals(fragment.referenceOps); + } + + @Override + public int hashCode() + { + return Objects.hash(key, index, baseUpdate, referenceOps); + } + + @Override + public String toString() + { + return "Fragment{key=" + key + ", index=" + index + ", baseUpdate=" + baseUpdate + ", referenceOps=" + referenceOps + '}'; + } + + public boolean isComplete() + { + return referenceOps.isEmpty(); + } + + public Update toUpdate(TableMetadatas tables) + { + return new Update(key, index, baseUpdate, tables); + } + + public Update complete(AccordUpdateParameters parameters, TableMetadatas tables) + { + if (isComplete()) + return toUpdate(tables); + + DecoratedKey key = baseUpdate.partitionKey(); + PartitionUpdate.Builder updateBuilder = new PartitionUpdate.Builder(baseUpdate.metadata(), + key, + columns(baseUpdate, referenceOps), + baseUpdate.rowCount(), + baseUpdate.canHaveShadowedData()); + + UpdateParameters up = parameters.updateParameters(baseUpdate.metadata(), key, index); + TxnData data = parameters.getData(); + Row staticRow = applyUpdates(baseUpdate.staticRow(), referenceOps.statics, key, Clustering.STATIC_CLUSTERING, up, data); + + if (!staticRow.isEmpty()) + updateBuilder.add(staticRow); + + Row existing = baseUpdate.hasRows() ? Iterables.getOnlyElement(baseUpdate) : null; + Row row = applyUpdates(existing, referenceOps.regulars, key, referenceOps.clustering, up, data); + if (row != null) + updateBuilder.add(row); + + return new Update(this.key, index, updateBuilder.build(), tables); + } + + private static Columns columns(Columns current, List referenceOps) + { + if (referenceOps.isEmpty()) + return current; + + Set missing = null; + for (int i = 0, mi = referenceOps.size() ; i < mi ; ++i) + { + ColumnMetadata cm = referenceOps.get(i).receiver(); + if (!current.contains(cm)) + { + if (missing == null) + missing = new HashSet<>(); + missing.add(cm); + } + } + if (missing == null) + return current; + return current.mergeTo(Columns.from(missing)); + } + + private static RegularAndStaticColumns columns(PartitionUpdate update, TxnReferenceOperations referenceOps) + { + checkState(!referenceOps.isEmpty()); + RegularAndStaticColumns current = update.columns(); + return new RegularAndStaticColumns(columns(current.statics, referenceOps.statics), + columns(current.regulars, referenceOps.regulars)); + } + + private static Row applyUpdates(Row existing, List operations, DecoratedKey key, Clustering clustering, UpdateParameters up, TxnData data) + { + if (operations.isEmpty()) + return existing; + + if (existing != null && !existing.isEmpty()) + { + checkState(existing.clustering().equals(clustering)); + up.addRow(existing); + } + else + up.newRow(clustering); + + operations.forEach(op -> op.apply(data, key, up)); + return up.buildRow(); + } + + static final FragmentSerializer serializer = new FragmentSerializer(); + static class FragmentSerializer + { + public void serialize(Fragment fragment, TableMetadatas tables, DataOutputPlus out, Version version) throws IOException + { + out.writeUnsignedVInt32(fragment.index); + PartitionUpdate.serializer.serializeWithoutKey(fragment.baseUpdate, tables, out, version.messageVersion()); + TxnReferenceOperations.serializer.serialize(fragment.referenceOps, tables, out, version); + } + + public Fragment deserialize(PartitionKey key, TableMetadatas tables, DataInputPlus in, Version version) throws IOException + { + int idx = in.readUnsignedVInt32(); + // TODO (required): why FROM_REMOTE? + PartitionUpdate baseUpdate = PartitionUpdate.serializer.deserialize(key, tables, in, version.messageVersion(), FROM_REMOTE); + TxnReferenceOperations referenceOps = TxnReferenceOperations.serializer.deserialize(tables, in, version); + return new Fragment(key, idx, baseUpdate, referenceOps); + } + + public long serializedSize(Fragment fragment, TableMetadatas tables, Version version) + { + long size = 0; + size += TypeSizes.sizeofUnsignedVInt(fragment.index); + size += PartitionUpdate.serializer.serializedSizeWithoutKey(fragment.baseUpdate, tables, version.messageVersion()); + size += TxnReferenceOperations.serializer.serializedSize(fragment.referenceOps, tables, version); + return size; + } + } + } + + public final TableMetadatas tables; + private final boolean isConditionMet; + + private TxnWrite(TableMetadatas tables, Update[] items, boolean isConditionMet) + { + super(items, Domain.Key); + this.tables = tables; + this.isConditionMet = isConditionMet; + } + + public TxnWrite(TableMetadatas tables, List items, boolean isConditionMet) + { + super(items, Domain.Key); + this.tables = tables; + this.isConditionMet = isConditionMet; + } + + @Override + int compareNonKeyFields(Update left, Update right) + { + return Integer.compare(left.index, right.index); + } + + @Override + Seekable getKey(Update item) + { + return item.key; + } + + @Override + Update[] newArray(int size) + { + return new Update[size]; + } + + public void unmemoize() + { + for (int i = 0 ; i < size() ; ++i) + items[i].unmemoize(); + } + + @Override + public AsyncChain apply(Seekable key, SafeCommandStore safeStore, TxnId txnId, Timestamp executeAt, DataStore store, PartialTxn txn) + { + // UnrecoverableRepairUpdate will deserialize as null at other nodes + // Accord should skip the Update for a read transaction, but handle it here anyways + TxnUpdate txnUpdate = ((TxnUpdate)txn.update()); + if (txnUpdate == null) + return Writes.SUCCESS; + + long timestamp = executeAt.uniqueHlc(); + + // TODO (expected): optimise for the common single update case; lots of lists allocated + List> results = new ArrayList<>(); + if (isConditionMet) + { + boolean preserveTimestamps = txnUpdate.preserveTimestamps(); + // Apply updates not specified fully by the client but built from fragments completed by data from reads. + // This occurs, for example, when an UPDATE statement uses a value assigned by a LET statement. + forEachWithKey(key, write -> results.add(write.write(tables, preserveTimestamps, timestamp))); + // Apply updates that are fully specified by the client and not reliant on data from reads. + // ex. INSERT INTO tbl (a, b, c) VALUES (1, 2, 3) + // These updates are persisted only in TxnUpdate and not in TxnWrite to avoid duplication. + List updates = txnUpdate.completeUpdatesForKey((RoutableKey) key); + updates.forEach(write -> results.add(write.write(tables, preserveTimestamps, timestamp))); + } + + if (results.isEmpty()) + return Writes.SUCCESS; + + if (results.size() == 1) + return results.get(0).flatMap(o -> Writes.SUCCESS); + + return AsyncChains.reduce(results, (i1, i2) -> null, (Void)null).flatMap(ignore -> Writes.SUCCESS); + } + + public long estimatedSizeOnHeap() + { + long size = EMPTY_SIZE; + for (Update update : this) + size += update.estimatedSizeOnHeap(); + return size; + } + + public static final ParameterisedVersionedSerializer serializer = new ParameterisedVersionedSerializer<>() + { + @Override + public void serialize(TxnWrite write, Seekables keys, DataOutputPlus out, Version version) throws IOException + { + write.tables.serializeSelf(out); + BooleanSerializer.serializer.serialize(write.isConditionMet, out); + serializeArray(write.items, new TableMetadatasAndKeys(write.tables, keys), out, version, Update.serializer); + } + + @Override + public TxnWrite deserialize(Seekables keys, DataInputPlus in, Version version) throws IOException + { + TableMetadatas tables = TableMetadatas.deserializeSelf(in); + boolean isConditionMet = BooleanSerializer.serializer.deserialize(in); + return new TxnWrite(tables, deserializeArray(new TableMetadatasAndKeys(tables, keys), in, version, Update.serializer, Update[]::new), isConditionMet); + } + + @Override + public long serializedSize(TxnWrite write, Seekables keys, Version version) + { + return write.tables.serializedSelfSize() + + BooleanSerializer.serializer.serializedSize(write.isConditionMet) + + serializedArraySize(write.items, new TableMetadatasAndKeys(write.tables, keys), version, Update.serializer); + } + }; +} diff --git a/src/java/org/apache/cassandra/service/accord/txn/UnrecoverableRepairUpdate.java b/src/java/org/apache/cassandra/service/accord/txn/UnrecoverableRepairUpdate.java new file mode 100644 index 000000000000..7167a637ed2c --- /dev/null +++ b/src/java/org/apache/cassandra/service/accord/txn/UnrecoverableRepairUpdate.java @@ -0,0 +1,202 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.txn; + +import java.util.Map; +import java.util.concurrent.atomic.AtomicLong; +import javax.annotation.Nonnull; +import javax.annotation.Nullable; + +import accord.api.Data; +import accord.api.Update; +import accord.api.Write; +import accord.local.Node; +import accord.primitives.Keys; +import accord.primitives.Participants; +import accord.primitives.Ranges; +import accord.primitives.Seekables; +import accord.primitives.Timestamp; +import org.apache.cassandra.db.ConsistencyLevel; +import org.apache.cassandra.db.DecoratedKey; +import org.apache.cassandra.db.Mutation; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.locator.Endpoints; +import org.apache.cassandra.locator.Replica; +import org.apache.cassandra.locator.ReplicaPlan; +import org.apache.cassandra.service.accord.serializers.TableMetadatasAndKeys; +import org.apache.cassandra.service.accord.serializers.Version; +import org.apache.cassandra.service.reads.ReadCoordinator; +import org.apache.cassandra.service.reads.repair.BlockingReadRepair; + +/** + * This update is used to support blocking read repair from non-transactional Cassandra reads. Cassandra creates + * a read repair mutation per node and this enables some partitions to be readable that would otherwise run into messages + * size limits. + * + * This update is used during the `Execute` phase to apply the repair mutations directly in AccordInteropExecution similar + * to how Accord applies read repair mutations for normal Accord transactions. It will always produce an empty update + * for Accord to use in the Apply phase because Accord doesn't support a per replica Apply and adding it would be redundant + * with the support that exists in AccordInteropExecution. + * + * The state for this update is always kept in memory and is never serialized. Only the Id is propagated so the cache + * can evict the update and then load it back. We don't need to persist it or have it be recoverable because if the original + * coordinator fails to complete the transaction then the dependent Cassandra read that triggered the read repair will + * also fail and it doesn't matter if the read repair is partially applied or not applied at all since it doesn't propose + * new values. + * + * The reason we stash this in an Update that Accord won't actually use is that there isn't an explicit parameter + * passing mechanism in Accord we can only provide implementations of Read, Query, and Update and then Accord will hand them + * back via the Txn during execution. + */ +public class UnrecoverableRepairUpdate, P extends ReplicaPlan.ForRead> extends AccordUpdate +{ + private static class Key + { + final int nodeId; + final long counter; + + private Key(@Nonnull int nodeId, long counter) + { + this.nodeId = nodeId; + this.counter = counter; + } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + Key key = (Key) o; + + if (nodeId != key.nodeId) return false; + return counter == key.counter; + } + + @Override + public int hashCode() + { + int result = nodeId; + result = 31 * result + (int) (counter ^ (counter >>> 32)); + return result; + } + } + + private static final AtomicLong nextCounter = new AtomicLong(0); + + public final BlockingReadRepair parent; + public final Seekables keys; + public final DecoratedKey dk; + public final Map mutations; + public final ReplicaPlan.ForWrite writePlan; + public final Key updateKey; + + public UnrecoverableRepairUpdate(Node.Id nodeId, BlockingReadRepair parent, + Seekables keys, DecoratedKey dk, Map mutations, ReplicaPlan.ForWrite writePlan) + { + this.parent = parent; + this.keys = keys; + this.dk = dk; + this.mutations = mutations; + mutations.values().forEach(Mutation::allowPotentialTransactionConflicts); + this.writePlan = writePlan; + this.updateKey = new Key(nodeId.id, nextCounter.getAndIncrement()); + } + + @Override + public Seekables keys() + { + return Keys.EMPTY; + } + + @Override + public Write apply(Timestamp executeAt, @Nullable Data data) + { + return null; + } + + @Override + public Update slice(Ranges ranges) + { + return this; + } + + @Override + public Update intersecting(Participants participants) + { + return this; + } + + @Override + public Update merge(Update other) + { + return this; + } + + @Override + public ConsistencyLevel cassandraCommitCL() + { + // Leads to standard async persist/commit which is fine since the repair mutations were applied + // as part of execute/read + return null; + } + + @Override + public Kind kind() + { + return Kind.UNRECOVERABLE_REPAIR; + } + + @Override + public long estimatedSizeOnHeap() + { + return 0; + } + + public void runBRR(ReadCoordinator readCoordinator) + { + // This read repair is effectively running as a delegate of the read repair instance that did the reads + // to generate the mutations, but since we already have the mutations we can go ahead and apply them + // now that we are inside a transaction that guarantees that the contents of the mutations consist + // of committed data everywhere we go to apply it + parent.repairPartitionDirectly(readCoordinator, dk, mutations, writePlan); + } + + // Only the original coordinator ever needs to access an UnrecoverableRepairUpdate + public static final AccordUpdateSerializer serializer = new AccordUpdateSerializer<>() + { + @Override + public void serialize(UnrecoverableRepairUpdate update, TableMetadatasAndKeys tablesAndKeys, DataOutputPlus out, Version version) + { + } + + @Override + public UnrecoverableRepairUpdate deserialize(TableMetadatasAndKeys tablesAndKeys, DataInputPlus in, Version version) + { + return null; + } + + @Override + public long serializedSize(UnrecoverableRepairUpdate update, TableMetadatasAndKeys tablesAndKeys, Version version) + { + return 0; + } + }; +} diff --git a/src/java/org/apache/cassandra/service/consensus/TransactionalMode.java b/src/java/org/apache/cassandra/service/consensus/TransactionalMode.java new file mode 100644 index 000000000000..edb4e1a42a6b --- /dev/null +++ b/src/java/org/apache/cassandra/service/consensus/TransactionalMode.java @@ -0,0 +1,240 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.consensus; + +import org.apache.cassandra.db.ConsistencyLevel; +import org.apache.cassandra.db.PartitionPosition; +import org.apache.cassandra.dht.AbstractBounds; +import org.apache.cassandra.dht.Token; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.service.accord.IAccordService; +import org.apache.cassandra.service.consensus.migration.TableMigrationState; +import org.apache.cassandra.service.consensus.migration.TransactionalMigrationFromMode; +import org.apache.cassandra.tcm.ClusterMetadata; + +import static com.google.common.base.Preconditions.checkState; +import static org.apache.cassandra.utils.LocalizeString.toLowerCaseLocalized; + +/* + * Configure the transactional behavior of a table. Enables accord on a table and defines how it mixes with non-serial writes + * + * For Accord transactions to function correctly when mixed with non-SERIAL writes it's necessary for the writes to occur through Accord. + * + * Accord will also use this configuration to determine what consistency level to perform its reads + * at since it will need to be able to read data written at non-SERIAL consistency levels. + * + * BlockingReadRepair will also use this configuration to determine how BRR mutations are applied. For migration + * and accord the BRR mutations will be applied as Accord transactions so that BRR doesn't expose Accord to + * uncommitted Accord data that is being RRed. This can occur when Accord has applied a transaction at some, but not + * all replica since Accord defaults to asynchronous commit. + * + * By routing repairs through Accord it is guaranteed that the Accord derived contents of the repair have already been applied at any + * replica where Accord applies the transaction. This also prevents BRR from breaking atomicity of Accord writes. + * + * If they are not written through Accord then reads through Accord will be required to occur at + * consistency level compatible with the non-serial writes preventing single replica reads from being performed + * by Accord. It will also require Accord to perform read repair of non-serial writes. + * + * Even then there is the potential for Accord to inconsistently execute transactions at different replicas + * because different coordinators for an Accord transaction may encounter different non-SERIAL write state and + * race to commit different outcomes for the transaction. + * + * This is different from Paxos because Paxos performs consensus on the actual values to be applied so recovery + * coordinators will always produce a consistent state when applying a transaction. Accord performs consensus on + * the execution order of transaction and different coordinators witnessing different states not managed by Accord + * can produce multiple outcomes for a transaction. + * + * // TODO to safely migrate you would have to route all writes through Accord with the current implementation + * // We could do it by range instead in the migration version, but then we need to know when all in flight writes + * // are done before marking a range as migrated. Would waiting out the timeout be enough (timeout bugs!)? + */ +public enum TransactionalMode +{ + // Running on Paxos V1 or V2 with Accord disabled + off(false, false, false, false), + + // These modes always provide correct execution with mixed_reads allow non-transaction non-SERIAL operations + // at the expense of slower Accord transactions, and full allowing faster transaction execution, but forcing + // all reads and writes to occur transactionally + + /* + * Execute writes through Accord skipping StorageProxy's normal write path, but commit + * writes at the provided consistency level so they can be read via non-SERIAL consistency levels. + * This mode makes it safe to read/write data that Accord will read/write. + */ + mixed_reads(true, true, false, true), + + /* + * Execute writes through Accord skipping StorageProxy's normal write path. Ignores the provided consistency level + * which makes Accord commit writes at ANY similar to Paxos with commit consistency level ANY. + */ + full(true, true, true, true), + + // TODO (maybe): These unsafe modes don't have Accord do async commit and single replica reads so how useful are they besides preserving non-SERIAL performance? + // These modes are unsafe when Accord and non-SERIAL reads and writes interact with the same data + // They don't guarantee that non-SERIAL reads or writes will see the latest Accord writes or that + // Accord transactions will recover correctly + + /* + * Enables Accord and makes it safe to perform non-SERIAL reads of Accord data without guaranteeing that they will + * see the latest Accord writes. non-SERIAL writes to data read by Accord will make Accord txn recovery non-deterministic + * + * Allow mixing of non-SERIAL writes and Accord, but still force BRR through Accord. + * This mode makes it safe to perform non-SERIAL or SERIAL reads of Accord data, but unsafe + * to write data that Accord may attempt to read. + */ + test_unsafe_writes(true, false, false, true), + + /* + * Enables Accord but does not allow non-SERIAL reads and writes to occur safely to data read/written by Accord + * + * Execute non-SERIAL writes through Cassandra via StorageProxy's normal write path. This can lead Accord to compute + * multiple outcomes for a transaction that depend on data written by non-SERIAL writes. + * + * SERIAL reads and CAS will run on Accord. Accord will honor provided consistency levels and do synchronous commit + * so the results can be read correctly with non-SERIAL CLs, but read repair could interfere with Accord. + **/ + test_unsafe(true, false, false, false), + + // For tests, Accord will read and be forced to do interop reads + test_interop_read(true, false, true, true); + + public final boolean accordIsEnabled; + public final boolean nonSerialWritesThroughAccord; + public final boolean nonSerialReadsThroughAccord; + public final boolean blockingReadRepairThroughAccord; + private final String cqlParam; + + TransactionalMode(boolean accordIsEnabled, boolean nonSerialWritesThroughAccord, boolean nonSerialReadsThroughAccord, boolean blockingReadRepairThroughAccord) + { + this.accordIsEnabled = accordIsEnabled; + this.nonSerialWritesThroughAccord = nonSerialWritesThroughAccord; + this.nonSerialReadsThroughAccord = nonSerialReadsThroughAccord; + this.blockingReadRepairThroughAccord = blockingReadRepairThroughAccord; + this.cqlParam = String.format("transactional_mode = '%s'", toLowerCaseLocalized(this.name())); + checkState(this.name().startsWith("test_") || (nonSerialReadsThroughAccord && nonSerialWritesThroughAccord) || !nonSerialReadsThroughAccord, "Doesn't make sense to do non-SERIAL reads through Accord without also doing non-SERIAL writes through Accord"); + } + + // This can be inferred from whether non-SERIAL reads are done through Accord + public boolean ignoresSuppliedCommitCL() + { + return nonSerialReadsThroughAccord; + } + + public ConsistencyLevel commitCLForMode(TransactionalMigrationFromMode fromMode, ConsistencyLevel consistencyLevel, ClusterMetadata cm, TableId tableId, Token token) + { + if (ignoresSuppliedCommitCL()) + { + TableMigrationState tms = cm.consensusMigrationState.tableStates.get(tableId); + checkState(tms != null || fromMode == TransactionalMigrationFromMode.none); + + // Only ignore the supplied consistency level if the token is not migrating + // otherwise honor it since there could still be Paxos and non-SERIAL reads racing with migration. + // Migrating to Accord, Paxos continues reading during the first phase of migration + // Migrating to Paxos, this doesn't really matter since this transaction will get RetryOnDifferentSystemException + if (tms == null || tms.migratedRanges.intersects(token)) + return null; + } + + if (!IAccordService.SUPPORTED_COMMIT_CONSISTENCY_LEVELS.contains(consistencyLevel)) + throw new UnsupportedOperationException("Consistency level " + consistencyLevel + " is unsupported with Accord for write/commit, supported are ANY, ONE, QUORUM, and ALL"); + + return consistencyLevel; + } + + /** + * Infer whether Accord can ignore the read CL and bias towards correctness by reading from a quorum + * if it's needed due to how non-SERIAL writes are done + */ + public boolean ignoresSuppliedReadCL() + { + return nonSerialWritesThroughAccord && blockingReadRepairThroughAccord; + } + + public ConsistencyLevel readCLForMode(TransactionalMigrationFromMode fromMode, ConsistencyLevel consistencyLevel, ClusterMetadata cm, TableId tableId, Token token) + { + if (ignoresSuppliedReadCL()) + { + TableMigrationState tms = cm.consensusMigrationState.tableStates.get(tableId); + checkState(tms != null || fromMode == TransactionalMigrationFromMode.none); + + // Only ignore the supplied consistency level if the token is not migrating + // otherwise honor it because we might read through Accord for non-SERIAL reads before repair is run + // this is OK to do because BRR still works and Accord isn't computing a write so recovery + // determinism isn't an issue + if (tms == null || tms.migratedRanges.intersects(token)) + return null; + } + + if (!IAccordService.SUPPORTED_READ_CONSISTENCY_LEVELS.contains(consistencyLevel)) + throw new UnsupportedOperationException("Consistency level " + consistencyLevel + " is unsupported with Accord for read, supported are ONE, QUORUM, and SERIAL"); + + return consistencyLevel; + } + + public ConsistencyLevel readCLForMode(TransactionalMigrationFromMode fromMode, ConsistencyLevel consistencyLevel, ClusterMetadata cm, TableId tableId, AbstractBounds range) + { + if (ignoresSuppliedReadCL()) + { + TableMigrationState tms = cm.consensusMigrationState.tableStates.get(tableId); + checkState(tms != null || fromMode == TransactionalMigrationFromMode.none); + + // Only ignore the supplied consistency level if none of the range is migrating + // otherwise honor it because we might read through Accord for non-SERIAL reads before repair is run + // this is OK to do because BRR still works and Accord isn't computing a write so recovery + // determinism isn't an issue + if (tms == null || range.intersects(tms.migratedRangesAsPartitionPosition())) + return null; + } + + if (!IAccordService.SUPPORTED_READ_CONSISTENCY_LEVELS.contains(consistencyLevel)) + throw new UnsupportedOperationException("Consistency level " + consistencyLevel + " is unsupported with Accord for read, supported are ONE, QUORUM, and SERIAL"); + return consistencyLevel; + } + + public String asCqlParam() + { + return cqlParam; + } + + public boolean nonSerialWritesThroughAccord() + { + return nonSerialWritesThroughAccord; + } + + public boolean readRepairsThroughAccord() + { + return blockingReadRepairThroughAccord; + } + + public static TransactionalMode fromOrdinal(int ordinal) + { + return values()[ordinal]; + } + + public static TransactionalMode fromString(String name) + { + return valueOf(toLowerCaseLocalized(name)); + } + + public boolean isTestMode() + { + return name().startsWith("test_"); + } +} diff --git a/src/java/org/apache/cassandra/service/consensus/migration/ConsensusKeyMigrationState.java b/src/java/org/apache/cassandra/service/consensus/migration/ConsensusKeyMigrationState.java new file mode 100644 index 000000000000..4078b379b246 --- /dev/null +++ b/src/java/org/apache/cassandra/service/consensus/migration/ConsensusKeyMigrationState.java @@ -0,0 +1,390 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.consensus.migration; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.List; +import java.util.UUID; +import java.util.concurrent.TimeUnit; +import javax.annotation.Nonnull; +import javax.annotation.Nullable; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.ImmutableList; +import com.google.common.primitives.Ints; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import accord.primitives.Keys; +import accord.primitives.Timestamp; +import com.github.benmanes.caffeine.cache.CacheLoader; +import com.github.benmanes.caffeine.cache.Caffeine; +import com.github.benmanes.caffeine.cache.LoadingCache; +import com.github.benmanes.caffeine.cache.Weigher; +import org.apache.cassandra.concurrent.ImmediateExecutor; +import org.apache.cassandra.concurrent.Stage; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.db.ColumnFamilyStore; +import org.apache.cassandra.db.ConsistencyLevel; +import org.apache.cassandra.db.DecoratedKey; +import org.apache.cassandra.db.SystemKeyspace; +import org.apache.cassandra.db.WriteType; +import org.apache.cassandra.exceptions.CasWriteTimeoutException; +import org.apache.cassandra.exceptions.RetryOnDifferentSystemException; +import org.apache.cassandra.io.UnversionedSerializer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.locator.EndpointsForToken; +import org.apache.cassandra.locator.Replica; +import org.apache.cassandra.metrics.ClientRequestsMetricsHolder; +import org.apache.cassandra.net.IVerbHandler; +import org.apache.cassandra.net.Message; +import org.apache.cassandra.net.MessagingService; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.service.accord.AccordService; +import org.apache.cassandra.service.accord.IAccordService; +import org.apache.cassandra.service.accord.RequestBookkeeping; +import org.apache.cassandra.service.accord.TimeOnlyRequestBookkeeping.LatencyRequestBookkeeping; +import org.apache.cassandra.service.accord.api.PartitionKey; +import org.apache.cassandra.service.paxos.AbstractPaxosRepair.Failure; +import org.apache.cassandra.service.paxos.AbstractPaxosRepair.Result; +import org.apache.cassandra.service.paxos.PaxosRepair; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.tcm.Epoch; +import org.apache.cassandra.transport.Dispatcher; +import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.ObjectSizes; +import org.apache.cassandra.utils.Pair; +import org.apache.cassandra.utils.UUIDSerializer; + +import static accord.local.durability.DurabilityService.SyncLocal.Self; +import static accord.local.durability.DurabilityService.SyncRemote.NoRemote; +import static accord.local.durability.DurabilityService.SyncRemote.Quorum; +import static org.apache.cassandra.config.DatabaseDescriptor.getReadRpcTimeout; +import static org.apache.cassandra.config.DatabaseDescriptor.getWriteRpcTimeout; +import static org.apache.cassandra.net.Verb.CONSENSUS_KEY_MIGRATION; +import static org.apache.cassandra.service.consensus.migration.ConsensusMigrationTarget.paxos; +import static org.apache.cassandra.utils.Clock.Global.nanoTime; + +/** + * Tracks the migration state of individual keys storing the migration (or not) in system.consensus_migration_state + * with an in-memory cache in front. Only locally replicated keys are tracked here to avoid storing too much + * state when token aware routing is not used. + * + * It is safe to migrate keys multiple times so no effort is made to ensure exactly once behavior and the system table + * expires key migration state after 7 days. + */ +public abstract class ConsensusKeyMigrationState +{ + private static final Logger logger = LoggerFactory.getLogger(ConsensusKeyMigrationState.class); + + /* + * Used to notify other replicas when key migration has occurred so they can + * also cache that the key migration was done + */ + public static class ConsensusKeyMigrationFinished + { + @Nonnull + private final UUID tableId; + @Nonnull + private final ByteBuffer partitionKey; + @Nonnull + private final ConsensusMigratedAt consensusMigratedAt; + + private ConsensusKeyMigrationFinished(@Nonnull UUID tableId, + @Nonnull ByteBuffer partitionKey, + @Nonnull ConsensusMigratedAt consensusMigratedAt) + { + this.tableId = tableId; + this.partitionKey = partitionKey; + this.consensusMigratedAt = consensusMigratedAt; + } + + public static final UnversionedSerializer serializer = new UnversionedSerializer() + { + @Override + public void serialize(ConsensusKeyMigrationFinished t, DataOutputPlus out) throws IOException + { + UUIDSerializer.serializer.serialize(t.tableId, out); + ByteBufferUtil.writeWithVIntLength(t.partitionKey, out); + ConsensusMigratedAt.serializer.serialize(t.consensusMigratedAt, out); + } + + @Override + public ConsensusKeyMigrationFinished deserialize(DataInputPlus in) throws IOException + { + UUID tableId = UUIDSerializer.serializer.deserialize(in); + ByteBuffer partitionKey = ByteBufferUtil.readWithVIntLength(in); + ConsensusMigratedAt consensusMigratedAt = ConsensusMigratedAt.serializer.deserialize(in); + return new ConsensusKeyMigrationFinished(tableId, partitionKey, consensusMigratedAt); + } + + @Override + public long serializedSize(ConsensusKeyMigrationFinished t) + { + return UUIDSerializer.serializer.serializedSize(t.tableId) + + ByteBufferUtil.serializedSizeWithVIntLength(t.partitionKey) + + ConsensusMigratedAt.serializer.serializedSize(t.consensusMigratedAt); + } + }; + } + + /* + * Bundles various aspects of key migration state together to avoid multiple lookups + * and to communicate multiple result values and state + */ + public static class KeyMigrationState + { + static final KeyMigrationState MIGRATION_NOT_NEEDED = new KeyMigrationState(null, null, null, null); + + public final ConsensusMigratedAt consensusMigratedAt; + + public final Epoch currentEpoch; + + public final TableMigrationState tableMigrationState; + + public final DecoratedKey key; + + private KeyMigrationState(ConsensusMigratedAt consensusMigratedAt, Epoch currentEpoch, + TableMigrationState tableMigrationState, DecoratedKey key) + { + this.consensusMigratedAt = consensusMigratedAt; + this.currentEpoch = currentEpoch; + this.tableMigrationState = tableMigrationState; + this.key = key; + } + + /* + * This will trigger a distributed migration for the key, but will only block on local completion + * so Paxos reads can return a result as soon as the local state is ready + */ + public void maybePerformAccordToPaxosKeyMigration(boolean isForWrite) + { + if (paxosReadSatisfiedByKeyMigration()) + return; + + // TODO (desired): Better query start time + TableMigrationState tms = tableMigrationState; + repairKeyAccord(key, tms.tableId, tms.minMigrationEpoch(key.getToken()).getEpoch(), Dispatcher.RequestTime.forImmediateExecution(), false, isForWrite); + } + + boolean paxosReadSatisfiedByKeyMigration() + { + // No migration in progress, it's safe + if (tableMigrationState == null) + return true; + + return tableMigrationState.paxosReadSatisfiedByKeyMigrationAtEpoch(key, consensusMigratedAt); + } + } + + private static final int EMPTY_KEY_SIZE = Ints.checkedCast(ObjectSizes.measureDeep(Pair.create(null, UUID.randomUUID()))); + private static final int VALUE_SIZE = Ints.checkedCast(ObjectSizes.measureDeep(new ConsensusMigratedAt(Epoch.EMPTY, ConsensusMigrationTarget.accord))); + + private static final CacheLoader, ConsensusMigratedAt> LOADING_FUNCTION = k -> SystemKeyspace.loadConsensusKeyMigrationState(k.left, k.right); + private static final Weigher, ConsensusMigratedAt> WEIGHER_FUNCTION = (k, v) -> EMPTY_KEY_SIZE + Ints.checkedCast(ByteBufferUtil.estimatedSizeOnHeap(k.left)) + VALUE_SIZE; + + @VisibleForTesting + public static final LoadingCache, ConsensusMigratedAt> MIGRATION_STATE_CACHE = + Caffeine.newBuilder() + .maximumWeight(DatabaseDescriptor.getConsensusMigrationCacheSizeInMiB() << 20) + .weigher(WEIGHER_FUNCTION) + .executor(ImmediateExecutor.INSTANCE) + .build(LOADING_FUNCTION); + + public static final IVerbHandler consensusKeyMigrationFinishedHandler = message -> { + saveConsensusKeyMigrationLocally(message.payload.partitionKey, message.payload.tableId, message.payload.consensusMigratedAt); + }; + + private ConsensusKeyMigrationState() {} + + @VisibleForTesting + public static void reset() + { + MIGRATION_STATE_CACHE.invalidateAll(); + } + + public static void maybeSaveAccordKeyMigrationLocally(PartitionKey partitionKey, Epoch epoch) + { + TableId tableId = partitionKey.table(); + UUID tableUUID = tableId.asUUID(); + DecoratedKey dk = partitionKey.partitionKey(); + ByteBuffer key = dk.getKey(); + + TableMigrationState tms = ClusterMetadata.current().consensusMigrationState.tableStates.get(tableId); + if (tms == null) + return; + + ConsensusMigratedAt migratedAt = new ConsensusMigratedAt(epoch, paxos); + if (!tms.paxosReadSatisfiedByKeyMigrationAtEpoch(dk, migratedAt)) + return; + + saveConsensusKeyMigrationLocally(key, tableUUID, migratedAt); + } + + public static KeyMigrationState getKeyMigrationState(TableId tableId, DecoratedKey key) + { + ClusterMetadata cm = ClusterMetadata.current(); + TableMigrationState tms = cm.consensusMigrationState.tableStates.get(tableId); + // No state means no migration for this table + if (tms == null) + return KeyMigrationState.MIGRATION_NOT_NEEDED; + return getKeyMigrationState(cm, tms, key); + } + + /* + * Should be called where we know we replicate the key so that the system table contains useful information + * about whether the migration already occurred. + * + * This is a more expensive check that might read from the system table to determine if migration occurred. + */ + static KeyMigrationState getKeyMigrationState(ClusterMetadata cm, TableMigrationState tms, DecoratedKey key) + { + if (tms.migratingRanges.intersects(key.getToken())) + { + ConsensusMigratedAt consensusMigratedAt = getConsensusMigratedAt(tms.tableId, key); + if (consensusMigratedAt == null) + return new KeyMigrationState(null, cm.epoch, tms, key); + return new KeyMigrationState(consensusMigratedAt, cm.epoch, tms, key); + } + + return KeyMigrationState.MIGRATION_NOT_NEEDED; + } + + public static @Nullable ConsensusMigratedAt getConsensusMigratedAt(TableId tableId, DecoratedKey key) + { + return MIGRATION_STATE_CACHE.get(Pair.create(key.getKey(), tableId.asUUID())); + } + + /* + * Trigger a distributed repair of Accord state for this key. + */ + static void repairKeyAccord(DecoratedKey key, + TableId tableId, + long minEpoch, + Dispatcher.RequestTime requestTime, + boolean global, + boolean isForWrite) + { + repairKeysAccord(ImmutableList.of(key), tableId, minEpoch, requestTime, global, isForWrite); + } + + static void repairKeysAccord(List keys, + TableId tableId, + long minEpoch, + Dispatcher.RequestTime requestTime, + boolean global, + boolean isForWrite) + { + ColumnFamilyStore cfs = ColumnFamilyStore.getIfExists(tableId); + if (isForWrite) ClientRequestsMetricsHolder.casWriteMetrics.accordKeyMigrations.mark(); + else ClientRequestsMetricsHolder.casReadMetrics.accordKeyMigrations.mark(); + // Global will always create a transaction to effect the barrier so all replicas + // will soon be ready to execute, but only waits for the local replica to be ready + // Local will only create a transaction if it can't find an existing one to wait on + Keys partitionKeys = AccordService.intersecting(Keys.of(keys, k -> new PartitionKey(tableId, k))); + if (partitionKeys.isEmpty()) + throw new RetryOnDifferentSystemException(); + + IAccordService accord = AccordService.instance(); + + long start = nanoTime(); + long deadline = requestTime.computeDeadline(isForWrite ? getWriteRpcTimeout(TimeUnit.NANOSECONDS) : getReadRpcTimeout(TimeUnit.NANOSECONDS)); + RequestBookkeeping bookkeeping = new LatencyRequestBookkeeping(cfs == null ? null : cfs.metric.keyMigration); + AccordService.getBlocking(accord.sync(Timestamp.minForEpoch(minEpoch), partitionKeys, Self, global ? Quorum : NoRemote), + partitionKeys, bookkeeping, start, deadline); + maybeSaveAccordKeyMigrationLocally((PartitionKey) partitionKeys.get(0), Epoch.create(minEpoch)); + } + + static void repairKeyPaxos(EndpointsForToken naturalReplicas, + Epoch currentEpoch, + DecoratedKey key, + ColumnFamilyStore cfs, + ConsistencyLevel consistencyLevel, + Dispatcher.RequestTime requestTime, + long timeoutNanos, + boolean isLocallyReplicated, + boolean isForWrite) + { + if (isForWrite) + ClientRequestsMetricsHolder.accordWriteMetrics.paxosKeyMigrations.mark(); + else + ClientRequestsMetricsHolder.accordReadMetrics.paxosKeyMigrations.mark(); + TableMetadata tableMetadata = cfs.metadata(); + PaxosRepair repair = PaxosRepair.create(consistencyLevel, key, tableMetadata, timeoutNanos); + long start = nanoTime(); + repair.start(requestTime.startedAtNanos()); + Result result; + try + { + result = repair.await(); + switch (result.outcome) + { + default: + case CANCELLED: + throw new IllegalStateException("Unexpected PaxosRepair outcome " + result.outcome); + case DONE: + // Don't want to repeatedly save this in the non-token aware case + if (isLocallyReplicated) + saveConsensusKeyMigration(naturalReplicas, + new ConsensusKeyMigrationFinished(tableMetadata.id.asUUID(), + key.getKey(), + new ConsensusMigratedAt(currentEpoch, ConsensusMigrationTarget.accord))); + return; + case FAILURE: + Failure failure = (Failure)result; + if (failure.failure == null) + throw new CasWriteTimeoutException(WriteType.CAS, consistencyLevel, 0, 0, 0); + throw new RuntimeException(failure.failure); + } + } + catch (InterruptedException e) + { + throw new RuntimeException(e); + } + finally + { + cfs.metric.keyMigration.addNano(nanoTime() - start); + } + } + + private static void saveConsensusKeyMigration(EndpointsForToken replicas, ConsensusKeyMigrationFinished finished) + { + Message out = Message.out(CONSENSUS_KEY_MIGRATION, finished); + replicas.endpoints(); + for (Replica replica : replicas) + { + if (replica.isSelf()) + saveConsensusKeyMigrationLocally(finished.partitionKey, finished.tableId, finished.consensusMigratedAt); + else + MessagingService.instance().send(out, replica.endpoint()); + } + } + + private static void saveConsensusKeyMigrationLocally(ByteBuffer partitionKey, UUID tableId, ConsensusMigratedAt consensusMigratedAt) + { + // Order doesn't matter, existing values don't matter, version doesn't matter + // If any of this races or goes backwards the result is that key migration is + // reattempted and it should be very rare + MIGRATION_STATE_CACHE.put(Pair.create(partitionKey, tableId), consensusMigratedAt); + Stage.MUTATION.execute(() -> SystemKeyspace.saveConsensusKeyMigrationState(partitionKey, tableId, consensusMigratedAt)); + } +} diff --git a/src/java/org/apache/cassandra/service/consensus/migration/ConsensusMigratedAt.java b/src/java/org/apache/cassandra/service/consensus/migration/ConsensusMigratedAt.java new file mode 100644 index 000000000000..bf3e798466b8 --- /dev/null +++ b/src/java/org/apache/cassandra/service/consensus/migration/ConsensusMigratedAt.java @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.consensus.migration; + +import java.io.IOException; +import javax.annotation.Nullable; + +import org.apache.cassandra.db.TypeSizes; +import org.apache.cassandra.io.UnversionedSerializer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.tcm.Epoch; +import org.apache.cassandra.utils.NullableSerializer; + +public class ConsensusMigratedAt +{ + public static final UnversionedSerializer serializer = NullableSerializer.wrap(new UnversionedSerializer() + { + @Override + public void serialize(ConsensusMigratedAt t, DataOutputPlus out) throws IOException + { + Epoch.serializer.serialize(t.migratedAtEpoch, out); + out.writeByte(t.migratedAtTarget.value); + } + + @Override + public ConsensusMigratedAt deserialize(DataInputPlus in) throws IOException + { + Epoch migratedAtEpoch = Epoch.serializer.deserialize(in); + ConsensusMigrationTarget target = ConsensusMigrationTarget.fromValue(in.readByte()); + return new ConsensusMigratedAt(migratedAtEpoch, target); + } + + @Override + public long serializedSize(ConsensusMigratedAt t) + { + return TypeSizes.sizeof(ConsensusMigrationTarget.accord.value) + + Epoch.serializer.serializedSize(t.migratedAtEpoch); + } + }); + + // Fields are not nullable when used for messaging + @Nullable + public final Epoch migratedAtEpoch; + + @Nullable + public final ConsensusMigrationTarget migratedAtTarget; + + public ConsensusMigratedAt(Epoch migratedAtEpoch, ConsensusMigrationTarget migratedAtTarget) + { + this.migratedAtEpoch = migratedAtEpoch; + this.migratedAtTarget = migratedAtTarget; + } +} diff --git a/src/java/org/apache/cassandra/service/consensus/migration/ConsensusMigrationMutationHelper.java b/src/java/org/apache/cassandra/service/consensus/migration/ConsensusMigrationMutationHelper.java new file mode 100644 index 000000000000..b3291994db30 --- /dev/null +++ b/src/java/org/apache/cassandra/service/consensus/migration/ConsensusMigrationMutationHelper.java @@ -0,0 +1,374 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.consensus.migration; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.function.Predicate; +import javax.annotation.Nonnull; +import javax.annotation.Nullable; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.ImmutableList; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import accord.primitives.Routable.Domain; +import accord.primitives.Txn; +import org.apache.cassandra.db.ColumnFamilyStore; +import org.apache.cassandra.db.ConsistencyLevel; +import org.apache.cassandra.db.DecoratedKey; +import org.apache.cassandra.db.IMutation; +import org.apache.cassandra.db.Mutation; +import org.apache.cassandra.db.partitions.PartitionUpdate; +import org.apache.cassandra.dht.Token; +import org.apache.cassandra.exceptions.InvalidRequestException; +import org.apache.cassandra.exceptions.RetryOnDifferentSystemException; +import org.apache.cassandra.schema.Schema; +import org.apache.cassandra.schema.SchemaConstants; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.schema.TableParams; +import org.apache.cassandra.service.accord.AccordService; +import org.apache.cassandra.service.accord.IAccordService; +import org.apache.cassandra.service.accord.IAccordService.IAccordResult; +import org.apache.cassandra.service.accord.api.PartitionKey; +import org.apache.cassandra.service.accord.serializers.TableMetadatas; +import org.apache.cassandra.service.accord.serializers.TableMetadatasAndKeys; +import org.apache.cassandra.service.accord.txn.TxnCondition; +import org.apache.cassandra.service.accord.txn.TxnQuery; +import org.apache.cassandra.service.accord.txn.TxnRead; +import org.apache.cassandra.service.accord.txn.TxnReferenceOperations; +import org.apache.cassandra.service.accord.txn.TxnResult; +import org.apache.cassandra.service.accord.txn.TxnUpdate; +import org.apache.cassandra.service.accord.txn.TxnWrite; +import org.apache.cassandra.service.consensus.TransactionalMode; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.tcm.Epoch; +import org.apache.cassandra.tracing.Tracing; +import org.apache.cassandra.transport.Dispatcher; + +import static com.google.common.base.Preconditions.checkState; +import static java.util.function.Predicate.not; +import static org.apache.cassandra.service.consensus.migration.ConsensusRequestRouter.getTableMetadata; + +/** + * Applying mutations can fail with RetryOnDifferentSystemException if a + * mutation conflicts with a table and range that needs to be managed + * transactionally. This impacts mutations, logged/unlogged batches, hints,and blocking read repair. + * + * This class contains the logic needed for managing these retry loops and splitting the mutations up + */ +public class ConsensusMigrationMutationHelper +{ + private static final Logger logger = LoggerFactory.getLogger(ConsensusMigrationMutationHelper.class); + + private static ConsensusMigrationMutationHelper instance = new ConsensusMigrationMutationHelper(); + + public static ConsensusMigrationMutationHelper instance() + { + return instance; + } + + @VisibleForTesting + public static void replaceInstanceForTest(ConsensusMigrationMutationHelper testInstance) + { + instance = testInstance; + } + + @VisibleForTesting + public static void resetInstanceForTest() + { + instance = new ConsensusMigrationMutationHelper(); + } + + public ConsensusMigrationMutationHelper() {} + + private static ConsistencyLevel consistencyLevelForCommit(ClusterMetadata cm, Collection mutations, @Nullable ConsistencyLevel consistencyLevel) + { + // Null means no specific consistency behavior is required from Accord, it's functionally similar to ANY + // if you aren't reading the result back via Accord + if (consistencyLevel == null) + return null; + + for (IMutation mutation : mutations) + { + for (TableId tableId : mutation.getTableIds()) + { + TableParams tableParams = getTableMetadata(cm, tableId).params; + TransactionalMode mode = tableParams.transactionalMode; + TransactionalMigrationFromMode migrationFromMode = tableParams.transactionalMigrationFrom; + // commitCLForMode should return either null or the supplied consistency level + // in which case we will commit everything at that CL since Accord doesn't support per table + // commit consistency + ConsistencyLevel commitCL = mode.commitCLForMode(migrationFromMode, consistencyLevel, cm, tableId, mutation.key().getToken()); + if (commitCL != null) + return commitCL; + } + } + return null; + } + + /** + * Result of splitting mutations across Accord and non-transactional boundaries + */ + public static class SplitMutations implements SplitConsumer + { + @Nullable + private List accordMutations; + + @Nullable + private List normalMutations; + + private SplitMutations() {} + + public List accordMutations() + { + return accordMutations; + } + + public List normalMutations() + { + return normalMutations; + } + + @Override + public void consume(@Nullable T accordMutation, @Nullable T normalMutation, List mutations, int mutationIndex) + { + // Avoid allocating an ArrayList in common single mutation single system case + if (mutations.size() == 1 && (accordMutation != null ^ normalMutation != null)) + { + if (accordMutation != null) + accordMutations = mutations; + else + normalMutations = mutations; + return; + } + + if (accordMutation != null) + { + if (accordMutations == null) + accordMutations = new ArrayList<>(Math.min(mutations.size(), 10)); + accordMutations.add(accordMutation); + } + if (normalMutation != null) + { + if (normalMutations == null) + normalMutations = new ArrayList<>(Math.min(mutations.size(), 10)); + normalMutations.add(normalMutation); + } + } + } + + public interface SplitConsumer + { + void consume(@Nullable T accordMutation, @Nullable T normalMutation, List mutations, int mutationIndex); + } + + public static SplitMutations splitMutationsIntoAccordAndNormal(ClusterMetadata cm, List mutations) + { + SplitMutations splitMutations = new SplitMutations<>(); + splitMutationsIntoAccordAndNormal(cm, mutations, splitMutations); + return splitMutations; + } + + public static void splitMutationsIntoAccordAndNormal(ClusterMetadata cm, List mutations, SplitConsumer splitConsumer) + { + for (int i=0,mi=mutations.size(); i splitMutation = instance.splitMutationIntoAccordAndNormal(mutations.get(i), cm); + splitConsumer.consume(splitMutation.accordMutation, splitMutation.normalMutation, mutations, i); + } + } + + /** + * Result of splitting a mutation across Accord and non-transactional boundaries + */ + public static class SplitMutation + { + @Nullable + public final T accordMutation; + @Nullable + public final T normalMutation; + + public SplitMutation(@Nullable T accordMutation, @Nullable T normalMutation) + { + this.accordMutation = accordMutation; + this.normalMutation = normalMutation; + } + } + + public SplitMutation splitMutationIntoAccordAndNormal(T mutation, ClusterMetadata cm) + { + if (mutation.potentialTxnConflicts().allowed) + return new SplitMutation<>(null, mutation); + + Token token = mutation.key().getToken(); + Predicate isAccordUpdate = tableId -> tokenShouldBeWrittenThroughAccord(cm, tableId, token, TransactionalMode::nonSerialWritesThroughAccord, TransactionalMigrationFromMode::nonSerialWritesThroughAccord); + + T accordMutation = (T)mutation.filter(isAccordUpdate); + T normalMutation = (T)mutation.filter(not(isAccordUpdate)); + for (PartitionUpdate pu : mutation.getPartitionUpdates()) + checkState((accordMutation == null ? false : accordMutation.hasUpdateForTable(pu.metadata().id)) + || (normalMutation == null ? false : normalMutation.hasUpdateForTable(pu.metadata().id)), + "All partition updates should still be present after splitting"); + return new SplitMutation(accordMutation, normalMutation); + } + + public IAccordResult mutateWithAccordAsync(ClusterMetadata cm, Mutation mutation, @Nullable ConsistencyLevel consistencyLevel, Dispatcher.RequestTime requestTime) + { + return mutateWithAccordAsync(cm, ImmutableList.of(mutation), consistencyLevel, requestTime); + } + + public static IAccordResult mutateWithAccordAsync(ClusterMetadata cm, Collection mutations, @Nullable ConsistencyLevel consistencyLevel, Dispatcher.RequestTime requestTime) + { + if (consistencyLevel != null && !IAccordService.SUPPORTED_COMMIT_CONSISTENCY_LEVELS.contains(consistencyLevel)) + throw new InvalidRequestException(consistencyLevel + " is not supported by Accord"); + + TableMetadatas tables; + { + TableMetadatas.Collector tableCollector = new TableMetadatas.Collector(); + for (IMutation mutation : mutations) + { + for (TableId tableId : mutation.getTableIds()) + tableCollector.add(cm.schema.getTableMetadata(tableId)); + } + tables = tableCollector.build(); + } + + TableMetadatasAndKeys.KeyCollector keyCollector = new TableMetadatasAndKeys.KeyCollector(tables); + + int fragmentIndex = 0; + List fragments = new ArrayList<>(mutations.size()); + long minEpoch = Epoch.EMPTY.getEpoch(); + for (IMutation mutation : mutations) + { + for (PartitionUpdate update : mutation.getPartitionUpdates()) + { + PartitionKey pk = keyCollector.collect(update.metadata(), update.partitionKey()); + minEpoch = Math.max(minEpoch, update.metadata().epoch.getEpoch()); + fragments.add(new TxnWrite.Fragment(pk, fragmentIndex++, update, TxnReferenceOperations.empty())); + } + } + // Potentially ignore commit consistency level if the TransactionalMode specifies full + ConsistencyLevel clForCommit = consistencyLevelForCommit(cm, mutations, consistencyLevel); + TableMetadatasAndKeys tablesAndKeys = new TableMetadatasAndKeys(tables, keyCollector.build()); + TxnUpdate update = new TxnUpdate(tables, fragments, TxnCondition.none(), clForCommit, true); + Txn.InMemory txn = new Txn.InMemory(tablesAndKeys.keys, TxnRead.empty(Domain.Key), TxnQuery.NONE, update, tablesAndKeys); + return AccordService.instance().coordinateAsync(minEpoch, txn, clForCommit, requestTime); + } + + public static void validateSafeToExecuteNonTransactionally(IMutation mutation) throws RetryOnDifferentSystemException + { + if (mutation.potentialTxnConflicts().allowed) + return; + + String keyspace = mutation.getKeyspaceName(); + // System keyspaces are never managed by Accord + if (SchemaConstants.isSystemKeyspace(keyspace)) + return; + + // Local keyspaces are never managed by Accord + if (Schema.instance.localKeyspaces().containsKeyspace(keyspace)) + return; + + ClusterMetadata cm = ClusterMetadata.current(); + + DecoratedKey dk = mutation.key(); + // Check all the partition updates and if any can't be done return an error response + // and the coordinator can retry with things correctly routed + boolean throwRetryOnDifferentSystem = false; + // Track CFS so we only mark each one once + Set markedColumnFamilies = null; + for (PartitionUpdate pu : mutation.getPartitionUpdates()) + { + TableId tableId = pu.metadata().id; + ColumnFamilyStore cfs = ColumnFamilyStore.getIfExists(tableId); + if (tokenShouldBeWrittenThroughAccord(cm, tableId, dk.getToken(), TransactionalMode::nonSerialWritesThroughAccord, TransactionalMigrationFromMode::nonSerialWritesThroughAccord)) + { + throwRetryOnDifferentSystem = true; + if (markedColumnFamilies == null) + markedColumnFamilies = new HashSet<>(); + if (markedColumnFamilies.add(tableId)) + cfs.metric.mutationsRejectedOnWrongSystem.mark(); + logger.debug("Rejecting mutation on wrong system to table {}.{}", cfs.keyspace.getName(), cfs.name); + Tracing.trace("Rejecting mutation on wrong system to table {}.{} token {}", cfs.keyspace.getName(), cfs.name, dk.getToken()); + } + } + if (throwRetryOnDifferentSystem) + throw new RetryOnDifferentSystemException(); + } + + public static boolean tokenShouldBeWrittenThroughAccord(@Nonnull ClusterMetadata cm, + @Nonnull TableId tableId, + @Nonnull Token token, + Predicate nonSerialWritesThroughAccord, + Predicate nonSerialWritesThroughAccordFrom) + { + TableMetadata tm = getTableMetadata(cm, tableId); + if (tm == null) + return false; + + boolean transactionalModeWritesThroughAccord = nonSerialWritesThroughAccord.test(tm.params.transactionalMode); + TransactionalMigrationFromMode transactionalMigrationFromMode = tm.params.transactionalMigrationFrom; + boolean migrationFromWritesThroughAccord = nonSerialWritesThroughAccordFrom.test(transactionalMigrationFromMode); + if (transactionalModeWritesThroughAccord && migrationFromWritesThroughAccord) + return true; + + // Could be migrating or could be completely migrated, if it's migrating check if the key for this mutation + if (transactionalModeWritesThroughAccord || migrationFromWritesThroughAccord) + { + TableMigrationState tms = cm.consensusMigrationState.tableStates.get(tm.id); + + if (tms == null) + { + if (transactionalMigrationFromMode == TransactionalMigrationFromMode.none) + // There is no migration and no TMS so do what the schema says since no migration should be required + return transactionalModeWritesThroughAccord; + else + // If we are migrating from something and there is no migration state the migration hasn't begun + // so continue to do what we are migrating from does until the range is marked as migrating + return migrationFromWritesThroughAccord; + } + + // This logic is driven by the fact that Paxos is not picky about how data is written since it's txn recovery + // is deterministic in the face of non-deterministic reads because consensus is agreeing on the writes that will be done to the database + // Accord agrees on what computation will produce those writes and then asynchronously executes those computations, potentially multiple times + // with different results if Accord reads non-transactionally written data that could be seen differently by different coordinators + + // If the current mode writes through Accord then we should always write though Accord for ranges managed by Accord. + // Accord needs to do synchronous commit and respect the consistency level so non-SERIAL reads can read Accord's + // writes. + if (transactionalModeWritesThroughAccord) + { + return tms.migratingAndMigratedRanges.intersects(token); + } + + // If we are migrating from a mode that used to write to Accord then any range that isn't migrated + // should continue to write through Accord. Accord might still be executing txns pre-migration so continue + // to route writes through Accord until migration is completed. + if (migrationFromWritesThroughAccord) + return !tms.migratedRanges.intersects(token); + } + return false; + } +} diff --git a/src/java/org/apache/cassandra/service/consensus/migration/ConsensusMigrationRepairResult.java b/src/java/org/apache/cassandra/service/consensus/migration/ConsensusMigrationRepairResult.java new file mode 100644 index 000000000000..551ee1f1c7cb --- /dev/null +++ b/src/java/org/apache/cassandra/service/consensus/migration/ConsensusMigrationRepairResult.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.consensus.migration; + +import javax.annotation.Nullable; + +import accord.primitives.Ranges; +import org.apache.cassandra.tcm.Epoch; + +import static com.google.common.base.Preconditions.checkArgument; + +public class ConsensusMigrationRepairResult +{ + private static final ConsensusMigrationRepairResult INELIGIBLE = new ConsensusMigrationRepairResult(ConsensusMigrationRepairType.INELIGIBLE, Epoch.EMPTY, null); + public final ConsensusMigrationRepairType type; + public final Epoch minEpoch; + @Nullable + public final Ranges barrieredRanges; + + private ConsensusMigrationRepairResult(ConsensusMigrationRepairType type, Epoch minEpoch, @Nullable Ranges barrieredRanges) + { + this.type = type; + this.minEpoch = minEpoch; + this.barrieredRanges = barrieredRanges; + } + + public static ConsensusMigrationRepairResult fromRepair(Epoch minEpoch, Ranges barrieredRanges, boolean dataRepaired, boolean paxosRepaired, boolean accordRepaired, boolean deadNodesExcluded) + { + checkArgument(!accordRepaired || minEpoch.isAfter(Epoch.EMPTY), "Epoch should not be empty if Accord repairs was performed"); + if (deadNodesExcluded) return INELIGIBLE; + return new ConsensusMigrationRepairResult(new ConsensusMigrationRepairType(dataRepaired, paxosRepaired, accordRepaired), minEpoch, barrieredRanges); + } +} diff --git a/src/java/org/apache/cassandra/service/consensus/migration/ConsensusMigrationRepairType.java b/src/java/org/apache/cassandra/service/consensus/migration/ConsensusMigrationRepairType.java new file mode 100644 index 000000000000..f02620303bb5 --- /dev/null +++ b/src/java/org/apache/cassandra/service/consensus/migration/ConsensusMigrationRepairType.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.consensus.migration; + +public class ConsensusMigrationRepairType +{ + public static final ConsensusMigrationRepairType INELIGIBLE = new ConsensusMigrationRepairType(false, false ,false); + + public final boolean repairedData; + public final boolean repairedPaxos; + public final boolean repairedAccord; + + public ConsensusMigrationRepairType(boolean repairedData, boolean repairedPaxos, boolean repairedAccord) + { + this.repairedData = repairedData; + this.repairedPaxos = repairedPaxos; + this.repairedAccord = repairedAccord; + } + + public boolean migrationToAccordEligible() + { + return repairedData; + } + + public boolean migrationToPaxosEligible() + { + return repairedAccord; + } + + // Require both data and Paxos repair since Paxos only repairs to QUORUM and Accord needs ALL + public boolean repairsPaxos() + { + return repairedData && repairedPaxos; + } + + public boolean ineligibleForMigration() + { + return !migrationToAccordEligible() && !migrationToPaxosEligible(); + } + + @Override + public String toString() + { + return "ConsensusMigrationRepairType{" + + "repairedData=" + repairedData + + ", repairedPaxos=" + repairedPaxos + + ", repairedAccord=" + repairedAccord + + '}'; + } +} diff --git a/src/java/org/apache/cassandra/service/consensus/migration/ConsensusMigrationState.java b/src/java/org/apache/cassandra/service/consensus/migration/ConsensusMigrationState.java new file mode 100644 index 000000000000..67d43ab77b95 --- /dev/null +++ b/src/java/org/apache/cassandra/service/consensus/migration/ConsensusMigrationState.java @@ -0,0 +1,279 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.consensus.migration; + +import java.io.IOException; +import java.util.Collection; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Set; +import java.util.stream.Collectors; +import javax.annotation.Nonnull; +import javax.annotation.Nullable; + +import com.google.common.base.Preconditions; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.Maps; +import com.google.common.collect.Sets; + +import org.apache.cassandra.dht.Range; +import org.apache.cassandra.dht.Token; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.schema.DistributedSchema; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.tcm.Epoch; +import org.apache.cassandra.tcm.MetadataValue; +import org.apache.cassandra.tcm.serialization.MetadataSerializer; +import org.apache.cassandra.tcm.serialization.Version; +import org.apache.cassandra.utils.PojoToString; + +import static com.google.common.base.Preconditions.checkNotNull; +import static com.google.common.base.Preconditions.checkState; +import static org.apache.cassandra.service.consensus.migration.TableMigrationState.initialRepairPendingRanges; +import static org.apache.cassandra.utils.CollectionSerializers.deserializeMap; +import static org.apache.cassandra.utils.CollectionSerializers.serializeMap; +import static org.apache.cassandra.utils.CollectionSerializers.serializedMapSize; + +// TODO this will mostly go away once we can move TableMigrationState into the table schema +public class ConsensusMigrationState implements MetadataValue +{ + public static ConsensusMigrationState EMPTY = new ConsensusMigrationState(Epoch.EMPTY, ImmutableMap.of()); + + @Nonnull + public final Map tableStates; + + public final Epoch lastModified; + + public ConsensusMigrationState(@Nonnull Epoch lastModified, @Nonnull Map tableStates) + { + checkNotNull(tableStates, "tableStates is null"); + checkNotNull(lastModified, "lastModified is null"); + this.lastModified = lastModified; + this.tableStates = ImmutableMap.copyOf(tableStates); + } + + public Map toMap(@Nullable Set keyspaceNames, @Nullable Set tableNames) + { + return ImmutableMap.of("lastModifiedEpoch", lastModified.getEpoch(), + "tableStates", tableStatesAsMaps(keyspaceNames, tableNames), + "version", PojoToString.CURRENT_VERSION); + } + + public Collection tableStates() + { + return tableStates.values(); + } + + public List tableStatesFor(List tableIDs) + { + return tableIDs.stream().map(tableStates::get).collect(Collectors.toList()); + } + + private List> tableStatesAsMaps(@Nullable Set keyspaceNames, + @Nullable Set tableNames) + { + ImmutableList.Builder> builder = ImmutableList.builder(); + for (TableMigrationState tms : tableStates.values()) + { + if (keyspaceNames != null && !keyspaceNames.contains(tms.keyspaceName)) + continue; + if (tableNames != null && !tableNames.contains(tms.tableName)) + continue; + builder.add(tms.toMap()); + } + return builder.build(); + } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + ConsensusMigrationState that = (ConsensusMigrationState) o; + return tableStates.equals(that.tableStates); + } + + public ConsensusMigrationState withReversedMigrations(Map tables, Epoch epoch) + { + if (tables.isEmpty()) + return this; + + ImmutableMap.Builder updated = ImmutableMap.builder(); + + tableStates.forEach((id, state) -> { + if (!tables.containsKey(id)) + updated.put(id, state); + }); + + tables.values().forEach(metadata -> { + TableMigrationState state = tableStates.get(metadata.id); + if (state != null) + updated.put(metadata.id, state.reverseMigration(ConsensusMigrationTarget.fromTransactionalMode(metadata.params.transactionalMode), epoch)); + }); + + return new ConsensusMigrationState(lastModified, updated.build()); + } + + private static void withRangesMigrating(Map current, ImmutableMap.Builder next, TableMetadata metadata, List> ranges, boolean overwrite) + { + TableMigrationState tableState = current.get(metadata.id); + checkState(tableState != null || overwrite, "Can't begin migrating a table without first altering the schema to set transactional mode"); + TransactionalMigrationFromMode migrationFromMode = metadata.params.transactionalMigrationFrom; + ConsensusMigrationTarget target = ConsensusMigrationTarget.fromTransactionalMode(metadata.params.transactionalMode); + checkState(migrationFromMode != null && migrationFromMode != TransactionalMigrationFromMode.none, "Table transactional migration from can't be null or none"); + + Map>> migratingRangesByEpoch = ImmutableMap.of(); + if (!ranges.isEmpty()) + migratingRangesByEpoch = ImmutableMap.of(Epoch.EMPTY, ranges); + + if (overwrite) + tableState = new TableMigrationState(metadata.keyspace, metadata.name, metadata.id, target, ImmutableList.of(), initialRepairPendingRanges(target, ranges), migratingRangesByEpoch); + else + tableState = tableState.withRangesMigrating(ranges, target); + + next.put(metadata.id, tableState); + } + + private static void putUnchanged(Map current, ImmutableMap.Builder next, Set changed) + { + current.forEach((id, migrationState) -> { + if (!changed.contains(id)) + next.put(id, migrationState); + }); + } + + private static void putUnchanged(Map current, ImmutableMap.Builder next, Collection changed) + { + Set changedIds = changed.stream().map(TableMetadata::id).collect(Collectors.toSet()); + + putUnchanged(current, next, changedIds); + } + + public ConsensusMigrationState withRangesMigrating(Collection tables, List> ranges, boolean overwrite) + { + ImmutableMap.Builder updated = ImmutableMap.builder(); + putUnchanged(tableStates, updated, tables); + tables.forEach(metadata -> withRangesMigrating(tableStates, updated, metadata, ranges, overwrite)); + return new ConsensusMigrationState(lastModified, updated.build()); + } + + public ConsensusMigrationState withMigrationsCompletedFor(Collection completed) + { + ImmutableMap.Builder updated = ImmutableMap.builder(); + putUnchanged(tableStates, updated, new HashSet<>(completed)); + return new ConsensusMigrationState(lastModified, updated.build()); + } + + public ConsensusMigrationState withRangesRepairedAtEpoch(TableMetadata metadata, List> ranges, Epoch minEpoch, ConsensusMigrationRepairType repairType) + { + TableMigrationState state = Preconditions.checkNotNull(tableStates.get(metadata.id)); + state = state.withRangesRepairedAtEpoch(ranges, minEpoch, repairType); + + if (state.hasMigratedFullTokenRange(metadata.partitioner)) + { + return withMigrationsCompletedFor(Collections.singleton(metadata.id)); + } + else + { + ImmutableMap.Builder updated = ImmutableMap.builder(); + putUnchanged(tableStates, updated, Collections.singleton(metadata.id)); + updated.put(metadata.id, state); + return new ConsensusMigrationState(lastModified, updated.build()); + } + } + + public ConsensusMigrationState withMigrationsRemovedFor(Set removed) + { + if (tableStates.isEmpty() || Sets.intersection(tableStates.keySet(), removed).isEmpty()) + return this; + ImmutableMap.Builder updated = ImmutableMap.builder(); + putUnchanged(tableStates, updated, removed); + return new ConsensusMigrationState(lastModified, updated.build()); + } + + @Override + public int hashCode() + { + return Objects.hash(tableStates); + } + + @Override + public ConsensusMigrationState withLastModified(Epoch epoch) + { + ImmutableMap.Builder newMap = ImmutableMap.builderWithExpectedSize(tableStates.size()); + tableStates.forEach((tableId, tableState) -> { + newMap.put(tableId, tableState.withReplacementForEmptyEpoch(epoch)); + }); + return new ConsensusMigrationState(epoch, newMap.build()); + } + + @Override + public Epoch lastModified() + { + return lastModified; + } + + public void validateAgainstSchema(DistributedSchema schema) + { + tableStates.forEach((id, migrationState) -> { + TableMetadata metadata = schema.getTableMetadata(id); + checkState(ConsensusMigrationTarget.fromTransactionalMode(metadata.params.transactionalMode).equals(migrationState.targetProtocol)); + }); + } + + public static final MetadataSerializer serializer = new MetadataSerializer() + { + @Override + public void serialize(ConsensusMigrationState consensusMigrationState, DataOutputPlus out, Version version) throws IOException + { + Epoch.serializer.serialize(consensusMigrationState.lastModified, out, version); + serializeMap(consensusMigrationState.tableStates, out, version, TableId.metadataSerializer, TableMigrationState.serializer); + } + + @Override + public ConsensusMigrationState deserialize(DataInputPlus in, Version version) throws IOException + { + Epoch lastModified = Epoch.serializer.deserialize(in, version); + Map tableMigrationStates = deserializeMap(in, version, TableId.metadataSerializer, TableMigrationState.serializer, Maps::newHashMapWithExpectedSize); + return new ConsensusMigrationState(lastModified, tableMigrationStates); + } + + @Override + public long serializedSize(ConsensusMigrationState t, Version version) + { + return Epoch.serializer.serializedSize(t.lastModified, version) + + serializedMapSize(t.tableStates, version, TableId.metadataSerializer, TableMigrationState.serializer); + } + }; + + @Override + public String toString() + { + return "ConsensusMigrationState{" + + "tableStates=" + tableStates + + ", lastModified=" + lastModified + + '}'; + } +} diff --git a/src/java/org/apache/cassandra/service/consensus/migration/ConsensusMigrationTarget.java b/src/java/org/apache/cassandra/service/consensus/migration/ConsensusMigrationTarget.java new file mode 100644 index 000000000000..f711380c1099 --- /dev/null +++ b/src/java/org/apache/cassandra/service/consensus/migration/ConsensusMigrationTarget.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.consensus.migration; + +import com.google.common.primitives.SignedBytes; + +import org.apache.cassandra.service.consensus.TransactionalMode; + +import static org.apache.cassandra.utils.LocalizeString.toLowerCaseLocalized; + +public enum ConsensusMigrationTarget +{ + paxos(0), + accord(1); + + public final byte value; + + ConsensusMigrationTarget(int value) + { + this.value = SignedBytes.checkedCast(value); + } + + public boolean isMigratedBy(ConsensusMigrationRepairType repairType) + { + return this == accord ? repairType.migrationToAccordEligible() : repairType.migrationToPaxosEligible(); + } + + public static ConsensusMigrationTarget fromString(String targetProtocol) + { + return ConsensusMigrationTarget.valueOf(toLowerCaseLocalized(targetProtocol)); + } + + public static ConsensusMigrationTarget fromValue(byte value) + { + switch (value) + { + default: + throw new IllegalArgumentException(value + " is not recognized"); + case 0: + return paxos; + case 1: + return accord; + } + } + + public static ConsensusMigrationTarget fromTransactionalMode(TransactionalMode mode) + { + return mode.accordIsEnabled ? accord : paxos; + } +} diff --git a/src/java/org/apache/cassandra/service/consensus/migration/ConsensusRequestRouter.java b/src/java/org/apache/cassandra/service/consensus/migration/ConsensusRequestRouter.java new file mode 100644 index 000000000000..c508fbc279d8 --- /dev/null +++ b/src/java/org/apache/cassandra/service/consensus/migration/ConsensusRequestRouter.java @@ -0,0 +1,919 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.consensus.migration; + +import java.util.ArrayList; +import java.util.List; +import java.util.Optional; +import java.util.function.BiPredicate; +import javax.annotation.Nonnull; +import javax.annotation.Nullable; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.ImmutableList; + +import accord.primitives.Routable.Domain; +import accord.primitives.Seekables; +import accord.primitives.Txn; +import accord.primitives.Txn.Kind; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.db.ColumnFamilyStore; +import org.apache.cassandra.db.ConsistencyLevel; +import org.apache.cassandra.db.DecoratedKey; +import org.apache.cassandra.db.PartitionPosition; +import org.apache.cassandra.db.PartitionRangeReadCommand; +import org.apache.cassandra.db.ReadCommand; +import org.apache.cassandra.db.SinglePartitionReadCommand; +import org.apache.cassandra.dht.AbstractBounds; +import org.apache.cassandra.dht.Bounds; +import org.apache.cassandra.dht.NormalizedRanges; +import org.apache.cassandra.dht.Range; +import org.apache.cassandra.dht.Token; +import org.apache.cassandra.exceptions.InvalidRequestException; +import org.apache.cassandra.exceptions.RetryOnDifferentSystemException; +import org.apache.cassandra.locator.EndpointsForToken; +import org.apache.cassandra.locator.ReplicaLayout; +import org.apache.cassandra.schema.KeyspaceMetadata; +import org.apache.cassandra.schema.Keyspaces; +import org.apache.cassandra.schema.Schema; +import org.apache.cassandra.schema.SchemaConstants; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.schema.TableParams; +import org.apache.cassandra.service.accord.TokenRange; +import org.apache.cassandra.service.consensus.TransactionalMode; +import org.apache.cassandra.service.consensus.migration.ConsensusKeyMigrationState.KeyMigrationState; +import org.apache.cassandra.service.paxos.Paxos; +import org.apache.cassandra.service.reads.ReadCoordinator; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.transport.Dispatcher; +import org.apache.cassandra.utils.FBUtilities; +import org.apache.cassandra.utils.Pair; + +import static com.google.common.base.Preconditions.checkNotNull; +import static com.google.common.base.Preconditions.checkState; +import static org.apache.cassandra.dht.Range.compareRightToken; +import static org.apache.cassandra.service.consensus.migration.ConsensusKeyMigrationState.getConsensusMigratedAt; +import static org.apache.cassandra.service.consensus.migration.ConsensusMigrationTarget.paxos; +import static org.apache.cassandra.service.consensus.migration.ConsensusRequestRouter.ConsensusRoutingDecision.accord; +import static org.apache.cassandra.service.consensus.migration.ConsensusRequestRouter.ConsensusRoutingDecision.paxosV1; +import static org.apache.cassandra.service.consensus.migration.ConsensusRequestRouter.ConsensusRoutingDecision.paxosV2; + +/** + * Helper class to decide where to route a request that requires consensus, migrating a key if necessary + * before rerouting. + * + * This router has to be used for all SERIAL reads and writes to ensure the correct operation of Paxos/Acocrd during migration + * and for all non-SERIAL reads because non-SERIAL reads may end up being routed to Accord and Accord needs CRR to manage + * any key migrations that need to be performed + */ +public class ConsensusRequestRouter +{ + public enum ConsensusRoutingDecision + { + paxosV1, + paxosV2, + accord, + } + + public static volatile ConsensusRequestRouter instance = new ConsensusRequestRouter(); + + @VisibleForTesting + public static void setInstance(ConsensusRequestRouter testInstance) + { + instance = testInstance; + } + + @VisibleForTesting + public static void resetInstance() + { + instance = new ConsensusRequestRouter(); + } + + protected ConsensusRequestRouter() {} + + ConsensusRoutingDecision decisionFor(TransactionalMode transactionalMode) + { + if (transactionalMode.accordIsEnabled) + return accord; + + return pickPaxos(); + } + + /* + * Accord never handles local tables, but if the table doesn't exist then we need to generate the correct + * InvalidRequestException. + */ + private static TableMetadata metadata(ClusterMetadata cm, String keyspace, String table) + { + Optional ksm = cm.schema.maybeGetKeyspaceMetadata(keyspace); + if (ksm.isEmpty()) + { + // It's a non-distributed table which is fine, but we want to error if it doesn't exist + // We should never actually reach here unless there is a race with dropping the table + Keyspaces localKeyspaces = Schema.instance.localKeyspaces(); + KeyspaceMetadata ksm2 = localKeyspaces.getNullable(keyspace); + if (ksm2 == null) + throw new InvalidRequestException("Keyspace " + keyspace + " does not exist"); + // Explicitly including views in case they get used in non-distributed tables + TableMetadata tbm2 = ksm2.getTableOrViewNullable(table); + if (tbm2 == null) + throw new InvalidRequestException("Table " + keyspace + "." + table + " does not exist"); + return null; + } + TableMetadata tbm = ksm.get().getTableNullable(table); + if (tbm == null) + throw new InvalidRequestException("Table " + keyspace + "." + table + " does not exist"); + + return tbm; + } + + public ConsensusRoutingDecision routeAndMaybeMigrate(@Nonnull ClusterMetadata cm, @Nonnull DecoratedKey key, @Nonnull String keyspace, @Nonnull String table, ConsistencyLevel consistencyLevel, Dispatcher.RequestTime requestTime, long timeoutNanos, boolean isForWrite) + { + TableMetadata metadata = metadata(cm, keyspace, table); + + // Non-distributed tables always take the Paxos path + if (metadata == null) + return pickPaxos(); + return routeAndMaybeMigrate(cm, metadata, key, consistencyLevel, requestTime, timeoutNanos, isForWrite); + } + + public ConsensusRoutingDecision routeAndMaybeMigrate(@Nonnull ClusterMetadata cm, @Nonnull DecoratedKey key, @Nonnull TableId tableId, ConsistencyLevel consistencyLevel, Dispatcher.RequestTime requestTime, long timeoutNanos, boolean isForWrite) + { + TableMetadata metadata = getTableMetadata(cm, tableId); + // Non-distributed tables always take the Paxos path + if (metadata == null) + pickPaxos(); + return routeAndMaybeMigrate(cm, metadata, key, consistencyLevel, requestTime, timeoutNanos, isForWrite); + } + + public static TableMetadata getTableMetadata(ClusterMetadata cm, TableId tableId) + { + TableMetadata tm = cm.schema.getTableMetadata(tableId); + if (tm == null) + { + // It's a non-distributed table which is fine, but we want to error if it doesn't exist + // We should never actually reach here unless there is a race with dropping the table + Keyspaces localKeyspaces = Schema.instance.localKeyspaces(); + TableMetadata tm2 = localKeyspaces.getTableOrViewNullable(tableId); + if (tm2 == null) + throw new InvalidRequestException("Table with id " + tableId + " does not exist"); + return null; + } + return tm; + } + + protected ConsensusRoutingDecision routeAndMaybeMigrate(ClusterMetadata cm, @Nonnull TableMetadata tmd, @Nonnull DecoratedKey key, ConsistencyLevel consistencyLevel, Dispatcher.RequestTime requestTime, long timeoutNanos, boolean isForWrite) + { + + if (!tmd.params.transactionalMigrationFrom.isMigrating()) + return decisionFor(tmd.params.transactionalMode); + + TableMigrationState tms = cm.consensusMigrationState.tableStates.get(tmd.id); + if (tms == null) + return decisionFor(tmd.params.transactionalMigrationFrom.from); + + Token token = key.getToken(); + if (tms.migratedRanges.intersects(token)) + return pickMigrated(tms.targetProtocol); + + if (tms.migratingRanges.intersects(token)) + return pickBasedOnKeyMigrationStatus(cm, tmd, tms, key, consistencyLevel, requestTime, timeoutNanos, isForWrite); + + // It's not migrated so infer the protocol from the target + return pickNotMigrated(tms.targetProtocol); + } + + /** + * If the key was already migrated then we can pick the target protocol otherwise + * we have to run a repair operation on the key to migrate it. + */ + private static ConsensusRoutingDecision pickBasedOnKeyMigrationStatus(ClusterMetadata cm, TableMetadata tmd, TableMigrationState tms, DecoratedKey key, ConsistencyLevel consistencyLevel, Dispatcher.RequestTime requestTime, long timeoutNanos, boolean isForWrite) + { + checkState(pickPaxos() != paxosV1, "Can't migrate from PaxosV1 to anything"); + + ColumnFamilyStore cfs = ColumnFamilyStore.getIfExists(tmd.id); + if (cfs == null) + throw new InvalidRequestException("Can't route consensus request to nonexistent CFS %s.%s".format(tmd.keyspace, tmd.name)); + + // Migration to accord has two phases for each range, in the first phase we can't do key migration because Accord + // can't safely read until the range has had its data repaired so Paxos continues to be used for all reads + // and writes + Token token = key.getToken(); + if (tms.targetProtocol == ConsensusMigrationTarget.accord && tms.repairPendingRanges.intersects(token)) + return pickPaxos(); + + // If it is locally replicated we can check our local migration state to see if it was already migrated + EndpointsForToken naturalReplicas = ReplicaLayout.forNonLocalStrategyTokenRead(cm, cfs.keyspace.getMetadata(), token); + boolean isLocallyReplicated = naturalReplicas.lookup(FBUtilities.getBroadcastAddressAndPort()) != null; + if (isLocallyReplicated) + { + ConsensusMigratedAt consensusMigratedAt = getConsensusMigratedAt(tms.tableId, key); + // Check that key migration that was performed satisfies the requirements of the current in flight migration + // for the range + // Be aware that for Accord->Paxos the cache only tells us if the key was repaired locally + // This ends up still being safe because every single Paxos read (in a migrating range) during migration will check + // locally to see if repair is necessary + if (consensusMigratedAt != null && tms.satisfiedByKeyMigrationAtEpoch(key, consensusMigratedAt)) + return pickMigrated(tms.targetProtocol); + + if (tms.targetProtocol == paxos) + { + // Run the Accord barrier txn now so replicas don't start independent + // barrier transactions to accomplish the migration + // They still might need to go through the fast local path for barrier txns + // at each replica, but they won't create their own txn since we created it here + ConsensusKeyMigrationState.repairKeyAccord(key, tms.tableId, tms.minMigrationEpoch(token).getEpoch(), requestTime, true, isForWrite); + return paxosV2; + } + // Fall through for repairKeyPaxos + } + + // If it's not locally replicated then: + // Accord -> Paxos - Paxos will ask Accord to migrate in the read at each replica if necessary + // Paxos -> Accord - Paxos needs to be repaired before Accord runs so do it here + if (tms.targetProtocol == paxos) + // TODO (important): Why are these two cases paxosV2 instead of `pickPaxos`? + // Because we only supported PaxosV2 for migration? + // Eventually we want to support both so just use pickPaxos and error out on migration from paxosV1 elsewhere? + return paxosV2; + else + { + if (tms.accordSafeToReadRanges.intersects(key.getToken())) + // Should exit exceptionally if the repair is not done + ConsensusKeyMigrationState.repairKeyPaxos(naturalReplicas, cm.epoch, key, cfs, consistencyLevel, requestTime, timeoutNanos, isLocallyReplicated, isForWrite); + else + return pickPaxos(); + } + + return pickMigrated(tms.targetProtocol); + } + + // Allows tests to inject specific responses + public boolean isKeyInMigratingOrMigratedRangeDuringPaxosBegin(TableId tableId, DecoratedKey key) + { + return isKeyInMigratingOrMigratedRangeFromPaxos(tableId, key); + } + + // Allows tests to inject specific responses + public boolean isKeyInMigratingOrMigratedRangeDuringPaxosAccept(TableId tableId, DecoratedKey key) + { + return isKeyInMigratingOrMigratedRangeFromPaxos(tableId, key); + } + + /* + * A lightweight check against cluster metadata that doesn't check if the key has already been migrated + * using local system table state. + */ + public boolean isKeyInMigratingOrMigratedRangeFromPaxos(TableId tableId, DecoratedKey key) + { + TableMigrationState tms = ClusterMetadata.current().consensusMigrationState.tableStates.get(tableId); + // No state means no migration for this table + if (tms == null) + return false; + + // We assume that key migration was already performed and it's safe to execute this on Paxos + if (tms.targetProtocol == ConsensusMigrationTarget.paxos) + return false; + + Token token = key.getToken(); + // Migration from Paxos to Accord has two phases and in the first phase we continue to run Paxos + // until the data has been repaired for the range so that Accord can safely read it after Paxos key migration + if (tms.repairPendingRanges.intersects(token)) + return false; + // The coordinator will need to retry either on Accord if they are trying + // to propose their own value, or by setting the consensus migration epoch to recover an incomplete transaction + if (tms.migratingAndMigratedRanges.intersects(token)) + return true; + + return false; + } + + public boolean isRangeManagedByAccordForReadAndWrite(ClusterMetadata cm, TableId tableId, TokenRange range) + { + TableMetadata metadata = getTableMetadata(cm, tableId); + TransactionalMode transactionalMode = metadata.params.transactionalMode; + TransactionalMigrationFromMode transactionalMigrationFromMode = metadata.params.transactionalMigrationFrom; + TableMigrationState tms = cm.consensusMigrationState.tableStates.get(tableId); + if (tms == null) + { + checkState(transactionalMigrationFromMode == TransactionalMigrationFromMode.none, "TableMigrationState shouldn't be null during migration"); + return transactionalMode.nonSerialReadsThroughAccord; + } + + // = token ends up as a min and max key bound in C* parlance and min and max token key in Accord parlance + // and the conversion to a C* range results in the unintentional creation of a wrap around range. + // Instead treat it like a key and do that check. + if (range.start().isTokenSentinel() + && !range.end().isSentinel() + && range.start().token().equals(range.end().token())) + { + checkState(!range.end().isTokenSentinel(), "Unexpected empty range"); + return isTokenManagedByAccordForReadAndWrite(metadata, tms, range.start().token()); + } + else if (range.start().isTokenSentinel()) + { + // Start is particularly problematic because we use min MinTokenKey to make start inclusive and this is something + // that isn't possible to mimic at all with Range, for end it's less problematic because just the token + // is sufficient for Accord to route the query and select the correct shards even if it might accidentally run + // on an extra shard, the filtering will take care of it. There is nothing to do here but convert to a bounds + // and use the bounds check + PartitionPosition startPP = range.start().token().minKeyBound(); + PartitionPosition endPP; + if (range.end().isTableSentinel()) + endPP = DatabaseDescriptor.getPartitioner().getMinimumToken().maxKeyBound(); + else if (range.end().isTokenSentinel()) + endPP = range.end().token().minKeyBound(); + else + endPP = range.end().token().maxKeyBound(); + Bounds bounds = new Bounds<>(startPP, endPP); + return isBoundsExclusivelyManagedByAccordForRead(transactionalMode, transactionalMigrationFromMode, tms, bounds); + } + else + { + return isRangeManagedByAccordForReadAndWrite(metadata, + cm.consensusMigrationState.tableStates.get(tableId), + range.toKeyspaceRange()); + } + } + + /* + * A lightweight check against cluster metadata that doesn't check if the range has already been migrated + * using local system table state. It just assumes that the key migration has already been done. + * + * This version is for is full read write transactions + */ + public boolean isRangeManagedByAccordForReadAndWrite(TableMetadata metadata, TableMigrationState tms, Range range) + { + checkState(!range.isTrulyWrapAround(), "Accidentally created a wrap around range"); + TransactionalMode transactionalMode = metadata.params.transactionalMode; + TransactionalMigrationFromMode migrationFrom = metadata.params.transactionalMigrationFrom; + + if (migrationFrom.isMigrating()) + checkState(tms != null, "Can't have migration in progress without tms"); + + if (transactionalMode.accordIsEnabled) + { + if (!migrationFrom.isMigrating()) + return true; + if (migrationFrom.migratingFromAccord()) + return true; + // Accord can only read/write the key if it is in a safe to read (repaired) range + if (Range.intersects(tms.accordSafeToReadRanges, ImmutableList.of(range))) + return true; + } + else + { + // Once the migration starts only barriers are allowed to run for the key in Accord + if (migrationFrom.migratingFromAccord() && !Range.intersects(tms.migratingAndMigratedRanges, ImmutableList.of(range))) + return true; + } + + return false; + } + + public boolean isKeyManagedByAccordForReadAndWrite(ClusterMetadata cm, TableId tableId, DecoratedKey key) + { + return isTokenManagedByAccordForReadAndWrite(getTableMetadata(cm, tableId), + cm.consensusMigrationState.tableStates.get(tableId), + key.getToken()); + } + + /* + * A lightweight check against cluster metadata that doesn't check if the key has already been migrated + * using local system table state. It just assumes that the key migration has already been done. + * + * This version is for is full read write transactions + */ + public boolean isTokenManagedByAccordForReadAndWrite(TableMetadata metadata, TableMigrationState tms, Token token) + { + TransactionalMode transactionalMode = metadata.params.transactionalMode; + TransactionalMigrationFromMode migrationFrom = metadata.params.transactionalMigrationFrom; + + if (migrationFrom.isMigrating()) + checkState(tms != null, "Can't have migration in progress without tms"); + + if (transactionalMode.accordIsEnabled) + { + if (!migrationFrom.isMigrating()) + return true; + if (migrationFrom.migratingFromAccord()) + return true; + // Accord can only read/write the key if it is in a safe to read (repaired) range + if (tms.accordSafeToReadRanges.intersects(token)) + return true; + } + else + { + // Once the migration starts only barriers are allowed to run for the key in Accord + if (migrationFrom.migratingFromAccord() && !tms.migratingAndMigratedRanges.intersects(token)) + return true; + } + + return false; + } + + public boolean isKeyManagedByAccordForWrite(ClusterMetadata cm, TableId tableId, DecoratedKey key) + { + return isKeyManagedByAccordForWrite(getTableMetadata(cm, tableId), + cm.consensusMigrationState.tableStates.get(tableId), + key); + } + + /* + * A lightweight check against cluster metadata that doesn't check if the key has already been migrated + * using local system table state. It just assumes that the key migration has already been done. + * + * This version is for writes through Accord before Accord is able to safely read. + */ + public boolean isKeyManagedByAccordForWrite(TableMetadata metadata, TableMigrationState tms, DecoratedKey key) + { + TransactionalMode transactionalMode = metadata.params.transactionalMode; + TransactionalMigrationFromMode migrationFrom = metadata.params.transactionalMigrationFrom; + Token token = key.getToken(); + + if (migrationFrom.isMigrating()) + checkState(tms != null, "Can't have migration in progress without tms"); + + if (transactionalMode.accordIsEnabled) + { + if (!migrationFrom.isMigrating()) + return true; + if (migrationFrom.migratingFromAccord()) + return true; + // Accord can blind write to the key even if it isn't safe to read from it so use migratingAndMigratedRanges + if (tms.migratingAndMigratedRanges.intersects(token)) + return true; + } + else + { + // We can always allow writes through Accord and it's necessary to do that so that + // andy premigration txns aren't exposed to non-transactional writes + if (migrationFrom.nonSerialWritesThroughAccord() && !tms.migratedRanges.intersects(token)) + return true; + } + + return false; + } + + public static Txn.Kind shouldReadEphemerally(Seekables keys, TableParams tableParams, Txn.Kind kind) + { + if (kind != Kind.Read) + return kind; + if (!DatabaseDescriptor.getAccordEphemeralReadEnabledEnabled()) + return kind; + // TODO (nicetohave): this could be enhanced to check the token or the range during migration or work in other modes besides full + if (tableParams.transactionalMode != TransactionalMode.full || tableParams.transactionalMigrationFrom != TransactionalMigrationFromMode.none) + return kind; + // Number of ranges doesn't matter + if (keys.domain() == Domain.Range) + return Kind.EphemeralRead; + if (keys.size() > 1) + return kind; + return Kind.EphemeralRead; + } + + private static ConsensusRoutingDecision pickMigrated(ConsensusMigrationTarget targetProtocol) + { + if (targetProtocol.equals(ConsensusMigrationTarget.accord)) + return accord; + else + return pickPaxos(); + } + + private static ConsensusRoutingDecision pickNotMigrated(ConsensusMigrationTarget targetProtocol) + { + if (targetProtocol.equals(ConsensusMigrationTarget.accord)) + return pickPaxos(); + else + return accord; + } + + private static ConsensusRoutingDecision pickPaxos() + { + return Paxos.useV2() ? paxosV2 : paxosV1; + } + + public static void validateSafeToReadNonTransactionally(ReadCommand command) + { + if (command.potentialTxnConflicts().allowed) + return; + + String keyspace = command.metadata().keyspace; + // System keyspaces are never managed by Accord + if (SchemaConstants.isSystemKeyspace(keyspace)) + return; + + // Local keyspaces are never managed by Accord + if (Schema.instance.localKeyspaces().containsKeyspace(keyspace)) + return; + + ClusterMetadata cm = ClusterMetadata.current(); + TableId tableId = command.metadata().id; + TableMetadata tableMetadata = getTableMetadata(cm, tableId); + // Null for local or dropped tables + if (tableMetadata == null) + return; + + TransactionalMode transactionalMode = tableMetadata.params.transactionalMode; + TransactionalMigrationFromMode transactionalMigrationFromMode = tableMetadata.params.transactionalMigrationFrom; + if (!transactionalMode.nonSerialReadsThroughAccord && !transactionalMigrationFromMode.nonSerialReadsThroughAccord()) + return; + + TableMigrationState tms = cm.consensusMigrationState.tableStates.get(tableId); + + // Null with a transaction mode that reads through Accord indicates a completed migration or table created + // to use Accord initially + if (tms == null) + { + checkState(transactionalMigrationFromMode == TransactionalMigrationFromMode.none); + if (transactionalMode.nonSerialReadsThroughAccord) + { + ColumnFamilyStore cfs = ColumnFamilyStore.getIfExists(tableId); + if (cfs != null) + cfs.metric.readsRejectedOnWrongSystem.mark(); + throw new RetryOnDifferentSystemException(); + } + } + + boolean isExclusivelyReadableFromAccord; + if (command.isRangeRequest()) + isExclusivelyReadableFromAccord = isBoundsExclusivelyManagedByAccordForRead(transactionalMode, transactionalMigrationFromMode, tms, command.dataRange().keyRange()); + else + isExclusivelyReadableFromAccord = isTokenExclusivelyManagedByAccordForRead(transactionalMode, transactionalMigrationFromMode, tms, ((SinglePartitionReadCommand)command).partitionKey().getToken()); + + if (isExclusivelyReadableFromAccord) + { + ColumnFamilyStore cfs = ColumnFamilyStore.getIfExists(tableId); + if (cfs != null) + cfs.metric.readsRejectedOnWrongSystem.mark(); + throw new RetryOnDifferentSystemException(); + } + } + + private static boolean isTokenExclusivelyManagedByAccordForRead(@Nonnull TransactionalMode transactionalMode, + @Nonnull TransactionalMigrationFromMode migrationFrom, + @Nonnull TableMigrationState tms, + @Nonnull Token token) + { + checkNotNull(transactionalMode, "transactionalMode is null"); + checkNotNull(migrationFrom, "migrationFrom is null"); + checkNotNull(tms, "tms (TableMigrationState) is null"); + checkNotNull(token, "bounds is null"); + + if (transactionalMode.accordIsEnabled) + { + if (!migrationFrom.isMigrating()) + return true; + if (migrationFrom.migratingFromAccord()) + return true; + + // Accord is exclusive once the range is fully migrated to Accord, but possible to read from safely + // when accordSafeToReadRanges covers the entire bound + if (tms.migratedRanges.intersects(token)) + return true; + } + else + { + // Once the migration starts only barriers are allowed to run for the key in Accord + if (migrationFrom.migratingFromAccord() && !tms.migratingAndMigratedRanges.intersects(token)) + return true; + } + + return false; + } + + // Returns true if any part of the bound + private static boolean isBoundsExclusivelyManagedByAccordForRead(@Nonnull TransactionalMode transactionalMode, + @Nonnull TransactionalMigrationFromMode migrationFrom, + @Nonnull TableMigrationState tms, + @Nonnull AbstractBounds bounds) + { + checkNotNull(transactionalMode, "transactionalMode is null"); + checkNotNull(migrationFrom, "migrationFrom is null"); + checkNotNull(tms, "tms (TableMigrationState) is null"); + checkNotNull(bounds, "bounds is null"); + + BiPredicate, NormalizedRanges> intersects = (testBounds, testRanges) -> { + // TODO (nicetohave): Efficiency of this intersection + for (org.apache.cassandra.dht.Range range : testRanges) + { + Pair, AbstractBounds> intersectionAndRemainder = Range.intersectionAndRemainder(testBounds, range); + return intersectionAndRemainder.left != null; + } + return false; + }; + + if (bounds.left.getToken().equals(bounds.right.getToken()) && !bounds.inclusiveLeft() && bounds.inclusiveRight()) + { + return isTokenExclusivelyManagedByAccordForRead(transactionalMode, migrationFrom, tms, bounds.left.getToken()); + } + + if (transactionalMode.accordIsEnabled) + { + if (!migrationFrom.isMigrating()) + return true; + if (migrationFrom.migratingFromAccord()) + return true; + + // Accord is exclusive once the range is fully migrated to Accord, but possible to read from safely + // when accordSafeToReadRanges covers the entire bound + if (intersects.test(bounds, tms.migratedRanges)) + return true; + } + else + { + // Once the migration starts only barriers are allowed to run for the key in Accord + if (migrationFrom.migratingFromAccord() && !intersects.test(bounds, tms.migratingAndMigratedRanges)) + return true; + } + + return false; + } + + public enum RangeReadTarget + { + accord, + normal + } + + public static class RangeReadWithTarget + { + public final PartitionRangeReadCommand read; + public final RangeReadTarget target; + + private RangeReadWithTarget(PartitionRangeReadCommand read, RangeReadTarget target) + { + this.read = read; + this.target = target; + } + + @Override + public String toString() + { + return "RangeReadWithTarget{" + + "read=" + read + + ", target=" + target + + '}'; + } + } + + /** + * While it's possible to map the Accord read to a single txn it doesn't seem worth it since it's a pretty unusual + * scenario where we do this during migration and have a lot of different read commands. + */ + public static List splitReadIntoAccordAndNormal(ClusterMetadata cm, PartitionRangeReadCommand read, ReadCoordinator readCoordinator, Dispatcher.RequestTime requestTime) + { + if (!readCoordinator.isEventuallyConsistent()) + return ImmutableList.of(new RangeReadWithTarget(read, RangeReadTarget.normal)); + TableMetadata tm = getTableMetadata(cm, read.metadata().id); + if (tm == null || (!tm.params.transactionalMode.nonSerialReadsThroughAccord && !tm.params.transactionalMigrationFrom.nonSerialReadsThroughAccord())) + return ImmutableList.of(new RangeReadWithTarget(read, RangeReadTarget.normal)); + + List result = null; + TransactionalMode transactionalMode = tm.params.transactionalMode; + TransactionalMigrationFromMode transactionalMigrationFromMode = tm.params.transactionalMigrationFrom; + boolean transactionalModeReadsThroughAccord = transactionalMode.nonSerialReadsThroughAccord; + RangeReadTarget migrationToTarget = transactionalModeReadsThroughAccord ? RangeReadTarget.accord : RangeReadTarget.normal; + boolean migrationFromReadsThroughAccord = transactionalMigrationFromMode.nonSerialReadsThroughAccord(); + RangeReadTarget migrationFromTarget = migrationFromReadsThroughAccord ? RangeReadTarget.accord : RangeReadTarget.normal; + TableMigrationState tms = cm.consensusMigrationState.tableStates.get(tm.id); + if (tms == null) + { + if (transactionalMigrationFromMode == TransactionalMigrationFromMode.none) + // There is no migration and no TMS so do what the schema says since no migration should be required + return ImmutableList.of(new RangeReadWithTarget(read, transactionalModeReadsThroughAccord ? RangeReadTarget.accord : RangeReadTarget.normal)); + else + // If we are migrating from something and there is no migration state the migration hasn't begun + // so continue to do what we are migrating from does until the range is marked as migrating + return ImmutableList.of(new RangeReadWithTarget(read, migrationFromReadsThroughAccord ? RangeReadTarget.accord : RangeReadTarget.normal)); + } + + + // AbstractBounds can potentially be left/right inclusive while Range used to track migration is only right inclusive + // The right way to tackle this seems to be to find the tokens that intersect the key range and then split until + // until nothing intersects + AbstractBounds keyRange = read.dataRange().keyRange(); + AbstractBounds remainder = keyRange; + + // Migrating to Accord we only read through Accord when the range is fully migrated, but migrating back + // we stop reading from Accord as soon as the range is marked migrating and do key migration on read + NormalizedRanges migratedRanges = transactionalModeReadsThroughAccord ? tms.migratedRanges : tms.migratingAndMigratedRanges; + + // Add the preceding range if any + if (!migratedRanges.isEmpty()) + { + Token firstMigratingToken = migratedRanges.get(0).left.getToken(); + int leftCmp = keyRange.left.getToken().compareTo(firstMigratingToken); + int rightCmp = compareRightToken(keyRange.right.getToken(), firstMigratingToken); + if (leftCmp <= 0) + { + if (rightCmp <= 0) + return ImmutableList.of(new RangeReadWithTarget(read, migrationFromTarget)); + AbstractBounds precedingRange = keyRange.withNewRight(rightCmp <= 0 ? keyRange.right : firstMigratingToken.maxKeyBound()); + // Could be an empty bound, it's fine to let a min KeyBound and max KeyBound through as that isn't empty + if (!precedingRange.left.equals(precedingRange.right)) + { + result = new ArrayList<>(); + result.add(new RangeReadWithTarget(read.forSubRange(precedingRange, true), migrationFromTarget)); + } + } + } + + boolean hadAccordReads = false; + for (Range r : migratedRanges) + { + Pair, AbstractBounds> intersectionAndRemainder = Range.intersectionAndRemainder(remainder, r); + if (intersectionAndRemainder.left != null) + { + if (result == null) + result = new ArrayList<>(); + PartitionRangeReadCommand subRead = read.forSubRange(intersectionAndRemainder.left, result.isEmpty() ? true : false); + result.add(new RangeReadWithTarget(subRead, migrationToTarget)); + hadAccordReads = true; + } + remainder = intersectionAndRemainder.right; + if (remainder == null) + break; + } + + if (remainder != null) + { + if (result != null) + result.add(new RangeReadWithTarget(read.forSubRange(remainder, false), migrationFromTarget)); + else + return ImmutableList.of(new RangeReadWithTarget(read.forSubRange(remainder, true), migrationFromTarget)); + } + + checkState(result != null && !result.isEmpty(), "Shouldn't have null or empty result"); + checkState(result.get(0).read.dataRange().startKey().equals(read.dataRange().startKey()), "Split reads should encompass entire range"); + checkState(result.get(result.size() - 1).read.dataRange().stopKey().equals(read.dataRange().stopKey()), "Split reads should encompass entire range"); + if (result.size() > 1) + { + for (int i = 0; i < result.size() - 1; i++) + { + checkState(result.get(i).read.dataRange().stopKey().equals(result.get(i + 1).read.dataRange().startKey()), "Split reads should all be adjacent"); + checkState(result.get(i).target != result.get(i + 1).target, "Split reads should be for different targets"); + } + } + + //TODO (later): https://issues.apache.org/jira/browse/CASSANDRA-20211 Range reads could use a barrier + if (hadAccordReads) + { + // do barrier + } + + return result; + } + + /** + * Result of splitting mutations across Accord and non-transactional boundaries + */ + public static class SplitReads + { + @Nullable + public final SinglePartitionReadCommand.Group accordReads; + + @Nullable + public final SinglePartitionReadCommand.Group normalReads; + + private SplitReads(SinglePartitionReadCommand.Group accordReads, SinglePartitionReadCommand.Group normalReads) + { + this.accordReads = accordReads; + this.normalReads = normalReads; + } + } + + public static SplitReads splitReadsIntoAccordAndNormal(ClusterMetadata cm, SinglePartitionReadCommand.Group reads, ReadCoordinator coordinator, Dispatcher.RequestTime requestTime) + { + if (!coordinator.isEventuallyConsistent()) + return new SplitReads(null, reads); + List accordReads = null; + List normalReads = null; + + TableMetadata tm = getTableMetadata(cm, reads.queries.get(0).metadata().id); + if (tm == null || (!tm.params.transactionalMode.nonSerialReadsThroughAccord && !tm.params.transactionalMigrationFrom.nonSerialReadsThroughAccord())) + return new SplitReads(null, reads); + + TransactionalMode transactionalMode = tm.params.transactionalMode; + TransactionalMigrationFromMode transactionalMigrationFromMode = tm.params.transactionalMigrationFrom; + TableMigrationState tms = cm.consensusMigrationState.tableStates.get(tm.id); + + for (SinglePartitionReadCommand command : reads.queries) + { + if (tokenShouldBeReadThroughAccord(tms, command.partitionKey().getToken(), transactionalMode, transactionalMigrationFromMode)) + { + if (accordReads == null) + accordReads = new ArrayList<>(reads.queries.size()); + accordReads.add(command); + } + else + { + if (normalReads == null) + normalReads = new ArrayList<>(reads.queries.size()); + normalReads.add(command); + } + } + + // When migrating from Accord -> Paxos we need to do the Accord barrier to have acknowledged Accord writes + // be visible to non-SERIAL reads, but from Paxos -> Accord we don't need to because read only transactions + // don't have recovery determinism issues and Accord will honor read consistency levels and match the behavior + // of non-serially reading Paxos transactions. Since it's a non-SERIAL read there is no guarantee of seeing + // in-flight Paxos operations, for that you would need to read at SERIAL. + // If the migration direction is from a mode that used to read through Accord then Accord would be + // doing async commit so we need barriers if this mode is no longer reading through Accord. + if (transactionalMigrationFromMode.isMigrating() && transactionalMigrationFromMode.nonSerialReadsThroughAccord() && !transactionalMode.nonSerialReadsThroughAccord && normalReads != null) + { + checkState(!normalReads.isEmpty()); + List keysNeedingBarrier = null; + long maxRequiredEpoch = Long.MIN_VALUE; + for (SinglePartitionReadCommand readCommand : normalReads) + { + DecoratedKey key = readCommand.partitionKey(); + KeyMigrationState kms = ConsensusKeyMigrationState.getKeyMigrationState(cm, tms, key); + if (!kms.paxosReadSatisfiedByKeyMigration()) + { + if (keysNeedingBarrier == null) + keysNeedingBarrier = new ArrayList<>(normalReads.size()); + keysNeedingBarrier.add(key); + maxRequiredEpoch = Math.max(tms.minMigrationEpoch(key.getToken()).getEpoch(), maxRequiredEpoch); + } + } + + if (keysNeedingBarrier != null) + { + checkState(!keysNeedingBarrier.isEmpty()); + checkState(maxRequiredEpoch != Long.MIN_VALUE); + // Local barriers don't support multiple keys so create a global one unless there is a single key + // See BarrierType enum for explanation of global vs local + boolean global = keysNeedingBarrier.size() > 1 ? true : false; + ConsensusKeyMigrationState.repairKeysAccord(keysNeedingBarrier, tm.id, maxRequiredEpoch, requestTime, global, false); + } + } + + SinglePartitionReadCommand.Group accordGroup = accordReads != null ? SinglePartitionReadCommand.Group.create(accordReads, reads.limits()) : null; + SinglePartitionReadCommand.Group normalGroup = normalReads != null ? SinglePartitionReadCommand.Group.create(normalReads, reads.limits()) : null; + return new SplitReads(accordGroup, normalGroup); + } + + private static boolean tokenShouldBeReadThroughAccord(TableMigrationState tms, + @Nonnull Token token, + @Nonnull TransactionalMode transactionalMode, + TransactionalMigrationFromMode transactionalMigrationFromMode) + { + boolean transactionalModeReadsThroughAccord = transactionalMode.nonSerialReadsThroughAccord; + boolean migrationFromReadsThroughAccord = transactionalMigrationFromMode.nonSerialReadsThroughAccord(); + + if (transactionalModeReadsThroughAccord && migrationFromReadsThroughAccord) + return true; + + // Could be migrating or could be completely migrated, if it's migrating check if the key for this mutation + if (transactionalModeReadsThroughAccord || migrationFromReadsThroughAccord) + { + if (tms == null) + { + if (transactionalMigrationFromMode == TransactionalMigrationFromMode.none) + // There is no migration and no TMS so do what the schema says since no migration should be required + return transactionalModeReadsThroughAccord; + else + // If we are migrating from something and there is no migration state the migration hasn't begun + // so continue to do what we are migrating from does until the range is marked as migrating + return migrationFromReadsThroughAccord; + } + + // In theory we can start reading from Accord immediately because we know these transactions are 100% + // read only but then that impacts performance more so wait for the range to be completely migrated + // when it can potentially do single replica reads + if (transactionalModeReadsThroughAccord) + return tms.migratedRanges.intersects(token); + + // If we are migrating from a mode that used to write to Accord then any range that isn't migrating/migrated + // should continue to write through Accord. + // It's not completely symmetrical because Paxos is able to read Accord's writes by performing a single key barrier + // and regular mutations will be able to do the same thing (needs to be added along with non-transactional reads) + // This means that migrating ranges don't need to be written through Accord because we are running Paxos now + // and not Accord. When migrating to Accord we need to do all the writes through Accord even if we aren't + // reading through Accord so that repair + Accord metadata is sufficient for Accord to be able to read + // safely and deterministically from any coordinator + if (migrationFromReadsThroughAccord) + return !tms.migratingAndMigratedRanges.intersects(token); + } + return false; + } +} diff --git a/src/java/org/apache/cassandra/service/consensus/migration/ConsensusTableMigration.java b/src/java/org/apache/cassandra/service/consensus/migration/ConsensusTableMigration.java new file mode 100644 index 000000000000..00ac2589a36d --- /dev/null +++ b/src/java/org/apache/cassandra/service/consensus/migration/ConsensusTableMigration.java @@ -0,0 +1,405 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.consensus.migration; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.function.Function; +import java.util.function.Predicate; +import javax.annotation.Nonnull; +import javax.annotation.Nullable; + +import com.google.common.base.Predicates; +import com.google.common.collect.ImmutableList; +import com.google.common.util.concurrent.FutureCallback; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.db.ColumnFamilyStore; +import org.apache.cassandra.dht.IPartitioner; +import org.apache.cassandra.dht.NormalizedRanges; +import org.apache.cassandra.dht.Range; +import org.apache.cassandra.dht.Token; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.repair.RepairJobDesc; +import org.apache.cassandra.repair.RepairParallelism; +import org.apache.cassandra.repair.RepairResult; +import org.apache.cassandra.repair.messages.RepairOption; +import org.apache.cassandra.schema.Schema; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.service.StorageService; +import org.apache.cassandra.service.accord.TokenRange; +import org.apache.cassandra.service.paxos.Paxos; +import org.apache.cassandra.streaming.PreviewKind; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.tcm.ClusterMetadataService; +import org.apache.cassandra.tcm.Epoch; +import org.apache.cassandra.tcm.serialization.MetadataSerializer; +import org.apache.cassandra.tcm.serialization.Version; +import org.apache.cassandra.tcm.transformations.BeginConsensusMigrationForTableAndRange; +import org.apache.cassandra.tcm.transformations.MaybeFinishConsensusMigrationForTableAndRange; + +import static java.lang.String.format; +import static java.util.Collections.emptyList; +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkNotNull; +import static com.google.common.base.Preconditions.checkState; +import static com.google.common.collect.ImmutableList.toImmutableList; +import static org.apache.cassandra.dht.NormalizedRanges.normalizedRanges; +import static org.apache.cassandra.dht.Range.normalize; +import static org.apache.cassandra.utils.CollectionSerializers.deserializeList; +import static org.apache.cassandra.utils.CollectionSerializers.serializeCollection; +import static org.apache.cassandra.utils.CollectionSerializers.serializedCollectionSize; + +/** + * Track and update the migration state of individual table and ranges within those tables + */ +public abstract class ConsensusTableMigration +{ + private static final Logger logger = LoggerFactory.getLogger(ConsensusTableMigration.class); + + public static final MetadataSerializer> rangesSerializer = new MetadataSerializer>() + { + + @Override + public void serialize(NormalizedRanges t, DataOutputPlus out, Version version) throws IOException + { + serializeCollection(t, out, version, Range.serializer); + } + + @Override + public NormalizedRanges deserialize(DataInputPlus in, Version version) throws IOException + { + return normalizedRanges(deserializeList(in, version, Range.serializer)); + } + + @Override + public long serializedSize(NormalizedRanges t, Version version) + { + return serializedCollectionSize(t, version, Range.serializer); + } + }; + + public static final FutureCallback completedRepairJobHandler = new FutureCallback() + { + @Override + public void onSuccess(@Nullable RepairResult repairResult) + { + checkNotNull(repairResult, "repairResult should not be null"); + ConsensusMigrationRepairResult migrationResult = repairResult.consensusMigrationRepairResult; + ConsensusMigrationRepairType repairType = migrationResult.type; + + // Need to repair both Paxos and base table state + // Could track them separately, but doesn't seem worth the effort + if (repairType.ineligibleForMigration()) + return; + + RepairJobDesc desc = repairResult.desc; + TableMetadata tm = Schema.instance.getTableMetadata(desc.keyspace, desc.columnFamily); + if (tm == null) + return; + TableMigrationState tms = ClusterMetadata.current().consensusMigrationState.tableStates.get(tm.id); + if (tms == null || !Range.intersects(tms.migratingRanges, desc.ranges)) + return; + + if (!tms.targetProtocol.isMigratedBy(repairResult.consensusMigrationRepairResult.type)) + return; + + NormalizedRanges paxosRepairedRanges = NormalizedRanges.empty(); + if (repairType.migrationToAccordEligible()) + // Paxos always repairs all ranges requested by the repair although there should be nothing + // repaired in the migrated and Accord managed ranges + paxosRepairedRanges = normalizedRanges(desc.ranges); + + NormalizedRanges accordBarrieredRanges = NormalizedRanges.empty(); + if (repairType.migrationToPaxosEligible()) + // Accord only barriers ranges it thinks it manages and repair collects which it barriered + // precisely which doesn't have to match what the entire repair covers + accordBarrieredRanges = normalizedRanges(migrationResult.barrieredRanges.stream() + .map(range -> ((TokenRange)range).toKeyspaceRange()) + .collect(toImmutableList())); + accordBarrieredRanges = normalizedRanges(accordBarrieredRanges); + + ClusterMetadataService.instance().commit( + new MaybeFinishConsensusMigrationForTableAndRange( + desc.keyspace, desc.columnFamily, paxosRepairedRanges, accordBarrieredRanges, + migrationResult.minEpoch, repairType.repairedData, repairType.repairedPaxos, repairType.repairedAccord)); + } + + @Override + public void onFailure(Throwable throwable) + { + // Only successes drive forward progress + } + }; + + private ConsensusTableMigration() {} + + public static @Nullable TableMigrationState getTableMigrationState(TableId tableId) + { + ClusterMetadata cm = ClusterMetadata.current(); + return cm.consensusMigrationState.tableStates.get(tableId); + } + // Used by callers to avoid looking up the TMS multiple times + public static @Nullable TableMigrationState getTableMigrationState(long epoch, TableId tableId) + { + ClusterMetadata cm = ClusterMetadataService.instance().fetchLogFromCMS(Epoch.create(epoch)); + return cm.consensusMigrationState.tableStates.get(tableId); + } + + public static void startMigrationToConsensusProtocol(@Nullable List keyspaceNames, + @Nonnull Optional> maybeTables, + @Nonnull Optional maybeRangesStr) + { + checkArgument(!maybeTables.isPresent() || !maybeTables.get().isEmpty(), "Must provide at least 1 table if Optional is not empty"); + ClusterMetadata cm = ClusterMetadata.current(); + + if (keyspaceNames == null || keyspaceNames.isEmpty()) + { + keyspaceNames = ImmutableList.copyOf(StorageService.instance.getNonLocalStrategyKeyspaces()); + } + checkState(keyspaceNames.size() == 1 || !maybeTables.isPresent(), "Can't specify tables with multiple keyspaces"); + List ids = keyspacesAndTablesToTableIds(cm, keyspaceNames, maybeTables); + + List tableIds = new ArrayList<>(); + for (TableId tableId : ids) + { + TableMetadata metadata = cm.schema.getTableMetadata(tableId); + if (metadata == null || !metadata.params.transactionalMigrationFrom.isMigrating()) + continue; + tableIds.add(tableId); + } + + if (!Paxos.useV2()) + throw new IllegalStateException("Can't do any consensus migrations to/from PaxosV1, switch to V2 first"); + + IPartitioner partitioner = DatabaseDescriptor.getPartitioner(); + Optional>> maybeParsedRanges = maybeRangesStr.map(rangesStr -> ImmutableList.copyOf(RepairOption.parseRanges(rangesStr, partitioner))); + Token minToken = partitioner.getMinimumToken(); + NormalizedRanges ranges = normalizedRanges(maybeParsedRanges.orElse(ImmutableList.of(new Range(minToken, minToken)))); + + ClusterMetadataService.instance().commit(new BeginConsensusMigrationForTableAndRange(ranges, tableIds)); + } + + public static Integer finishMigrationToConsensusProtocol(@Nonnull String keyspace, + @Nonnull Optional> maybeTables, + @Nonnull Optional maybeRangesStr, + @Nonnull ConsensusMigrationTarget target) + { + checkArgument(!maybeTables.isPresent() || !maybeTables.get().isEmpty(), "Must provide at least 1 table if Optional is not empty"); + checkNotNull(target); + ClusterMetadata cm = ClusterMetadata.current(); + + Optional>> localKeyspaceRanges = Optional.of(ImmutableList.copyOf(StorageService.instance.getLocalReplicas(keyspace).onlyFull().ranges())); + List> ranges = maybeRangesToRanges(maybeRangesStr, localKeyspaceRanges); + Map allTableMigrationStates = ClusterMetadata.current().consensusMigrationState.tableStates; + List tableIds = keyspacesAndTablesToTableIds(cm, ImmutableList.of(keyspace), maybeTables, Optional.of(allTableMigrationStates::containsKey)); + + checkState(tableIds.stream().allMatch(allTableMigrationStates::containsKey), "All tables need to be migrating"); + List tableMigrationStates = new ArrayList<>(); + tableIds.forEach(table -> { + TableMetadata tm = cm.schema.getTableMetadata(table); + if (tm == null) + { + logger.warn("Table {} does not exist or was dropped", table); + return; + } + TableMigrationState tms = allTableMigrationStates.get(table); + if (tms == null) + { + logger.warn("Table {} does not have any migration state", tm.name); + return; + } + if (!Range.intersects(ranges, tms.migratingRanges)) + { + logger.warn("Table {} with migrating ranges {} does not intersect with any requested ranges {}", tm.name, tms.migratingRanges, ranges); + return; + } + tableMigrationStates.add(tms); + }); + + switch (target) + { + case accord: + List migratingToAccord = tableMigrationStates.stream().filter(tms -> tms.targetProtocol == ConsensusMigrationTarget.accord).collect(toImmutableList()); + Integer accordDataRepairCmd = finishMigrationToAccordDataRepair(keyspace, migratingToAccord, ranges); + // All ranges are already repaired and ready for Paxos repair + // so kick that off instead + if (accordDataRepairCmd == null) + return finishMigrationToAccordPaxosRepair(keyspace, migratingToAccord, ranges); + return accordDataRepairCmd; + case paxos: + List migratingToPaxos = tableMigrationStates.stream().filter(tms -> tms.targetProtocol == ConsensusMigrationTarget.paxos).collect(toImmutableList());; + return finishMigrationToPaxos(keyspace, migratingToPaxos, ranges); + default: + throw new IllegalArgumentException("Unsupported target: " + target); + } + } + + private interface MigrationFinisher + { + Integer finish(Collection tables, List> ranges); + } + + private static Integer finishMigrationTo(String name, List tableMigrationStates, List> requestedRanges, Function>> migratingRanges, MigrationFinisher migrationFinisher) + { + logger.info("Begin finish migration to {} for ranges {} and tables {}", name, requestedRanges, tableMigrationStates); + List> intersectingRangesList = new ArrayList<>(); + tableMigrationStates.stream().map(migratingRanges).forEach(intersectingRangesList::addAll); + NormalizedRanges intersectingRanges = normalizedRanges(intersectingRangesList); + intersectingRanges = intersectingRanges.intersection(normalizedRanges(requestedRanges)); + if (intersectingRanges.isEmpty()) + { + logger.warn("No requested ranges {} intersect any migrating ranges in any table for migration: {}", requestedRanges, name); + return null; + } + + // Repair requires that the ranges once again be grouped by the ranges provided originally which all + // fall within local range boundaries. This was already checked in maybeRangesToRanges. + List> intersectingRangesGrouped = new ArrayList<>(); + for (Range r : requestedRanges) + { + List> intersectionsForGroup = new ArrayList<>(); + for (Range intersectedRange : intersectingRanges) + intersectionsForGroup.addAll(r.intersectionWith(intersectedRange)); + intersectingRangesGrouped.addAll(normalize(intersectionsForGroup)); + } + return migrationFinisher.finish(tableMigrationStates, intersectingRangesGrouped); + } + + /* + * This is basically just invoking classic Cassandra repair and is pretty redundant with invoking repair + * directly which would also work without issue. It's include so the same interface works for both migrating to/from + * Accord, but it's not great in that repair has a lot of options that might need to be forwarded. + * + * Still maybe more valuable to put this layer of abstraction in so we can change how it works later and it's less + * tightly coupled with the Repair interface which is pretty orthogonal to consensus migration. + * + * This first repair is necessary to allow Accord to read data that was written non-serially because we can't do key + * migration for those operations because there is no metadata like we have with Paxos. + */ + private static Integer finishMigrationToAccordDataRepair(String keyspace, List migratingToAccord, List> requestedRanges) + { + return finishMigrationTo("Accord Data Repair", migratingToAccord, requestedRanges, TableMigrationState::repairPendingRanges, (tables, intersectingRanges) -> { + RepairOption repairOption = getRepairOption(tables, intersectingRanges, true, false, false); + return StorageService.instance.repair(keyspace, repairOption, emptyList()).left; + }); + } + + /* + * Need to perform a second repair that is Paxos and then data so that when migrating to FULL mode Accord can read + * the result of any Paxos operation from any replica. This should only be done on the migrating ranges that are no longer pending data repair + */ + private static Integer finishMigrationToAccordPaxosRepair(String keyspace, List migratingToAccord, List> requestedRanges) + { + return finishMigrationTo("Accord Paxos Repair", migratingToAccord, requestedRanges, tms -> tms.migratingRanges.subtract(tms.repairPendingRanges), (tables, intersectingRanges) -> { + RepairOption repairOption = getRepairOption(tables, intersectingRanges, true, true, false); + return StorageService.instance.repair(keyspace, repairOption, emptyList()).left; + }); + } + + /* + * Migration back to Paxos is pretty simple since Accord can bring all replicas up to date by running barriers and + * supports key migration immediately without any repair. Paxos doesn't have the same sensitivity to non-deterministic + * data reads. + */ + private static Integer finishMigrationToPaxos(String keyspace, List migratingToPaxos, List> requestedRanges) + { + return finishMigrationTo("Paxos", migratingToPaxos, requestedRanges, TableMigrationState::migratingRanges, (tables, intersectingRanges) -> { + RepairOption repairOption = getRepairOption(tables, intersectingRanges, false, false, true); + return StorageService.instance.repair(keyspace, repairOption, emptyList()).left; + }); + } + + + private static List keyspacesAndTablesToTableIds(@Nonnull ClusterMetadata cm, @Nonnull List keyspaceNames, @Nonnull Optional> maybeTables) + { + return keyspacesAndTablesToTableIds(cm, keyspaceNames, maybeTables, Optional.empty()); + } + + private static List keyspacesAndTablesToTableIds(@Nonnull ClusterMetadata cm, @Nonnull List keyspaceNames, @Nonnull Optional> maybeTables, @Nonnull Optional> includeTable) + { + List tableIds = new ArrayList<>(); + for (String keyspaceName : keyspaceNames) + { + Optional> maybeTableIds = maybeTables.map(tableNames -> + tableNames + .stream() + .map(tableName -> { + TableMetadata tm = cm.schema.getTableMetadata(keyspaceName, tableName); + if (tm == null) + throw new IllegalArgumentException(format("Unknown table %s.%s", keyspaceName, tableName)); + return tm.id; + }) + .collect(toImmutableList())); + tableIds.addAll( + maybeTableIds.orElseGet(() -> + cm.schema.getKeyspace(keyspaceName).getColumnFamilyStores() + .stream() + .map(ColumnFamilyStore::getTableId) + .filter(includeTable.orElse(Predicates.alwaysTrue())) // Filter out non-migrating so they don't generate an error + .collect(toImmutableList()))); + } + return tableIds; + } + + @Nonnull + private static RepairOption getRepairOption(Collection tables, List> intersectingRanges, boolean repairData, boolean repairPaxos, boolean repairAccord) + { + boolean primaryRange = false; + // TODO (review): Should disabling incremental repair be exposed for the Paxos repair in case someone explicitly does not do incremental repair? + boolean incremental = repairData; + boolean trace = false; + int numJobThreads = 1; + boolean pullRepair = false; + boolean forceRepair = false; + boolean optimiseStreams = false; + boolean ignoreUnreplicatedKeyspaces = true; + boolean dontPurgeTombstones = false; + RepairOption repairOption = new RepairOption(RepairParallelism.PARALLEL, primaryRange, incremental, trace, numJobThreads, intersectingRanges, pullRepair, forceRepair, PreviewKind.NONE, optimiseStreams, ignoreUnreplicatedKeyspaces, repairData, repairPaxos, dontPurgeTombstones, repairAccord); + tables.forEach(table -> repairOption.getColumnFamilies().add(table.tableName)); + return repairOption; + } + + + // Repair is restricted to local ranges, but manipulating CMS migration state doesn't need to be restricted + private static @Nonnull List> maybeRangesToRanges(@Nonnull Optional maybeRangesStr) + { + return maybeRangesToRanges(maybeRangesStr, Optional.empty()); + } + + private static @Nonnull List> maybeRangesToRanges(@Nonnull Optional maybeRangesStr, Optional>> restrictToRanges) + { + IPartitioner partitioner = DatabaseDescriptor.getPartitioner(); + Optional>> maybeParsedRanges = maybeRangesStr.map(rangesStr -> ImmutableList.copyOf(RepairOption.parseRanges(rangesStr, partitioner))); + Token minToken = partitioner.getMinimumToken(); + List> defaultRanges = restrictToRanges.orElse(ImmutableList.of(new Range(minToken, minToken))); + List> ranges = maybeParsedRanges.orElse(defaultRanges); + checkArgument(ranges.stream().allMatch(range -> defaultRanges.stream().anyMatch(defaultRange -> defaultRange.contains(range))), + "If ranges are specified each range must be contained within a local range (" + defaultRanges + ") for this node to allow for precise repairs. Specified " + ranges); + return ranges; + } +} diff --git a/src/java/org/apache/cassandra/service/consensus/migration/TableMigrationState.java b/src/java/org/apache/cassandra/service/consensus/migration/TableMigrationState.java new file mode 100644 index 000000000000..7fb435b2b283 --- /dev/null +++ b/src/java/org/apache/cassandra/service/consensus/migration/TableMigrationState.java @@ -0,0 +1,449 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.consensus.migration; + +import java.io.IOException; +import java.util.AbstractMap; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.NavigableMap; +import java.util.Objects; +import java.util.Set; +import java.util.stream.Collectors; +import javax.annotation.Nonnull; +import javax.annotation.Nullable; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSortedMap; +import com.google.common.collect.Iterables; +import com.google.common.collect.Maps; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.db.DecoratedKey; +import org.apache.cassandra.db.PartitionPosition; +import org.apache.cassandra.dht.IPartitioner; +import org.apache.cassandra.dht.NormalizedRanges; +import org.apache.cassandra.dht.Range; +import org.apache.cassandra.dht.Token; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.tcm.Epoch; +import org.apache.cassandra.tcm.serialization.MetadataSerializer; +import org.apache.cassandra.tcm.serialization.Version; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkState; +import static com.google.common.collect.ImmutableList.toImmutableList; +import static org.apache.cassandra.db.TypeSizes.sizeof; +import static org.apache.cassandra.dht.NormalizedRanges.normalizedRanges; +import static org.apache.cassandra.dht.Range.normalize; +import static org.apache.cassandra.dht.Range.subtract; +import static org.apache.cassandra.utils.CollectionSerializers.deserializeMap; +import static org.apache.cassandra.utils.CollectionSerializers.deserializeSet; +import static org.apache.cassandra.utils.CollectionSerializers.serializeCollection; +import static org.apache.cassandra.utils.CollectionSerializers.serializeMap; +import static org.apache.cassandra.utils.CollectionSerializers.serializedCollectionSize; +import static org.apache.cassandra.utils.CollectionSerializers.serializedMapSize; + +// TODO Move this into the schema for the table once this is based off of TrM +public class TableMigrationState +{ + private static final Logger logger = LoggerFactory.getLogger(TableMigrationState.class); + + @Nonnull + public final String keyspaceName; + + @Nonnull + public final String tableName; + + @Nonnull + public final TableId tableId; + + @Nonnull + public final ConsensusMigrationTarget targetProtocol; + + /** + * Migrated means that both phases are completed when migrating to Accord. Paxos only has one phase. + */ + @Nonnull + public final NormalizedRanges migratedRanges; + + /* + * Necessary to track which ranges started migrating at which epoch + * in order to know whether a repair qualifies in terms of finishing + * migration of the range. + */ + @Nonnull + public final NavigableMap> migratingRangesByEpoch; + + /** + * Ranges that are migrating and could be in either phase when migrating to Accord. Paxos only has one phase. + */ + @Nonnull + public final NormalizedRanges migratingRanges; + + /** + * These are ranges that are migrating and have not been repaired yet when migrating to Accord so Accord can't read from them. These ranges + * should continue to be operated on by Paxos + * + * When migrating to Accord a repair can only move the range to migrated if it is already in repairCompletedRanges + * + * Additionally Paxos continues to operate on a migrating range and key migration is not performed + * + * This is skipped when migrating from Accord to Paxos. + */ + @Nonnull + public final NormalizedRanges repairPendingRanges; + + /** + * Ranges that are migrating could be in either phase when migrating to Accord. Paxos only has one phase. + */ + @Nonnull + public final NormalizedRanges migratingAndMigratedRanges; + + /** + * Same as migratingAndMigratedRanges if migrating to Paxos, otherwise migratingAndMigratedRanges.subtract(repairPendingRanges) + * + * Not included in equals or hashCode because it is inferred from other fields + */ + @Nonnull + public final NormalizedRanges accordSafeToReadRanges; + + public TableMigrationState(@Nonnull String keyspaceName, + @Nonnull String tableName, + @Nonnull TableId tableId, + @Nonnull ConsensusMigrationTarget targetProtocol, + @Nonnull Collection> migratedRanges, + @Nonnull Collection> repairPendingRanges, + @Nonnull Map>> migratingRangesByEpoch) + { + this.keyspaceName = keyspaceName; + this.tableName = tableName; + this.tableId = tableId; + this.targetProtocol = targetProtocol; + this.migratedRanges = normalizedRanges(migratedRanges); + this.repairPendingRanges = normalizedRanges(repairPendingRanges); + this.migratingRangesByEpoch = ImmutableSortedMap.copyOf( + migratingRangesByEpoch.entrySet() + .stream() + .map(entry -> new AbstractMap.SimpleEntry<>(entry.getKey(), normalizedRanges(entry.getValue()))) + .collect(Collectors.toList())); + this.migratingRanges = normalizedRanges(migratingRangesByEpoch.values().stream().flatMap(Collection::stream).collect(Collectors.toList())); + this.migratingAndMigratedRanges = normalizedRanges(ImmutableList.>builder().addAll(migratedRanges).addAll(migratingRanges).build()); + this.accordSafeToReadRanges = !repairPendingRanges.isEmpty() ? migratingAndMigratedRanges.subtract(this.repairPendingRanges) : migratingAndMigratedRanges; + } + + static List> initialRepairPendingRanges(ConsensusMigrationTarget target, List> initialMigratingRanges) + { + return target == ConsensusMigrationTarget.accord ? ImmutableList.copyOf(initialMigratingRanges) : ImmutableList.of(); + } + + public TableMigrationState reverseMigration(ConsensusMigrationTarget target, Epoch epoch) + { + IPartitioner partitioner = DatabaseDescriptor.getPartitioner(); + Range fullRange = new Range<>(partitioner.getMinimumToken(), partitioner.getMinimumToken()); + List> allTouched = new ArrayList<>(migratedRanges); + allTouched.addAll(migratingRanges); + allTouched = Range.deoverlap(allTouched); + return new TableMigrationState(keyspaceName, tableName, tableId, target, + Range.normalize(fullRange.subtractAll(allTouched)), + initialRepairPendingRanges(target, migratingRanges), + Collections.singletonMap(epoch, migratingRanges)); + } + + public boolean hasMigratedFullTokenRange(IPartitioner partitioner) + { + // migrated ranges are normalized + if (!migratingRanges.isEmpty() || migratedRanges.size() != 1) + return false; + + Range fullRange = new Range<>(partitioner.getMinimumToken(), partitioner.getMinimumToken()); + return migratedRanges.get(0).contains(fullRange); + } + + @Nonnull + public List> migratingRanges() { + + return migratingRanges; + } + + @Nonnull + public List> repairPendingRanges() + { + return repairPendingRanges; + } + + public TableMigrationState withRangesMigrating(@Nonnull Collection> ranges, + @Nonnull ConsensusMigrationTarget target) + { + checkState(!migratingRangesByEpoch.containsKey(Epoch.EMPTY), "Shouldn't already have an entry for the empty epoch"); + // Doesn't matter which epoch the range started migrating in for this context so merge them all + Collection> migratingRanges = normalize(migratingRangesByEpoch.values().stream().flatMap(Collection::stream).collect(Collectors.toList())); + checkArgument(target == targetProtocol, "Requested migration to target protocol " + target + " conflicts with in progress migration to protocol " + targetProtocol); + List> normalizedRanges = normalize(ranges); + if (subtract(normalizedRanges, migratingRanges).isEmpty()) + logger.warn("Range " + ranges + " is already being migrated"); + Set> withoutAlreadyMigrated = subtract(normalizedRanges, migratedRanges); + if (withoutAlreadyMigrated.isEmpty()) + logger.warn("Range " + ranges + " is already migrated"); + Set> withoutBoth = subtract(withoutAlreadyMigrated, migratingRanges); + if (withoutBoth.isEmpty()) + logger.warn("Range " + ranges + " is already migrating/migrated"); + + if (!Range.equals(normalizedRanges, withoutBoth)) + logger.warn("Ranges " + normalizedRanges + " to start migrating is already partially migrating/migrated " + withoutBoth); + + Map>> newMigratingRanges = new HashMap<>(migratingRangesByEpoch.size() + 1); + newMigratingRanges.putAll(migratingRangesByEpoch); + newMigratingRanges.put(Epoch.EMPTY, normalizedRanges); + + List> newRepairPendingRanges = new ArrayList<>(repairPendingRanges); + if (target == ConsensusMigrationTarget.accord) + newRepairPendingRanges.addAll(withoutBoth); + + return new TableMigrationState(keyspaceName, tableName, tableId, targetProtocol, migratedRanges, newRepairPendingRanges, newMigratingRanges); + } + + public TableMigrationState withReplacementForEmptyEpoch(@Nonnull Epoch replacementEpoch) + { + if (!migratingRangesByEpoch.containsKey(Epoch.EMPTY)) + return this; + Map>> newMigratingRangesByEpoch = new HashMap<>(migratingRangesByEpoch.size()); + migratingRangesByEpoch.forEach((epoch, ranges) -> { + if (epoch.equals(Epoch.EMPTY)) + newMigratingRangesByEpoch.put(replacementEpoch, ranges); + else + newMigratingRangesByEpoch.put(epoch, ranges); + }); + + if (newMigratingRangesByEpoch != null) + return new TableMigrationState(keyspaceName, tableName, tableId, targetProtocol, migratedRanges, repairPendingRanges, newMigratingRangesByEpoch); + else + return this; + } + + public TableMigrationState withRangesRepairedAtEpoch(@Nonnull Collection> ranges, + @Nonnull Epoch epoch, + @Nonnull ConsensusMigrationRepairType repairType) + { + checkState(!migratingRangesByEpoch.containsKey(Epoch.EMPTY), "Shouldn't have an entry for the empty epoch"); + checkArgument(epoch.isAfter(Epoch.EMPTY), "Epoch shouldn't be empty"); + + NormalizedRanges normalizedRepairedRanges = normalizedRanges(ranges); + // This should be inclusive because the epoch we store in the map is the epoch in which the range has been marked migrating + // in startMigrationToConsensusProtocol + NavigableMap> coveredEpochs = migratingRangesByEpoch.headMap(epoch, true); + NormalizedRanges normalizedMigratingRanges = normalizedRanges(coveredEpochs.values().stream().flatMap(Collection::stream).collect(Collectors.toList())); + // These are the ranges that are impacted by this repair based on the epoch filtering needed to make sure this repair applies to this migration + NormalizedRanges normalizedRepairedIntersection = normalizedRepairedRanges.intersection(normalizedMigratingRanges); + checkState(!normalizedRepairedIntersection.isEmpty(), "None of Ranges " + ranges + " were being migrated"); + + // Any ranges that were repair pending can't actually be fully migrated yet, they will be subtracted from repairPendingRanges after + // the new migratingRangesByEpoch and migratedRanges are constructed + NormalizedRanges actuallyMigratedRanges = normalizedRepairedIntersection.subtract(repairPendingRanges); + + List> newMigratedRanges = migratedRanges; + Map> newMigratingRangesByEpoch = migratingRangesByEpoch; + + // Not all repairs are capable of completing the migration to a given target + if ((targetProtocol == ConsensusMigrationTarget.accord && repairType.repairsPaxos()) + || (targetProtocol == ConsensusMigrationTarget.paxos && repairType.repairedAccord)) + { + newMigratingRangesByEpoch = new HashMap<>(); + // Everything in this epoch or later can't have been migrated so re-add all of them + newMigratingRangesByEpoch.putAll(migratingRangesByEpoch.tailMap(epoch, false)); + // Include anything still remaining to be migrated after subtracting what was repaired (and not excluded to due repairPendingRanges) + for (Map.Entry> e : coveredEpochs.entrySet()) + { + // Epoch when these ranges started migrating + Epoch rangesEpoch = e.getKey(); + NormalizedRanges epochMigratingRanges = e.getValue(); + NormalizedRanges remainingRanges = epochMigratingRanges.subtract(actuallyMigratedRanges); + if (!remainingRanges.isEmpty()) + newMigratingRangesByEpoch.put(rangesEpoch, remainingRanges); + } + + newMigratedRanges = new ArrayList<>(normalizedMigratingRanges.size() + ranges.size()); + newMigratedRanges.addAll(migratedRanges); + newMigratedRanges.addAll(actuallyMigratedRanges); + } + + // After this repair any ranges in normalizedRepairedIntersection is repaired and no longer repair pending + // Accord can safely read from them if this is a migration to Accord (after Paxos key migration) + List> newRepairPendingRanges = repairPendingRanges; + if (repairType.repairedData) + { + List> repairedRangesSatisfyingEpoch = new ArrayList<>(); + for (Map.Entry> e : coveredEpochs.entrySet()) + repairedRangesSatisfyingEpoch.addAll(normalizedRepairedRanges.intersection(e.getValue())); + newRepairPendingRanges = repairPendingRanges.subtract(normalizedRanges(repairedRangesSatisfyingEpoch)); + } + + return new TableMigrationState(keyspaceName, tableName, tableId, targetProtocol, newMigratedRanges, newRepairPendingRanges, newMigratingRangesByEpoch); + } + + public boolean paxosReadSatisfiedByKeyMigrationAtEpoch(DecoratedKey key, ConsensusMigratedAt consensusMigratedAt) + { + // This check is being done from a Paxos read attempt which needs to + // check if Accord needs to resolve any in flight accord transactions + // if the migration target is Accord then nothing needs to be done + if (targetProtocol != ConsensusMigrationTarget.paxos) + return true; + + return satisfiedByKeyMigrationAtEpoch(key, consensusMigratedAt); + } + + public boolean satisfiedByKeyMigrationAtEpoch(@Nonnull DecoratedKey key, @Nullable ConsensusMigratedAt consensusMigratedAt) + { + if (consensusMigratedAt == null) + { + // It hasn't been migrated and needs migration if it is in a migrating range + return migratingRanges.intersects(key.getToken()); + } + else + { + // It has been migrated and might be from a late enough epoch to satisfy this migration + return consensusMigratedAt.migratedAtTarget == targetProtocol + && migratingRangesByEpoch.headMap(consensusMigratedAt.migratedAtEpoch, true).values() + .stream() + .flatMap(List::stream) + .anyMatch(range -> range.contains(key.getToken())); + } + } + + public Epoch minMigrationEpoch(Token token) + { + // TODO should there be an index to make this more efficient? + for (Map.Entry> e : migratingRangesByEpoch.entrySet()) + { + if (e.getValue().intersects(token)) + return e.getKey(); + } + return Epoch.EMPTY; + } + + + public @Nonnull TableId getTableId() + { + return tableId; + } + + public Map toMap() + { + ImmutableMap.Builder builder = ImmutableMap.builder(); + builder.put("keyspace", keyspaceName); + builder.put("table", tableName); + builder.put("tableId", tableId.toString()); + builder.put("targetProtocol", targetProtocol.toString()); + builder.put("migratedRanges", migratedRanges.stream().map(Objects::toString).collect(toImmutableList())); + Map> rangesByEpoch = new LinkedHashMap<>(); + for (Map.Entry> entry : migratingRangesByEpoch.entrySet()) + { + rangesByEpoch.put(entry.getKey().getEpoch(), entry.getValue().stream().map(Objects::toString).collect(toImmutableList())); + } + builder.put("migratingRangesByEpoch", rangesByEpoch); + builder.put("repairPendingRanges", repairPendingRanges.stream().map(Objects::toString).collect(toImmutableList())); + return builder.build(); + } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + TableMigrationState that = (TableMigrationState) o; + return keyspaceName.equals(that.keyspaceName) && tableName.equals(that.tableName) && tableId.equals(that.tableId) && targetProtocol == that.targetProtocol && migratedRanges.equals(that.migratedRanges) && migratingRangesByEpoch.equals(that.migratingRangesByEpoch) && migratingRanges.equals(that.migratingRanges) && migratingAndMigratedRanges.equals(that.migratingAndMigratedRanges); + } + + @Override + public int hashCode() + { + return Objects.hash(keyspaceName, tableName, tableId, targetProtocol, migratedRanges, migratingRangesByEpoch, migratingRanges, migratingAndMigratedRanges); + } + + public static final MetadataSerializer serializer = new MetadataSerializer() + { + @Override + public void serialize(TableMigrationState t, DataOutputPlus out, Version version) throws IOException + { + out.write(t.targetProtocol.value); + out.writeUTF(t.keyspaceName); + out.writeUTF(t.tableName); + t.tableId.serialize(out); + serializeCollection(t.migratedRanges, out, version, Range.serializer); + serializeCollection(t.repairPendingRanges, out, version, Range.serializer); + serializeMap(t.migratingRangesByEpoch, out, version, Epoch.serializer, ConsensusTableMigration.rangesSerializer); + } + + @Override + public TableMigrationState deserialize(DataInputPlus in, Version version) throws IOException + { + ConsensusMigrationTarget targetProtocol = ConsensusMigrationTarget.fromValue(in.readByte()); + String keyspaceName = in.readUTF(); + String tableName = in.readUTF(); + TableId tableId = TableId.deserialize(in); + Set> migratedRanges = deserializeSet(in, version, Range.serializer); + Set> repairPendingRanges = deserializeSet(in, version, Range.serializer); + Map> migratingRangesByEpoch = deserializeMap(in, version, Epoch.serializer, ConsensusTableMigration.rangesSerializer, Maps::newHashMapWithExpectedSize); + return new TableMigrationState(keyspaceName, tableName, tableId, targetProtocol, migratedRanges, repairPendingRanges, migratingRangesByEpoch); + } + + @Override + public long serializedSize(TableMigrationState t, Version version) + { + return sizeof(t.targetProtocol.value) + + sizeof(t.keyspaceName) + + sizeof(t.tableName) + + t.tableId.serializedSize() + + serializedCollectionSize(t.migratedRanges, version, Range.serializer) + + serializedCollectionSize(t.repairPendingRanges, version, Range.serializer) + + serializedMapSize(t.migratingRangesByEpoch, version, Epoch.serializer, ConsensusTableMigration.rangesSerializer); + } + }; + + public Iterable> migratedRangesAsPartitionPosition() + { + return Iterables.transform(migratedRanges, range -> new Range<>(range.left.maxKeyBound(), range.right.maxKeyBound())); + } + + @Override + public String toString() + { + return "TableMigrationState{" + + "keyspaceName='" + keyspaceName + '\'' + + ", tableName='" + tableName + '\'' + + ", tableId=" + tableId + + ", targetProtocol=" + targetProtocol + + ", migratedRanges=" + migratedRanges + + ", migratingRangesByEpoch=" + migratingRangesByEpoch + + ", migratingRanges=" + migratingRanges + + ", repairPendingRanges=" + repairPendingRanges + + ", migratingAndMigratedRanges=" + migratingAndMigratedRanges + + '}'; + } +} diff --git a/src/java/org/apache/cassandra/service/consensus/migration/TransactionalMigrationFromMode.java b/src/java/org/apache/cassandra/service/consensus/migration/TransactionalMigrationFromMode.java new file mode 100644 index 000000000000..fe1d9a028c02 --- /dev/null +++ b/src/java/org/apache/cassandra/service/consensus/migration/TransactionalMigrationFromMode.java @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.consensus.migration; + +import org.apache.cassandra.service.consensus.TransactionalMode; + +import static org.apache.cassandra.utils.LocalizeString.toLowerCaseLocalized; + +/** + * This tracks the state of a migration either from Paxos -> Accord, Accord [interop mode a] -> Accord [interop mode b] or Accord -> Paxos. + * The `TransactionalMode` associated with each transition from a system is how interoperability should be achieved during the migration with various performance/safety tradeoffs. + */ +public enum TransactionalMigrationFromMode +{ + none(null), // No migration is in progress. The currently active transaction system could be either Accord or Paxos. + off(TransactionalMode.off), + mixed_reads(TransactionalMode.mixed_reads), + full(TransactionalMode.full), + test_unsafe(TransactionalMode.test_unsafe), + test_unsafe_writes(TransactionalMode.test_unsafe_writes), + test_interop_read(TransactionalMode.test_interop_read); + + public final TransactionalMode from; + + TransactionalMigrationFromMode(TransactionalMode from) + { + this.from = from; + } + + public static TransactionalMigrationFromMode fromMode(TransactionalMode prev, TransactionalMode next) + { + if (next.accordIsEnabled == prev.accordIsEnabled) + return none; + + switch (prev) + { + default: throw new IllegalArgumentException(); + case off: return off; + case mixed_reads: return mixed_reads; + case full: return full; + case test_interop_read: return test_interop_read; + case test_unsafe: return test_unsafe; + case test_unsafe_writes: return test_unsafe_writes; + } + } + + public static TransactionalMigrationFromMode fromOrdinal(int ordinal) + { + return values()[ordinal]; + } + + public static TransactionalMigrationFromMode fromString(String name) + { + return valueOf(toLowerCaseLocalized(name)); + } + + public boolean migratingFromAccord() + { + return from != null && from.accordIsEnabled; + } + + public boolean nonSerialWritesThroughAccord() + { + return from != null && from.nonSerialWritesThroughAccord; + } + + public boolean readRepairsThroughAccord() + { + return from != null && from.blockingReadRepairThroughAccord; + } + + public boolean nonSerialReadsThroughAccord() + { + return from != null && from.nonSerialReadsThroughAccord; + } + + public boolean isMigrating() + { + return this != none; + } + + public String asCqlParam() + { + return String.format("transactional_migration_from = '%s'", toLowerCaseLocalized(this.name())); + } +} diff --git a/src/java/org/apache/cassandra/service/paxos/AbstractPaxosRepair.java b/src/java/org/apache/cassandra/service/paxos/AbstractPaxosRepair.java index 05c67da3f1f1..7640d75e8981 100644 --- a/src/java/org/apache/cassandra/service/paxos/AbstractPaxosRepair.java +++ b/src/java/org/apache/cassandra/service/paxos/AbstractPaxosRepair.java @@ -24,9 +24,9 @@ import java.util.List; import java.util.Objects; import java.util.function.Consumer; +import javax.annotation.Nullable; import com.google.common.base.Preconditions; - import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -64,9 +64,9 @@ public void accept(T input) public static class Result extends State { - enum Outcome { DONE, CANCELLED, FAILURE } + public enum Outcome { DONE, CANCELLED, FAILURE } - final Outcome outcome; + public final Outcome outcome; public Result(Outcome outcome) { @@ -127,15 +127,20 @@ public int hashCode() } private final DecoratedKey partitionKey; + @Nullable private final Ballot incompleteBallot; + + protected final long retryTimeoutNanos; + private List listeners = null; private volatile State state; private volatile long startedNanos = Long.MIN_VALUE; - public AbstractPaxosRepair(DecoratedKey partitionKey, Ballot incompleteBallot) + public AbstractPaxosRepair(DecoratedKey partitionKey, Ballot incompleteBallot, long retryTimeoutNanos) { this.partitionKey = partitionKey; this.incompleteBallot = incompleteBallot; + this.retryTimeoutNanos = retryTimeoutNanos; } public State state() @@ -158,7 +163,8 @@ public boolean isComplete() return isResult(state); } - public Ballot incompleteBallot() + // Shouldn't be null when used by PaxosRepairs, but will be null when used by ConsensusRequestRouter + public @Nullable Ballot incompleteBallot() { return incompleteBallot; } @@ -203,11 +209,19 @@ public final DecoratedKey partitionKey() public State restart(State state) { return restart(state, Long.MIN_VALUE); } public abstract State restart(State state, long waitUntil); + // Used to start repairs from PaxosTableRepairs public final synchronized AbstractPaxosRepair start() + { + long startedNanos = Math.max(Long.MIN_VALUE + 1, nanoTime()); + return start(startedNanos); + } + + // Used to start repairs from ConsensusRequestRouter + public final synchronized AbstractPaxosRepair start(long queryStartNanos) { updateState(null, null, (state, i2) -> { Preconditions.checkState(!isStarted()); - startedNanos = Math.max(Long.MIN_VALUE + 1, nanoTime()); + startedNanos = queryStartNanos; return restart(state); }); return this; diff --git a/src/java/org/apache/cassandra/service/paxos/Commit.java b/src/java/org/apache/cassandra/service/paxos/Commit.java index 3aa8d65bcef0..c6a9916f794c 100644 --- a/src/java/org/apache/cassandra/service/paxos/Commit.java +++ b/src/java/org/apache/cassandra/service/paxos/Commit.java @@ -23,25 +23,26 @@ import java.io.IOException; import java.util.function.BiFunction; - import javax.annotation.Nullable; import com.google.common.base.Objects; -import org.apache.cassandra.db.*; -import org.apache.cassandra.db.rows.*; +import org.apache.cassandra.db.DecoratedKey; +import org.apache.cassandra.db.Mutation; +import org.apache.cassandra.db.ReadCommand.PotentialTxnConflicts; import org.apache.cassandra.db.partitions.PartitionUpdate; +import org.apache.cassandra.db.rows.DeserializationHelper; import org.apache.cassandra.io.IVersionedSerializer; import org.apache.cassandra.io.util.DataInputPlus; import org.apache.cassandra.io.util.DataOutputPlus; import org.apache.cassandra.schema.TableMetadata; -import static org.apache.cassandra.db.SystemKeyspace.*; +import static org.apache.cassandra.db.SystemKeyspace.legacyPaxosTtlSec; import static org.apache.cassandra.service.paxos.Commit.CompareResult.AFTER; import static org.apache.cassandra.service.paxos.Commit.CompareResult.BEFORE; import static org.apache.cassandra.service.paxos.Commit.CompareResult.IS_REPROPOSAL; -import static org.apache.cassandra.service.paxos.Commit.CompareResult.WAS_REPROPOSED_BY; import static org.apache.cassandra.service.paxos.Commit.CompareResult.SAME; +import static org.apache.cassandra.service.paxos.Commit.CompareResult.WAS_REPROPOSED_BY; import static org.apache.cassandra.utils.FBUtilities.nowInSeconds; public class Commit @@ -314,7 +315,7 @@ public boolean hasSameBallot(Commit other) public Mutation makeMutation() { - return new Mutation(update); + return new Mutation(update, PotentialTxnConflicts.ALLOW); } @Override diff --git a/src/java/org/apache/cassandra/service/paxos/ContentionStrategy.java b/src/java/org/apache/cassandra/service/paxos/ContentionStrategy.java index 59ee5505123e..63dba970be3a 100644 --- a/src/java/org/apache/cassandra/service/paxos/ContentionStrategy.java +++ b/src/java/org/apache/cassandra/service/paxos/ContentionStrategy.java @@ -22,356 +22,126 @@ import com.google.common.base.Preconditions; import com.google.common.collect.ImmutableMap; -import com.codahale.metrics.Snapshot; import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.db.ConsistencyLevel; import org.apache.cassandra.db.DecoratedKey; import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.service.RetryStrategy; +import org.apache.cassandra.service.TimeoutStrategy; +import org.apache.cassandra.service.TimeoutStrategy.LatencySourceFactory; +import org.apache.cassandra.service.TimeoutStrategy.ReadWriteLatencySourceFactory; +import org.apache.cassandra.service.TimeoutStrategy.Wait; import org.apache.cassandra.tracing.Tracing; import org.apache.cassandra.utils.ByteBufferUtil; import org.apache.cassandra.utils.NoSpamLogger; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import java.util.concurrent.ThreadLocalRandom; import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicReference; -import java.util.function.DoubleSupplier; -import java.util.function.LongBinaryOperator; import java.util.function.Supplier; import java.util.regex.Matcher; import java.util.regex.Pattern; -import static java.lang.Double.parseDouble; -import static java.lang.Integer.parseInt; -import static java.lang.Math.*; +import javax.annotation.Nullable; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import static java.lang.Math.max; +import static java.lang.Math.min; import static java.util.Arrays.stream; -import static java.util.concurrent.TimeUnit.*; +import static java.util.concurrent.TimeUnit.MICROSECONDS; +import static java.util.concurrent.TimeUnit.MINUTES; import static org.apache.cassandra.config.DatabaseDescriptor.*; import static org.apache.cassandra.metrics.ClientRequestsMetricsHolder.casReadMetrics; import static org.apache.cassandra.metrics.ClientRequestsMetricsHolder.casWriteMetrics; -import static org.apache.cassandra.utils.Clock.Global.nanoTime; +import static org.apache.cassandra.service.TimeoutStrategy.parseInMicros; import static org.apache.cassandra.utils.Clock.waitUntil; import static org.apache.cassandra.utils.LocalizeString.toLowerCaseLocalized; /** - *

    A strategy for making back-off decisions for Paxos operations that fail to make progress because of other paxos operations. - * The strategy is defined by four factors:

      - *
    • {@link #min} - *
    • {@link #max} - *
    • {@link #minDelta} - *
    • {@link #waitRandomizer} - *
    - * - *

    The first three represent time periods, and may be defined dynamically based on a simple calculation over:

      - *
    • {@code pX()} recent experienced latency distribution for successful operations, - * e.g. {@code p50(rw)} the maximum of read and write median latencies, - * {@code p999(r)} the 99.9th percentile of read latencies - *
    • {@code attempts} the number of failed attempts made by the operation so far - *
    • {@code constant} a user provided floating point constant - *
    - * - *

    Their calculation may take any of these forms - *

  • constant {@code $constant$[mu]s} - *
  • dynamic constant {@code pX() * constant} - *
  • dynamic linear {@code pX() * constant * attempts} - *
  • dynamic exponential {@code pX() * constant ^ attempts} - * - *

    Furthermore, the dynamic calculations can be bounded with a min/max, like so: - * {@code min[mu]s <= dynamic expr <= max[mu]s} - * - * e.g. - *

  • {@code 10ms <= p50(rw)*0.66} - *
  • {@code 10ms <= p95(rw)*1.8^attempts <= 100ms} - *
  • {@code 5ms <= p50(rw)*0.5} - * - *

    These calculations are put together to construct a range from which we draw a random number. - * The period we wait for {@code X} will be drawn so that {@code min <= X < max}. - * - *

    With the constraint that {@code max} must be {@code minDelta} greater than {@code min}, - * but no greater than its expression-defined maximum. {@code max} will be increased up until - * this point, after which {@code min} will be decreased until this gap is imposed. - * - *

    The {@link #waitRandomizer} property specifies the manner in which a random value is drawn from the range. - * It is defined using one of the following specifiers: - *

  • uniform - *
  • exp($power$) or exponential($power$) - *
  • qexp($power$) or qexponential($power$) or quantizedexponential($power$) - * - * The uniform specifier is self-explanatory, selecting all values in the range with equal probability. - * The exponential specifier draws values towards the end of the range with higher probability, raising - * a floating point number in the range [0..1.0) to the power provided, and translating the resulting value - * to a uniform value in the range. - * The quantized exponential specifier partitions the range into {@code attempts} buckets, then applies the pure - * exponential approach to draw values from [0..attempts), before drawing a uniform value from the corresponding bucket - * - *

    Finally, there is also a {@link #traceAfterAttempts} property that permits initiating tracing of operations - * that experience a certain minimum number of failed paxos rounds due to contention. A setting of 0 or 1 will initiate - * a trace session after the first failed ballot. + * See {@link RetryStrategy} + * TODO (expected): deprecate in favour of pure RetryStrategy */ -public class ContentionStrategy +public class ContentionStrategy extends RetryStrategy { - private static final Logger logger = LoggerFactory.getLogger(ContentionStrategy.class); - - private static final Pattern BOUND = Pattern.compile( - "(?0|[0-9]+[mu]s)" + - "|((?0|[0-9]+[mu]s) *<= *)?" + - "(p(?[0-9]+)\\((?r|w|rw|wr)\\)|(?0|[0-9]+[mu]s))" + - "\\s*([*]\\s*(?[0-9.]+)?\\s*(?[*^]\\s*attempts)?)?" + - "( *<= *(?0|[0-9]+[mu]s))?"); - private static final Pattern TIME = Pattern.compile( - "0|([0-9]+)ms|([0-9]+)us"); - private static final Pattern RANDOMIZER = Pattern.compile( - "uniform|exp(onential)?[(](?[0-9.]+)[)]|q(uantized)?exp(onential)?[(](?[0-9.]+)[)]"); - private static final String DEFAULT_WAIT_RANDOMIZER = "qexp(1.5)"; // at least 0ms, and at least 66% of median latency - private static final String DEFAULT_MIN = "0 <= p50(rw)*0.66"; // at least 0ms, and at least 66% of median latency - private static final String DEFAULT_MAX = "10ms <= p95(rw)*1.8^attempts <= 100ms"; // p95 latency with exponential back-off at rate of 1.8^attempts - private static final String DEFAULT_MIN_DELTA = "5ms <= p50(rw)*0.5"; // at least 5ms, and at least 50% of median latency - - private static volatile ContentionStrategy current; - - // Factories can be useful for testing purposes, to supply custom implementations of selectors and modifiers. - final static LatencySelectorFactory selectors = new LatencySelectorFactory(){}; - final static LatencyModifierFactory modifiers = new LatencyModifierFactory(){}; - final static WaitRandomizerFactory randomizers = new WaitRandomizerFactory(){}; - - static - { - current = new ContentionStrategy(defaultWaitRandomizer(), defaultMinWait(), defaultMaxWait(), defaultMinDelta(), Integer.MAX_VALUE); - } - - static interface LatencyModifierFactory - { - default LatencyModifier identity() { return (l, a) -> l; } - default LatencyModifier multiply(double constant) { return (l, a) -> saturatedCast(l * constant); } - default LatencyModifier multiplyByAttempts(double multiply) { return (l, a) -> saturatedCast(l * multiply * a); } - default LatencyModifier multiplyByAttemptsExp(double base) { return (l, a) -> saturatedCast(l * pow(base, a)); } - } - - static interface LatencySupplier - { - abstract long get(double percentile); - } + private static final Logger logger = LoggerFactory.getLogger(RetryStrategy.class); - static interface LatencySelector - { - abstract long select(LatencySupplier readLatencyHistogram, LatencySupplier writeLatencyHistogram); - } - - static interface LatencySelectorFactory - { - default LatencySelector constant(long latency) { return (read, write) -> latency; } - default LatencySelector read(double percentile) { return (read, write) -> read.get(percentile); } - default LatencySelector write(double percentile) { return (read, write) -> write.get(percentile); } - default LatencySelector maxReadWrite(double percentile) { return (read, write) -> max(read.get(percentile), write.get(percentile)); } - } - - static interface LatencyModifier - { - long modify(long latency, int attempts); - } - - static interface WaitRandomizer - { - abstract long wait(long min, long max, int attempts); - } - - static interface WaitRandomizerFactory + public enum Type { - default LongBinaryOperator uniformLongSupplier() { return (min, max) -> ThreadLocalRandom.current().nextLong(min, max); } // DO NOT USE METHOD HANDLES (want to fetch afresh each time) - default DoubleSupplier uniformDoubleSupplier() { return () -> ThreadLocalRandom.current().nextDouble(); } - - default WaitRandomizer uniform() { return new Uniform(uniformLongSupplier()); } - default WaitRandomizer exponential(double power) { return new Exponential(uniformLongSupplier(), uniformDoubleSupplier(), power); } - default WaitRandomizer quantizedExponential(double power) { return new QuantizedExponential(uniformLongSupplier(), uniformDoubleSupplier(), power); } - - static class Uniform implements WaitRandomizer - { - final LongBinaryOperator uniformLong; - - public Uniform(LongBinaryOperator uniformLong) - { - this.uniformLong = uniformLong; - } - - @Override - public long wait(long min, long max, int attempts) - { - return uniformLong.applyAsLong(min, max); - } - } - - static abstract class AbstractExponential implements WaitRandomizer - { - final LongBinaryOperator uniformLong; - final DoubleSupplier uniformDouble; - final double power; - - public AbstractExponential(LongBinaryOperator uniformLong, DoubleSupplier uniformDouble, double power) - { - this.uniformLong = uniformLong; - this.uniformDouble = uniformDouble; - this.power = power; - } - } - - static class Exponential extends AbstractExponential - { - public Exponential(LongBinaryOperator uniformLong, DoubleSupplier uniformDouble, double power) - { - super(uniformLong, uniformDouble, power); - } - - @Override - public long wait(long min, long max, int attempts) - { - if (attempts == 1) - return uniformLong.applyAsLong(min, max); + READ("Contended Paxos Read"), WRITE("Contended Paxos Write"), REPAIR("Contended Paxos Repair"); - double p = uniformDouble.getAsDouble(); - long delta = max - min; - delta *= Math.pow(p, power); - return max - delta; - } - } + final String traceTitle; + final String lowercase; - static class QuantizedExponential extends AbstractExponential + Type(String traceTitle) { - public QuantizedExponential(LongBinaryOperator uniformLong, DoubleSupplier uniformDouble, double power) - { - super(uniformLong, uniformDouble, power); - } - - @Override - public long wait(long min, long max, int attempts) - { - long quanta = (max - min) / attempts; - if (attempts == 1 || quanta == 0) - return uniformLong.applyAsLong(min, max); - - double p = uniformDouble.getAsDouble(); - int base = (int) (attempts * Math.pow(p, power)); - return max - ThreadLocalRandom.current().nextLong(quanta * base, quanta * (base + 1)); - } + this.traceTitle = traceTitle; + this.lowercase = toLowerCaseLocalized(name()); } } - static class SnapshotAndTime - { - final long validUntil; - final Snapshot snapshot; + static final Pattern LEGACY = Pattern.compile( + "(?0|[0-9]+[mu]?s)" + + "|((?0|[0-9]+[mu]?s) *<= *)?" + + "(?[^=]+)?" + + "( *<= *(?0|[0-9]+[mu]?s))?"); - SnapshotAndTime(long validUntil, Snapshot snapshot) - { - this.validUntil = validUntil; - this.snapshot = snapshot; - } - } + private static final String DEFAULT_WAIT_RANDOMIZER = "uniform"; + private static final String DEFAULT_MIN = "0"; + private static final String DEFAULT_MAX = "100ms"; + private static final String DEFAULT_SPREAD = "100ms"; + private static final String MAX_INT = "" + Integer.MAX_VALUE; + private static final LatencySourceFactory LATENCIES = new ReadWriteLatencySourceFactory(casReadMetrics, casWriteMetrics); - static class TimeLimitedLatencySupplier extends AtomicReference implements LatencySupplier + private static volatile ContentionStrategy current; + private static volatile ParsedStrategy currentParsed; + static { - final Supplier snapshotSupplier; - final long validForNanos; - - TimeLimitedLatencySupplier(Supplier snapshotSupplier, long time, TimeUnit units) - { - this.snapshotSupplier = snapshotSupplier; - this.validForNanos = units.toNanos(time); - } - - private Snapshot getSnapshot() - { - long now = nanoTime(); - - SnapshotAndTime cur = get(); - if (cur != null && cur.validUntil > now) - return cur.snapshot; - - Snapshot newSnapshot = snapshotSupplier.get(); - SnapshotAndTime next = new SnapshotAndTime(now + validForNanos, newSnapshot); - if (compareAndSet(cur, next)) - return next.snapshot; - - return accumulateAndGet(next, (a, b) -> a.validUntil > b.validUntil ? a : b).snapshot; - } - - @Override - public long get(double percentile) - { - return (long)getSnapshot().getValue(percentile); - } + String waitRandomizer = orElse(DatabaseDescriptor::getPaxosContentionWaitRandomizer, DEFAULT_WAIT_RANDOMIZER); + String min = orElse(DatabaseDescriptor::getPaxosContentionMinWait, DEFAULT_MIN); + String max = orElse(DatabaseDescriptor::getPaxosContentionMaxWait, DEFAULT_MAX); + String spread = orElse(DatabaseDescriptor::getPaxosContentionMinDelta, DEFAULT_SPREAD); + current = parse(waitRandomizer, min, max, spread, MAX_INT, MAX_INT); + currentParsed = new ParsedStrategy(waitRandomizer, min, max, spread, MAX_INT, MAX_INT, current); } - static class Bound - { - final long min, max, onFailure; - final LatencyModifier modifier; - final LatencySelector selector; - final LatencySupplier reads, writes; - - Bound(long min, long max, long onFailure, LatencyModifier modifier, LatencySelector selector) - { - Preconditions.checkArgument(min<=max, "min (%s) must be less than or equal to max (%s)", min, max); - this.min = min; - this.max = max; - this.onFailure = onFailure; - this.modifier = modifier; - this.selector = selector; - this.reads = new TimeLimitedLatencySupplier(casReadMetrics.latency::getSnapshot, 10L, SECONDS); - this.writes = new TimeLimitedLatencySupplier(casWriteMetrics.latency::getSnapshot, 10L, SECONDS); - } - - long get(int attempts) - { - try - { - long base = selector.select(reads, writes); - return max(min, min(max, modifier.modify(base, attempts))); - } - catch (Throwable t) - { - NoSpamLogger.getLogger(logger, 1L, MINUTES).info("", t); - return onFailure; - } - } - - public String toString() - { - return "Bound{" + - "min=" + min + - ", max=" + max + - ", onFailure=" + onFailure + - ", modifier=" + modifier + - ", selector=" + selector + - '}'; - } - } - - final WaitRandomizer waitRandomizer; - final Bound min, max, minDelta; + final @Nullable LegacyWait spread; final int traceAfterAttempts; - public ContentionStrategy(String waitRandomizer, String min, String max, String minDelta, int traceAfterAttempts) + public ContentionStrategy(WaitRandomizer waitRandomizer, LegacyWait min, LegacyWait max, LegacyWait spread, int retries, int traceAfterAttempts) { - this.waitRandomizer = parseWaitRandomizer(waitRandomizer); - this.min = parseBound(min, true); - this.max = parseBound(max, false); - this.minDelta = parseBound(minDelta, true); + super(waitRandomizer, min.min, min, min.max, max, max.max, retries); this.traceAfterAttempts = traceAfterAttempts; + this.spread = spread; } - public enum Type + public long computeWait(int attempt, TimeUnit units) { - READ("Contended Paxos Read"), WRITE("Contended Paxos Write"), REPAIR("Contended Paxos Repair"); + if (attempt > maxAttempts) + return -1; - final String traceTitle; - final String lowercase; + long minWaitMicros = min.getMicros(attempt); + long maxWaitMicros = max.getMicros(attempt); + long spreadMicros = spread == null ? 0 : spread.getMicros(attempt); - Type(String traceTitle) + if (minWaitMicros + spreadMicros >= maxWaitMicros) { - this.traceTitle = traceTitle; - this.lowercase = toLowerCaseLocalized(name()); + if (spreadMicros == 0) + return units.convert(maxWaitMicros, MICROSECONDS); + + if (maxWaitMicros < minWaitMicros) + maxWaitMicros = minWaitMicros; + long newMaxWaitMicros = minWaitMicros + spreadMicros; + if (newMaxWaitMicros > maxMaxMicros) + { + newMaxWaitMicros = maxMaxMicros; + minWaitMicros = max(this.minMinMicros, maxWaitMicros - spreadMicros); + } + maxWaitMicros = newMaxWaitMicros; + if (minWaitMicros >= maxWaitMicros) + return minWaitMicros; } + + return units.convert(waitRandomizer.wait(minWaitMicros, maxWaitMicros, attempt), MICROSECONDS); } long computeWaitUntilForContention(int attempts, TableMetadata table, DecoratedKey partitionKey, ConsistencyLevel consistency, Type type) @@ -395,25 +165,10 @@ long computeWaitUntilForContention(int attempts, TableMetadata table, DecoratedK Tracing.instance.getSessionId()); } - long minWaitMicros = min.get(attempts); - long maxWaitMicros = max.get(attempts); - long minDeltaMicros = minDelta.get(attempts); - - if (minWaitMicros + minDeltaMicros > maxWaitMicros) - { - maxWaitMicros = minWaitMicros + minDeltaMicros; - if (maxWaitMicros > this.max.max) - { - maxWaitMicros = this.max.max; - minWaitMicros = max(this.min.min, min(this.min.max, maxWaitMicros - minDeltaMicros)); - } - } - - long wait = waitRandomizer.wait(minWaitMicros, maxWaitMicros, attempts); - return nanoTime() + MICROSECONDS.toNanos(wait); + return super.computeWaitUntil(attempts); } - boolean doWaitForContention(long deadline, int attempts, TableMetadata table, DecoratedKey partitionKey, ConsistencyLevel consistency, Type type) + public boolean doWaitForContention(long deadline, int attempts, TableMetadata table, DecoratedKey partitionKey, ConsistencyLevel consistency, Type type) { long until = computeWaitUntilForContention(attempts, table, partitionKey, consistency, type); if (until >= deadline) @@ -441,212 +196,152 @@ static long waitUntilForContention(int attempts, TableMetadata table, DecoratedK return current.computeWaitUntilForContention(attempts, table, partitionKey, consistency, type); } - static class ParsedStrategy + public static class ParsedStrategy { - final String waitRandomizer, min, max, minDelta; - final ContentionStrategy strategy; + public final String waitRandomizer, min, max, spread, retries, trace; + public final ContentionStrategy strategy; - ParsedStrategy(String waitRandomizer, String min, String max, String minDelta, ContentionStrategy strategy) + protected ParsedStrategy(String waitRandomizer, String min, String max, String spread, String retries, String trace, ContentionStrategy strategy) { this.waitRandomizer = waitRandomizer; this.min = min; this.max = max; - this.minDelta = minDelta; + this.spread = spread; + this.retries = retries; + this.trace = trace; this.strategy = strategy; } + + public String toString() + { + return "min=" + min + ",max=" + max + ",spread=" + spread + ",retries=" + retries + + ",random=" + waitRandomizer + ",trace=" + current.traceAfterAttempts; + } } @VisibleForTesting - static ParsedStrategy parseStrategy(String spec) + public static ParsedStrategy parseStrategy(String spec, ParsedStrategy defaultStrategy) { String[] args = spec.split(","); String waitRandomizer = find(args, "random"); String min = find(args, "min"); String max = find(args, "max"); - String minDelta = find(args, "delta"); + String spread = find(args, "spread"); String trace = find(args, "trace"); + if (spread == null) + spread = find(args, "delta"); + String retries = find(args, "retries"); - if (waitRandomizer == null) waitRandomizer = defaultWaitRandomizer(); - if (min == null) min = defaultMinWait(); - if (max == null) max = defaultMaxWait(); - if (minDelta == null) minDelta = defaultMinDelta(); - int traceAfterAttempts = trace == null ? current.traceAfterAttempts: Integer.parseInt(trace); + if (waitRandomizer == null) waitRandomizer = defaultStrategy.waitRandomizer; + if (min == null) min = defaultStrategy.min; + if (max == null) max = defaultStrategy.max; + if (spread == null) spread = defaultStrategy.spread; + if (retries == null) retries = defaultStrategy.retries; + if (trace == null) trace = defaultStrategy.trace; - ContentionStrategy strategy = new ContentionStrategy(waitRandomizer, min, max, minDelta, traceAfterAttempts); - return new ParsedStrategy(waitRandomizer, min, max, minDelta, strategy); + ContentionStrategy strategy = parse(waitRandomizer, min, max, spread, retries, trace); + return new ParsedStrategy(waitRandomizer, min, max, spread, retries, trace, strategy); } - - public static void setStrategy(String spec) + private static ContentionStrategy parse(String waitRandomizerString, String minString, String maxString, String spreadString, String retriesString, String traceString) { - ParsedStrategy parsed = parseStrategy(spec); - current = parsed.strategy; - setPaxosContentionWaitRandomizer(parsed.waitRandomizer); - setPaxosContentionMinWait(parsed.min); - setPaxosContentionMaxWait(parsed.max); - setPaxosContentionMinDelta(parsed.minDelta); - } - - public static String getStrategySpec() - { - return "min=" + defaultMinWait() - + ",max=" + defaultMaxWait() - + ",delta=" + defaultMinDelta() - + ",random=" + defaultWaitRandomizer() - + ",trace=" + current.traceAfterAttempts; + return new ContentionStrategy(parseWaitRandomizer(waitRandomizerString), + parseLegacy(minString, true), parseLegacy(maxString, false), parseLegacy(spreadString, false), + Integer.parseInt(retriesString), Integer.parseInt(traceString)); } private static String find(String[] args, String param) { return stream(args).filter(s -> s.startsWith(param + '=')) - .map(s -> s.substring(param.length() + 1)) - .findFirst().orElse(null); + .map(s -> s.substring(param.length() + 1)) + .findFirst().orElse(null); } - private static LatencySelector parseLatencySelector(Matcher m, LatencySelectorFactory selectors) - { - String perc = m.group("perc"); - if (perc == null) - return selectors.constant(parseInMicros(m.group("constbase"))); - - double percentile = parseDouble("0." + perc); - String rw = m.group("rw"); - if (rw.length() == 2) - return selectors.maxReadWrite(percentile); - else if ("r".equals(rw)) - return selectors.read(percentile); - else - return selectors.write(percentile); - } - private static LatencyModifier parseLatencyModifier(Matcher m, LatencyModifierFactory modifiers) + public static synchronized void setStrategy(String spec) { - String mod = m.group("mod"); - if (mod == null) - return modifiers.identity(); - - double modifier = parseDouble(mod); - - String modkind = m.group("modkind"); - if (modkind == null) - return modifiers.multiply(modifier); - - if (modkind.startsWith("*")) - return modifiers.multiplyByAttempts(modifier); - else if (modkind.startsWith("^")) - return modifiers.multiplyByAttemptsExp(modifier); - else - throw new IllegalArgumentException("Unrecognised attempt modifier: " + modkind); - } - - static long saturatedCast(double v) - { - if (v > Long.MAX_VALUE) - return Long.MAX_VALUE; - return (long) v; + ParsedStrategy parsed = parseStrategy(spec, currentParsed); + currentParsed = parsed; + current = parsed.strategy; + setPaxosContentionWaitRandomizer(parsed.waitRandomizer); + setPaxosContentionMinWait(parsed.min); + setPaxosContentionMaxWait(parsed.max); + setPaxosContentionMinDelta(parsed.spread); } - static WaitRandomizer parseWaitRandomizer(String input) + public static String getStrategySpec() { - return parseWaitRandomizer(input, randomizers); + return currentParsed.toString(); } - static WaitRandomizer parseWaitRandomizer(String input, WaitRandomizerFactory randomizers) + private static String orElse(Supplier get, String orElse) { - Matcher m = RANDOMIZER.matcher(input); - if (!m.matches()) - throw new IllegalArgumentException(input + " does not match" + RANDOMIZER); - - String exp; - exp = m.group("exp"); - if (exp != null) - return randomizers.exponential(Double.parseDouble(exp)); - exp = m.group("qexp"); - if (exp != null) - return randomizers.quantizedExponential(Double.parseDouble(exp)); - return randomizers.uniform(); + String result = get.get(); + return result != null ? result : orElse; } - static Bound parseBound(String input, boolean isMin) + @VisibleForTesting + static LegacyWait parseLegacy(String spec, boolean isMin) { - return parseBound(input, isMin, selectors, modifiers); + long defaultMaxMicros = getRpcTimeout(MICROSECONDS); + return parseLegacy(spec, 0, defaultMaxMicros, isMin ? 0 : defaultMaxMicros, LATENCIES); } - @VisibleForTesting - static Bound parseBound(String input, boolean isMin, LatencySelectorFactory selectors, LatencyModifierFactory modifiers) + public static LegacyWait parseLegacy(String input, long defaultMinMicros, long defaultMaxMicros, long onFailure, LatencySourceFactory latencies) { - Matcher m = BOUND.matcher(input); + Matcher m = LEGACY.matcher(input); if (!m.matches()) - throw new IllegalArgumentException(input + " does not match " + BOUND); + throw new IllegalArgumentException(input + " does not match " + LEGACY); String maybeConst = m.group("const"); if (maybeConst != null) { long v = parseInMicros(maybeConst); - return new Bound(v, v, v, modifiers.identity(), selectors.constant(v)); + return new LegacyWait(v, v, v, new Wait.Constant(v)); } - long min = parseInMicros(m.group("min"), 0); - long max = parseInMicros(m.group("max"), maxQueryTimeoutMicros() / 2); - return new Bound(min, max, isMin ? min : max, parseLatencyModifier(m, modifiers), parseLatencySelector(m, selectors)); - } - - private static long parseInMicros(String input, long orElse) - { - if (input == null) - return orElse; - - return parseInMicros(input); + long min = parseInMicros(m.group("min"), defaultMinMicros); + long max = parseInMicros(m.group("max"), defaultMaxMicros); + return new LegacyWait(min, max, onFailure, TimeoutStrategy.parseWait(m.group("delegate"), latencies)); } - private static long parseInMicros(String input) - { - Matcher m = TIME.matcher(input); - if (!m.matches()) - throw new IllegalArgumentException(input + " does not match " + TIME); - - String text; - if (null != (text = m.group(1))) - return parseInt(text) * 1000; - else if (null != (text = m.group(2))) - return parseInt(text); - else - return 0; - } - @VisibleForTesting - static String defaultWaitRandomizer() + private static class LegacyWait implements Wait { - return orElse(DatabaseDescriptor::getPaxosContentionWaitRandomizer, DEFAULT_WAIT_RANDOMIZER); - } - - @VisibleForTesting - static String defaultMinWait() - { - return orElse(DatabaseDescriptor::getPaxosContentionMinWait, DEFAULT_MIN); - } + final long min, max, onFailure; + final Wait delegate; - @VisibleForTesting - static String defaultMaxWait() - { - return orElse(DatabaseDescriptor::getPaxosContentionMaxWait, DEFAULT_MAX); - } + LegacyWait(long min, long max, long onFailure, Wait delegate) + { + Preconditions.checkArgument(min <= max, "min (%s) must be less than or equal to max (%s)", min, max); + this.min = min; + this.max = max; + this.onFailure = onFailure; + this.delegate = delegate; + } - @VisibleForTesting - static String defaultMinDelta() - { - return orElse(DatabaseDescriptor::getPaxosContentionMinDelta, DEFAULT_MIN_DELTA); - } + public long getMicros(int attempts) + { + try + { + return max(min, min(max, delegate.getMicros(attempts))); + } + catch (Throwable t) + { + NoSpamLogger.getLogger(logger, 1L, MINUTES).info("", t); + return onFailure; + } + } - @VisibleForTesting - static long maxQueryTimeoutMicros() - { - return max(max(getCasContentionTimeout(MICROSECONDS), getWriteRpcTimeout(MICROSECONDS)), getReadRpcTimeout(MICROSECONDS)); + public String toString() + { + return "Bound{" + + "min=" + min + + ", max=" + max + + ", onFailure=" + onFailure + + ", delegate=" + delegate + + '}'; + } } - private static String orElse(Supplier get, String orElse) - { - String result = get.get(); - return result != null ? result : orElse; - } } diff --git a/src/java/org/apache/cassandra/service/paxos/Paxos.java b/src/java/org/apache/cassandra/service/paxos/Paxos.java index 06f90907d502..24a0ee7317f0 100644 --- a/src/java/org/apache/cassandra/service/paxos/Paxos.java +++ b/src/java/org/apache/cassandra/service/paxos/Paxos.java @@ -28,32 +28,15 @@ import java.util.function.Function; import java.util.function.Predicate; import java.util.function.Supplier; - import javax.annotation.Nullable; import com.google.common.base.Preconditions; import com.google.common.collect.Iterators; import com.google.common.collect.Maps; - import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.codahale.metrics.Meter; -import org.apache.cassandra.exceptions.CasWriteTimeoutException; -import org.apache.cassandra.exceptions.ExceptionCode; -import org.apache.cassandra.gms.FailureDetector; -import org.apache.cassandra.locator.AbstractReplicationStrategy; -import org.apache.cassandra.locator.EndpointsForToken; -import org.apache.cassandra.locator.InOurDc; -import org.apache.cassandra.locator.MetaStrategy; -import org.apache.cassandra.locator.Replica; -import org.apache.cassandra.locator.ReplicaLayout; -import org.apache.cassandra.locator.ReplicaLayout.ForTokenWrite; -import org.apache.cassandra.locator.ReplicaPlan.ForRead; -import org.apache.cassandra.metrics.ClientRequestSizeMetrics; -import org.apache.cassandra.schema.KeyspaceMetadata; -import org.apache.cassandra.schema.SchemaConstants; -import org.apache.cassandra.schema.TableMetadata; import org.apache.cassandra.config.Config; import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.db.ConsistencyLevel; @@ -67,11 +50,14 @@ import org.apache.cassandra.db.partitions.PartitionUpdate; import org.apache.cassandra.db.rows.RowIterator; import org.apache.cassandra.dht.Token; +import org.apache.cassandra.exceptions.CasWriteTimeoutException; +import org.apache.cassandra.exceptions.ExceptionCode; import org.apache.cassandra.exceptions.InvalidRequestException; import org.apache.cassandra.exceptions.IsBootstrappingException; import org.apache.cassandra.exceptions.ReadFailureException; import org.apache.cassandra.exceptions.ReadTimeoutException; import org.apache.cassandra.exceptions.RequestExecutionException; +import org.apache.cassandra.exceptions.RequestFailure; import org.apache.cassandra.exceptions.RequestFailureException; import org.apache.cassandra.exceptions.RequestFailureReason; import org.apache.cassandra.exceptions.RequestTimeoutException; @@ -79,20 +65,38 @@ import org.apache.cassandra.exceptions.WriteFailureException; import org.apache.cassandra.exceptions.WriteTimeoutException; import org.apache.cassandra.gms.EndpointState; +import org.apache.cassandra.gms.FailureDetector; import org.apache.cassandra.gms.Gossiper; import org.apache.cassandra.io.IVersionedSerializer; import org.apache.cassandra.io.util.DataInputPlus; import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.locator.AbstractReplicationStrategy; +import org.apache.cassandra.locator.EndpointsForToken; +import org.apache.cassandra.locator.InOurDc; import org.apache.cassandra.locator.InetAddressAndPort; +import org.apache.cassandra.locator.MetaStrategy; +import org.apache.cassandra.locator.Replica; +import org.apache.cassandra.locator.ReplicaLayout; +import org.apache.cassandra.locator.ReplicaLayout.ForTokenWrite; +import org.apache.cassandra.locator.ReplicaPlan.ForRead; import org.apache.cassandra.metrics.ClientRequestMetrics; +import org.apache.cassandra.metrics.ClientRequestSizeMetrics; +import org.apache.cassandra.schema.KeyspaceMetadata; +import org.apache.cassandra.schema.SchemaConstants; +import org.apache.cassandra.schema.TableMetadata; import org.apache.cassandra.service.CASRequest; import org.apache.cassandra.service.ClientState; import org.apache.cassandra.service.FailureRecordingCallback.AsMap; +import org.apache.cassandra.service.consensus.migration.ConsensusRequestRouter; import org.apache.cassandra.service.paxos.Commit.Proposal; +import org.apache.cassandra.service.paxos.PaxosPrepare.FoundIncompleteAccepted; +import org.apache.cassandra.service.paxos.PaxosPrepare.FoundIncompleteCommitted; +import org.apache.cassandra.service.paxos.PaxosPropose.Superseded; import org.apache.cassandra.service.paxos.cleanup.PaxosRepairState; -import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.service.reads.DataResolver; +import org.apache.cassandra.service.reads.ReadCoordinator; import org.apache.cassandra.service.reads.repair.NoopReadRepair; +import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.tcm.Epoch; import org.apache.cassandra.tcm.membership.NodeId; import org.apache.cassandra.tcm.ownership.DataPlacement; @@ -100,22 +104,24 @@ import org.apache.cassandra.transport.Dispatcher; import org.apache.cassandra.triggers.TriggerExecutor; import org.apache.cassandra.utils.CassandraVersion; -import org.apache.cassandra.utils.CollectionSerializer; import org.apache.cassandra.utils.FBUtilities; -import org.apache.cassandra.service.paxos.PaxosPrepare.FoundIncompleteAccepted; -import org.apache.cassandra.service.paxos.PaxosPrepare.FoundIncompleteCommitted; import org.apache.cassandra.utils.NoSpamLogger; +import static com.google.common.base.Preconditions.checkState; import static java.util.Collections.emptyMap; import static java.util.concurrent.TimeUnit.NANOSECONDS; import static java.util.concurrent.TimeUnit.SECONDS; import static org.apache.cassandra.config.CassandraRelevantProperties.PAXOS_LOG_TTL_LINEARIZABILITY_VIOLATIONS; import static org.apache.cassandra.config.CassandraRelevantProperties.PAXOS_MODERN_RELEASE; import static org.apache.cassandra.config.Config.PaxosVariant.v2_without_linearizable_reads_or_rejected_writes; +import static org.apache.cassandra.config.DatabaseDescriptor.getCasContentionTimeout; +import static org.apache.cassandra.config.DatabaseDescriptor.getWriteRpcTimeout; +import static org.apache.cassandra.db.ConsistencyLevel.LOCAL_QUORUM; +import static org.apache.cassandra.db.ConsistencyLevel.LOCAL_SERIAL; +import static org.apache.cassandra.db.ConsistencyLevel.QUORUM; +import static org.apache.cassandra.db.ConsistencyLevel.SERIAL; import static org.apache.cassandra.db.Keyspace.openAndGetStore; import static org.apache.cassandra.exceptions.RequestFailureReason.TIMEOUT; -import static org.apache.cassandra.config.DatabaseDescriptor.*; -import static org.apache.cassandra.db.ConsistencyLevel.*; import static org.apache.cassandra.locator.InetAddressAndPort.Serializer.inetAddressAndPortSerializer; import static org.apache.cassandra.locator.ReplicaLayout.forTokenWriteLiveAndDown; import static org.apache.cassandra.metrics.ClientRequestsMetricsHolder.casReadMetrics; @@ -123,19 +129,26 @@ import static org.apache.cassandra.metrics.ClientRequestsMetricsHolder.readMetrics; import static org.apache.cassandra.metrics.ClientRequestsMetricsHolder.readMetricsMap; import static org.apache.cassandra.metrics.ClientRequestsMetricsHolder.writeMetricsMap; +import static org.apache.cassandra.service.StorageProxy.ConsensusAttemptResult; +import static org.apache.cassandra.service.StorageProxy.ConsensusAttemptResult.RETRY_NEW_PROTOCOL; +import static org.apache.cassandra.service.StorageProxy.ConsensusAttemptResult.casResult; +import static org.apache.cassandra.service.StorageProxy.ConsensusAttemptResult.serialReadResult; import static org.apache.cassandra.service.paxos.Ballot.Flag.GLOBAL; import static org.apache.cassandra.service.paxos.Ballot.Flag.LOCAL; import static org.apache.cassandra.service.paxos.BallotGenerator.Global.nextBallot; import static org.apache.cassandra.service.paxos.BallotGenerator.Global.staleBallot; -import static org.apache.cassandra.service.paxos.ContentionStrategy.*; import static org.apache.cassandra.service.paxos.ContentionStrategy.Type.READ; import static org.apache.cassandra.service.paxos.ContentionStrategy.Type.WRITE; +import static org.apache.cassandra.service.paxos.ContentionStrategy.waitForContention; import static org.apache.cassandra.service.paxos.PaxosCommit.commit; import static org.apache.cassandra.service.paxos.PaxosCommitAndPrepare.commitAndPrepare; +import static org.apache.cassandra.service.paxos.PaxosPrepare.Status.Outcome.MAYBE_FAILURE; import static org.apache.cassandra.service.paxos.PaxosPrepare.prepare; import static org.apache.cassandra.service.paxos.PaxosPropose.propose; import static org.apache.cassandra.utils.Clock.Global.nanoTime; -import static org.apache.cassandra.utils.CollectionSerializer.newHashSet; +import static org.apache.cassandra.utils.CollectionSerializers.deserializeSet; +import static org.apache.cassandra.utils.CollectionSerializers.serializeCollection; +import static org.apache.cassandra.utils.CollectionSerializers.serializedCollectionSize; import static org.apache.cassandra.utils.FBUtilities.getBroadcastAddressAndPort; import static org.apache.cassandra.utils.NoSpamLogger.Level.WARN; @@ -307,21 +320,21 @@ static class Serializer implements IVersionedSerializer { public void serialize(Electorate electorate, DataOutputPlus out, int version) throws IOException { - CollectionSerializer.serializeCollection(inetAddressAndPortSerializer, electorate.natural, out, version); - CollectionSerializer.serializeCollection(inetAddressAndPortSerializer, electorate.pending, out, version); + serializeCollection(electorate.natural, out, version, inetAddressAndPortSerializer); + serializeCollection(electorate.pending, out, version, inetAddressAndPortSerializer); } public Electorate deserialize(DataInputPlus in, int version) throws IOException { - Set endpoints = CollectionSerializer.deserializeCollection(inetAddressAndPortSerializer, newHashSet(), in, version); - Set pending = CollectionSerializer.deserializeCollection(inetAddressAndPortSerializer, newHashSet(), in, version); + Set endpoints = deserializeSet(in, version, inetAddressAndPortSerializer); + Set pending = deserializeSet(in, version, inetAddressAndPortSerializer); return new Electorate(endpoints, pending); } public long serializedSize(Electorate electorate, int version) { - return CollectionSerializer.serializedSizeCollection(inetAddressAndPortSerializer, electorate.natural, version) + - CollectionSerializer.serializedSizeCollection(inetAddressAndPortSerializer, electorate.pending, version); + return serializedCollectionSize(electorate.natural, version, inetAddressAndPortSerializer) + + serializedCollectionSize(electorate.pending, version, inetAddressAndPortSerializer); } } @@ -415,6 +428,12 @@ public EndpointsForToken readCandidates() return electorateNatural; } + @Override + public EndpointsForToken liveAndDown() + { + return all; + } + @Override public boolean stillAppliesTo(ClusterMetadata newMetadata) { @@ -432,7 +451,7 @@ public void collectSuccess(InetAddressAndPort inetAddressAndPort) } @Override - public void collectFailure(InetAddressAndPort inetAddressAndPort, RequestFailureReason t) + public void collectFailure(InetAddressAndPort inetAddressAndPort, RequestFailure t) { } @@ -677,7 +696,7 @@ public interface Async * Any successful prepare phase yielding a read that rejects the condition must be followed by the proposal of * an empty update, to ensure the evaluation of the condition is linearized with respect to other reads and writes. * - * @param key the row key for the row to CAS + * @param partitionKey the row key for the row to CAS * @param request the conditions for the CAS to apply as well as the update to perform if the conditions hold. * @param consistencyForConsensus the consistency for the paxos prepare and propose round. This can only be either SERIAL or LOCAL_SERIAL. * @param consistencyForCommit the consistency for write done during the commit phase. This can be anything, except SERIAL or LOCAL_SERIAL. @@ -685,48 +704,22 @@ public interface Async * @return null if the operation succeeds in updating the row, or the current values corresponding to conditions. * (since, if the CAS doesn't succeed, it means the current value do not match the conditions). */ - public static RowIterator cas(DecoratedKey key, - CASRequest request, - ConsistencyLevel consistencyForConsensus, - ConsistencyLevel consistencyForCommit, - ClientState clientState) - throws UnavailableException, IsBootstrappingException, RequestFailureException, RequestTimeoutException, InvalidRequestException - { - final long start = nanoTime(); - final long proposeDeadline = start + getCasContentionTimeout(NANOSECONDS); - final long commitDeadline = Math.max(proposeDeadline, start + getWriteRpcTimeout(NANOSECONDS)); - return cas(key, request, consistencyForConsensus, consistencyForCommit, clientState, start, proposeDeadline, commitDeadline); - } - public static RowIterator cas(DecoratedKey key, - CASRequest request, - ConsistencyLevel consistencyForConsensus, - ConsistencyLevel consistencyForCommit, - ClientState clientState, - long proposeDeadline, - long commitDeadline - ) - throws UnavailableException, IsBootstrappingException, RequestFailureException, RequestTimeoutException, InvalidRequestException - { - return cas(key, request, consistencyForConsensus, consistencyForCommit, clientState, nanoTime(), proposeDeadline, commitDeadline); - } - private static RowIterator cas(DecoratedKey partitionKey, - CASRequest request, - ConsistencyLevel consistencyForConsensus, - ConsistencyLevel consistencyForCommit, - ClientState clientState, - long start, - long proposeDeadline, - long commitDeadline - ) + public static ConsensusAttemptResult cas(DecoratedKey partitionKey, + CASRequest request, + ConsistencyLevel consistencyForConsensus, + ConsistencyLevel consistencyForCommit, + ClientState clientState, + Dispatcher.RequestTime requestTime) throws UnavailableException, IsBootstrappingException, RequestFailureException, RequestTimeoutException, InvalidRequestException { + final long proposeDeadline = requestTime.startedAtNanos() + getCasContentionTimeout(NANOSECONDS); + final long commitDeadline = Math.max(proposeDeadline, requestTime.startedAtNanos() + getWriteRpcTimeout(NANOSECONDS)); SinglePartitionReadCommand readCommand = request.readCommand(FBUtilities.nowInSeconds()); TableMetadata metadata = readCommand.metadata(); consistencyForConsensus.validateForCas(); consistencyForCommit.validateForCasCommit(Keyspace.open(metadata.keyspace).getReplicationStrategy()); - Ballot minimumBallot = null; int failedAttemptsDueToContention = 0; try (PaxosOperationLock lock = PaxosState.lock(partitionKey, metadata, proposeDeadline, consistencyForConsensus, true)) { @@ -737,7 +730,14 @@ private static RowIterator cas(DecoratedKey partitionKey, Tracing.trace("Reading existing values for CAS precondition"); BeginResult begin = begin(proposeDeadline, readCommand, consistencyForConsensus, - true, minimumBallot, failedAttemptsDueToContention, request.requestTime()); + true, null, failedAttemptsDueToContention, request.requestTime()); + + if (begin.retryWithNewConsenusProtocol) + { + casWriteMetrics.beginMigrationRejects.mark(); + return RETRY_NEW_PROTOCOL; + } + Ballot ballot = begin.ballot; Participants participants = begin.participants; failedAttemptsDueToContention = begin.failedAttemptsDueToContention; @@ -756,7 +756,7 @@ private static RowIterator cas(DecoratedKey partitionKey, { Tracing.trace("CAS precondition rejected", current); casWriteMetrics.conditionNotMet.inc(); - return current.rowIterator(); + return casResult(current.rowIterator(false)); } // If we failed to meet our condition, it does not mean we can do nothing: if we do not propose @@ -772,7 +772,7 @@ private static RowIterator cas(DecoratedKey partitionKey, if (begin.isLinearizableRead) { Tracing.trace("CAS precondition does not match current values {}; read is already linearizable; aborting", current); - return conditionNotMet(current); + return casResult(conditionNotMet(current)); } Tracing.trace("CAS precondition does not match current values {}; proposing empty update", current); @@ -806,7 +806,7 @@ else if (begin.isPromised) continue; } - PaxosPropose.Status propose = propose(proposal, participants, conditionMet).awaitUntil(proposeDeadline); + PaxosPropose.Status propose = propose(proposal, participants, conditionMet, false).awaitUntil(proposeDeadline); switch (propose.outcome) { default: throw new IllegalStateException(); @@ -817,7 +817,7 @@ else if (begin.isPromised) case SUCCESS: { if (!conditionMet) - return conditionNotMet(current); + return casResult(conditionNotMet(current)); // no need to commit a no-op; either it // 1) reached a majority, in which case it was agreed, had no effect and we can do nothing; or @@ -830,7 +830,8 @@ else if (begin.isPromised) case SUPERSEDED: { - switch (propose.superseded().hadSideEffects) + Superseded superseded = propose.superseded(); + switch (superseded.hadSideEffects) { default: throw new IllegalStateException(); @@ -842,7 +843,12 @@ else if (begin.isPromised) .markAndThrowAsTimeoutOrFailure(true, consistencyForConsensus, failedAttemptsDueToContention); case NO: - minimumBallot = propose.superseded().by; + // Shouldn't retry on this protocol + if (superseded.needsConsensusMigration) + { + casWriteMetrics.acceptMigrationRejects.mark(); + return RETRY_NEW_PROTOCOL; + } // We have been superseded without our proposal being accepted by anyone, so we can safely retry Tracing.trace("Paxos proposal not accepted (pre-empted by a higher ballot)"); if (!waitForContention(proposeDeadline, ++failedAttemptsDueToContention, metadata, partitionKey, consistencyForConsensus, WRITE)) @@ -860,12 +866,12 @@ else if (begin.isPromised) throw result.maybeFailure().markAndThrowAsTimeoutOrFailure(true, consistencyForCommit, failedAttemptsDueToContention); } Tracing.trace("CAS successful"); - return null; + return casResult((RowIterator)null); } finally { - final long latency = nanoTime() - start; + final long latency = nanoTime() - requestTime.startedAtNanos(); if (failedAttemptsDueToContention > 0) { @@ -883,28 +889,16 @@ private static RowIterator conditionNotMet(FilteredPartition read) { Tracing.trace("CAS precondition rejected", read); casWriteMetrics.conditionNotMet.inc(); - return read.rowIterator(); - } - - public static PartitionIterator read(SinglePartitionReadCommand.Group group, ConsistencyLevel consistencyForConsensus, Dispatcher.RequestTime requestTime) - throws InvalidRequestException, UnavailableException, ReadFailureException, ReadTimeoutException - { - long deadline = requestTime.computeDeadline(DatabaseDescriptor.getReadRpcTimeout(NANOSECONDS)); - return read(group, consistencyForConsensus, requestTime, deadline); + return read.rowIterator(false); } - public static PartitionIterator read(SinglePartitionReadCommand.Group group, ConsistencyLevel consistencyForConsensus, long deadline) - throws InvalidRequestException, UnavailableException, ReadFailureException, ReadTimeoutException - { - return read(group, consistencyForConsensus, Dispatcher.RequestTime.forImmediateExecution(), deadline); - } - - private static PartitionIterator read(SinglePartitionReadCommand.Group group, ConsistencyLevel consistencyForConsensus, Dispatcher.RequestTime requestTime, long deadline) + public static ConsensusAttemptResult read(SinglePartitionReadCommand.Group group, ConsistencyLevel consistencyForConsensus, Dispatcher.RequestTime requestTime) throws InvalidRequestException, UnavailableException, ReadFailureException, ReadTimeoutException { long start = nanoTime(); if (group.queries.size() > 1) throw new InvalidRequestException("SERIAL/LOCAL_SERIAL consistency may only be requested for one partition at a time"); + long deadline = requestTime.computeDeadline(DatabaseDescriptor.getReadRpcTimeout(NANOSECONDS)); int failedAttemptsDueToContention = 0; Ballot minimumBallot = null; @@ -917,6 +911,12 @@ private static PartitionIterator read(SinglePartitionReadCommand.Group group, Co final BeginResult begin = begin(deadline, read, consistencyForConsensus, false, minimumBallot, failedAttemptsDueToContention, requestTime); failedAttemptsDueToContention = begin.failedAttemptsDueToContention; + if (begin.retryWithNewConsenusProtocol) + { + casReadMetrics.beginMigrationRejects.mark(); + return RETRY_NEW_PROTOCOL; + } + switch (PAXOS_VARIANT) { default: @@ -925,16 +925,16 @@ private static PartitionIterator read(SinglePartitionReadCommand.Group group, Co case v2_without_linearizable_reads_or_rejected_writes: case v2_without_linearizable_reads: - return begin.readResponse; + return serialReadResult(begin.readResponse); case v2: // no need to submit an empty proposal, as the promise will be treated as complete for future optimistic reads if (begin.isLinearizableRead) - return begin.readResponse; + return serialReadResult(begin.readResponse); } Proposal proposal = Proposal.empty(begin.ballot, read.partitionKey(), read.metadata()); - PaxosPropose.Status propose = propose(proposal, begin.participants, false).awaitUntil(deadline); + PaxosPropose.Status propose = propose(proposal, begin.participants, true, false).awaitUntil(deadline); switch (propose.outcome) { default: throw new IllegalStateException(); @@ -943,10 +943,21 @@ private static PartitionIterator read(SinglePartitionReadCommand.Group group, Co throw propose.maybeFailure().markAndThrowAsTimeoutOrFailure(false, consistencyForConsensus, failedAttemptsDueToContention); case SUCCESS: - return begin.readResponse; + return serialReadResult(begin.readResponse); case SUPERSEDED: - switch (propose.superseded().hadSideEffects) + Superseded superseded = propose.superseded(); + // For consensus migration we are going to bail out earlier if migration is needed + // otherwise it it will fail every single query that races with migration being started + // during the propose step. Necessary because of CASSANDRA-18276 + // Shouldn't retry again on this protocol + if (superseded.needsConsensusMigration) + { + casReadMetrics.acceptMigrationRejects.mark(); + return RETRY_NEW_PROTOCOL; + } + // TODO https://issues.apache.org/jira/browse/CASSANDRA-18276 side effects shouldn't matter for reads + switch (superseded.hadSideEffects) { default: throw new IllegalStateException(); @@ -964,6 +975,7 @@ private static PartitionIterator read(SinglePartitionReadCommand.Group group, Co if (!waitForContention(deadline, ++failedAttemptsDueToContention, group.metadata(), group.queries.get(0).partitionKey(), consistencyForConsensus, READ)) throw MaybeFailure.noResponses(begin.participants).markAndThrowAsTimeoutOrFailure(true, consistencyForConsensus, failedAttemptsDueToContention); } + break; } } } @@ -994,9 +1006,11 @@ static class BeginResult final boolean isPromised; final Ballot retryWithAtLeast; - public BeginResult(Ballot ballot, Participants participants, int failedAttemptsDueToContention, PartitionIterator readResponse, boolean isLinearizableRead, boolean isPromised, Ballot retryWithAtLeast) + final boolean retryWithNewConsenusProtocol; + + public BeginResult(Ballot ballot, Participants participants, int failedAttemptsDueToContention, PartitionIterator readResponse, boolean isLinearizableRead, boolean isPromised, Ballot retryWithAtLeast, boolean retryWithNewConsenusProtocol) { - assert isPromised || isLinearizableRead; + assert isPromised || isLinearizableRead || retryWithNewConsenusProtocol; this.ballot = ballot; this.participants = participants; this.failedAttemptsDueToContention = failedAttemptsDueToContention; @@ -1004,6 +1018,12 @@ public BeginResult(Ballot ballot, Participants participants, int failedAttemptsD this.isLinearizableRead = isLinearizableRead; this.isPromised = isPromised; this.retryWithAtLeast = retryWithAtLeast; + this.retryWithNewConsenusProtocol = retryWithNewConsenusProtocol; + } + + static BeginResult retryOnNewProtocol() + { + return new BeginResult(null, null, -1, null, false, false, null, true); } } @@ -1047,6 +1067,14 @@ private static BeginResult begin(long deadline, // prepare PaxosPrepare retry = null; PaxosPrepare.Status prepare = preparing.awaitUntil(deadline); + + // After performing the prepare phase we may discover that we can't propose + // our own transaction on this protocol by discovering a new CM Epoch + if (ConsensusRequestRouter.instance.isKeyInMigratingOrMigratedRangeDuringPaxosBegin(query.metadata().id, query.partitionKey()) && prepare.outcome != MAYBE_FAILURE) + { + return BeginResult.retryOnNewProtocol(); + } + boolean isPromised = false; retry: switch (prepare.outcome) { @@ -1075,7 +1103,7 @@ private static BeginResult begin(long deadline, // and in fact it's possible for a CAS to sometimes determine if side effects occurred by reading // the underlying data and not witnessing the timestamp of its ballot (or any newer for the relevant data). Proposal repropose = new Proposal(inProgress.ballot, inProgress.accepted.update); - PaxosPropose.Status proposeResult = propose(repropose, inProgress.participants, false).awaitUntil(deadline); + PaxosPropose.Status proposeResult = propose(repropose, inProgress.participants, false, true).awaitUntil(deadline); switch (proposeResult.outcome) { default: throw new IllegalStateException(); @@ -1088,6 +1116,7 @@ private static BeginResult begin(long deadline, break retry; case SUPERSEDED: + checkState(!proposeResult.superseded().needsConsensusMigration, "Should not receive needsConsensusMigration rejects from begin"); // since we are proposing a previous value that was maybe superseded by us before completion // we don't need to test the side effects, as we just want to start again, and fall through // to the superseded section below @@ -1112,7 +1141,7 @@ private static BeginResult begin(long deadline, PaxosPrepare.Success success = prepare.success(); Supplier plan = () -> success.participants; - DataResolver resolver = new DataResolver<>(query, plan, NoopReadRepair.instance, requestTime); + DataResolver resolver = new DataResolver<>(ReadCoordinator.DEFAULT, query, plan, NoopReadRepair.instance, requestTime); for (int i = 0 ; i < success.responses.size() ; ++i) resolver.preprocess(success.responses.get(i)); @@ -1130,7 +1159,7 @@ class WasRun implements Runnable { boolean v; public void run() { v = true; } } break; } - return new BeginResult(success.ballot, success.participants, failedAttemptsDueToContention, result, !hadShortRead.v && success.isReadSafe, isPromised, success.supersededBy); + return new BeginResult(success.ballot, success.participants, failedAttemptsDueToContention, result, !hadShortRead.v && success.isReadSafe, isPromised, success.supersededBy, false); } case MAYBE_FAILURE: diff --git a/src/java/org/apache/cassandra/service/paxos/PaxosCommit.java b/src/java/org/apache/cassandra/service/paxos/PaxosCommit.java index b5ce86794dbb..b79e032fe233 100644 --- a/src/java/org/apache/cassandra/service/paxos/PaxosCommit.java +++ b/src/java/org/apache/cassandra/service/paxos/PaxosCommit.java @@ -29,7 +29,7 @@ import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.db.ConsistencyLevel; import org.apache.cassandra.db.Mutation; -import org.apache.cassandra.exceptions.RequestFailureReason; +import org.apache.cassandra.exceptions.RequestFailure; import org.apache.cassandra.locator.EndpointsForToken; import org.apache.cassandra.locator.InOurDc; import org.apache.cassandra.locator.InetAddressAndPort; @@ -44,13 +44,12 @@ import org.apache.cassandra.utils.concurrent.ConditionAsConsumer; import static java.util.Collections.emptyMap; -import static org.apache.cassandra.exceptions.RequestFailureReason.NODE_DOWN; import static org.apache.cassandra.exceptions.RequestFailureReason.UNKNOWN; import static org.apache.cassandra.net.Verb.PAXOS2_COMMIT_REMOTE_REQ; import static org.apache.cassandra.net.Verb.PAXOS_COMMIT_REQ; import static org.apache.cassandra.service.StorageProxy.shouldHint; import static org.apache.cassandra.service.StorageProxy.submitHint; -import static org.apache.cassandra.service.paxos.Commit.*; +import static org.apache.cassandra.service.paxos.Commit.Agreed; import static org.apache.cassandra.utils.concurrent.ConditionAsConsumer.newConditionAsConsumer; // Does not support EACH_QUORUM, as no such thing as EACH_SERIAL @@ -186,7 +185,7 @@ void start(Participants participants, boolean async) executeOnSelf |= isSelfOrSend(commitMessage, mutationMessage, participants.allLive.endpoint(i)); for (int i = 0, mi = participants.allDown.size(); i < mi ; ++i) - onFailure(participants.allDown.endpoint(i), NODE_DOWN); + onFailure(participants.allDown.endpoint(i), RequestFailure.NODE_DOWN); if (executeOnSelf) { @@ -223,7 +222,7 @@ private static boolean isInLocalDc(InetAddressAndPort destination) * Record a failure or timeout, and maybe submit a hint to {@code from} */ @Override - public void onFailure(InetAddressAndPort from, RequestFailureReason reason) + public void onFailure(InetAddressAndPort from, RequestFailure reason) { if (logger.isTraceEnabled()) logger.trace("{} {} from {}", commit, reason, from); diff --git a/src/java/org/apache/cassandra/service/paxos/PaxosCommitAndPrepare.java b/src/java/org/apache/cassandra/service/paxos/PaxosCommitAndPrepare.java index 7046dfbb3753..b81f6b720b0e 100644 --- a/src/java/org/apache/cassandra/service/paxos/PaxosCommitAndPrepare.java +++ b/src/java/org/apache/cassandra/service/paxos/PaxosCommitAndPrepare.java @@ -45,7 +45,7 @@ public class PaxosCommitAndPrepare static PaxosPrepare commitAndPrepare(Agreed commit, Paxos.Participants participants, SinglePartitionReadCommand readCommand, boolean isWrite, boolean acceptEarlyReadSuccess) { Ballot ballot = newBallot(commit.ballot, participants.consistencyForConsensus); - Request request = new Request(commit, ballot, participants.electorate, readCommand, isWrite); + Request request = new Request(commit, ballot, participants.electorate, readCommand, isWrite, true); PaxosPrepare prepare = new PaxosPrepare(participants, request, acceptEarlyReadSuccess, null); Tracing.trace("Committing {}; Preparing {}", commit.ballot, ballot); @@ -59,21 +59,21 @@ private static class Request extends PaxosPrepare.AbstractRequest { final Agreed commit; - Request(Agreed commit, Ballot ballot, Paxos.Electorate electorate, SinglePartitionReadCommand read, boolean isWrite) + Request(Agreed commit, Ballot ballot, Paxos.Electorate electorate, SinglePartitionReadCommand read, boolean isWrite, boolean isForRecovery) { - super(ballot, electorate, read, isWrite); + super(ballot, electorate, read, isWrite, isForRecovery); this.commit = commit; } - private Request(Agreed commit, Ballot ballot, Paxos.Electorate electorate, DecoratedKey partitionKey, TableMetadata table, boolean isWrite) + private Request(Agreed commit, Ballot ballot, Paxos.Electorate electorate, DecoratedKey partitionKey, TableMetadata table, boolean isWrite, boolean isForRecovery) { - super(ballot, electorate, partitionKey, table, isWrite); + super(ballot, electorate, partitionKey, table, isWrite, isForRecovery); this.commit = commit; } Request withoutRead() { - return new Request(commit, ballot, electorate, partitionKey, table, isForWrite); + return new Request(commit, ballot, electorate, partitionKey, table, isForWrite, isForRecovery); } public String toString() @@ -84,14 +84,14 @@ public String toString() public static class RequestSerializer extends PaxosPrepare.AbstractRequestSerializer { - Request construct(Agreed param, Ballot ballot, Paxos.Electorate electorate, SinglePartitionReadCommand read, boolean isWrite) + Request construct(Agreed param, Ballot ballot, Paxos.Electorate electorate, SinglePartitionReadCommand read, boolean isWrite, boolean isForRecovery) { - return new Request(param, ballot, electorate, read, isWrite); + return new Request(param, ballot, electorate, read, isWrite, isForRecovery); } - Request construct(Agreed param, Ballot ballot, Paxos.Electorate electorate, DecoratedKey partitionKey, TableMetadata table, boolean isWrite) + Request construct(Agreed param, Ballot ballot, Paxos.Electorate electorate, DecoratedKey partitionKey, TableMetadata table, boolean isWrite, boolean isForRecovery) { - return new Request(param, ballot, electorate, partitionKey, table, isWrite); + return new Request(param, ballot, electorate, partitionKey, table, isWrite, isForRecovery); } @Override diff --git a/src/java/org/apache/cassandra/service/paxos/PaxosPrepare.java b/src/java/org/apache/cassandra/service/paxos/PaxosPrepare.java index 3b04acd26ba6..b1fb0e5cd6ea 100644 --- a/src/java/org/apache/cassandra/service/paxos/PaxosPrepare.java +++ b/src/java/org/apache/cassandra/service/paxos/PaxosPrepare.java @@ -34,9 +34,16 @@ import org.slf4j.LoggerFactory; import org.apache.cassandra.config.DatabaseDescriptor; -import org.apache.cassandra.db.*; +import org.apache.cassandra.db.ColumnFamilyStore; +import org.apache.cassandra.db.DecoratedKey; +import org.apache.cassandra.db.Keyspace; +import org.apache.cassandra.db.ReadCommand; +import org.apache.cassandra.db.ReadExecutionController; +import org.apache.cassandra.db.ReadResponse; +import org.apache.cassandra.db.SinglePartitionReadCommand; +import org.apache.cassandra.db.TypeSizes; import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator; -import org.apache.cassandra.exceptions.RequestFailureReason; +import org.apache.cassandra.exceptions.RequestFailure; import org.apache.cassandra.exceptions.UnavailableException; import org.apache.cassandra.gms.EndpointState; import org.apache.cassandra.gms.Gossiper; @@ -44,7 +51,6 @@ import org.apache.cassandra.io.util.DataInputPlus; import org.apache.cassandra.io.util.DataOutputPlus; import org.apache.cassandra.locator.InetAddressAndPort; -import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.metrics.PaxosMetrics; import org.apache.cassandra.net.IVerbHandler; import org.apache.cassandra.net.Message; @@ -52,7 +58,9 @@ import org.apache.cassandra.schema.Schema; import org.apache.cassandra.schema.TableId; import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.service.consensus.migration.ConsensusKeyMigrationState.KeyMigrationState; import org.apache.cassandra.service.paxos.PaxosPrepare.Status.Outcome; +import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.tcm.ClusterMetadataService; import org.apache.cassandra.tcm.Epoch; import org.apache.cassandra.tracing.Tracing; @@ -63,18 +71,40 @@ import static org.apache.cassandra.locator.InetAddressAndPort.Serializer.inetAddressAndPortSerializer; import static org.apache.cassandra.net.Verb.PAXOS2_PREPARE_REQ; import static org.apache.cassandra.net.Verb.PAXOS2_PREPARE_RSP; +import static org.apache.cassandra.service.consensus.migration.ConsensusKeyMigrationState.getKeyMigrationState; + +import org.apache.cassandra.service.consensus.migration.ConsensusMigratedAt; import static org.apache.cassandra.service.paxos.Ballot.Flag.NONE; -import static org.apache.cassandra.service.paxos.Commit.*; -import static org.apache.cassandra.service.paxos.Paxos.*; -import static org.apache.cassandra.service.paxos.PaxosPrepare.Status.Outcome.*; +import static org.apache.cassandra.service.paxos.Commit.Accepted; +import static org.apache.cassandra.service.paxos.Commit.Committed; +import static org.apache.cassandra.service.paxos.Commit.CompareResult; +import static org.apache.cassandra.service.paxos.Commit.isAfter; +import static org.apache.cassandra.service.paxos.Paxos.Electorate; +import static org.apache.cassandra.service.paxos.Paxos.LOG_TTL_LINEARIZABILITY_VIOLATIONS; +import static org.apache.cassandra.service.paxos.Paxos.Participants; +import static org.apache.cassandra.service.paxos.Paxos.consistency; +import static org.apache.cassandra.service.paxos.Paxos.getPaxosVariant; +import static org.apache.cassandra.service.paxos.Paxos.isInRangeAndShouldProcess; +import static org.apache.cassandra.service.paxos.Paxos.newBallot; +import static org.apache.cassandra.service.paxos.Paxos.verifyElectorate; +import static org.apache.cassandra.service.paxos.PaxosPrepare.Status.Outcome.ELECTORATE_MISMATCH; +import static org.apache.cassandra.service.paxos.PaxosPrepare.Status.Outcome.FOUND_INCOMPLETE_ACCEPTED; +import static org.apache.cassandra.service.paxos.PaxosPrepare.Status.Outcome.FOUND_INCOMPLETE_COMMITTED; +import static org.apache.cassandra.service.paxos.PaxosPrepare.Status.Outcome.MAYBE_FAILURE; +import static org.apache.cassandra.service.paxos.PaxosPrepare.Status.Outcome.PROMISED; +import static org.apache.cassandra.service.paxos.PaxosPrepare.Status.Outcome.READ_PERMITTED; +import static org.apache.cassandra.service.paxos.PaxosPrepare.Status.Outcome.SUPERSEDED; +import static org.apache.cassandra.service.paxos.PaxosState.MaybePromise; +import static org.apache.cassandra.service.paxos.PaxosState.MaybePromise.Outcome.PERMIT_READ; +import static org.apache.cassandra.service.paxos.PaxosState.MaybePromise.Outcome.PROMISE; +import static org.apache.cassandra.service.paxos.PaxosState.MaybePromise.Outcome.REJECT; +import static org.apache.cassandra.service.paxos.PaxosState.Snapshot; +import static org.apache.cassandra.service.paxos.PaxosState.get; import static org.apache.cassandra.utils.Clock.Global.currentTimeMillis; import static org.apache.cassandra.utils.Clock.Global.nanoTime; -import static org.apache.cassandra.service.paxos.PaxosState.*; -import static org.apache.cassandra.service.paxos.PaxosState.MaybePromise.Outcome.*; -import static org.apache.cassandra.utils.CollectionSerializer.deserializeMap; -import static org.apache.cassandra.utils.CollectionSerializer.newHashMap; -import static org.apache.cassandra.utils.CollectionSerializer.serializeMap; -import static org.apache.cassandra.utils.CollectionSerializer.serializedSizeMap; +import static org.apache.cassandra.utils.CollectionSerializers.deserializeMap; +import static org.apache.cassandra.utils.CollectionSerializers.serializeMap; +import static org.apache.cassandra.utils.CollectionSerializers.serializedMapSize; import static org.apache.cassandra.utils.concurrent.Awaitable.SyncAwaitable.waitUntil; /** @@ -344,7 +374,7 @@ static PaxosPrepare prepare(Ballot minimumBallot, Participants participants, Sin static PaxosPrepare prepareWithBallot(Ballot ballot, Participants participants, SinglePartitionReadCommand readCommand, boolean isWrite, boolean acceptEarlyReadPermission) { Tracing.trace("Preparing {} with read", ballot); - Request request = new Request(ballot, participants.electorate, readCommand, isWrite); + Request request = new Request(ballot, participants.electorate, readCommand, isWrite, true); return prepareWithBallotInternal(participants, request, acceptEarlyReadPermission, null); } @@ -352,7 +382,7 @@ static PaxosPrepare prepareWithBallot(Ballot ballot, Participants participants, static > T prepareWithBallot(Ballot ballot, Participants participants, DecoratedKey partitionKey, TableMetadata table, boolean isWrite, boolean acceptEarlyReadPermission, T onDone) { Tracing.trace("Preparing {}", ballot); - prepareWithBallotInternal(participants, new Request(ballot, participants.electorate, partitionKey, table, isWrite), acceptEarlyReadPermission, onDone); + prepareWithBallotInternal(participants, new Request(ballot, participants.electorate, partitionKey, table, isWrite, true), acceptEarlyReadPermission, onDone); return onDone; } @@ -805,7 +835,7 @@ private void addReadResponse(ReadResponse response, InetAddressAndPort from) } @Override - public synchronized void onFailure(InetAddressAndPort from, RequestFailureReason reason) + public synchronized void onFailure(InetAddressAndPort from, RequestFailure reason) { if (logger.isTraceEnabled()) logger.trace("{} {} failure from {}", request, reason, from); @@ -813,7 +843,7 @@ public synchronized void onFailure(InetAddressAndPort from, RequestFailureReason if (isDone()) return; - super.onFailureWithMutex(from, reason); + super.onFailureWithMutex(from, reason.reason); ++failures; if (failures + participants.sizeOfConsensusQuorum == 1 + participants.sizeOfPoll()) @@ -876,7 +906,7 @@ private void refreshStaleParticipants() } @Override - public void onRefreshFailure(InetAddressAndPort from, RequestFailureReason reason) + public void onRefreshFailure(InetAddressAndPort from, RequestFailure reason) { onFailure(from, reason); } @@ -911,8 +941,9 @@ static abstract class AbstractRequest> final boolean isForWrite; final DecoratedKey partitionKey; final TableMetadata table; + final boolean isForRecovery; - AbstractRequest(Ballot ballot, Electorate electorate, SinglePartitionReadCommand read, boolean isForWrite) + AbstractRequest(Ballot ballot, Electorate electorate, SinglePartitionReadCommand read, boolean isForWrite, boolean isForRecovery) { this.ballot = ballot; this.electorate = electorate; @@ -920,9 +951,10 @@ static abstract class AbstractRequest> this.isForWrite = isForWrite; this.partitionKey = read.partitionKey(); this.table = read.metadata(); + this.isForRecovery = isForRecovery; } - AbstractRequest(Ballot ballot, Electorate electorate, DecoratedKey partitionKey, TableMetadata table, boolean isForWrite) + AbstractRequest(Ballot ballot, Electorate electorate, DecoratedKey partitionKey, TableMetadata table, boolean isForWrite, boolean isForRecovery) { this.ballot = ballot; this.electorate = electorate; @@ -930,6 +962,7 @@ static abstract class AbstractRequest> this.table = table; this.read = null; this.isForWrite = isForWrite; + this.isForRecovery = isForRecovery; } abstract R withoutRead(); @@ -942,19 +975,19 @@ public String toString() static class Request extends AbstractRequest { - Request(Ballot ballot, Electorate electorate, SinglePartitionReadCommand read, boolean isWrite) + Request(Ballot ballot, Electorate electorate, SinglePartitionReadCommand read, boolean isWrite, boolean isForRecovery) { - super(ballot, electorate, read, isWrite); + super(ballot, electorate, read, isWrite, isForRecovery); } - private Request(Ballot ballot, Electorate electorate, DecoratedKey partitionKey, TableMetadata table, boolean isWrite) + private Request(Ballot ballot, Electorate electorate, DecoratedKey partitionKey, TableMetadata table, boolean isWrite, boolean isForRecovery) { - super(ballot, electorate, partitionKey, table, isWrite); + super(ballot, electorate, partitionKey, table, isWrite, isForRecovery); } Request withoutRead() { - return read == null ? this : new Request(ballot, electorate, partitionKey, table, isForWrite); + return read == null ? this : new Request(ballot, electorate, partitionKey, table, isForWrite, isForRecovery); } public String toString() @@ -965,11 +998,16 @@ public String toString() static class Response { + @Nonnull final MaybePromise.Outcome outcome; - Response(MaybePromise.Outcome outcome) + @Nullable + final ConsensusMigratedAt maybeConsenusMigratedAt; + + Response(@Nonnull MaybePromise.Outcome outcome, @Nullable ConsensusMigratedAt maybeConsenusMigratedAt) { this.outcome = outcome; + this.maybeConsenusMigratedAt = maybeConsenusMigratedAt; } Permitted permitted() { return (Permitted) this; } Rejected rejected() { return (Rejected) this; } @@ -999,9 +1037,9 @@ static class Permitted extends Response @Nullable final Ballot supersededBy; final Epoch electorateEpoch; - Permitted(MaybePromise.Outcome outcome, long lowBound, @Nullable Accepted latestAcceptedButNotCommitted, Committed latestCommitted, @Nullable ReadResponse readResponse, boolean hadProposalStability, Map gossipInfo, Epoch electorateEpoch, @Nullable Ballot supersededBy) + Permitted(MaybePromise.Outcome outcome, @Nullable ConsensusMigratedAt maybeConsensusMigratedAt, long lowBound, @Nullable Accepted latestAcceptedButNotCommitted, Committed latestCommitted, @Nullable ReadResponse readResponse, boolean hadProposalStability, Map gossipInfo, Epoch electorateEpoch, @Nullable Ballot supersededBy) { - super(outcome); + super(outcome, maybeConsensusMigratedAt); this.lowBound = lowBound; this.latestAcceptedButNotCommitted = latestAcceptedButNotCommitted; this.latestCommitted = latestCommitted; @@ -1023,9 +1061,9 @@ static class Rejected extends Response { final Ballot supersededBy; - Rejected(Ballot supersededBy) + Rejected(Ballot supersededBy, @Nullable ConsensusMigratedAt maybeConsensusMigratedAt) { - super(REJECT); + super(REJECT, maybeConsensusMigratedAt); this.supersededBy = supersededBy; } @@ -1069,6 +1107,7 @@ static Response execute(AbstractRequest request, InetAddressAndPort from) static Response execute(AbstractRequest request, PaxosState state) { MaybePromise result = state.promiseIfNewer(request.ballot, request.isForWrite); + KeyMigrationState keyMigrationState = getKeyMigrationState(request.table.id, request.partitionKey); switch (result.outcome) { case PROMISE: @@ -1104,6 +1143,8 @@ static Response execute(AbstractRequest request, PaxosState state) if (request.read != null) { + // Make sure the read is safe and there is no Accord state that needs application + keyMigrationState.maybePerformAccordToPaxosKeyMigration(request.isForWrite); try (ReadExecutionController executionController = request.read.executionController(); UnfilteredPartitionIterator iterator = request.read.executeLocally(executionController)) { @@ -1125,10 +1166,10 @@ static Response execute(AbstractRequest request, PaxosState state) ColumnFamilyStore cfs = Schema.instance.getColumnFamilyStoreInstance(request.table.id); long lowBound = cfs.getPaxosRepairLowBound(request.partitionKey).uuidTimestamp(); - return new Permitted(result.outcome, lowBound, acceptedButNotCommitted, committed, readResponse, hasProposalStability, gossipInfo, electorateEpoch, supersededBy); + return new Permitted(result.outcome, keyMigrationState.consensusMigratedAt, lowBound, acceptedButNotCommitted, committed, readResponse, hasProposalStability, gossipInfo, electorateEpoch, supersededBy); case REJECT: - return new Rejected(result.supersededBy()); + return new Rejected(result.supersededBy(), keyMigrationState.consensusMigratedAt); default: throw new IllegalStateException(); @@ -1138,8 +1179,8 @@ static Response execute(AbstractRequest request, PaxosState state) static abstract class AbstractRequestSerializer, T> implements IVersionedSerializer { - abstract R construct(T param, Ballot ballot, Electorate electorate, SinglePartitionReadCommand read, boolean isWrite); - abstract R construct(T param, Ballot ballot, Electorate electorate, DecoratedKey partitionKey, TableMetadata table, boolean isWrite); + abstract R construct(T param, Ballot ballot, Electorate electorate, SinglePartitionReadCommand read, boolean isWrite, boolean isForRecovery); + abstract R construct(T param, Ballot ballot, Electorate electorate, DecoratedKey partitionKey, TableMetadata table, boolean isWrite, boolean isForRecovery); @Override public void serialize(R request, DataOutputPlus out, int version) throws IOException @@ -1157,6 +1198,8 @@ public void serialize(R request, DataOutputPlus out, int version) throws IOExcep request.table.id.serialize(out); DecoratedKey.serializer.serialize(request.partitionKey, out, version); } + if (version >= MessagingService.VERSION_51) + out.writeBoolean(request.isForRecovery); } public R deserialize(T param, DataInputPlus in, int version) throws IOException @@ -1167,38 +1210,47 @@ public R deserialize(T param, DataInputPlus in, int version) throws IOException if ((flag & 1) != 0) { SinglePartitionReadCommand readCommand = (SinglePartitionReadCommand) ReadCommand.serializer.deserialize(in, version); - return construct(param, ballot, electorate, readCommand, (flag & 2) == 0); + boolean isForRecovery = false; + if (version >= MessagingService.VERSION_51) + isForRecovery = in.readBoolean(); + return construct(param, ballot, electorate, readCommand, (flag & 2) == 0, isForRecovery); } else { TableMetadata table = Schema.instance.getExistingTableMetadata(TableId.deserialize(in)); DecoratedKey partitionKey = (DecoratedKey) DecoratedKey.serializer.deserialize(in, table.partitioner, version); - return construct(param, ballot, electorate, partitionKey, table, (flag & 2) != 0); + boolean isForRecovery = false; + if (version >= MessagingService.VERSION_51) + isForRecovery = in.readBoolean(); + return construct(param, ballot, electorate, partitionKey, table, (flag & 2) != 0, isForRecovery); } } @Override public long serializedSize(R request, int version) { - return Ballot.sizeInBytes() + long size = Ballot.sizeInBytes() + Electorate.serializer.serializedSize(request.electorate, version) + 1 + (request.read != null ? ReadCommand.serializer.serializedSize(request.read, version) : request.table.id.serializedSize() + DecoratedKey.serializer.serializedSize(request.partitionKey, version)); + if (version >= MessagingService.VERSION_51) + size += TypeSizes.sizeof(request.isForRecovery); + return size; } } public static class RequestSerializer extends AbstractRequestSerializer { - Request construct(Object ignore, Ballot ballot, Electorate electorate, SinglePartitionReadCommand read, boolean isWrite) + Request construct(Object ignore, Ballot ballot, Electorate electorate, SinglePartitionReadCommand read, boolean isWrite, boolean isForRecovery) { - return new Request(ballot, electorate, read, isWrite); + return new Request(ballot, electorate, read, isWrite, isForRecovery); } - Request construct(Object ignore, Ballot ballot, Electorate electorate, DecoratedKey partitionKey, TableMetadata table, boolean isWrite) + Request construct(Object ignore, Ballot ballot, Electorate electorate, DecoratedKey partitionKey, TableMetadata table, boolean isWrite, boolean isForRecovery) { - return new Request(ballot, electorate, partitionKey, table, isWrite); + return new Request(ballot, electorate, partitionKey, table, isWrite, isForRecovery); } public Request deserialize(DataInputPlus in, int version) throws IOException @@ -1213,8 +1265,8 @@ public void serialize(Response response, DataOutputPlus out, int version) throws { if (response.isRejected()) { - out.writeByte(0); Rejected rejected = (Rejected) response; + out.writeByte(0); rejected.supersededBy.serialize(out); } else @@ -1232,12 +1284,14 @@ public void serialize(Response response, DataOutputPlus out, int version) throws Committed.serializer.serialize(promised.latestCommitted, out, version); if (promised.readResponse != null) ReadResponse.serializer.serialize(promised.readResponse, out, version); - serializeMap(inetAddressAndPortSerializer, EndpointState.nullableSerializer, promised.gossipInfo, out, version); + serializeMap(promised.gossipInfo, out, version, inetAddressAndPortSerializer, EndpointState.nullableSerializer); if (version >= MessagingService.VERSION_51) Epoch.messageSerializer.serialize(promised.electorateEpoch, out, version); if (promised.outcome == PERMIT_READ) promised.supersededBy.serialize(out); } + if (version >= MessagingService.VERSION_51) + ConsensusMigratedAt.serializer.serialize(response.maybeConsenusMigratedAt, out); } public Response deserialize(DataInputPlus in, int version) throws IOException @@ -1246,7 +1300,10 @@ public Response deserialize(DataInputPlus in, int version) throws IOException if (flags == 0) { Ballot supersededBy = Ballot.deserialize(in); - return new Rejected(supersededBy); + ConsensusMigratedAt consensusMigratedAt = null; + if (version >= MessagingService.VERSION_51) + consensusMigratedAt = ConsensusMigratedAt.serializer.deserialize(in); + return new Rejected(supersededBy, consensusMigratedAt); } else { @@ -1254,35 +1311,42 @@ public Response deserialize(DataInputPlus in, int version) throws IOException Accepted acceptedNotCommitted = (flags & 2) != 0 ? Accepted.serializer.deserialize(in, version) : null; Committed committed = Committed.serializer.deserialize(in, version); ReadResponse readResponse = (flags & 4) != 0 ? ReadResponse.serializer.deserialize(in, version) : null; - Map gossipInfo = deserializeMap(inetAddressAndPortSerializer, EndpointState.nullableSerializer, newHashMap(), in, version); + Map gossipInfo = deserializeMap(in, version, inetAddressAndPortSerializer, EndpointState.nullableSerializer); Epoch electorateEpoch = version >= MessagingService.VERSION_51 ? Epoch.messageSerializer.deserialize(in, version) : Epoch.EMPTY; MaybePromise.Outcome outcome = (flags & 16) != 0 ? PERMIT_READ : PROMISE; boolean hasProposalStability = (flags & 8) != 0; Ballot supersededBy = null; if (outcome == PERMIT_READ) supersededBy = Ballot.deserialize(in); - return new Permitted(outcome, lowBound, acceptedNotCommitted, committed, readResponse, hasProposalStability, gossipInfo, electorateEpoch, supersededBy); + ConsensusMigratedAt consensusMigratedAt = null; + if (version >= MessagingService.VERSION_51) + consensusMigratedAt = ConsensusMigratedAt.serializer.deserialize(in); + return new Permitted(outcome, consensusMigratedAt, lowBound, acceptedNotCommitted, committed, readResponse, hasProposalStability, gossipInfo, electorateEpoch, supersededBy); } } public long serializedSize(Response response, int version) { + long size = 1; //flags if (response.isRejected()) { - return 1 + Ballot.sizeInBytes(); + size += Ballot.sizeInBytes(); } else { Permitted permitted = (Permitted) response; - return 1 - + VIntCoding.computeUnsignedVIntSize(permitted.lowBound) + size += VIntCoding.computeUnsignedVIntSize(permitted.lowBound) + (permitted.latestAcceptedButNotCommitted == null ? 0 : Accepted.serializer.serializedSize(permitted.latestAcceptedButNotCommitted, version)) + Committed.serializer.serializedSize(permitted.latestCommitted, version) + (permitted.readResponse == null ? 0 : ReadResponse.serializer.serializedSize(permitted.readResponse, version)) - + serializedSizeMap(inetAddressAndPortSerializer, EndpointState.nullableSerializer, permitted.gossipInfo, version) + + serializedMapSize(permitted.gossipInfo, version, inetAddressAndPortSerializer, EndpointState.nullableSerializer) + (version >= MessagingService.VERSION_51 ? Epoch.messageSerializer.serializedSize(permitted.electorateEpoch, version) : 0) + (permitted.outcome == PERMIT_READ ? Ballot.sizeInBytes() : 0); } + if (version >= MessagingService.VERSION_51) + size += ConsensusMigratedAt.serializer.serializedSize(response.maybeConsenusMigratedAt); + + return size; } } diff --git a/src/java/org/apache/cassandra/service/paxos/PaxosPrepareRefresh.java b/src/java/org/apache/cassandra/service/paxos/PaxosPrepareRefresh.java index 925daaf9dd86..ad909c6a004b 100644 --- a/src/java/org/apache/cassandra/service/paxos/PaxosPrepareRefresh.java +++ b/src/java/org/apache/cassandra/service/paxos/PaxosPrepareRefresh.java @@ -24,6 +24,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.cassandra.exceptions.RequestFailure; import org.apache.cassandra.exceptions.RequestFailureReason; import org.apache.cassandra.exceptions.WriteTimeoutException; import org.apache.cassandra.io.IVersionedSerializer; @@ -38,15 +39,13 @@ import org.apache.cassandra.service.paxos.Commit.Committed; import org.apache.cassandra.tracing.Tracing; -import static org.apache.cassandra.exceptions.RequestFailureReason.TIMEOUT; -import static org.apache.cassandra.exceptions.RequestFailureReason.UNKNOWN; import static org.apache.cassandra.net.Verb.PAXOS2_PREPARE_REFRESH_REQ; import static org.apache.cassandra.service.paxos.Commit.isAfter; import static org.apache.cassandra.service.paxos.PaxosRequestCallback.shouldExecuteOnSelf; import static org.apache.cassandra.utils.FBUtilities.getBroadcastAddressAndPort; import static org.apache.cassandra.utils.NullableSerializer.deserializeNullable; import static org.apache.cassandra.utils.NullableSerializer.serializeNullable; -import static org.apache.cassandra.utils.NullableSerializer.serializedSizeNullable; +import static org.apache.cassandra.utils.NullableSerializer.serializedNullableSize; /** * Nodes that have promised in response to our prepare, may be missing the latestCommit, meaning we cannot be sure the @@ -65,7 +64,7 @@ public class PaxosPrepareRefresh implements RequestCallbackWithFailure refresh) } @Override - public void onFailure(InetAddressAndPort from, RequestFailureReason reason) + public void onFailure(InetAddressAndPort from, RequestFailure reason) { callbacks.onRefreshFailure(from, reason); } @@ -124,8 +123,8 @@ private void executeOnSelf() } catch (Exception ex) { - RequestFailureReason reason = UNKNOWN; - if (ex instanceof WriteTimeoutException) reason = TIMEOUT; + RequestFailure reason = RequestFailure.UNKNOWN; + if (ex instanceof WriteTimeoutException) reason = RequestFailure.TIMEOUT; else logger.error("Failed to apply paxos refresh-prepare locally", ex); onFailure(getBroadcastAddressAndPort(), reason); @@ -167,7 +166,7 @@ public void doVerb(Message message) { Response response = execute(message.payload, message.from()); if (response == null) - MessagingService.instance().respondWithFailure(UNKNOWN, message); + MessagingService.instance().respondWithFailure(RequestFailureReason.UNKNOWN, message); else MessagingService.instance().respond(response, message); } @@ -226,18 +225,18 @@ public static class ResponseSerializer implements IVersionedSerializer { public void serialize(Response response, DataOutputPlus out, int version) throws IOException { - serializeNullable(Ballot.Serializer.instance, response.isSupersededBy, out, version); + serializeNullable(response.isSupersededBy, out, version, Ballot.Serializer.instance); } public Response deserialize(DataInputPlus in, int version) throws IOException { - Ballot isSupersededBy = deserializeNullable(Ballot.Serializer.instance, in, version); + Ballot isSupersededBy = deserializeNullable(in, version, Ballot.Serializer.instance); return new Response(isSupersededBy); } public long serializedSize(Response response, int version) { - return serializedSizeNullable(Ballot.Serializer.instance, response.isSupersededBy, version); + return serializedNullableSize(response.isSupersededBy, version, Ballot.Serializer.instance); } } diff --git a/src/java/org/apache/cassandra/service/paxos/PaxosPropose.java b/src/java/org/apache/cassandra/service/paxos/PaxosPropose.java index db702af7d43c..650bd8818c82 100644 --- a/src/java/org/apache/cassandra/service/paxos/PaxosPropose.java +++ b/src/java/org/apache/cassandra/service/paxos/PaxosPropose.java @@ -22,6 +22,7 @@ import java.util.concurrent.atomic.AtomicLongFieldUpdater; import java.util.concurrent.atomic.AtomicReferenceFieldUpdater; import java.util.function.Consumer; +import javax.annotation.Nullable; import com.google.common.annotations.VisibleForTesting; import org.slf4j.Logger; @@ -29,7 +30,7 @@ import org.apache.cassandra.db.Keyspace; import org.apache.cassandra.db.TypeSizes; -import org.apache.cassandra.exceptions.RequestFailureReason; +import org.apache.cassandra.exceptions.RequestFailure; import org.apache.cassandra.io.IVersionedSerializer; import org.apache.cassandra.io.util.DataInputPlus; import org.apache.cassandra.io.util.DataOutputPlus; @@ -38,13 +39,16 @@ import org.apache.cassandra.net.Message; import org.apache.cassandra.net.MessagingService; import org.apache.cassandra.service.paxos.Commit.Proposal; +import org.apache.cassandra.tcm.ClusterMetadataService; import org.apache.cassandra.utils.concurrent.ConditionAsConsumer; +import static com.google.common.base.Preconditions.checkArgument; import static java.util.Collections.emptyMap; import static org.apache.cassandra.exceptions.RequestFailureReason.UNKNOWN; import static org.apache.cassandra.net.Verb.PAXOS2_PROPOSE_REQ; -import static org.apache.cassandra.service.paxos.PaxosPropose.Superseded.SideEffects.NO; -import static org.apache.cassandra.service.paxos.PaxosPropose.Superseded.SideEffects.MAYBE; +import static org.apache.cassandra.service.paxos.PaxosPropose.Status.SideEffects.MAYBE; +import static org.apache.cassandra.service.paxos.PaxosPropose.Status.SideEffects.NO; +import static org.apache.cassandra.service.paxos.PaxosState.AcceptResult; import static org.apache.cassandra.utils.Clock.Global.nanoTime; import static org.apache.cassandra.utils.concurrent.ConditionAsConsumer.newConditionAsConsumer; @@ -54,13 +58,13 @@ * indicating (respectively) that we have had no side effect, or that we cannot * know if we our proposal produced a side effect. */ -public class PaxosPropose> extends PaxosRequestCallback +public class PaxosPropose> extends PaxosRequestCallback { private static final Logger logger = LoggerFactory.getLogger(PaxosPropose.class); public static final RequestHandler requestHandler = new RequestHandler(); public static final RequestSerializer requestSerializer = new RequestSerializer(); - public static final ResponseSerializer responseSerializer = new ResponseSerializer(); + public static final AcceptResultSerializer ACCEPT_RESULT_SERIALIZER = new AcceptResultSerializer(); /** * Represents the current status of a propose action: it is a status rather than a result, @@ -77,22 +81,30 @@ enum Outcome { SUCCESS, SUPERSEDED, MAYBE_FAILURE } } Superseded superseded() { return (Superseded) this; } Paxos.MaybeFailure maybeFailure() { return ((MaybeFailure) this).info; } - public String toString() { return "Success"; } + public String toString() { return outcome.toString(); } + + enum SideEffects { NO, MAYBE } } static class Superseded extends Status { - enum SideEffects { NO, MAYBE } + @Nullable final Ballot by; final SideEffects hadSideEffects; - Superseded(Ballot by, SideEffects hadSideEffects) + // Consensus migration can occur at the same time that we are superseded + // and it's important to preserve returning the uncertainty of the superseded + // at the same time as enforcing the need for consensus migration + final boolean needsConsensusMigration; + Superseded(@Nullable Ballot by, SideEffects hadSideEffects, boolean needsConsensusMigration) { super(Outcome.SUPERSEDED); + checkArgument(needsConsensusMigration == true || by != null, "Must be superseded by ballot if not due to consensus migration"); this.by = by; this.hadSideEffects = hadSideEffects; + this.needsConsensusMigration = needsConsensusMigration; } - public String toString() { return "Superseded(" + by + ',' + hadSideEffects + ')'; } + public String toString() { return "Superseded(" + by + ',' + hadSideEffects + ',' + needsConsensusMigration + ')'; } } private static class MaybeFailure extends Status @@ -107,7 +119,7 @@ private static class MaybeFailure extends Status public String toString() { return info.toString(); } } - private static final Status success = new Status(Status.Outcome.SUCCESS); + private static final Status STATUS_SUCCESS = new Status(Status.Outcome.SUCCESS); private static final AtomicLongFieldUpdater responsesUpdater = AtomicLongFieldUpdater.newUpdater(PaxosPropose.class, "responses"); private static final AtomicReferenceFieldUpdater supersededByUpdater = AtomicReferenceFieldUpdater.newUpdater(PaxosPropose.class, Ballot.class, "supersededBy"); @@ -126,6 +138,10 @@ private static class MaybeFailure extends Status final int participants; /** Number of accepts required */ final int required; + + /** Repairing an in flight txn not proposing a new one **/ + final boolean isForRecovery; + /** Invoke on reaching a terminal status */ final OnDone onDone; @@ -145,7 +161,9 @@ private static class MaybeFailure extends Status /** The newest superseding ballot from a refusal; only returned to the caller if we fail to reach a quorum */ private volatile Ballot supersededBy; - private PaxosPropose(Proposal proposal, int participants, int required, boolean waitForNoSideEffect, OnDone onDone) + private volatile boolean needsConsensusMigration = false; + + private PaxosPropose(Proposal proposal, int participants, int required, boolean waitForNoSideEffect, boolean isForRecovery, OnDone onDone) { this.proposal = proposal; assert required > 0; @@ -153,6 +171,7 @@ private PaxosPropose(Proposal proposal, int participants, int required, boolean this.participants = participants; this.required = required; this.onDone = onDone; + this.isForRecovery = isForRecovery; } /** @@ -160,8 +179,10 @@ private PaxosPropose(Proposal proposal, int participants, int required, boolean * or for the present status if the time elapses without a final result being reached. * @param waitForNoSideEffect if true, on failure we will wait until we can say with certainty there are no side effects * or until we know we will never be able to determine this with certainty + * @param isForRecovery if true the value being proposed is not a new value it is a value from an existing in flight proposal + * and will be allowed to proceed even if the key is migrating to a different consensus protocol */ - static Paxos.Async propose(Proposal proposal, Paxos.Participants participants, boolean waitForNoSideEffect) + static Paxos.Async propose(Proposal proposal, Paxos.Participants participants, boolean waitForNoSideEffect, boolean isForRecovery) { if (waitForNoSideEffect && proposal.update.isEmpty()) waitForNoSideEffect = false; // by definition this has no "side effects" (besides linearizing the operation) @@ -169,9 +190,9 @@ static Paxos.Async propose(Proposal proposal, Paxos.Participants partici // to avoid unnecessary object allocations we extend PaxosPropose to implements Paxos.Async class Async extends PaxosPropose> implements Paxos.Async { - private Async(Proposal proposal, int participants, int required, boolean waitForNoSideEffect) + private Async(Proposal proposal, int participants, int required, boolean waitForNoSideEffect, boolean isForRecovery) { - super(proposal, participants, required, waitForNoSideEffect, newConditionAsConsumer()); + super(proposal, participants, required, waitForNoSideEffect, isForRecovery, newConditionAsConsumer()); } public Status awaitUntil(long deadline) @@ -190,24 +211,24 @@ public Status awaitUntil(long deadline) } } - Async propose = new Async(proposal, participants.sizeOfPoll(), participants.sizeOfConsensusQuorum, waitForNoSideEffect); + Async propose = new Async(proposal, participants.sizeOfPoll(), participants.sizeOfConsensusQuorum, waitForNoSideEffect, isForRecovery); propose.start(participants); return propose; } - static > T propose(Proposal proposal, Paxos.Participants participants, boolean waitForNoSideEffect, T onDone) + static > T propose(Proposal proposal, Paxos.Participants participants, boolean waitForNoSideEffect, boolean isForRecovery, T onDone) { if (waitForNoSideEffect && proposal.update.isEmpty()) waitForNoSideEffect = false; // by definition this has no "side effects" (besides linearizing the operation) - PaxosPropose propose = new PaxosPropose<>(proposal, participants.sizeOfPoll(), participants.sizeOfConsensusQuorum, waitForNoSideEffect, onDone); + PaxosPropose propose = new PaxosPropose<>(proposal, participants.sizeOfPoll(), participants.sizeOfConsensusQuorum, waitForNoSideEffect, isForRecovery, onDone); propose.start(participants); return onDone; } void start(Paxos.Participants participants) { - Message message = Message.out(PAXOS2_PROPOSE_REQ, new Request(proposal), participants.isUrgent()); + Message message = Message.out(PAXOS2_PROPOSE_REQ, new Request(proposal, isForRecovery), participants.isUrgent()); boolean executeOnSelf = false; for (int i = 0, size = participants.sizeOfPoll(); i < size ; ++i) @@ -219,7 +240,7 @@ void start(Paxos.Participants participants) } if (executeOnSelf) - PAXOS2_PROPOSE_REQ.stage.execute(() -> executeOnSelf(proposal)); + PAXOS2_PROPOSE_REQ.stage.execute(() -> executeOnSelf(proposal, isForRecovery)); } /** @@ -230,40 +251,44 @@ Status status() long responses = this.responses; if (isSuccessful(responses)) - return success; + return STATUS_SUCCESS; - if (!canSucceed(responses) && supersededBy != null) + if (!canSucceed(responses) && (supersededBy != null || needsConsensusMigration)) { Superseded.SideEffects sideEffects = hasNoSideEffects(responses) ? NO : MAYBE; - return new Superseded(supersededBy, sideEffects); + return new Superseded(supersededBy, sideEffects, needsConsensusMigration); } return new MaybeFailure(new Paxos.MaybeFailure(participants, required, accepts(responses), failureReasonsAsMap())); } - private void executeOnSelf(Proposal proposal) + private void executeOnSelf(Proposal proposal, boolean isForRecovery) { - executeOnSelf(proposal, RequestHandler::execute); + executeOnSelf(proposal, isForRecovery, RequestHandler::execute); } - public void onResponse(Response response, InetAddressAndPort from) + public void onResponse(AcceptResult acceptResult, InetAddressAndPort from) { + checkArgument(!isForRecovery || acceptResult.rejectedDueToConsensusMigration == false, "Repair should never be rejected due to consensus migration"); if (logger.isTraceEnabled()) - logger.trace("{} for {} from {}", response, proposal, from); + logger.trace("{} for {} from {}", acceptResult, proposal, from); - Ballot supersededBy = response.supersededBy; + Ballot supersededBy = acceptResult.supersededBy; if (supersededBy != null) supersededByUpdater.accumulateAndGet(this, supersededBy, (a, b) -> a == null ? b : b.uuidTimestamp() > a.uuidTimestamp() ? b : a); - long increment = supersededBy == null + long increment = supersededBy == null && !acceptResult.rejectedDueToConsensusMigration ? ACCEPT_INCREMENT : REFUSAL_INCREMENT; + if (acceptResult.rejectedDueToConsensusMigration) + needsConsensusMigration = true; + update(increment); } @Override - public void onFailure(InetAddressAndPort from, RequestFailureReason reason) + public void onFailure(InetAddressAndPort from, RequestFailure reason) { if (logger.isTraceEnabled()) logger.trace("{} {} failure from {}", proposal, reason, from); @@ -375,29 +400,21 @@ private static int failures(long responses) static class Request { final Proposal proposal; - Request(Proposal proposal) + final boolean isForRecovery; + Request(Proposal proposal, boolean isForRecovery) { this.proposal = proposal; + this.isForRecovery = isForRecovery; } + @Override public String toString() { - return proposal.toString("Propose"); - } - } - - /** - * The response to a proposal, indicating success (if {@code supersededBy == null}, - * or failure, alongside the ballot that beat us - */ - static class Response - { - final Ballot supersededBy; - Response(Ballot supersededBy) - { - this.supersededBy = supersededBy; + return "Request{" + + "proposal=" + proposal.toString("Propose") + + ", isForRecovery=" + isForRecovery + + '}'; } - public String toString() { return supersededBy == null ? "Accept" : "RejectProposal(supersededBy=" + supersededBy + ')'; } } /** @@ -408,14 +425,15 @@ public static class RequestHandler implements IVerbHandler @Override public void doVerb(Message message) { - Response response = execute(message.payload.proposal, message.from()); - if (response == null) + ClusterMetadataService.instance().fetchLogFromCMS(message.epoch()); + AcceptResult acceptResult = execute(message.payload.proposal, message.payload.isForRecovery, message.from()); + if (acceptResult == null) MessagingService.instance().respondWithFailure(UNKNOWN, message); else - MessagingService.instance().respond(response, message); + MessagingService.instance().respond(acceptResult, message); } - public static Response execute(Proposal proposal, InetAddressAndPort from) + public static AcceptResult execute(Proposal proposal, boolean isForRecovery, InetAddressAndPort from) { if (!Paxos.isInRangeAndShouldProcess(from, proposal.update.partitionKey(), proposal.update.metadata(), false)) return null; @@ -423,7 +441,7 @@ public static Response execute(Proposal proposal, InetAddressAndPort from) long start = nanoTime(); try (PaxosState state = PaxosState.get(proposal)) { - return new Response(state.acceptIfLatest(proposal)); + return state.acceptIfLatest(proposal, isForRecovery); } finally { @@ -438,42 +456,61 @@ public static class RequestSerializer implements IVersionedSerializer public void serialize(Request request, DataOutputPlus out, int version) throws IOException { Proposal.serializer.serialize(request.proposal, out, version); + if (version >= MessagingService.VERSION_51) + out.writeBoolean(request.isForRecovery); } @Override public Request deserialize(DataInputPlus in, int version) throws IOException { Proposal propose = Proposal.serializer.deserialize(in, version); - return new Request(propose); + boolean isForRecovery = false; + if (version >= MessagingService.VERSION_51) + isForRecovery = in.readBoolean(); + return new Request(propose, isForRecovery); } @Override public long serializedSize(Request request, int version) { - return Proposal.serializer.serializedSize(request.proposal, version); + long size = Proposal.serializer.serializedSize(request.proposal, version); + if (version >= MessagingService.VERSION_51) + size += TypeSizes.sizeof(request.isForRecovery); + return size; } } - public static class ResponseSerializer implements IVersionedSerializer + public static class AcceptResultSerializer implements IVersionedSerializer { - public void serialize(Response response, DataOutputPlus out, int version) throws IOException + public void serialize(PaxosState.AcceptResult acceptResult, DataOutputPlus out, int version) throws IOException { - out.writeBoolean(response.supersededBy != null); - if (response.supersededBy != null) - response.supersededBy.serialize(out); + out.writeBoolean(acceptResult.supersededBy != null); + if (acceptResult.supersededBy != null) + acceptResult.supersededBy.serialize(out); + if (version >= MessagingService.VERSION_51) + out.writeBoolean(acceptResult.rejectedDueToConsensusMigration); } - public Response deserialize(DataInputPlus in, int version) throws IOException + public AcceptResult deserialize(DataInputPlus in, int version) throws IOException { boolean isSuperseded = in.readBoolean(); - return isSuperseded ? new Response(Ballot.deserialize(in)) : new Response(null); + Ballot supersededBy = null; + if (isSuperseded) + supersededBy = Ballot.deserialize(in); + boolean rejectedDueToConsensusMigration = false; + if (version >= MessagingService.VERSION_51) + rejectedDueToConsensusMigration = in.readBoolean(); + return new AcceptResult(supersededBy, rejectedDueToConsensusMigration); } - public long serializedSize(Response response, int version) + public long serializedSize(AcceptResult acceptResult, int version) { - return response.supersededBy != null + long size = acceptResult.supersededBy != null ? TypeSizes.sizeof(true) + Ballot.sizeInBytes() : TypeSizes.sizeof(false); + if (version >= MessagingService.VERSION_51) + size += TypeSizes.sizeof(acceptResult.rejectedDueToConsensusMigration); + return size; } } } diff --git a/src/java/org/apache/cassandra/service/paxos/PaxosRepair.java b/src/java/org/apache/cassandra/service/paxos/PaxosRepair.java index e36e1d352250..8c9f219a3b85 100644 --- a/src/java/org/apache/cassandra/service/paxos/PaxosRepair.java +++ b/src/java/org/apache/cassandra/service/paxos/PaxosRepair.java @@ -20,7 +20,12 @@ import java.io.IOException; -import java.util.*; +import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; import java.util.function.Function; @@ -40,7 +45,7 @@ import org.apache.cassandra.db.Keyspace; import org.apache.cassandra.dht.Range; import org.apache.cassandra.dht.Token; -import org.apache.cassandra.exceptions.RequestFailureReason; +import org.apache.cassandra.exceptions.RequestFailure; import org.apache.cassandra.exceptions.UnavailableException; import org.apache.cassandra.io.IVersionedSerializer; import org.apache.cassandra.io.util.DataInputPlus; @@ -56,6 +61,7 @@ import org.apache.cassandra.schema.Schema; import org.apache.cassandra.schema.TableId; import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.service.paxos.PaxosPropose.Superseded; import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.tcm.membership.NodeId; import org.apache.cassandra.utils.CassandraVersion; @@ -63,21 +69,32 @@ import org.apache.cassandra.utils.FBUtilities; import org.apache.cassandra.utils.MonotonicClock; +import static com.google.common.base.Preconditions.checkState; +import static java.util.concurrent.TimeUnit.NANOSECONDS; import static org.apache.cassandra.concurrent.ExecutorFactory.Global.executorFactory; import static org.apache.cassandra.config.CassandraRelevantProperties.PAXOS_REPAIR_RETRY_TIMEOUT_IN_MS; import static org.apache.cassandra.config.CassandraRelevantProperties.SKIP_PAXOS_REPAIR_VERSION_VALIDATION; import static org.apache.cassandra.exceptions.RequestFailureReason.UNKNOWN; import static org.apache.cassandra.net.Verb.PAXOS2_REPAIR_REQ; -import static java.util.concurrent.TimeUnit.NANOSECONDS; -import static org.apache.cassandra.service.paxos.Commit.*; +import static org.apache.cassandra.service.paxos.Commit.Accepted; +import static org.apache.cassandra.service.paxos.Commit.Committed; +import static org.apache.cassandra.service.paxos.Commit.Proposal; +import static org.apache.cassandra.service.paxos.Commit.isAfter; +import static org.apache.cassandra.service.paxos.Commit.latest; +import static org.apache.cassandra.service.paxos.Commit.timestampsClash; import static org.apache.cassandra.service.paxos.ContentionStrategy.Type.REPAIR; import static org.apache.cassandra.service.paxos.ContentionStrategy.waitUntilForContention; -import static org.apache.cassandra.service.paxos.Paxos.*; -import static org.apache.cassandra.service.paxos.PaxosPrepare.*; +import static org.apache.cassandra.service.paxos.Paxos.Participants; +import static org.apache.cassandra.service.paxos.Paxos.isInRangeAndShouldProcess; +import static org.apache.cassandra.service.paxos.Paxos.staleBallotNewerThan; +import static org.apache.cassandra.service.paxos.PaxosPrepare.FoundIncompleteAccepted; +import static org.apache.cassandra.service.paxos.PaxosPrepare.FoundIncompleteCommitted; +import static org.apache.cassandra.service.paxos.PaxosPrepare.Status; +import static org.apache.cassandra.service.paxos.PaxosPrepare.prepareWithBallot; import static org.apache.cassandra.utils.Clock.Global.nanoTime; import static org.apache.cassandra.utils.NullableSerializer.deserializeNullable; import static org.apache.cassandra.utils.NullableSerializer.serializeNullable; -import static org.apache.cassandra.utils.NullableSerializer.serializedSizeNullable; +import static org.apache.cassandra.utils.NullableSerializer.serializedNullableSize; /** * Facility to finish any in-progress paxos transaction, and ensure that a quorum of nodes agree on the most recent operation. @@ -126,10 +143,11 @@ public class PaxosRepair extends AbstractPaxosRepair public static final RequestSerializer requestSerializer = new RequestSerializer(); public static final ResponseSerializer responseSerializer = new ResponseSerializer(); public static final RequestHandler requestHandler = new RequestHandler(); - private static final long RETRY_TIMEOUT_NANOS = getRetryTimeoutNanos(); private static final ScheduledExecutorPlus RETRIES = executorFactory().scheduled("PaxosRepairRetries"); + private static final long RETRY_TIMEOUT_NANOS = getRetryTimeoutNanos(); + private static long getRetryTimeoutNanos() { long retryMillis = PAXOS_REPAIR_RETRY_TIMEOUT_IN_MS.getLong(); @@ -174,7 +192,7 @@ private class Querying extends State implements RequestCallbackWithFailure i1.onFailure()); } @@ -254,6 +272,7 @@ else if (isAcceptedButNotCommitted && !isPromisedButNotAccepted && !reproposalMa { if (logger.isTraceEnabled()) logger.trace("PaxosRepair of {} completing {}", partitionKey(), latestAccepted); + // We need to complete this in-progress accepted proposal, which may not have been seen by a majority // However, since we have not sought any promises, we can simply complete the existing proposal // since this is an idempotent operation - both us and the original proposer (and others) can @@ -262,8 +281,7 @@ else if (isAcceptedButNotCommitted && !isPromisedButNotAccepted && !reproposalMa // If ballots with same timestamp have been both accepted and rejected by different nodes, // to avoid a livelock we simply try to poison, knowing we will fail but use a new ballot // (note there are alternative approaches but this is conservative) - - return PaxosPropose.propose(latestAccepted, participants, false, + return PaxosPropose.propose(latestAccepted, participants, false, true, new ProposingRepair(latestAccepted)); } else if (isAcceptedButNotCommitted || isPromisedButNotAccepted || latestWitnessed.compareTo(latestPreviouslyWitnessed) < 0) @@ -321,9 +339,10 @@ public State execute(Status input) throws Throwable // (else an "earlier" operation can sneak in and invalidate us while we're proposing // with a newer ballot) FoundIncompleteAccepted incomplete = input.incompleteAccepted(); + Proposal propose = new Proposal(incomplete.ballot, incomplete.accepted.update); logger.trace("PaxosRepair of {} found incomplete {}", partitionKey(), incomplete.accepted); - return PaxosPropose.propose(propose, participants, false, + return PaxosPropose.propose(propose, participants, false, true, new ProposingRepair(propose)); // we don't know if we're done, so we must restart } @@ -341,7 +360,7 @@ public State execute(Status input) throws Throwable // propose the empty ballot logger.trace("PaxosRepair of {} submitting empty proposal", partitionKey()); Proposal proposal = Proposal.empty(input.success().ballot, partitionKey(), table); - return PaxosPropose.propose(proposal, participants, false, + return PaxosPropose.propose(proposal, participants, false, true, new ProposingRepair(proposal)); } @@ -368,7 +387,9 @@ public State execute(PaxosPropose.Status input) return retry(this); case SUPERSEDED: - if (isAfter(input.superseded().by, prevSupersededBy)) + Superseded superseded = input.superseded(); + checkState(!superseded.needsConsensusMigration, "Repair should not encounter consensus migration rejection"); + if (isAfter(superseded.by, prevSupersededBy)) prevSupersededBy = input.superseded().by; return retry(this); @@ -408,9 +429,9 @@ public State execute(PaxosCommit.Status input) } } - private PaxosRepair(DecoratedKey partitionKey, Ballot incompleteBallot, TableMetadata table, ConsistencyLevel paxosConsistency) + private PaxosRepair(DecoratedKey partitionKey, @Nullable Ballot incompleteBallot, TableMetadata table, ConsistencyLevel paxosConsistency, long retryTimeoutNanos) { - super(partitionKey, incompleteBallot); + super(partitionKey, incompleteBallot, retryTimeoutNanos); // TODO: move precondition into super ctor Preconditions.checkArgument(paxosConsistency.isSerialConsistency()); this.table = table; @@ -420,12 +441,17 @@ private PaxosRepair(DecoratedKey partitionKey, Ballot incompleteBallot, TableMet public static PaxosRepair create(ConsistencyLevel consistency, DecoratedKey partitionKey, Ballot incompleteBallot, TableMetadata table) { - return new PaxosRepair(partitionKey, incompleteBallot, table, consistency); + return new PaxosRepair(partitionKey, incompleteBallot, table, consistency, RETRY_TIMEOUT_NANOS); + } + + public static PaxosRepair create(ConsistencyLevel consistency, DecoratedKey partitionKey, TableMetadata table, long retryTimeoutNanos) + { + return new PaxosRepair(partitionKey, null, table, consistency, retryTimeoutNanos); } private State retry(State state) { - Preconditions.checkState(isStarted()); + checkState(isStarted()); if (isResult(state)) return state; @@ -440,7 +466,7 @@ public State restart(State state, long waitUntil) participants = Participants.get(table, partitionKey(), paxosConsistency); - if (waitUntil > Long.MIN_VALUE && waitUntil - startedNanos() > RETRY_TIMEOUT_NANOS) + if (waitUntil > Long.MIN_VALUE && waitUntil - startedNanos() > retryTimeoutNanos) return new Failure(null); try @@ -462,7 +488,7 @@ public State restart(State state, long waitUntil) private ConsistencyLevel commitConsistency() { - Preconditions.checkState(paxosConsistency.isSerialConsistency()); + checkState(paxosConsistency.isSerialConsistency()); return paxosConsistency.isDatacenterLocal() ? ConsistencyLevel.LOCAL_QUORUM : ConsistencyLevel.QUORUM; } @@ -623,14 +649,14 @@ public static class ResponseSerializer implements IVersionedSerializer public void serialize(Response response, DataOutputPlus out, int version) throws IOException { response.latestWitnessedOrLowBound.serialize(out); - serializeNullable(Accepted.serializer, response.acceptedButNotCommitted, out, version); + serializeNullable(response.acceptedButNotCommitted, out, version, Accepted.serializer); Committed.serializer.serialize(response.committed, out, version); } public Response deserialize(DataInputPlus in, int version) throws IOException { Ballot latestWitnessed = Ballot.deserialize(in); - Accepted acceptedButNotCommitted = deserializeNullable(Accepted.serializer, in, version); + Accepted acceptedButNotCommitted = deserializeNullable(in, version, Accepted.serializer); Committed committed = Committed.serializer.deserialize(in, version); return new Response(latestWitnessed, acceptedButNotCommitted, committed); } @@ -638,7 +664,7 @@ public Response deserialize(DataInputPlus in, int version) throws IOException public long serializedSize(Response response, int version) { return Ballot.sizeInBytes() - + serializedSizeNullable(Accepted.serializer, response.acceptedButNotCommitted, version) + + serializedNullableSize(response.acceptedButNotCommitted, version, Accepted.serializer) + Committed.serializer.serializedSize(response.committed, version); } } diff --git a/src/java/org/apache/cassandra/service/paxos/PaxosRequestCallback.java b/src/java/org/apache/cassandra/service/paxos/PaxosRequestCallback.java index aad32ace0503..06509edfc89d 100644 --- a/src/java/org/apache/cassandra/service/paxos/PaxosRequestCallback.java +++ b/src/java/org/apache/cassandra/service/paxos/PaxosRequestCallback.java @@ -20,18 +20,21 @@ import java.util.function.BiFunction; +import org.apache.cassandra.config.DatabaseDescriptor; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.cassandra.config.CassandraRelevantProperties; -import org.apache.cassandra.exceptions.RequestFailureReason; +import org.apache.cassandra.exceptions.RequestFailure; import org.apache.cassandra.exceptions.WriteTimeoutException; import org.apache.cassandra.locator.InetAddressAndPort; import org.apache.cassandra.net.Message; import org.apache.cassandra.service.FailureRecordingCallback; +import org.apache.cassandra.tcm.ClusterMetadataService; +import org.apache.cassandra.utils.TriFunction; -import static org.apache.cassandra.exceptions.RequestFailureReason.TIMEOUT; -import static org.apache.cassandra.exceptions.RequestFailureReason.UNKNOWN; +import static org.apache.cassandra.exceptions.RequestFailure.TIMEOUT; +import static org.apache.cassandra.exceptions.RequestFailure.UNKNOWN; import static org.apache.cassandra.utils.FBUtilities.getBroadcastAddressAndPort; public abstract class PaxosRequestCallback extends FailureRecordingCallback @@ -44,6 +47,8 @@ public abstract class PaxosRequestCallback extends FailureRecordingCallback message) { + if (DatabaseDescriptor.getAccordTransactionsEnabled()) + ClusterMetadataService.instance().fetchLogFromPeerOrCMS(message.from(), message.epoch()); onResponse(message.payload, message.from()); } @@ -58,7 +63,7 @@ protected void executeOnSelf(I parameter, BiFunction void executeOnSelf(I parameter, BiFunction void executeOnSelf(I parameter1, J parameter2, TriFunction execute) + { + T response; + try + { + response = execute.apply(parameter1, parameter2, getBroadcastAddressAndPort()); + if (response == null) + return; + } + catch (Exception ex) + { + RequestFailure reason = UNKNOWN; + if (ex instanceof WriteTimeoutException) reason = TIMEOUT; + else logger.error("Failed to apply {}, {} locally", parameter1, parameter2, ex); + + onFailure(getBroadcastAddressAndPort(), reason); + return; + } + + onResponse(response, getBroadcastAddressAndPort()); + } + static boolean shouldExecuteOnSelf(InetAddressAndPort replica) { return USE_SELF_EXECUTION && replica.equals(getBroadcastAddressAndPort()); diff --git a/src/java/org/apache/cassandra/service/paxos/PaxosState.java b/src/java/org/apache/cassandra/service/paxos/PaxosState.java index a3f019e4bf25..44ab0b6e4a63 100644 --- a/src/java/org/apache/cassandra/service/paxos/PaxosState.java +++ b/src/java/org/apache/cassandra/service/paxos/PaxosState.java @@ -26,40 +26,60 @@ import java.util.concurrent.atomic.AtomicReferenceFieldUpdater; import java.util.function.BiConsumer; import java.util.function.Function; - import javax.annotation.Nonnull; import javax.annotation.Nullable; import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Preconditions; import com.google.common.primitives.Ints; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import com.github.benmanes.caffeine.cache.Caffeine; import org.apache.cassandra.concurrent.ImmediateExecutor; import org.apache.cassandra.config.DatabaseDescriptor; -import org.apache.cassandra.db.*; -import org.apache.cassandra.metrics.PaxosMetrics; -import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.db.ConsistencyLevel; +import org.apache.cassandra.db.DecoratedKey; +import org.apache.cassandra.db.Directories; +import org.apache.cassandra.db.Keyspace; +import org.apache.cassandra.db.Mutation; +import org.apache.cassandra.db.SystemKeyspace; +import org.apache.cassandra.db.WriteType; +import org.apache.cassandra.db.partitions.PartitionUpdate; import org.apache.cassandra.exceptions.ReadTimeoutException; import org.apache.cassandra.exceptions.RequestTimeoutException; import org.apache.cassandra.exceptions.WriteTimeoutException; +import org.apache.cassandra.metrics.PaxosMetrics; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.service.consensus.migration.ConsensusKeyMigrationState; +import org.apache.cassandra.service.consensus.migration.ConsensusRequestRouter; import org.apache.cassandra.service.paxos.uncommitted.PaxosBallotTracker; import org.apache.cassandra.service.paxos.uncommitted.PaxosStateTracker; import org.apache.cassandra.service.paxos.uncommitted.PaxosUncommittedTracker; import org.apache.cassandra.tracing.Tracing; import org.apache.cassandra.utils.Nemesis; +import static com.google.common.base.Preconditions.checkState; import static java.util.concurrent.TimeUnit.SECONDS; import static org.apache.cassandra.config.CassandraRelevantProperties.PAXOS_DISABLE_COORDINATOR_LOCKING; -import static org.apache.cassandra.utils.Clock.Global.nanoTime; import static org.apache.cassandra.config.Config.PaxosStatePurging.gc_grace; import static org.apache.cassandra.config.Config.PaxosStatePurging.legacy; import static org.apache.cassandra.config.DatabaseDescriptor.paxosStatePurging; -import static org.apache.cassandra.service.paxos.Commit.*; -import static org.apache.cassandra.service.paxos.PaxosState.MaybePromise.Outcome.*; +import static org.apache.cassandra.service.paxos.Commit.Accepted; import static org.apache.cassandra.service.paxos.Commit.Accepted.latestAccepted; +import static org.apache.cassandra.service.paxos.Commit.AcceptedWithTTL; +import static org.apache.cassandra.service.paxos.Commit.Agreed; +import static org.apache.cassandra.service.paxos.Commit.Committed; import static org.apache.cassandra.service.paxos.Commit.Committed.latestCommitted; +import static org.apache.cassandra.service.paxos.Commit.CommittedWithTTL; +import static org.apache.cassandra.service.paxos.Commit.Proposal; import static org.apache.cassandra.service.paxos.Commit.isAfter; +import static org.apache.cassandra.service.paxos.Commit.latest; +import static org.apache.cassandra.service.paxos.PaxosState.AcceptResult.RETRY_NEW_PROTOCOL; +import static org.apache.cassandra.service.paxos.PaxosState.AcceptResult.SUCCESS; +import static org.apache.cassandra.service.paxos.PaxosState.MaybePromise.Outcome.PERMIT_READ; +import static org.apache.cassandra.service.paxos.PaxosState.MaybePromise.Outcome.PROMISE; +import static org.apache.cassandra.service.paxos.PaxosState.MaybePromise.Outcome.REJECT; +import static org.apache.cassandra.utils.Clock.Global.nanoTime; /** * We save to memory the result of each operation before persisting to disk, however each operation that performs @@ -67,6 +87,9 @@ */ public class PaxosState implements PaxosOperationLock { + @SuppressWarnings("unused") + private static final Logger logger = LoggerFactory.getLogger(PaxosState.class.getName()); + private static volatile boolean DISABLE_COORDINATOR_LOCKING = PAXOS_DISABLE_COORDINATOR_LOCKING.getBoolean(); public static final ConcurrentHashMap ACTIVE = new ConcurrentHashMap<>(); public static final Map RECENT = Caffeine.newBuilder() @@ -114,7 +137,7 @@ public static PaxosBallotTracker ballotTracker() public static void initializeTrackers() { - Preconditions.checkState(TrackerHandle.tracker != null); + checkState(TrackerHandle.tracker != null); PaxosMetrics.initialize(); } @@ -621,7 +644,7 @@ else if (isAfter(ballot, latestWriteOrLowBound)) /** * Record an acceptance of the proposal if there is no newer promise; otherwise inform the caller of the newer ballot */ - public Ballot acceptIfLatest(Proposal proposal) + public AcceptResult acceptIfLatest(Proposal proposal, boolean isForRecovery) { if (paxosStatePurging() == legacy && !(proposal instanceof AcceptedWithTTL)) proposal = AcceptedWithTTL.withDefaultTTL(proposal); @@ -629,20 +652,31 @@ public Ballot acceptIfLatest(Proposal proposal) // state.promised can be null, because it is invalidated by committed; // we may also have accepted a newer proposal than we promised, so we confirm that we are the absolute newest // (or that we have the exact same ballot as our promise, which is the typical case) + boolean shouldRejectDueToConsensusMigration; Snapshot before, after; while (true) { Snapshot realBefore = current; before = realBefore.removeExpired((int)proposal.ballot.unix(SECONDS)); Ballot latest = before.latestWitnessedOrLowBound(); + if (isForRecovery) + shouldRejectDueToConsensusMigration = false; + else + shouldRejectDueToConsensusMigration = ConsensusRequestRouter.instance + .isKeyInMigratingOrMigratedRangeDuringPaxosAccept(proposal.update.metadata().id, + proposal.update.partitionKey()); if (!proposal.isSameOrAfter(latest)) { Tracing.trace("Rejecting proposal {}; latest is now {}", proposal.ballot, latest); - return latest; + return new AcceptResult(latest, shouldRejectDueToConsensusMigration); } - if (proposal.hasSameBallot(before.committed)) // TODO: consider not answering - return null; // no need to save anything, or indeed answer at all + if (shouldRejectDueToConsensusMigration) + return RETRY_NEW_PROTOCOL; + + // TODO: Consider not answering in the committed ballot case where there is no need to save anything or answer at all + if (proposal.hasSameBallot(before.committed)) + return null; after = new Snapshot(realBefore.promised, realBefore.promisedWrite, proposal.accepted(), realBefore.committed); if (currentUpdater.compareAndSet(this, realBefore, after)) @@ -659,7 +693,8 @@ public Ballot acceptIfLatest(Proposal proposal) // though this Tracing.trace("Accepting proposal {}", proposal); SystemKeyspace.savePaxosProposal(proposal); - return null; + checkState(!shouldRejectDueToConsensusMigration); + return SUCCESS; } public void commit(Agreed commit) @@ -781,6 +816,8 @@ public static Boolean legacyPropose(Commit proposal) boolean accept = proposal.isSameOrAfter(before.latestWitnessedOrLowBound()); if (accept) { + PartitionUpdate partitionUpdate = proposal.update; + checkState(ConsensusKeyMigrationState.getKeyMigrationState(partitionUpdate.metadata().id, partitionUpdate.partitionKey()).tableMigrationState == null, "Using PaxosV1 while consensus migration is in progress is not supported"); if (proposal.hasSameBallot(before.committed) || currentUpdater.compareAndSet(unsafeState, realBefore, new Snapshot(realBefore.promised, realBefore.promisedWrite, @@ -819,4 +856,35 @@ public static Snapshot unsafeGetIfPresent(DecoratedKey partitionKey, TableMetada if (cur != null) return cur.current; return RECENT.get(key); } + + /** + * The response to a proposal, indicating success (if {@code supersededBy == null}, + * or failure, alongside the ballot that beat us + */ + public static class AcceptResult + { + static final AcceptResult SUCCESS = new AcceptResult(false); + + static final AcceptResult RETRY_NEW_PROTOCOL = new AcceptResult(true); + + @Nullable + public final Ballot supersededBy; + + public final boolean rejectedDueToConsensusMigration; + + public AcceptResult(@Nullable Ballot supersededBy, boolean rejectedDueToConsensusMigration) + { + this.supersededBy = supersededBy; + this.rejectedDueToConsensusMigration = rejectedDueToConsensusMigration; + } + + // Success result + private AcceptResult(boolean rejectedDueToConsensusMigration) + { + supersededBy = null; + this.rejectedDueToConsensusMigration = rejectedDueToConsensusMigration; + } + + public String toString() { return supersededBy == null && !rejectedDueToConsensusMigration ? "Accept" : "RejectProposal(supersededBy=" + supersededBy + ", rejectedDueToConsensusMigration=" + rejectedDueToConsensusMigration + ')'; } + } } diff --git a/src/java/org/apache/cassandra/service/paxos/cleanup/PaxosCleanupComplete.java b/src/java/org/apache/cassandra/service/paxos/cleanup/PaxosCleanupComplete.java index 682375668a17..d875d5c63cdc 100644 --- a/src/java/org/apache/cassandra/service/paxos/cleanup/PaxosCleanupComplete.java +++ b/src/java/org/apache/cassandra/service/paxos/cleanup/PaxosCleanupComplete.java @@ -19,7 +19,12 @@ package org.apache.cassandra.service.paxos.cleanup; import java.io.IOException; -import java.util.*; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Set; import org.apache.cassandra.db.ColumnFamilyStore; import org.apache.cassandra.db.TypeSizes; @@ -27,7 +32,7 @@ import org.apache.cassandra.dht.IPartitioner; import org.apache.cassandra.dht.Range; import org.apache.cassandra.dht.Token; -import org.apache.cassandra.exceptions.RequestFailureReason; +import org.apache.cassandra.exceptions.RequestFailure; import org.apache.cassandra.io.IVersionedSerializer; import org.apache.cassandra.io.util.DataInputPlus; import org.apache.cassandra.io.util.DataOutputPlus; @@ -78,7 +83,7 @@ public synchronized void run() } @Override - public void onFailure(InetAddressAndPort from, RequestFailureReason reason) + public void onFailure(InetAddressAndPort from, RequestFailure reason) { tryFailure(new PaxosCleanupException("Timed out waiting on response from " + from)); } diff --git a/src/java/org/apache/cassandra/service/paxos/cleanup/PaxosCleanupLocalCoordinator.java b/src/java/org/apache/cassandra/service/paxos/cleanup/PaxosCleanupLocalCoordinator.java index a53fec3e6081..7e5935f03d4a 100644 --- a/src/java/org/apache/cassandra/service/paxos/cleanup/PaxosCleanupLocalCoordinator.java +++ b/src/java/org/apache/cassandra/service/paxos/cleanup/PaxosCleanupLocalCoordinator.java @@ -24,6 +24,7 @@ import java.util.concurrent.ConcurrentHashMap; import com.google.common.base.Preconditions; +import com.google.common.util.concurrent.Uninterruptibles; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -41,9 +42,15 @@ import org.apache.cassandra.service.paxos.PaxosState; import org.apache.cassandra.service.paxos.uncommitted.UncommittedPaxosKey; import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.utils.Clock; import org.apache.cassandra.utils.CloseableIterator; import org.apache.cassandra.utils.concurrent.AsyncFuture; +import static java.util.concurrent.TimeUnit.MICROSECONDS; +import static java.util.concurrent.TimeUnit.MILLISECONDS; +import static java.util.concurrent.TimeUnit.SECONDS; +import static org.apache.cassandra.config.DatabaseDescriptor.getCasContentionTimeout; +import static org.apache.cassandra.config.DatabaseDescriptor.getWriteRpcTimeout; import static org.apache.cassandra.service.paxos.cleanup.PaxosCleanupSession.TIMEOUT_NANOS; public class PaxosCleanupLocalCoordinator extends AsyncFuture @@ -134,8 +141,10 @@ private void scheduleKeyRepairsOrFinish() return; } + long txnTimeoutMicros = Math.max(getCasContentionTimeout(MICROSECONDS), getWriteRpcTimeout(MICROSECONDS)); + boolean waitForCoordinator = DatabaseDescriptor.getPaxosRepairRaceWait(); while (inflight.size() < parallelism && uncommittedIter.hasNext()) - repairKey(uncommittedIter.next()); + repairKey(uncommittedIter.next(), txnTimeoutMicros, waitForCoordinator); } @@ -143,7 +152,7 @@ private void scheduleKeyRepairsOrFinish() finish(); } - private boolean repairKey(UncommittedPaxosKey uncommitted) + private boolean repairKey(UncommittedPaxosKey uncommitted, long txnTimeoutMicros, boolean waitForCoordinator) { logger.trace("repairing {}", uncommitted); Preconditions.checkState(!inflight.containsKey(uncommitted.getKey())); @@ -154,6 +163,9 @@ private boolean repairKey(UncommittedPaxosKey uncommitted) if (consistency == null) return false; + if (waitForCoordinator) + maybeWaitForOriginalCoordinator(uncommitted, txnTimeoutMicros); + inflight.put(uncommitted.getKey(), tableRepairs.startOrGetOrQueue(uncommitted.getKey(), uncommitted.ballot(), uncommitted.getConsistencyLevel(), table, result -> { if (result.wasSuccessful()) onKeyFinish(uncommitted.getKey()); @@ -163,6 +175,24 @@ private boolean repairKey(UncommittedPaxosKey uncommitted) return true; } + /** + * Wait to repair things that are still potentially executing at the original coordinator to avoid + * causing timeouts. This should only have to happen at most a few times when the repair starts + */ + private static void maybeWaitForOriginalCoordinator(UncommittedPaxosKey uncommitted, long txnTimeoutMicros) + { + long nowMicros = MILLISECONDS.toMicros(Clock.Global.currentTimeMillis()); + long ballotElapsedMicros = nowMicros - uncommitted.ballot().unixMicros(); + if (ballotElapsedMicros < 0 && Math.abs(ballotElapsedMicros) > SECONDS.toMicros(1)) + logger.warn("Encountered ballot that is more than 1 second in the future, is there a clock sync issue? {}", uncommitted.ballot()); + if (ballotElapsedMicros < txnTimeoutMicros) + { + long sleepMicros = txnTimeoutMicros - ballotElapsedMicros; + logger.info("Paxos auto repair encountered a potentially in progress ballot, sleeping {}us to allow the in flight operation to finish", sleepMicros); + Uninterruptibles.sleepUninterruptibly(sleepMicros, MICROSECONDS); + } + } + private synchronized void onKeyFinish(DecoratedKey key) { if (!inflight.containsKey(key)) diff --git a/src/java/org/apache/cassandra/service/paxos/cleanup/PaxosCleanupSession.java b/src/java/org/apache/cassandra/service/paxos/cleanup/PaxosCleanupSession.java index 80f571cd26c4..62b288986e4a 100644 --- a/src/java/org/apache/cassandra/service/paxos/cleanup/PaxosCleanupSession.java +++ b/src/java/org/apache/cassandra/service/paxos/cleanup/PaxosCleanupSession.java @@ -19,7 +19,9 @@ package org.apache.cassandra.service.paxos.cleanup; import java.lang.ref.WeakReference; -import java.util.*; +import java.util.Collection; +import java.util.Queue; +import java.util.UUID; import java.util.concurrent.ConcurrentLinkedQueue; import java.util.concurrent.ScheduledFuture; import java.util.concurrent.TimeUnit; @@ -28,7 +30,7 @@ import org.apache.cassandra.dht.Range; import org.apache.cassandra.dht.Token; -import org.apache.cassandra.exceptions.RequestFailureReason; +import org.apache.cassandra.exceptions.RequestFailure; import org.apache.cassandra.gms.ApplicationState; import org.apache.cassandra.gms.EndpointState; import org.apache.cassandra.gms.IEndpointStateChangeSubscriber; @@ -239,7 +241,7 @@ public void convict(InetAddressAndPort ep, double phi) } @Override - public void onFailure(InetAddressAndPort from, RequestFailureReason reason) + public void onFailure(InetAddressAndPort from, RequestFailure reason) { fail(from.toString() + ' ' + reason + " for cleanup request for paxos cleanup session " + session); } diff --git a/src/java/org/apache/cassandra/service/paxos/cleanup/PaxosFinishPrepareCleanup.java b/src/java/org/apache/cassandra/service/paxos/cleanup/PaxosFinishPrepareCleanup.java index 07b1bbe33405..0fc189f2e428 100644 --- a/src/java/org/apache/cassandra/service/paxos/cleanup/PaxosFinishPrepareCleanup.java +++ b/src/java/org/apache/cassandra/service/paxos/cleanup/PaxosFinishPrepareCleanup.java @@ -18,9 +18,11 @@ package org.apache.cassandra.service.paxos.cleanup; -import java.util.*; +import java.util.Collection; +import java.util.HashSet; +import java.util.Set; -import org.apache.cassandra.exceptions.RequestFailureReason; +import org.apache.cassandra.exceptions.RequestFailure; import org.apache.cassandra.locator.InetAddressAndPort; import org.apache.cassandra.net.IVerbHandler; import org.apache.cassandra.net.Message; @@ -51,7 +53,7 @@ public static PaxosFinishPrepareCleanup finish(SharedContext ctx, Collection private static void add(SharedContext ctx, AtomicReference pendingCleanup, Message message) { PendingCleanup next = new PendingCleanup(message); - PendingCleanup prev = IntrusiveStack.push(AtomicReference::get, AtomicReference::compareAndSet, pendingCleanup, next); + PendingCleanup prev = IntrusiveStack.getAndPush(AtomicReference::get, AtomicReference::compareAndSet, pendingCleanup, next); if (prev == null) Stage.MISC.execute(() -> cleanup(ctx, pendingCleanup)); } diff --git a/src/java/org/apache/cassandra/service/paxos/cleanup/PaxosStartPrepareCleanup.java b/src/java/org/apache/cassandra/service/paxos/cleanup/PaxosStartPrepareCleanup.java index a375481c3489..71f02d374a5b 100644 --- a/src/java/org/apache/cassandra/service/paxos/cleanup/PaxosStartPrepareCleanup.java +++ b/src/java/org/apache/cassandra/service/paxos/cleanup/PaxosStartPrepareCleanup.java @@ -19,17 +19,24 @@ package org.apache.cassandra.service.paxos.cleanup; import java.io.IOException; -import java.util.*; +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import org.apache.cassandra.config.DatabaseDescriptor; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.cassandra.db.*; +import org.apache.cassandra.db.ColumnFamilyStore; +import org.apache.cassandra.db.ConsistencyLevel; +import org.apache.cassandra.db.TypeSizes; import org.apache.cassandra.dht.AbstractBounds; import org.apache.cassandra.dht.IPartitioner; import org.apache.cassandra.dht.Range; import org.apache.cassandra.dht.Token; -import org.apache.cassandra.exceptions.RequestFailureReason; +import org.apache.cassandra.exceptions.RequestFailure; import org.apache.cassandra.gms.EndpointState; import org.apache.cassandra.gms.HeartBeatState; import org.apache.cassandra.io.IVersionedSerializer; @@ -47,6 +54,7 @@ import org.apache.cassandra.service.paxos.Ballot; import org.apache.cassandra.service.paxos.Commit; import org.apache.cassandra.service.paxos.PaxosRepairHistory; +import org.apache.cassandra.tcm.ClusterMetadataService; import org.apache.cassandra.utils.concurrent.AsyncFuture; import static org.apache.cassandra.net.Verb.PAXOS2_CLEANUP_START_PREPARE_REQ; @@ -94,7 +102,7 @@ public static PaxosStartPrepareCleanup prepare(SharedContext ctx, TableId tableI } @Override - public void onFailure(InetAddressAndPort from, RequestFailureReason reason) + public void onFailure(InetAddressAndPort from, RequestFailure reason) { tryFailure(new PaxosCleanupException("Received " + reason + " failure response from " + from)); } @@ -178,6 +186,8 @@ public long serializedSize(Request request, int version) public static IVerbHandler createVerbHandler(SharedContext ctx) { return in -> { + if (DatabaseDescriptor.getAccordTransactionsEnabled()) + ClusterMetadataService.instance().fetchLogFromPeerOrCMS(in.from(), in.epoch()); ColumnFamilyStore table = Schema.instance.getColumnFamilyStoreInstance(in.payload.tableId); // Note: pre-5.1 we would use gossip state included in the request payload to update topology // prior to cleanup. Topology is no longer derived from gossip state, so this has been removed. diff --git a/src/java/org/apache/cassandra/service/paxos/uncommitted/PaxosBallotTracker.java b/src/java/org/apache/cassandra/service/paxos/uncommitted/PaxosBallotTracker.java index 41314a2eafe5..06e4881fb4f0 100644 --- a/src/java/org/apache/cassandra/service/paxos/uncommitted/PaxosBallotTracker.java +++ b/src/java/org/apache/cassandra/service/paxos/uncommitted/PaxosBallotTracker.java @@ -38,7 +38,7 @@ import org.apache.cassandra.service.paxos.Commit; import static org.apache.cassandra.io.util.SequentialWriterOption.FINISH_ON_CLOSE; -import static org.apache.cassandra.net.Crc.crc32; +import static org.apache.cassandra.utils.Crc.crc32; /** * Tracks the highest paxos ballot we've seen, and the lowest ballot we can accept. diff --git a/src/java/org/apache/cassandra/service/paxos/uncommitted/PaxosRows.java b/src/java/org/apache/cassandra/service/paxos/uncommitted/PaxosRows.java index 5cca1def581d..1aae7ca8d342 100644 --- a/src/java/org/apache/cassandra/service/paxos/uncommitted/PaxosRows.java +++ b/src/java/org/apache/cassandra/service/paxos/uncommitted/PaxosRows.java @@ -76,7 +76,7 @@ private PaxosRows() {} private static ColumnMetadata paxosColumn(String name, AbstractType type) { - return ColumnMetadata.regularColumn(SchemaConstants.SYSTEM_KEYSPACE_NAME, SystemKeyspace.PAXOS, name, type); + return ColumnMetadata.regularColumn(SchemaConstants.SYSTEM_KEYSPACE_NAME, SystemKeyspace.PAXOS, name, type, ColumnMetadata.NO_UNIQUE_ID); } public static Ballot getPromise(Row row) diff --git a/src/java/org/apache/cassandra/service/paxos/uncommitted/PaxosStateTracker.java b/src/java/org/apache/cassandra/service/paxos/uncommitted/PaxosStateTracker.java index 69494ee60e26..3e90c83c5b28 100644 --- a/src/java/org/apache/cassandra/service/paxos/uncommitted/PaxosStateTracker.java +++ b/src/java/org/apache/cassandra/service/paxos/uncommitted/PaxosStateTracker.java @@ -254,7 +254,7 @@ private void updateLowBoundFromRepairHistory() throws IOException Ballot lowBound = null; ListType listType = ListType.getInstance(BytesType.instance, false); - ColumnMetadata pointsColumn = ColumnMetadata.regularColumn(SYSTEM_KEYSPACE_NAME, PAXOS_REPAIR_HISTORY, "points", listType); + ColumnMetadata pointsColumn = ColumnMetadata.regularColumn(SYSTEM_KEYSPACE_NAME, PAXOS_REPAIR_HISTORY, "points", listType, ColumnMetadata.NO_UNIQUE_ID); try (ReadExecutionController controller = query.executionController(); PartitionIterator partitions = query.executeInternal(controller)) { while (partitions.hasNext()) diff --git a/src/java/org/apache/cassandra/service/paxos/v1/PrepareCallback.java b/src/java/org/apache/cassandra/service/paxos/v1/PrepareCallback.java index 4aedb6d63dee..717acf4ab58f 100644 --- a/src/java/org/apache/cassandra/service/paxos/v1/PrepareCallback.java +++ b/src/java/org/apache/cassandra/service/paxos/v1/PrepareCallback.java @@ -19,11 +19,16 @@ package org.apache.cassandra.service.paxos.v1; +import java.util.Collections; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.TimeUnit; import com.google.common.collect.Iterables; +import org.apache.cassandra.config.Config; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.db.SystemKeyspace; import org.apache.cassandra.locator.InetAddressAndPort; import org.apache.cassandra.schema.TableMetadata; import org.apache.cassandra.db.ConsistencyLevel; @@ -35,6 +40,7 @@ import org.apache.cassandra.service.paxos.Commit; import org.apache.cassandra.service.paxos.PrepareResponse; import org.apache.cassandra.transport.Dispatcher; +import org.apache.cassandra.utils.FBUtilities; public class PrepareCallback extends AbstractPaxosCallback { @@ -79,8 +85,28 @@ public synchronized void onResponse(Message message) latch.decrement(); } - public Iterable replicasMissingMostRecentCommit() + public Iterable replicasMissingMostRecentCommit(TableMetadata metadata) { + /** + * this check is only needed for mixed mode operation with 4.0 and can be removed once upgrade support dropped + * see the comment in {@link org.apache.cassandra.distributed.upgrade.MixedModePaxosTTLTest} for a full explanation. + */ + if (DatabaseDescriptor.paxosStatePurging() == Config.PaxosStatePurging.legacy) + { + // In general, we need every replicas that have answered to the prepare (a quorum) to agree on the MRC (see + // coment in StorageProxy.beginAndRepairPaxos(), but basically we need to make sure at least a quorum of nodes + // have learn a commit before commit a new one otherwise that previous commit is not guaranteed to have reach a + // quorum and further commit may proceed on incomplete information). + // However, if that commit is too hold, it may have been expired from some of the replicas paxos table (we don't + // keep the paxos state forever or that could grow unchecked), and we could end up in some infinite loop as + // explained on CASSANDRA-12043. To avoid that, we ignore an MRC that is too old, i.e. older than the TTL we set + // on paxos tables. For such an old commit, we rely on hints and repair to ensure the commit has indeed been + // propagated to all nodes. + long paxosTtlSec = SystemKeyspace.legacyPaxosTtlSec(metadata); + if (TimeUnit.MICROSECONDS.toSeconds(mostRecentCommit.ballot.unixMicros()) + paxosTtlSec < FBUtilities.nowInSeconds()) + return Collections.emptySet(); + } + return Iterables.filter(commitsByReplica.keySet(), inetAddress -> (!commitsByReplica.get(inetAddress).ballot.equals(mostRecentCommit.ballot))); } } diff --git a/src/java/org/apache/cassandra/service/reads/AbstractReadExecutor.java b/src/java/org/apache/cassandra/service/reads/AbstractReadExecutor.java index 88b3ba49fab1..1774164fb964 100644 --- a/src/java/org/apache/cassandra/service/reads/AbstractReadExecutor.java +++ b/src/java/org/apache/cassandra/service/reads/AbstractReadExecutor.java @@ -66,6 +66,7 @@ public abstract class AbstractReadExecutor { private static final Logger logger = LoggerFactory.getLogger(AbstractReadExecutor.class); + protected final ReadCoordinator coordinator; protected final ReadCommand command; private final ReplicaPlan.SharedForTokenRead replicaPlan; protected final ReadRepair readRepair; @@ -78,14 +79,15 @@ public abstract class AbstractReadExecutor private final int initialDataRequestCount; protected volatile PartitionIterator result = null; - AbstractReadExecutor(ColumnFamilyStore cfs, ReadCommand command, ReplicaPlan.ForTokenRead replicaPlan, int initialDataRequestCount, Dispatcher.RequestTime requestTime) + AbstractReadExecutor(ReadCoordinator coordinator, ColumnFamilyStore cfs, ReadCommand command, ReplicaPlan.ForTokenRead replicaPlan, int initialDataRequestCount, Dispatcher.RequestTime requestTime) { + this.coordinator = coordinator; this.command = command; this.replicaPlan = ReplicaPlan.shared(replicaPlan); this.initialDataRequestCount = initialDataRequestCount; // the ReadRepair and DigestResolver both need to see our updated - this.readRepair = ReadRepair.create(command, this.replicaPlan, requestTime); - this.digestResolver = new DigestResolver<>(command, this.replicaPlan, requestTime); + this.readRepair = ReadRepair.create(coordinator, command, this.replicaPlan, requestTime); + this.digestResolver = new DigestResolver<>(coordinator, command, this.replicaPlan, requestTime); this.handler = new ReadCallback<>(digestResolver, command, this.replicaPlan, requestTime); this.cfs = cfs; this.traceState = Tracing.instance.get(); @@ -136,13 +138,14 @@ private void makeRequests(ReadCommand readCommand, Iterable replicas) { boolean hasLocalEndpoint = false; Message message = null; + readCommand = coordinator.maybeAllowOutOfRangeReads(readCommand, replicaPlan().consistencyLevel()); for (Replica replica: replicas) { assert replica.isFull() || readCommand.acceptsTransient(); InetAddressAndPort endpoint = replica.endpoint(); - if (replica.isSelf()) + if (replica.isSelf() && coordinator.localReadSupported()) { hasLocalEndpoint = true; continue; @@ -154,7 +157,7 @@ private void makeRequests(ReadCommand readCommand, Iterable replicas) if (null == message) message = readCommand.createMessage(false, requestTime).withEpoch(ClusterMetadata.current().epoch); - MessagingService.instance().sendWithCallback(message, endpoint, handler); + coordinator.sendReadCommand(message, endpoint, handler); } // We delay the local (potentially blocking) read till the end to avoid stalling remote requests. @@ -179,8 +182,11 @@ public void executeAsync() EndpointsForToken selected = replicaPlan().contacts(); EndpointsForToken fullDataRequests = selected.filter(Replica::isFull, initialDataRequestCount); makeFullDataRequests(fullDataRequests); - makeTransientDataRequests(selected.filterLazily(Replica::isTransient)); - makeDigestRequests(selected.filterLazily(r -> r.isFull() && !fullDataRequests.contains(r))); + EndpointsForToken transientRequests = selected.filter(Replica::isTransient); + makeTransientDataRequests(transientRequests); + EndpointsForToken digestRequests = selected.filter(r -> r.isFull() && !fullDataRequests.contains(r)); + makeDigestRequests(digestRequests); + coordinator.notifyOfInitialContacts(fullDataRequests, transientRequests, digestRequests); } /** @@ -189,6 +195,7 @@ public void executeAsync() public static AbstractReadExecutor getReadExecutor(ClusterMetadata metadata, SinglePartitionReadCommand command, ConsistencyLevel consistencyLevel, + ReadCoordinator coordinator, Dispatcher.RequestTime requestTime) throws UnavailableException { Keyspace keyspace = Keyspace.open(command.metadata().keyspace); @@ -197,28 +204,30 @@ public static AbstractReadExecutor getReadExecutor(ClusterMetadata metadata, ReplicaPlan.ForTokenRead replicaPlan = ReplicaPlans.forRead(metadata, keyspace, + command.metadata().id, command.partitionKey().getToken(), command.indexQueryPlan(), consistencyLevel, - retry); + retry, + coordinator); // Speculative retry is disabled *OR* // 11980: Disable speculative retry if using EACH_QUORUM in order to prevent miscounting DC responses if (retry.equals(NeverSpeculativeRetryPolicy.INSTANCE) || consistencyLevel == ConsistencyLevel.EACH_QUORUM) - return new NeverSpeculatingReadExecutor(cfs, command, replicaPlan, requestTime, false); + return new NeverSpeculatingReadExecutor(coordinator, cfs, command, replicaPlan, requestTime, false); + + if (retry.equals(AlwaysSpeculativeRetryPolicy.INSTANCE)) + return new AlwaysSpeculatingReadExecutor(coordinator, cfs, command, replicaPlan, requestTime); // There are simply no extra replicas to speculate. // Handle this separately so it can record failed attempts to speculate due to lack of replicas if (replicaPlan.contacts().size() == replicaPlan.readCandidates().size()) { boolean recordFailedSpeculation = consistencyLevel != ConsistencyLevel.ALL; - return new NeverSpeculatingReadExecutor(cfs, command, replicaPlan, requestTime, recordFailedSpeculation); + return new NeverSpeculatingReadExecutor(coordinator, cfs, command, replicaPlan, requestTime, recordFailedSpeculation); } - - if (retry.equals(AlwaysSpeculativeRetryPolicy.INSTANCE)) - return new AlwaysSpeculatingReadExecutor(cfs, command, replicaPlan, requestTime); else // PERCENTILE or CUSTOM. - return new SpeculatingReadExecutor(cfs, command, replicaPlan, requestTime); + return new SpeculatingReadExecutor(coordinator, cfs, command, replicaPlan, requestTime); } public boolean hasLocalRead() @@ -272,13 +281,14 @@ public static class NeverSpeculatingReadExecutor extends AbstractReadExecutor */ private final boolean logFailedSpeculation; - public NeverSpeculatingReadExecutor(ColumnFamilyStore cfs, + public NeverSpeculatingReadExecutor(ReadCoordinator coordinator, + ColumnFamilyStore cfs, ReadCommand command, ReplicaPlan.ForTokenRead replicaPlan, Dispatcher.RequestTime requestTime, boolean logFailedSpeculation) { - super(cfs, command, replicaPlan, 1, requestTime); + super(coordinator, cfs, command, replicaPlan, 1, requestTime); this.logFailedSpeculation = logFailedSpeculation; } @@ -295,7 +305,8 @@ static class SpeculatingReadExecutor extends AbstractReadExecutor { private volatile boolean speculated = false; - public SpeculatingReadExecutor(ColumnFamilyStore cfs, + public SpeculatingReadExecutor(ReadCoordinator coordinator, + ColumnFamilyStore cfs, ReadCommand command, ReplicaPlan.ForTokenRead replicaPlan, Dispatcher.RequestTime requestTime) @@ -303,7 +314,7 @@ public SpeculatingReadExecutor(ColumnFamilyStore cfs, // We're hitting additional targets for read repair (??). Since our "extra" replica is the least- // preferred by the snitch, we do an extra data read to start with against a replica more // likely to respond; better to let RR fail than the entire query. - super(cfs, command, replicaPlan, replicaPlan.readQuorum() < replicaPlan.contacts().size() ? 2 : 1, requestTime); + super(coordinator, cfs, command, replicaPlan, replicaPlan.readQuorum() < replicaPlan.contacts().size() ? 2 : 1, requestTime); } public void maybeTryAdditionalReplicas() @@ -366,14 +377,15 @@ void onReadTimeout() private static class AlwaysSpeculatingReadExecutor extends AbstractReadExecutor { - public AlwaysSpeculatingReadExecutor(ColumnFamilyStore cfs, + public AlwaysSpeculatingReadExecutor(ReadCoordinator coordinator, + ColumnFamilyStore cfs, ReadCommand command, ReplicaPlan.ForTokenRead replicaPlan, Dispatcher.RequestTime requestTime) { // presumably, we speculate an extra data request here in case it is our data request that fails to respond, // and there are no more nodes to consult - super(cfs, command, replicaPlan, replicaPlan.contacts().size() > 1 ? 2 : 1, requestTime); + super(coordinator, cfs, command, replicaPlan, replicaPlan.contacts().size() > 1 ? 2 : 1, requestTime); } public void maybeTryAdditionalReplicas() diff --git a/src/java/org/apache/cassandra/service/reads/DataResolver.java b/src/java/org/apache/cassandra/service/reads/DataResolver.java index 332a78570851..03eee1c2c522 100644 --- a/src/java/org/apache/cassandra/service/reads/DataResolver.java +++ b/src/java/org/apache/cassandra/service/reads/DataResolver.java @@ -23,7 +23,6 @@ import java.util.List; import java.util.function.Supplier; import java.util.function.UnaryOperator; - import javax.annotation.Nullable; import com.google.common.base.Joiner; @@ -57,22 +56,23 @@ import org.apache.cassandra.service.reads.repair.RepairedDataVerifier; import org.apache.cassandra.transport.Dispatcher; -import static com.google.common.collect.Iterables.*; +import static com.google.common.collect.Iterables.any; +import static com.google.common.collect.Iterables.transform; public class DataResolver, P extends ReplicaPlan.ForRead> extends ResponseResolver { private final boolean enforceStrictLiveness; - private final ReadRepair readRepair; + public final ReadRepair readRepair; private final boolean trackRepairedStatus; - public DataResolver(ReadCommand command, Supplier replicaPlan, ReadRepair readRepair, Dispatcher.RequestTime requestTime) + public DataResolver(ReadCoordinator coordinator, ReadCommand command, Supplier replicaPlan, ReadRepair readRepair, Dispatcher.RequestTime requestTime) { - this(command, replicaPlan, readRepair, requestTime, false); + this(coordinator, command, replicaPlan, readRepair, requestTime, false); } - public DataResolver(ReadCommand command, Supplier replicaPlan, ReadRepair readRepair, Dispatcher.RequestTime requestTime, boolean trackRepairedStatus) + public DataResolver(ReadCoordinator coordinator, ReadCommand command, Supplier replicaPlan, ReadRepair readRepair, Dispatcher.RequestTime requestTime, boolean trackRepairedStatus) { - super(command, replicaPlan, requestTime); + super(coordinator, command, replicaPlan, requestTime); this.enforceStrictLiveness = command.metadata().enforceStrictLiveness(); this.readRepair = readRepair; this.trackRepairedStatus = trackRepairedStatus; @@ -209,6 +209,7 @@ private UnfilteredPartitionIterator shortReadProtectedResponse(int i, ResolveCon originalResponse, command, context.mergedResultCounter, + coordinator, requestTime, enforceStrictLiveness) : originalResponse; @@ -249,7 +250,7 @@ private PartitionIterator resolveWithReplicaFilteringProtection(E replicas, Repa // before it counts against the limit. If this "pre-count" filter causes a short read, additional rows // will be fetched from the first-phase iterator. - ReplicaFilteringProtection rfp = new ReplicaFilteringProtection<>(replicaPlan().keyspace(), + ReplicaFilteringProtection rfp = new ReplicaFilteringProtection<>(coordinator, replicaPlan().keyspace(), command, replicaPlan().consistencyLevel(), requestTime, @@ -257,6 +258,7 @@ private PartitionIterator resolveWithReplicaFilteringProtection(E replicas, Repa DatabaseDescriptor.getCachedReplicaRowsWarnThreshold(), DatabaseDescriptor.getCachedReplicaRowsFailThreshold()); + // We need separate contexts, as each context has his own counter ResolveContext firstPhaseContext = new ResolveContext(replicas, false); PartitionIterator firstPhasePartitions = resolveInternal(firstPhaseContext, rfp.mergeController(), diff --git a/src/java/org/apache/cassandra/service/reads/DigestResolver.java b/src/java/org/apache/cassandra/service/reads/DigestResolver.java index cc248422c06c..59c3df383d2e 100644 --- a/src/java/org/apache/cassandra/service/reads/DigestResolver.java +++ b/src/java/org/apache/cassandra/service/reads/DigestResolver.java @@ -30,8 +30,8 @@ import org.apache.cassandra.db.partitions.PartitionIterator; import org.apache.cassandra.db.partitions.UnfilteredPartitionIterators; import org.apache.cassandra.locator.Endpoints; -import org.apache.cassandra.locator.Replica; import org.apache.cassandra.locator.InetAddressAndPort; +import org.apache.cassandra.locator.Replica; import org.apache.cassandra.locator.ReplicaPlan; import org.apache.cassandra.net.Message; import org.apache.cassandra.service.reads.repair.NoopReadRepair; @@ -45,9 +45,9 @@ public class DigestResolver, P extends ReplicaPlan.ForRea { private volatile Message dataResponse; - public DigestResolver(ReadCommand command, ReplicaPlan.Shared replicaPlan, Dispatcher.RequestTime requestTime) + public DigestResolver(ReadCoordinator coordinator, ReadCommand command, ReplicaPlan.Shared replicaPlan, Dispatcher.RequestTime requestTime) { - super(command, replicaPlan, requestTime); + super(coordinator, command, replicaPlan, requestTime); Preconditions.checkArgument(command instanceof SinglePartitionReadCommand, "DigestResolver can only be used with SinglePartitionReadCommand commands"); } @@ -87,7 +87,7 @@ public PartitionIterator getData() // This path can be triggered only if we've got responses from full replicas and they match, but // transient replica response still contains data, which needs to be reconciled. DataResolver dataResolver - = new DataResolver<>(command, replicaPlan, NoopReadRepair.instance, requestTime); + = new DataResolver<>(coordinator, command, replicaPlan, NoopReadRepair.instance, requestTime); dataResolver.preprocess(dataResponse); // Reconcile with transient replicas diff --git a/src/java/org/apache/cassandra/service/reads/ReadCallback.java b/src/java/org/apache/cassandra/service/reads/ReadCallback.java index ca25d1a3fd85..3b36332b2f51 100644 --- a/src/java/org/apache/cassandra/service/reads/ReadCallback.java +++ b/src/java/org/apache/cassandra/service/reads/ReadCallback.java @@ -24,6 +24,7 @@ import java.util.concurrent.atomic.AtomicIntegerFieldUpdater; import java.util.concurrent.atomic.AtomicReferenceFieldUpdater; +import com.google.common.collect.ImmutableMap; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -32,9 +33,12 @@ import org.apache.cassandra.db.PartitionRangeReadCommand; import org.apache.cassandra.db.ReadCommand; import org.apache.cassandra.db.ReadResponse; +import org.apache.cassandra.exceptions.CoordinatorBehindException; import org.apache.cassandra.exceptions.ReadFailureException; import org.apache.cassandra.exceptions.ReadTimeoutException; +import org.apache.cassandra.exceptions.RequestFailure; import org.apache.cassandra.exceptions.RequestFailureReason; +import org.apache.cassandra.exceptions.RetryOnDifferentSystemException; import org.apache.cassandra.locator.Endpoints; import org.apache.cassandra.locator.InetAddressAndPort; import org.apache.cassandra.locator.ReplicaPlan; @@ -53,6 +57,8 @@ import static java.util.concurrent.TimeUnit.MILLISECONDS; import static java.util.concurrent.atomic.AtomicIntegerFieldUpdater.newUpdater; +import static org.apache.cassandra.exceptions.RequestFailureReason.COORDINATOR_BEHIND; +import static org.apache.cassandra.exceptions.RequestFailureReason.RETRY_ON_DIFFERENT_TRANSACTION_SYSTEM; import static org.apache.cassandra.tracing.Tracing.isTracing; import static org.apache.cassandra.utils.concurrent.Condition.newOneTimeCondition; @@ -172,6 +178,34 @@ else if (logger.isDebugEnabled()) if (snapshot != null) snapshot.maybeAbort(command, replicaPlan().consistencyLevel(), received, replicaPlan().readQuorum(), resolver.isDataPresent(), failureReasonByEndpoint); + // failures keeps incrementing, and this.failureReasonByEndpoint keeps getting new entries after signaling. + // Simpler to reason about what happened by copying this.failureReasonByEndpoint and then inferring + // failures from it + final Map failureReasonByEndpoint = ImmutableMap.copyOf(this.failureReasonByEndpoint); + int transactionRetryErrors = 0; + int coordinatorBehindErrors = 0; + for (RequestFailureReason reason : failureReasonByEndpoint.values()) + { + if (reason == RETRY_ON_DIFFERENT_TRANSACTION_SYSTEM) + transactionRetryErrors++; + if (reason == COORDINATOR_BEHIND) + coordinatorBehindErrors++; + } + int totalRetriableFailures = transactionRetryErrors + coordinatorBehindErrors; + + // TODO (nicetohave): This could be smarter and check if retrying would succeed instead of pessimistically + // failing unless all errors are retriable + if (!timedout && totalRetriableFailures > 0 && totalRetriableFailures == failureReasonByEndpoint.size()) + { + // Doesn't matter which we throw really but for clarity/metrics be specific + // Retrying on the correct system might make this write succeed + if (transactionRetryErrors > 0) + throw new RetryOnDifferentSystemException(); + if (coordinatorBehindErrors > 0) + throw new CoordinatorBehindException("Read request failed due to coordinator behind"); + } + + // Same as for writes, see AbstractWriteResponseHandler throw !timedout ? new ReadFailureException(replicaPlan().consistencyLevel(), received, replicaPlan().readQuorum(), resolver.isDataPresent(), failureReasonByEndpoint) @@ -186,7 +220,7 @@ public void onResponse(Message message) InetAddressAndPort from = message.from(); if (WarningContext.isSupported(params.keySet())) { - RequestFailureReason reason = getWarningContext().updateCounters(params, from); + RequestFailure reason = getWarningContext().updateCounters(params, from); replicaPlan().collectFailure(message.from(), reason); if (reason != null) { @@ -236,11 +270,11 @@ public boolean trackLatencyForSnitch() } @Override - public void onFailure(InetAddressAndPort from, RequestFailureReason failureReason) + public void onFailure(InetAddressAndPort from, RequestFailure failure) { assertWaitingFor(from); - failureReasonByEndpoint.put(from, failureReason); + failureReasonByEndpoint.put(from, failure.reason); if (replicaPlan().readQuorum() + failuresUpdater.incrementAndGet(this) > replicaPlan().contacts().size()) condition.signalAll(); diff --git a/src/java/org/apache/cassandra/service/reads/ReadCoordinator.java b/src/java/org/apache/cassandra/service/reads/ReadCoordinator.java new file mode 100644 index 000000000000..ba777418d59c --- /dev/null +++ b/src/java/org/apache/cassandra/service/reads/ReadCoordinator.java @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.reads; + +import org.apache.cassandra.db.ConsistencyLevel; +import org.apache.cassandra.db.Mutation; +import org.apache.cassandra.db.ReadCommand; +import org.apache.cassandra.db.ReadCommand.PotentialTxnConflicts; +import org.apache.cassandra.db.ReadResponse; +import org.apache.cassandra.dht.Token; +import org.apache.cassandra.locator.EndpointsForToken; +import org.apache.cassandra.locator.InetAddressAndPort; +import org.apache.cassandra.locator.ReplicaLayout; +import org.apache.cassandra.net.Message; +import org.apache.cassandra.net.MessagingService; +import org.apache.cassandra.net.RequestCallback; +import org.apache.cassandra.schema.KeyspaceMetadata; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.tcm.ClusterMetadata; + +public interface ReadCoordinator +{ + ReadCoordinator DEFAULT = new ReadCoordinator() + { + public boolean localReadSupported() + { + return true; + } + + public EndpointsForToken forNonLocalStrategyTokenRead(ClusterMetadata metadata, KeyspaceMetadata keyspace, TableId tableId, Token token) + { + return ReplicaLayout.forNonLocalStrategyTokenRead(metadata, keyspace, token); + } + + public void sendReadCommand(Message message, InetAddressAndPort to, RequestCallback callback) + { + MessagingService.instance().sendWithCallback(message, to, callback); + } + + public void sendReadRepairMutation(Message message, InetAddressAndPort to, RequestCallback callback) + { + MessagingService.instance().sendWithCallback(message, to, callback); + } + + public boolean isEventuallyConsistent() + { + return true; + } + }; + + boolean localReadSupported(); + EndpointsForToken forNonLocalStrategyTokenRead(ClusterMetadata metadata, KeyspaceMetadata keyspace, TableId tableId, Token token); + default ReadCommand maybeAllowOutOfRangeReads(ReadCommand command, ConsistencyLevel cl) + { + return command; + } + void sendReadCommand(Message message, InetAddressAndPort to, RequestCallback callback); + default void notifyOfInitialContacts(EndpointsForToken fullDataRequests, EndpointsForToken transientRequests, EndpointsForToken digestRequests) {} + void sendReadRepairMutation(Message message, InetAddressAndPort to, RequestCallback callback); + default PotentialTxnConflicts potentialTxnConflicts() + { + return isEventuallyConsistent() ? PotentialTxnConflicts.DISALLOW : PotentialTxnConflicts.ALLOW; + } + boolean isEventuallyConsistent(); +} diff --git a/src/java/org/apache/cassandra/service/reads/ReplicaFilteringProtection.java b/src/java/org/apache/cassandra/service/reads/ReplicaFilteringProtection.java index 056f3b55df32..6b2913320143 100644 --- a/src/java/org/apache/cassandra/service/reads/ReplicaFilteringProtection.java +++ b/src/java/org/apache/cassandra/service/reads/ReplicaFilteringProtection.java @@ -23,17 +23,18 @@ import java.util.Arrays; import java.util.List; import java.util.NavigableSet; -import java.util.concurrent.TimeUnit; import java.util.Queue; +import java.util.concurrent.TimeUnit; import java.util.function.Function; +import javax.annotation.concurrent.NotThreadSafe; + import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.cassandra.concurrent.Stage; import org.apache.cassandra.db.Clustering; import org.apache.cassandra.db.ColumnFamilyStore; -import org.apache.cassandra.db.Columns; import org.apache.cassandra.db.ConsistencyLevel; import org.apache.cassandra.db.DecoratedKey; import org.apache.cassandra.db.DeletionTime; @@ -46,12 +47,12 @@ import org.apache.cassandra.db.filter.DataLimits; import org.apache.cassandra.db.filter.RowFilter; import org.apache.cassandra.db.partitions.PartitionIterator; -import org.apache.cassandra.db.partitions.PartitionIterators; import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator; import org.apache.cassandra.db.partitions.UnfilteredPartitionIterators; import org.apache.cassandra.db.rows.EncodingStats; import org.apache.cassandra.db.rows.RangeTombstoneMarker; import org.apache.cassandra.db.rows.Row; +import org.apache.cassandra.db.rows.RowIterator; import org.apache.cassandra.db.rows.Rows; import org.apache.cassandra.db.rows.Unfiltered; import org.apache.cassandra.db.rows.UnfilteredRowIterator; @@ -71,6 +72,7 @@ import org.apache.cassandra.service.ClientWarn; import org.apache.cassandra.service.StorageProxy; import org.apache.cassandra.service.reads.repair.NoopReadRepair; +import org.apache.cassandra.service.reads.repair.PartitionIteratorMergeListener; import org.apache.cassandra.tracing.Tracing; import org.apache.cassandra.transport.Dispatcher; import org.apache.cassandra.utils.NoSpamLogger; @@ -90,6 +92,7 @@ * @see CASSANDRA-15907 * @see CASSANDRA-19018 */ +@NotThreadSafe public class ReplicaFilteringProtection> { private static final Logger logger = LoggerFactory.getLogger(ReplicaFilteringProtection.class); @@ -98,6 +101,7 @@ public class ReplicaFilteringProtection> private static final Function NULL_TO_NO_STATS = rowIterator -> rowIterator == null ? EncodingStats.NO_STATS : rowIterator.stats(); + private final ReadCoordinator coordinator; private final Keyspace keyspace; private final ReadCommand command; private final ConsistencyLevel consistency; @@ -105,6 +109,8 @@ public class ReplicaFilteringProtection> private final E sources; private final TableMetrics tableMetrics; + private final QueryMergeListener mergeListener; + private final int cachedRowsWarnThreshold; private final int cachedRowsFailThreshold; @@ -119,7 +125,14 @@ public class ReplicaFilteringProtection> */ private final List> originalPartitions; - ReplicaFilteringProtection(Keyspace keyspace, + /** Whether to consume entire partitions or not in {@link #queryProtectedPartitions}. */ + private final boolean consumeEntirePartitions; + + /** Tracks the current partitions when not consuming entire partitions in {@link #queryProtectedPartitions}. */ + private RowIterator currentRowIterator = null; + + ReplicaFilteringProtection(ReadCoordinator coordinator, + Keyspace keyspace, ReadCommand command, ConsistencyLevel consistency, Dispatcher.RequestTime requestTime, @@ -127,8 +140,10 @@ public class ReplicaFilteringProtection> int cachedRowsWarnThreshold, int cachedRowsFailThreshold) { + this.coordinator = coordinator; this.keyspace = keyspace; this.command = command; + this.consumeEntirePartitions = command.limits().isUnlimited() || !command.isLimitedToOnePartition() || command.rowFilter().hasStaticExpression(); this.consistency = consistency; this.requestTime = requestTime; this.sources = sources; @@ -143,17 +158,19 @@ public class ReplicaFilteringProtection> this.cachedRowsWarnThreshold = cachedRowsWarnThreshold; this.cachedRowsFailThreshold = cachedRowsFailThreshold; + + mergeListener = new QueryMergeListener(); } private UnfilteredPartitionIterator executeReadCommand(ReadCommand cmd, Replica source, ReplicaPlan.Shared replicaPlan) { @SuppressWarnings("unchecked") DataResolver resolver = - new DataResolver<>(cmd, replicaPlan, (NoopReadRepair) NoopReadRepair.instance, requestTime); + new DataResolver<>(coordinator, cmd, replicaPlan, (NoopReadRepair) NoopReadRepair.instance, requestTime); ReadCallback handler = new ReadCallback<>(resolver, cmd, replicaPlan, requestTime); - if (source.isSelf()) + if (source.isSelf() && coordinator.localReadSupported()) { Stage.READ.maybeExecuteImmediately(new StorageProxy.LocalReadRunnable(cmd, handler, requestTime)); } @@ -161,6 +178,7 @@ private UnfilteredPartitionIterator executeReadCommand(ReadCommand cmd, Replica { if (source.isTransient()) cmd = cmd.copyAsTransientQuery(source); + cmd = coordinator.maybeAllowOutOfRangeReads(cmd, consistency); MessagingService.instance().sendWithCallback(cmd.createMessage(false, requestTime), source.endpoint(), handler); } @@ -170,109 +188,136 @@ private UnfilteredPartitionIterator executeReadCommand(ReadCommand cmd, Replica return resolver.getMessages().get(0).payload.makeIterator(command); } - /** - * This listener tracks both the accepted data and the primary keys of the rows that may be incomplete. - * That way, once the query results are merged using this listener, subsequent calls to - * {@link #queryProtectedPartitions(PartitionIterator, int)} will use the collected data to return a copy of the - * data originally collected from the specified replica, completed with the potentially outdated rows. - */ - UnfilteredPartitionIterators.MergeListener mergeController() + private class PartitionMergeListerner implements UnfilteredRowIterators.MergeListener { - return new UnfilteredPartitionIterators.MergeListener() + final DecoratedKey key; + final List builders = new ArrayList<>(sources.size()); + final RegularAndStaticColumns columns; + final EncodingStats stats; + final boolean[] silentRowAt; + final boolean[] silentColumnAt; + + PartitionMergeListerner(DecoratedKey partitionKey, List versions) { - @Override - public void close() - { - // If we hit the failure threshold before consuming a single partition, record the current rows cached. - tableMetrics.rfpRowsCachedPerQuery.update(Math.max(currentRowsCached, maxRowsCached)); - } + key = partitionKey; + columns = PartitionIteratorMergeListener.columns(versions); + stats = EncodingStats.merge(versions, NULL_TO_NO_STATS); - @Override - public UnfilteredRowIterators.MergeListener getRowMergeListener(DecoratedKey partitionKey, List versions) - { - List builders = new ArrayList<>(sources.size()); - RegularAndStaticColumns columns = columns(versions); - EncodingStats stats = EncodingStats.merge(versions, NULL_TO_NO_STATS); + for (int i = 0; i < sources.size(); i++) + builders.add(i, new PartitionBuilder(partitionKey, sources.get(i), columns, stats)); + + silentRowAt = new boolean[builders.size()]; + silentColumnAt = new boolean[builders.size()]; + } + + @Override + public void onMergedPartitionLevelDeletion(DeletionTime mergedDeletion, DeletionTime[] versions) + { + // cache the deletion time versions to be able to regenerate the original row iterator + for (int i = 0; i < versions.length; i++) + builders.get(i).setDeletionTime(versions[i]); + } - for (int i = 0; i < sources.size(); i++) - builders.add(i, new PartitionBuilder(partitionKey, sources.get(i), columns, stats)); + @Override + public void onMergedRows(Row merged, Row[] versions) + { + // Cache the row versions to be able to regenerate the original row iterator: + for (int i = 0; i < versions.length; i++) + builders.get(i).addRow(versions[i]); + + // If all versions are empty, there's no divergence to resolve: + if (merged.isEmpty()) + return; + + Arrays.fill(silentRowAt, false); - boolean[] silentRowAt = new boolean[builders.size()]; - boolean[] silentColumnAt = new boolean[builders.size()]; + // Mark replicas silent if they provide no data for the row: + for (int i = 0; i < versions.length; i++) + if (versions[i] == null || (merged.isStatic() && versions[i].isEmpty())) + silentRowAt[i] = true; - return new UnfilteredRowIterators.MergeListener() + // Even if there are no completely missing rows, replicas may still be silent about individual + // columns, so we need to check for divergence at the column level: + for (ColumnMetadata column : merged.isStatic() ? columns.statics : columns.regulars) + { + Arrays.fill(silentColumnAt, false); + boolean allSilent = true; + + for (int i = 0; i < versions.length; i++) { - @Override - public void onMergedPartitionLevelDeletion(DeletionTime mergedDeletion, DeletionTime[] versions) - { - // cache the deletion time versions to be able to regenerate the original row iterator - for (int i = 0; i < versions.length; i++) - builders.get(i).setDeletionTime(versions[i]); - } + // If the version at this replica is null, we've already marked it as silent: + if (versions[i] != null && versions[i].getColumnData(column) == null) + silentColumnAt[i] = true; + else + allSilent = false; + } - @Override - public void onMergedRows(Row merged, Row[] versions) - { - // Cache the row versions to be able to regenerate the original row iterator: - for (int i = 0; i < versions.length; i++) - builders.get(i).addRow(versions[i]); + for (int i = 0; i < versions.length; i++) + // Mark the replica silent if it is silent about this column and there is actually + // divergence between the replicas. (i.e. If all replicas are silent for this + // column, there is nothing to fetch to complete the row anyway.) + silentRowAt[i] |= silentColumnAt[i] && !allSilent; + } - // If all versions are empty, there's no divergence to resolve: - if (merged.isEmpty()) - return; + for (int i = 0; i < silentRowAt.length; i++) + if (silentRowAt[i]) + builders.get(i).addToFetch(merged); + } - Arrays.fill(silentRowAt, false); + @Override + public void onMergedRangeTombstoneMarkers(RangeTombstoneMarker merged, RangeTombstoneMarker[] versions) + { + // cache the marker versions to be able to regenerate the original row iterator + for (int i = 0; i < versions.length; i++) + builders.get(i).addRangeTombstoneMarker(versions[i]); + } - // Mark replicas silent if they provide no data for the row: - for (int i = 0; i < versions.length; i++) - if (versions[i] == null || (merged.isStatic() && versions[i].isEmpty())) - silentRowAt[i] = true; + @Override + public void close() {} - // Even if there are no completely missing rows, replicas may still be silent about individual - // columns, so we need to check for divergence at the column level: - for (ColumnMetadata column : merged.isStatic() ? columns.statics : columns.regulars) - { - Arrays.fill(silentColumnAt, false); - boolean allSilent = true; + public void populate() + { + for (int i = 0; i < sources.size(); i++) + originalPartitions.get(i).add(builders.get(i)); + } + } - for (int i = 0; i < versions.length; i++) - { - // If the version at this replica is null, we've already marked it as silent: - if (versions[i] != null && versions[i].getColumnData(column) == null) - silentColumnAt[i] = true; - else - allSilent = false; - } + private class QueryMergeListener implements UnfilteredPartitionIterators.MergeListener + { + private PartitionMergeListerner currentListener; - for (int i = 0; i < versions.length; i++) - // Mark the replica silent if it is silent about this column and there is actually - // divergence between the replicas. (i.e. If all replicas are silent for this - // column, there is nothing to fetch to complete the row anyway.) - silentRowAt[i] |= silentColumnAt[i] && !allSilent; - } + @Override + public void close() + { + // If we hit the failure threshold before consuming a single partition, record the current rows cached. + tableMetrics.rfpRowsCachedPerQuery.update(Math.max(currentRowsCached, maxRowsCached)); + } - for (int i = 0; i < silentRowAt.length; i++) - if (silentRowAt[i]) - builders.get(i).addToFetch(merged); - } + @Override + public UnfilteredRowIterators.MergeListener getRowMergeListener(DecoratedKey partitionKey, List versions) + { + if (currentListener == null || !currentListener.key.equals(partitionKey)) + currentListener = new PartitionMergeListerner(partitionKey, versions); - @Override - public void onMergedRangeTombstoneMarkers(RangeTombstoneMarker merged, RangeTombstoneMarker[] versions) - { - // cache the marker versions to be able to regenerate the original row iterator - for (int i = 0; i < versions.length; i++) - builders.get(i).addRangeTombstoneMarker(versions[i]); - } + return currentListener; + } - @Override - public void close() - { - for (int i = 0; i < sources.size(); i++) - originalPartitions.get(i).add(builders.get(i)); - } - }; - } - }; + public void populate() + { + if (currentListener != null) + currentListener.populate(); + } + } + + /** + * This listener tracks both the accepted data and the primary keys of the rows that may be incomplete. + * That way, once the query results are merged using this listener, subsequent calls to + * {@link #queryProtectedPartitions(PartitionIterator, int)} will use the collected data to return a copy of the + * data originally collected from the specified replica, completed with the potentially outdated rows. + */ + UnfilteredPartitionIterators.MergeListener mergeController() + { + return mergeListener; } private void incrementCachedRows() @@ -309,22 +354,6 @@ private void releaseCachedRows(int count) currentRowsCached -= count; } - private static RegularAndStaticColumns columns(List versions) - { - Columns statics = Columns.NONE; - Columns regulars = Columns.NONE; - for (UnfilteredRowIterator iter : versions) - { - if (iter == null) - continue; - - RegularAndStaticColumns cols = iter.columns(); - statics = statics.mergeTo(cols.statics); - regulars = regulars.mergeTo(cols.regulars); - } - return new RegularAndStaticColumns(statics, regulars); - } - /** * Returns the protected results for the specified replica. These are generated fetching the extra rows and merging * them with the cached original filtered results for that replica. @@ -346,16 +375,66 @@ public TableMetadata metadata() } @Override - public void close() { } + public void close() + { + if (currentRowIterator != null) + currentRowIterator.close(); + } @Override public boolean hasNext() { // If there are no cached partition builders for this source, advance the first phase iterator, which - // will force the RFP merge listener to load at least the next protected partition. + // will force the RFP merge listener to load rows from the next protected partition. if (partitions.isEmpty()) { - PartitionIterators.consumeNext(merged); + if (consumeEntirePartitions) + { + if (merged.hasNext()) + { + try (RowIterator partition = merged.next()) + { + while (partition.hasNext()) + partition.next(); + + mergeListener.populate(); + } + } + } + else + { + if (currentRowIterator == null || !currentRowIterator.hasNext()) + { + // If there is an iterator, it's done, so just close it. + if (currentRowIterator != null) + { + currentRowIterator.close(); + currentRowIterator = null; + } + + // Take the next filtered partition from the merged partition iterator. + if (merged.hasNext()) + currentRowIterator = merged.next(); + } + + if (currentRowIterator != null) + { + int i = 0; + + // Consume LIMIT filtered rows from the current partition, unless there are fewer results. + // The underlying iterator is short-read protected, and limiting the number of rows we + // consume avoids needless SRP reads when there are many more than LIMIT results. + while (i < command.limits().count() && currentRowIterator.hasNext()) + { + currentRowIterator.next(); + i++; + } + + // If we actually consumed a row, checkpoint to populate the builders. + if (i > 0) + mergeListener.populate(); + } + } } return !partitions.isEmpty(); @@ -487,6 +566,8 @@ public Row staticRow() public void close() { releaseCachedRows(partitionRowsCached); + toFetch = null; + // TODO: the counters might not be accurate for the static row at this point? } @Override diff --git a/src/java/org/apache/cassandra/service/reads/ResponseResolver.java b/src/java/org/apache/cassandra/service/reads/ResponseResolver.java index 5dd81eb7bcc1..61956322d884 100644 --- a/src/java/org/apache/cassandra/service/reads/ResponseResolver.java +++ b/src/java/org/apache/cassandra/service/reads/ResponseResolver.java @@ -34,6 +34,7 @@ public abstract class ResponseResolver, P extends Replica { protected static final Logger logger = LoggerFactory.getLogger(ResponseResolver.class); + protected final ReadCoordinator coordinator; protected final ReadCommand command; protected final Supplier replicaPlan; @@ -41,8 +42,9 @@ public abstract class ResponseResolver, P extends Replica protected final Accumulator> responses; protected final Dispatcher.RequestTime requestTime; - public ResponseResolver(ReadCommand command, Supplier replicaPlan, Dispatcher.RequestTime requestTime) + public ResponseResolver(ReadCoordinator coordinator, ReadCommand command, Supplier replicaPlan, Dispatcher.RequestTime requestTime) { + this.coordinator = coordinator; this.command = command; this.replicaPlan = replicaPlan; this.responses = new Accumulator<>(replicaPlan.get().readCandidates().size()); diff --git a/src/java/org/apache/cassandra/service/reads/ShortReadPartitionsProtection.java b/src/java/org/apache/cassandra/service/reads/ShortReadPartitionsProtection.java index e9870f1f1d7b..d1562b1b4de9 100644 --- a/src/java/org/apache/cassandra/service/reads/ShortReadPartitionsProtection.java +++ b/src/java/org/apache/cassandra/service/reads/ShortReadPartitionsProtection.java @@ -18,9 +18,6 @@ package org.apache.cassandra.service.reads; -import org.apache.cassandra.locator.Endpoints; -import org.apache.cassandra.locator.ReplicaPlan; -import org.apache.cassandra.locator.ReplicaPlans; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -41,16 +38,20 @@ import org.apache.cassandra.dht.AbstractBounds; import org.apache.cassandra.dht.ExcludingBounds; import org.apache.cassandra.dht.Range; +import org.apache.cassandra.locator.Endpoints; import org.apache.cassandra.locator.Replica; -import org.apache.cassandra.net.MessagingService; -import org.apache.cassandra.service.reads.repair.NoopReadRepair; +import org.apache.cassandra.locator.ReplicaPlan; +import org.apache.cassandra.locator.ReplicaPlans; import org.apache.cassandra.service.StorageProxy; +import org.apache.cassandra.service.reads.repair.NoopReadRepair; import org.apache.cassandra.tracing.Tracing; import org.apache.cassandra.transport.Dispatcher; public class ShortReadPartitionsProtection extends Transformation implements MorePartitions { private static final Logger logger = LoggerFactory.getLogger(ShortReadPartitionsProtection.class); + + private final ReadCoordinator coordinator; private final ReadCommand command; private final Replica source; @@ -65,13 +66,16 @@ public class ShortReadPartitionsProtection extends Transformation, P extends ReplicaPlan.ForRead> UnfilteredPartitionIterator executeReadCommand(ReadCommand cmd, ReplicaPlan.Shared replicaPlan) { - DataResolver resolver = new DataResolver<>(cmd, replicaPlan, (NoopReadRepair)NoopReadRepair.instance, requestTime); + cmd = coordinator.maybeAllowOutOfRangeReads(cmd, replicaPlan.get().consistencyLevel()); + DataResolver resolver = new DataResolver<>(coordinator, cmd, replicaPlan, (NoopReadRepair)NoopReadRepair.instance, requestTime); ReadCallback handler = new ReadCallback<>(resolver, cmd, replicaPlan, requestTime); - if (source.isSelf()) + if (source.isSelf() && coordinator.localReadSupported()) { Stage.READ.maybeExecuteImmediately(new StorageProxy.LocalReadRunnable(cmd, handler, requestTime)); } @@ -189,7 +194,7 @@ UnfilteredPartitionIterator executeReadCommand(ReadCommand cmd, ReplicaPlan.Shar { if (source.isTransient()) cmd = cmd.copyAsTransientQuery(source); - MessagingService.instance().sendWithCallback(cmd.createMessage(false, requestTime), source.endpoint(), handler); + coordinator.sendReadCommand(cmd.createMessage(false, requestTime), source.endpoint(), handler); } // We don't call handler.get() because we want to preserve tombstones since we're still in the middle of merging node results. diff --git a/src/java/org/apache/cassandra/service/reads/ShortReadProtection.java b/src/java/org/apache/cassandra/service/reads/ShortReadProtection.java index 1eca190a7343..e289d276b3e7 100644 --- a/src/java/org/apache/cassandra/service/reads/ShortReadProtection.java +++ b/src/java/org/apache/cassandra/service/reads/ShortReadProtection.java @@ -44,6 +44,7 @@ public static UnfilteredPartitionIterator extend(Replica source, UnfilteredPartitionIterator partitions, ReadCommand command, DataLimits.Counter mergedResultCounter, + ReadCoordinator coordinator, Dispatcher.RequestTime requestTime, boolean enforceStrictLiveness) { @@ -52,7 +53,7 @@ public static UnfilteredPartitionIterator extend(Replica source, command.selectsFullPartition(), enforceStrictLiveness).onlyCount(); - ShortReadPartitionsProtection protection = new ShortReadPartitionsProtection(command, + ShortReadPartitionsProtection protection = new ShortReadPartitionsProtection(coordinator, command, source, preFetchCallback, singleResultCounter, diff --git a/src/java/org/apache/cassandra/service/reads/range/AccordRangeResponse.java b/src/java/org/apache/cassandra/service/reads/range/AccordRangeResponse.java new file mode 100644 index 000000000000..3ec8e6b0088e --- /dev/null +++ b/src/java/org/apache/cassandra/service/reads/range/AccordRangeResponse.java @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.reads.range; + +import java.util.function.IntPredicate; + +import org.apache.cassandra.db.partitions.PartitionIterator; +import org.apache.cassandra.db.rows.RowIterator; +import org.apache.cassandra.exceptions.RetryOnDifferentSystemException; +import org.apache.cassandra.service.StorageProxy; +import org.apache.cassandra.service.StorageProxy.ConsensusAttemptResult; +import org.apache.cassandra.service.accord.IAccordService.IAccordResult; +import org.apache.cassandra.service.accord.txn.TxnResult; +import org.apache.cassandra.utils.AbstractIterator; + +public class AccordRangeResponse extends AbstractIterator implements PartitionIterator +{ + private final IAccordResult accordResult; + // Range queries don't support reverse, but dutifully threading it through anyways + private final boolean reversed; + private PartitionIterator result; + + public AccordRangeResponse(IAccordResult accordResult, boolean reversed) + { + this.accordResult = accordResult; + this.reversed = reversed; + } + + private void waitForResponse() + { + if (result != null) + return; + IntPredicate alwaysTrue = ignored -> true; + IntPredicate alwaysFalse = ignored -> false; + // TODO (required): Handle retry on different system + ConsensusAttemptResult consensusAttemptResult = StorageProxy.getConsensusAttemptResultFromAsyncTxnResult(accordResult, 1, reversed ? alwaysTrue : alwaysFalse); + if (consensusAttemptResult.shouldRetryOnNewConsensusProtocol) + throw new RetryOnDifferentSystemException(); + result = consensusAttemptResult.serialReadResult; + } + + @Override + protected RowIterator computeNext() + { + waitForResponse(); + return result.hasNext() ? result.next() : endOfData(); + } + + @Override + public void close() + { + // It's an in-memory iterator so no need to close whatever might end up in TxnResult + } +} diff --git a/src/java/org/apache/cassandra/service/reads/range/IRangeResponse.java b/src/java/org/apache/cassandra/service/reads/range/IRangeResponse.java new file mode 100644 index 000000000000..e3a869866af2 --- /dev/null +++ b/src/java/org/apache/cassandra/service/reads/range/IRangeResponse.java @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.reads.range; + +import org.apache.cassandra.db.partitions.PartitionIterator; +import org.apache.cassandra.locator.EndpointsForRange; +import org.apache.cassandra.locator.ReplicaPlan.ForRangeRead; +import org.apache.cassandra.service.reads.repair.NoopReadRepair; +import org.apache.cassandra.service.reads.repair.ReadRepair; + +public interface IRangeResponse extends PartitionIterator +{ + default ReadRepair getReadRepair() + { + return NoopReadRepair.instance; + } +} diff --git a/src/java/org/apache/cassandra/service/reads/range/RangeCommandIterator.java b/src/java/org/apache/cassandra/service/reads/range/RangeCommandIterator.java index eb55a280c920..1d0c01e674ca 100644 --- a/src/java/org/apache/cassandra/service/reads/range/RangeCommandIterator.java +++ b/src/java/org/apache/cassandra/service/reads/range/RangeCommandIterator.java @@ -21,7 +21,9 @@ import java.util.ArrayList; import java.util.Collections; import java.util.List; +import java.util.NoSuchElementException; import java.util.concurrent.TimeUnit; +import java.util.function.Function; import com.google.common.annotations.VisibleForTesting; import org.slf4j.Logger; @@ -29,31 +31,44 @@ import org.apache.cassandra.concurrent.Stage; import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.db.ConsistencyLevel; import org.apache.cassandra.db.Keyspace; import org.apache.cassandra.db.PartitionRangeReadCommand; import org.apache.cassandra.db.ReadCommand; import org.apache.cassandra.db.filter.DataLimits; import org.apache.cassandra.db.partitions.PartitionIterator; +import org.apache.cassandra.db.partitions.PartitionIterators; import org.apache.cassandra.db.rows.RowIterator; +import org.apache.cassandra.exceptions.CoordinatorBehindException; import org.apache.cassandra.exceptions.ReadAbortException; import org.apache.cassandra.exceptions.ReadFailureException; import org.apache.cassandra.exceptions.ReadTimeoutException; +import org.apache.cassandra.exceptions.RetryOnDifferentSystemException; import org.apache.cassandra.exceptions.UnavailableException; import org.apache.cassandra.locator.EndpointsForRange; import org.apache.cassandra.locator.Replica; import org.apache.cassandra.locator.ReplicaPlan; import org.apache.cassandra.metrics.ClientRangeRequestMetrics; import org.apache.cassandra.net.Message; -import org.apache.cassandra.net.MessagingService; import org.apache.cassandra.service.StorageProxy; +import org.apache.cassandra.service.accord.IAccordService; +import org.apache.cassandra.service.accord.txn.TxnResult; +import org.apache.cassandra.service.consensus.migration.ConsensusRequestRouter; +import org.apache.cassandra.service.consensus.migration.ConsensusRequestRouter.RangeReadTarget; +import org.apache.cassandra.service.consensus.migration.ConsensusRequestRouter.RangeReadWithTarget; import org.apache.cassandra.service.reads.DataResolver; import org.apache.cassandra.service.reads.ReadCallback; +import org.apache.cassandra.service.reads.ReadCoordinator; import org.apache.cassandra.service.reads.repair.ReadRepair; +import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.tracing.Tracing; import org.apache.cassandra.transport.Dispatcher; import org.apache.cassandra.utils.AbstractIterator; import org.apache.cassandra.utils.CloseableIterator; +import static com.google.common.base.Preconditions.checkState; +import static org.apache.cassandra.metrics.ClientRequestsMetricsHolder.readMetrics; +import static org.apache.cassandra.metrics.ClientRequestsMetricsHolder.readMetricsForLevel; import static org.apache.cassandra.utils.Clock.Global.nanoTime; @VisibleForTesting @@ -68,11 +83,11 @@ public class RangeCommandIterator extends AbstractIterator implemen final PartitionRangeReadCommand command; final boolean enforceStrictLiveness; final Dispatcher.RequestTime requestTime; + final ReadCoordinator readCoordinator; int rangesQueried; int batchesRequested = 0; - private DataLimits.Counter counter; private PartitionIterator sentQueryIterator; @@ -84,6 +99,7 @@ public class RangeCommandIterator extends AbstractIterator implemen RangeCommandIterator(CloseableIterator replicaPlans, PartitionRangeReadCommand command, + ReadCoordinator readCoordinator, int concurrencyFactor, int maxConcurrencyFactor, int totalRangeCount, @@ -91,6 +107,7 @@ public class RangeCommandIterator extends AbstractIterator implemen { this.replicaPlans = replicaPlans; this.command = command; + this.readCoordinator = readCoordinator; this.concurrencyFactor = concurrencyFactor; this.maxConcurrencyFactor = maxConcurrencyFactor; this.totalRangeCount = totalRangeCount; @@ -177,19 +194,16 @@ static int computeConcurrencyFactor(int totalRangeCount, int rangesQueried, int return concurrencyFactor; } - /** - * Queries the provided sub-range. - * - * @param replicaPlan the subRange to query. - * @param isFirst in the case where multiple queries are sent in parallel, whether that's the first query on - * that batch or not. The reason it matters is that whe paging queries, the command (more specifically the - * {@code DataLimits}) may have "state" information and that state may only be valid for the first query (in - * that it's the query that "continues" whatever we're previously queried). - */ - private SingleRangeResponse query(ReplicaPlan.ForRangeRead replicaPlan, boolean isFirst) + private PartitionIterator executeAccord(ClusterMetadata cm, PartitionRangeReadCommand rangeCommand, ConsistencyLevel cl) { - PartitionRangeReadCommand rangeCommand = command.forSubRange(replicaPlan.range(), isFirst); - + //TODO (expected): https://issues.apache.org/jira/browse/CASSANDRA-20210 More efficient reads across command stores + IAccordService.IAccordResult result = StorageProxy.readWithAccord(cm, rangeCommand, rangeCommand.dataRange().keyRange(), cl, requestTime); + return new AccordRangeResponse(result, rangeCommand.isReversed()); + } + + private SingleRangeResponse executeNormal(ReplicaPlan.ForRangeRead replicaPlan, PartitionRangeReadCommand rangeCommand, ReadCoordinator readCoordinator) + { + rangeCommand = (PartitionRangeReadCommand) readCoordinator.maybeAllowOutOfRangeReads(rangeCommand, replicaPlan.consistencyLevel()); // If enabled, request repaired data tracking info from full replicas, but // only if there are multiple full replicas to compare results from. boolean trackRepairedStatus = DatabaseDescriptor.getRepairedDataTrackingForRangeReadsEnabled() @@ -197,13 +211,13 @@ private SingleRangeResponse query(ReplicaPlan.ForRangeRead replicaPlan, boolean ReplicaPlan.SharedForRangeRead sharedReplicaPlan = ReplicaPlan.shared(replicaPlan); ReadRepair readRepair = - ReadRepair.create(command, sharedReplicaPlan, requestTime); + ReadRepair.create(readCoordinator, command, sharedReplicaPlan, requestTime); DataResolver resolver = - new DataResolver<>(rangeCommand, sharedReplicaPlan, readRepair, requestTime, trackRepairedStatus); + new DataResolver<>(readCoordinator, rangeCommand, sharedReplicaPlan, readRepair, requestTime, trackRepairedStatus); ReadCallback handler = - new ReadCallback<>(resolver, rangeCommand, sharedReplicaPlan, requestTime); + new ReadCallback<>(resolver, rangeCommand, sharedReplicaPlan, requestTime); - if (replicaPlan.contacts().size() == 1 && replicaPlan.contacts().get(0).isSelf()) + if (replicaPlan.contacts().size() == 1 && replicaPlan.contacts().get(0).isSelf() && readCoordinator.localReadSupported()) { Stage.READ.execute(new StorageProxy.LocalReadRunnable(rangeCommand, handler, requestTime, trackRepairedStatus)); } @@ -214,27 +228,131 @@ private SingleRangeResponse query(ReplicaPlan.ForRangeRead replicaPlan, boolean Tracing.trace("Enqueuing request to {}", replica); ReadCommand command = replica.isFull() ? rangeCommand : rangeCommand.copyAsTransientQuery(replica); Message message = command.createMessage(trackRepairedStatus && replica.isFull(), requestTime); - MessagingService.instance().sendWithCallback(message, replica.endpoint(), handler); + readCoordinator.sendReadCommand(message, replica.endpoint(), handler); } } - return new SingleRangeResponse(resolver, handler, readRepair); } + + /** + * Queries the provided sub-range. + * + * @param replicaPlan the subRange to query. + * @param isFirst in the case where multiple queries are sent in parallel, whether that's the first query on + * that batch or not. The reason it matters is that whe paging queries, the command (more specifically the + * {@code DataLimits}) may have "state" information and that state may only be valid for the first query (in + * that it's the query that "continues" whatever we're previously queried). + */ + private PartitionIterator query(ClusterMetadata cm, ReplicaPlan.ForRangeRead replicaPlan, ReadCoordinator readCoordinator, List> readRepairs, boolean isFirst) + { + PartitionRangeReadCommand rangeCommand = command.forSubRange(replicaPlan.range(), isFirst); + + // Accord interop execution should always be coordinated through the C* plumbing + if (!readCoordinator.isEventuallyConsistent()) + { + SingleRangeResponse response = executeNormal(replicaPlan, rangeCommand, readCoordinator); + readRepairs.add(response.getReadRepair()); + return response; + } + + List reads = ConsensusRequestRouter.splitReadIntoAccordAndNormal(cm, rangeCommand, readCoordinator, requestTime); + // Special case returning directly to avoid wrapping the iterator and applying the limits an extra time + if (reads.size() == 1) + { + RangeReadWithTarget rangeReadWithTarget = reads.get(0); + checkState(rangeReadWithTarget.read.dataRange().keyRange().equals(rangeCommand.dataRange().keyRange())); + if (rangeReadWithTarget.target == RangeReadTarget.accord && readCoordinator.isEventuallyConsistent()) + { + return executeAccord(cm, + rangeReadWithTarget.read, + replicaPlan.consistencyLevel()); + } + else + { + SingleRangeResponse response = executeNormal(replicaPlan, rangeReadWithTarget.read, readCoordinator); + readRepairs.add(response.getReadRepair()); + return response; + } + } + + // TODO (review): Should this be reworked to execute the queries serially from the iterator? It would respect + // any provided limits better but the number of queries created will generally be low (2-3) + List responses = new ArrayList<>(reads.size() + 1); + // Dummy iterator that checks all the responses for retry on different system hasNext so we don't read + // from the first iterator when the second needs to be retried because the split was wrong + responses.add(new PartitionIterator() + { + @Override + public void close() + { + + } + + @Override + public boolean hasNext() + { + for (int i = 1; i < responses.size(); i++) + responses.get(i).hasNext(); + return false; + } + + @Override + public RowIterator next() + { + throw new NoSuchElementException(); + } + }); + + for (RangeReadWithTarget rangeReadWithTarget : reads) + { + if (rangeReadWithTarget.target == RangeReadTarget.accord && readCoordinator.isEventuallyConsistent()) + responses.add(executeAccord(cm, rangeReadWithTarget.read, replicaPlan.consistencyLevel())); + else + { + SingleRangeResponse response = executeNormal(replicaPlan, rangeReadWithTarget.read, readCoordinator); + responses.add(response); + readRepairs.add(response.getReadRepair()); + } + } + + /* + * We have to apply limits here if the query spans different systems because each subquery we created + * could have gaps in the results since the limit is pushed down independently to each subquery. + * So if we don't meet the limit in the first subquery, it's not safe to go to the next one unless + * we fully exhausted the data the first subquery might have reached + */ + return command.limits().filter(PartitionIterators.concat(responses), + 0, + command.selectsFullPartition(), + command.metadata().enforceStrictLiveness()); + } + PartitionIterator sendNextRequests() { List concurrentQueries = new ArrayList<>(concurrencyFactor); List> readRepairs = new ArrayList<>(concurrencyFactor); + ClusterMetadata cm = ClusterMetadata.current(); try { for (int i = 0; i < concurrencyFactor && replicaPlans.hasNext(); ) { ReplicaPlan.ForRangeRead replicaPlan = replicaPlans.next(); - - SingleRangeResponse response = query(replicaPlan, i == 0); + boolean isFirst = i == 0; + PartitionIterator response; + // Only add the retry wrapper to reroute for the top level coordinator execution + // not Accord's interop execution + if (readCoordinator.isEventuallyConsistent()) + { + Function querySupplier = clusterMetadata -> query(clusterMetadata, replicaPlan, readCoordinator, readRepairs, isFirst); + response = retryingPartitionIterator(querySupplier, replicaPlan.consistencyLevel()); + } + else + { + response = query(cm, replicaPlan, readCoordinator, readRepairs, isFirst); + } concurrentQueries.add(response); - readRepairs.add(response.getReadRepair()); // due to RangeMerger, coordinator may fetch more ranges than required by concurrency factor. rangesQueried += replicaPlan.vnodeCount(); i += replicaPlan.vnodeCount(); @@ -256,6 +374,57 @@ PartitionIterator sendNextRequests() return counter.applyTo(StorageProxy.concatAndBlockOnRepair(concurrentQueries, readRepairs)); } + // Wrap the iterator to retry if request routing is incorrect + private PartitionIterator retryingPartitionIterator(Function attempt, ConsistencyLevel cl) + { + return new PartitionIterator() + { + private ClusterMetadata lastClusterMetadata = ClusterMetadata.current(); + private PartitionIterator delegate = attempt.apply(lastClusterMetadata); + + @Override + public void close() + { + delegate.close(); + } + + @Override + public boolean hasNext() + { + while (true) + { + try + { + return delegate.hasNext(); + } + catch (RetryOnDifferentSystemException e) + { + readMetrics.retryDifferentSystem.mark(); + readMetricsForLevel(cl).retryDifferentSystem.mark(); + logger.debug("Retrying range read on different system because some reads were misrouted according to Accord"); + Tracing.trace("Got {} from range reads, will retry", e); + } + catch (CoordinatorBehindException e) + { + readMetrics.retryCoordinatorBehind.mark(); + readMetricsForLevel(cl).retryCoordinatorBehind.mark(); + logger.debug("Retrying range read now that coordinator has caught up to cluster metadata"); + Tracing.trace("Got {} from range reads, will retry", e); + } + // Fetch the next epoch to retry + lastClusterMetadata = ClusterMetadata.current(); + delegate = attempt.apply(lastClusterMetadata); + } + } + + @Override + public RowIterator next() + { + return delegate.next(); + } + }; + } + @Override public void close() { diff --git a/src/java/org/apache/cassandra/service/reads/range/RangeCommands.java b/src/java/org/apache/cassandra/service/reads/range/RangeCommands.java index cebd3bdf68ee..ad6dfc150a02 100644 --- a/src/java/org/apache/cassandra/service/reads/range/RangeCommands.java +++ b/src/java/org/apache/cassandra/service/reads/range/RangeCommands.java @@ -34,6 +34,7 @@ import org.apache.cassandra.index.Index; import org.apache.cassandra.locator.ReplicaPlans; import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.service.reads.ReadCoordinator; import org.apache.cassandra.tracing.Tracing; import org.apache.cassandra.transport.Dispatcher; import org.apache.cassandra.utils.FBUtilities; @@ -55,10 +56,11 @@ public class RangeCommands public static PartitionIterator partitions(PartitionRangeReadCommand command, ConsistencyLevel consistencyLevel, + ReadCoordinator readCoordinator, Dispatcher.RequestTime requestTime) { // Note that in general, a RangeCommandIterator will honor the command limit for each range, but will not enforce it globally. - RangeCommandIterator rangeCommands = rangeCommandIterator(command, consistencyLevel, requestTime); + RangeCommandIterator rangeCommands = rangeCommandIterator(command, consistencyLevel, readCoordinator, requestTime); return command.limits().filter(command.postReconciliationProcessing(rangeCommands), command.nowInSec(), command.selectsFullPartition(), @@ -68,6 +70,7 @@ public static PartitionIterator partitions(PartitionRangeReadCommand command, @VisibleForTesting static RangeCommandIterator rangeCommandIterator(PartitionRangeReadCommand command, ConsistencyLevel consistencyLevel, + ReadCoordinator readCoordinator, Dispatcher.RequestTime requestTime) { Tracing.trace("Computing ranges to query"); @@ -76,10 +79,11 @@ static RangeCommandIterator rangeCommandIterator(PartitionRangeReadCommand comma ReplicaPlanIterator replicaPlans = new ReplicaPlanIterator(command.dataRange().keyRange(), command.indexQueryPlan(), keyspace, + command.metadata().id(), consistencyLevel); if (command.isTopK()) - return new ScanAllRangesCommandIterator(keyspace, replicaPlans, command, replicaPlans.size(), requestTime); + return new ScanAllRangesCommandIterator(keyspace, replicaPlans, command, readCoordinator, replicaPlans.size(), requestTime); int maxConcurrencyFactor = Math.min(replicaPlans.size(), MAX_CONCURRENT_RANGE_REQUESTS); int concurrencyFactor = maxConcurrencyFactor; @@ -107,9 +111,10 @@ static RangeCommandIterator rangeCommandIterator(PartitionRangeReadCommand comma Tracing.trace("Submitting range requests on {} ranges with a concurrency of {}", replicaPlans.size(), concurrencyFactor); } - ReplicaPlanMerger mergedReplicaPlans = new ReplicaPlanMerger(replicaPlans, keyspace, consistencyLevel); + ReplicaPlanMerger mergedReplicaPlans = new ReplicaPlanMerger(replicaPlans, keyspace, command.metadata().id(), consistencyLevel); return new RangeCommandIterator(mergedReplicaPlans, command, + readCoordinator, concurrencyFactor, maxConcurrencyFactor, replicaPlans.size(), @@ -147,11 +152,12 @@ public static boolean sufficientLiveNodesForSelectStar(TableMetadata metadata, C ReplicaPlanIterator rangeIterator = new ReplicaPlanIterator(DataRange.allData(metadata.partitioner).keyRange(), null, keyspace, + metadata.id, consistency); // Called for the side effect of running assureSufficientLiveReplicasForRead. // Deliberately called with an invalid vnode count in case it is used elsewhere in the future.. - rangeIterator.forEachRemaining(r -> ReplicaPlans.forRangeRead(keyspace, null, consistency, r.range(), -1)); + rangeIterator.forEachRemaining(r -> ReplicaPlans.forRangeRead(keyspace, metadata.id, null, consistency, r.range(), -1)); return true; } catch (UnavailableException e) diff --git a/src/java/org/apache/cassandra/service/reads/range/ReplicaPlanIterator.java b/src/java/org/apache/cassandra/service/reads/range/ReplicaPlanIterator.java index 969247b7227a..e138fab4f122 100644 --- a/src/java/org/apache/cassandra/service/reads/range/ReplicaPlanIterator.java +++ b/src/java/org/apache/cassandra/service/reads/range/ReplicaPlanIterator.java @@ -37,6 +37,7 @@ import org.apache.cassandra.locator.ReplicaPlan; import org.apache.cassandra.locator.ReplicaPlans; import org.apache.cassandra.schema.ReplicationParams; +import org.apache.cassandra.schema.TableId; import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.tcm.compatibility.TokenRingUtils; import org.apache.cassandra.utils.AbstractIterator; @@ -46,6 +47,7 @@ class ReplicaPlanIterator extends AbstractIterator { private final Keyspace keyspace; private final ConsistencyLevel consistency; + private final TableId tableId; private final Index.QueryPlan indexQueryPlan; @VisibleForTesting final Iterator> ranges; @@ -54,10 +56,12 @@ class ReplicaPlanIterator extends AbstractIterator ReplicaPlanIterator(AbstractBounds keyRange, @Nullable Index.QueryPlan indexQueryPlan, Keyspace keyspace, + TableId tableId, ConsistencyLevel consistency) { this.indexQueryPlan = indexQueryPlan; this.keyspace = keyspace; + this.tableId = tableId; this.consistency = consistency; ReplicationParams replication = keyspace.getMetadata().params.replication; @@ -82,7 +86,7 @@ protected ReplicaPlan.ForRangeRead computeNext() if (!ranges.hasNext()) return endOfData(); - return ReplicaPlans.forRangeRead(keyspace, indexQueryPlan, consistency, ranges.next(), 1); + return ReplicaPlans.forRangeRead(keyspace, tableId, indexQueryPlan, consistency, ranges.next(), 1); } /** diff --git a/src/java/org/apache/cassandra/service/reads/range/ReplicaPlanMerger.java b/src/java/org/apache/cassandra/service/reads/range/ReplicaPlanMerger.java index 20e9562f9311..743ac8d8e6e9 100644 --- a/src/java/org/apache/cassandra/service/reads/range/ReplicaPlanMerger.java +++ b/src/java/org/apache/cassandra/service/reads/range/ReplicaPlanMerger.java @@ -27,6 +27,7 @@ import org.apache.cassandra.db.Keyspace; import org.apache.cassandra.locator.ReplicaPlan; import org.apache.cassandra.locator.ReplicaPlans; +import org.apache.cassandra.schema.TableId; import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.utils.AbstractIterator; @@ -34,11 +35,13 @@ class ReplicaPlanMerger extends AbstractIterator { private final Keyspace keyspace; private final ConsistencyLevel consistency; + private final TableId tableId; private final PeekingIterator ranges; - ReplicaPlanMerger(Iterator iterator, Keyspace keyspace, ConsistencyLevel consistency) + ReplicaPlanMerger(Iterator iterator, Keyspace keyspace, TableId tableId, ConsistencyLevel consistency) { this.keyspace = keyspace; + this.tableId = tableId; this.consistency = consistency; this.ranges = Iterators.peekingIterator(iterator); } @@ -66,7 +69,7 @@ protected ReplicaPlan.ForRangeRead computeNext() break; ReplicaPlan.ForRangeRead next = ranges.peek(); - ReplicaPlan.ForRangeRead merged = ReplicaPlans.maybeMerge(metadata, keyspace, consistency, current, next); + ReplicaPlan.ForRangeRead merged = ReplicaPlans.maybeMerge(metadata, keyspace, tableId, consistency, current, next); if (merged == null) break; diff --git a/src/java/org/apache/cassandra/service/reads/range/ScanAllRangesCommandIterator.java b/src/java/org/apache/cassandra/service/reads/range/ScanAllRangesCommandIterator.java index 53f55f8938ae..8ebc6a6697fc 100644 --- a/src/java/org/apache/cassandra/service/reads/range/ScanAllRangesCommandIterator.java +++ b/src/java/org/apache/cassandra/service/reads/range/ScanAllRangesCommandIterator.java @@ -38,6 +38,7 @@ import org.apache.cassandra.net.MessagingService; import org.apache.cassandra.service.reads.DataResolver; import org.apache.cassandra.service.reads.ReadCallback; +import org.apache.cassandra.service.reads.ReadCoordinator; import org.apache.cassandra.service.reads.repair.NoopReadRepair; import org.apache.cassandra.tracing.Tracing; import org.apache.cassandra.transport.Dispatcher; @@ -62,10 +63,11 @@ public class ScanAllRangesCommandIterator extends RangeCommandIterator ScanAllRangesCommandIterator(Keyspace keyspace, CloseableIterator replicaPlans, PartitionRangeReadCommand command, + ReadCoordinator readCoordinator, int totalRangeCount, Dispatcher.RequestTime requestTime) { - super(replicaPlans, command, totalRangeCount, totalRangeCount, totalRangeCount, requestTime); + super(replicaPlans, command, readCoordinator, totalRangeCount, totalRangeCount, totalRangeCount, requestTime); Preconditions.checkState(command.isTopK()); this.keyspace = keyspace; @@ -92,7 +94,7 @@ protected PartitionIterator sendNextRequests() ReplicaPlan.ForRangeRead plan = ReplicaPlans.forFullRangeRead(keyspace, consistencyLevel, command.dataRange().keyRange(), replicasToQuery, totalRangeCount); ReplicaPlan.SharedForRangeRead sharedReplicaPlan = ReplicaPlan.shared(plan); - DataResolver resolver = new DataResolver<>(command, sharedReplicaPlan, NoopReadRepair.instance, requestTime, false); + DataResolver resolver = new DataResolver<>(ReadCoordinator.DEFAULT, command, sharedReplicaPlan, NoopReadRepair.instance, requestTime, false); ReadCallback handler = new ReadCallback<>(resolver, command, sharedReplicaPlan, requestTime); int nodes = 0; diff --git a/src/java/org/apache/cassandra/service/reads/range/SingleRangeResponse.java b/src/java/org/apache/cassandra/service/reads/range/SingleRangeResponse.java index d318b41614c2..027405803090 100644 --- a/src/java/org/apache/cassandra/service/reads/range/SingleRangeResponse.java +++ b/src/java/org/apache/cassandra/service/reads/range/SingleRangeResponse.java @@ -23,6 +23,7 @@ import org.apache.cassandra.exceptions.ReadTimeoutException; import org.apache.cassandra.locator.EndpointsForRange; import org.apache.cassandra.locator.ReplicaPlan; +import org.apache.cassandra.locator.ReplicaPlan.ForRangeRead; import org.apache.cassandra.service.reads.DataResolver; import org.apache.cassandra.service.reads.ReadCallback; import org.apache.cassandra.service.reads.repair.ReadRepair; @@ -45,7 +46,7 @@ class SingleRangeResponse extends AbstractIterator implements Parti this.readRepair = readRepair; } - ReadRepair getReadRepair() + public ReadRepair getReadRepair() { return readRepair; } diff --git a/src/java/org/apache/cassandra/service/reads/repair/AbstractReadRepair.java b/src/java/org/apache/cassandra/service/reads/repair/AbstractReadRepair.java index 8343b83b071e..418deb4aa2c8 100644 --- a/src/java/org/apache/cassandra/service/reads/repair/AbstractReadRepair.java +++ b/src/java/org/apache/cassandra/service/reads/repair/AbstractReadRepair.java @@ -39,11 +39,11 @@ import org.apache.cassandra.locator.ReplicaPlan; import org.apache.cassandra.metrics.ReadRepairMetrics; import org.apache.cassandra.net.Message; -import org.apache.cassandra.net.MessagingService; import org.apache.cassandra.service.StorageProxy; import org.apache.cassandra.service.reads.DataResolver; import org.apache.cassandra.service.reads.DigestResolver; import org.apache.cassandra.service.reads.ReadCallback; +import org.apache.cassandra.service.reads.ReadCoordinator; import org.apache.cassandra.tracing.Tracing; import org.apache.cassandra.transport.Dispatcher; @@ -54,6 +54,7 @@ public abstract class AbstractReadRepair, P extends Repli { protected static final Logger logger = LoggerFactory.getLogger(AbstractReadRepair.class); + protected final ReadCoordinator coordinator; protected final ReadCommand command; protected final Dispatcher.RequestTime requestTime; protected final ReplicaPlan.Shared replicaPlan; @@ -75,10 +76,11 @@ public DigestRepair(DataResolver dataResolver, ReadCallback readCall } } - public AbstractReadRepair(ReadCommand command, + public AbstractReadRepair(ReadCoordinator coordinator, ReadCommand command, ReplicaPlan.Shared replicaPlan, Dispatcher.RequestTime requestTime) { + this.coordinator = coordinator; this.command = command; this.requestTime = requestTime; this.replicaPlan = replicaPlan; @@ -92,9 +94,9 @@ protected P replicaPlan() void sendReadCommand(Replica to, ReadCallback readCallback, boolean speculative, boolean trackRepairedStatus) { - ReadCommand command = this.command; - - if (to.isSelf()) + ReadCommand command = coordinator.maybeAllowOutOfRangeReads(this.command, replicaPlan().consistencyLevel()); + + if (to.isSelf() && coordinator.localReadSupported()) { Stage.READ.maybeExecuteImmediately(new StorageProxy.LocalReadRunnable(command, readCallback, requestTime, trackRepairedStatus)); return; @@ -117,7 +119,7 @@ void sendReadCommand(Replica to, ReadCallback readCallback, boolean specul } Message message = command.createMessage(trackRepairedStatus && to.isFull(), requestTime); - MessagingService.instance().sendWithCallback(message, to.endpoint(), readCallback); + coordinator.sendReadCommand(message, to.endpoint(), readCallback); } abstract Meter getRepairMeter(); @@ -139,7 +141,7 @@ public void startRepair(DigestResolver digestResolver, Consumer resolver = new DataResolver<>(command, replicaPlan, this, requestTime, trackRepairedStatus); + DataResolver resolver = new DataResolver<>(coordinator, command, replicaPlan, this, requestTime, trackRepairedStatus); ReadCallback readCallback = new ReadCallback<>(resolver, command, replicaPlan, requestTime); digestRepair = new DigestRepair<>(resolver, readCallback, resultConsumer); diff --git a/src/java/org/apache/cassandra/service/reads/repair/BlockingPartitionRepair.java b/src/java/org/apache/cassandra/service/reads/repair/BlockingPartitionRepair.java index 61b529ca003b..c3c61181c9bd 100644 --- a/src/java/org/apache/cassandra/service/reads/repair/BlockingPartitionRepair.java +++ b/src/java/org/apache/cassandra/service/reads/repair/BlockingPartitionRepair.java @@ -21,9 +21,6 @@ import java.util.List; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; - -import org.apache.cassandra.utils.concurrent.AsyncFuture; -import org.apache.cassandra.utils.concurrent.CountDownLatch; import java.util.concurrent.TimeUnit; import com.google.common.annotations.VisibleForTesting; @@ -38,27 +35,33 @@ import org.apache.cassandra.db.Mutation; import org.apache.cassandra.db.partitions.PartitionUpdate; import org.apache.cassandra.locator.EndpointsForToken; +import org.apache.cassandra.locator.InOurDc; import org.apache.cassandra.locator.InetAddressAndPort; import org.apache.cassandra.locator.Replica; import org.apache.cassandra.locator.ReplicaPlan; import org.apache.cassandra.locator.Replicas; -import org.apache.cassandra.locator.InOurDc; import org.apache.cassandra.metrics.ReadRepairMetrics; -import org.apache.cassandra.net.RequestCallback; import org.apache.cassandra.net.Message; import org.apache.cassandra.net.MessagingService; +import org.apache.cassandra.net.RequestCallback; import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.service.reads.ReadCoordinator; +import org.apache.cassandra.service.reads.repair.BlockingReadRepair.PendingPartitionRepair; import org.apache.cassandra.tracing.Tracing; +import org.apache.cassandra.utils.concurrent.AsyncFuture; +import org.apache.cassandra.utils.concurrent.CountDownLatch; import org.apache.cassandra.utils.concurrent.UncheckedInterruptedException; -import static org.apache.cassandra.net.Verb.*; +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.collect.Iterables.all; +import static org.apache.cassandra.net.Verb.READ_REPAIR_REQ; import static org.apache.cassandra.utils.Clock.Global.nanoTime; import static org.apache.cassandra.utils.concurrent.CountDownLatch.newCountDownLatch; -import static com.google.common.collect.Iterables.all; public class BlockingPartitionRepair - extends AsyncFuture implements RequestCallback + extends AsyncFuture implements RequestCallback, PendingPartitionRepair { + private final ReadCoordinator coordinator; private final DecoratedKey key; private final ReplicaPlan.ForWrite repairPlan; private final Map pendingRepairs; @@ -66,8 +69,10 @@ public class BlockingPartitionRepair private final int blockFor; private volatile long mutationsSentTime; - public BlockingPartitionRepair(DecoratedKey key, Map repairs, ReplicaPlan.ForWrite repairPlan) + @VisibleForTesting + public BlockingPartitionRepair(ReadCoordinator coordinator, DecoratedKey key, Map repairs, ReplicaPlan.ForWrite repairPlan) { + this.coordinator = coordinator; this.key = key; this.pendingRepairs = new ConcurrentHashMap<>(repairs); this.repairPlan = repairPlan; @@ -99,18 +104,21 @@ public BlockingPartitionRepair(DecoratedKey key, Map repairs, latch = newCountDownLatch(Math.max(blockFor, 0)); } + @Override public ReplicaPlan.ForWrite repairPlan() { return repairPlan; } - int blockFor() + @Override + public int blockFor() { return blockFor; } @VisibleForTesting - int waitingOn() + @Override + public int waitingOn() { return latch.count(); } @@ -147,7 +155,8 @@ private PartitionUpdate mergeUnackedUpdates() @VisibleForTesting protected void sendRR(Message message, InetAddressAndPort endpoint) { - MessagingService.instance().sendWithCallback(message, endpoint, this); + checkArgument(message.payload.potentialTxnConflicts() == coordinator.potentialTxnConflicts(), "Mutation allowing transaction conflicts should match coordinator"); + coordinator.sendReadRepairMutation(message, endpoint, this); } public void sendInitialRepairs() @@ -158,7 +167,7 @@ public void sendInitialRepairs() for (Map.Entry entry: pendingRepairs.entrySet()) { Replica destination = entry.getKey(); - Preconditions.checkArgument(destination.isFull(), "Can't send repairs to transient replicas: %s", destination); + checkArgument(destination.isFull(), "Can't send repairs to transient replicas: %s", destination); Mutation mutation = entry.getValue(); TableId tableId = extractUpdate(mutation).metadata().id; @@ -177,6 +186,7 @@ public void sendInitialRepairs() * @param timeUnit the time unit of the future time * @return true if repair is done; otherwise, false. */ + @Override public boolean awaitRepairsUntil(long timeoutAt, TimeUnit timeUnit) { long timeoutAtNanos = timeUnit.toNanos(timeoutAt); @@ -191,18 +201,18 @@ public boolean awaitRepairsUntil(long timeoutAt, TimeUnit timeUnit) } } + @Override + public boolean awaitRepairs(long remaining, TimeUnit timeUnit) throws InterruptedException + { + return latch.await(remaining, timeUnit); + } + private static int msgVersionIdx(int version) { return version - MessagingService.minimum_version; } - /** - * If it looks like we might not receive acks for all the repair mutations we sent out, combine all - * the unacked mutations and send them to the minority of nodes not involved in the read repair data - * read / write cycle. We will accept acks from them in lieu of acks from the initial mutations sent - * out, so long as we receive the same number of acks as repair mutations transmitted. This prevents - * misbehaving nodes from killing a quorum read, while continuing to guarantee monotonic quorum reads - */ + @Override public void maybeSendAdditionalWrites(long timeout, TimeUnit timeoutUnit) { if (awaitRepairsUntil(timeout + timeoutUnit.convert(mutationsSentTime, TimeUnit.NANOSECONDS), timeoutUnit)) @@ -231,7 +241,7 @@ public void maybeSendAdditionalWrites(long timeout, TimeUnit timeoutUnit) if (mutation == null) { - mutation = BlockingReadRepairs.createRepairMutation(update, repairPlan.consistencyLevel(), replica.endpoint(), true); + mutation = BlockingReadRepairs.createRepairMutation(update, repairPlan.consistencyLevel(), replica.endpoint(), true, coordinator.potentialTxnConflicts()); versionedMutations[versionIdx] = mutation; } diff --git a/src/java/org/apache/cassandra/service/reads/repair/BlockingReadRepair.java b/src/java/org/apache/cassandra/service/reads/repair/BlockingReadRepair.java index 4a56e6fe18bd..ee66574b016e 100644 --- a/src/java/org/apache/cassandra/service/reads/repair/BlockingReadRepair.java +++ b/src/java/org/apache/cassandra/service/reads/repair/BlockingReadRepair.java @@ -21,27 +21,53 @@ import java.util.Map; import java.util.Queue; import java.util.concurrent.ConcurrentLinkedQueue; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; -import org.apache.cassandra.db.DecoratedKey; +import com.google.common.util.concurrent.UncheckedExecutionException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import accord.primitives.Keys; +import accord.primitives.Txn; import com.codahale.metrics.Meter; +import org.apache.cassandra.concurrent.Stage; import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.db.ConsistencyLevel; +import org.apache.cassandra.db.DecoratedKey; import org.apache.cassandra.db.Mutation; import org.apache.cassandra.db.ReadCommand; +import org.apache.cassandra.db.ReadCommand.PotentialTxnConflicts; import org.apache.cassandra.db.partitions.UnfilteredPartitionIterators; import org.apache.cassandra.exceptions.ReadTimeoutException; import org.apache.cassandra.locator.Endpoints; import org.apache.cassandra.locator.Replica; import org.apache.cassandra.locator.ReplicaPlan; +import org.apache.cassandra.locator.ReplicaPlan.ForWrite; import org.apache.cassandra.metrics.ReadRepairMetrics; +import org.apache.cassandra.service.accord.AccordService; +import org.apache.cassandra.service.accord.api.PartitionKey; +import org.apache.cassandra.service.accord.serializers.TableMetadatas; +import org.apache.cassandra.service.accord.serializers.TableMetadatasAndKeys; +import org.apache.cassandra.service.accord.txn.TxnQuery; +import org.apache.cassandra.service.accord.txn.TxnRead; +import org.apache.cassandra.service.accord.txn.TxnResult; +import org.apache.cassandra.service.accord.txn.UnrecoverableRepairUpdate; +import org.apache.cassandra.service.consensus.TransactionalMode; +import org.apache.cassandra.service.consensus.migration.ConsensusMigrationMutationHelper; +import org.apache.cassandra.service.consensus.migration.TransactionalMigrationFromMode; +import org.apache.cassandra.service.reads.ReadCoordinator; import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.tracing.Tracing; import org.apache.cassandra.transport.Dispatcher; +import org.apache.cassandra.utils.concurrent.Future; +import org.apache.cassandra.utils.concurrent.UncheckedInterruptedException; +import static com.google.common.base.Preconditions.checkState; import static java.util.concurrent.TimeUnit.MICROSECONDS; import static java.util.concurrent.TimeUnit.NANOSECONDS; +import static org.apache.cassandra.utils.Clock.Global.nanoTime; /** * 'Classic' read repair. Doesn't allow the client read to return until @@ -53,13 +79,66 @@ public class BlockingReadRepair, P extends ReplicaPlan.Fo { private static final Logger logger = LoggerFactory.getLogger(BlockingReadRepair.class); - protected final Queue repairs = new ConcurrentLinkedQueue<>(); + protected final Queue repairs = new ConcurrentLinkedQueue<>(); + + interface PendingPartitionRepair + { + + /** + * Wait for the repair to complete util a future time + * If the {@param timeoutAt} is a past time, the method returns immediately with the repair result. + * @param timeoutAt future time + * @param timeUnit the time unit of the future time + * @return true if repair is done; otherwise, false. + */ + default boolean awaitRepairsUntil(long timeoutAt, TimeUnit timeUnit) + { + long timeoutAtNanos = timeUnit.toNanos(timeoutAt); + long remaining = timeoutAtNanos - nanoTime(); + try + { + return awaitRepairs(remaining, timeUnit); + } + catch (InterruptedException e) + { + throw new UncheckedInterruptedException(e); + } + catch (ExecutionException e) + { + throw new UncheckedExecutionException(e); + } + } + + boolean awaitRepairs(long remaining, TimeUnit timeUnit) throws InterruptedException, ExecutionException; + + /** + * If it looks like we might not receive acks for all the repair mutations we sent out, combine all + * the unacked mutations and send them to the minority of nodes not involved in the read repair data + * read / write cycle. We will accept acks from them in lieu of acks from the initial mutations sent + * out, so long as we receive the same number of acks as repair mutations transmitted. This prevents + * misbehaving nodes from killing a quorum read, while continuing to guarantee monotonic quorum reads + */ + default void maybeSendAdditionalWrites(long timeout, TimeUnit timeoutUnit) {} + + default int blockFor() + { + return -1; + } + + default int waitingOn() + { + return -1; + } + + ForWrite repairPlan(); + } - BlockingReadRepair(ReadCommand command, ReplicaPlan.Shared replicaPlan, Dispatcher.RequestTime requestTime) + BlockingReadRepair(ReadCoordinator coordinator, ReadCommand command, ReplicaPlan.Shared replicaPlan, Dispatcher.RequestTime requestTime) { - super(command, replicaPlan, requestTime); + super(coordinator, command, replicaPlan, requestTime); } + @Override public UnfilteredPartitionIterators.MergeListener getMergeListener(P replicaPlan) { return new PartitionIteratorMergeListener<>(replicaPlan, command, this); @@ -74,7 +153,7 @@ Meter getRepairMeter() @Override public void maybeSendAdditionalWrites() { - for (BlockingPartitionRepair repair: repairs) + for (PendingPartitionRepair repair: repairs) { repair.maybeSendAdditionalWrites(cfs.additionalWriteLatencyMicros, MICROSECONDS); } @@ -83,10 +162,10 @@ public void maybeSendAdditionalWrites() @Override public void awaitWrites() { - BlockingPartitionRepair timedOut = null; + PendingPartitionRepair timedOut = null; ReplicaPlan.ForWrite repairPlan = null; - for (BlockingPartitionRepair repair : repairs) + for (PendingPartitionRepair repair : repairs) { long deadline = requestTime.computeDeadline(DatabaseDescriptor.getReadRpcTimeout(NANOSECONDS)); @@ -116,10 +195,111 @@ public void awaitWrites() } @Override - public void repairPartition(DecoratedKey partitionKey, Map mutations, ReplicaPlan.ForWrite writePlan) + public void repairPartition(DecoratedKey dk, Map mutations, ReplicaPlan.ForWrite writePlan, ReadRepairSource rrSource) + { + // non-Accord reads only ever touch one table and key so all mutations need to be applied either transactionally + // or non-transactionally (not a mix). There is no retry loop here because read repair is relatively rare so it racing + // with changes to migrating ranges should also be pretty rare so it isn't worth the added complexity. If you were + // to add a retry loop you would need to be careful to correctly set/unset allowPotentialTransactionConflicts in the mutations + // since that is set if it is routed to Accord + // + // If this is an Accord transaction that is in interoperability mode and executing a read repair + // then we take the non-transactional path and the mutations are intercepted in ReadCoordinator.sendRepairMutation + // which will ensure the repair mutation runs in the command store thread after preceding transactions are done + ClusterMetadata cm = ClusterMetadata.current(); + if (coordinator.isEventuallyConsistent() && ConsensusMigrationMutationHelper.tokenShouldBeWrittenThroughAccord(cm, command.metadata().id, dk.getToken(), TransactionalMode::readRepairsThroughAccord, TransactionalMigrationFromMode::readRepairsThroughAccord)) + repairViaAccordTransaction(dk, mutations, writePlan); + else + repairViaReadCoordinator(dk, mutations, writePlan, rrSource); + } + + /* + * Create a new Accord transaction to apply this blocking read repair ensuring that any data being written + * consists of already committed Accord writes just by virtue of creating a new transaction which must occur + * after any already partially applied transactions whose writes might be present in the repair mutation. + */ + private void repairViaAccordTransaction(DecoratedKey dk, Map accordMutations, ForWrite writePlan) { - BlockingPartitionRepair blockingRepair = new BlockingPartitionRepair(partitionKey, mutations, writePlan); + checkState(coordinator.isEventuallyConsistent(), "Should only repair transactionally for an eventually consistent read coordinator"); + ReadRepairMetrics.repairedBlockingViaAccord.mark(); + PartitionKey partitionKey = new PartitionKey(command.metadata().id, dk); + Keys keys = Keys.of(partitionKey); + // This is going create a new BlockingReadRepair inside an Accord transaction which will go down + // the !isEventuallyConsistent path and apply the repairs through Accord command stores using AccordInteropExecution + UnrecoverableRepairUpdate repairUpdate = new UnrecoverableRepairUpdate(AccordService.instance().nodeId(), this, keys, dk, accordMutations, writePlan); + + /* + * The motivation for using a read to apply read repair is that we want to apply the writes in the execute phase + * so it takes fewer roundtrips and re-use a lot of the AccordInteropExecution code. We don't want to wait for + * the extra roundtrip for apply since this is blocking a read. + * + * The reason this is safe/correct even though read transactions commute with each other is that read transactions + * don't return a result when they are recovered so there is no race with recovery coordinators to worry about. + * The remaining concern of a Read transaction seeing a torn write from an Accord transaction can't happen because + * this RR mutation only contains already applied Accord writes and possibly some non-transactional writes + * that need to be read repaired. + * + * Really the partialy applied Accord writes could just be barriered instead of read repaired, but we use this + * approach so we can read repair non-transactional writes as well. This doesn't make that any more deterministic + * since overlapping non-transactional writes with transactional reads will never be deterministic, but it combines + * the two things into the same mechanism and we can't tell the origin of the writes needing read repair anyways. + */ + TableMetadatasAndKeys tablesAndKeys = new TableMetadatasAndKeys(TableMetadatas.of(command.metadata()), keys); + Txn txn = new Txn.InMemory(Txn.Kind.Read, keys, TxnRead.createNoOpRead(keys), TxnQuery.NONE, repairUpdate, tablesAndKeys); + Future repairFuture = Stage.ACCORD_MIGRATION.submit(() -> AccordService.instance().coordinate(command.metadata().epoch.getEpoch(), txn, ConsistencyLevel.ANY, requestTime)); + + repairs.add(new PendingPartitionRepair() + { + @Override + public boolean awaitRepairs(long remaining, TimeUnit timeUnit) throws InterruptedException, ExecutionException + { + try + { + repairFuture.get(remaining, timeUnit); + return true; + } + catch (TimeoutException e) + { + + return false; + } + } + + @Override + public ForWrite repairPlan() + { + return writePlan; + } + }); + } + + /* + * ReadCoordinator could be an Accord transaction if this is already in an Accord transaction or a regular + * non-transactional read coordinator. We might take this path because transactional repair is not needed, or this + * is an Accord transaction and the Accord read coordinator will take care of proxying the mutations through command + * stores + */ + private void repairViaReadCoordinator(DecoratedKey dk, Map mutations, ForWrite writePlan, ReadRepairSource rrSource) + { + // Accord read at QUORUM and found it needed to read repair, this means txn recovery is non-deterministic + if (rrSource == ReadRepairSource.OTHER && !coordinator.isEventuallyConsistent()) + ReadRepairMetrics.repairedBlockingFromAccord.mark(); + BlockingPartitionRepair blockingRepair = new BlockingPartitionRepair(coordinator, dk, mutations, writePlan); blockingRepair.sendInitialRepairs(); repairs.add(blockingRepair); } + + public void repairPartitionDirectly(ReadCoordinator readCoordinator, DecoratedKey dk, Map mutations, ForWrite writePlan) + { + ReadRepair delegateRR = ReadRepairStrategy.BLOCKING.create(readCoordinator, command, replicaPlan, requestTime); + delegateRR.repairPartition(dk, mutations, writePlan, ReadRepairSource.REPAIR_VIA_ACCORD); + delegateRR.maybeSendAdditionalWrites(); + delegateRR.awaitWrites(); + } + + @Override + public PotentialTxnConflicts coordinatorPotentialTxnConflicts() + { + return coordinator.potentialTxnConflicts(); + } } diff --git a/src/java/org/apache/cassandra/service/reads/repair/BlockingReadRepairs.java b/src/java/org/apache/cassandra/service/reads/repair/BlockingReadRepairs.java index de49f5a5636f..3986a42bf4db 100644 --- a/src/java/org/apache/cassandra/service/reads/repair/BlockingReadRepairs.java +++ b/src/java/org/apache/cassandra/service/reads/repair/BlockingReadRepairs.java @@ -26,6 +26,7 @@ import org.apache.cassandra.db.Keyspace; import org.apache.cassandra.db.Mutation; import org.apache.cassandra.db.MutationExceededMaxSizeException; +import org.apache.cassandra.db.ReadCommand.PotentialTxnConflicts; import org.apache.cassandra.db.partitions.PartitionUpdate; import org.apache.cassandra.exceptions.ReadTimeoutException; import org.apache.cassandra.locator.InetAddressAndPort; @@ -46,13 +47,13 @@ public class BlockingReadRepairs * Create a read repair mutation from the given update, if the mutation is not larger than the maximum * mutation size, otherwise return null. Or, if we're configured to be strict, throw an exception. */ - public static Mutation createRepairMutation(PartitionUpdate update, ConsistencyLevel consistency, InetAddressAndPort destination, boolean suppressException) + public static Mutation createRepairMutation(PartitionUpdate update, ConsistencyLevel consistency, InetAddressAndPort destination, boolean suppressException, PotentialTxnConflicts potentialTxnConflicts) { if (update == null) return null; DecoratedKey key = update.partitionKey(); - Mutation mutation = new Mutation(update); + Mutation mutation = new Mutation(update, potentialTxnConflicts); int messagingVersion = MessagingService.instance().versions.get(destination); try diff --git a/src/java/org/apache/cassandra/service/reads/repair/NoopReadRepair.java b/src/java/org/apache/cassandra/service/reads/repair/NoopReadRepair.java index 5cf72b33cf85..c44a0b9ce0c1 100644 --- a/src/java/org/apache/cassandra/service/reads/repair/NoopReadRepair.java +++ b/src/java/org/apache/cassandra/service/reads/repair/NoopReadRepair.java @@ -30,6 +30,7 @@ import org.apache.cassandra.locator.Replica; import org.apache.cassandra.locator.ReplicaPlan; import org.apache.cassandra.service.reads.DigestResolver; +import org.apache.cassandra.service.reads.ReadCoordinator; /** * Bypasses the read repair path for short read protection and testing @@ -75,8 +76,15 @@ public void awaitWrites() } @Override - public void repairPartition(DecoratedKey partitionKey, Map mutations, ReplicaPlan.ForWrite writePlan) + public void repairPartition(DecoratedKey partitionKey, Map mutations, ReplicaPlan.ForWrite writePlan, ReadRepairSource rrSource) { } + + @Override + public void repairPartitionDirectly(ReadCoordinator coordinator, DecoratedKey partitionKey, Map mutations, ReplicaPlan.ForWrite writePlan) + { + // Shouldn't be possible to invoke this since repairPartition is a no op + throw new UnsupportedOperationException(); + } } diff --git a/src/java/org/apache/cassandra/service/reads/repair/PartitionIteratorMergeListener.java b/src/java/org/apache/cassandra/service/reads/repair/PartitionIteratorMergeListener.java index f77bd4d52ca0..5aacaf43299d 100644 --- a/src/java/org/apache/cassandra/service/reads/repair/PartitionIteratorMergeListener.java +++ b/src/java/org/apache/cassandra/service/reads/repair/PartitionIteratorMergeListener.java @@ -49,7 +49,7 @@ public UnfilteredRowIterators.MergeListener getRowMergeListener(DecoratedKey par return new RowIteratorMergeListener<>(partitionKey, columns(versions), isReversed(versions), replicaPlan, command, readRepair); } - protected RegularAndStaticColumns columns(List versions) + public static RegularAndStaticColumns columns(List versions) { Columns statics = Columns.NONE; Columns regulars = Columns.NONE; diff --git a/src/java/org/apache/cassandra/service/reads/repair/ReadOnlyReadRepair.java b/src/java/org/apache/cassandra/service/reads/repair/ReadOnlyReadRepair.java index 46b30a927935..9560247ef021 100644 --- a/src/java/org/apache/cassandra/service/reads/repair/ReadOnlyReadRepair.java +++ b/src/java/org/apache/cassandra/service/reads/repair/ReadOnlyReadRepair.java @@ -28,8 +28,10 @@ import org.apache.cassandra.locator.Endpoints; import org.apache.cassandra.locator.Replica; import org.apache.cassandra.locator.ReplicaPlan; +import org.apache.cassandra.locator.ReplicaPlan.ForWrite; import org.apache.cassandra.metrics.ReadRepairMetrics; import org.apache.cassandra.transport.Dispatcher; +import org.apache.cassandra.service.reads.ReadCoordinator; /** * Only performs the collection of data responses and reconciliation of them, doesn't send repair mutations @@ -38,9 +40,9 @@ public class ReadOnlyReadRepair, P extends ReplicaPlan.ForRead> extends AbstractReadRepair { - ReadOnlyReadRepair(ReadCommand command, ReplicaPlan.Shared replicaPlan, Dispatcher.RequestTime requestTime) + ReadOnlyReadRepair(ReadCoordinator coordinator, ReadCommand command, ReplicaPlan.Shared replicaPlan, Dispatcher.RequestTime requestTime) { - super(command, replicaPlan, requestTime); + super(coordinator, command, replicaPlan, requestTime); } @Override @@ -62,7 +64,13 @@ public void maybeSendAdditionalWrites() } @Override - public void repairPartition(DecoratedKey partitionKey, Map mutations, ReplicaPlan.ForWrite writePlan) + public void repairPartition(DecoratedKey partitionKey, Map mutations, ReplicaPlan.ForWrite writePlan, ReadRepairSource rrSource) + { + throw new UnsupportedOperationException("ReadOnlyReadRepair shouldn't be trying to repair partitions"); + } + + @Override + public void repairPartitionDirectly(ReadCoordinator coordinator, DecoratedKey partitionKey, Map mutations, ForWrite writePlan) { throw new UnsupportedOperationException("ReadOnlyReadRepair shouldn't be trying to repair partitions"); } diff --git a/src/java/org/apache/cassandra/service/reads/repair/ReadRepair.java b/src/java/org/apache/cassandra/service/reads/repair/ReadRepair.java index a63cc7f6bfca..b21765203e87 100644 --- a/src/java/org/apache/cassandra/service/reads/repair/ReadRepair.java +++ b/src/java/org/apache/cassandra/service/reads/repair/ReadRepair.java @@ -19,18 +19,20 @@ import java.util.Map; import java.util.function.Consumer; +import javax.annotation.Nullable; import org.apache.cassandra.db.DecoratedKey; -import org.apache.cassandra.locator.Endpoints; - import org.apache.cassandra.db.Mutation; import org.apache.cassandra.db.ReadCommand; +import org.apache.cassandra.db.ReadCommand.PotentialTxnConflicts; import org.apache.cassandra.db.partitions.PartitionIterator; import org.apache.cassandra.db.partitions.UnfilteredPartitionIterators; import org.apache.cassandra.exceptions.ReadTimeoutException; +import org.apache.cassandra.locator.Endpoints; import org.apache.cassandra.locator.Replica; import org.apache.cassandra.locator.ReplicaPlan; import org.apache.cassandra.service.reads.DigestResolver; +import org.apache.cassandra.service.reads.ReadCoordinator; import org.apache.cassandra.transport.Dispatcher; public interface ReadRepair, P extends ReplicaPlan.ForRead> @@ -38,13 +40,13 @@ public interface ReadRepair, P extends ReplicaPlan.ForRea public interface Factory { , P extends ReplicaPlan.ForRead> - ReadRepair create(ReadCommand command, ReplicaPlan.Shared replicaPlan, Dispatcher.RequestTime requestTime); + ReadRepair create(ReadCoordinator coordinator, ReadCommand command, ReplicaPlan.Shared replicaPlan, Dispatcher.RequestTime requestTime); } static , P extends ReplicaPlan.ForRead> - ReadRepair create(ReadCommand command, ReplicaPlan.Shared replicaPlan, Dispatcher.RequestTime requestTime) + ReadRepair create(ReadCoordinator coordinator, ReadCommand command, ReplicaPlan.Shared replicaPlan, Dispatcher.RequestTime requestTime) { - return command.metadata().params.readRepair.create(command, replicaPlan, requestTime); + return command.metadata().params.readRepair.create(coordinator, command, replicaPlan, requestTime); } /** @@ -58,7 +60,7 @@ ReadRepair create(ReadCommand command, ReplicaPlan.Shared replicaPla * @param digestResolver supplied so we can get the original data response * @param resultConsumer hook for the repair to set it's result on completion */ - public void startRepair(DigestResolver digestResolver, Consumer resultConsumer); + public void startRepair(DigestResolver digestResolver, @Nullable Consumer resultConsumer); /** * Block on the reads (or timeout) sent out in {@link ReadRepair#startRepair} @@ -89,9 +91,28 @@ ReadRepair create(ReadCommand command, ReplicaPlan.Shared replicaPla */ public void awaitWrites(); + // For metrics need to know the source of the repair + enum ReadRepairSource + { + // Running a dedicated repair transacation to do RR for a non-transactional reads + REPAIR_VIA_ACCORD, + // Read repair from a regular Accord transaction or non-transactional read + OTHER + } + /** * Repairs a partition _after_ receiving data responses. This method receives replica list, since * we will block repair only on the replicas that have responded. */ - void repairPartition(DecoratedKey partitionKey, Map mutations, ReplicaPlan.ForWrite writePlan); + void repairPartition(DecoratedKey partitionKey, Map mutations, ReplicaPlan.ForWrite writePlan, ReadRepairSource rrSource); + + /** + * Repairs a partition using the provided read coordinator + */ + void repairPartitionDirectly(ReadCoordinator coordinator, DecoratedKey partitionKey, Map mutations, ReplicaPlan.ForWrite writePlan); + + default PotentialTxnConflicts coordinatorPotentialTxnConflicts() + { + return PotentialTxnConflicts.DISALLOW; + } } diff --git a/src/java/org/apache/cassandra/service/reads/repair/ReadRepairStrategy.java b/src/java/org/apache/cassandra/service/reads/repair/ReadRepairStrategy.java index 22615494a748..7f8d861888a5 100644 --- a/src/java/org/apache/cassandra/service/reads/repair/ReadRepairStrategy.java +++ b/src/java/org/apache/cassandra/service/reads/repair/ReadRepairStrategy.java @@ -22,6 +22,7 @@ import org.apache.cassandra.locator.Endpoints; import org.apache.cassandra.locator.ReplicaPlan; import org.apache.cassandra.transport.Dispatcher; +import org.apache.cassandra.service.reads.ReadCoordinator; import static org.apache.cassandra.utils.LocalizeString.toUpperCaseLocalized; @@ -30,18 +31,18 @@ public enum ReadRepairStrategy implements ReadRepair.Factory NONE { public , P extends ReplicaPlan.ForRead> - ReadRepair create(ReadCommand command, ReplicaPlan.Shared replicaPlan, Dispatcher.RequestTime requestTime) + ReadRepair create(ReadCoordinator coordinator, ReadCommand command, ReplicaPlan.Shared replicaPlan, Dispatcher.RequestTime requestTime) { - return new ReadOnlyReadRepair<>(command, replicaPlan, requestTime); + return new ReadOnlyReadRepair<>(coordinator, command, replicaPlan, requestTime); } }, BLOCKING { public , P extends ReplicaPlan.ForRead> - ReadRepair create(ReadCommand command, ReplicaPlan.Shared replicaPlan, Dispatcher.RequestTime requestTime) + ReadRepair create(ReadCoordinator coordinator, ReadCommand command, ReplicaPlan.Shared replicaPlan, Dispatcher.RequestTime requestTime) { - return new BlockingReadRepair<>(command, replicaPlan, requestTime); + return new BlockingReadRepair<>(coordinator, command, replicaPlan, requestTime); } }; diff --git a/src/java/org/apache/cassandra/service/reads/repair/RowIteratorMergeListener.java b/src/java/org/apache/cassandra/service/reads/repair/RowIteratorMergeListener.java index acddf45d94ce..47bf0efeb129 100644 --- a/src/java/org/apache/cassandra/service/reads/repair/RowIteratorMergeListener.java +++ b/src/java/org/apache/cassandra/service/reads/repair/RowIteratorMergeListener.java @@ -52,6 +52,7 @@ import org.apache.cassandra.locator.Replica; import org.apache.cassandra.locator.ReplicaPlan; import org.apache.cassandra.schema.ColumnMetadata; +import org.apache.cassandra.service.reads.repair.ReadRepair.ReadRepairSource; public class RowIteratorMergeListener> implements UnfilteredRowIterators.MergeListener @@ -390,13 +391,13 @@ public void close() else if (repairs[i] != null) update = repairs[i].build(); - Mutation mutation = BlockingReadRepairs.createRepairMutation(update, readPlan.consistencyLevel(), replica.endpoint(), false); + Mutation mutation = BlockingReadRepairs.createRepairMutation(update, readPlan.consistencyLevel(), replica.endpoint(), false, readRepair.coordinatorPotentialTxnConflicts()); if (mutation == null) continue; mutations.put(replica, mutation); } - readRepair.repairPartition(partitionKey, mutations, repairPlan); + readRepair.repairPartition(partitionKey, mutations, repairPlan, ReadRepairSource.OTHER); } } diff --git a/src/java/org/apache/cassandra/service/reads/thresholds/WarningContext.java b/src/java/org/apache/cassandra/service/reads/thresholds/WarningContext.java index dd6ee2f1a6e8..5bb5deb99a90 100644 --- a/src/java/org/apache/cassandra/service/reads/thresholds/WarningContext.java +++ b/src/java/org/apache/cassandra/service/reads/thresholds/WarningContext.java @@ -22,7 +22,7 @@ import java.util.Map; import java.util.Set; -import org.apache.cassandra.exceptions.RequestFailureReason; +import org.apache.cassandra.exceptions.RequestFailure; import org.apache.cassandra.locator.InetAddressAndPort; import org.apache.cassandra.net.ParamType; @@ -43,31 +43,31 @@ public static boolean isSupported(Set keys) return !Collections.disjoint(keys, SUPPORTED); } - public RequestFailureReason updateCounters(Map params, InetAddressAndPort from) + public RequestFailure updateCounters(Map params, InetAddressAndPort from) { for (Map.Entry entry : params.entrySet()) { WarnAbortCounter counter = null; - RequestFailureReason reason = null; + RequestFailure reason = null; switch (entry.getKey()) { case ROW_INDEX_READ_SIZE_FAIL: - reason = RequestFailureReason.READ_SIZE; + reason = RequestFailure.READ_SIZE; case ROW_INDEX_READ_SIZE_WARN: counter = rowIndexReadSize; break; case LOCAL_READ_SIZE_FAIL: - reason = RequestFailureReason.READ_SIZE; + reason = RequestFailure.READ_SIZE; case LOCAL_READ_SIZE_WARN: counter = localReadSize; break; case TOMBSTONE_FAIL: - reason = RequestFailureReason.READ_TOO_MANY_TOMBSTONES; + reason = RequestFailure.READ_TOO_MANY_TOMBSTONES; case TOMBSTONE_WARNING: counter = tombstones; break; case TOO_MANY_REFERENCED_INDEXES_FAIL: - reason = RequestFailureReason.READ_TOO_MANY_INDEXES; + reason = RequestFailure.READ_TOO_MANY_INDEXES; case TOO_MANY_REFERENCED_INDEXES_WARN: counter = indexReadSSTablesCount; break; diff --git a/src/java/org/apache/cassandra/service/snapshot/SnapshotDetailsTabularData.java b/src/java/org/apache/cassandra/service/snapshot/SnapshotDetailsTabularData.java index 3d9609f2e740..7b7635b0a45e 100644 --- a/src/java/org/apache/cassandra/service/snapshot/SnapshotDetailsTabularData.java +++ b/src/java/org/apache/cassandra/service/snapshot/SnapshotDetailsTabularData.java @@ -81,7 +81,10 @@ public static void from(TableSnapshot details, TabularDataSupport result, Set T executeTask(AbstractSnapshotTask task) private synchronized void prePopulateSnapshots(TakeSnapshotTask task) { Map snapshotsToCreate = task.getSnapshotsToCreate(); - for (Map.Entry toCreateEntry : snapshotsToCreate.entrySet()) + Map snapshotsToOverwrite = new HashMap<>(); + List toCreate = new ArrayList<>(snapshotsToCreate.values()); + + for (TableSnapshot existingSnapshot : snapshots) { - if (snapshots.contains(toCreateEntry.getValue())) + for (Map.Entry toCreateEntry : snapshotsToCreate.entrySet()) { - throw new RuntimeException(format("Snapshot %s for %s.%s already exists.", - toCreateEntry.getValue().getTag(), - toCreateEntry.getValue().getKeyspaceName(), - toCreateEntry.getValue().getTableName())); + TableSnapshot snapshotToCreate = toCreateEntry.getValue(); + if (existingSnapshot.equals(toCreateEntry.getValue())) + { + if (!task.options.ephemeral) + { + throw new RuntimeException(format("Snapshot %s for %s.%s already exists.", + snapshotToCreate.getTag(), + snapshotToCreate.getKeyspaceName(), + snapshotToCreate.getTableName())); + } + + toCreate.remove(toCreateEntry.getValue()); + snapshotsToOverwrite.put(toCreateEntry.getKey(), existingSnapshot); + } } } - snapshots.addAll(snapshotsToCreate.values()); + snapshotsToCreate.putAll(snapshotsToOverwrite); + + snapshots.addAll(toCreate); } private static ScheduledExecutorPlus createSnapshotCleanupExecutor() diff --git a/src/java/org/apache/cassandra/service/snapshot/SnapshotOptions.java b/src/java/org/apache/cassandra/service/snapshot/SnapshotOptions.java index 409288f8bf17..8f487d24533d 100644 --- a/src/java/org/apache/cassandra/service/snapshot/SnapshotOptions.java +++ b/src/java/org/apache/cassandra/service/snapshot/SnapshotOptions.java @@ -50,27 +50,18 @@ public class SnapshotOptions public final Predicate sstableFilter; public final ColumnFamilyStore cfs; - private SnapshotOptions(SnapshotType type, - String tag, - DurationSpec.IntSecondsBound ttl, - Instant creationTime, - boolean skipFlush, - boolean ephemeral, - String[] entities, - RateLimiter rateLimiter, - Predicate sstableFilter, - ColumnFamilyStore cfs) + private SnapshotOptions(Builder builder) { - this.type = type; - this.tag = tag; - this.ttl = ttl; - this.creationTime = creationTime; - this.skipFlush = skipFlush; - this.ephemeral = ephemeral; - this.entities = entities; - this.rateLimiter = rateLimiter; - this.sstableFilter = sstableFilter; - this.cfs = cfs; + this.type = builder.type; + this.tag = builder.tag; + this.ttl = builder.ttl; + this.creationTime = builder.creationTime; + this.skipFlush = builder.skipFlush; + this.ephemeral = builder.ephemeral; + this.entities = builder.entities; + this.rateLimiter = builder.rateLimiter; + this.sstableFilter = builder.sstableFilter; + this.cfs = builder.cfs; } public static Builder systemSnapshot(String tag, SnapshotType type, String... entities) @@ -214,8 +205,7 @@ public SnapshotOptions build() if (rateLimiter == null) rateLimiter = DatabaseDescriptor.getSnapshotRateLimiter(); - return new SnapshotOptions(type, tag, ttl, creationTime, skipFlush, ephemeral, entities, rateLimiter, - sstableFilter, cfs); + return new SnapshotOptions(this); } } diff --git a/src/java/org/apache/cassandra/service/snapshot/TableSnapshot.java b/src/java/org/apache/cassandra/service/snapshot/TableSnapshot.java index 7698ea1e3da2..a23aa7a2b165 100644 --- a/src/java/org/apache/cassandra/service/snapshot/TableSnapshot.java +++ b/src/java/org/apache/cassandra/service/snapshot/TableSnapshot.java @@ -207,10 +207,21 @@ public boolean isExpiring() } public long computeSizeOnDiskBytes() + { + return computeSizeOnDiskBytes(false); + } + + /** + * + * @param refresh true if a caller wants to recompute otherwise cached size + * @return on disk bytes + */ + public long computeSizeOnDiskBytes(boolean refresh) { long sum = sizeOnDisk; - if (sum == 0) + if (sum == 0 || refresh) { + sum = 0; for (File snapshotDir : snapshotDirs) sum += FileUtils.folderSize(snapshotDir); diff --git a/src/java/org/apache/cassandra/service/snapshot/TakeSnapshotTask.java b/src/java/org/apache/cassandra/service/snapshot/TakeSnapshotTask.java index a0d81a685ccc..236a3ab783aa 100644 --- a/src/java/org/apache/cassandra/service/snapshot/TakeSnapshotTask.java +++ b/src/java/org/apache/cassandra/service/snapshot/TakeSnapshotTask.java @@ -163,7 +163,7 @@ private void createSnapshot(ColumnFamilyStore cfs, TableSnapshot snapshotToCreat for (SSTableReader ssTable : currentView.sstables) { File snapshotDirectory = Directories.getSnapshotDirectory(ssTable.descriptor, snapshotName); - ssTable.createLinks(snapshotDirectory.path(), options.rateLimiter); // hard links + ssTable.createLinks(snapshotDirectory.path(), options.rateLimiter, options.ephemeral); // hard links if (logger.isTraceEnabled()) logger.trace("Snapshot for {} keyspace data file {} created in {}", cfs.keyspace, ssTable.getFilename(), snapshotDirectory); sstables.add(ssTable); @@ -268,12 +268,40 @@ else if (entities != null) } + private SnapshotManifest createSnapshotManifest(SnapshotManifest manifest, File manifestFile) + { + SnapshotManifest oldManifest = null; + if (manifestFile.exists()) + { + try + { + oldManifest = SnapshotManifest.deserializeFromJsonFile(manifestFile); + } + catch (Throwable t) + { + logger.warn("Unable to read the content of old manifest {}", manifestFile); + } + } + + if (oldManifest != null) + { + Set deduplicates = new HashSet<>(); // set to deduplicate + deduplicates.addAll(oldManifest.getFiles()); + deduplicates.addAll(manifest.files); + + return new SnapshotManifest(new ArrayList<>(deduplicates), options.ttl, creationTime, options.ephemeral); + } + + return manifest; + } + private void writeSnapshotManifest(SnapshotManifest manifest, File manifestFile) { try { + SnapshotManifest toCreate = createSnapshotManifest(manifest, manifestFile); manifestFile.parent().tryCreateDirectories(); - manifest.serializeToJsonFile(manifestFile); + toCreate.serializeToJsonFile(manifestFile); } catch (IOException e) { diff --git a/src/java/org/apache/cassandra/streaming/OutgoingStream.java b/src/java/org/apache/cassandra/streaming/OutgoingStream.java index cc42ab6b8235..77386a622552 100644 --- a/src/java/org/apache/cassandra/streaming/OutgoingStream.java +++ b/src/java/org/apache/cassandra/streaming/OutgoingStream.java @@ -19,7 +19,10 @@ package org.apache.cassandra.streaming; import java.io.IOException; +import java.util.List; +import org.apache.cassandra.dht.Range; +import org.apache.cassandra.dht.Token; import org.apache.cassandra.schema.TableId; import org.apache.cassandra.utils.TimeUUID; @@ -54,4 +57,5 @@ public interface OutgoingStream long getEstimatedSize(); TableId getTableId(); int getNumFiles(); + List> ranges(); } diff --git a/src/java/org/apache/cassandra/streaming/SessionSummary.java b/src/java/org/apache/cassandra/streaming/SessionSummary.java index 9588e4918ffd..8bb1a1eb819e 100644 --- a/src/java/org/apache/cassandra/streaming/SessionSummary.java +++ b/src/java/org/apache/cassandra/streaming/SessionSummary.java @@ -25,7 +25,8 @@ import java.util.List; import org.apache.cassandra.db.TypeSizes; -import org.apache.cassandra.io.IVersionedSerializer; +import org.apache.cassandra.dht.IPartitioner; +import org.apache.cassandra.dht.IPartitionerDependentSerializer; import org.apache.cassandra.io.util.DataInputPlus; import org.apache.cassandra.io.util.DataOutputPlus; import org.apache.cassandra.locator.InetAddressAndPort; @@ -78,7 +79,7 @@ public int hashCode() return result; } - public static IVersionedSerializer serializer = new IVersionedSerializer() + public static IPartitionerDependentSerializer serializer = new IPartitionerDependentSerializer() { public void serialize(SessionSummary summary, DataOutputPlus out, int version) throws IOException { @@ -98,7 +99,7 @@ public void serialize(SessionSummary summary, DataOutputPlus out, int version) t } } - public SessionSummary deserialize(DataInputPlus in, int version) throws IOException + public SessionSummary deserialize(DataInputPlus in, IPartitioner partitioner, int version) throws IOException { InetAddressAndPort coordinator = inetAddressAndPortSerializer.deserialize(in, version); InetAddressAndPort peer = inetAddressAndPortSerializer.deserialize(in, version); diff --git a/src/java/org/apache/cassandra/streaming/StreamOperation.java b/src/java/org/apache/cassandra/streaming/StreamOperation.java index 98a4070d2b0c..b1c5908f7fe8 100644 --- a/src/java/org/apache/cassandra/streaming/StreamOperation.java +++ b/src/java/org/apache/cassandra/streaming/StreamOperation.java @@ -19,28 +19,31 @@ public enum StreamOperation { - OTHER("Other", true, false), // Fallback to avoid null types when deserializing from string - RESTORE_REPLICA_COUNT("Restore replica count", false, false), // Handles removeNode - DECOMMISSION("Unbootstrap", false, true), - RELOCATION("Relocation", false, true), - BOOTSTRAP("Bootstrap", false, true), - REBUILD("Rebuild", false, true), - BULK_LOAD("Bulk Load", true, false), - REPAIR("Repair", true, false); + OTHER("Other", true, false, false), // Fallback to avoid null types when deserializing from string + RESTORE_REPLICA_COUNT("Restore replica count", false, false, false), // Handles removeNode + DECOMMISSION("Unbootstrap", false, true, false), + RELOCATION("Relocation", false, true, false), + BOOTSTRAP("Bootstrap", false, true, false), + REBUILD("Rebuild", false, true, false), + BULK_LOAD("Bulk Load", true, false, false), + REPAIR("Repair", true, false, true); private final String description; private final boolean requiresViewBuild; private final boolean keepSSTableLevel; + private final boolean requiresBarrierTransaction; /** * @param description The operation description * @param requiresViewBuild Whether this operation requires views to be updated if it involves a base table + * @param requiresBarrierTransaction Requires barrier to ensure all data that was repaired is already committed by txn system */ - StreamOperation(String description, boolean requiresViewBuild, boolean keepSSTableLevel) + StreamOperation(String description, boolean requiresViewBuild, boolean keepSSTableLevel, boolean requiresBarrierTransaction) { this.description = description; this.requiresViewBuild = requiresViewBuild; this.keepSSTableLevel = keepSSTableLevel; + this.requiresBarrierTransaction = requiresBarrierTransaction; } public static StreamOperation fromString(String text) @@ -71,4 +74,9 @@ public boolean keepSSTableLevel() { return keepSSTableLevel; } + + public boolean requiresBarrierTransaction() + { + return requiresBarrierTransaction; + } } diff --git a/src/java/org/apache/cassandra/streaming/StreamPlan.java b/src/java/org/apache/cassandra/streaming/StreamPlan.java index 47fa9e1463bf..93b864d79b07 100644 --- a/src/java/org/apache/cassandra/streaming/StreamPlan.java +++ b/src/java/org/apache/cassandra/streaming/StreamPlan.java @@ -24,6 +24,8 @@ import org.apache.cassandra.locator.InetAddressAndPort; import org.apache.cassandra.locator.RangesAtEndpoint; import org.apache.cassandra.locator.Replica; +import org.apache.cassandra.schema.KeyspaceMetadata; +import org.apache.cassandra.schema.TableMetadata; import org.apache.cassandra.utils.TimeUUID; import static com.google.common.collect.Iterables.all; @@ -225,4 +227,31 @@ public StreamCoordinator getCoordinator() { return coordinator; } + + /** + * Returns an array containing the non-accord tables for the given keyspace. Since the relevant StreamPlan methods + * interpret an empty array to mean all tables, null is returned if there are no non-accord tables in + * the given keyspace + * @param ksm + * @return + */ + public static String[] nonAccordTablesForKeyspace(KeyspaceMetadata ksm) + { + String[] result = ksm.tables.stream() + .filter(tbl -> !tbl.isAccordEnabled()) + .map(tbl -> tbl.name) + .toArray(String[]::new); + + return result.length > 0 ? result : null; + } + + public static boolean hasNonAccordTables(KeyspaceMetadata ksm) + { + return ksm.tables.stream().anyMatch(tbl -> !tbl.isAccordEnabled()); + } + + public static boolean hasAccordTables(KeyspaceMetadata ksm) + { + return ksm.tables.stream().anyMatch(TableMetadata::isAccordEnabled); + } } diff --git a/src/java/org/apache/cassandra/streaming/StreamReceiveTask.java b/src/java/org/apache/cassandra/streaming/StreamReceiveTask.java index 002e1827148a..86b880cd89c3 100644 --- a/src/java/org/apache/cassandra/streaming/StreamReceiveTask.java +++ b/src/java/org/apache/cassandra/streaming/StreamReceiveTask.java @@ -17,6 +17,7 @@ */ package org.apache.cassandra.streaming; +import java.util.List; import java.util.concurrent.ExecutorService; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; @@ -25,7 +26,10 @@ import com.google.common.base.Preconditions; import org.slf4j.Logger; import org.slf4j.LoggerFactory; + import org.apache.cassandra.db.ColumnFamilyStore; +import org.apache.cassandra.dht.Range; +import org.apache.cassandra.dht.Token; import org.apache.cassandra.schema.TableId; import org.apache.cassandra.utils.JVMStabilityInspector; @@ -56,12 +60,16 @@ public class StreamReceiveTask extends StreamTask private int remoteStreamsReceived = 0; private long bytesReceived = 0; - public StreamReceiveTask(StreamSession session, TableId tableId, int totalStreams, long totalSize) + private List> ranges; + + public StreamReceiveTask(StreamSession session, TableId tableId, List> ranges, int totalStreams, long totalSize) { super(session, tableId); - this.receiver = ColumnFamilyStore.getIfExists(tableId).getStreamManager().createStreamReceiver(session, totalStreams); + Range.assertNormalized(ranges); + this.receiver = ColumnFamilyStore.getIfExists(tableId).getStreamManager().createStreamReceiver(session, ranges, totalStreams); this.totalStreams = totalStreams; this.totalSize = totalSize; + this.ranges = ranges; } /** @@ -164,6 +172,12 @@ public synchronized void abort() receiver.abort(); } + @Override + protected List> ranges() + { + return ranges; + } + @VisibleForTesting public static void shutdownAndWait(long timeout, TimeUnit unit) throws InterruptedException, TimeoutException { diff --git a/src/java/org/apache/cassandra/streaming/StreamSession.java b/src/java/org/apache/cassandra/streaming/StreamSession.java index 217c70f0586f..447621c78c9f 100644 --- a/src/java/org/apache/cassandra/streaming/StreamSession.java +++ b/src/java/org/apache/cassandra/streaming/StreamSession.java @@ -36,7 +36,6 @@ import java.util.concurrent.TimeUnit; import java.util.function.Function; import java.util.stream.Collectors; - import javax.annotation.Nullable; import com.google.common.annotations.VisibleForTesting; @@ -45,13 +44,12 @@ import com.google.common.collect.Lists; import com.google.common.collect.Multimap; import com.google.common.collect.Sets; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import io.netty.channel.Channel; import io.netty.util.concurrent.Future; //checkstyle: permit this import -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - import org.apache.cassandra.concurrent.ScheduledExecutors; import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.db.ColumnFamilyStore; @@ -72,7 +70,17 @@ import org.apache.cassandra.service.ActiveRepairService; import org.apache.cassandra.service.StorageService; import org.apache.cassandra.streaming.async.StreamingMultiplexedChannel; -import org.apache.cassandra.streaming.messages.*; +import org.apache.cassandra.streaming.messages.CompleteMessage; +import org.apache.cassandra.streaming.messages.IncomingStreamMessage; +import org.apache.cassandra.streaming.messages.OutgoingStreamMessage; +import org.apache.cassandra.streaming.messages.PrepareAckMessage; +import org.apache.cassandra.streaming.messages.PrepareSynAckMessage; +import org.apache.cassandra.streaming.messages.PrepareSynMessage; +import org.apache.cassandra.streaming.messages.ReceivedMessage; +import org.apache.cassandra.streaming.messages.SessionFailedMessage; +import org.apache.cassandra.streaming.messages.StreamInitMessage; +import org.apache.cassandra.streaming.messages.StreamMessage; +import org.apache.cassandra.streaming.messages.StreamMessageHeader; import org.apache.cassandra.utils.FBUtilities; import org.apache.cassandra.utils.JVMStabilityInspector; import org.apache.cassandra.utils.NoSpamLogger; @@ -454,6 +462,7 @@ synchronized void addTransferRanges(String keyspace, RangesAtEndpoint replicas, //Range and if it's transient RangesAtEndpoint unwrappedRanges = replicas.unwrap(); List streams = getOutgoingStreamsForRanges(unwrappedRanges, stores, pendingRepair, previewKind); + addTransferStreams(streams); Set> toBeUpdated = transferredRangesPerKeyspace.get(keyspace); if (toBeUpdated == null) @@ -735,7 +744,7 @@ else if (e instanceof TransactionAlreadyCompletedException && isFailedOrAborted( if (channel.connected()) { - state(State.FAILED); // make sure subsequent error handling sees the session in a final state + state(State.FAILED); // make sure subsequent error handling sees the session in a final state sendControlMessage(new SessionFailedMessage()).awaitUninterruptibly(); } StringBuilder failureReason = new StringBuilder("Failed because of an unknown exception\n"); @@ -1260,7 +1269,7 @@ public synchronized void prepareReceiving(StreamSummary summary) { failIfFinished(); if (summary.files > 0) - receivers.put(summary.tableId, new StreamReceiveTask(this, summary.tableId, summary.files, summary.totalSize)); + receivers.put(summary.tableId, new StreamReceiveTask(this, summary.tableId, summary.ranges, summary.files, summary.totalSize)); } private void startStreamingFiles(@Nullable PrepareDirection prepareDirection) @@ -1297,8 +1306,16 @@ public int getNumRequests() return requests.size(); } - @VisibleForTesting public int getNumTransfers() + { + return transfers.size(); + } + + //TODO (now, review): there were 2 tests that use this (nothing else) and both are checking that its > 1... but in both cases they are checking if there are transfer tasks, but there isn't any as the range doesn't have data... + // This looks like AccordBootstrapTest and LocalSyncTaskTest have a test bug, so rather than fixing this method was created to keep the old semantic... + @Deprecated(since = "5.1") + @VisibleForTesting + public int getNumKeyspaceTransfers() { return transferredRangesPerKeyspace.size(); } diff --git a/src/java/org/apache/cassandra/streaming/StreamSummary.java b/src/java/org/apache/cassandra/streaming/StreamSummary.java index 3f957c69a78c..b6b3545bf26a 100644 --- a/src/java/org/apache/cassandra/streaming/StreamSummary.java +++ b/src/java/org/apache/cassandra/streaming/StreamSummary.java @@ -19,14 +19,22 @@ import java.io.IOException; import java.io.Serializable; +import java.util.List; import com.google.common.base.Objects; - +import com.google.common.collect.ImmutableList; import org.apache.cassandra.db.TypeSizes; +import org.apache.cassandra.dht.IPartitioner; +import org.apache.cassandra.dht.Range; +import org.apache.cassandra.dht.Token; import org.apache.cassandra.io.IVersionedSerializer; import org.apache.cassandra.io.util.DataInputPlus; import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.net.MessagingService; +import org.apache.cassandra.schema.Schema; import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.utils.CollectionSerializers; /** * Summary of streaming. @@ -36,6 +44,7 @@ public class StreamSummary implements Serializable public static final IVersionedSerializer serializer = new StreamSummarySerializer(); public final TableId tableId; + public final List> ranges; /** * Number of files to transfer. Can be 0 if nothing to transfer for some streaming request. @@ -43,9 +52,10 @@ public class StreamSummary implements Serializable public final int files; public final long totalSize; - public StreamSummary(TableId tableId, int files, long totalSize) + public StreamSummary(TableId tableId, List> ranges, int files, long totalSize) { this.tableId = tableId; + this.ranges = ranges; this.files = files; this.totalSize = totalSize; } @@ -56,13 +66,13 @@ public boolean equals(Object o) if (this == o) return true; if (o == null || getClass() != o.getClass()) return false; StreamSummary summary = (StreamSummary) o; - return files == summary.files && totalSize == summary.totalSize && tableId.equals(summary.tableId); + return files == summary.files && totalSize == summary.totalSize && tableId.equals(summary.tableId) && ranges.equals(summary.ranges); } @Override public int hashCode() { - return Objects.hashCode(tableId, files, totalSize); + return Objects.hashCode(tableId, ranges, files, totalSize); } @Override @@ -70,6 +80,7 @@ public String toString() { final StringBuilder sb = new StringBuilder("StreamSummary{"); sb.append("path=").append(tableId); + sb.append(", ranges=").append(ranges); sb.append(", files=").append(files); sb.append(", totalSize=").append(totalSize); sb.append('}'); @@ -83,14 +94,28 @@ public void serialize(StreamSummary summary, DataOutputPlus out, int version) th summary.tableId.serialize(out); out.writeInt(summary.files); out.writeLong(summary.totalSize); + Token.logPartitioner = true; + if (version >= MessagingService.VERSION_51) + CollectionSerializers.serializeCollection(summary.ranges, out, version, Range.rangeSerializer); + Token.logPartitioner = false; } public StreamSummary deserialize(DataInputPlus in, int version) throws IOException { TableId tableId = TableId.deserialize(in); + int files = in.readInt(); long totalSize = in.readLong(); - return new StreamSummary(tableId, files, totalSize); + List> ranges = ImmutableList.of(); + if (version >= MessagingService.VERSION_51) + { + TableMetadata tableMetadata = Schema.instance.getTableMetadata(tableId); + IPartitioner p = tableMetadata != null ? tableMetadata.partitioner : IPartitioner.global(); + Token.logPartitioner = true; + ranges = CollectionSerializers.deserializeList(in, p, version, Range.rangeSerializer); + Token.logPartitioner = false; + } + return new StreamSummary(tableId, ranges, files, totalSize); } public long serializedSize(StreamSummary summary, int version) @@ -98,6 +123,8 @@ public long serializedSize(StreamSummary summary, int version) long size = summary.tableId.serializedSize(); size += TypeSizes.sizeof(summary.files); size += TypeSizes.sizeof(summary.totalSize); + if (version >= MessagingService.VERSION_51) + size += CollectionSerializers.serializedCollectionSize(summary.ranges, version, Range.rangeSerializer); return size; } } diff --git a/src/java/org/apache/cassandra/streaming/StreamTask.java b/src/java/org/apache/cassandra/streaming/StreamTask.java index 1e22c34ce9c9..886257fda7b8 100644 --- a/src/java/org/apache/cassandra/streaming/StreamTask.java +++ b/src/java/org/apache/cassandra/streaming/StreamTask.java @@ -17,6 +17,10 @@ */ package org.apache.cassandra.streaming; +import java.util.List; + +import org.apache.cassandra.dht.Range; +import org.apache.cassandra.dht.Token; import org.apache.cassandra.schema.TableId; /** @@ -51,11 +55,13 @@ protected StreamTask(StreamSession session, TableId tableId) */ public abstract void abort(); + protected abstract List> ranges(); + /** * @return StreamSummary that describes this task */ public StreamSummary getSummary() { - return new StreamSummary(tableId, getTotalNumberOfFiles(), getTotalSize()); + return new StreamSummary(tableId, ranges(), getTotalNumberOfFiles(), getTotalSize()); } } diff --git a/src/java/org/apache/cassandra/streaming/StreamTransferTask.java b/src/java/org/apache/cassandra/streaming/StreamTransferTask.java index 0721316ccbde..8b4fe1f1bd62 100644 --- a/src/java/org/apache/cassandra/streaming/StreamTransferTask.java +++ b/src/java/org/apache/cassandra/streaming/StreamTransferTask.java @@ -20,7 +20,10 @@ import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; +import java.util.HashSet; +import java.util.List; import java.util.Map; +import java.util.Set; import java.util.concurrent.ScheduledFuture; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; @@ -33,6 +36,8 @@ import org.slf4j.LoggerFactory; import org.apache.cassandra.concurrent.ScheduledExecutorPlus; +import org.apache.cassandra.dht.Range; +import org.apache.cassandra.dht.Token; import org.apache.cassandra.schema.TableId; import org.apache.cassandra.streaming.messages.OutgoingStreamMessage; import org.apache.cassandra.utils.ExecutorUtils; @@ -57,6 +62,8 @@ public class StreamTransferTask extends StreamTask private long totalSize = 0; private int totalFiles = 0; + private final Set> ranges = new HashSet<>(); + public StreamTransferTask(StreamSession session, TableId tableId) { super(session, tableId); @@ -70,6 +77,7 @@ public synchronized void addTransferStream(OutgoingStream stream) streams.put(message.header.sequenceNumber, message); totalSize += message.stream.getEstimatedSize(); totalFiles += message.stream.getNumFiles(); + ranges.addAll(stream.ranges()); } /** @@ -149,6 +157,12 @@ public synchronized void abort() } } + @Override + protected List> ranges() + { + return Range.normalize(ranges); + } + public synchronized int getTotalNumberOfFiles() { return totalFiles; diff --git a/src/java/org/apache/cassandra/streaming/TableStreamManager.java b/src/java/org/apache/cassandra/streaming/TableStreamManager.java index 208dc344a926..d19064c9577e 100644 --- a/src/java/org/apache/cassandra/streaming/TableStreamManager.java +++ b/src/java/org/apache/cassandra/streaming/TableStreamManager.java @@ -19,7 +19,10 @@ package org.apache.cassandra.streaming; import java.util.Collection; +import java.util.List; +import org.apache.cassandra.dht.Range; +import org.apache.cassandra.dht.Token; import org.apache.cassandra.locator.RangesAtEndpoint; import org.apache.cassandra.streaming.messages.StreamMessageHeader; import org.apache.cassandra.utils.TimeUUID; @@ -36,7 +39,7 @@ public interface TableStreamManager /** * Creates a {@link StreamReceiver} for the given session, expecting the given number of streams */ - StreamReceiver createStreamReceiver(StreamSession session, int totalStreams); + StreamReceiver createStreamReceiver(StreamSession session, List> ranges, int totalStreams); /** * Creates an {@link IncomingStream} for the given header diff --git a/src/java/org/apache/cassandra/streaming/async/StreamingMultiplexedChannel.java b/src/java/org/apache/cassandra/streaming/async/StreamingMultiplexedChannel.java index ce11c3996e56..751928c19096 100644 --- a/src/java/org/apache/cassandra/streaming/async/StreamingMultiplexedChannel.java +++ b/src/java/org/apache/cassandra/streaming/async/StreamingMultiplexedChannel.java @@ -95,7 +95,7 @@ public class StreamingMultiplexedChannel private static final int MAX_PARALLEL_TRANSFERS = STREAMING_SESSION_PARALLELTRANSFERS.getInt(DEFAULT_MAX_PARALLEL_TRANSFERS); // a simple mechansim for allowing a degree of fairness across multiple sessions - private static final Semaphore fileTransferSemaphore = newFairSemaphore(DEFAULT_MAX_PARALLEL_TRANSFERS); + private static final Semaphore fileTransferSemaphore = newFairSemaphore(MAX_PARALLEL_TRANSFERS); private final StreamingChannel.Factory factory; private final InetAddressAndPort to; diff --git a/src/java/org/apache/cassandra/streaming/management/StreamSummaryCompositeData.java b/src/java/org/apache/cassandra/streaming/management/StreamSummaryCompositeData.java index 05a0afcfd4a1..c79c1a9a1f0e 100644 --- a/src/java/org/apache/cassandra/streaming/management/StreamSummaryCompositeData.java +++ b/src/java/org/apache/cassandra/streaming/management/StreamSummaryCompositeData.java @@ -19,7 +19,14 @@ import java.util.HashMap; import java.util.Map; -import javax.management.openmbean.*; +import javax.management.openmbean.CompositeData; +import javax.management.openmbean.CompositeDataSupport; +import javax.management.openmbean.CompositeType; +import javax.management.openmbean.OpenDataException; +import javax.management.openmbean.OpenType; +import javax.management.openmbean.SimpleType; + +import com.google.common.collect.ImmutableList; import org.apache.cassandra.schema.TableId; import org.apache.cassandra.streaming.StreamSummary; @@ -75,6 +82,7 @@ public static StreamSummary fromCompositeData(CompositeData cd) { Object[] values = cd.getAll(ITEM_NAMES); return new StreamSummary(TableId.fromString((String) values[0]), + ImmutableList.of(), (int) values[1], (long) values[2]); } diff --git a/src/java/org/apache/cassandra/streaming/messages/CompleteMessage.java b/src/java/org/apache/cassandra/streaming/messages/CompleteMessage.java index bf3526663c2b..afb1c6c7b478 100644 --- a/src/java/org/apache/cassandra/streaming/messages/CompleteMessage.java +++ b/src/java/org/apache/cassandra/streaming/messages/CompleteMessage.java @@ -18,8 +18,8 @@ package org.apache.cassandra.streaming.messages; import org.apache.cassandra.io.util.DataInputPlus; -import org.apache.cassandra.streaming.StreamingDataOutputPlus; import org.apache.cassandra.streaming.StreamSession; +import org.apache.cassandra.streaming.StreamingDataOutputPlus; public class CompleteMessage extends StreamMessage { diff --git a/src/java/org/apache/cassandra/streaming/messages/IncomingStreamMessage.java b/src/java/org/apache/cassandra/streaming/messages/IncomingStreamMessage.java index ff1e61fd598b..e48d115e35d2 100644 --- a/src/java/org/apache/cassandra/streaming/messages/IncomingStreamMessage.java +++ b/src/java/org/apache/cassandra/streaming/messages/IncomingStreamMessage.java @@ -22,13 +22,12 @@ import org.apache.cassandra.db.ColumnFamilyStore; import org.apache.cassandra.io.util.DataInputPlus; - import org.apache.cassandra.streaming.IncomingStream; -import org.apache.cassandra.streaming.StreamingChannel; -import org.apache.cassandra.streaming.StreamingDataOutputPlus; import org.apache.cassandra.streaming.StreamManager; import org.apache.cassandra.streaming.StreamReceiveException; import org.apache.cassandra.streaming.StreamSession; +import org.apache.cassandra.streaming.StreamingChannel; +import org.apache.cassandra.streaming.StreamingDataOutputPlus; public class IncomingStreamMessage extends StreamMessage { diff --git a/src/java/org/apache/cassandra/streaming/messages/KeepAliveMessage.java b/src/java/org/apache/cassandra/streaming/messages/KeepAliveMessage.java index a09cfcae8200..42be1e99a1fd 100644 --- a/src/java/org/apache/cassandra/streaming/messages/KeepAliveMessage.java +++ b/src/java/org/apache/cassandra/streaming/messages/KeepAliveMessage.java @@ -18,8 +18,8 @@ package org.apache.cassandra.streaming.messages; import org.apache.cassandra.io.util.DataInputPlus; -import org.apache.cassandra.streaming.StreamingDataOutputPlus; import org.apache.cassandra.streaming.StreamSession; +import org.apache.cassandra.streaming.StreamingDataOutputPlus; public class KeepAliveMessage extends StreamMessage { diff --git a/src/java/org/apache/cassandra/streaming/messages/OutgoingStreamMessage.java b/src/java/org/apache/cassandra/streaming/messages/OutgoingStreamMessage.java index 4128ddb4b0fe..b83d7863fc1d 100644 --- a/src/java/org/apache/cassandra/streaming/messages/OutgoingStreamMessage.java +++ b/src/java/org/apache/cassandra/streaming/messages/OutgoingStreamMessage.java @@ -24,8 +24,8 @@ import org.apache.cassandra.io.util.DataInputPlus; import org.apache.cassandra.schema.TableId; import org.apache.cassandra.streaming.OutgoingStream; -import org.apache.cassandra.streaming.StreamingDataOutputPlus; import org.apache.cassandra.streaming.StreamSession; +import org.apache.cassandra.streaming.StreamingDataOutputPlus; import org.apache.cassandra.utils.FBUtilities; public class OutgoingStreamMessage extends StreamMessage diff --git a/src/java/org/apache/cassandra/streaming/messages/PrepareAckMessage.java b/src/java/org/apache/cassandra/streaming/messages/PrepareAckMessage.java index 479ef3424db0..f93b5afe3092 100644 --- a/src/java/org/apache/cassandra/streaming/messages/PrepareAckMessage.java +++ b/src/java/org/apache/cassandra/streaming/messages/PrepareAckMessage.java @@ -21,8 +21,8 @@ import java.io.IOException; import org.apache.cassandra.io.util.DataInputPlus; -import org.apache.cassandra.streaming.StreamingDataOutputPlus; import org.apache.cassandra.streaming.StreamSession; +import org.apache.cassandra.streaming.StreamingDataOutputPlus; public class PrepareAckMessage extends StreamMessage { diff --git a/src/java/org/apache/cassandra/streaming/messages/PrepareSynAckMessage.java b/src/java/org/apache/cassandra/streaming/messages/PrepareSynAckMessage.java index 9d97de69fac7..e052f4c3017b 100644 --- a/src/java/org/apache/cassandra/streaming/messages/PrepareSynAckMessage.java +++ b/src/java/org/apache/cassandra/streaming/messages/PrepareSynAckMessage.java @@ -23,9 +23,9 @@ import java.util.Collection; import org.apache.cassandra.io.util.DataInputPlus; -import org.apache.cassandra.streaming.StreamingDataOutputPlus; import org.apache.cassandra.streaming.StreamSession; import org.apache.cassandra.streaming.StreamSummary; +import org.apache.cassandra.streaming.StreamingDataOutputPlus; public class PrepareSynAckMessage extends StreamMessage { diff --git a/src/java/org/apache/cassandra/streaming/messages/PrepareSynMessage.java b/src/java/org/apache/cassandra/streaming/messages/PrepareSynMessage.java index 1160033bd3ae..c856f469838f 100644 --- a/src/java/org/apache/cassandra/streaming/messages/PrepareSynMessage.java +++ b/src/java/org/apache/cassandra/streaming/messages/PrepareSynMessage.java @@ -17,15 +17,15 @@ */ package org.apache.cassandra.streaming.messages; -import java.io.*; +import java.io.IOException; import java.util.ArrayList; import java.util.Collection; import org.apache.cassandra.io.util.DataInputPlus; -import org.apache.cassandra.streaming.StreamingDataOutputPlus; import org.apache.cassandra.streaming.StreamRequest; import org.apache.cassandra.streaming.StreamSession; import org.apache.cassandra.streaming.StreamSummary; +import org.apache.cassandra.streaming.StreamingDataOutputPlus; public class PrepareSynMessage extends StreamMessage { diff --git a/src/java/org/apache/cassandra/streaming/messages/ReceivedMessage.java b/src/java/org/apache/cassandra/streaming/messages/ReceivedMessage.java index 378f72f896da..67559596140e 100644 --- a/src/java/org/apache/cassandra/streaming/messages/ReceivedMessage.java +++ b/src/java/org/apache/cassandra/streaming/messages/ReceivedMessage.java @@ -17,12 +17,12 @@ */ package org.apache.cassandra.streaming.messages; -import java.io.*; +import java.io.IOException; import org.apache.cassandra.io.util.DataInputPlus; import org.apache.cassandra.schema.TableId; -import org.apache.cassandra.streaming.StreamingDataOutputPlus; import org.apache.cassandra.streaming.StreamSession; +import org.apache.cassandra.streaming.StreamingDataOutputPlus; public class ReceivedMessage extends StreamMessage { diff --git a/src/java/org/apache/cassandra/streaming/messages/SessionFailedMessage.java b/src/java/org/apache/cassandra/streaming/messages/SessionFailedMessage.java index f09b64327e05..7fa82d8f6770 100644 --- a/src/java/org/apache/cassandra/streaming/messages/SessionFailedMessage.java +++ b/src/java/org/apache/cassandra/streaming/messages/SessionFailedMessage.java @@ -18,8 +18,8 @@ package org.apache.cassandra.streaming.messages; import org.apache.cassandra.io.util.DataInputPlus; -import org.apache.cassandra.streaming.StreamingDataOutputPlus; import org.apache.cassandra.streaming.StreamSession; +import org.apache.cassandra.streaming.StreamingDataOutputPlus; public class SessionFailedMessage extends StreamMessage { diff --git a/src/java/org/apache/cassandra/streaming/messages/StreamInitMessage.java b/src/java/org/apache/cassandra/streaming/messages/StreamInitMessage.java index 889c732f0fc1..e78442334bf0 100644 --- a/src/java/org/apache/cassandra/streaming/messages/StreamInitMessage.java +++ b/src/java/org/apache/cassandra/streaming/messages/StreamInitMessage.java @@ -22,12 +22,12 @@ import org.apache.cassandra.db.TypeSizes; import org.apache.cassandra.io.util.DataInputPlus; import org.apache.cassandra.locator.InetAddressAndPort; -import org.apache.cassandra.streaming.StreamingChannel; -import org.apache.cassandra.streaming.StreamingDataOutputPlus; -import org.apache.cassandra.streaming.StreamOperation; import org.apache.cassandra.streaming.PreviewKind; +import org.apache.cassandra.streaming.StreamOperation; import org.apache.cassandra.streaming.StreamResultFuture; import org.apache.cassandra.streaming.StreamSession; +import org.apache.cassandra.streaming.StreamingChannel; +import org.apache.cassandra.streaming.StreamingDataOutputPlus; import org.apache.cassandra.utils.TimeUUID; import static org.apache.cassandra.locator.InetAddressAndPort.Serializer.inetAddressAndPortSerializer; diff --git a/src/java/org/apache/cassandra/streaming/messages/StreamMessage.java b/src/java/org/apache/cassandra/streaming/messages/StreamMessage.java index db393a54347f..186ac3274abd 100644 --- a/src/java/org/apache/cassandra/streaming/messages/StreamMessage.java +++ b/src/java/org/apache/cassandra/streaming/messages/StreamMessage.java @@ -22,9 +22,9 @@ import java.util.Map; import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.streaming.StreamSession; import org.apache.cassandra.streaming.StreamingChannel; import org.apache.cassandra.streaming.StreamingDataOutputPlus; -import org.apache.cassandra.streaming.StreamSession; /** * StreamMessage is an abstract base class that every messages in streaming protocol inherit. diff --git a/src/java/org/apache/cassandra/tcm/AbstractLocalProcessor.java b/src/java/org/apache/cassandra/tcm/AbstractLocalProcessor.java index da4e2e53ba88..72d46a5af5a5 100644 --- a/src/java/org/apache/cassandra/tcm/AbstractLocalProcessor.java +++ b/src/java/org/apache/cassandra/tcm/AbstractLocalProcessor.java @@ -18,7 +18,6 @@ package org.apache.cassandra.tcm; -import java.util.concurrent.TimeUnit; import java.util.function.Supplier; import org.slf4j.Logger; @@ -50,9 +49,9 @@ public AbstractLocalProcessor(LocalLog log) * the time when this method returns. */ @Override - public final Commit.Result commit(Entry.Id entryId, Transformation transform, final Epoch lastKnown, Retry.Deadline retryPolicy) + public final Commit.Result commit(Entry.Id entryId, Transformation transform, final Epoch lastKnown, Retry retryPolicy) { - while (!retryPolicy.reachedMax()) + while (!retryPolicy.hasExpired()) { ClusterMetadata previous = log.waitForHighestConsecutive(); if (!previous.fullCMSMembers().contains(FBUtilities.getBroadcastAddressAndPort())) @@ -112,7 +111,8 @@ public final Commit.Result commit(Entry.Id entryId, Transformation transform, fi } else { - retryPolicy.maybeSleep(); + if (!retryPolicy.maybeSleep()) + break; // TODO: could also add epoch from mis-application from [applied]. fetchLogAndWait(null, retryPolicy); } @@ -121,13 +121,12 @@ public final Commit.Result commit(Entry.Id entryId, Transformation transform, fi { logger.error("Caught error while trying to perform a local commit", e); JVMStabilityInspector.inspectThrowable(e); - retryPolicy.maybeSleep(); + if (!retryPolicy.maybeSleep()) + break; } } return Commit.Result.failed(SERVER_ERROR, - String.format("Could not perform commit after %d/%d tries. Time remaining: %dms", - retryPolicy.tries, retryPolicy.maxTries, - TimeUnit.NANOSECONDS.toMillis(retryPolicy.remainingNanos()))); + String.format("Could not perform commit; policy %s gave up", retryPolicy)); } public Commit.Result maybeFailure(Entry.Id entryId, Epoch lastKnown, Supplier orElse) @@ -172,7 +171,6 @@ private Transformation.Result executeStrictly(ClusterMetadata metadata, Transfor } } - private LogState toLogState(Transformation.Success success, Entry.Id entryId, Epoch lastKnown, Transformation transform) { if (lastKnown == null || lastKnown.isDirectlyBefore(success.metadata.epoch)) @@ -191,15 +189,12 @@ private LogState toLogState(Epoch lastKnown) // We can use local log here since we always call this method only if local log is up-to-date: // in case of a successful commit, we apply against latest metadata locally before committing, // and in case of a rejection, we fetch latest entries to verify linearizability. - logState = log.getCommittedEntries(lastKnown); + logState = log.getLocalEntries(lastKnown); } return logState; } - - @Override - public abstract ClusterMetadata fetchLogAndWait(Epoch waitFor, Retry.Deadline retryPolicy); + public abstract ClusterMetadata fetchLogAndWait(Epoch waitFor, Retry retryPolicy); protected abstract boolean tryCommitOne(Entry.Id entryId, Transformation transform, Epoch previousEpoch, Epoch nextEpoch); - -} \ No newline at end of file +} diff --git a/src/java/org/apache/cassandra/tcm/AtomicLongBackedProcessor.java b/src/java/org/apache/cassandra/tcm/AtomicLongBackedProcessor.java index df00253433ba..d124707300ab 100644 --- a/src/java/org/apache/cassandra/tcm/AtomicLongBackedProcessor.java +++ b/src/java/org/apache/cassandra/tcm/AtomicLongBackedProcessor.java @@ -18,8 +18,10 @@ package org.apache.cassandra.tcm; +import java.io.IOException; import java.util.ArrayList; import java.util.Collections; +import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.NavigableMap; @@ -30,8 +32,11 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import accord.utils.Invariants; +import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.tcm.log.Entry; import org.apache.cassandra.tcm.log.LocalLog; +import org.apache.cassandra.tcm.log.LogReader; import org.apache.cassandra.tcm.log.LogState; import org.apache.cassandra.tcm.log.LogStorage; @@ -71,11 +76,44 @@ protected boolean tryCommitOne(Entry.Id entryId, Transformation transform, Epoch } @Override - public ClusterMetadata fetchLogAndWait(Epoch waitFor, Retry.Deadline retry) + public ClusterMetadata fetchLogAndWait(Epoch waitFor, Retry retry) { return log.waitForHighestConsecutive(); } + @Override + public LogState getLocalState(Epoch lowEpoch, Epoch highEpoch, boolean includeSnapshot) + { + try + { + LogReader.EntryHolder state = log.storage().getEntries(Epoch.EMPTY, highEpoch); + ClusterMetadata metadata = new ClusterMetadata(DatabaseDescriptor.getPartitioner()); + + Iterator iter = state.iterator(); + ImmutableList.Builder rest = new ImmutableList.Builder<>(); + while (iter.hasNext()) + { + Entry current = iter.next(); + if (current.epoch.isEqualOrBefore(lowEpoch)) + metadata = current.transform.execute(metadata).success().metadata; + else + rest.add(current); + } + + return new LogState(metadata, rest.build()); + } + catch (IOException t) + { + throw new RuntimeException(t); + } + } + + @Override + public LogState getLogState(Epoch lowEpoch, Epoch highEpoch, boolean includeSnapshot, Retry retryPolicy) + { + return getLocalState(lowEpoch, highEpoch, includeSnapshot); + } + public static class InMemoryStorage implements LogStorage { private final List entries; @@ -99,11 +137,7 @@ public synchronized void append(Entry entry) @Override public synchronized LogState getLogState(Epoch startEpoch) { - ImmutableList.Builder builder = ImmutableList.builder(); - ClusterMetadata latest = metadataSnapshots.getLatestSnapshot(); - Epoch actualSince = latest != null && latest.epoch.isAfter(startEpoch) ? latest.epoch : startEpoch; - entries.stream().filter(e -> e.epoch.isAfter(actualSince)).forEach(builder::add); - return new LogState(latest, builder.build()); + return getLogState(startEpoch, Epoch.MAX); } @Override @@ -128,7 +162,40 @@ public synchronized MetadataSnapshots snapshots() @Override public synchronized EntryHolder getEntries(Epoch since) { - throw new IllegalStateException("We have overridden all callers of this method, it should never be called"); + EntryHolder entryHolder = new EntryHolder(since); + entries.stream().filter(e -> e.epoch.isAfter(since)).forEach(entryHolder::add); + return entryHolder; + } + + @Override + public synchronized EntryHolder getEntries(Epoch since, Epoch until) + { + EntryHolder entryHolder = new EntryHolder(since); + entries.stream().filter(e -> e.epoch.isAfter(since) && e.epoch.isEqualOrBefore(until)).forEach(entryHolder::add); + return entryHolder; + } + + public LogState getLogState(Epoch start, Epoch end) + { + EntryHolder state = getEntries(Epoch.EMPTY); + ClusterMetadata metadata = new ClusterMetadata(DatabaseDescriptor.getPartitioner()); + Iterator iter = state.iterator(); + ImmutableList.Builder rest = new ImmutableList.Builder<>(); + while (iter.hasNext()) + { + Entry current = iter.next(); + if (current.epoch.isAfter(end)) + break; + if (current.epoch.isEqualOrBefore(start)) + { + Invariants.require(current.epoch.isDirectlyAfter(metadata.epoch)); + metadata = current.transform.execute(metadata).success().metadata; + } + else if (current.epoch.isAfter(start)) + rest.add(current); + } + + return new LogState(metadata, rest.build()); } } diff --git a/src/java/org/apache/cassandra/tcm/CMSOperations.java b/src/java/org/apache/cassandra/tcm/CMSOperations.java index b37da9dd94b9..a0917584d925 100644 --- a/src/java/org/apache/cassandra/tcm/CMSOperations.java +++ b/src/java/org/apache/cassandra/tcm/CMSOperations.java @@ -31,7 +31,10 @@ import org.slf4j.LoggerFactory; import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.db.virtual.ClusterMetadataDirectoryTable; +import org.apache.cassandra.db.virtual.ClusterMetadataLogTable; import org.apache.cassandra.schema.ReplicationParams; +import org.apache.cassandra.schema.TableId; import org.apache.cassandra.tcm.membership.NodeId; import org.apache.cassandra.tcm.membership.NodeState; import org.apache.cassandra.tcm.membership.NodeVersion; @@ -40,6 +43,7 @@ import org.apache.cassandra.tcm.sequences.InProgressSequences; import org.apache.cassandra.tcm.sequences.ReconfigureCMS; import org.apache.cassandra.tcm.serialization.Version; +import org.apache.cassandra.tcm.sequences.DropAccordTable; import org.apache.cassandra.tcm.transformations.Unregister; import org.apache.cassandra.tcm.transformations.cms.AdvanceCMSReconfiguration; import org.apache.cassandra.utils.FBUtilities; @@ -261,4 +265,44 @@ public void unregisterLeftNodes(List nodeIdStrings) cms.commit(new Unregister(nodeId, EnumSet.of(NodeState.LEFT), ClusterMetadataService.instance().placementProvider())); } } + + public Map> dumpDirectory(boolean tokens) + { + Map> directory = ClusterMetadataDirectoryTable.directory(tokens); + return convertToStringValues(directory); + } + + public Map> dumpLog(long startEpoch, long endEpoch) + { + Map> log = ClusterMetadataLogTable.log(startEpoch, endEpoch); + return convertToStringValues(log); + } + + private Map> convertToStringValues(Map> log) + { + Map> res = new LinkedHashMap<>(); + for (Map.Entry> outerEntry : log.entrySet()) + { + Map rowRes = new HashMap<>(); + for (Map.Entry row : outerEntry.getValue().entrySet()) + rowRes.put(row.getKey(), row.getValue().toString()); + res.put(outerEntry.getKey(), rowRes); + } + return res; + } + + @Override + public void resumeDropAccordTable(String tableId) + { + TableId id = TableId.fromString(tableId); + for (MultiStepOperation.SequenceKey key : ClusterMetadata.current().inProgressSequences.keys()) + { + if (key instanceof DropAccordTable.TableReference && ((DropAccordTable.TableReference) key).id.equals(id)) + { + InProgressSequences.finishInProgressSequences(key); + return; + } + } + throw new IllegalArgumentException("No drop table operation is in progress for table with id " + tableId); + } } diff --git a/src/java/org/apache/cassandra/tcm/CMSOperationsMBean.java b/src/java/org/apache/cassandra/tcm/CMSOperationsMBean.java index 1e2d9e147313..6656634a6417 100644 --- a/src/java/org/apache/cassandra/tcm/CMSOperationsMBean.java +++ b/src/java/org/apache/cassandra/tcm/CMSOperationsMBean.java @@ -46,4 +46,8 @@ public interface CMSOperationsMBean public boolean cancelInProgressSequences(String sequenceOwner, String expectedSequenceKind); public void unregisterLeftNodes(List nodeIds); + public Map> dumpDirectory(boolean includeTokens); + public Map> dumpLog(long startEpoch, long endEpoch); + + public void resumeDropAccordTable(String tableId); } diff --git a/src/java/org/apache/cassandra/tcm/ClusterMetadata.java b/src/java/org/apache/cassandra/tcm/ClusterMetadata.java index 64eadc76dddb..d57df4de5fca 100644 --- a/src/java/org/apache/cassandra/tcm/ClusterMetadata.java +++ b/src/java/org/apache/cassandra/tcm/ClusterMetadata.java @@ -21,6 +21,7 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Collection; +import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; @@ -32,10 +33,12 @@ import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.ImmutableMap; import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Sets; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import accord.local.Node; import org.apache.cassandra.config.CassandraRelevantProperties; import org.apache.cassandra.db.TypeSizes; import org.apache.cassandra.dht.IPartitioner; @@ -54,14 +57,15 @@ import org.apache.cassandra.schema.KeyspaceMetadata; import org.apache.cassandra.schema.Keyspaces; import org.apache.cassandra.schema.ReplicationParams; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.service.accord.AccordStaleReplicas; +import org.apache.cassandra.service.accord.AccordTopology; +import org.apache.cassandra.service.consensus.migration.ConsensusMigrationState; +import org.apache.cassandra.service.consensus.migration.TableMigrationState; import org.apache.cassandra.tcm.extensions.ExtensionKey; import org.apache.cassandra.tcm.extensions.ExtensionValue; -import org.apache.cassandra.tcm.membership.Directory; -import org.apache.cassandra.tcm.membership.Location; -import org.apache.cassandra.tcm.membership.NodeAddresses; -import org.apache.cassandra.tcm.membership.NodeId; -import org.apache.cassandra.tcm.membership.NodeState; -import org.apache.cassandra.tcm.membership.NodeVersion; +import org.apache.cassandra.service.accord.AccordFastPath; +import org.apache.cassandra.tcm.membership.*; import org.apache.cassandra.tcm.ownership.DataPlacement; import org.apache.cassandra.tcm.ownership.DataPlacements; import org.apache.cassandra.tcm.ownership.PrimaryRangeComparator; @@ -78,6 +82,7 @@ import static com.google.common.collect.ImmutableSet.toImmutableSet; import static org.apache.cassandra.config.CassandraRelevantProperties.LINE_SEPARATOR; import static org.apache.cassandra.db.TypeSizes.sizeof; +import static org.apache.cassandra.tcm.serialization.Version.MIN_ACCORD_VERSION; public class ClusterMetadata { @@ -93,9 +98,12 @@ public class ClusterMetadata public final Directory directory; public final TokenMap tokenMap; public final DataPlacements placements; + public final AccordFastPath accordFastPath; public final LockedRanges lockedRanges; public final InProgressSequences inProgressSequences; + public final ConsensusMigrationState consensusMigrationState; public final ImmutableMap, ExtensionValue> extensions; + public final AccordStaleReplicas accordStaleReplicas; // This isn't serialized as part of ClusterMetadata it's really just a view over the Directory. public final Locator locator; @@ -104,6 +112,7 @@ public class ClusterMetadata private EndpointsForRange fullCMSReplicas; private Set fullCMSEndpoints; private Set fullCMSIds; + private DataPlacements writePlacementAllSettled; public ClusterMetadata(IPartitioner partitioner) { @@ -126,9 +135,12 @@ public ClusterMetadata(IPartitioner partitioner, Directory directory, Distribute directory, new TokenMap(partitioner), DataPlacements.EMPTY, + AccordFastPath.EMPTY, LockedRanges.EMPTY, InProgressSequences.EMPTY, - ImmutableMap.of()); + ConsensusMigrationState.EMPTY, + ImmutableMap.of(), + AccordStaleReplicas.EMPTY); } public ClusterMetadata(Epoch epoch, @@ -137,9 +149,12 @@ public ClusterMetadata(Epoch epoch, Directory directory, TokenMap tokenMap, DataPlacements placements, + AccordFastPath accordFastPath, LockedRanges lockedRanges, InProgressSequences inProgressSequences, - Map, ExtensionValue> extensions) + ConsensusMigrationState consensusMigrationState, + Map, ExtensionValue> extensions, + AccordStaleReplicas accordStaleReplicas) { this(EMPTY_METADATA_IDENTIFIER, epoch, @@ -148,9 +163,12 @@ public ClusterMetadata(Epoch epoch, directory, tokenMap, placements, + accordFastPath, lockedRanges, inProgressSequences, - extensions); + consensusMigrationState, + extensions, + accordStaleReplicas); } private ClusterMetadata(int metadataIdentifier, @@ -160,9 +178,12 @@ private ClusterMetadata(int metadataIdentifier, Directory directory, TokenMap tokenMap, DataPlacements placements, + AccordFastPath accordFastPath, LockedRanges lockedRanges, InProgressSequences inProgressSequences, - Map, ExtensionValue> extensions) + ConsensusMigrationState consensusMigrationState, + Map, ExtensionValue> extensions, + AccordStaleReplicas accordStaleReplicas) { // TODO: token map is a feature of the specific placement strategy, and so may not be a relevant component of // ClusterMetadata in the long term. We need to consider how the actual components of metadata can be evolved @@ -175,10 +196,13 @@ private ClusterMetadata(int metadataIdentifier, this.directory = directory; this.tokenMap = tokenMap; this.placements = placements; + this.accordFastPath = accordFastPath; this.lockedRanges = lockedRanges; this.inProgressSequences = inProgressSequences; + this.consensusMigrationState = consensusMigrationState; this.extensions = ImmutableMap.copyOf(extensions); this.locator = Locator.usingDirectory(directory); + this.accordStaleReplicas = accordStaleReplicas; } public Set fullCMSMembers() @@ -230,9 +254,12 @@ public ClusterMetadata forceEpoch(Epoch epoch) capLastModified(directory, epoch), capLastModified(tokenMap, epoch), capLastModified(placements, epoch), + capLastModified(accordFastPath, epoch), capLastModified(lockedRanges, epoch), capLastModified(inProgressSequences, epoch), - capLastModified(extensions, epoch)); + capLastModified(consensusMigrationState, epoch), + capLastModified(extensions, epoch), + capLastModified(accordStaleReplicas, epoch)); } public ClusterMetadata initializeClusterIdentifier(int clusterIdentifier) @@ -250,9 +277,12 @@ public ClusterMetadata initializeClusterIdentifier(int clusterIdentifier) directory, tokenMap, placements, + accordFastPath, lockedRanges, inProgressSequences, - extensions); + consensusMigrationState, + extensions, + accordStaleReplicas); } private static Map, ExtensionValue> capLastModified(Map, ExtensionValue> original, Epoch maxEpoch) @@ -282,15 +312,19 @@ public Epoch nextEpoch() public DataPlacement writePlacementAllSettled(KeyspaceMetadata ksm) { - ClusterMetadata metadata = this; - Iterator> iter = metadata.inProgressSequences.iterator(); - while (iter.hasNext()) + if (writePlacementAllSettled == null) { - Transformation.Result result = iter.next().applyTo(metadata); - assert result.isSuccess(); - metadata = result.success().metadata; + ClusterMetadata metadata = this; + Iterator> iter = metadata.inProgressSequences.iterator(); + while (iter.hasNext()) + { + Transformation.Result result = iter.next().applyTo(metadata); + assert result.isSuccess(); + metadata = result.success().metadata; + } + writePlacementAllSettled = metadata.placements; } - return metadata.placements.get(ksm.params.replication); + return writePlacementAllSettled.get(ksm.params.replication); } // TODO Remove this as it isn't really an equivalent to the previous concept of pending ranges @@ -371,10 +405,13 @@ public static class Transformer private Directory directory; private TokenMap tokenMap; private DataPlacements placements; + private AccordFastPath accordFastPath; private LockedRanges lockedRanges; private InProgressSequences inProgressSequences; + private ConsensusMigrationState consensusMigrationState; private final Map, ExtensionValue> extensions; private final Set modifiedKeys; + private AccordStaleReplicas accordStaleReplicas; private Transformer(ClusterMetadata metadata, Epoch epoch) { @@ -385,10 +422,18 @@ private Transformer(ClusterMetadata metadata, Epoch epoch) this.directory = metadata.directory; this.tokenMap = metadata.tokenMap; this.placements = metadata.placements; + this.accordFastPath = metadata.accordFastPath; this.lockedRanges = metadata.lockedRanges; this.inProgressSequences = metadata.inProgressSequences; + this.consensusMigrationState = metadata.consensusMigrationState; extensions = new HashMap<>(metadata.extensions); modifiedKeys = new HashSet<>(); + accordStaleReplicas = metadata.accordStaleReplicas; + } + + public Epoch epoch() + { + return epoch; } public Transformer with(DistributedSchema schema) @@ -411,9 +456,14 @@ public Transformer register(NodeAddresses addresses, Location location, NodeVers public Transformer unregister(NodeId nodeId) { - directory = directory.withoutRackAndDC(nodeId).without(nodeId); + directory = directory.withoutRackAndDC(nodeId).without(epoch, nodeId); if (!tokenMap.tokens(nodeId).isEmpty()) tokenMap = tokenMap.unassignTokens(nodeId); + + Node.Id accordId = AccordTopology.tcmIdToAccord(nodeId); + if (accordStaleReplicas.contains(accordId)) + accordStaleReplicas = accordStaleReplicas.without(Collections.singleton(accordId)); + return this; } @@ -429,6 +479,12 @@ public Transformer withVersion(NodeId nodeId, NodeVersion version) return this; } + public Transformer register(NodeId nodeId, NodeAddresses addresses, Location location, NodeVersion version) + { + directory = directory.with(nodeId, addresses, location, version); + return this; + } + public Transformer withNodeState(NodeId id, NodeState state) { directory = directory.withNodeState(id, state); @@ -472,9 +528,14 @@ public Transformer replaced(NodeId replaced, NodeId replacement) Collection transferringTokens = tokenMap.tokens(replaced); tokenMap = tokenMap.unassignTokens(replaced) .assignTokens(replacement, transferringTokens); - directory = directory.without(replaced) + directory = directory.without(epoch, replaced) .withRackAndDC(replacement) .withNodeState(replacement, NodeState.JOINED); + + Node.Id accordId = AccordTopology.tcmIdToAccord(replaced); + if (accordStaleReplicas.contains(accordId)) + accordStaleReplicas = accordStaleReplicas.without(Collections.singleton(accordId)); + return this; } @@ -498,6 +559,24 @@ public Transformer with(DataPlacements placements) return this; } + public Transformer withFastPathStatusSince(Node.Id node, AccordFastPath.Status status, long updateTimeMillis, long updateDelayMillis) + { + accordFastPath = accordFastPath.withNodeStatusSince(node, status, updateTimeMillis, updateDelayMillis); + return this; + } + + public Transformer markStaleReplicas(Set ids) + { + accordStaleReplicas = accordStaleReplicas.withNodeIds(ids); + return this; + } + + public Transformer unmarkStaleReplicas(Set ids) + { + accordStaleReplicas = accordStaleReplicas.without(ids); + return this; + } + public Transformer with(LockedRanges lockedRanges) { this.lockedRanges = lockedRanges; @@ -510,9 +589,40 @@ public Transformer with(InProgressSequences sequences) return this; } + public Transformer with(Map newTableMigrationStates) + { + return with(newTableMigrationStates, true); + } + + public Transformer with(Map newTableMigrationStates, + boolean addRemaining) + { + if (addRemaining) + { + ImmutableMap.Builder tableMigrationStatesBuilder = ImmutableMap.builder(); + consensusMigrationState.tableStates.entrySet() + .stream() + .filter(existingTMS -> !newTableMigrationStates.containsKey(existingTMS.getKey())) + .forEach(tableMigrationStatesBuilder::put); + tableMigrationStatesBuilder.putAll(newTableMigrationStates.entrySet()); + consensusMigrationState = new ConsensusMigrationState(Epoch.EMPTY, tableMigrationStatesBuilder.build()); + } + else + { + consensusMigrationState = new ConsensusMigrationState(Epoch.EMPTY, newTableMigrationStates); + } + return this; + } + + public Transformer with(ConsensusMigrationState consensusMigrationState) + { + this.consensusMigrationState = consensusMigrationState; + return this; + } + public Transformer with(ExtensionKey key, ExtensionValue obj) { - if (MetadataKeys.CORE_METADATA.contains(key)) + if (MetadataKeys.CORE_METADATA.containsKey(key)) throw new IllegalArgumentException("Core cluster metadata objects should be addressed directly, " + "not using the associated MetadataKey"); @@ -535,7 +645,7 @@ public Transformer withIfAbsent(ExtensionKey key, ExtensionValue obj) public Transformer without(ExtensionKey key) { - if (MetadataKeys.CORE_METADATA.contains(key)) + if (MetadataKeys.CORE_METADATA.containsKey(key)) throw new IllegalArgumentException("Core cluster metadata objects should be addressed directly, " + "not using the associated MetadataKey"); if (extensions.remove(key) != null) @@ -564,6 +674,9 @@ public Transformed build() { modifiedKeys.add(MetadataKeys.NODE_DIRECTORY); directory = directory.withLastModified(epoch); + + for (NodeId peer : Sets.difference(base.directory.peerIds(), directory.peerIds())) + accordFastPath = accordFastPath.withoutNode(peer); } if (tokenMap != base.tokenMap) @@ -584,6 +697,18 @@ public Transformed build() placements = placements.withLastModified(epoch); } + if (accordFastPath != base.accordFastPath) + { + modifiedKeys.add(MetadataKeys.ACCORD_FAST_PATH); + accordFastPath = accordFastPath.withLastModified(epoch); + } + + if (accordStaleReplicas != base.accordStaleReplicas) + { + modifiedKeys.add(MetadataKeys.ACCORD_STALE_REPLICAS); + accordStaleReplicas = accordStaleReplicas.withLastModified(epoch); + } + if (lockedRanges != base.lockedRanges) { modifiedKeys.add(MetadataKeys.LOCKED_RANGES); @@ -596,6 +721,17 @@ public Transformed build() inProgressSequences = inProgressSequences.withLastModified(epoch); } + if (consensusMigrationState != base.consensusMigrationState) + { + modifiedKeys.add(MetadataKeys.CONSENSUS_MIGRATION_STATE); + consensusMigrationState = consensusMigrationState.withLastModified(epoch); + } + + if (consensusMigrationState != base.consensusMigrationState || schema != base.schema) + { + consensusMigrationState.validateAgainstSchema(schema); + } + return new Transformed(new ClusterMetadata(base.metadataIdentifier, epoch, partitioner, @@ -603,9 +739,12 @@ public Transformed build() directory, tokenMap, placements, + accordFastPath, lockedRanges, inProgressSequences, - extensions), + consensusMigrationState, + extensions, + accordStaleReplicas), ImmutableSet.copyOf(modifiedKeys)); } @@ -618,9 +757,12 @@ public ClusterMetadata buildForGossipMode() directory, tokenMap, placements, + accordFastPath, lockedRanges, inProgressSequences, - extensions); + consensusMigrationState, + extensions, + accordStaleReplicas); } @Override @@ -634,8 +776,10 @@ public String toString() ", directory=" + schema + ", tokenMap=" + tokenMap + ", placement=" + placements + + ", availability=" + accordFastPath + ", lockedRanges=" + lockedRanges + ", inProgressSequences=" + inProgressSequences + + ", consensusMigrationState=" + consensusMigrationState + ", extensions=" + extensions + ", modifiedKeys=" + modifiedKeys + '}'; @@ -723,6 +867,7 @@ public String legacyToString() @Override public String toString() { + // TODO is this supposed to be missing fields? return "ClusterMetadata{" + "epoch=" + epoch + ", schema=" + schema + @@ -730,6 +875,7 @@ public String toString() ", tokenMap=" + tokenMap + ", placements=" + placements + ", lockedRanges=" + lockedRanges + + ", consensusMigrationState=" + lockedRanges + '}'; } @@ -744,8 +890,11 @@ public boolean equals(Object o) directory.equals(that.directory) && tokenMap.equals(that.tokenMap) && placements.equals(that.placements) && + accordFastPath.equals(that.accordFastPath) && lockedRanges.equals(that.lockedRanges) && inProgressSequences.equals(that.inProgressSequences) && + consensusMigrationState.equals(that.consensusMigrationState) && + accordStaleReplicas.equals(that.accordStaleReplicas) && extensions.equals(that.extensions); } @@ -794,7 +943,7 @@ public void dumpDiff(ClusterMetadata other) @Override public int hashCode() { - return Objects.hash(epoch, schema, directory, tokenMap, placements, lockedRanges, inProgressSequences, extensions); + return Objects.hash(epoch, schema, directory, tokenMap, placements, accordFastPath, lockedRanges, inProgressSequences, consensusMigrationState, accordStaleReplicas, extensions); } public static ClusterMetadata current() @@ -871,6 +1020,13 @@ public void serialize(ClusterMetadata metadata, DataOutputPlus out, Version vers Directory.serializer.serialize(metadata.directory, out, version); TokenMap.serializer.serialize(metadata.tokenMap, out, version); DataPlacements.serializer.serialize(metadata.placements, out, version); + if (version.isAtLeast(MIN_ACCORD_VERSION)) + { + AccordFastPath.serializer.serialize(metadata.accordFastPath, out, version); + ConsensusMigrationState.serializer.serialize(metadata.consensusMigrationState, out, version); + AccordStaleReplicas.serializer.serialize(metadata.accordStaleReplicas, out, version); + } + LockedRanges.serializer.serialize(metadata.lockedRanges, out, version); InProgressSequences.serializer.serialize(metadata.inProgressSequences, out, version); out.writeInt(metadata.extensions.size()); @@ -907,6 +1063,24 @@ public ClusterMetadata deserialize(DataInputPlus in, Version version) throws IOE Directory dir = Directory.serializer.deserialize(in, version); TokenMap tokenMap = TokenMap.serializer.deserialize(in, version); DataPlacements placements = DataPlacements.serializer.deserialize(in, version); + + AccordFastPath accordFastPath; + ConsensusMigrationState consensusMigrationState; + AccordStaleReplicas staleReplicas; + + if (version.isAtLeast(MIN_ACCORD_VERSION)) + { + accordFastPath = AccordFastPath.serializer.deserialize(in, version); + consensusMigrationState = ConsensusMigrationState.serializer.deserialize(in, version); + staleReplicas = AccordStaleReplicas.serializer.deserialize(in, version); + } + else + { + accordFastPath = AccordFastPath.EMPTY; + consensusMigrationState = ConsensusMigrationState.EMPTY; + staleReplicas = AccordStaleReplicas.EMPTY; + } + LockedRanges lockedRanges = LockedRanges.serializer.deserialize(in, version); InProgressSequences ips = InProgressSequences.serializer.deserialize(in, version); int items = in.readInt(); @@ -925,9 +1099,12 @@ public ClusterMetadata deserialize(DataInputPlus in, Version version) throws IOE dir, tokenMap, placements, + accordFastPath, lockedRanges, ips, - extensions); + consensusMigrationState, + extensions, + staleReplicas); } @Override @@ -946,8 +1123,16 @@ public long serializedSize(ClusterMetadata metadata, Version version) DistributedSchema.serializer.serializedSize(metadata.schema, version) + Directory.serializer.serializedSize(metadata.directory, version) + TokenMap.serializer.serializedSize(metadata.tokenMap, version) + - DataPlacements.serializer.serializedSize(metadata.placements, version) + - LockedRanges.serializer.serializedSize(metadata.lockedRanges, version) + + DataPlacements.serializer.serializedSize(metadata.placements, version); + + if (version.isAtLeast(MIN_ACCORD_VERSION)) + { + size += AccordFastPath.serializer.serializedSize(metadata.accordFastPath, version) + + ConsensusMigrationState.serializer.serializedSize(metadata.consensusMigrationState, version) + + AccordStaleReplicas.serializer.serializedSize(metadata.accordStaleReplicas, version); + } + + size += LockedRanges.serializer.serializedSize(metadata.lockedRanges, version) + InProgressSequences.serializer.serializedSize(metadata.inProgressSequences, version); return size; diff --git a/src/java/org/apache/cassandra/tcm/ClusterMetadataService.java b/src/java/org/apache/cassandra/tcm/ClusterMetadataService.java index 3e3d8389ae2f..15b21e1aa7d7 100644 --- a/src/java/org/apache/cassandra/tcm/ClusterMetadataService.java +++ b/src/java/org/apache/cassandra/tcm/ClusterMetadataService.java @@ -41,6 +41,7 @@ import org.apache.cassandra.config.CassandraRelevantProperties; import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.exceptions.ExceptionCode; +import org.apache.cassandra.exceptions.RequestFailure; import org.apache.cassandra.exceptions.StartupException; import org.apache.cassandra.gms.FailureDetector; import org.apache.cassandra.io.util.FileInputStreamPlus; @@ -48,6 +49,8 @@ import org.apache.cassandra.locator.InetAddressAndPort; import org.apache.cassandra.metrics.TCMMetrics; import org.apache.cassandra.net.IVerbHandler; +import org.apache.cassandra.net.Message; +import org.apache.cassandra.net.MessageDelivery; import org.apache.cassandra.schema.DistributedSchema; import org.apache.cassandra.schema.ReplicationParams; import org.apache.cassandra.tcm.listeners.SchemaListener; @@ -78,6 +81,7 @@ import static java.util.concurrent.TimeUnit.NANOSECONDS; import static java.util.stream.Collectors.toSet; import static org.apache.cassandra.config.CassandraRelevantProperties.TCM_SKIP_CMS_RECONFIGURATION_AFTER_TOPOLOGY_CHANGE; +import static org.apache.cassandra.config.DatabaseDescriptor.getCmsAwaitTimeout; import static org.apache.cassandra.tcm.ClusterMetadataService.State.GOSSIP; import static org.apache.cassandra.tcm.ClusterMetadataService.State.LOCAL; import static org.apache.cassandra.tcm.ClusterMetadataService.State.REMOTE; @@ -166,17 +170,17 @@ public static State state(ClusterMetadata metadata) Processor localProcessor; if (CassandraRelevantProperties.TCM_USE_ATOMIC_LONG_PROCESSOR.getBoolean()) { - log = logSpec.sync().createLog(); + log = logSpec.sync().withStorage(new AtomicLongBackedProcessor.InMemoryStorage()).createLog(); localProcessor = wrapProcessor.apply(new AtomicLongBackedProcessor(log, logSpec.isReset())); - fetchLogHandler = new FetchCMSLog.Handler((e, ignored) -> logSpec.storage().getLogState(e)); } else { log = logSpec.async().createLog(); localProcessor = wrapProcessor.apply(new PaxosBackedProcessor(log)); - fetchLogHandler = new FetchCMSLog.Handler(); } + fetchLogHandler = new FetchCMSLog.Handler(); + Commit.Replicator replicator = CassandraRelevantProperties.TCM_USE_NO_OP_REPLICATOR.getBoolean() ? Commit.Replicator.NO_OP : new Commit.DefaultReplicator(() -> log.metadata().directory); @@ -667,8 +671,7 @@ public ClusterMetadata fetchLogFromCMS(Epoch awaitAtLeast) if (ourEpoch.isEqualOrAfter(awaitAtLeast)) return metadata; - Retry.Deadline deadline = Retry.Deadline.after(DatabaseDescriptor.getCmsAwaitTimeout().to(TimeUnit.NANOSECONDS), - new Retry.Jitter(TCMMetrics.instance.fetchLogRetries)); + Retry deadline = Retry.untilElapsed(getCmsAwaitTimeout().to(TimeUnit.NANOSECONDS), TCMMetrics.instance.fetchLogRetries); // responses for ALL withhout knowing we have pending metadata = processor.fetchLogAndWait(awaitAtLeast, deadline); if (metadata.epoch.isBefore(awaitAtLeast)) @@ -739,7 +742,7 @@ public Future fetchLogFromPeerOrCMSAsync(ClusterMetadata metada ScheduledExecutors.optionalTasks.submit(() -> { try { - future.setSuccess(ClusterMetadataService.instance().fetchLogFromPeerOrCMS(metadata, from, awaitAtLeast)); + future.setSuccess(fetchLogFromPeerOrCMS(metadata, from, awaitAtLeast)); } catch (Throwable t) { @@ -751,6 +754,19 @@ public Future fetchLogFromPeerOrCMSAsync(ClusterMetadata metada return future; } + public boolean maybeFetchLogFromPeerOrCMSAsync(MessageDelivery messaging, Message message, Runnable onFetchSuccess) + { + ClusterMetadata metadata = metadata(); + if (metadata.epoch.isEqualOrAfter(metadata.epoch)) + return false; + Future f = fetchLogFromPeerOrCMSAsync(metadata, message.from(), message.epoch()); + f.addCallback((success, failure) -> { + if (failure != null) messaging.respondWithFailure(RequestFailure.UNKNOWN, message); + else message.verb().stage.execute(onFetchSuccess); + }); + return true; + } + /** * Combines {@link #fetchLogFromPeer} with {@link #fetchLogFromCMS} to synchronously fetch and apply log entries * up to the requested epoch. The supplied peer will be contacted first and if after doing so, the current local @@ -787,6 +803,24 @@ public ClusterMetadata fetchLogFromPeerOrCMS(ClusterMetadata metadata, InetAddre return metadata; } + /** + * Combines {@link #fetchLogFromPeer} with {@link #fetchLogFromCMS} to synchronously fetch and apply log entries + * up to the requested epoch. The supplied peer will be contacted first and if after doing so, the current local + * metadata is not caught up to at least the required epoch, a further request is made to the CMS. + * The returned ClusterMetadata is guaranteed to have been published, though it may have also been superceded by + * further updates. + * If the requested epoch is not reached even after fetching from the CMS, an IllegalStateException is thrown. + * @param from Initial peer to contact. Usually this is the sender of a message containing the requested epoch, + * which means it can be assumed that this peer (if available) can supply any missing log entries. + * @param awaitAtLeast The requested epoch. + * @return A published ClusterMetadata with all entries up to (at least) the requested epoch enacted. + * @throws IllegalStateException if the requested epoch could not be reached, even after falling back to CMS catchup + */ + public ClusterMetadata fetchLogFromPeerOrCMS(InetAddressAndPort from, Epoch awaitAtLeast) + { + return fetchLogFromPeerOrCMS(metadata(), from, awaitAtLeast); + } + public ClusterMetadata awaitAtLeast(Epoch epoch) throws InterruptedException, TimeoutException { return log.awaitAtLeast(epoch); @@ -825,6 +859,7 @@ public boolean commitsPaused() { return commitsPaused.get(); } + /** * Switchable implementation that allow us to go between local and remote implementation whenever we need it. * When the node becomes a member of CMS, it switches back to being a regular member of a cluster, and all @@ -875,9 +910,9 @@ private Pair delegateInternal() } @Override - public Commit.Result commit(Entry.Id entryId, Transformation transform, Epoch lastKnown, Retry.Deadline retryPolicy) + public Commit.Result commit(Entry.Id entryId, Transformation transform, Epoch lastKnown, Retry retryPolicy) { - while (!retryPolicy.reachedMax()) + while (true) { try { @@ -890,18 +925,31 @@ public Commit.Result commit(Entry.Id entryId, Transformation transform, Epoch la } catch (NotCMSException e) { - retryPolicy.maybeSleep(); + if (!retryPolicy.maybeSleep()) + break; } } return Commit.Result.failed(ExceptionCode.SERVER_ERROR, "Could not commit " + transform.kind() + " at epoch " + lastKnown); } @Override - public ClusterMetadata fetchLogAndWait(Epoch waitFor, Retry.Deadline retryPolicy) + public ClusterMetadata fetchLogAndWait(Epoch waitFor, Retry retryPolicy) { return delegate().fetchLogAndWait(waitFor, retryPolicy); } + @Override + public LogState getLocalState(Epoch start, Epoch end, boolean includeSnapshot) + { + return delegate().getLocalState(start, end, includeSnapshot); + } + + @Override + public LogState getLogState(Epoch start, Epoch end, boolean includeSnapshot, Retry retryPolicy) + { + return delegate().getLogState(start, end, includeSnapshot, retryPolicy); + } + public String toString() { return "SwitchableProcessor{" + diff --git a/src/java/org/apache/cassandra/tcm/Commit.java b/src/java/org/apache/cassandra/tcm/Commit.java index f6008f92f7a4..6767eeaf48ec 100644 --- a/src/java/org/apache/cassandra/tcm/Commit.java +++ b/src/java/org/apache/cassandra/tcm/Commit.java @@ -364,7 +364,7 @@ public void doVerb(Message message) throws IOException { checkCMSState(); logger.info("Received commit request {} from {}", message.payload, message.from()); - Retry.Deadline retryPolicy = Retry.Deadline.at(message.expiresAtNanos(), new Retry.Jitter(TCMMetrics.instance.commitRetries)); + Retry retryPolicy = Retry.until(message.expiresAtNanos(), TCMMetrics.instance.commitRetries); Result result = processor.commit(message.payload.entryId, message.payload.transform, message.payload.lastKnown, retryPolicy); if (result.isSuccess()) { diff --git a/src/java/org/apache/cassandra/tcm/Epoch.java b/src/java/org/apache/cassandra/tcm/Epoch.java index 0d070b4a5ba2..c7e9fe719469 100644 --- a/src/java/org/apache/cassandra/tcm/Epoch.java +++ b/src/java/org/apache/cassandra/tcm/Epoch.java @@ -26,10 +26,12 @@ import com.google.common.collect.Sets; import org.apache.cassandra.io.IVersionedSerializer; +import org.apache.cassandra.io.UnversionedSerializer; import org.apache.cassandra.io.util.DataInputPlus; import org.apache.cassandra.io.util.DataOutputPlus; import org.apache.cassandra.tcm.serialization.MetadataSerializer; import org.apache.cassandra.tcm.serialization.Version; +import org.apache.cassandra.utils.ObjectSizes; import org.apache.cassandra.utils.vint.VIntCoding; public class Epoch implements Comparable, Serializable @@ -57,10 +59,12 @@ public long serializedSize(Epoch t, int version) }; public static final Epoch FIRST = new Epoch(1); + public static final Epoch MAX = new Epoch(Long.MAX_VALUE); public static final Epoch EMPTY = new Epoch(0); public static final Epoch UPGRADE_STARTUP = new Epoch(Long.MIN_VALUE); public static final Epoch UPGRADE_GOSSIP = new Epoch(Long.MIN_VALUE + 1); private static final Set beforeFirst = Sets.newHashSet(EMPTY, UPGRADE_GOSSIP, UPGRADE_STARTUP); + private static final long EMPTY_SIZE = ObjectSizes.measure(EMPTY); private final long epoch; @@ -87,6 +91,11 @@ public static Epoch max(Epoch l, Epoch r) return l.compareTo(r) > 0 ? l : r; } + public static Epoch min(Epoch l, Epoch r) + { + return l.compareTo(r) < 0 ? l : r; + } + public boolean isDirectlyBefore(Epoch epoch) { if (epoch.equals(Epoch.FIRST)) @@ -168,37 +177,48 @@ public long getEpoch() return epoch; } - public static class EpochSerializer implements MetadataSerializer + public static class EpochSerializer implements MetadataSerializer, UnversionedSerializer { // convenience methods for messageSerializer et al + @Override public void serialize(Epoch t, DataOutputPlus out) throws IOException { serialize(t, out, Version.V0); } + @Override public Epoch deserialize(DataInputPlus in) throws IOException { return deserialize(in, Version.V0); } + @Override public long serializedSize(Epoch t) { return serializedSize(t, Version.V0); } + @Override public void serialize(Epoch t, DataOutputPlus out, Version version) throws IOException { out.writeUnsignedVInt(t.epoch); } + @Override public Epoch deserialize(DataInputPlus in, Version version) throws IOException { return Epoch.create(in.readUnsignedVInt()); } + @Override public long serializedSize(Epoch t, Version version) { return VIntCoding.computeUnsignedVIntSize(t.epoch); } } + + public long estimatedSizeOnHeap() + { + return EMPTY_SIZE; + } } diff --git a/src/java/org/apache/cassandra/tcm/FetchCMSLog.java b/src/java/org/apache/cassandra/tcm/FetchCMSLog.java index 38ef550ba587..71d79964692f 100644 --- a/src/java/org/apache/cassandra/tcm/FetchCMSLog.java +++ b/src/java/org/apache/cassandra/tcm/FetchCMSLog.java @@ -19,7 +19,8 @@ package org.apache.cassandra.tcm; import java.io.IOException; -import java.util.function.BiFunction; +import java.util.concurrent.TimeUnit; +import java.util.function.Supplier; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -32,10 +33,11 @@ import org.apache.cassandra.net.IVerbHandler; import org.apache.cassandra.net.Message; import org.apache.cassandra.net.MessagingService; -import org.apache.cassandra.schema.DistributedMetadataLogKeyspace; import org.apache.cassandra.tcm.log.LogState; import org.apache.cassandra.utils.FBUtilities; +import static org.apache.cassandra.config.DatabaseDescriptor.getCmsAwaitTimeout; + public class FetchCMSLog { public static final Serializer serializer = new Serializer(); @@ -89,16 +91,16 @@ static class Handler implements IVerbHandler * to node-local (which only relevant in cases of CMS expansions/shrinks, and can only be requested by the * CMS node that collects the highest epoch from the quorum of peers). */ - private final BiFunction logStateSupplier; + private final Supplier processor; public Handler() { - this(DistributedMetadataLogKeyspace::getLogState); + this(() -> ClusterMetadataService.instance().processor()); } - public Handler(BiFunction logStateSupplier) + public Handler(Supplier processor) { - this.logStateSupplier = logStateSupplier; + this.processor = processor; } public void doVerb(Message message) throws IOException @@ -114,7 +116,13 @@ public void doVerb(Message message) throws IOException // If both we and the other node believe it should be caught up with a linearizable read boolean consistentFetch = request.consistentFetch && !ClusterMetadataService.instance().isCurrentMember(message.from()); - LogState delta = logStateSupplier.apply(message.payload.lowerBound, consistentFetch); + Retry retry = Retry.untilElapsed(getCmsAwaitTimeout().to(TimeUnit.NANOSECONDS), TCMMetrics.instance.fetchLogRetries); + LogState delta; + if (consistentFetch) + delta = processor.get().getLogState(message.payload.lowerBound, Epoch.MAX, false, retry); + else + delta = processor.get().getLocalState(message.payload.lowerBound, Epoch.MAX, false); + TCMMetrics.instance.cmsLogEntriesServed(message.payload.lowerBound, delta.latestEpoch()); logger.info("Responding to {}({}) with log delta: {}", message.from(), request, delta); MessagingService.instance().send(message.responseWith(delta), message.from()); diff --git a/src/java/org/apache/cassandra/tcm/FetchPeerLog.java b/src/java/org/apache/cassandra/tcm/FetchPeerLog.java index 1347dcf049ee..ab55dcf8f0dc 100644 --- a/src/java/org/apache/cassandra/tcm/FetchPeerLog.java +++ b/src/java/org/apache/cassandra/tcm/FetchPeerLog.java @@ -31,7 +31,6 @@ import org.apache.cassandra.net.Message; import org.apache.cassandra.net.MessagingService; import org.apache.cassandra.tcm.log.LogState; -import org.apache.cassandra.tcm.log.LogStorage; public class FetchPeerLog { @@ -82,7 +81,9 @@ public void doVerb(Message message) throws IOException ClusterMetadata metadata = ClusterMetadata.current(); logger.info("Received peer log fetch request {} from {}: start = {}, current = {}", request, message.from(), message.payload.start, metadata.epoch); - LogState delta = LogStorage.SystemKeyspace.getLogState(message.payload.start); + LogState delta = ClusterMetadataService.instance() + .processor() + .getLocalState(message.payload.start, Epoch.MAX, false); TCMMetrics.instance.peerLogEntriesServed(message.payload.start, delta.latestEpoch()); logger.info("Responding with log delta: {}", delta); MessagingService.instance().send(message.responseWith(delta), message.from()); diff --git a/src/java/org/apache/cassandra/tcm/MetadataKeys.java b/src/java/org/apache/cassandra/tcm/MetadataKeys.java index bead377c0bfb..0be621b20189 100644 --- a/src/java/org/apache/cassandra/tcm/MetadataKeys.java +++ b/src/java/org/apache/cassandra/tcm/MetadataKeys.java @@ -24,6 +24,7 @@ import java.util.Set; import java.util.function.Function; +import com.google.common.collect.ImmutableMap; import com.google.common.collect.ImmutableSet; import org.apache.cassandra.tcm.extensions.ExtensionKey; @@ -39,15 +40,24 @@ public class MetadataKeys public static final MetadataKey NODE_DIRECTORY = make(CORE_NS, "membership", "node_directory"); public static final MetadataKey TOKEN_MAP = make(CORE_NS, "ownership", "token_map"); public static final MetadataKey DATA_PLACEMENTS = make(CORE_NS, "ownership", "data_placements"); + public static final MetadataKey ACCORD_FAST_PATH = make(CORE_NS, "ownership", "accord_fast_path"); + public static final MetadataKey ACCORD_STALE_REPLICAS = make(CORE_NS, "ownership", "accord_stale_replicas"); public static final MetadataKey LOCKED_RANGES = make(CORE_NS, "sequences", "locked_ranges"); public static final MetadataKey IN_PROGRESS_SEQUENCES = make(CORE_NS, "sequences", "in_progress"); - - public static final ImmutableSet CORE_METADATA = ImmutableSet.of(SCHEMA, - NODE_DIRECTORY, - TOKEN_MAP, - DATA_PLACEMENTS, - LOCKED_RANGES, - IN_PROGRESS_SEQUENCES); + public static final MetadataKey CONSENSUS_MIGRATION_STATE = make(CORE_NS, "consensus", "migration_state"); + + public static final ImmutableMap>> CORE_METADATA + = ImmutableMap.>>builder() + .put(SCHEMA, cm -> cm.schema) + .put(NODE_DIRECTORY, cm -> cm.directory) + .put(TOKEN_MAP, cm -> cm.tokenMap) + .put(DATA_PLACEMENTS, cm -> cm.placements) + .put(LOCKED_RANGES, cm -> cm.lockedRanges) + .put(IN_PROGRESS_SEQUENCES, cm -> cm.inProgressSequences) + .put(ACCORD_FAST_PATH, cm -> cm.accordFastPath) + .put(ACCORD_STALE_REPLICAS, cm -> cm.accordStaleReplicas) + .put(CONSENSUS_MIGRATION_STATE, cm -> cm.consensusMigrationState) + .build(); public static MetadataKey make(String...parts) { @@ -61,6 +71,15 @@ public static MetadataKey make(String...parts) return new MetadataKey(b.toString()); } + public static MetadataValue extract(ClusterMetadata cm, MetadataKey key) + { + if (CORE_METADATA.containsKey(key)) + return CORE_METADATA.get(key).apply(cm); + if (!(key instanceof ExtensionKey)) + throw new IllegalArgumentException("Unknown key: " + key); + return cm.extensions.get(key); + } + public static ImmutableSet diffKeys(ClusterMetadata before, ClusterMetadata after) { ImmutableSet.Builder builder = new ImmutableSet.Builder<>(); @@ -70,12 +89,8 @@ public static ImmutableSet diffKeys(ClusterMetadata before, Cluster private static void diffKeys(ClusterMetadata before, ClusterMetadata after, ImmutableSet.Builder builder) { - checkKey(before, after, builder, cm -> cm.schema, MetadataKeys.SCHEMA); - checkKey(before, after, builder, cm -> cm.directory, MetadataKeys.NODE_DIRECTORY); - checkKey(before, after, builder, cm -> cm.tokenMap, MetadataKeys.TOKEN_MAP); - checkKey(before, after, builder, cm -> cm.placements, MetadataKeys.DATA_PLACEMENTS); - checkKey(before, after, builder, cm -> cm.lockedRanges, MetadataKeys.LOCKED_RANGES); - checkKey(before, after, builder, cm -> cm.inProgressSequences, MetadataKeys.IN_PROGRESS_SEQUENCES); + for (Map.Entry>> e : CORE_METADATA.entrySet()) + checkKey(before, after, builder, e.getValue(), e.getKey()); Set> added = new HashSet<>(after.extensions.keySet()); for (Map.Entry, ExtensionValue> entry : before.extensions.entrySet()) diff --git a/src/java/org/apache/cassandra/tcm/MultiStepOperation.java b/src/java/org/apache/cassandra/tcm/MultiStepOperation.java index d447974f85d5..3d42772518bb 100644 --- a/src/java/org/apache/cassandra/tcm/MultiStepOperation.java +++ b/src/java/org/apache/cassandra/tcm/MultiStepOperation.java @@ -33,6 +33,7 @@ import org.apache.cassandra.tcm.sequences.UnbootstrapAndLeave; import org.apache.cassandra.tcm.serialization.AsymmetricMetadataSerializer; import org.apache.cassandra.tcm.serialization.MetadataSerializer; +import org.apache.cassandra.tcm.sequences.DropAccordTable; /** * Represents a multi-step process performed in order to transition the cluster to some state. @@ -67,7 +68,8 @@ public enum Kind LEAVE(UnbootstrapAndLeave.serializer), REMOVE(UnbootstrapAndLeave.serializer), - RECONFIGURE_CMS(ReconfigureCMS.serializer) + RECONFIGURE_CMS(ReconfigureCMS.serializer), + DROP_ACCORD_TABLE(DropAccordTable.serializer), ; public final AsymmetricMetadataSerializer, ? extends MultiStepOperation> serializer; diff --git a/src/java/org/apache/cassandra/tcm/PaxosBackedProcessor.java b/src/java/org/apache/cassandra/tcm/PaxosBackedProcessor.java index 19bdc7ae9bbe..b3a6e318f512 100644 --- a/src/java/org/apache/cassandra/tcm/PaxosBackedProcessor.java +++ b/src/java/org/apache/cassandra/tcm/PaxosBackedProcessor.java @@ -32,8 +32,8 @@ import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.db.ConsistencyLevel; import org.apache.cassandra.exceptions.ReadTimeoutException; -import org.apache.cassandra.exceptions.RequestFailureReason; import org.apache.cassandra.locator.EndpointsForRange; +import org.apache.cassandra.exceptions.RequestFailure; import org.apache.cassandra.locator.InetAddressAndPort; import org.apache.cassandra.locator.Replica; import org.apache.cassandra.metrics.TCMMetrics; @@ -70,7 +70,7 @@ protected boolean tryCommitOne(Entry.Id entryId, Transformation transform, Epoch } @Override - public ClusterMetadata fetchLogAndWait(Epoch waitFor, Retry.Deadline retryPolicy) + public ClusterMetadata fetchLogAndWait(Epoch waitFor, Retry retryPolicy) { ClusterMetadata metadata = log.waitForHighestConsecutive(); @@ -106,7 +106,7 @@ public ClusterMetadata fetchLogAndWait(Epoch waitFor, Retry.Deadline retryPolicy for (Replica peer : replicas) requests.add(new FetchLogRequest(peer, MessagingService.instance(), metadata.epoch)); - while (!retryPolicy.reachedMax()) + while (!retryPolicy.hasExpired()) { Iterator iter = requests.iterator(); boolean hasRequestToSelf = false; @@ -136,7 +136,7 @@ public ClusterMetadata fetchLogAndWait(Epoch waitFor, Retry.Deadline retryPolicy while (iter.hasNext()) { FetchLogRequest request = iter.next(); - if (request.condition.awaitUninterruptibly(Math.max(0, nextTimeout - Clock.Global.nanoTime()), TimeUnit.NANOSECONDS) && + if (request.condition.awaitThrowUncheckedOnInterrupt(Math.max(0, nextTimeout - Clock.Global.nanoTime()), TimeUnit.NANOSECONDS) && request.condition.isSuccess()) { collected.add(request.to.endpoint()); @@ -153,7 +153,8 @@ public ClusterMetadata fetchLogAndWait(Epoch waitFor, Retry.Deadline retryPolicy if (collected.size() < blockFor) { - retryPolicy.maybeSleep(); + if (!retryPolicy.maybeSleep()) + break; continue; } @@ -167,6 +168,31 @@ public ClusterMetadata fetchLogAndWait(Epoch waitFor, Retry.Deadline retryPolicy throw new ReadTimeoutException(ConsistencyLevel.QUORUM, blockFor - collected.size(), blockFor, false); } + @Override + public LogState getLocalState(Epoch start, Epoch end, boolean includeSnapshot) + { + return log.storage().getLogState(start, end, includeSnapshot); + } + + @Override + public LogState getLogState(Epoch start, Epoch end, boolean includeSnapshot, Retry retryPolicy) + { + while (true) + { + if (Thread.currentThread().isInterrupted()) + throw new RuntimeException("Can not reconstruct during shutdown", new InterruptedException()); + try + { + return DistributedMetadataLogKeyspace.getLogState(start, end, includeSnapshot); + } + catch (RuntimeException e) // honestly best to only retry timeouts, but everything gets wrapped in a RuntimeException... + { + if (!retryPolicy.maybeSleep()) + throw new RuntimeException(String.format("Could not reconstruct range %d, %d", start.getEpoch(), end.getEpoch()), new TimeoutException()); + } + } + } + private static T unwrap(Promise promise) { if (!promise.isDone() || !promise.isSuccess()) @@ -202,10 +228,10 @@ public void onResponse(Message msg) } @Override - public void onFailure(InetAddressAndPort from, RequestFailureReason failureReason) + public void onFailure(InetAddressAndPort from, RequestFailure failure) { - logger.debug("Error response from {} with {}", from, failureReason); - condition.tryFailure(new TimeoutException(failureReason.toString())); + logger.debug("Error response from {} with {}", from, failure.reason); + condition.tryFailure(new TimeoutException(failure.reason.toString())); } public void retry() diff --git a/src/java/org/apache/cassandra/tcm/PeerLogFetcher.java b/src/java/org/apache/cassandra/tcm/PeerLogFetcher.java index 3564ab93f70f..0565520bbd68 100644 --- a/src/java/org/apache/cassandra/tcm/PeerLogFetcher.java +++ b/src/java/org/apache/cassandra/tcm/PeerLogFetcher.java @@ -95,8 +95,7 @@ private Future fetchLogEntriesAndWaitInternal(InetAddressAndPor Verb.TCM_FETCH_PEER_LOG_REQ, new FetchPeerLog(before), new RemoteProcessor.CandidateIterator(Collections.singletonList(remote), false), - Retry.Deadline.after(DatabaseDescriptor.getCmsAwaitTimeout().to(TimeUnit.NANOSECONDS), - new Retry.Jitter(TCMMetrics.instance.fetchLogRetries))); + Retry.untilElapsed(DatabaseDescriptor.getCmsAwaitTimeout().to(TimeUnit.NANOSECONDS), TCMMetrics.instance.fetchLogRetries)); return fetchRes.map((logState) -> { log.append(logState); @@ -108,7 +107,7 @@ private Future fetchLogEntriesAndWaitInternal(InetAddressAndPor } else { - throw new IllegalStateException(String.format("Queried for epoch %s, but could not catch up", awaitAtleast)); + throw new IllegalStateException(String.format("Queried for epoch %s, but could not catch up. Current epoch: %s", awaitAtleast, fetched.epoch)); } }); diff --git a/src/java/org/apache/cassandra/tcm/Processor.java b/src/java/org/apache/cassandra/tcm/Processor.java index fdb4cf23bb4f..9a27ac9e6683 100644 --- a/src/java/org/apache/cassandra/tcm/Processor.java +++ b/src/java/org/apache/cassandra/tcm/Processor.java @@ -18,13 +18,22 @@ package org.apache.cassandra.tcm; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; import java.util.concurrent.TimeUnit; import com.codahale.metrics.Meter; -import org.apache.cassandra.config.DatabaseDescriptor; + +import accord.utils.Invariants; import org.apache.cassandra.metrics.TCMMetrics; +import org.apache.cassandra.service.WaitStrategy; import org.apache.cassandra.tcm.log.Entry; -import org.apache.cassandra.utils.Clock; +import org.apache.cassandra.tcm.log.LogState; + +import static java.util.concurrent.TimeUnit.NANOSECONDS; +import static org.apache.cassandra.config.DatabaseDescriptor.getCmsAwaitTimeout; +import static org.apache.cassandra.utils.Clock.Global.nanoTime; public interface Processor { @@ -43,8 +52,7 @@ default Commit.Result commit(Entry.Id entryId, Transformation transform, Epoch l } return commit(entryId, transform, lastKnown, - Retry.Deadline.after(DatabaseDescriptor.getCmsAwaitTimeout().to(TimeUnit.NANOSECONDS), - new Retry.Jitter(TCMMetrics.instance.commitRetries))); + Retry.untilElapsed(getCmsAwaitTimeout().to(NANOSECONDS), TCMMetrics.instance.commitRetries)); } /** @@ -52,33 +60,27 @@ default Commit.Result commit(Entry.Id entryId, Transformation transform, Epoch l * to overflow the long, since messaging is using only 32 bits for deadlines. To achieve that, we are * giving `timeoutNanos` every time we retry, but will retry indefinitely. */ - private static Retry.Deadline unsafeRetryIndefinitely() + private static Retry unsafeRetryIndefinitely() { - long timeoutNanos = DatabaseDescriptor.getCmsAwaitTimeout().to(TimeUnit.NANOSECONDS); + long timeoutNanos = getCmsAwaitTimeout().to(NANOSECONDS); Meter retryMeter = TCMMetrics.instance.commitRetries; - return new Retry.Deadline(Clock.Global.nanoTime() + timeoutNanos, - new Retry.Jitter(retryMeter)) + return Retry.withNoTimeLimit(retryMeter, new WaitStrategy() { @Override - public boolean reachedMax() + public long computeWaitUntil(int attempts) { - return false; + return nanoTime() + timeoutNanos; } @Override - public long remainingNanos() - { - return timeoutNanos; - } - - public String toString() + public long computeWait(int attempts, TimeUnit units) { - return String.format("RetryIndefinitely{tries=%d}", currentTries()); + return units.convert(timeoutNanos, NANOSECONDS); } - }; + }); } - Commit.Result commit(Entry.Id entryId, Transformation transform, Epoch lastKnown, Retry.Deadline retryPolicy); + Commit.Result commit(Entry.Id entryId, Transformation transform, Epoch lastKnown, Retry retryPolicy); /** * Fetches log from CMS up to the highest currently known epoch. @@ -97,8 +99,41 @@ default ClusterMetadata fetchLogAndWait() default ClusterMetadata fetchLogAndWait(Epoch waitFor) { return fetchLogAndWait(waitFor, - Retry.Deadline.after(DatabaseDescriptor.getCmsAwaitTimeout().to(TimeUnit.NANOSECONDS), - new Retry.Jitter(TCMMetrics.instance.fetchLogRetries))); + Retry.untilElapsed(getCmsAwaitTimeout().to(NANOSECONDS), TCMMetrics.instance.fetchLogRetries)); } - ClusterMetadata fetchLogAndWait(Epoch waitFor, Retry.Deadline retryPolicy); + + ClusterMetadata fetchLogAndWait(Epoch waitFor, Retry retryPolicy); + + /** + * Queries node's _local_ state. It is not guaranteed to be contiguous, but can be used for restoring CMS state/ + */ + LogState getLocalState(Epoch start, Epoch end, boolean includeSnapshot); + + /** + * Queries global log state. + */ + LogState getLogState(Epoch start, Epoch end, boolean includeSnapshot, Retry retryPolicy); + + /** + * Reconstructs + */ + default List reconstruct(Epoch lowEpoch, Epoch highEpoch, Retry retryPolicy) + { + LogState logState = getLogState(lowEpoch, highEpoch, true, retryPolicy); + if (logState.isEmpty()) return Collections.emptyList(); + List cms = new ArrayList<>(logState.entries.size()); + + ClusterMetadata acc = logState.baseState; + cms.add(acc); + for (Entry entry : logState.entries) + { + Invariants.require(entry.epoch.isDirectlyAfter(acc.epoch), "%s should have been directly after %s", entry.epoch, acc.epoch); + Transformation.Result res = entry.transform.execute(acc); + assert res.isSuccess() : res.toString(); + acc = res.success().metadata; + cms.add(acc); + } + return cms; + } + } diff --git a/src/java/org/apache/cassandra/tcm/ReconstructLogState.java b/src/java/org/apache/cassandra/tcm/ReconstructLogState.java new file mode 100644 index 000000000000..4dea1efeb87d --- /dev/null +++ b/src/java/org/apache/cassandra/tcm/ReconstructLogState.java @@ -0,0 +1,106 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.tcm; + +import java.io.IOException; +import java.util.concurrent.TimeUnit; +import java.util.function.Supplier; + +import org.apache.cassandra.db.TypeSizes; +import org.apache.cassandra.io.IVersionedSerializer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.metrics.TCMMetrics; +import org.apache.cassandra.net.IVerbHandler; +import org.apache.cassandra.net.Message; +import org.apache.cassandra.net.MessagingService; +import org.apache.cassandra.tcm.log.LogState; +import org.apache.cassandra.utils.FBUtilities; + +import static org.apache.cassandra.config.DatabaseDescriptor.getCmsAwaitTimeout; + +public class ReconstructLogState +{ + public static final Serializer serializer = new Serializer(); + + public final Epoch lowerBound; + public final Epoch higherBound; + public final boolean includeSnapshot; + + public ReconstructLogState(Epoch lowerBound, Epoch higherBound, boolean includeSnapshot) + { + this.lowerBound = lowerBound; + this.higherBound = higherBound; + this.includeSnapshot = includeSnapshot; + } + + static class Serializer implements IVersionedSerializer + { + + public void serialize(ReconstructLogState t, DataOutputPlus out, int version) throws IOException + { + Epoch.serializer.serialize(t.lowerBound, out); + Epoch.serializer.serialize(t.higherBound, out); + out.writeBoolean(t.includeSnapshot); + } + + public ReconstructLogState deserialize(DataInputPlus in, int version) throws IOException + { + Epoch lowerBound = Epoch.serializer.deserialize(in); + Epoch higherBound = Epoch.serializer.deserialize(in); + return new ReconstructLogState(lowerBound, higherBound, in.readBoolean()); + } + + public long serializedSize(ReconstructLogState t, int version) + { + return Epoch.serializer.serializedSize(t.lowerBound) + + Epoch.serializer.serializedSize(t.higherBound) + + TypeSizes.BOOL_SIZE; + } + } + + public static class Handler implements IVerbHandler + { + public static final Handler instance = new Handler(); + + private final Supplier processor; + + public Handler() + { + this(() -> ClusterMetadataService.instance().processor()); + } + public Handler(Supplier processor) + { + this.processor = processor; + } + public void doVerb(Message message) throws IOException + { + TCMMetrics.instance.reconstructLogStateCall.mark(); + ReconstructLogState request = message.payload; + + if (!ClusterMetadataService.instance().isCurrentMember(FBUtilities.getBroadcastAddressAndPort())) + throw new NotCMSException("This node is not in the CMS, can't generate a consistent log fetch response to " + message.from()); + + LogState result = processor.get().getLogState(request.lowerBound, request.higherBound, request.includeSnapshot, + Retry.untilElapsed(getCmsAwaitTimeout().to(TimeUnit.NANOSECONDS), TCMMetrics.instance.fetchLogRetries)); + + MessagingService.instance().send(message.responseWith(result), message.from()); + } + } +} diff --git a/src/java/org/apache/cassandra/tcm/RemoteProcessor.java b/src/java/org/apache/cassandra/tcm/RemoteProcessor.java index 0ea055b908f9..fe87719593ac 100644 --- a/src/java/org/apache/cassandra/tcm/RemoteProcessor.java +++ b/src/java/org/apache/cassandra/tcm/RemoteProcessor.java @@ -34,6 +34,7 @@ import com.codahale.metrics.Timer; import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.exceptions.RequestFailure; import org.apache.cassandra.exceptions.RequestFailureReason; import org.apache.cassandra.gms.FailureDetector; import org.apache.cassandra.locator.InetAddressAndPort; @@ -43,12 +44,12 @@ import org.apache.cassandra.net.MessagingService; import org.apache.cassandra.net.RequestCallbackWithFailure; import org.apache.cassandra.net.Verb; +import org.apache.cassandra.service.WaitStrategy; import org.apache.cassandra.tcm.Discovery.DiscoveredNodes; import org.apache.cassandra.tcm.log.Entry; import org.apache.cassandra.tcm.log.LocalLog; import org.apache.cassandra.tcm.log.LogState; import org.apache.cassandra.utils.AbstractIterator; -import org.apache.cassandra.utils.Backoff; import org.apache.cassandra.utils.FBUtilities; import org.apache.cassandra.utils.concurrent.AsyncPromise; import org.apache.cassandra.utils.concurrent.Future; @@ -72,7 +73,7 @@ public final class RemoteProcessor implements Processor @Override @SuppressWarnings("resource") - public Commit.Result commit(Entry.Id entryId, Transformation transform, Epoch lastKnown, Retry.Deadline retryPolicy) + public Commit.Result commit(Entry.Id entryId, Transformation transform, Epoch lastKnown, Retry retryPolicy) { try { @@ -125,7 +126,7 @@ private List candidates(boolean allowDiscovery) } @Override - public ClusterMetadata fetchLogAndWait(Epoch waitFor, Retry.Deadline retryPolicy) + public ClusterMetadata fetchLogAndWait(Epoch waitFor, Retry retryPolicy) { // Synchonous, non-debounced call if we are waiting for the highest epoch (without knowing/caring what it is). // Should be used sparingly. @@ -150,6 +151,36 @@ public ClusterMetadata fetchLogAndWait(Epoch waitFor, Retry.Deadline retryPolicy } } + @Override + public LogState getLocalState(Epoch start, Epoch end, boolean includeSnapshot) + { + return log.getLocalEntries(start, end, includeSnapshot); + } + + @Override + public LogState getLogState(Epoch lowEpoch, Epoch highEpoch, boolean includeSnapshot, Retry retryPolicy) + { + try + { + Promise request = new AsyncPromise<>(); + List candidates = new ArrayList<>(log.metadata().fullCMSMembers()); + sendWithCallbackAsync(request, + Verb.TCM_RECONSTRUCT_EPOCH_REQ, + new ReconstructLogState(lowEpoch, highEpoch, includeSnapshot), + new CandidateIterator(candidates), + retryPolicy); + return request.get(retryPolicy.remainingNanos(), TimeUnit.NANOSECONDS); + } + catch (InterruptedException e) + { + throw new RuntimeException("Can not reconstruct during shutdown", e); + } + catch (ExecutionException | TimeoutException e) + { + throw new RuntimeException(String.format("Could not reconstruct range %d, %d", lowEpoch.getEpoch(), highEpoch.getEpoch()), e); + } + } + public static ClusterMetadata fetchLogAndWait(CandidateIterator candidateIterator, LocalLog log) { try @@ -162,8 +193,7 @@ public static ClusterMetadata fetchLogAndWait(CandidateIterator candidateIterato } } - private static Future fetchLogAndWaitInternal(CandidateIterator candidates, - LocalLog log) + private static Future fetchLogAndWaitInternal(CandidateIterator candidates, LocalLog log) { try (Timer.Context ctx = TCMMetrics.instance.fetchCMSLogLatency.time()) { @@ -173,7 +203,7 @@ private static Future fetchLogAndWaitInternal(CandidateIterator Verb.TCM_FETCH_CMS_LOG_REQ, new FetchCMSLog(currentEpoch, ClusterMetadataService.state() == REMOTE), candidates, - new Retry.Backoff(TCMMetrics.instance.fetchLogRetries)); + Retry.withNoTimeLimit(TCMMetrics.instance.fetchLogRetries)); return remoteRequest.map((replay) -> { if (!replay.isEmpty()) { @@ -187,12 +217,12 @@ private static Future fetchLogAndWaitInternal(CandidateIterator } // todo rename to send with retries or something - public static RSP sendWithCallback(Verb verb, REQ request, CandidateIterator candidates, Retry retryPolicy) + public static RSP sendWithCallback(Verb verb, REQ request, CandidateIterator candidates, WaitStrategy backoff) { try { Promise promise = new AsyncPromise<>(); - sendWithCallbackAsync(promise, verb, request, candidates, retryPolicy); + sendWithCallbackAsync(promise, verb, request, candidates, backoff); return promise.await().get(); } catch (InterruptedException | ExecutionException e) @@ -201,10 +231,10 @@ public static RSP sendWithCallback(Verb verb, REQ request, CandidateI } } - public static void sendWithCallbackAsync(Promise promise, Verb verb, REQ request, CandidateIterator candidates, Retry retryPolicy) + public static void sendWithCallbackAsync(Promise promise, Verb verb, REQ request, CandidateIterator candidates, WaitStrategy backoff) { //TODO (now): the retry defines how long to wait for a retry, but the old behavior scheduled the message right away... should this be delayed as well? - MessagingService.instance().sendWithRetries(Backoff.fromRetry(retryPolicy), MessageDelivery.ImmediateRetryScheduler.instance, + MessagingService.instance().sendWithRetries(backoff, MessageDelivery.ImmediateRetryScheduler.instance, verb, request, candidates, (attempt, success, failure) -> { if (failure != null) promise.tryFailure(failure); @@ -213,7 +243,7 @@ public static void sendWithCallbackAsync(Promise promise, Verb v (attempt, from, failure) -> { if (promise.isDone() || promise.isCancelled()) return false; - if (failure == RequestFailureReason.NOT_CMS) + if (failure.reason == RequestFailureReason.NOT_CMS) { logger.debug("{} is not a member of the CMS, querying it to discover current membership", from); DiscoveredNodes cms = tryDiscover(from); @@ -233,8 +263,8 @@ public static void sendWithCallbackAsync(Promise promise, Verb v { case NoMoreCandidates: return String.format("Ran out of candidates while sending %s: %s", verb, candidates); - case MaxRetries: - return String.format("Could not succeed sending %s to %s after %d tries", verb, candidates, retryPolicy.tries); + case GiveUp: + return String.format("Could not succeed sending %s to %s; policy %s gave up", verb, candidates, backoff); case Interrupted: case FailedSchedule: return null; @@ -257,7 +287,7 @@ public void onResponse(Message msg) } @Override - public void onFailure(InetAddressAndPort from, RequestFailureReason failureReason) + public void onFailure(InetAddressAndPort from, RequestFailure failureReason) { // "success" - this lets us just try the next one in cmsIter promise.setSuccess(new DiscoveredNodes(Collections.emptySet(), DiscoveredNodes.Kind.KNOWN_PEERS)); diff --git a/src/java/org/apache/cassandra/tcm/Retry.java b/src/java/org/apache/cassandra/tcm/Retry.java index 3277531444a6..a66f64e1130c 100644 --- a/src/java/org/apache/cassandra/tcm/Retry.java +++ b/src/java/org/apache/cassandra/tcm/Retry.java @@ -18,175 +18,143 @@ package org.apache.cassandra.tcm; -import java.util.Random; -import java.util.concurrent.ThreadLocalRandom; import java.util.concurrent.TimeUnit; import com.codahale.metrics.Meter; import org.apache.cassandra.config.DatabaseDescriptor; -import org.apache.cassandra.utils.Clock; +import org.apache.cassandra.config.DurationSpec; +import org.apache.cassandra.service.RetryStrategy; +import org.apache.cassandra.service.TimeoutStrategy.LatencySourceFactory; +import org.apache.cassandra.service.WaitStrategy; import static com.google.common.util.concurrent.Uninterruptibles.sleepUninterruptibly; +import static org.apache.cassandra.utils.Clock.Global.nanoTime; -public abstract class Retry +// TODO (expected): unwrap this, use RetryStrategy directly +public class Retry implements WaitStrategy { - protected static final int MAX_TRIES = DatabaseDescriptor.getCmsDefaultRetryMaxTries(); - protected final int maxTries; - protected int tries; - protected Meter retryMeter; - - public Retry(Meter retryMeter) + private static final WaitStrategy DEFAULT_STRATEGY; + static { - this(MAX_TRIES, retryMeter); + DurationSpec.IntMillisecondsBound defaultBackoff = DatabaseDescriptor.getDefaultRetryBackoff(); + DurationSpec.IntMillisecondsBound defaultMaxBackoff = DatabaseDescriptor.getDefaultMaxRetryBackoff(); + String defaultSpec = DatabaseDescriptor.getCMSRetryDelay(); + if (defaultSpec == null || (defaultBackoff != null || defaultMaxBackoff != null)) + { + defaultSpec = (defaultBackoff == null ? "50ms" : defaultBackoff.toMilliseconds() + "ms") + + "*attempts <=" + (defaultMaxBackoff == null ? "10s" : defaultMaxBackoff.toMilliseconds() + "ms") + + ",retries=" + DatabaseDescriptor.getCmsDefaultRetryMaxTries(); + } + DEFAULT_STRATEGY = RetryStrategy.parse(defaultSpec, LatencySourceFactory.none()); } - public Retry(int maxTries, Meter retryMeter) + public final long deadlineNanos; + protected Meter retryMeter; + private final WaitStrategy delegate; + int attempts = 1; + + public Retry(long deadlineNanos, Meter retryMeter, WaitStrategy delegate) { - this.maxTries = maxTries; + this.deadlineNanos = deadlineNanos; this.retryMeter = retryMeter; + this.delegate = delegate; } - public int currentTries() + public Retry(long deadlineNanos, Meter retryMeter) { - return tries; + this(deadlineNanos, retryMeter, DEFAULT_STRATEGY); } - public boolean reachedMax() + public int attempts() { - return tries >= maxTries; + return attempts; } - public void maybeSleep() + public boolean hasExpired() { - sleepUninterruptibly(computeSleepFor(), TimeUnit.MILLISECONDS); + return nanoTime() >= deadlineNanos; } - public long computeSleepFor() + public boolean maybeSleep() { - tries++; - retryMeter.mark(); - return sleepFor(); + long wait = computeWait(attempts, TimeUnit.MILLISECONDS); + if (wait < 0) + return false; + sleepUninterruptibly(wait, TimeUnit.MILLISECONDS); + return true; } - protected abstract long sleepFor(); - - public static class Jitter extends Retry + @Override + public long computeWaitUntil(int attempts) { - public static final int MAX_JITTER_MS = Math.toIntExact(DatabaseDescriptor.getDefaultRetryBackoff().to(TimeUnit.MILLISECONDS)); - private final Random random; - private final int maxJitterMs; - - public Jitter(Meter retryMeter) - { - this(MAX_TRIES, MAX_JITTER_MS, new Random(), retryMeter); - } - - private Jitter(int maxTries, int maxJitterMs, Random random, Meter retryMeter) - { - super(maxTries, retryMeter); - this.random = random; - this.maxJitterMs = maxJitterMs; - } - - public long sleepFor() - { - int actualBackoff = ThreadLocalRandom.current().nextInt(maxJitterMs / 2, maxJitterMs); - return random.nextInt(actualBackoff); - } - - @Override - public String toString() - { - return "Jitter{" + - ", maxTries=" + maxTries + - ", tries=" + tries + - ", maxJitterMs=" + maxJitterMs + - '}'; - } + long wait = computeWaitInternal(attempts, TimeUnit.NANOSECONDS); + if (wait < 0) + return -1; + long now = nanoTime(); + if (now >= deadlineNanos) + return -1; + return Math.min(deadlineNanos, wait + now); } - public static class Backoff extends Retry + @Override + public long computeWait(int attempts, TimeUnit units) { - private static final int RETRY_BACKOFF_MS = Math.toIntExact(DatabaseDescriptor.getDefaultRetryBackoff().to(TimeUnit.MILLISECONDS)); - protected final int backoffMs; - - public Backoff(Meter retryMeter) - { - this(MAX_TRIES, RETRY_BACKOFF_MS, retryMeter); - } - - public Backoff(int maxTries, int backoffMs, Meter retryMeter) - { - super(maxTries, retryMeter); - this.backoffMs = backoffMs; - } - - public long sleepFor() - { - return (long) tries * backoffMs; - } - - @Override - public String toString() - { - return "Backoff{" + - "backoffMs=" + backoffMs + - ", maxTries=" + maxTries + - ", tries=" + tries + - '}'; - } + long wait = computeWaitInternal(attempts, TimeUnit.NANOSECONDS); + if (wait < 0) + return -1; + + if (deadlineNanos == Long.MAX_VALUE) + return wait; + + long now = nanoTime(); + wait = Math.min(deadlineNanos - now, wait); + if (wait <= 0) + return -1; + return units.convert(wait, TimeUnit.NANOSECONDS); } - public static class Deadline extends Retry + private long computeWaitInternal(int attempts, TimeUnit units) { - public final long deadlineNanos; - protected final Retry delegate; - - public Deadline(long deadlineNanos, Retry delegate) - { - super(delegate.maxTries, delegate.retryMeter); - assert deadlineNanos > 0 : String.format("Deadline should be strictly positive but was %d.", deadlineNanos); - this.deadlineNanos = deadlineNanos; - this.delegate = delegate; - } - - public static Deadline at(long deadlineNanos, Retry delegate) - { - return new Deadline(deadlineNanos, delegate); - } + retryMeter.mark(); + attempts = Math.max(attempts, ++this.attempts); + return delegate.computeWait(attempts, units); + } - public static Deadline after(long timeoutNanos, Retry delegate) - { - return new Deadline(Clock.Global.nanoTime() + timeoutNanos, delegate); - } + // imposes attempt limit + public static Retry withNoTimeLimit(Meter retryMeter) + { + return new Retry(Long.MAX_VALUE, retryMeter, DEFAULT_STRATEGY); + } - @Override - public boolean reachedMax() - { - return delegate.reachedMax() || Clock.Global.nanoTime() > deadlineNanos; - } + public static Retry withNoTimeLimit(Meter retryMeter, WaitStrategy delegate) + { + return new Retry(Long.MAX_VALUE, retryMeter, delegate); + } - public long remainingNanos() - { - return Math.max(0, deadlineNanos - Clock.Global.nanoTime()); - } + public static Retry until(long deadlineNanos, Meter retryMeter) + { + return new Retry(deadlineNanos, retryMeter, DEFAULT_STRATEGY); + } - @Override - public int currentTries() - { - return delegate.currentTries(); - } + public static Retry untilElapsed(long timeoutNanos, Meter retryMeter) + { + return new Retry(nanoTime() + timeoutNanos, retryMeter, DEFAULT_STRATEGY); + } - @Override - public long sleepFor() - { - return delegate.sleepFor(); - } + public static Retry untilElapsed(long timeoutNanos, Meter retryMeter, WaitStrategy waitStrategy) + { + return new Retry(nanoTime() + timeoutNanos, retryMeter, waitStrategy); + } - public String toString() - { - return String.format("Deadline{remainingMs=%d, tries=%d/%d}", TimeUnit.NANOSECONDS.toMillis(remainingNanos()), currentTries(), delegate.maxTries); - } + public String toString() + { + if (deadlineNanos == Long.MAX_VALUE) + return "RetryIndefinitely{attempts=" + attempts + '}'; + return String.format("Retry{remainingMs=%d, attempts=%d}", TimeUnit.NANOSECONDS.toMillis(remainingNanos()), attempts()); } + public long remainingNanos() + { + return Math.max(0, deadlineNanos - nanoTime()); + } } diff --git a/src/java/org/apache/cassandra/tcm/Startup.java b/src/java/org/apache/cassandra/tcm/Startup.java index d17b68769886..64ccbf37e852 100644 --- a/src/java/org/apache/cassandra/tcm/Startup.java +++ b/src/java/org/apache/cassandra/tcm/Startup.java @@ -53,6 +53,7 @@ import org.apache.cassandra.schema.SchemaConstants; import org.apache.cassandra.schema.TableMetadata; import org.apache.cassandra.service.StorageService; +import org.apache.cassandra.service.accord.AccordService; import org.apache.cassandra.tcm.log.LocalLog; import org.apache.cassandra.tcm.log.LogStorage; import org.apache.cassandra.tcm.log.SystemKeyspaceStorage; @@ -75,6 +76,7 @@ import static org.apache.cassandra.tcm.compatibility.GossipHelper.fromEndpointStates; import static org.apache.cassandra.tcm.membership.NodeState.JOINED; import static org.apache.cassandra.tcm.membership.NodeState.LEFT; +import static org.apache.cassandra.tcm.membership.NodeState.REGISTERED; import static org.apache.cassandra.utils.FBUtilities.getBroadcastAddressAndPort; /** @@ -246,7 +248,7 @@ public static void initializeForDiscovery(Runnable initMessaging) else { CMSInitializationRequest.Initiator initiator = Election.instance.initiator(); - candidates = Discovery.instance.discoverOnce(initiator == null ? null : initiator.initiator); + candidates = Discovery.instance.discoverOnce(initiator == null ? null : initiator.endpoint); } Uninterruptibles.sleepUninterruptibly(1, TimeUnit.SECONDS); } @@ -420,7 +422,11 @@ public static void startup(Supplier initialTransformation, boole InProgressSequences.finishInProgressSequences(self, true); metadata = ClusterMetadata.current(); - switch (metadata.directory.peerState(self)) + NodeState startingstate = metadata.directory.peerState(self); + if (startingstate != REGISTERED && startingstate != LEFT) + AccordService.startup(self); + + switch (startingstate) { case REGISTERED: case LEFT: @@ -428,6 +434,10 @@ public static void startup(Supplier initialTransformation, boole ReconfigureCMS.maybeReconfigureCMS(metadata, DatabaseDescriptor.getReplaceAddress()); ClusterMetadataService.instance().commit(initialTransformation.get()); + // When Accord starts up it needs to check for any historic epochs that it needs to know about (in order + // to handle pending transactions), in order to know what nodes to check with it needs to know what the + // settled placement is (so it knows what peers to reach out to). + AccordService.startup(self); InProgressSequences.finishInProgressSequences(self, true); // potentially finish the MSO committed above metadata = ClusterMetadata.current(); @@ -552,15 +562,17 @@ static StartupMode get(Set seeds) } if (seeds.isEmpty()) throw new IllegalArgumentException("Can not initialize CMS without any seeds"); - boolean hasAnyEpoch = SystemKeyspaceStorage.hasAnyEpoch(); + // For CCM and local dev clusters boolean isOnlySeed = DatabaseDescriptor.getSeeds().size() == 1 && DatabaseDescriptor.getSeeds().contains(FBUtilities.getBroadcastAddressAndPort()) && DatabaseDescriptor.getSeeds().iterator().next().getAddress().isLoopbackAddress(); boolean hasBootedBefore = SystemKeyspace.getLocalHostId() != null; logger.info("hasAnyEpoch = {}, hasBootedBefore = {}", hasAnyEpoch, hasBootedBefore); - if (!hasAnyEpoch && hasBootedBefore) + if (!hasAnyEpoch && hasBootedBefore && + // Atomic long processor currently does not support upgrades + !CassandraRelevantProperties.TCM_USE_ATOMIC_LONG_PROCESSOR.getBoolean()) return UPGRADE; else if (hasAnyEpoch) return NORMAL; diff --git a/src/java/org/apache/cassandra/tcm/StubClusterMetadataService.java b/src/java/org/apache/cassandra/tcm/StubClusterMetadataService.java index 855cce0d363a..d062d202e64a 100644 --- a/src/java/org/apache/cassandra/tcm/StubClusterMetadataService.java +++ b/src/java/org/apache/cassandra/tcm/StubClusterMetadataService.java @@ -28,9 +28,13 @@ import org.apache.cassandra.schema.DistributedSchema; import org.apache.cassandra.schema.KeyspaceMetadata; import org.apache.cassandra.schema.Keyspaces; +import org.apache.cassandra.service.accord.AccordFastPath; +import org.apache.cassandra.service.accord.AccordStaleReplicas; +import org.apache.cassandra.service.consensus.migration.ConsensusMigrationState; import org.apache.cassandra.tcm.Commit.Replicator; import org.apache.cassandra.tcm.log.Entry; import org.apache.cassandra.tcm.log.LocalLog; +import org.apache.cassandra.tcm.log.LogState; import org.apache.cassandra.tcm.membership.Directory; import org.apache.cassandra.tcm.ownership.DataPlacements; import org.apache.cassandra.tcm.ownership.PlacementProvider; @@ -70,7 +74,7 @@ public static StubClusterMetadataService forTesting(ClusterMetadata metadata) private ClusterMetadata metadata; - private StubClusterMetadataService(ClusterMetadata initial) + protected StubClusterMetadataService(ClusterMetadata initial) { super(new UniformRangePlacement(), MetadataSnapshots.NO_OP, @@ -101,15 +105,20 @@ private StubClusterMetadataService(PlacementProvider placement, @Override public T1 commit(Transformation transform, CommitSuccessHandler onSuccess, CommitFailureHandler onFailure) { - Transformation.Result result = transform.execute(metadata); + Transformation.Result result = execute(transform); if (result.isSuccess()) { - metadata = result.success().metadata; + setMetadata(result.success().metadata); return onSuccess.accept(result.success().metadata); } return onFailure.accept(result.rejected().code, result.rejected().reason); } + protected Transformation.Result execute(Transformation transform) + { + return transform.execute(metadata()); + } + @Override public ClusterMetadata fetchLogFromCMS(Epoch awaitAtLeast) { @@ -133,13 +142,25 @@ private static class StubProcessor implements Processor private StubProcessor() {} @Override - public Commit.Result commit(Entry.Id entryId, Transformation transform, Epoch lastKnown, Retry.Deadline retryPolicy) + public Commit.Result commit(Entry.Id entryId, Transformation transform, Epoch lastKnown, Retry retryPolicy) + { + throw new UnsupportedOperationException(); + } + + @Override + public ClusterMetadata fetchLogAndWait(Epoch waitFor, Retry retryPolicy) + { + throw new UnsupportedOperationException(); + } + + @Override + public LogState getLocalState(Epoch start, Epoch end, boolean includeSnapshot) { throw new UnsupportedOperationException(); } @Override - public ClusterMetadata fetchLogAndWait(Epoch waitFor, Retry.Deadline retryPolicy) + public LogState getLogState(Epoch start, Epoch end, boolean includeSnapshot, Retry retryPolicy) { throw new UnsupportedOperationException(); } @@ -172,9 +193,12 @@ public StubClusterMetadataService build() Directory.EMPTY, new TokenMap(partitioner), DataPlacements.EMPTY, + AccordFastPath.EMPTY, LockedRanges.EMPTY, InProgressSequences.EMPTY, - ImmutableMap.of()); + ConsensusMigrationState.EMPTY, + ImmutableMap.of(), + AccordStaleReplicas.EMPTY); } return new StubClusterMetadataService(new UniformRangePlacement(), snapshots != null ? snapshots : MetadataSnapshots.NO_OP, diff --git a/src/java/org/apache/cassandra/tcm/Transformation.java b/src/java/org/apache/cassandra/tcm/Transformation.java index 8cfda01e26c0..928ce59aca12 100644 --- a/src/java/org/apache/cassandra/tcm/Transformation.java +++ b/src/java/org/apache/cassandra/tcm/Transformation.java @@ -38,8 +38,28 @@ import org.apache.cassandra.tcm.serialization.AsymmetricMetadataSerializer; import org.apache.cassandra.tcm.serialization.VerboseMetadataSerializer; import org.apache.cassandra.tcm.serialization.Version; -import org.apache.cassandra.tcm.transformations.*; +import org.apache.cassandra.tcm.transformations.AccordMarkRejoining; +import org.apache.cassandra.tcm.transformations.AccordMarkStale; +import org.apache.cassandra.tcm.transformations.AlterSchema; +import org.apache.cassandra.tcm.transformations.AlterTopology; +import org.apache.cassandra.tcm.transformations.Assassinate; +import org.apache.cassandra.tcm.transformations.BeginConsensusMigrationForTableAndRange; +import org.apache.cassandra.tcm.transformations.CancelInProgressSequence; +import org.apache.cassandra.tcm.transformations.CustomTransformation; +import org.apache.cassandra.tcm.transformations.FinishDropAccordTable; +import org.apache.cassandra.tcm.transformations.ForceSnapshot; +import org.apache.cassandra.tcm.transformations.MaybeFinishConsensusMigrationForTableAndRange; +import org.apache.cassandra.tcm.transformations.PrepareDropAccordTable; +import org.apache.cassandra.tcm.transformations.PrepareJoin; +import org.apache.cassandra.tcm.transformations.PrepareLeave; +import org.apache.cassandra.tcm.transformations.PrepareMove; +import org.apache.cassandra.tcm.transformations.PrepareReplace; +import org.apache.cassandra.tcm.transformations.ReconfigureAccordFastPath; +import org.apache.cassandra.tcm.transformations.Register; import org.apache.cassandra.tcm.transformations.Startup; +import org.apache.cassandra.tcm.transformations.TriggerSnapshot; +import org.apache.cassandra.tcm.transformations.Unregister; +import org.apache.cassandra.tcm.transformations.UnsafeJoin; import org.apache.cassandra.tcm.transformations.cms.AdvanceCMSReconfiguration; import org.apache.cassandra.tcm.transformations.cms.FinishAddToCMS; import org.apache.cassandra.tcm.transformations.cms.Initialize; @@ -219,7 +239,17 @@ enum Kind PREPARE_SIMPLE_CMS_RECONFIGURATION(31, () -> PrepareCMSReconfiguration.Simple.serializer), PREPARE_COMPLEX_CMS_RECONFIGURATION(32, () -> PrepareCMSReconfiguration.Complex.serializer), ADVANCE_CMS_RECONFIGURATION(33, () -> AdvanceCMSReconfiguration.serializer), - CANCEL_CMS_RECONFIGURATION(34, () -> CancelCMSReconfiguration.serializer) + CANCEL_CMS_RECONFIGURATION(34, () -> CancelCMSReconfiguration.serializer), + ALTER_TOPOLOGY(35, () -> AlterTopology.serializer), + + UPDATE_AVAILABILITY(36, () -> ReconfigureAccordFastPath.serializer), + + BEGIN_CONSENSUS_MIGRATION_FOR_TABLE_AND_RANGE(37, () -> BeginConsensusMigrationForTableAndRange.serializer), + MAYBE_FINISH_CONSENSUS_MIGRATION_FOR_TABLE_AND_RANGE(38, () -> MaybeFinishConsensusMigrationForTableAndRange.serializer), + ACCORD_MARK_STALE(39, () -> AccordMarkStale.serializer), + ACCORD_MARK_REJOINING(40, () -> AccordMarkRejoining.serializer), + PREPARE_DROP_ACCORD_TABLE(41, () -> PrepareDropAccordTable.serializer), + FINISH_DROP_ACCORD_TABLE(42, () -> FinishDropAccordTable.serializer), ; private final Supplier> serializer; diff --git a/src/java/org/apache/cassandra/tcm/compatibility/GossipHelper.java b/src/java/org/apache/cassandra/tcm/compatibility/GossipHelper.java index 1a555fce4d36..f569a9d1cbc5 100644 --- a/src/java/org/apache/cassandra/tcm/compatibility/GossipHelper.java +++ b/src/java/org/apache/cassandra/tcm/compatibility/GossipHelper.java @@ -34,7 +34,6 @@ import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.Lists; - import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -55,11 +54,14 @@ import org.apache.cassandra.schema.SchemaConstants; import org.apache.cassandra.schema.SchemaKeyspace; import org.apache.cassandra.service.StorageService; +import org.apache.cassandra.service.accord.AccordStaleReplicas; +import org.apache.cassandra.service.consensus.migration.ConsensusMigrationState; import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.tcm.Epoch; import org.apache.cassandra.tcm.MultiStepOperation; import org.apache.cassandra.tcm.extensions.ExtensionKey; import org.apache.cassandra.tcm.extensions.ExtensionValue; +import org.apache.cassandra.service.accord.AccordFastPath; import org.apache.cassandra.tcm.membership.Directory; import org.apache.cassandra.tcm.membership.Location; import org.apache.cassandra.tcm.membership.NodeAddresses; @@ -295,9 +297,12 @@ public static ClusterMetadata emptyWithSchemaFromSystemTables(Set allKno Directory.EMPTY, new TokenMap(DatabaseDescriptor.getPartitioner()), DataPlacements.empty(), + AccordFastPath.EMPTY, LockedRanges.EMPTY, InProgressSequences.EMPTY, - Collections.emptyMap()); + ConsensusMigrationState.EMPTY, + Collections.emptyMap(), + AccordStaleReplicas.EMPTY); } public static ClusterMetadata fromEndpointStates(DistributedSchema schema, Map epStates) @@ -382,9 +387,12 @@ public static ClusterMetadata fromEndpointStates(Map epstates) diff --git a/src/java/org/apache/cassandra/tcm/listeners/LegacyStateListener.java b/src/java/org/apache/cassandra/tcm/listeners/LegacyStateListener.java index f0fced7ae6d5..798583a533f8 100644 --- a/src/java/org/apache/cassandra/tcm/listeners/LegacyStateListener.java +++ b/src/java/org/apache/cassandra/tcm/listeners/LegacyStateListener.java @@ -115,6 +115,9 @@ public void notifyPostCommit(ClusterMetadata prev, ClusterMetadata next, boolean // state for the local node. Gossiper.instance.maybeInitializeLocalState(SystemKeyspace.incrementAndGetGeneration()); Gossiper.instance.addLocalApplicationState(SCHEMA, StorageService.instance.valueFactory.schema(next.schema.getVersion())); + // if the local node's location has changed, update system.local. + if (!next.directory.location(change).equals(prev.directory.location(change))) + SystemKeyspace.updateLocation(next.directory.location(change)); } if (next.directory.peerState(change) == REGISTERED) @@ -181,6 +184,8 @@ private boolean directoryEntryChangedFor(NodeId nodeId, Directory prev, Director { return prev.peerState(nodeId) != next.peerState(nodeId) || !Objects.equals(prev.getNodeAddresses(nodeId), next.getNodeAddresses(nodeId)) || - !Objects.equals(prev.version(nodeId), next.version(nodeId)); + !Objects.equals(prev.version(nodeId), next.version(nodeId)) || + !Objects.equals(prev.location(nodeId), next.location(nodeId)); + } } diff --git a/src/java/org/apache/cassandra/tcm/listeners/PlacementsChangeListener.java b/src/java/org/apache/cassandra/tcm/listeners/PlacementsChangeListener.java index 80da4ec844d1..605b52637813 100644 --- a/src/java/org/apache/cassandra/tcm/listeners/PlacementsChangeListener.java +++ b/src/java/org/apache/cassandra/tcm/listeners/PlacementsChangeListener.java @@ -34,7 +34,7 @@ public void notifyPostCommit(ClusterMetadata prev, ClusterMetadata next, boolean private boolean shouldInvalidate(ClusterMetadata prev, ClusterMetadata next) { if (!prev.placements.lastModified().equals(next.placements.lastModified()) && - !prev.placements.equals(next.placements)) // <- todo should we update lastModified if the result is the same? + !prev.placements.equivalentTo(next.placements)) // <- todo should we update lastModified if the result is the same? return true; if (prev.schema.getKeyspaces().size() != next.schema.getKeyspaces().size()) diff --git a/src/java/org/apache/cassandra/tcm/log/LocalLog.java b/src/java/org/apache/cassandra/tcm/log/LocalLog.java index 6e54e4657fa5..a0f50d16ad8d 100644 --- a/src/java/org/apache/cassandra/tcm/log/LocalLog.java +++ b/src/java/org/apache/cassandra/tcm/log/LocalLog.java @@ -70,7 +70,7 @@ import org.apache.cassandra.utils.concurrent.Condition; import org.apache.cassandra.utils.concurrent.WaitQueue; -import static org.apache.cassandra.concurrent.InfiniteLoopExecutor.Daemon.NON_DAEMON; +import static org.apache.cassandra.concurrent.ExecutorFactory.SystemThreadTag.NON_DAEMON; import static org.apache.cassandra.concurrent.InfiniteLoopExecutor.Interrupts.UNSYNCHRONIZED; import static org.apache.cassandra.concurrent.InfiniteLoopExecutor.SimulatorSafe.SAFE; import static org.apache.cassandra.tcm.Epoch.EMPTY; @@ -300,6 +300,11 @@ public void bootstrap(InetAddressAndPort addr, String datacenter) assert metadata.epoch.is(Epoch.FIRST) : String.format("Epoch: %s. CMS: %s", metadata.epoch, metadata.fullCMSMembers()); } + public LogStorage storage() + { + return storage; + } + public ClusterMetadata metadata() { return committed.get(); @@ -351,11 +356,16 @@ public Optional highestPending() } } - public LogState getCommittedEntries(Epoch since) + public LogState getLocalEntries(Epoch since) { return storage.getLogState(since, false); } + public LogState getLocalEntries(Epoch since, Epoch until, boolean includeSnapshot) + { + return storage.getLogState(since, until, includeSnapshot); + } + public ClusterMetadata waitForHighestConsecutive() { runOnce(); diff --git a/src/java/org/apache/cassandra/tcm/log/LogReader.java b/src/java/org/apache/cassandra/tcm/log/LogReader.java index 688f1c76531b..4ec597a31122 100644 --- a/src/java/org/apache/cassandra/tcm/log/LogReader.java +++ b/src/java/org/apache/cassandra/tcm/log/LogReader.java @@ -20,6 +20,7 @@ import java.io.IOException; import java.util.Comparator; +import java.util.Iterator; import java.util.List; import java.util.SortedSet; import java.util.TreeSet; @@ -27,6 +28,8 @@ import com.google.common.collect.ImmutableList; import com.google.common.collect.Ordering; +import accord.utils.Invariants; +import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.tcm.Epoch; import org.apache.cassandra.tcm.MetadataSnapshots; @@ -37,6 +40,7 @@ public interface LogReader * Gets all entries where epoch >= since - could be empty if since is a later epoch than the current highest seen */ EntryHolder getEntries(Epoch since) throws IOException; + EntryHolder getEntries(Epoch since, Epoch until) throws IOException; MetadataSnapshots snapshots(); /** @@ -117,6 +121,58 @@ else if (!allowSnapshots) } } + default LogState getLogState(Epoch start, Epoch end, boolean includeSnapshot) + { + try + { + ClusterMetadata closestSnapshot = null; + if (includeSnapshot) + closestSnapshot = snapshots().getSnapshotBefore(start); + + // Snapshot could not be found, fetch enough epochs to reconstruct the start metadata + if (closestSnapshot == null) + { + if (includeSnapshot) + closestSnapshot = new ClusterMetadata(DatabaseDescriptor.getPartitioner()); + ImmutableList.Builder entries = new ImmutableList.Builder<>(); + EntryHolder entryHolder = getEntries(Epoch.EMPTY, end); + for (Entry entry : entryHolder.entries) + { + if (entry.epoch.isAfter(start)) + entries.add(entry); + else if (includeSnapshot) + closestSnapshot = entry.transform.execute(closestSnapshot).success().metadata; + } + return new LogState(closestSnapshot, entries.build()); + } + else if (closestSnapshot.epoch.isBefore(start)) + { + ImmutableList.Builder entries = new ImmutableList.Builder<>(); + // start is exclusive, so use the closest snapshot + EntryHolder entryHolder = getEntries(closestSnapshot.epoch, end); + for (Entry entry : entryHolder.entries) + { + if (entry.epoch.isAfter(start)) + entries.add(entry); + else if (includeSnapshot) + closestSnapshot = entry.transform.execute(closestSnapshot).success().metadata; + } + return new LogState(closestSnapshot, entries.build()); + } + else + { + Invariants.require(closestSnapshot.epoch.isEqualOrAfter(start), + "Got %s, but requested snapshot of %s", closestSnapshot.epoch, start); + EntryHolder entryHolder = getEntries(closestSnapshot.epoch, end); + return new LogState(closestSnapshot, ImmutableList.copyOf(entryHolder.entries)); + } + } + catch (IOException e) + { + throw new RuntimeException(e); + } + } + class EntryHolder { SortedSet entries; @@ -146,6 +202,11 @@ private boolean isContinuous() return true; } + public Iterator iterator() + { + return entries.iterator(); + } + private ImmutableList immutable() { return ImmutableList.copyOf(entries); diff --git a/src/java/org/apache/cassandra/tcm/log/LogState.java b/src/java/org/apache/cassandra/tcm/log/LogState.java index 03294e9ffb77..7e050e080550 100644 --- a/src/java/org/apache/cassandra/tcm/log/LogState.java +++ b/src/java/org/apache/cassandra/tcm/log/LogState.java @@ -23,10 +23,10 @@ import java.util.Optional; import com.google.common.collect.ImmutableList; - import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import accord.utils.Invariants; import org.apache.cassandra.concurrent.ScheduledExecutors; import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.db.SystemKeyspace; @@ -72,6 +72,10 @@ public static IVersionedSerializer messageSerializer(Version version) // Uses Replication rather than an just a list of entries primarily to avoid duplicating the existing serializer public LogState(ClusterMetadata baseState, ImmutableList entries) { + Invariants.require(baseState == null || + entries.isEmpty() || + entries.get(0).epoch.isDirectlyAfter(baseState.epoch), + "Base state: %s, first entry: %s", baseState == null ? null : baseState.epoch, entries.isEmpty() ? null : entries.get(0).epoch); this.baseState = baseState; this.entries = entries; } diff --git a/src/java/org/apache/cassandra/tcm/log/LogStorage.java b/src/java/org/apache/cassandra/tcm/log/LogStorage.java index 3d5e681b168c..e739a8aae799 100644 --- a/src/java/org/apache/cassandra/tcm/log/LogStorage.java +++ b/src/java/org/apache/cassandra/tcm/log/LogStorage.java @@ -56,6 +56,12 @@ public LogState getLogState(Epoch startEpoch, boolean allowSnapshots) return LogState.EMPTY; } + @Override + public LogState getLogState(Epoch start, Epoch end, boolean includeSnapshot) + { + return LogState.EMPTY; + } + @Override public LogState getPersistedLogState() { @@ -68,6 +74,12 @@ public EntryHolder getEntries(Epoch since) return null; } + @Override + public EntryHolder getEntries(Epoch since, Epoch until) + { + return null; + } + @Override public MetadataSnapshots snapshots() { diff --git a/src/java/org/apache/cassandra/tcm/membership/Directory.java b/src/java/org/apache/cassandra/tcm/membership/Directory.java index 51ab84c52051..0f6cf6eb14a4 100644 --- a/src/java/org/apache/cassandra/tcm/membership/Directory.java +++ b/src/java/org/apache/cassandra/tcm/membership/Directory.java @@ -27,13 +27,14 @@ import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Preconditions; import com.google.common.collect.ImmutableList; -import com.google.common.collect.ImmutableSet; import com.google.common.collect.Multimap; import com.google.common.collect.Sets; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import accord.utils.Invariants; +import accord.utils.btree.BTreeSet; import org.apache.cassandra.db.TypeSizes; import org.apache.cassandra.io.util.DataInputPlus; import org.apache.cassandra.io.util.DataOutputPlus; @@ -62,6 +63,7 @@ public class Directory implements MetadataValue private final int nextId; private final Epoch lastModified; private final BTreeBiMap peers; + private final BTreeSet removedNodes; private final BTreeMap locations; public final BTreeMap states; public final BTreeMap versions; @@ -77,6 +79,7 @@ public Directory() this(1, Epoch.EMPTY, BTreeBiMap.empty(), + BTreeSet.empty(RemovedNode::compareTo), BTreeMap.empty(), BTreeMap.empty(), BTreeMap.empty(), @@ -89,6 +92,7 @@ public Directory() private Directory(int nextId, Epoch lastModified, BTreeBiMap peers, + BTreeSet removedNodes, BTreeMap locations, BTreeMap states, BTreeMap versions, @@ -100,6 +104,7 @@ private Directory(int nextId, this.nextId = nextId; this.lastModified = lastModified; this.peers = peers; + this.removedNodes = removedNodes; this.locations = locations; this.states = states; this.versions = versions; @@ -146,7 +151,7 @@ public Epoch lastModified() @Override public Directory withLastModified(Epoch epoch) { - return new Directory(nextId, epoch, peers, locations, states, versions, hostIds, addresses, endpointsByDC, racksByDC); + return new Directory(nextId, epoch, peers, removedNodes, locations, states, versions, hostIds, addresses, endpointsByDC, racksByDC); } public Directory withNonUpgradedNode(NodeAddresses addresses, @@ -168,6 +173,11 @@ public Directory with(NodeAddresses addresses, Location location) return with(addresses, location, CURRENT); } + public Directory with(NodeId id, NodeAddresses addresses, Location location, NodeVersion nodeVersion) + { + return with(addresses, id, id.toUUID(), location, nodeVersion); + } + public Directory with(NodeAddresses addresses, Location location, NodeVersion nodeVersion) { NodeId id = new NodeId(nextId); @@ -188,6 +198,7 @@ private Directory with(NodeAddresses nodeAddresses, NodeId id, UUID hostId, Loca return new Directory(nextId + 1, lastModified, peers.without(id).with(id, nodeAddresses.broadcastAddress), + removedNodes, locations.withForce(id, location), states.withForce(id, NodeState.REGISTERED), versions.withForce(id, nodeVersion), @@ -199,14 +210,14 @@ private Directory with(NodeAddresses nodeAddresses, NodeId id, UUID hostId, Loca public Directory withNodeState(NodeId id, NodeState state) { - return new Directory(nextId, lastModified, peers, locations, states.withForce(id, state), versions, hostIds, addresses, endpointsByDC, racksByDC); + return new Directory(nextId, lastModified, peers, removedNodes, locations, states.withForce(id, state), versions, hostIds, addresses, endpointsByDC, racksByDC); } public Directory withNodeVersion(NodeId id, NodeVersion version) { if (Objects.equals(versions.get(id), version)) return this; - return new Directory(nextId, lastModified, peers, locations, states, versions.withForce(id, version), hostIds, addresses, endpointsByDC, racksByDC); + return new Directory(nextId, lastModified, peers, removedNodes, locations, states, versions.withForce(id, version), hostIds, addresses, endpointsByDC, racksByDC); } public Directory withNodeAddresses(NodeId id, NodeAddresses nodeAddresses) @@ -228,7 +239,7 @@ public Directory withNodeAddresses(NodeId id, NodeAddresses nodeAddresses) BTreeMap> updatedEndpointsByRack = racksByDC.withForce(location(id).datacenter, rackEP); return new Directory(nextId, lastModified, - peers.withForce(id,nodeAddresses.broadcastAddress), locations, states, versions, hostIds, addresses.withForce(id, nodeAddresses), + peers.withForce(id,nodeAddresses.broadcastAddress), removedNodes, locations, states, versions, hostIds, addresses.withForce(id, nodeAddresses), updatedEndpointsByDC, updatedEndpointsByRack); } @@ -237,13 +248,12 @@ public Directory withRackAndDC(NodeId id) { InetAddressAndPort endpoint = peers.get(id); Location location = locations.get(id); - BTreeMultimap rackEP = (BTreeMultimap) racksByDC.get(location.datacenter); if (rackEP == null) rackEP = BTreeMultimap.empty(); rackEP = rackEP.with(location.rack, endpoint); - return new Directory(nextId, lastModified, peers, locations, states, versions, hostIds, addresses, + return new Directory(nextId, lastModified, peers, removedNodes, locations, states, versions, hostIds, addresses, endpointsByDC.with(location.datacenter, endpoint), racksByDC.withForce(location.datacenter, rackEP)); } @@ -263,12 +273,48 @@ public Directory withoutRackAndDC(NodeId id) newRacksByDC = racksByDC.without(location.datacenter); else newRacksByDC = racksByDC.withForce(location.datacenter, rackEP); - return new Directory(nextId, lastModified, peers, locations, states, versions, hostIds, addresses, + return new Directory(nextId, lastModified, peers, removedNodes, locations, states, versions, hostIds, addresses, endpointsByDC.without(location.datacenter, endpoint), newRacksByDC); } - public Directory without(NodeId id) + public Directory withUpdatedRackAndDc(NodeId id, Location location) + { + if (!peers.containsKey(id)) + throw new IllegalArgumentException(String.format("Node %s has no registered location to update", id)); + + return withoutRackAndDC(id).withLocation(id, location).withRackAndDC(id); + } + + private Directory withLocation(NodeId id, Location location) + { + if (!locations.containsKey(id)) + throw new IllegalArgumentException(String.format("Node %s has no registered location to update", id)); + + if (locations.get(id).equals(location)) + return this; + + return new Directory(nextId, lastModified, peers, removedNodes, locations.withForce(id, location), states, versions, hostIds, + addresses, endpointsByDC, racksByDC); + } + + public Directory removed(Epoch removedIn, NodeId id, InetAddressAndPort addr) + { + Invariants.require(!peers.containsKey(id)); + return new Directory(nextId, + lastModified, + peers, + removedNodes.with(new RemovedNode(removedIn, id, addr)), + locations, + states, + versions, + hostIds, + addresses, + endpointsByDC, + racksByDC); + } + + public Directory without(Epoch removedIn, NodeId id) { InetAddressAndPort endpoint = peers.get(id); Location location = locations.get(id); @@ -280,6 +326,7 @@ public Directory without(NodeId id) return new Directory(nextId, lastModified, peers.without(id), + removedNodes.with(new RemovedNode(removedIn, id, peers.get(id))), locations.without(id), states.without(id), versions.without(id), @@ -296,6 +343,7 @@ public Directory without(NodeId id) return new Directory(nextId, lastModified, peers.without(id), + removedNodes.with(new RemovedNode(removedIn, id, peers.get(id))), locations.without(id), states.without(id), versions.without(id), @@ -336,9 +384,14 @@ public ImmutableList allAddresses() return ImmutableList.copyOf(peers.values()); } - public ImmutableSet peerIds() + public NavigableSet peerIds() { - return ImmutableSet.copyOf(peers.keySet()); + return peers.keySet(); + } + + public BTreeSet removedNodes() + { + return removedNodes; } public NodeAddresses getNodeAddresses(NodeId id) @@ -558,6 +611,17 @@ public void serialize(Directory t, DataOutputPlus out, Version version) throws I } } Epoch.serializer.serialize(t.lastModified, out, version); + + if (version.isAtLeast(Version.V7)) + { + out.writeInt(t.removedNodes.size()); + for (RemovedNode removedNode : t.removedNodes) + { + out.writeLong(removedNode.removedIn.getEpoch()); + NodeId.serializer.serialize(removedNode.id, out, version); + InetAddressAndPort.MetadataSerializer.serializer.serialize(removedNode.endpoint, out, version); + } + } } public Directory deserialize(DataInputPlus in, Version version) throws IOException @@ -613,9 +677,23 @@ public Directory deserialize(DataInputPlus in, Version version) throws IOExcepti else nextId = maxId.id() + 1; } + + if (version.isAtLeast(Version.V7)) + { + int removedNodes = in.readInt(); + for (int i = 0; i < removedNodes; i++) + { + long epoch = in.readLong(); + NodeId nodeId = NodeId.serializer.deserialize(in, version); + InetAddressAndPort addr = InetAddressAndPort.MetadataSerializer.serializer.deserialize(in, version); + newDir.removed(Epoch.create(epoch), nodeId, addr); + } + } + return new Directory(nextId, lastModified, newDir.peers, + newDir.removedNodes, newDir.locations, newDir.states, newDir.versions, @@ -653,6 +731,18 @@ public long serializedSize(Directory t, Version version) } } size += Epoch.serializer.serializedSize(t.lastModified, version); + + if (version.isAtLeast(Version.V7)) + { + size += TypeSizes.INT_SIZE; + for (RemovedNode removedNode : t.removedNodes) + { + size += TypeSizes.LONG_SIZE; + size += NodeId.serializer.serializedSize(removedNode.id, version); + size += InetAddressAndPort.MetadataSerializer.serializer.serializedSize(removedNode.endpoint, version); + } + } + return size; } } @@ -665,7 +755,7 @@ public boolean equals(Object o) Directory directory = (Directory) o; return Objects.equals(lastModified, directory.lastModified) && - isEquivalent(directory); + equivalentTo(directory); } private static Pair minMaxVersions(BTreeMap states, BTreeMap versions) @@ -700,7 +790,7 @@ public int hashCode() * does not check equality of lastModified */ @VisibleForTesting - public boolean isEquivalent(Directory directory) + public boolean equivalentTo(Directory directory) { return nextId == directory.nextId && Objects.equals(peers, directory.peers) && @@ -774,4 +864,37 @@ public static void dumpDiff(Logger logger, Map l, Map r) logger.warn("Value for key {} is only present in the right set: {}", k, r.get(k)); } + + + public static class RemovedNode implements Comparable + { + public final Epoch removedIn; + public final NodeId id; + public final InetAddressAndPort endpoint; + + public RemovedNode(Epoch removedIn, NodeId id, InetAddressAndPort endpoint) + { + this.removedIn = removedIn; + this.id = id; + this.endpoint = endpoint; + } + + public boolean equals(Object object) + { + if (this == object) return true; + if (object == null || getClass() != object.getClass()) return false; + RemovedNode that = (RemovedNode) object; + return Objects.equals(removedIn, that.removedIn) && Objects.equals(id, that.id) && Objects.equals(endpoint, that.endpoint); + } + + public int hashCode() + { + return Objects.hash(removedIn, id, endpoint); + } + + public int compareTo(RemovedNode o) + { + return id.compareTo(o.id); + } + } } diff --git a/src/java/org/apache/cassandra/tcm/membership/Location.java b/src/java/org/apache/cassandra/tcm/membership/Location.java index faf8230d94fe..08ad29dde983 100644 --- a/src/java/org/apache/cassandra/tcm/membership/Location.java +++ b/src/java/org/apache/cassandra/tcm/membership/Location.java @@ -67,6 +67,18 @@ public String toString() return datacenter + '/' + rack; } + public static Location fromString(String value) + { + if (value == null || value.isEmpty()) + return null; + + String[] parts = value.split(":"); + if (parts.length < 2) + throw new IllegalArgumentException("Invalid datacenter:rack - " + value); + else + return new Location(parts[0].trim(), parts[1].trim()); + } + public static class Serializer implements MetadataSerializer { public void serialize(Location t, DataOutputPlus out, Version version) throws IOException diff --git a/src/java/org/apache/cassandra/tcm/membership/NodeVersion.java b/src/java/org/apache/cassandra/tcm/membership/NodeVersion.java index bc1bcc707e3d..64d1cc71c6f5 100644 --- a/src/java/org/apache/cassandra/tcm/membership/NodeVersion.java +++ b/src/java/org/apache/cassandra/tcm/membership/NodeVersion.java @@ -34,7 +34,7 @@ public class NodeVersion implements Comparable { public static final Serializer serializer = new Serializer(); - public static final Version CURRENT_METADATA_VERSION = Version.V6; + public static final Version CURRENT_METADATA_VERSION = Version.V7; public static final NodeVersion CURRENT = new NodeVersion(new CassandraVersion(FBUtilities.getReleaseVersionString()), CURRENT_METADATA_VERSION); private static final CassandraVersion SINCE_VERSION = CassandraVersion.CASSANDRA_5_1; diff --git a/src/java/org/apache/cassandra/tcm/migration/CMSInitializationRequest.java b/src/java/org/apache/cassandra/tcm/migration/CMSInitializationRequest.java index dac50e5edbdc..599bfca0da7d 100644 --- a/src/java/org/apache/cassandra/tcm/migration/CMSInitializationRequest.java +++ b/src/java/org/apache/cassandra/tcm/migration/CMSInitializationRequest.java @@ -109,12 +109,12 @@ public long serializedSize(CMSInitializationRequest t, int version) public static class Initiator { public static final Serializer serializer = new Serializer(); - public final InetAddressAndPort initiator; + public final InetAddressAndPort endpoint; public final UUID initToken; public Initiator(InetAddressAndPort initiator, UUID initToken) { - this.initiator = initiator; + this.endpoint = initiator; this.initToken = initToken; } @@ -124,20 +124,20 @@ public boolean equals(Object o) if (this == o) return true; if (!(o instanceof Initiator)) return false; Initiator other = (Initiator) o; - return Objects.equals(initiator, other.initiator) && Objects.equals(initToken, other.initToken); + return Objects.equals(endpoint, other.endpoint) && Objects.equals(initToken, other.initToken); } @Override public int hashCode() { - return Objects.hash(initiator, initToken); + return Objects.hash(endpoint, initToken); } @Override public String toString() { return "Initiator{" + - "initiator=" + initiator + + "initiator=" + endpoint + ", initToken=" + initToken + '}'; } @@ -147,7 +147,7 @@ public static class Serializer implements IVersionedSerializer @Override public void serialize(Initiator t, DataOutputPlus out, int version) throws IOException { - InetAddressAndPort.Serializer.inetAddressAndPortSerializer.serialize(t.initiator, out, version); + InetAddressAndPort.Serializer.inetAddressAndPortSerializer.serialize(t.endpoint, out, version); UUIDSerializer.serializer.serialize(t.initToken, out, version); } @@ -161,7 +161,7 @@ public Initiator deserialize(DataInputPlus in, int version) throws IOException @Override public long serializedSize(Initiator t, int version) { - return InetAddressAndPort.Serializer.inetAddressAndPortSerializer.serializedSize(t.initiator, version) + + return InetAddressAndPort.Serializer.inetAddressAndPortSerializer.serializedSize(t.endpoint, version) + UUIDSerializer.serializer.serializedSize(t.initToken, version); } } diff --git a/src/java/org/apache/cassandra/tcm/migration/Election.java b/src/java/org/apache/cassandra/tcm/migration/Election.java index 94f5dc4a06d3..6ada116323d6 100644 --- a/src/java/org/apache/cassandra/tcm/migration/Election.java +++ b/src/java/org/apache/cassandra/tcm/migration/Election.java @@ -29,7 +29,6 @@ import java.util.stream.Collectors; import com.google.common.collect.Sets; - import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -134,7 +133,7 @@ private void finish(Set sendTo) { CMSInitializationRequest.Initiator currentInitiator = initiator.get(); if (currentInitiator != null && - Objects.equals(currentInitiator.initiator, FBUtilities.getBroadcastAddressAndPort()) && + Objects.equals(currentInitiator.endpoint, FBUtilities.getBroadcastAddressAndPort()) && initiator.compareAndSet(currentInitiator, MIGRATING)) { Startup.initializeAsFirstCMSNode(); @@ -183,7 +182,7 @@ public void abortInitialization(String initiatorEp) { InetAddressAndPort expectedInitiator = InetAddressAndPort.getByNameUnchecked(initiatorEp); CMSInitializationRequest.Initiator currentInitiator = initiator.get(); - if (currentInitiator != null && Objects.equals(currentInitiator.initiator, expectedInitiator) && initiator.compareAndSet(currentInitiator, null)) + if (currentInitiator != null && Objects.equals(currentInitiator.endpoint, expectedInitiator) && initiator.compareAndSet(currentInitiator, null)) { ClusterMetadata metadata = ClusterMetadata.current(); for (Map.Entry entry : metadata.directory.states.entrySet()) @@ -243,9 +242,11 @@ public class AbortHandler implements IVerbHandler message) throws IOException { logger.info("Received election abort message {} from {}", message.payload, message.from()); - CMSInitializationRequest.Initiator initiator = message.payload; - if (!initiator.initiator.equals(initiator().initiator) || !updateInitiator(message.payload, null)) - logger.error("Could not clear initiator - initiator is set to {}, abort message received from {}", initiator(), message.payload); + CMSInitializationRequest.Initiator remoteInitiator = message.payload; + if (initiator() == null) + logger.info("Initiator already cleared, ignoring abort message from {}: {}", message.from(), remoteInitiator); + else if (!remoteInitiator.endpoint.equals(initiator().endpoint) || !updateInitiator(remoteInitiator, null)) + logger.error("Could not clear initiator - initiator is set to {}, abort message received from {}: {}", initiator(), message.from(), remoteInitiator); } } } diff --git a/src/java/org/apache/cassandra/tcm/migration/GossipProcessor.java b/src/java/org/apache/cassandra/tcm/migration/GossipProcessor.java index 0cb654f19a49..602f52a3a7df 100644 --- a/src/java/org/apache/cassandra/tcm/migration/GossipProcessor.java +++ b/src/java/org/apache/cassandra/tcm/migration/GossipProcessor.java @@ -25,18 +25,31 @@ import org.apache.cassandra.tcm.Epoch; import org.apache.cassandra.tcm.Transformation; import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.tcm.log.LogState; public class GossipProcessor implements Processor { @Override - public Commit.Result commit(Entry.Id entryId, Transformation transform, Epoch lastKnown, Retry.Deadline retryPolicy) + public Commit.Result commit(Entry.Id entryId, Transformation transform, Epoch lastKnown, Retry retryPolicy) { throw new IllegalStateException("Can't commit transformations when running in gossip mode. Enable the ClusterMetadataService with `nodetool cms initialize`."); } @Override - public ClusterMetadata fetchLogAndWait(Epoch waitFor, Retry.Deadline retryPolicy) + public ClusterMetadata fetchLogAndWait(Epoch waitFor, Retry retryPolicy) { return ClusterMetadata.current(); } + + @Override + public LogState getLocalState(Epoch start, Epoch end, boolean includeSnapshot) + { + throw new IllegalStateException("Can't reconstruct log state when running in gossip mode. Enable the ClusterMetadataService with `nodetool addtocms`."); + } + + @Override + public LogState getLogState(Epoch start, Epoch end, boolean includeSnapshot, Retry retryPolicy) + { + throw new IllegalStateException("Can't reconstruct log state when running in gossip mode. Enable the ClusterMetadataService with `nodetool addtocms`."); + } } diff --git a/src/java/org/apache/cassandra/tcm/ownership/DataPlacement.java b/src/java/org/apache/cassandra/tcm/ownership/DataPlacement.java index f42d8b54792e..12920d68624d 100644 --- a/src/java/org/apache/cassandra/tcm/ownership/DataPlacement.java +++ b/src/java/org/apache/cassandra/tcm/ownership/DataPlacement.java @@ -184,6 +184,11 @@ public int hashCode() return Objects.hash(reads, writes); } + public boolean equivalentTo(DataPlacement other) + { + return reads.equivalentTo(other.reads) && writes.equivalentTo(other.writes); + } + public static class Serializer implements MetadataSerializer { private final IPartitioner partitioner; diff --git a/src/java/org/apache/cassandra/tcm/ownership/DataPlacements.java b/src/java/org/apache/cassandra/tcm/ownership/DataPlacements.java index 988d2b1bcb81..b89ecde7d329 100644 --- a/src/java/org/apache/cassandra/tcm/ownership/DataPlacements.java +++ b/src/java/org/apache/cassandra/tcm/ownership/DataPlacements.java @@ -141,6 +141,24 @@ public String toString() '}'; } + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (!(o instanceof DataPlacements)) return false; + DataPlacements that = (DataPlacements) o; + return this.map.equals(that.map); + } + + public boolean equivalentTo(DataPlacements other) + { + if (!map.keySet().equals(other.map.keySet())) + return false; + return map.entrySet() + .stream() + .allMatch(e -> e.getValue().equivalentTo(other.get(e.getKey()))); + } + public static DataPlacements sortReplicaGroups(DataPlacements placements, Comparator comparator) { Builder builder = DataPlacements.builder(placements.size()); diff --git a/src/java/org/apache/cassandra/tcm/ownership/ReplicaGroups.java b/src/java/org/apache/cassandra/tcm/ownership/ReplicaGroups.java index adc26ff820c9..33d160fa10a8 100644 --- a/src/java/org/apache/cassandra/tcm/ownership/ReplicaGroups.java +++ b/src/java/org/apache/cassandra/tcm/ownership/ReplicaGroups.java @@ -534,4 +534,19 @@ public int hashCode() { return Objects.hash(ranges, endpoints); } + + public boolean equivalentTo(ReplicaGroups other) + { + if (!ranges.equals(other.ranges)) + return false; + + for (int i = 0; i < ranges.size(); i++) + { + EndpointsForRange e1 = endpoints.get(i).get(); + EndpointsForRange e2 = other.forRange(ranges.get(i)).get(); + if (e1.size() != e2.size() || !e1.stream().allMatch(e2::contains)) + return false; + } + return true; + } } diff --git a/src/java/org/apache/cassandra/tcm/ownership/TokenMap.java b/src/java/org/apache/cassandra/tcm/ownership/TokenMap.java index c32f6c351c1a..ca0a3eb93023 100644 --- a/src/java/org/apache/cassandra/tcm/ownership/TokenMap.java +++ b/src/java/org/apache/cassandra/tcm/ownership/TokenMap.java @@ -27,6 +27,7 @@ import java.util.Objects; import com.google.common.collect.ImmutableList; +import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -41,7 +42,6 @@ import org.apache.cassandra.tcm.membership.NodeId; import org.apache.cassandra.tcm.serialization.MetadataSerializer; import org.apache.cassandra.tcm.serialization.Version; -import org.apache.cassandra.utils.BiMultiValMap; import org.apache.cassandra.utils.FBUtilities; import org.apache.cassandra.utils.SortedBiMultiValMap; @@ -112,7 +112,7 @@ public TokenMap unassignTokens(NodeId id, Collection tokens) return new TokenMap(lastModified, partitioner, finalisedCopy); } - public BiMultiValMap asMap() + public SortedBiMultiValMap asMap() { return SortedBiMultiValMap.create(map); } @@ -164,7 +164,14 @@ private static void maybeAdd(List> ranges, Range r) ranges.add(r); } - public Token nextToken(List tokens, Token token) + public Token getPredecessor(Token token) + { + int index = Collections.binarySearch(tokens, token); + assert index >= 0 : token + " not found in " + StringUtils.join(map.keySet(), ", "); + return index == 0 ? tokens.get(tokens.size() - 1) : tokens.get(index - 1); + } + + public static Token nextToken(List tokens, Token token) { return tokens.get(nextTokenIndex(tokens, token)); } @@ -255,7 +262,7 @@ public boolean equals(Object o) if (!(o instanceof TokenMap)) return false; TokenMap tokenMap = (TokenMap) o; return Objects.equals(lastModified, tokenMap.lastModified) && - isEquivalent(tokenMap); + equivalentTo(tokenMap); } @Override @@ -269,7 +276,7 @@ public int hashCode() * * does not check equality of lastModified */ - public boolean isEquivalent(TokenMap tokenMap) + public boolean equivalentTo(TokenMap tokenMap) { return Objects.equals(map, tokenMap.map) && Objects.equals(partitioner, tokenMap.partitioner); diff --git a/src/java/org/apache/cassandra/tcm/ownership/VersionedEndpoints.java b/src/java/org/apache/cassandra/tcm/ownership/VersionedEndpoints.java index 90148f2c836e..2f429138d2d4 100644 --- a/src/java/org/apache/cassandra/tcm/ownership/VersionedEndpoints.java +++ b/src/java/org/apache/cassandra/tcm/ownership/VersionedEndpoints.java @@ -116,7 +116,9 @@ public boolean equals(Object o) if (this == o) return true; if (o == null || getClass() != o.getClass()) return false; ForRange forRange = (ForRange) o; - return Objects.equals(endpointsForRange.sorted(Replica::compareTo), forRange.endpointsForRange.sorted(Replica::compareTo)); + return lastModified.equals(forRange.lastModified) && + Objects.equals(endpointsForRange.sorted(Replica::compareTo), + forRange.endpointsForRange.sorted(Replica::compareTo)); } public boolean isEmpty() @@ -184,7 +186,8 @@ public boolean equals(Object o) if (this == o) return true; if (o == null || getClass() != o.getClass()) return false; ForToken forToken = (ForToken) o; - return Objects.equals(endpointsForToken, forToken.endpointsForToken); + return lastModified.equals(forToken.lastModified) && + Objects.equals(endpointsForToken, forToken.endpointsForToken); } public boolean isEmpty() diff --git a/src/java/org/apache/cassandra/tcm/sequences/BootstrapAndJoin.java b/src/java/org/apache/cassandra/tcm/sequences/BootstrapAndJoin.java index 15182fc92b2a..0e32f18c2507 100644 --- a/src/java/org/apache/cassandra/tcm/sequences/BootstrapAndJoin.java +++ b/src/java/org/apache/cassandra/tcm/sequences/BootstrapAndJoin.java @@ -25,6 +25,7 @@ import java.util.stream.StreamSupport; import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.Lists; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -39,8 +40,10 @@ import org.apache.cassandra.locator.EndpointsByReplica; import org.apache.cassandra.locator.InetAddressAndPort; import org.apache.cassandra.locator.Replica; +import org.apache.cassandra.repair.autorepair.AutoRepairUtils; import org.apache.cassandra.schema.Schema; import org.apache.cassandra.service.StorageService; +import org.apache.cassandra.service.accord.AccordService; import org.apache.cassandra.streaming.StreamState; import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.tcm.ClusterMetadataService; @@ -60,6 +63,7 @@ import org.apache.cassandra.utils.JVMStabilityInspector; import org.apache.cassandra.utils.Pair; import org.apache.cassandra.utils.concurrent.Future; +import org.apache.cassandra.utils.concurrent.FutureCombiner; import org.apache.cassandra.utils.vint.VIntCoding; import static com.google.common.collect.ImmutableList.of; @@ -240,6 +244,9 @@ public SequenceState executeNext() .filter(cfs -> Schema.instance.getUserKeyspaces().names().contains(cfs.keyspace.getName())) .forEach(cfs -> cfs.indexManager.executePreJoinTasksBlocking(true)); ClusterMetadataService.instance().commit(midJoin); + + // this node might have just bootstrapped; check if we should run repair immediately + AutoRepairUtils.runRepairOnNewlyBootstrappedNodeIfEnabled(); } else { @@ -356,12 +363,14 @@ public static boolean bootstrap(final Collection tokens, StorageService.instance.repairPaxosForTopologyChange("bootstrap"); Future bootstrapStream = StorageService.instance.startBootstrap(metadata, beingReplaced, movements, strictMovements); + Future accordReady = AccordService.instance().epochReady(metadata.epoch); + Future ready = FutureCombiner.allOf(Lists.newArrayList(bootstrapStream, accordReady)); try { if (bootstrapTimeoutMillis > 0) - bootstrapStream.get(bootstrapTimeoutMillis, MILLISECONDS); + ready.get(bootstrapTimeoutMillis, MILLISECONDS); else - bootstrapStream.get(); + ready.get(); StorageService.instance.markViewsAsBuilt(); StorageService.instance.clearOngoingBootstrap(); logger.info("Bootstrap completed for tokens {}", tokens); diff --git a/src/java/org/apache/cassandra/tcm/sequences/BootstrapAndReplace.java b/src/java/org/apache/cassandra/tcm/sequences/BootstrapAndReplace.java index 2b283d6905af..8bc73d142eb9 100644 --- a/src/java/org/apache/cassandra/tcm/sequences/BootstrapAndReplace.java +++ b/src/java/org/apache/cassandra/tcm/sequences/BootstrapAndReplace.java @@ -44,6 +44,7 @@ import org.apache.cassandra.io.util.DataOutputPlus; import org.apache.cassandra.locator.EndpointsByReplica; import org.apache.cassandra.locator.InetAddressAndPort; +import org.apache.cassandra.repair.autorepair.AutoRepairUtils; import org.apache.cassandra.schema.Schema; import org.apache.cassandra.service.StorageService; import org.apache.cassandra.tcm.ClusterMetadata; @@ -240,6 +241,9 @@ public SequenceState executeNext() .filter(cfs -> Schema.instance.getUserKeyspaces().names().contains(cfs.keyspace.getName())) .forEach(cfs -> cfs.indexManager.executePreJoinTasksBlocking(true)); ClusterMetadataService.instance().commit(midReplace); + + // this node might have just bootstrapped; check if we should run repair immediately + AutoRepairUtils.runRepairOnNewlyBootstrappedNodeIfEnabled(); } else { diff --git a/src/java/org/apache/cassandra/tcm/sequences/CancelCMSReconfiguration.java b/src/java/org/apache/cassandra/tcm/sequences/CancelCMSReconfiguration.java index 3d6499f61b06..5ce36e09a814 100644 --- a/src/java/org/apache/cassandra/tcm/sequences/CancelCMSReconfiguration.java +++ b/src/java/org/apache/cassandra/tcm/sequences/CancelCMSReconfiguration.java @@ -33,6 +33,7 @@ import org.apache.cassandra.schema.KeyspaceParams; import org.apache.cassandra.schema.ReplicationParams; import org.apache.cassandra.schema.SchemaConstants; +import org.apache.cassandra.service.accord.fastpath.FastPathStrategy; import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.tcm.Transformation; import org.apache.cassandra.tcm.membership.Directory; @@ -77,7 +78,7 @@ public Result execute(ClusterMetadata prev) .withoutWriteReplica(prev.nextEpoch(), pendingReplica) .build(); } - if (!placement.reads.equals(placement.writes)) + if (!placement.reads.equivalentTo(placement.writes)) return new Rejected(ExceptionCode.INVALID, String.format("Placements will be inconsistent if this transformation is applied:\nReads %s\nWrites: %s", placement.reads, placement.writes)); @@ -94,7 +95,7 @@ public Result execute(ClusterMetadata prev) // Also update schema with the corrected params KeyspaceMetadata keyspace = prev.schema.getKeyspaceMetadata(SchemaConstants.METADATA_KEYSPACE_NAME); - KeyspaceMetadata newKeyspace = keyspace.withSwapped(new KeyspaceParams(keyspace.params.durableWrites, fromPlacement)); + KeyspaceMetadata newKeyspace = keyspace.withSwapped(new KeyspaceParams(keyspace.params.durableWrites, fromPlacement, FastPathStrategy.simple())); transformer = transformer.with(new DistributedSchema(prev.schema.getKeyspaces().withAddedOrUpdated(newKeyspace))); } diff --git a/src/java/org/apache/cassandra/tcm/sequences/DropAccordTable.java b/src/java/org/apache/cassandra/tcm/sequences/DropAccordTable.java new file mode 100644 index 000000000000..68fc2471ac59 --- /dev/null +++ b/src/java/org/apache/cassandra/tcm/sequences/DropAccordTable.java @@ -0,0 +1,324 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.tcm.sequences; + +import java.io.IOException; +import java.time.Duration; +import java.util.Objects; +import java.util.concurrent.ExecutionException; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.schema.Keyspaces; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.service.accord.AccordService; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.tcm.ClusterMetadataService; +import org.apache.cassandra.tcm.Epoch; +import org.apache.cassandra.tcm.MultiStepOperation; +import org.apache.cassandra.tcm.Transformation; +import org.apache.cassandra.tcm.serialization.AsymmetricMetadataSerializer; +import org.apache.cassandra.tcm.serialization.MetadataSerializer; +import org.apache.cassandra.tcm.serialization.Version; +import org.apache.cassandra.tcm.transformations.FinishDropAccordTable; +import org.apache.cassandra.utils.JVMStabilityInspector; + +import static org.apache.cassandra.tcm.Transformation.Kind.FINISH_DROP_ACCORD_TABLE; +import static org.apache.cassandra.tcm.sequences.SequenceState.continuable; +import static org.apache.cassandra.tcm.sequences.SequenceState.error; +import static org.apache.cassandra.tcm.sequences.SequenceState.halted; +import static org.apache.cassandra.utils.Clock.Global.nanoTime; + +/** + * A slightly atypical implementation as it consists of only a single step. To perform the drop of an + * Accord table, we first commit a PrepareDropAccordTable transformation. Upon enactement, that + * marks the table as pending drop, which blocks any new transactions from being started. It also + * instantiates an instance of this operation and adds it to the set of in progress operations. + * + * The intention is to introduce a barrier which blocks until the Accord service acknowledges that + * it was learned of the epoch in which the table was marked for deletion and that all prior transactions + * are completed. Once this is complete, we can proceed to actually drop the table. The transformation + * which performs that schema modification also removes this MSO from ClusterMetadata's in-flight set. + * This obviates the need to 'advance' this MSO in the way that other implementations with more steps do. + * + */ +public class DropAccordTable extends MultiStepOperation +{ + private static final Logger logger = LoggerFactory.getLogger(DropAccordTable.class); + + public static final Serializer serializer = new Serializer(); + + public final TableReference table; + + public static DropAccordTable newSequence(TableReference table, Epoch preparedAt) + { + return new DropAccordTable(table, preparedAt); + } + + /** + * Used by factory method for external callers and by the serializer. + * We don't need to include the serialized FinishDropAccordTable step in the serialization + * of the MSO itself because they have no parameters other than the table reference and so + * we can just construct a new one when we execute it + */ + private DropAccordTable(TableReference table, Epoch latestModification) + { + super(0, latestModification); + this.table = table; + } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + DropAccordTable that = (DropAccordTable) o; + return latestModification.equals(that.latestModification) + && table.equals(that.table); + } + + @Override + public int hashCode() + { + return Objects.hash(latestModification, table); + } + + @Override + public Kind kind() + { + return Kind.DROP_ACCORD_TABLE; + } + + @Override + protected SequenceKey sequenceKey() + { + return table; + } + + @Override + public MetadataSerializer keySerializer() + { + return TableReference.serializer; + } + + @Override + public Transformation.Kind nextStep() + { + return FINISH_DROP_ACCORD_TABLE; + } + + @Override + public SequenceState executeNext() + { + try + { + SequenceState failure = awaitSafeFromAccord(); + if (failure != null) return failure; + } + catch (Throwable t) + { + JVMStabilityInspector.inspectThrowable(t); + logger.warn("Exception while waiting for Accord service to notify all table txns are complete", t); + // this is actually continuable as we can simply retry + return continuable(); + } + try + { + // Now we're satisfied that all Accord txns have finished for the table, + // go ahead and actually drop it + ClusterMetadataService.instance().commit(new FinishDropAccordTable(table)); + return continuable(); + } + catch (Throwable t) + { + JVMStabilityInspector.inspectThrowable(t); + logger.warn("Exception committing finish_drop_accord_table. " + + "Accord service has acknowledged the operation but table remains present in schema", t); + return halted(); + } + } + + private SequenceState awaitSafeFromAccord() throws ExecutionException, InterruptedException + { + // make sure that Accord sees the current epoch, which must necessarily follow the + // one which marked the table as pending drop + ClusterMetadata metadata = ClusterMetadata.current(); + // just for the sake of paranoia, assert that the table is actually marked as being dropped + if (!verifyTableMarked(metadata.schema.getKeyspaces())) + return error(new IllegalStateException(String.format("Table %s is in an invalid state to be dropped", table))); + + long startNanos = nanoTime(); + AccordService.instance().epochReady(metadata.epoch).get(); + long epochEndNanos = nanoTime(); + + // As of this writing this logic is based off ExclusiveSyncPoints which is a bit heavy weight for what is needed, this could cause timeouts for clusters that have a lot of data. + // There are retries baked into this call, but trying to handle timeouts more broadly is put on hold as there is active work to define a EpochSyncPoint that should be far cheaper + // which would avoid the timeout issues + // NOTE: ExclusiveSyncPoint must find all keys in the range, then make sure nothing is blocking them... this causes a lot of IO. EpochSyncPoint just needs to validate that the last txn processed is in the newer epoch, this can work with in-memory state. + AccordService.instance().awaitDone(table.id, metadata.epoch.getEpoch()); + long awaitEndNanos = nanoTime(); + logger.info("Wait for Accord to see the drop table was success. " + + "Took {} to wait for Accord to learn about the change, then {} to process everything", + Duration.ofNanos(epochEndNanos - startNanos), Duration.ofNanos(awaitEndNanos - epochEndNanos)); + return null; + } + + private boolean verifyTableMarked(Keyspaces keyspaces) + { + TableMetadata tm = keyspaces.getTableOrViewNullable(table.id); + if (tm == null) + { + logger.warn("Unable to drop accord table {}, table not found", table); + return false; + } + + if (!tm.params.pendingDrop) + { + logger.warn("Unexpected state, table {} was not marked pending drop", table); + return false; + } + + return true; + } + + @Override + public Transformation.Result applyTo(ClusterMetadata metadata) + { + // note: that this will apply the finish drop transformation to the supplied metadata. It's + // not used to actually execute the MSO, but to determine what the metadata state will/would + // be if it were executed. + return new FinishDropAccordTable(table).execute(metadata); + } + + @Override + public DropAccordTable advance(Epoch epoch) + { + // note: this isn't really used by this MSO impl as it consists of a single step so there's nothing + // to advance. An action of the single step is to remove the MSO from the set of in progress sequences + return new DropAccordTable(this.table, epoch); + } + + @Override + public ProgressBarrier barrier() + { + return ProgressBarrier.immediate(); + } + + public static class TableReference implements SequenceKey, Comparable + { + public static final Serializer serializer = new Serializer(); + + public final TableId id; + + public TableReference(TableId id) + { + this.id = id; + } + + public static TableReference from(TableMetadata metadata) + { + return new TableReference(metadata.id); + } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + TableReference that = (TableReference) o; + return id.equals(that.id); + } + + @Override + public int hashCode() + { + return Objects.hash(id); + } + + @Override + public int compareTo(TableReference o) + { + return id.compareTo(o.id); + } + + @Override + public String toString() + { + return "TableReference{id=" + id + '}'; + } + + public static class Serializer implements MetadataSerializer + { + @Override + public void serialize(TableReference t, DataOutputPlus out, Version version) throws IOException + { + t.id.serialize(out); + } + + @Override + public TableReference deserialize(DataInputPlus in, Version version) throws IOException + { + TableId id = TableId.deserialize(in); + return new TableReference(id); + } + + @Override + public long serializedSize(TableReference t, Version version) + { + return t.id.serializedSize(); + } + } + } + + public static class Serializer implements AsymmetricMetadataSerializer, DropAccordTable> + { + @Override + public void serialize(MultiStepOperation t, DataOutputPlus out, Version version) throws IOException + { + DropAccordTable plan = (DropAccordTable) t; + Epoch.serializer.serialize(plan.latestModification, out, version); + // This type of sequence only has a single step so no need to include the index in serde. + // Similarly, the only parameter to that single step (FinishDropAccordTable) is the table + // reference, so that's all we really need to include in the serialization. + TableReference.serializer.serialize(plan.table, out, version); + } + + @Override + public DropAccordTable deserialize(DataInputPlus in, Version version) throws IOException + { + Epoch lastModified = Epoch.serializer.deserialize(in, version); + TableReference table = TableReference.serializer.deserialize(in, version); + return new DropAccordTable(table, lastModified); + } + + @Override + public long serializedSize(MultiStepOperation t, Version version) + { + DropAccordTable plan = (DropAccordTable) t; + long size = 0; + size += Epoch.serializer.serializedSize(plan.latestModification, version); + size += TableReference.serializer.serializedSize(plan.table, version); + return size; + } + } +} diff --git a/src/java/org/apache/cassandra/tcm/sequences/InProgressSequences.java b/src/java/org/apache/cassandra/tcm/sequences/InProgressSequences.java index 735a7f693571..840c2505f423 100644 --- a/src/java/org/apache/cassandra/tcm/sequences/InProgressSequences.java +++ b/src/java/org/apache/cassandra/tcm/sequences/InProgressSequences.java @@ -27,6 +27,7 @@ import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; import org.apache.cassandra.io.util.DataInputPlus; import org.apache.cassandra.io.util.DataOutputPlus; @@ -59,27 +60,27 @@ private InProgressSequences(Epoch lastModified, ImmutableMap sequence = metadata.inProgressSequences.get(sequenceKey); if (sequence == null) - break; + return metadata; if (onlyStartupSafeSequences && !sequence.finishDuringStartup()) - break; + return metadata; if (isLeave(sequence)) StorageService.instance.maybeInitializeServices(); if (resume(sequence)) metadata = ClusterMetadata.current(); else - return; + return metadata; } } @@ -225,6 +226,11 @@ public Iterator> iterator() return state.values().iterator(); } + public ImmutableSet keys() + { + return state.keySet(); + } + public static class Serializer implements MetadataSerializer { public void serialize(InProgressSequences t, DataOutputPlus out, Version version) throws IOException diff --git a/src/java/org/apache/cassandra/tcm/sequences/Move.java b/src/java/org/apache/cassandra/tcm/sequences/Move.java index 99ada06f9356..450889694e9a 100644 --- a/src/java/org/apache/cassandra/tcm/sequences/Move.java +++ b/src/java/org/apache/cassandra/tcm/sequences/Move.java @@ -49,8 +49,10 @@ import org.apache.cassandra.schema.ReplicationParams; import org.apache.cassandra.schema.Schema; import org.apache.cassandra.service.StorageService; +import org.apache.cassandra.service.accord.AccordService; import org.apache.cassandra.streaming.StreamOperation; import org.apache.cassandra.streaming.StreamPlan; +import org.apache.cassandra.streaming.StreamResultFuture; import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.tcm.ClusterMetadataService; import org.apache.cassandra.tcm.Epoch; @@ -68,6 +70,8 @@ import org.apache.cassandra.tcm.transformations.PrepareMove; import org.apache.cassandra.utils.FBUtilities; import org.apache.cassandra.utils.JVMStabilityInspector; +import org.apache.cassandra.utils.concurrent.Future; +import org.apache.cassandra.utils.concurrent.FutureCombiner; import org.apache.cassandra.utils.vint.VIntCoding; import static com.google.common.collect.ImmutableList.of; @@ -211,11 +215,12 @@ public SequenceState executeNext() case MID_MOVE: try { + ClusterMetadata metadata = ClusterMetadata.current(); logger.info("fetching new ranges and streaming old ranges"); StreamPlan streamPlan = new StreamPlan(StreamOperation.RELOCATION); Keyspaces keyspaces = Schema.instance.getNonLocalStrategyKeyspaces(); Map movementMap = movementMap(FailureDetector.instance, - ClusterMetadata.current().placements, + metadata.placements, toSplitRanges, startMove.delta(), midMove.delta(), @@ -225,9 +230,12 @@ public SequenceState executeNext() for (KeyspaceMetadata ks : keyspaces) { ReplicationParams replicationParams = ks.params.replication; - if (replicationParams.isMeta()) + if (replicationParams.isMeta() || !StreamPlan.hasNonAccordTables(ks)) continue; + EndpointsByReplica endpoints = movementMap.get(replicationParams); + + String[] cfNames = StreamPlan.nonAccordTablesForKeyspace(ks); for (Map.Entry e : endpoints.flattenEntries()) { Replica destination = e.getKey(); @@ -235,20 +243,22 @@ public SequenceState executeNext() logger.info("Stream source: {} destination: {}", source, destination); assert !source.endpoint().equals(destination.endpoint()) : String.format("Source %s should not be the same as destionation %s", source, destination); if (source.isSelf()) - streamPlan.transferRanges(destination.endpoint(), ks.name, RangesAtEndpoint.of(destination)); + streamPlan.transferRanges(destination.endpoint(), ks.name, RangesAtEndpoint.of(destination), cfNames); else if (destination.isSelf()) { if (destination.isFull()) - streamPlan.requestRanges(source.endpoint(), ks.name, RangesAtEndpoint.of(destination), RangesAtEndpoint.empty(destination.endpoint())); + streamPlan.requestRanges(source.endpoint(), ks.name, RangesAtEndpoint.of(destination), RangesAtEndpoint.empty(destination.endpoint()), cfNames); else - streamPlan.requestRanges(source.endpoint(), ks.name, RangesAtEndpoint.empty(destination.endpoint()), RangesAtEndpoint.of(destination)); + streamPlan.requestRanges(source.endpoint(), ks.name, RangesAtEndpoint.empty(destination.endpoint()), RangesAtEndpoint.of(destination), cfNames); } else throw new IllegalStateException("Node should be either source or destination in the movement map " + endpoints); } } - streamPlan.execute().get(); + StreamResultFuture streamResult = streamPlan.execute(); + Future accordReady = AccordService.instance().epochReady(metadata.epoch); + FutureCombiner.allOf(streamResult, accordReady).get(); StorageService.instance.repairPaxosForTopologyChange("move"); } catch (InterruptedException e) diff --git a/src/java/org/apache/cassandra/tcm/sequences/ProgressBarrier.java b/src/java/org/apache/cassandra/tcm/sequences/ProgressBarrier.java index af504d35d362..56eda1828382 100644 --- a/src/java/org/apache/cassandra/tcm/sequences/ProgressBarrier.java +++ b/src/java/org/apache/cassandra/tcm/sequences/ProgressBarrier.java @@ -39,7 +39,7 @@ import org.apache.cassandra.db.ConsistencyLevel; import org.apache.cassandra.dht.Range; import org.apache.cassandra.dht.Token; -import org.apache.cassandra.exceptions.RequestFailureReason; +import org.apache.cassandra.exceptions.RequestFailure; import org.apache.cassandra.locator.EndpointsForRange; import org.apache.cassandra.locator.InetAddressAndPort; import org.apache.cassandra.locator.Replica; @@ -50,6 +50,8 @@ import org.apache.cassandra.net.RequestCallbackWithFailure; import org.apache.cassandra.net.Verb; import org.apache.cassandra.schema.ReplicationParams; +import org.apache.cassandra.service.RetryStrategy; +import org.apache.cassandra.service.WaitStrategy; import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.tcm.Epoch; import org.apache.cassandra.tcm.Retry; @@ -58,6 +60,13 @@ import org.apache.cassandra.utils.Clock; import org.apache.cassandra.utils.concurrent.AsyncPromise; +import static org.apache.cassandra.config.DatabaseDescriptor.getCmsDefaultRetryMaxTries; +import static org.apache.cassandra.config.DatabaseDescriptor.getProgressBarrierBackoff; +import static org.apache.cassandra.config.DatabaseDescriptor.getProgressBarrierDefaultConsistencyLevel; +import static org.apache.cassandra.config.DatabaseDescriptor.getProgressBarrierMinConsistencyLevel; +import static org.apache.cassandra.config.DatabaseDescriptor.getProgressBarrierTimeout; +import static org.apache.cassandra.service.TimeoutStrategy.LatencySourceFactory.none; + /** * ProgressBarrier is responsible for ensuring that epoch visibility plays together with quorum consistency. * @@ -73,10 +82,16 @@ public class ProgressBarrier { private static final Logger logger = LoggerFactory.getLogger(ProgressBarrier.class); - private static final ConsistencyLevel MIN_CL = DatabaseDescriptor.getProgressBarrierMinConsistencyLevel(); - private static final ConsistencyLevel DEFAULT_CL = DatabaseDescriptor.getProgressBarrierDefaultConsistencyLevel(); - private static final long TIMEOUT_MILLIS = DatabaseDescriptor.getProgressBarrierTimeout(TimeUnit.MILLISECONDS); - private static final long BACKOFF_MILLIS = DatabaseDescriptor.getProgressBarrierBackoff(TimeUnit.MILLISECONDS); + private static final ConsistencyLevel MIN_CL = getProgressBarrierMinConsistencyLevel(); + private static final ConsistencyLevel DEFAULT_CL = getProgressBarrierDefaultConsistencyLevel(); + private static final long TIMEOUT_MILLIS = getProgressBarrierTimeout(TimeUnit.MILLISECONDS); + private static final long BACKOFF_MILLIS = getProgressBarrierBackoff(TimeUnit.MILLISECONDS); + private static final WaitStrategy WAIT_STRATEGY; + static + { + WAIT_STRATEGY = RetryStrategy.parse(BACKOFF_MILLIS + "ms" + "*attempts <=" + TIMEOUT_MILLIS + "ms,retries=" + + getCmsDefaultRetryMaxTries(), none()); + } public final Epoch waitFor; // Location of the affected node; used for LOCAL_QUORUM @@ -195,11 +210,8 @@ public boolean await(ConsistencyLevel cl, ClusterMetadata metadata) requests.add(new WatermarkRequest(peer, messagingService, waitFor)); long start = Clock.Global.nanoTime(); - Retry.Deadline deadline = Retry.Deadline.after(TimeUnit.MILLISECONDS.toNanos(TIMEOUT_MILLIS), - new Retry.Backoff(DatabaseDescriptor.getCmsDefaultRetryMaxTries(), - (int) BACKOFF_MILLIS, - TCMMetrics.instance.progressBarrierRetries)); - while (!deadline.reachedMax()) + Retry deadline = Retry.untilElapsed(TimeUnit.MILLISECONDS.toNanos(TIMEOUT_MILLIS), TCMMetrics.instance.progressBarrierRetries, WAIT_STRATEGY); + while (!deadline.hasExpired()) { for (WatermarkRequest request : requests) request.retry(); @@ -219,7 +231,8 @@ public boolean await(ConsistencyLevel cl, ClusterMetadata metadata) // No need to try processing until we collect enough nodes to pass all conditions if (collected.size() < maxWaitFor) { - deadline.maybeSleep(); + if (!deadline.maybeSleep()) + break; continue; } @@ -544,10 +557,10 @@ public void onResponse(Message msg) } @Override - public void onFailure(InetAddressAndPort from, RequestFailureReason failureReason) + public void onFailure(InetAddressAndPort from, RequestFailure failure) { - logger.debug("Error response from {} with {}", from, failureReason); - condition.tryFailure(new TimeoutException(String.format("Watermark request did returned %s.", failureReason))); + logger.debug("Error response from {} with {}", from, failure); + condition.tryFailure(new TimeoutException(String.format("Watermark request did returned %s.", failure.reason))); } public void retry() diff --git a/src/java/org/apache/cassandra/tcm/sequences/ReconfigureCMS.java b/src/java/org/apache/cassandra/tcm/sequences/ReconfigureCMS.java index 38566812bf54..fe1ec17f3d71 100644 --- a/src/java/org/apache/cassandra/tcm/sequences/ReconfigureCMS.java +++ b/src/java/org/apache/cassandra/tcm/sequences/ReconfigureCMS.java @@ -343,13 +343,13 @@ static void repairPaxosForCMSTopologyChange() // paxos repair at the beginning of each step, before streaming where applicable, will ensure that the // overlapping quorums invariant holds. - Retry.Backoff retry = new Retry.Backoff(TCMMetrics.instance.repairPaxosTopologyRetries); + Retry retry = Retry.withNoTimeLimit(TCMMetrics.instance.repairPaxosTopologyRetries); List>> remaining = ActiveRepairService.instance() .repairPaxosForTopologyChangeAsync(SchemaConstants.METADATA_KEYSPACE_NAME, Collections.singletonList(entireRange), "CMS reconfiguration"); - while (!retry.reachedMax()) + while (true) { Map>, Future> tasks = new HashMap<>(); for (Supplier> supplier : remaining) @@ -377,7 +377,8 @@ static void repairPaxosForCMSTopologyChange() if (remaining.isEmpty()) return; - retry.maybeSleep(); + if (!retry.maybeSleep()) + break; } logger.error("Added node as a CMS, but failed to repair paxos topology after this operation."); } diff --git a/src/java/org/apache/cassandra/tcm/sequences/ReplaceSameAddress.java b/src/java/org/apache/cassandra/tcm/sequences/ReplaceSameAddress.java index be75538d6888..4580d716a1ba 100644 --- a/src/java/org/apache/cassandra/tcm/sequences/ReplaceSameAddress.java +++ b/src/java/org/apache/cassandra/tcm/sequences/ReplaceSameAddress.java @@ -29,6 +29,7 @@ import org.apache.cassandra.gms.Gossiper; import org.apache.cassandra.locator.EndpointsByReplica; import org.apache.cassandra.locator.InetAddressAndPort; +import org.apache.cassandra.repair.autorepair.AutoRepairUtils; import org.apache.cassandra.schema.Schema; import org.apache.cassandra.service.StorageService; import org.apache.cassandra.tcm.ClusterMetadata; @@ -95,6 +96,9 @@ public static void streamData(NodeId nodeId, ClusterMetadata metadata, boolean s .forEach(cfs -> cfs.indexManager.executePreJoinTasksBlocking(true)); BootstrapAndReplace.gossipStateToNormal(metadata, metadata.myNodeId()); Gossiper.instance.mergeNodeToGossip(metadata.myNodeId(), metadata); + + // this node might have just bootstrapped; check if we should run repair immediately + AutoRepairUtils.runRepairOnNewlyBootstrappedNodeIfEnabled(); } } } diff --git a/src/java/org/apache/cassandra/tcm/serialization/Version.java b/src/java/org/apache/cassandra/tcm/serialization/Version.java index 50e1792e2374..955c6996e0c0 100644 --- a/src/java/org/apache/cassandra/tcm/serialization/Version.java +++ b/src/java/org/apache/cassandra/tcm/serialization/Version.java @@ -18,7 +18,10 @@ package org.apache.cassandra.tcm.serialization; +import java.util.ArrayList; +import java.util.Collections; import java.util.HashMap; +import java.util.List; import java.util.Map; import org.apache.cassandra.tcm.ClusterMetadata; @@ -39,25 +42,39 @@ public enum Version */ V2(2), /** - * - down nodes serialized in PrepareCMSReconfiguration + * - down nodes serialized in PrepareCMSReconfiguration */ V3(3), /** - * - Serialize allowAutoSnapshot and incrementalBackups when serializing TableParams + * - Serialize allowAutoSnapshot and incrementalBackups when serializing TableParams */ V4(4), /** - * - AlterSchema includes execution timestamp - * - PreInitialize includes datacenter (affects local serialization on first CMS node only) + * - AlterSchema includes execution timestamp + * - PreInitialize includes datacenter (affects local serialization on first CMS node only) */ V5(5), /** - * CEP-42 - Constraints framework. New version due to modifications in table metadata serialization. + * - CEP-42 - Constraints framework. New version due to modifications in table metadata serialization. */ V6(6), + /** + * - Track nodes removed + * - Column Metadata now stores a unique id + * - Added AccordFastPath + * - Added ConsensusMigrationState + * - Added AccordStaleReplicas + * - TableParam now has pendingDrop (accord table drop is multistep) + */ + V7(7), UNKNOWN(Integer.MAX_VALUE); + /** + * The version that Accord was added to TCM. + */ + public static final Version MIN_ACCORD_VERSION = V7; + private static Map values = new HashMap<>(); static { @@ -111,4 +128,15 @@ public static Version fromInt(int i) throw new IllegalArgumentException("Unsupported metadata version (" + i + ")"); } + + public List greaterThanOrEqual() + { + Version[] all = Version.values(); + if (ordinal() == all.length - 1) + return Collections.singletonList(this); + List values = new ArrayList<>(all.length - ordinal()); + for (int i = ordinal(); i < all.length; i++) + values.add(all[i]); + return values; + } } diff --git a/src/java/org/apache/cassandra/tcm/transformations/AccordMarkRejoining.java b/src/java/org/apache/cassandra/tcm/transformations/AccordMarkRejoining.java new file mode 100644 index 000000000000..402d05cf50a0 --- /dev/null +++ b/src/java/org/apache/cassandra/tcm/transformations/AccordMarkRejoining.java @@ -0,0 +1,123 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.tcm.transformations; + +import java.io.IOException; +import java.util.Objects; +import java.util.Set; +import java.util.stream.Collectors; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import accord.local.Node; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.service.accord.AccordTopology; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.tcm.Transformation; +import org.apache.cassandra.tcm.membership.NodeId; +import org.apache.cassandra.tcm.sequences.LockedRanges; +import org.apache.cassandra.tcm.serialization.AsymmetricMetadataSerializer; +import org.apache.cassandra.tcm.serialization.Version; +import org.apache.cassandra.utils.CollectionSerializers; + +import static org.apache.cassandra.exceptions.ExceptionCode.INVALID; + +public class AccordMarkRejoining implements Transformation +{ + private static final Logger logger = LoggerFactory.getLogger(AccordMarkRejoining.class); + + private final Set ids; + + public AccordMarkRejoining(Set ids) + { + this.ids = ids; + } + + @Override + public Kind kind() + { + return Kind.ACCORD_MARK_REJOINING; + } + + @Override + public Result execute(ClusterMetadata prev) + { + for (NodeId id : ids) + if (!prev.directory.peerIds().contains(id)) + return new Rejected(INVALID, String.format("Can not unmark node %s as it is not present in the directory.", id)); + + Set accordIds = ids.stream().map(AccordTopology::tcmIdToAccord).collect(Collectors.toSet()); + + for (Node.Id id : accordIds) + if (!prev.accordStaleReplicas.contains(id)) + return new Rejected(INVALID, String.format("Can not unmark node %s as it is not stale.", id)); + + logger.info("Unmarking " + ids + ". They will now participate in durability status coordination..."); + ClusterMetadata.Transformer next = prev.transformer().unmarkStaleReplicas(accordIds); + return Transformation.success(next, LockedRanges.AffectedRanges.EMPTY); + } + + @Override + public String toString() + { + return "AccordMarkRejoining{ids=" + ids + '}'; + } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + AccordMarkRejoining that = (AccordMarkRejoining) o; + return Objects.equals(ids, that.ids); + } + + @Override + public int hashCode() + { + return Objects.hash(ids); + } + + public static final AsymmetricMetadataSerializer serializer = new AsymmetricMetadataSerializer<>() + { + @Override + public void serialize(Transformation t, DataOutputPlus out, Version version) throws IOException + { + assert t instanceof AccordMarkRejoining; + AccordMarkRejoining mark = (AccordMarkRejoining) t; + CollectionSerializers.serializeCollection(mark.ids, out, version, NodeId.serializer); + } + + @Override + public AccordMarkRejoining deserialize(DataInputPlus in, Version version) throws IOException + { + return new AccordMarkRejoining(CollectionSerializers.deserializeSet(in, version, NodeId.serializer)); + } + + @Override + public long serializedSize(Transformation t, Version version) + { + assert t instanceof AccordMarkRejoining; + AccordMarkRejoining mark = (AccordMarkRejoining) t; + return CollectionSerializers.serializedCollectionSize(mark.ids, version, NodeId.serializer); + } + }; +} diff --git a/src/java/org/apache/cassandra/tcm/transformations/AccordMarkStale.java b/src/java/org/apache/cassandra/tcm/transformations/AccordMarkStale.java new file mode 100644 index 000000000000..33f68cb65d34 --- /dev/null +++ b/src/java/org/apache/cassandra/tcm/transformations/AccordMarkStale.java @@ -0,0 +1,151 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.tcm.transformations; + +import java.io.IOException; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Objects; +import java.util.Set; +import java.util.stream.Collectors; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import accord.local.Node; +import accord.topology.Shard; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.schema.KeyspaceMetadata; +import org.apache.cassandra.schema.SchemaConstants; +import org.apache.cassandra.service.accord.AccordTopology; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.tcm.Transformation; +import org.apache.cassandra.tcm.membership.NodeId; +import org.apache.cassandra.tcm.sequences.LockedRanges; +import org.apache.cassandra.tcm.serialization.AsymmetricMetadataSerializer; +import org.apache.cassandra.tcm.serialization.Version; +import org.apache.cassandra.utils.CollectionSerializers; + +import static org.apache.cassandra.exceptions.ExceptionCode.INVALID; + +public class AccordMarkStale implements Transformation +{ + private static final Logger logger = LoggerFactory.getLogger(AccordMarkStale.class); + + private final Set ids; + + public AccordMarkStale(Set ids) + { + this.ids = ids; + } + + @Override + public Kind kind() + { + return Kind.ACCORD_MARK_STALE; + } + + @Override + public Result execute(ClusterMetadata prev) + { + for (NodeId id : ids) + if (!prev.directory.peerIds().contains(id)) + return new Rejected(INVALID, String.format("Can not mark node %s stale as it is not present in the directory.", id)); + + Set accordIds = ids.stream().map(AccordTopology::tcmIdToAccord).collect(Collectors.toSet()); + + for (Node.Id id : accordIds) + if (prev.accordStaleReplicas.contains(id)) + return new Rejected(INVALID, String.format("Can not mark node %s stale as it already is.", id)); + + for (KeyspaceMetadata keyspace : prev.schema.getKeyspaces().without(SchemaConstants.REPLICATED_SYSTEM_KEYSPACE_NAMES)) + { + List shards = AccordTopology.KeyspaceShard.forKeyspace(keyspace, prev.placements, prev.directory); + + for (AccordTopology.KeyspaceShard shard : shards) + { + // We're trying to mark a node in this shard stale... + if (!Collections.disjoint(shard.nodes(), accordIds)) + { + int quorumSize = Shard.slowQuorumSize(shard.nodes().size()); + Set nonStaleNodes = new HashSet<>(shard.nodes()); + nonStaleNodes.removeAll(accordIds); + nonStaleNodes.removeAll(prev.accordStaleReplicas.ids()); + + // ...but reject the transformation if this would bring us below quorum. + if (nonStaleNodes.size() < quorumSize) + return new Rejected(INVALID, String.format("Can not mark nodes %s stale as that would leave fewer than a quorum of nodes active for ranges %s in keyspace '%s'.", + accordIds, shard.ranges(), keyspace.name)); + } + } + } + + logger.info("Marking " + ids + " stale. They will no longer participate in durability status coordination..."); + ClusterMetadata.Transformer next = prev.transformer().markStaleReplicas(accordIds); + return Transformation.success(next, LockedRanges.AffectedRanges.EMPTY); + } + + @Override + public String toString() + { + return "AccordMarkStale{ids=" + ids + '}'; + } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + AccordMarkStale that = (AccordMarkStale) o; + return Objects.equals(ids, that.ids); + } + + @Override + public int hashCode() + { + return Objects.hash(ids); + } + + public static final AsymmetricMetadataSerializer serializer = new AsymmetricMetadataSerializer<>() + { + @Override + public void serialize(Transformation t, DataOutputPlus out, Version version) throws IOException + { + assert t instanceof AccordMarkStale; + AccordMarkStale mark = (AccordMarkStale) t; + CollectionSerializers.serializeCollection(mark.ids, out, version, NodeId.serializer); + } + + @Override + public AccordMarkStale deserialize(DataInputPlus in, Version version) throws IOException + { + return new AccordMarkStale(CollectionSerializers.deserializeSet(in, version, NodeId.serializer)); + } + + @Override + public long serializedSize(Transformation t, Version version) + { + assert t instanceof AccordMarkStale; + AccordMarkStale mark = (AccordMarkStale) t; + return CollectionSerializers.serializedCollectionSize(mark.ids, version, NodeId.serializer); + } + }; +} diff --git a/src/java/org/apache/cassandra/tcm/transformations/AlterSchema.java b/src/java/org/apache/cassandra/tcm/transformations/AlterSchema.java index cec1d42ca86d..09c5b68bb5f1 100644 --- a/src/java/org/apache/cassandra/tcm/transformations/AlterSchema.java +++ b/src/java/org/apache/cassandra/tcm/transformations/AlterSchema.java @@ -19,17 +19,27 @@ package org.apache.cassandra.tcm.transformations; import java.io.IOException; +import java.util.Collections; import java.util.HashMap; import java.util.HashSet; +import java.util.List; import java.util.Map; import java.util.Set; +import java.util.function.Function; import java.util.stream.Collectors; import java.util.stream.Stream; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.Sets; +import com.google.common.collect.Streams; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.cassandra.config.AccordSpec; +import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.cql3.statements.schema.AlterSchemaStatement; +import org.apache.cassandra.dht.Range; +import org.apache.cassandra.dht.Token; import org.apache.cassandra.exceptions.AlreadyExistsException; import org.apache.cassandra.exceptions.ConfigurationException; import org.apache.cassandra.exceptions.InvalidRequestException; @@ -38,14 +48,18 @@ import org.apache.cassandra.io.util.DataOutputPlus; import org.apache.cassandra.schema.DistributedSchema; import org.apache.cassandra.schema.KeyspaceMetadata; +import org.apache.cassandra.schema.KeyspaceMetadata.KeyspaceDiff; import org.apache.cassandra.schema.Keyspaces; import org.apache.cassandra.schema.ReplicationParams; import org.apache.cassandra.schema.SchemaTransformation; +import org.apache.cassandra.schema.TableId; import org.apache.cassandra.schema.TableMetadata; import org.apache.cassandra.schema.Tables; import org.apache.cassandra.schema.ViewMetadata; import org.apache.cassandra.schema.Views; +import org.apache.cassandra.service.consensus.migration.ConsensusMigrationState; import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.tcm.ClusterMetadata.Transformer; import org.apache.cassandra.tcm.ClusterMetadataService; import org.apache.cassandra.tcm.Epoch; import org.apache.cassandra.tcm.Transformation; @@ -57,6 +71,8 @@ import org.apache.cassandra.utils.JVMStabilityInspector; import org.apache.cassandra.utils.vint.VIntCoding; +import static com.google.common.base.Preconditions.checkState; +import static com.google.common.collect.ImmutableSet.toImmutableSet; import static org.apache.cassandra.cql3.statements.schema.AlterSchemaStatement.NO_EXECUTION_TIMESTAMP; import static org.apache.cassandra.exceptions.ExceptionCode.ALREADY_EXISTS; import static org.apache.cassandra.exceptions.ExceptionCode.CONFIG_ERROR; @@ -224,14 +240,14 @@ public final Result execute(ClusterMetadata prev) calculatedPlacements.forEach((params, newPlacement) -> { DataPlacement previousPlacement = prev.placements.get(params); // Preserve placement versioning that has resulted from natural application where possible - if (previousPlacement.equals(newPlacement)) + if (previousPlacement.equivalentTo(newPlacement)) newPlacementsBuilder.with(params, previousPlacement); else newPlacementsBuilder.with(params, newPlacement); }); next = next.with(newPlacementsBuilder.build()); } - + next = maybeUpdateConsensusMigrationState(prev.consensusMigrationState, next, diff.altered, diff.dropped); return Transformation.success(next, LockedRanges.AffectedRanges.EMPTY); } @@ -247,6 +263,75 @@ private static Map> groupByReplication( return byReplication; } + public static Transformer maybeUpdateConsensusMigrationState(ConsensusMigrationState prev, Transformer next, ImmutableList altered, Keyspaces dropped) + { + ConsensusMigrationState migrationState = prev; + + Set droppedIds = Streams.concat(altered.stream().flatMap(diff -> diff.tables.dropped.stream().map(TableMetadata::id)), + dropped.stream().flatMap(ks -> ks.tables.stream().map(TableMetadata::id))) + .collect(toImmutableSet()); + + if (!droppedIds.isEmpty()) + migrationState = migrationState.withMigrationsRemovedFor(droppedIds); + + Set completedIds = altered.stream() + .flatMap(diff -> diff.tables.altered.stream()) + .filter(alt -> alt.before.params.transactionalMigrationFrom.isMigrating() + && !alt.after.params.transactionalMigrationFrom.isMigrating()) + .map(alt -> alt.after.id) + .collect(toImmutableSet()); + + if (!completedIds.isEmpty()) + migrationState = migrationState.withMigrationsCompletedFor(completedIds); + + Map reversals = altered.stream() + .flatMap(diff -> diff.tables.altered.stream()) + .filter(alt -> alt.before.params.transactionalMigrationFrom.from == alt.after.params.transactionalMode) + .map(alt -> alt.after) + .collect(Collectors.toMap(TableMetadata::id, Function.identity())); + + + // we treat explicitly switched migration types as a new migration here + Set started = altered.stream() + .flatMap(diff -> diff.tables.altered.stream()) + .filter(alt -> !reversals.containsKey(alt.after.id)) + .filter(alt -> alt.after.params.transactionalMigrationFrom.isMigrating() + && !alt.before.params.transactionalMigrationFrom.isMigrating()) + .map(alt -> alt.after) + .collect(Collectors.toUnmodifiableSet()); + + Set startedAndReversed = Sets.intersection(started.stream().map(TableMetadata::id).collect(Collectors.toSet()), reversals.keySet()); + checkState(startedAndReversed.isEmpty(), "Set of tables starting migration and reversing migration should not intersect"); + + if (!started.isEmpty()) + { + List> ranges; + AccordSpec.TransactionalRangeMigration migration = DatabaseDescriptor.getTransactionalRangeMigration(); + switch (migration) + { + default: throw new IllegalStateException("Unhandled transactional range migration: " + migration); + case auto: + Token minToken = DatabaseDescriptor.getPartitioner().getMinimumToken(); + ranges = Range.normalize(Collections.singletonList(new Range<>(minToken, minToken))); + break; + case explicit: + ranges = Collections.emptyList(); + break; + } + + // Always create the migration state even if nothing is currently migrating, the empty state + // signals that a migration is in progress with no migrating ranges and corresponds to transactionalMigrationFrom != none + migrationState = migrationState.withRangesMigrating(started, ranges, true); + } + + migrationState = migrationState.withReversedMigrations(reversals, next.epoch()); + + if (migrationState != prev) + next = next.with(migrationState); + + return next; + } + private static Iterable normaliseTableEpochs(Epoch nextEpoch, Stream tables) { return tables.map(tm -> tm.epoch.is(nextEpoch) diff --git a/src/java/org/apache/cassandra/tcm/transformations/AlterTopology.java b/src/java/org/apache/cassandra/tcm/transformations/AlterTopology.java new file mode 100644 index 000000000000..e247f66a8805 --- /dev/null +++ b/src/java/org/apache/cassandra/tcm/transformations/AlterTopology.java @@ -0,0 +1,193 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.tcm.transformations; + +import java.io.IOException; +import java.net.UnknownHostException; +import java.util.HashMap; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.cassandra.db.TypeSizes; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.locator.InetAddressAndPort; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.tcm.ClusterMetadataService; +import org.apache.cassandra.tcm.Transformation; +import org.apache.cassandra.tcm.membership.Directory; +import org.apache.cassandra.tcm.membership.Location; +import org.apache.cassandra.tcm.membership.NodeId; +import org.apache.cassandra.tcm.ownership.DataPlacements; +import org.apache.cassandra.tcm.ownership.PlacementProvider; +import org.apache.cassandra.tcm.sequences.LockedRanges; +import org.apache.cassandra.tcm.serialization.AsymmetricMetadataSerializer; +import org.apache.cassandra.tcm.serialization.Version; + +import static org.apache.cassandra.exceptions.ExceptionCode.INVALID; + +public class AlterTopology implements Transformation +{ + private static final Logger logger = LoggerFactory.getLogger(AlterTopology.class); + public static final Serializer serializer = new Serializer(); + + private final Map updates; + private final PlacementProvider placementProvider; + + public AlterTopology(Map updates, PlacementProvider placementProvider) + { + this.updates = updates; + this.placementProvider = placementProvider; + } + + public static Map parseArgs(String args, Directory directory) + { + Map asMap = new HashMap<>(); + for (String change : args.split(",")) + { + String[] parts = change.trim().split("="); + if (parts.length != 2) + throw new IllegalArgumentException("Invalid specification: " + change); + + if (parts[0].isEmpty() || parts[1].isEmpty()) + throw new IllegalArgumentException("Invalid specification: " + change); + + NodeId id = getNodeIdFromString(parts[0].trim(), directory); + if (asMap.containsKey(id)) + throw new IllegalArgumentException("Multiple updates for node " + id + " (" + parts[0].trim() + " )"); + asMap.put(getNodeIdFromString(parts[0].trim(), directory), Location.fromString(parts[1].trim())); + } + return asMap; + } + + private static NodeId getNodeIdFromString(String s, Directory directory) + { + // first try to parse the id as a node id, either in UUID or int form + try + { + return NodeId.fromString(s); + } + catch (Exception e) + { + // fall back to trying the supplied id as an endpoint + try + { + InetAddressAndPort endpoint = InetAddressAndPort.getByName(s); + return directory.peerId(endpoint); + } + catch (UnknownHostException u) + { + throw new IllegalArgumentException("Invalid node identifier supplied: " + s); + } + + } + } + + @Override + public Kind kind() + { + return Kind.ALTER_TOPOLOGY; + } + + @Override + public Result execute(ClusterMetadata prev) + { + // Check no inflight range movements + if (!prev.lockedRanges.locked.isEmpty()) + return new Rejected(INVALID, "The requested topology changes cannot be executed while there are ongoing range movements."); + + Directory dir = prev.directory; + // Check all node ids are present + Set missing = updates.keySet() + .stream() + .filter(location -> (null == dir.location(location))) + .collect(Collectors.toSet()); + if (!missing.isEmpty()) + return new Rejected(INVALID, String.format("Some updates specify an unregistered node: %s", missing)); + + // Validate there will be no change to placements + Directory updated = prev.directory; + for (Map.Entry update : updates.entrySet()) + updated = updated.withUpdatedRackAndDc(update.getKey(), update.getValue()); + ClusterMetadata proposed = prev.transformer().with(updated).build().metadata; + DataPlacements proposedPlacements = placementProvider.calculatePlacements(prev.placements.lastModified(), + proposed.tokenMap.toRanges(), + proposed, + proposed.schema.getKeyspaces()); + if (!proposedPlacements.equivalentTo(prev.placements)) + { + logger.info("Rejecting topology modifications which would materially change data placements: {}", updates); + return new Rejected(INVALID, "Proposed updates modify data placements, violating consistency guarantees"); + } + + ClusterMetadata.Transformer next = prev.transformer().with(updated); + return Transformation.success(next, LockedRanges.AffectedRanges.EMPTY); + } + + + @Override + public String toString() + { + return "AlterTopology{" + + "updates=" + updates + + '}'; + } + + static class Serializer implements AsymmetricMetadataSerializer + { + public void serialize(Transformation t, DataOutputPlus out, Version version) throws IOException + { + assert t instanceof AlterTopology; + AlterTopology alterTopology = (AlterTopology)t; + int size = alterTopology.updates.size(); + out.writeInt(size); + for (Map.Entry entry : alterTopology.updates.entrySet()) + { + NodeId.serializer.serialize(entry.getKey(), out, version); + Location.serializer.serialize(entry.getValue(), out, version); + } + } + + public AlterTopology deserialize(DataInputPlus in, Version version) throws IOException + { + int size = in.readInt(); + Map updates = new HashMap<>(size); + for (int i = 0; i < size; i++) + updates.put(NodeId.serializer.deserialize(in, version), Location.serializer.deserialize(in, version)); + return new AlterTopology(updates, ClusterMetadataService.instance().placementProvider()); + } + + public long serializedSize(Transformation t, Version version) + { + assert t instanceof AlterTopology; + AlterTopology alterTopology = (AlterTopology) t; + long size = TypeSizes.sizeof(alterTopology.updates.size()); + for (Map.Entry entry : alterTopology.updates.entrySet()) + { + size += NodeId.serializer.serializedSize(entry.getKey(), version); + size += Location.serializer.serializedSize(entry.getValue(), version); + } + return size; + } + } +} \ No newline at end of file diff --git a/src/java/org/apache/cassandra/tcm/transformations/BeginConsensusMigrationForTableAndRange.java b/src/java/org/apache/cassandra/tcm/transformations/BeginConsensusMigrationForTableAndRange.java new file mode 100644 index 000000000000..869a5b51894d --- /dev/null +++ b/src/java/org/apache/cassandra/tcm/transformations/BeginConsensusMigrationForTableAndRange.java @@ -0,0 +1,106 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.tcm.transformations; + +import java.io.IOException; +import java.util.Collection; +import java.util.List; +import java.util.stream.Collectors; +import javax.annotation.Nonnull; + +import org.apache.cassandra.dht.NormalizedRanges; +import org.apache.cassandra.dht.Token; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.service.consensus.migration.ConsensusMigrationState; +import org.apache.cassandra.service.consensus.migration.ConsensusTableMigration; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.tcm.Transformation; +import org.apache.cassandra.tcm.sequences.LockedRanges; +import org.apache.cassandra.tcm.serialization.AsymmetricMetadataSerializer; +import org.apache.cassandra.tcm.serialization.Version; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.cassandra.tcm.ClusterMetadata.Transformer; +import static org.apache.cassandra.utils.CollectionSerializers.deserializeList; +import static org.apache.cassandra.utils.CollectionSerializers.serializeCollection; +import static org.apache.cassandra.utils.CollectionSerializers.serializedCollectionSize; + +public class BeginConsensusMigrationForTableAndRange implements Transformation +{ + public static Serializer serializer = new Serializer(); + + @Nonnull + public final NormalizedRanges ranges; + + @Nonnull + public final List tables; + + public BeginConsensusMigrationForTableAndRange(@Nonnull NormalizedRanges ranges, + @Nonnull List tables) + { + checkNotNull(ranges, "ranges should not be null"); + checkArgument(!ranges.isEmpty(), "ranges should not be empty"); + checkNotNull(tables, "tables should not be null"); + checkArgument(!tables.isEmpty(), "tables should not be empty"); + this.ranges = ranges; + this.tables = tables; + } + + public Kind kind() + { + return Kind.BEGIN_CONSENSUS_MIGRATION_FOR_TABLE_AND_RANGE; + } + + public Result execute(ClusterMetadata prev) + { + Transformer transformer = prev.transformer(); + Collection metadata = tables.stream().map(prev.schema::getTableMetadata).collect(Collectors.toList()); + ConsensusMigrationState consensusMigrationState = prev.consensusMigrationState.withRangesMigrating(metadata, ranges, false); + return Transformation.success(transformer.with(consensusMigrationState), LockedRanges.AffectedRanges.EMPTY); + } + + static class Serializer implements AsymmetricMetadataSerializer + { + + public void serialize(Transformation t, DataOutputPlus out, Version version) throws IOException + { + BeginConsensusMigrationForTableAndRange v = (BeginConsensusMigrationForTableAndRange)t; + ConsensusTableMigration.rangesSerializer.serialize(v.ranges, out, version); + serializeCollection(v.tables, out, version, TableId.metadataSerializer); + } + + public BeginConsensusMigrationForTableAndRange deserialize(DataInputPlus in, Version version) throws IOException + { + NormalizedRanges ranges = ConsensusTableMigration.rangesSerializer.deserialize(in, version); + List tables = deserializeList(in, version, TableId.metadataSerializer); + return new BeginConsensusMigrationForTableAndRange(ranges, tables); + } + + public long serializedSize(Transformation t, Version version) + { + BeginConsensusMigrationForTableAndRange v = (BeginConsensusMigrationForTableAndRange) t; + return ConsensusTableMigration.rangesSerializer.serializedSize(v.ranges, version) + + serializedCollectionSize(v.tables, version, TableId.metadataSerializer); + } + } +} \ No newline at end of file diff --git a/src/java/org/apache/cassandra/tcm/transformations/FinishDropAccordTable.java b/src/java/org/apache/cassandra/tcm/transformations/FinishDropAccordTable.java new file mode 100644 index 000000000000..2019adb8ad39 --- /dev/null +++ b/src/java/org/apache/cassandra/tcm/transformations/FinishDropAccordTable.java @@ -0,0 +1,143 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.tcm.transformations; + +import java.io.IOException; +import java.util.Objects; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.schema.DistributedSchema; +import org.apache.cassandra.schema.KeyspaceMetadata; +import org.apache.cassandra.schema.Keyspaces; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.tcm.Transformation; +import org.apache.cassandra.tcm.sequences.DropAccordTable.TableReference; +import org.apache.cassandra.tcm.sequences.LockedRanges; +import org.apache.cassandra.tcm.serialization.AsymmetricMetadataSerializer; +import org.apache.cassandra.tcm.serialization.Version; + +import static org.apache.cassandra.tcm.Transformation.Kind.FINISH_DROP_ACCORD_TABLE; + +/** + * Dropping an Accord table is a three-step process. + *
      + *
    1. Mark the table as pending drop
    2. + *
    3. Await all in-flight txns to finish
    4. + *
    5. Drop the table from schema (this step)
    6. + *
    + *

    + * Hypothetically it is possible that after {1} has been committed, but before {3} is executed + * interleaving metadata changes occur. These could include dropping the table's keyspace, or + * modifying the transactional mode of the table to make it a non-accord table. Validation + * exists to prevent these schema changes from being committed while the drop is in-flight. + * However, if something like this did happen, by the time we come to execute this transformation, + * there's nothing really to do other than return success (as the table has indeed already been dropped). + */ +public class FinishDropAccordTable implements Transformation +{ + private static final Logger logger = LoggerFactory.getLogger(FinishDropAccordTable.class); + + public static final Serializer serializer = new Serializer(); + public final TableReference tableRef; + + public FinishDropAccordTable(TableReference tableRef) + { + this.tableRef = tableRef; + } + + @Override + public Kind kind() + { + return FINISH_DROP_ACCORD_TABLE; + } + + @Override + public Result execute(ClusterMetadata prev) + { + // In every case we remove the operation to drop this table from the set of in-flight sequences + ClusterMetadata.Transformer proposed = prev.transformer() + .with(prev.inProgressSequences.without(tableRef)); + + Keyspaces keyspaces = prev.schema.getKeyspaces(); + TableMetadata table = keyspaces.getTableOrViewNullable(tableRef.id); + // Table was already dropped + if (table == null) + { + logger.warn("Table {} was dropped while drop accord table sequence was in flight", tableRef); + return Transformation.success(proposed, LockedRanges.AffectedRanges.EMPTY); + } + KeyspaceMetadata keyspace = keyspaces.getNullable(table.keyspace); + + // Actually drop the table + Keyspaces withoutTable = keyspaces.withAddedOrUpdated(keyspace.withSwapped(keyspace.tables.without(table))); + + Keyspaces.KeyspacesDiff diff = Keyspaces.diff(prev.schema.getKeyspaces(), withoutTable); + + proposed = AlterSchema.maybeUpdateConsensusMigrationState(prev.consensusMigrationState, proposed, diff.altered, Keyspaces.NONE); + + proposed = proposed.with(new DistributedSchema(withoutTable)); + return Transformation.success(proposed, LockedRanges.AffectedRanges.EMPTY); + } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (!(o instanceof FinishDropAccordTable)) return false; + + FinishDropAccordTable that = (FinishDropAccordTable) o; + + return Objects.equals(tableRef, that.tableRef); + } + + @Override + public int hashCode() + { + return Objects.hash(tableRef); + } + + public static class Serializer implements AsymmetricMetadataSerializer + { + @Override + public void serialize(Transformation t, DataOutputPlus out, Version version) throws IOException + { + FinishDropAccordTable plan = (FinishDropAccordTable) t; + TableReference.serializer.serialize(plan.tableRef, out, version); + } + + @Override + public FinishDropAccordTable deserialize(DataInputPlus in, Version version) throws IOException + { + TableReference table = TableReference.serializer.deserialize(in, version); + return new FinishDropAccordTable(table); + } + + @Override + public long serializedSize(Transformation t, Version version) + { + FinishDropAccordTable plan = (FinishDropAccordTable) t; + return TableReference.serializer.serializedSize(plan.tableRef, version); + } + } +} \ No newline at end of file diff --git a/src/java/org/apache/cassandra/tcm/transformations/MaybeFinishConsensusMigrationForTableAndRange.java b/src/java/org/apache/cassandra/tcm/transformations/MaybeFinishConsensusMigrationForTableAndRange.java new file mode 100644 index 000000000000..9b758b3486a9 --- /dev/null +++ b/src/java/org/apache/cassandra/tcm/transformations/MaybeFinishConsensusMigrationForTableAndRange.java @@ -0,0 +1,218 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.tcm.transformations; + +import java.io.IOException; +import java.util.List; +import javax.annotation.Nonnull; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.cassandra.db.TypeSizes; +import org.apache.cassandra.dht.NormalizedRanges; +import org.apache.cassandra.dht.Range; +import org.apache.cassandra.dht.Token; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.schema.DistributedSchema; +import org.apache.cassandra.schema.KeyspaceMetadata; +import org.apache.cassandra.schema.Keyspaces; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.schema.TableParams; +import org.apache.cassandra.service.consensus.migration.ConsensusMigrationRepairType; +import org.apache.cassandra.service.consensus.migration.ConsensusMigrationState; +import org.apache.cassandra.service.consensus.migration.ConsensusTableMigration; +import org.apache.cassandra.service.consensus.migration.TableMigrationState; +import org.apache.cassandra.service.consensus.migration.TransactionalMigrationFromMode; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.tcm.ClusterMetadata.Transformer; +import org.apache.cassandra.tcm.Epoch; +import org.apache.cassandra.tcm.Transformation; +import org.apache.cassandra.tcm.sequences.LockedRanges; +import org.apache.cassandra.tcm.serialization.AsymmetricMetadataSerializer; +import org.apache.cassandra.tcm.serialization.Version; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkNotNull; +import static java.lang.String.format; +import static org.apache.cassandra.dht.Range.intersects; +import static org.apache.cassandra.dht.Range.normalize; +import static org.apache.cassandra.exceptions.ExceptionCode.INVALID; +import static org.apache.cassandra.service.consensus.migration.ConsensusMigrationTarget.accord; +import static org.apache.cassandra.service.consensus.migration.ConsensusMigrationTarget.paxos; + +public class MaybeFinishConsensusMigrationForTableAndRange implements Transformation +{ + private static final Logger logger = LoggerFactory.getLogger(MaybeFinishConsensusMigrationForTableAndRange.class); + + public static Serializer serializer = new Serializer(); + + @Nonnull + public final String keyspace; + + @Nonnull + public final String cf; + + @Nonnull + public final NormalizedRanges paxosRepairedRanges; + + @Nonnull + public final NormalizedRanges accordBarrieredRanges; + + @Nonnull + public final Epoch minEpoch; + + @Nonnull + public final ConsensusMigrationRepairType repairType; + + public MaybeFinishConsensusMigrationForTableAndRange(@Nonnull String keyspace, + @Nonnull String cf, + @Nonnull NormalizedRanges paxosRepairedRanges, + @Nonnull NormalizedRanges accordBarrieredRanges, + @Nonnull Epoch minEpoch, + boolean repairedData, + boolean repairedPaxos, + boolean repairedAccord) + { + checkNotNull(keyspace, "keyspace should not be null"); + checkNotNull(cf, "cf should not be null"); + checkNotNull(paxosRepairedRanges, "paxosRepairedRanges should not be null"); + checkNotNull(accordBarrieredRanges, "accordBarrierRanges should not be null"); + checkNotNull(minEpoch, "minEpoch should not be null"); + checkArgument(minEpoch.isAfter(Epoch.EMPTY), "minEpoch should not be empty"); + ConsensusMigrationRepairType repairType = new ConsensusMigrationRepairType(repairedData, repairedPaxos, repairedAccord); + checkNotNull(repairType, "repairType is null"); + checkArgument(!repairType.ineligibleForMigration(), "Shouldn't attempt to finish migration with ineligible repair"); + this.keyspace = keyspace; + this.cf = cf; + this.paxosRepairedRanges = paxosRepairedRanges; + this.accordBarrieredRanges = accordBarrieredRanges; + this.minEpoch = minEpoch; + this.repairType = repairType; + } + + public Kind kind() + { + return Kind.MAYBE_FINISH_CONSENSUS_MIGRATION_FOR_TABLE_AND_RANGE; + } + + private static Transformer resetMigrationOnSchema(ClusterMetadata prev, Transformer transformer, String ksName, String tblName, TableId id) + { + Keyspaces schema = prev.schema.getKeyspaces(); + KeyspaceMetadata keyspace = schema.getNullable(ksName); + + TableMetadata table = null == keyspace + ? null + : keyspace.getTableOrViewNullable(tblName); + + if (table == null || !table.id.equals(id)) + return transformer; + + TableParams params = table.params.unbuild().transactionalMigrationFrom(TransactionalMigrationFromMode.none).build(); + keyspace = keyspace.withSwapped(keyspace.tables.withSwapped(table.withSwapped(params))); + schema = schema.withAddedOrUpdated(keyspace); + return transformer.with(new DistributedSchema(schema)); + } + + public Result execute(@Nonnull ClusterMetadata metadata) + { + logger.info("Completed repair eligibility '{}' paxos repaired ranges {}, accord repaired ranges {}", repairType, paxosRepairedRanges, accordBarrieredRanges); + checkNotNull(metadata, "clusterMetadata should not be null"); + String ksAndCF = keyspace + "." + cf; + TableMetadata tbm = metadata.schema.getTableMetadata(keyspace, cf); + if (tbm == null) + return new Rejected(INVALID, format("Table %s is not currently performing consensus migration", ksAndCF)); + + ConsensusMigrationState consensusMigrationState = metadata.consensusMigrationState; + TableMigrationState tms = consensusMigrationState.tableStates.get(tbm.id); + if (tms == null) + return new Rejected(INVALID, format("Table %s is not currently performing consensus migration", ksAndCF)); + + if (!tms.targetProtocol.isMigratedBy(repairType)) + return new Rejected(INVALID, format("Table %s has a target protocol of %s and is the repair type %s is not eligible/needed to progress the migration", ksAndCF, tms.targetProtocol, repairType)); + + List> repairedRanges; + if (tms.targetProtocol == accord && (repairType.repairedPaxos || repairType.repairedData)) + repairedRanges = paxosRepairedRanges; + else if (tms.targetProtocol == paxos && repairType.repairedAccord) + repairedRanges = accordBarrieredRanges; + else + throw new IllegalStateException("Unhandled migration target " + tms.targetProtocol); + List> normalizedRepairedRanges = normalize(repairedRanges); + + // Bail out if repair doesn't actually intersect with any migrating ranges + if (!intersects(tms.migratingRanges, normalizedRepairedRanges)) + return new Rejected(INVALID, format("Table %s is migrating ranges %s, which doesn't include repaired ranges %s", ksAndCF, tms.migratingRanges, normalizedRepairedRanges)); + + Transformer next = metadata.transformer(); + ConsensusMigrationState migrationState = metadata.consensusMigrationState.withRangesRepairedAtEpoch(tbm, normalizedRepairedRanges, minEpoch, repairType); + next = next.with(migrationState); + + // reset the migration value on the table if the migration has completed + TableMigrationState tableState = migrationState.tableStates.get(tbm.id); + if (tableState == null || tableState.hasMigratedFullTokenRange(metadata.partitioner)) + next = resetMigrationOnSchema(metadata, next, keyspace, cf, tbm.id); + + return Transformation.success(next, LockedRanges.AffectedRanges.EMPTY); + } + + static class Serializer implements AsymmetricMetadataSerializer + { + public void serialize(Transformation t, DataOutputPlus out, Version version) throws IOException + { + MaybeFinishConsensusMigrationForTableAndRange v = (MaybeFinishConsensusMigrationForTableAndRange)t; + out.writeUTF(v.keyspace); + out.writeUTF(v.cf); + ConsensusTableMigration.rangesSerializer.serialize(v.paxosRepairedRanges, out, version); + ConsensusTableMigration.rangesSerializer.serialize(v.accordBarrieredRanges, out, version); + Epoch.serializer.serialize(v.minEpoch, out, version); + out.writeBoolean(v.repairType.repairedData); + out.writeBoolean(v.repairType.repairedPaxos); + out.writeBoolean(v.repairType.repairedAccord); + } + + public MaybeFinishConsensusMigrationForTableAndRange deserialize(DataInputPlus in, Version version) throws IOException + { + String keyspace = in.readUTF(); + String cf = in.readUTF(); + NormalizedRanges paxosRepairedRanges = ConsensusTableMigration.rangesSerializer.deserialize(in, version); + NormalizedRanges accordBarrieredRanges = ConsensusTableMigration.rangesSerializer.deserialize(in, version); + Epoch minEpoch = Epoch.serializer.deserialize(in, version); + boolean repairedData = in.readBoolean(); + boolean repairedPaxos = in.readBoolean(); + boolean repairedAccord = in.readBoolean(); + return new MaybeFinishConsensusMigrationForTableAndRange(keyspace, cf, paxosRepairedRanges, accordBarrieredRanges, minEpoch, repairedData, repairedPaxos, repairedAccord); + } + + public long serializedSize(Transformation t, Version version) + { + MaybeFinishConsensusMigrationForTableAndRange v = (MaybeFinishConsensusMigrationForTableAndRange)t; + return TypeSizes.sizeof(v.keyspace) + + TypeSizes.sizeof(v.cf) + + ConsensusTableMigration.rangesSerializer.serializedSize(v.paxosRepairedRanges, version) + + ConsensusTableMigration.rangesSerializer.serializedSize(v.accordBarrieredRanges, version) + + Epoch.serializer.serializedSize(v.minEpoch) + + TypeSizes.sizeof(v.repairType.repairedData) + + TypeSizes.sizeof(v.repairType.repairedPaxos) + + TypeSizes.sizeof(v.repairType.repairedAccord); + } + } +} \ No newline at end of file diff --git a/src/java/org/apache/cassandra/tcm/transformations/PrepareDropAccordTable.java b/src/java/org/apache/cassandra/tcm/transformations/PrepareDropAccordTable.java new file mode 100644 index 000000000000..2c960a0bd3a0 --- /dev/null +++ b/src/java/org/apache/cassandra/tcm/transformations/PrepareDropAccordTable.java @@ -0,0 +1,115 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.tcm.transformations; + +import java.io.IOException; +import java.util.Objects; + +import org.apache.cassandra.exceptions.ExceptionCode; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.schema.DistributedSchema; +import org.apache.cassandra.schema.KeyspaceMetadata; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.tcm.Transformation; +import org.apache.cassandra.tcm.sequences.DropAccordTable; +import org.apache.cassandra.tcm.sequences.DropAccordTable.TableReference; +import org.apache.cassandra.tcm.sequences.LockedRanges; +import org.apache.cassandra.tcm.serialization.AsymmetricMetadataSerializer; +import org.apache.cassandra.tcm.serialization.Version; + +public class PrepareDropAccordTable implements Transformation +{ + public static final Serializer serializer = new Serializer(); + + public final TableReference tableRef; + + public PrepareDropAccordTable(TableReference tableRef) + { + this.tableRef = tableRef; + } + + @Override + public Kind kind() + { + return Kind.PREPARE_DROP_ACCORD_TABLE; + } + + @Override + public Result execute(ClusterMetadata prev) + { + TableMetadata metadata = prev.schema.getKeyspaces().getTableOrViewNullable(tableRef.id); + if (metadata == null) + return new Rejected(ExceptionCode.INVALID, "Table " + tableRef + " is not known"); + if (!metadata.isAccordEnabled()) + return new Rejected(ExceptionCode.INVALID, "Table " + metadata + " is not an Accord table and should be dropped normally"); + if (metadata.params.pendingDrop) + return new Rejected(ExceptionCode.INVALID, "Table " + metadata + " is in the process of being dropped"); + + KeyspaceMetadata ks = prev.schema.getKeyspaceMetadata(metadata.keyspace); + metadata = metadata.unbuild().params(metadata.params.unbuild().pendingDrop(true).build()).build(); + ks = ks.withSwapped(ks.tables.withSwapped(metadata)); + + DropAccordTable operation = DropAccordTable.newSequence(tableRef, prev.nextEpoch()); + ClusterMetadata.Transformer proposed = prev.transformer() + .with(new DistributedSchema(prev.schema.getKeyspaces().withAddedOrUpdated(ks))) + .with(prev.inProgressSequences.with(tableRef, operation)); + return Transformation.success(proposed, LockedRanges.AffectedRanges.EMPTY); + } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + PrepareDropAccordTable that = (PrepareDropAccordTable) o; + return tableRef.equals(that.tableRef); + } + + @Override + public int hashCode() + { + return Objects.hash(tableRef); + } + + public static class Serializer implements AsymmetricMetadataSerializer + { + @Override + public void serialize(Transformation t, DataOutputPlus out, Version version) throws IOException + { + PrepareDropAccordTable plan = (PrepareDropAccordTable) t; + TableReference.serializer.serialize(plan.tableRef, out, version); + } + + @Override + public PrepareDropAccordTable deserialize(DataInputPlus in, Version version) throws IOException + { + TableReference table = TableReference.serializer.deserialize(in, version); + return new PrepareDropAccordTable(table); + } + + @Override + public long serializedSize(Transformation t, Version version) + { + PrepareDropAccordTable plan = (PrepareDropAccordTable) t; + return TableReference.serializer.serializedSize(plan.tableRef, version); + } + } +} diff --git a/src/java/org/apache/cassandra/tcm/transformations/ReconfigureAccordFastPath.java b/src/java/org/apache/cassandra/tcm/transformations/ReconfigureAccordFastPath.java new file mode 100644 index 000000000000..e8feef342b34 --- /dev/null +++ b/src/java/org/apache/cassandra/tcm/transformations/ReconfigureAccordFastPath.java @@ -0,0 +1,108 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.tcm.transformations; + +import java.io.IOException; + +import accord.local.Node; +import org.apache.cassandra.db.TypeSizes; +import org.apache.cassandra.exceptions.ExceptionCode; +import org.apache.cassandra.exceptions.InvalidRequestException; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.service.accord.AccordFastPath; +import org.apache.cassandra.service.accord.serializers.TopologySerializers; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.tcm.Transformation; +import org.apache.cassandra.tcm.sequences.LockedRanges; +import org.apache.cassandra.tcm.serialization.AsymmetricMetadataSerializer; +import org.apache.cassandra.tcm.serialization.Version; + +public class ReconfigureAccordFastPath implements Transformation +{ + private final Node.Id node; + private final AccordFastPath.Status status; + private final long updateTimeMillis; + private final long updateDelayMillis; + + public ReconfigureAccordFastPath(Node.Id node, AccordFastPath.Status status, long updateTimeMillis, long updateDelayMillis) + { + this.node = node; + this.status = status; + this.updateTimeMillis = updateTimeMillis; + this.updateDelayMillis = updateDelayMillis; + } + + public Kind kind() + { + return Kind.UPDATE_AVAILABILITY; + } + + public Result execute(ClusterMetadata metadata) + { + try + { + return Transformation.success(metadata.transformer().withFastPathStatusSince(node, status, updateTimeMillis, updateDelayMillis), LockedRanges.AffectedRanges.EMPTY); + } + catch (InvalidRequestException e) + { + return new Rejected(ExceptionCode.INVALID, e.getMessage()); + } + } + + @Override + public String toString() + { + return "ReconfigureAccordFastPath{" + + "node=" + node + + ", status=" + status + + ", updateTimeMillis=" + updateTimeMillis + + ", updateDelayMillis=" + updateDelayMillis + + '}'; + } + + public static final AsymmetricMetadataSerializer serializer = new AsymmetricMetadataSerializer() + { + public void serialize(Transformation t, DataOutputPlus out, Version version) throws IOException + { + ReconfigureAccordFastPath update = (ReconfigureAccordFastPath) t; + TopologySerializers.nodeId.serialize(update.node, out); + AccordFastPath.Status.serializer.serialize(update.status, out, version); + out.writeUnsignedVInt(update.updateTimeMillis); + out.writeUnsignedVInt(update.updateDelayMillis); + + } + + public ReconfigureAccordFastPath deserialize(DataInputPlus in, Version version) throws IOException + { + return new ReconfigureAccordFastPath(TopologySerializers.nodeId.deserialize(in), + AccordFastPath.Status.serializer.deserialize(in, version), + in.readUnsignedVInt(), in.readUnsignedVInt()); + } + + public long serializedSize(Transformation t, Version version) + { + ReconfigureAccordFastPath update = (ReconfigureAccordFastPath) t; + return TopologySerializers.nodeId.serializedSize(update.node) + + AccordFastPath.Status.serializer.serializedSize(update.status, version) + + TypeSizes.sizeofUnsignedVInt(update.updateTimeMillis) + + TypeSizes.sizeofUnsignedVInt(update.updateDelayMillis); + } + }; +} diff --git a/src/java/org/apache/cassandra/tcm/transformations/cms/PrepareCMSReconfiguration.java b/src/java/org/apache/cassandra/tcm/transformations/cms/PrepareCMSReconfiguration.java index c8b15c4fa595..e94292831e4b 100644 --- a/src/java/org/apache/cassandra/tcm/transformations/cms/PrepareCMSReconfiguration.java +++ b/src/java/org/apache/cassandra/tcm/transformations/cms/PrepareCMSReconfiguration.java @@ -45,6 +45,7 @@ import org.apache.cassandra.schema.KeyspaceParams; import org.apache.cassandra.schema.ReplicationParams; import org.apache.cassandra.schema.SchemaConstants; +import org.apache.cassandra.service.accord.fastpath.FastPathStrategy; import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.tcm.Transformation; import org.apache.cassandra.tcm.membership.NodeId; @@ -263,7 +264,7 @@ public Result execute(ClusterMetadata prev) // In a complex reconfiguration, in addition to initiating the sequence of membership changes, // we're modifying the replication params of the metadata keyspace so we supply a function to do that KeyspaceMetadata keyspace = prev.schema.getKeyspaceMetadata(SchemaConstants.METADATA_KEYSPACE_NAME); - KeyspaceMetadata newKeyspace = keyspace.withSwapped(new KeyspaceParams(keyspace.params.durableWrites, replicationParams)); + KeyspaceMetadata newKeyspace = keyspace.withSwapped(new KeyspaceParams(keyspace.params.durableWrites, replicationParams, FastPathStrategy.simple())); return executeInternal(prev, transformer -> transformer.with(prev.placements.replaceParams(prev.nextEpoch(), ReplicationParams.meta(prev), replicationParams)) diff --git a/src/java/org/apache/cassandra/tools/CmdLineOptions.java b/src/java/org/apache/cassandra/tools/CmdLineOptions.java new file mode 100644 index 000000000000..504ac1aaf6ce --- /dev/null +++ b/src/java/org/apache/cassandra/tools/CmdLineOptions.java @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.tools; + +import org.apache.commons.cli.Option; +import org.apache.commons.cli.Options; + +public class CmdLineOptions extends Options +{ + /** + * Add option with argument and argument name + * + * @param opt shortcut for option name + * @param longOpt complete option name + * @param argName argument name + * @param description description of the option + * @return updated Options object + */ + public Options addOption(String opt, String longOpt, String argName, String description) + { + Option option = new Option(opt, longOpt, true, description); + option.setArgName(argName); + + return addOption(option); + } + + /** + * Add option with argument and argument name that accepts being defined multiple times as a list + * + * @param opt shortcut for option name + * @param longOpt complete option name + * @param argName argument name + * @param description description of the option + * @return updated Options object + */ + public Options addOptionList(String opt, String longOpt, String argName, String description) + { + Option option = new Option(opt, longOpt, true, description); + option.setArgName(argName); + option.setArgs(Option.UNLIMITED_VALUES); + + return addOption(option); + } + + /** + * Add option without argument + * + * @param opt shortcut for option name + * @param longOpt complete option name + * @param description description of the option + * @return updated Options object + */ + public Options addOption(String opt, String longOpt, String description) + { + return addOption(new Option(opt, longOpt, false, description)); + } +} diff --git a/src/java/org/apache/cassandra/tools/FieldUtil.java b/src/java/org/apache/cassandra/tools/FieldUtil.java new file mode 100644 index 000000000000..6c61e2b3f70a --- /dev/null +++ b/src/java/org/apache/cassandra/tools/FieldUtil.java @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.tools; + +import java.lang.reflect.Field; +import java.lang.reflect.Modifier; + +import org.apache.cassandra.utils.ReflectionUtils; + +public class FieldUtil +{ + public static void setInstanceUnsafe(Class klass, Object v, String fieldName) + { + try + { + setInstanceUnsafeThrowing(klass, v, fieldName); + } + catch (Throwable e) + { + throw new RuntimeException(e); + } + } + + private static void setInstanceUnsafeThrowing(Class klass, Object v, String fieldName) throws Throwable + { + Field field = ReflectionUtils.getField(klass, fieldName); + field.setAccessible(true); + + Field modifiers = ReflectionUtils.getModifiersField(); + modifiers.setAccessible(true); + modifiers.setInt(field, field.getModifiers() & ~Modifier.FINAL); + + field.set(null, v); + } + + public static void transferFields(Object sourceInstance, Class klass) + { + for (Field sourceField : sourceInstance.getClass().getDeclaredFields()) + { + sourceField.setAccessible(true); + try + { + setInstanceUnsafe(klass, sourceField.get(sourceInstance), sourceField.getName()); + } + catch (Throwable e) + { + throw new RuntimeException("Failed to transfer field: " + sourceField.getName(), e); + } + } + } +} \ No newline at end of file diff --git a/src/java/org/apache/cassandra/tools/NodeProbe.java b/src/java/org/apache/cassandra/tools/NodeProbe.java index 747da8348564..62d2d164c6ff 100644 --- a/src/java/org/apache/cassandra/tools/NodeProbe.java +++ b/src/java/org/apache/cassandra/tools/NodeProbe.java @@ -30,6 +30,7 @@ import java.rmi.server.RMISocketFactory; import java.util.AbstractMap; import java.util.ArrayList; +import java.util.Collection; import java.util.Collections; import java.util.Comparator; import java.util.HashMap; @@ -41,7 +42,6 @@ import java.util.concurrent.ExecutionException; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; - import javax.annotation.Nullable; import javax.management.JMX; import javax.management.MBeanServerConnection; @@ -55,12 +55,20 @@ import javax.management.remote.JMXServiceURL; import javax.rmi.ssl.SslRMIClientSocketFactory; +import com.google.common.base.Function; +import com.google.common.base.Strings; +import com.google.common.collect.HashMultimap; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.Iterables; +import com.google.common.collect.Maps; +import com.google.common.collect.Multimap; +import com.google.common.collect.Sets; +import com.google.common.util.concurrent.Uninterruptibles; + import org.apache.cassandra.audit.AuditLogManager; import org.apache.cassandra.audit.AuditLogManagerMBean; import org.apache.cassandra.audit.AuditLogOptions; import org.apache.cassandra.audit.AuditLogOptionsCompositeData; - -import com.google.common.collect.ImmutableMap; import org.apache.cassandra.auth.AuthCache; import org.apache.cassandra.auth.AuthCacheMBean; import org.apache.cassandra.auth.CIDRGroupsMappingManager; @@ -104,31 +112,26 @@ import org.apache.cassandra.net.MessagingService; import org.apache.cassandra.net.MessagingServiceMBean; import org.apache.cassandra.service.ActiveRepairServiceMBean; +import org.apache.cassandra.service.AutoRepairService; +import org.apache.cassandra.service.AutoRepairServiceMBean; import org.apache.cassandra.service.CacheService; import org.apache.cassandra.service.CacheServiceMBean; import org.apache.cassandra.service.snapshot.SnapshotManagerMBean; -import org.apache.cassandra.tcm.CMSOperationsMBean; import org.apache.cassandra.service.GCInspector; import org.apache.cassandra.service.GCInspectorMXBean; import org.apache.cassandra.service.StorageProxy; import org.apache.cassandra.service.StorageProxyMBean; import org.apache.cassandra.service.StorageServiceMBean; +import org.apache.cassandra.service.accord.AccordOperations; +import org.apache.cassandra.service.accord.AccordOperationsMBean; import org.apache.cassandra.streaming.StreamManagerMBean; import org.apache.cassandra.streaming.StreamState; import org.apache.cassandra.streaming.management.StreamStateCompositeData; -import org.apache.cassandra.tools.nodetool.formatter.TableBuilder; - -import com.google.common.base.Function; -import com.google.common.base.Strings; -import com.google.common.collect.HashMultimap; -import com.google.common.collect.Iterables; -import com.google.common.collect.Maps; -import com.google.common.collect.Multimap; -import com.google.common.collect.Sets; -import com.google.common.util.concurrent.Uninterruptibles; - import org.apache.cassandra.tcm.CMSOperations; +import org.apache.cassandra.tcm.CMSOperationsMBean; +import org.apache.cassandra.tools.RepairRunner.RepairCmd; import org.apache.cassandra.tools.nodetool.GetTimeout; +import org.apache.cassandra.tools.nodetool.formatter.TableBuilder; import org.apache.cassandra.utils.NativeLibrary; import static org.apache.cassandra.config.CassandraRelevantProperties.NODETOOL_JMX_NOTIFICATION_POLL_INTERVAL_SECONDS; @@ -157,6 +160,7 @@ public class NodeProbe implements AutoCloseable protected StorageServiceMBean ssProxy; protected SnapshotManagerMBean snapshotProxy; protected CMSOperationsMBean cmsProxy; + protected AccordOperationsMBean accordProxy; protected GossiperMBean gossProxy; protected MemoryMXBean memProxy; protected GCInspectorMXBean gcProxy; @@ -177,6 +181,7 @@ public class NodeProbe implements AutoCloseable protected CIDRGroupsMappingManagerMBean cmbProxy; protected PermissionsCacheMBean pcProxy; protected RolesCacheMBean rcProxy; + protected AutoRepairServiceMBean autoRepairProxy; protected Output output; private boolean failed; @@ -273,6 +278,8 @@ protected void connect() throws IOException snapshotProxy = JMX.newMBeanProxy(mbeanServerConn, name, SnapshotManagerMBean.class); name = new ObjectName(CMSOperations.MBEAN_OBJECT_NAME); cmsProxy = JMX.newMBeanProxy(mbeanServerConn, name, CMSOperationsMBean.class); + name = new ObjectName(AccordOperations.MBEAN_OBJECT_NAME); + accordProxy = JMX.newMBeanProxy(mbeanServerConn, name, AccordOperationsMBean.class); name = new ObjectName(MessagingService.MBEAN_NAME); msProxy = JMX.newMBeanProxy(mbeanServerConn, name, MessagingServiceMBean.class); name = new ObjectName(StreamManagerMBean.OBJECT_NAME); @@ -319,6 +326,9 @@ protected void connect() throws IOException name = new ObjectName(CIDRFilteringMetricsTable.MBEAN_NAME); cfmProxy = JMX.newMBeanProxy(mbeanServerConn, name, CIDRFilteringMetricsTableMBean.class); + + name = new ObjectName(AutoRepairService.MBEAN_NAME); + autoRepairProxy = JMX.newMBeanProxy(mbeanServerConn, name, AutoRepairServiceMBean.class); } catch (MalformedObjectNameException e) { @@ -523,13 +533,28 @@ public String getKeyspaceReplicationInfo(String keyspaceName) public void repairAsync(final PrintStream out, final String keyspace, Map options) throws IOException { - RepairRunner runner = new RepairRunner(out, ssProxy, keyspace, options); + startAndBlockOnAsyncRepairs(out, Collections.singleton(new RepairCmd(keyspace) + { + @Override + public Integer start() + { + return ssProxy.repairAsync(keyspace, options); + } + })); + } + + public void startAndBlockOnAsyncRepairs(final PrintStream out, Collection cmds) throws IOException + { + List runners = new ArrayList<>(cmds.size()); + for (RepairCmd cmd : cmds) + runners.add(new RepairRunner(out, jmxc, ssProxy, cmd)); + try { - if (jmxc != null) - jmxc.addConnectionNotificationListener(runner, null, null); - ssProxy.addNotificationListener(runner, null, null); - runner.run(); + runners.forEach(RepairRunner::start); + + for (RepairRunner runner : runners) + runner.run(); } catch (Exception e) { @@ -539,9 +564,7 @@ public void repairAsync(final PrintStream out, final String keyspace, Map getNonLocalStrategyKeyspaces() return ssProxy.getNonLocalStrategyKeyspaces(); } + public List getAccordManagedKeyspaces() + { + return ssProxy.getAccordManagedKeyspaces(); + } + + public List getAccordManagedTables() + { + return ssProxy.getAccordManagedTables(); + } + public String getClusterName() { return ssProxy.getClusterName(); @@ -2325,7 +2363,7 @@ public void setLoggingLevel(String classQualifier, String level) } catch (Exception e) { - throw new RuntimeException("Error setting log for " + classQualifier + " on level " + level + ". Please check logback configuration and ensure to have set", e); + throw new RuntimeException("Error setting log for " + classQualifier + " on level " + level + ". Please check logback configuration.", e); } } @@ -2521,6 +2559,141 @@ public void abortBootstrap(String nodeId, String endpoint) { ssProxy.abortBootstrap(nodeId, endpoint); } + + public boolean isAutoRepairDisabled() + { + return autoRepairProxy.isAutoRepairDisabled(); + } + + public String autoRepairConfiguration() + { + return autoRepairProxy.getAutoRepairConfiguration(); + } + + public void setAutoRepairTokenRangeSplitterParameter(String repairType, String key, String value) + { + autoRepairProxy.setAutoRepairTokenRangeSplitterParameter(repairType, key, value); + } + + public void setAutoRepairEnabled(String repairType, boolean enabled) + { + autoRepairProxy.setAutoRepairEnabled(repairType, enabled); + } + + public void setAutoRepairThreads(String repairType, int repairThreads) + { + autoRepairProxy.setRepairThreads(repairType, repairThreads); + } + + public void setAutoRepairPriorityForHosts(String repairType, String commaSeparatedHostSet) + { + autoRepairProxy.setRepairPriorityForHosts(repairType, commaSeparatedHostSet); + } + + public void setAutoRepairForceRepairForHosts(String repairType, String commaSeparatedHostSet) + { + autoRepairProxy.setForceRepairForHosts(repairType, commaSeparatedHostSet); + } + + public void setAutoRepairMinInterval(String repairType, String minRepairInterval) + { + autoRepairProxy.setRepairMinInterval(repairType, minRepairInterval); + } + + public void setAutoRepairHistoryClearDeleteHostsBufferDuration(String duration) + { + autoRepairProxy.setAutoRepairHistoryClearDeleteHostsBufferDuration(duration); + } + + public void startAutoRepairScheduler() + { + autoRepairProxy.startScheduler(); + } + + public void setAutoRepairMinRepairTaskDuration(String duration) + { + autoRepairProxy.setAutoRepairMinRepairTaskDuration(duration); + } + + public void setAutoRepairSSTableCountHigherThreshold(String repairType, int ssTableHigherThreshold) + { + autoRepairProxy.setRepairSSTableCountHigherThreshold(repairType, ssTableHigherThreshold); + } + + public void setAutoRepairTableMaxRepairTime(String repairType, String autoRepairTableMaxRepairTime) + { + autoRepairProxy.setAutoRepairTableMaxRepairTime(repairType, autoRepairTableMaxRepairTime); + } + + public void setAutoRepairIgnoreDCs(String repairType, Set ignoreDCs) + { + autoRepairProxy.setIgnoreDCs(repairType, ignoreDCs); + } + + public void setAutoRepairParallelRepairPercentage(String repairType, int percentage) + { + autoRepairProxy.setParallelRepairPercentage(repairType, percentage); + } + + public void setAutoRepairParallelRepairCount(String repairType, int count) + { + autoRepairProxy.setParallelRepairCount(repairType, count); + } + + public void setAutoRepairAllowParallelReplicaRepair(String repairType, boolean enabled) + { + autoRepairProxy.setAllowParallelReplicaRepair(repairType, enabled); + } + + public void setAutoRepairAllowParallelReplicaRepairAcrossSchedules(String repairType, boolean enabled) + { + autoRepairProxy.setAllowParallelReplicaRepairAcrossSchedules(repairType, enabled); + } + + public void setAutoRepairPrimaryTokenRangeOnly(String repairType, boolean primaryTokenRangeOnly) + { + autoRepairProxy.setPrimaryTokenRangeOnly(repairType, primaryTokenRangeOnly); + } + + public void setAutoRepairMaterializedViewRepairEnabled(String repairType, boolean enabled) + { + autoRepairProxy.setMVRepairEnabled(repairType, enabled); + } + + public List mutateSSTableRepairedState(boolean repair, boolean preview, String keyspace, List tables) + { + return ssProxy.mutateSSTableRepairedState(repair, preview, keyspace, tables); + } + + public List getAutoRepairTablesForKeyspace(String keyspace) + { + return ssProxy.getTablesForKeyspace(keyspace); + } + + public void setAutoRepairSessionTimeout(String repairType, String timeout) + { + autoRepairProxy.setRepairSessionTimeout(repairType, timeout); + } + + public Set getAutoRepairOnGoingRepairHostIds(String repairType) + { + return autoRepairProxy.getOnGoingRepairHostIds(repairType); + } + + public void setAutoRepairRepairByKeyspace(String repairType, boolean enabled) + { + autoRepairProxy.setRepairByKeyspace(repairType, enabled); + } + + public void setAutoRepairMaxRetriesCount(String repairType, int retries) + { + autoRepairProxy.setAutoRepairMaxRetriesCount(repairType, retries); + } + + public void setAutoRepairRetryBackoff(String repairType, String interval) + { + autoRepairProxy.setAutoRepairRetryBackoff(repairType, interval); + } } class ColumnFamilyStoreMBeanIterator implements Iterator> diff --git a/src/java/org/apache/cassandra/tools/NodeTool.java b/src/java/org/apache/cassandra/tools/NodeTool.java index d7bcc25b7110..3e025df1652e 100644 --- a/src/java/org/apache/cassandra/tools/NodeTool.java +++ b/src/java/org/apache/cassandra/tools/NodeTool.java @@ -17,20 +17,7 @@ */ package org.apache.cassandra.tools; -import static com.google.common.base.Throwables.getStackTraceAsString; -import static com.google.common.collect.Iterables.toArray; -import static com.google.common.collect.Lists.newArrayList; -import static java.lang.Integer.parseInt; -import static java.lang.String.format; -import static org.apache.cassandra.io.util.File.WriteMode.APPEND; -import static org.apache.commons.lang3.ArrayUtils.EMPTY_STRING_ARRAY; -import static org.apache.commons.lang3.StringUtils.EMPTY; -import static org.apache.commons.lang3.StringUtils.isEmpty; -import static org.apache.commons.lang3.StringUtils.isNotEmpty; - import java.io.Console; -import org.apache.cassandra.io.util.File; -import org.apache.cassandra.io.util.FileWriter; import java.io.FileNotFoundException; import java.io.IOError; import java.io.IOException; @@ -44,16 +31,10 @@ import java.util.Map.Entry; import java.util.Scanner; import java.util.SortedMap; - import javax.management.InstanceNotFoundException; import com.google.common.base.Joiner; import com.google.common.base.Throwables; - -import org.apache.cassandra.locator.EndpointSnitchInfoMBean; -import org.apache.cassandra.tools.nodetool.*; -import org.apache.cassandra.utils.FBUtilities; - import com.google.common.collect.Maps; import io.airlift.airline.Cli; @@ -67,6 +48,22 @@ import io.airlift.airline.ParseOptionConversionException; import io.airlift.airline.ParseOptionMissingException; import io.airlift.airline.ParseOptionMissingValueException; +import org.apache.cassandra.io.util.File; +import org.apache.cassandra.io.util.FileWriter; +import org.apache.cassandra.locator.EndpointSnitchInfoMBean; +import org.apache.cassandra.tools.nodetool.*; +import org.apache.cassandra.utils.FBUtilities; + +import static com.google.common.base.Throwables.getStackTraceAsString; +import static com.google.common.collect.Iterables.toArray; +import static com.google.common.collect.Lists.newArrayList; +import static java.lang.Integer.parseInt; +import static java.lang.String.format; +import static org.apache.cassandra.io.util.File.WriteMode.APPEND; +import static org.apache.commons.lang3.ArrayUtils.EMPTY_STRING_ARRAY; +import static org.apache.commons.lang3.StringUtils.EMPTY; +import static org.apache.commons.lang3.StringUtils.isEmpty; +import static org.apache.commons.lang3.StringUtils.isNotEmpty; public class NodeTool { @@ -95,7 +92,9 @@ public int execute(String... args) { List> commands = newArrayList( AbortBootstrap.class, + AlterTopology.class, Assassinate.class, + AutoRepairStatus.class, CassHelp.class, CIDRFilteringStats.class, Cleanup.class, @@ -135,6 +134,7 @@ public int execute(String... args) GcStats.class, GetAuditLog.class, GetAuthCacheConfig.class, + GetAutoRepairConfig.class, GetBatchlogReplayTrottle.class, GetCIDRGroupsOfIP.class, GetColumnIndexSize.class, @@ -199,6 +199,7 @@ public int execute(String... args) Ring.class, Scrub.class, SetAuthCacheConfig.class, + SetAutoRepairConfig.class, SetBatchlogReplayThrottle.class, SetCacheCapacity.class, SetCacheKeysToSave.class, @@ -219,6 +220,7 @@ public int execute(String... args) SetTraceProbability.class, Sjk.class, Snapshot.class, + SSTableRepairedSet.class, Status.class, StatusAutoCompaction.class, StatusBackup.class, @@ -269,7 +271,24 @@ public int execute(String... args) .withCommand(CMSAdmin.ReconfigureCMS.class) .withCommand(CMSAdmin.Snapshot.class) .withCommand(CMSAdmin.Unregister.class) - .withCommand(CMSAdmin.AbortInitialization.class); + .withCommand(CMSAdmin.AbortInitialization.class) + .withCommand(CMSAdmin.DumpDirectory.class) + .withCommand(CMSAdmin.DumpLog.class) + .withCommand(CMSAdmin.ResumeDropAccordTable.class); + + builder.withGroup("consensus_admin") + .withDescription("List and mark ranges as migrating between consensus protocols") + .withDefaultCommand(CassHelp.class) + .withCommand(ConsensusMigrationAdmin.BeginMigration.class) + .withCommands(ConsensusMigrationAdmin.ListCmd.class) + .withCommands(ConsensusMigrationAdmin.FinishMigration.class); + + builder.withGroup("accord") + .withDescription("Manage the operation of Accord") + .withDefaultCommand(AccordAdmin.Describe.class) + .withCommand(AccordAdmin.Describe.class) + .withCommand(AccordAdmin.MarkStale.class) + .withCommand(AccordAdmin.MarkRejoining.class); Cli parser = builder.build(); @@ -470,7 +489,7 @@ private NodeProbe connect() protected enum KeyspaceSet { - ALL, NON_SYSTEM, NON_LOCAL_STRATEGY + ALL, NON_SYSTEM, NON_LOCAL_STRATEGY, ACCORD_MANAGED } protected List parseOptionalKeyspace(List cmdArgs, NodeProbe nodeProbe) @@ -489,6 +508,8 @@ protected List parseOptionalKeyspace(List cmdArgs, NodeProbe nod keyspaces.addAll(keyspaces = nodeProbe.getNonLocalStrategyKeyspaces()); else if (defaultKeyspaceSet == KeyspaceSet.NON_SYSTEM) keyspaces.addAll(keyspaces = nodeProbe.getNonSystemKeyspaces()); + else if (defaultKeyspaceSet == KeyspaceSet.ACCORD_MANAGED) + keyspaces.addAll(nodeProbe.getAccordManagedKeyspaces()); else keyspaces.addAll(nodeProbe.getKeyspaces()); } diff --git a/src/java/org/apache/cassandra/tools/RepairRunner.java b/src/java/org/apache/cassandra/tools/RepairRunner.java index 01aa5201852b..8451de291820 100644 --- a/src/java/org/apache/cassandra/tools/RepairRunner.java +++ b/src/java/org/apache/cassandra/tools/RepairRunner.java @@ -21,53 +21,102 @@ import java.io.PrintStream; import java.text.SimpleDateFormat; import java.util.List; -import java.util.Map; + +import javax.management.ListenerNotFoundException; +import javax.management.remote.JMXConnector; import org.apache.cassandra.service.ActiveRepairService.ParentRepairStatus; import org.apache.cassandra.service.StorageServiceMBean; import org.apache.cassandra.utils.concurrent.Condition; - import org.apache.cassandra.utils.progress.ProgressEvent; import org.apache.cassandra.utils.progress.ProgressEventType; import org.apache.cassandra.utils.progress.jmx.JMXNotificationProgressListener; -import static org.apache.cassandra.utils.Clock.Global.currentTimeMillis; import static java.util.concurrent.TimeUnit.SECONDS; import static org.apache.cassandra.service.ActiveRepairService.ParentRepairStatus.FAILED; import static org.apache.cassandra.service.ActiveRepairService.ParentRepairStatus.valueOf; import static org.apache.cassandra.tools.NodeProbe.JMX_NOTIFICATION_POLL_INTERVAL_SECONDS; +import static org.apache.cassandra.utils.Clock.Global.currentTimeMillis; import static org.apache.cassandra.utils.LocalizeString.toLowerCaseLocalized; import static org.apache.cassandra.utils.concurrent.Condition.newOneTimeCondition; -import static org.apache.cassandra.utils.progress.ProgressEventType.*; +import static org.apache.cassandra.utils.progress.ProgressEventType.COMPLETE; +import static org.apache.cassandra.utils.progress.ProgressEventType.ERROR; +import static org.apache.cassandra.utils.progress.ProgressEventType.PROGRESS; public class RepairRunner extends JMXNotificationProgressListener { + public static abstract class RepairCmd + { + private final String keyspace; + + public RepairCmd(String keyspace) + { + this.keyspace = keyspace; + } + + + + public abstract Integer start(); + } private final SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss,SSS"); private final PrintStream out; + private final JMXConnector jmxc; private final StorageServiceMBean ssProxy; - private final String keyspace; - private final Map options; private final Condition condition = newOneTimeCondition(); - private int cmd; + private final RepairCmd repairCmd; + private Integer cmd; private volatile Exception error; - public RepairRunner(PrintStream out, StorageServiceMBean ssProxy, String keyspace, Map options) + public RepairRunner(PrintStream out, JMXConnector jmxc, StorageServiceMBean ssProxy, RepairCmd repairCmd) { this.out = out; + this.jmxc = jmxc; this.ssProxy = ssProxy; - this.keyspace = keyspace; - this.options = options; + this.repairCmd = repairCmd; + } + + public void start() + { + if (jmxc != null) + jmxc.addConnectionNotificationListener(this, null, null); + ssProxy.addNotificationListener(this, null, null); + this.cmd = repairCmd.start(); + } + + public void close() + { + try + { + ssProxy.removeNotificationListener(this); + } + catch (ListenerNotFoundException e) + { + // noop - there may be double removes with error handling + } + if (jmxc != null) + { + try + { + jmxc.removeConnectionNotificationListener(this); + } + catch (ListenerNotFoundException e) + { + // noop - there may be double removes with error handling + } + } } public void run() throws Exception { - cmd = ssProxy.repairAsync(keyspace, options); + if (cmd == null) + return; + if (cmd <= 0) { // repairAsync can only return 0 for replication factor 1. - String message = String.format("Replication factor is 1. No repair is needed for keyspace '%s'", keyspace); + String message = String.format("Replication factor is 1. No repair is needed for keyspace '%s'", repairCmd.keyspace); printMessage(message); } else @@ -119,7 +168,7 @@ public void handleConnectionFailed(long timestamp, String message) { error = new IOException(String.format("[%s] JMX connection closed. You should check server log for repair status of keyspace %s" + "(Subsequent keyspaces are not going to be repaired).", - format.format(timestamp), keyspace)); + format.format(timestamp), repairCmd.keyspace)); condition.signalAll(); } diff --git a/src/java/org/apache/cassandra/tools/StandaloneSSTableUtil.java b/src/java/org/apache/cassandra/tools/StandaloneSSTableUtil.java index 1dd0ba53d6cc..badea964c529 100644 --- a/src/java/org/apache/cassandra/tools/StandaloneSSTableUtil.java +++ b/src/java/org/apache/cassandra/tools/StandaloneSSTableUtil.java @@ -30,7 +30,6 @@ import java.util.function.BiPredicate; import org.apache.cassandra.io.util.File; -import static org.apache.cassandra.tools.BulkLoader.CmdLineOptions; public class StandaloneSSTableUtil { diff --git a/src/java/org/apache/cassandra/tools/StandaloneScrubber.java b/src/java/org/apache/cassandra/tools/StandaloneScrubber.java index fe12e6d723c9..93275ad28c45 100644 --- a/src/java/org/apache/cassandra/tools/StandaloneScrubber.java +++ b/src/java/org/apache/cassandra/tools/StandaloneScrubber.java @@ -51,7 +51,6 @@ import org.apache.cassandra.io.util.File; import org.apache.cassandra.schema.Schema; import org.apache.cassandra.tcm.ClusterMetadataService; -import org.apache.cassandra.tools.BulkLoader.CmdLineOptions; import org.apache.cassandra.utils.JVMStabilityInspector; import org.apache.cassandra.utils.OutputHandler; import org.apache.cassandra.utils.Pair; diff --git a/src/java/org/apache/cassandra/tools/StandaloneSplitter.java b/src/java/org/apache/cassandra/tools/StandaloneSplitter.java index db9519041520..965186ce7eec 100644 --- a/src/java/org/apache/cassandra/tools/StandaloneSplitter.java +++ b/src/java/org/apache/cassandra/tools/StandaloneSplitter.java @@ -50,7 +50,7 @@ import org.apache.cassandra.utils.JVMStabilityInspector; import static org.apache.cassandra.config.CassandraRelevantProperties.TEST_UTIL_ALLOW_TOOL_REINIT_FOR_TEST; -import static org.apache.cassandra.tools.BulkLoader.CmdLineOptions; + import static org.apache.cassandra.utils.Clock.Global.currentTimeMillis; public class StandaloneSplitter diff --git a/src/java/org/apache/cassandra/tools/StandaloneUpgrader.java b/src/java/org/apache/cassandra/tools/StandaloneUpgrader.java index 52e9f8c2955a..069fdbe8451f 100644 --- a/src/java/org/apache/cassandra/tools/StandaloneUpgrader.java +++ b/src/java/org/apache/cassandra/tools/StandaloneUpgrader.java @@ -46,7 +46,6 @@ import org.apache.cassandra.utils.OutputHandler; import static org.apache.cassandra.config.CassandraRelevantProperties.TEST_UTIL_ALLOW_TOOL_REINIT_FOR_TEST; -import static org.apache.cassandra.tools.BulkLoader.CmdLineOptions; public class StandaloneUpgrader { diff --git a/src/java/org/apache/cassandra/tools/StandaloneVerifier.java b/src/java/org/apache/cassandra/tools/StandaloneVerifier.java index 241fe2d43211..f00c3168f346 100644 --- a/src/java/org/apache/cassandra/tools/StandaloneVerifier.java +++ b/src/java/org/apache/cassandra/tools/StandaloneVerifier.java @@ -53,7 +53,6 @@ import org.apache.cassandra.utils.Throwables; import static org.apache.cassandra.config.CassandraRelevantProperties.TEST_UTIL_ALLOW_TOOL_REINIT_FOR_TEST; -import static org.apache.cassandra.tools.BulkLoader.CmdLineOptions; public class StandaloneVerifier { diff --git a/src/java/org/apache/cassandra/tools/nodetool/AccordAdmin.java b/src/java/org/apache/cassandra/tools/nodetool/AccordAdmin.java new file mode 100644 index 000000000000..ff7e88ca9f61 --- /dev/null +++ b/src/java/org/apache/cassandra/tools/nodetool/AccordAdmin.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.tools.nodetool; + +import java.util.List; +import java.util.Map; + +import io.airlift.airline.Arguments; +import io.airlift.airline.Command; +import org.apache.cassandra.tools.NodeProbe; +import org.apache.cassandra.tools.NodeTool; + +public abstract class AccordAdmin extends NodeTool.NodeToolCmd +{ + @Command(name = "describe", description = "Describe current cluster metadata relating to Accord") + public static class Describe extends NodeTool.NodeToolCmd + { + @Override + protected void execute(NodeProbe probe) + { + Map info = probe.getAccordOperationsProxy().describe(); + output.out.printf("Accord Service:%n"); + output.out.printf("Epoch: %s%n", info.get("EPOCH")); + output.out.printf("Stale Replicas: %s%n", info.get("STALE_REPLICAS")); + } + } + + @Command(name = "mark_stale", description = "Mark a replica as being stale and no longer able to participate in durability status coordination") + public static class MarkStale extends AccordAdmin + { + @Arguments(required = true, description = "One or more node IDs to mark stale", usage = "+") + public List nodeIds; + + @Override + protected void execute(NodeProbe probe) + { + probe.getAccordOperationsProxy().accordMarkStale(nodeIds); + } + } + + @Command(name = "mark_rejoining", description = "Mark a stale replica as being allowed to participate in durability status coordination again") + public static class MarkRejoining extends AccordAdmin + { + @Arguments(required = true, description = "One or more node IDs to mark no longer stale", usage = "+") + public List nodeIds; + + @Override + protected void execute(NodeProbe probe) + { + probe.getAccordOperationsProxy().accordMarkRejoining(nodeIds); + } + } +} diff --git a/src/java/org/apache/cassandra/tools/nodetool/AlterTopology.java b/src/java/org/apache/cassandra/tools/nodetool/AlterTopology.java new file mode 100644 index 000000000000..e8078d3d5784 --- /dev/null +++ b/src/java/org/apache/cassandra/tools/nodetool/AlterTopology.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.tools.nodetool; + +import java.util.ArrayList; +import java.util.List; + +import io.airlift.airline.Arguments; +import io.airlift.airline.Command; +import org.apache.cassandra.tools.NodeProbe; +import org.apache.cassandra.tools.NodeTool.NodeToolCmd; + +import static com.google.common.base.Preconditions.checkArgument; + +@Command(name = "altertopology", description = "Modify the datacenter and/or rack of one or more nodes") +public class AlterTopology extends NodeToolCmd +{ + @Arguments(usage = " [...]", description = "One or more node identifiers, which may be either a node id, host id or broadcast address, each with a target dc:rack") + private List args = new ArrayList<>(); + + @Override + public void execute(NodeProbe probe) + { + checkArgument(!args.isEmpty(), "Invalid arguments; no changes specified"); + try + { + probe.getStorageService().alterTopology(String.join(",", args)); + } + catch (Exception e) + { + throw new IllegalArgumentException(e.getMessage()); + } + } +} diff --git a/src/java/org/apache/cassandra/tools/nodetool/AutoRepairStatus.java b/src/java/org/apache/cassandra/tools/nodetool/AutoRepairStatus.java new file mode 100644 index 000000000000..bb594a010ff1 --- /dev/null +++ b/src/java/org/apache/cassandra/tools/nodetool/AutoRepairStatus.java @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.tools.nodetool; + +import java.io.PrintStream; +import java.util.Set; + +import com.google.common.annotations.VisibleForTesting; + +import io.airlift.airline.Command; +import io.airlift.airline.Option; +import org.apache.cassandra.tools.NodeProbe; +import org.apache.cassandra.tools.NodeTool; +import org.apache.cassandra.tools.nodetool.formatter.TableBuilder; + +import static com.google.common.base.Preconditions.checkArgument; + +/** + * Provides currently running auto-repair tasks. + */ +@Command(name = "autorepairstatus", description = "Print autorepair status") +public class AutoRepairStatus extends NodeTool.NodeToolCmd +{ + @VisibleForTesting + @Option(title = "repair type", name = { "-t", "--repair-type" }, description = "Repair type") + protected String repairType; + + @Override + public void execute(NodeProbe probe) + { + checkArgument(repairType != null, "--repair-type is required."); + PrintStream out = probe.output().out; + + if (probe.isAutoRepairDisabled()) + { + out.println("Auto-repair is not enabled"); + return; + } + + TableBuilder table = new TableBuilder(); + table.add("Active Repairs"); + Set ongoingRepairHostIds = probe.getAutoRepairOnGoingRepairHostIds(repairType); + table.add(getSetString(ongoingRepairHostIds)); + table.printTo(out); + } + + private String getSetString(Set hostIds) + { + if (hostIds.isEmpty()) + { + return "NONE"; + } + StringBuilder sb = new StringBuilder(); + for (String id : hostIds) + { + sb.append(id); + sb.append(","); + } + // remove last "," + sb.setLength(Math.max(sb.length() - 1, 0)); + return sb.toString(); + } +} diff --git a/src/java/org/apache/cassandra/tools/nodetool/CMSAdmin.java b/src/java/org/apache/cassandra/tools/nodetool/CMSAdmin.java index 02cc045545b7..84d23dc3f82a 100644 --- a/src/java/org/apache/cassandra/tools/nodetool/CMSAdmin.java +++ b/src/java/org/apache/cassandra/tools/nodetool/CMSAdmin.java @@ -18,14 +18,19 @@ package org.apache.cassandra.tools.nodetool; +import java.io.PrintStream; import java.util.ArrayList; +import java.util.Comparator; import java.util.HashMap; import java.util.List; import java.util.Map; +import com.google.common.collect.ImmutableList; + import io.airlift.airline.Arguments; import io.airlift.airline.Command; import io.airlift.airline.Option; +import org.apache.cassandra.tcm.Epoch; import org.apache.cassandra.tools.NodeProbe; import org.apache.cassandra.tools.NodeTool; @@ -207,4 +212,61 @@ protected void execute(NodeProbe probe) probe.getCMSOperationsProxy().abortInitialization(initiator); } } + + @Command(name = "dumpdirectory", description = "Dump the directory from the current ClusterMetadata") + public static class DumpDirectory extends NodeTool.NodeToolCmd + { + @Option(name = "--tokens", title = "Include tokens", description = "Include tokens in output") + public boolean tokens = false; + @Override + protected void execute(NodeProbe probe) + { + output(probe.output().out, "NodeId", probe.getCMSOperationsProxy().dumpDirectory(tokens)); + } + } + + @Command(name = "dumplog", description = "Dump the metadata log") + public static class DumpLog extends NodeTool.NodeToolCmd + { + @Option(name = "--start", title = "Start epoch") + long startEpoch = Epoch.FIRST.getEpoch(); + @Option(name = "--end", title = "End epoch") + long endEpoch = Long.MAX_VALUE; + @Override + protected void execute(NodeProbe probe) + { + output(probe.output().out, "Epoch", probe.getCMSOperationsProxy().dumpLog(startEpoch, endEpoch)); + } + } + + private static void output(PrintStream out, String title, Map> map) + { + if (map.isEmpty()) + return; + int keywidth = keywidth(map); + for (Long key : ImmutableList.sortedCopyOf(map.keySet())) + { + out.println(title + ": " + key); + for (Map.Entry nodeEntry : map.get(key).entrySet()) + out.printf(" %-" + keywidth + "s%s%n", nodeEntry.getKey(), nodeEntry.getValue()); + } + } + + private static int keywidth(Map> map) + { + assert !map.isEmpty(); + return map.entrySet().iterator().next().getValue().keySet().stream().max(Comparator.comparingInt(String::length)).get().length() + 1; + } + + @Command(name = "resumedropaccordtable", description = "Resume a drop accord table operation which has stalled") + public static class ResumeDropAccordTable extends NodeTool.NodeToolCmd + { + @Arguments(usage = "[tableId]", description = "Table id of the table being dropped") + private String tableId; + @Override + public void execute(NodeProbe probe) + { + probe.getCMSOperationsProxy().resumeDropAccordTable(tableId); + } + } } diff --git a/src/java/org/apache/cassandra/tools/nodetool/ConsensusMigrationAdmin.java b/src/java/org/apache/cassandra/tools/nodetool/ConsensusMigrationAdmin.java new file mode 100644 index 000000000000..6553ddf9fd42 --- /dev/null +++ b/src/java/org/apache/cassandra/tools/nodetool/ConsensusMigrationAdmin.java @@ -0,0 +1,169 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.tools.nodetool; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import io.airlift.airline.Arguments; +import io.airlift.airline.Command; +import io.airlift.airline.Option; +import org.apache.cassandra.service.consensus.migration.ConsensusMigrationTarget; +import org.apache.cassandra.tools.NodeProbe; +import org.apache.cassandra.tools.NodeTool; +import org.apache.cassandra.tools.RepairRunner.RepairCmd; + +import static com.google.common.base.Preconditions.checkArgument; +import static java.util.Collections.singleton; + +/** + * For managing migration from one consensus protocol to another. + * + * Mark ranges as migrating, and list the migrating ranges. + */ +public abstract class ConsensusMigrationAdmin extends NodeTool.NodeToolCmd +{ + @Command(name = "list", description = "List migrating tables and ranges") + public static class ListCmd extends ConsensusMigrationAdmin + { + @Arguments(usage = "[ ...]", description = "The keyspace followed by one or many tables") + private List schemaArgs = new ArrayList<>(); + + @Option(title = "format", name = {"-f", "--format"}, description = "Output format, YAML and JSON are the only supported formats, default YAML, prefix with `minified-` to turn off pretty printing") + private String format = "yaml"; + + protected void execute(NodeProbe probe) + { + Set keyspaceNames = schemaArgs.size() > 0 ? singleton(schemaArgs.get(0)) : null; + Set tableNames = schemaArgs.size() > 1 ? new HashSet<>(schemaArgs.subList(1, schemaArgs.size())) : null; + String output = probe.getStorageService().listConsensusMigrations(keyspaceNames, tableNames, format); + probe.output().out.println(output); + } + } + + @Command(name = "begin-migration", description = "Mark the range as migrating for the specified token range and tables") + public static class BeginMigration extends ConsensusMigrationAdmin + { + @Option(title = "start_token", name = {"-st", "--start-token"}, description = "Use -st to specify a token at which the repair range starts") + private String startToken = null; + + @Option(title = "end_token", name = {"-et", "--end-token"}, description = "Use -et to specify a token at which repair range ends") + private String endToken = null; + + @Arguments(usage = "[ ...]", description = "The keyspace followed by one or many tables") + private List schemaArgs = new ArrayList<>(); + + protected void execute(NodeProbe probe) + { + checkArgument((endToken != null && startToken != null) || (endToken == null && startToken == null), "Must specify start and end token together"); + String maybeRangesStr = startToken != null ? startToken + ":" + endToken : null; + List keyspaceNames = parseOptionalKeyspace(schemaArgs, probe, KeyspaceSet.ACCORD_MANAGED); + List maybeTableNames = schemaArgs.size() > 1 ? schemaArgs.subList(1, schemaArgs.size()) : null; + probe.getStorageService().migrateConsensusProtocol(keyspaceNames, maybeTableNames, maybeRangesStr); + probe.output().out.println("Marked requested ranges as migrating. Repair needs to be run in order to complete the migration"); + } + } + + @Command(name = "finish-migration", description = "Complete the migration for a range that has already begun migration") + public static class FinishMigration extends ConsensusMigrationAdmin + { + @Option(title = "start_token", name = {"-st", "--start-token"}, description = "Use -st to specify a token at which the repair range starts (exclusive)") + private String startToken = null; + + @Option(title = "end_token", name = {"-et", "--end-token"}, description = "Use -et to specify a token at which repair range ends (inclusive)") + private String endToken = null; + + @Arguments(usage = "[ ...]", description = "The keyspace followed by one or many tables") + private List schemaArgs = new ArrayList<>(); + + private static class FinishMigrationRepairCommand extends RepairCmd + { + private final NodeProbe probe; + private final String keyspace; + private final List maybeTableNames; + private final String maybeRangesStr; + private final ConsensusMigrationTarget target; + + public FinishMigrationRepairCommand(NodeProbe probe, String keyspace, List maybeTableNames, String maybeRangesStr, ConsensusMigrationTarget target) + { + super(keyspace); + this.probe = probe; + this.keyspace = keyspace; + this.maybeTableNames = maybeTableNames; + this.maybeRangesStr = maybeRangesStr; + this.target = target; + } + + @Override + public Integer start() + { + return probe.getStorageService().finishConsensusMigration(keyspace, maybeTableNames, maybeRangesStr, target.toString()); + } + } + + protected void execute(NodeProbe probe) + { + checkArgument((endToken != null) == (startToken != null), "Start and end token must be specified together"); + String maybeRangesStr = startToken != null ? startToken + ":" + endToken : null; + List keyspaceNames = parseOptionalKeyspace(schemaArgs, probe, KeyspaceSet.ACCORD_MANAGED); + List maybeTableNames = schemaArgs.size() > 1 ? schemaArgs.subList(1, schemaArgs.size()) : null; + List repairCmds = new ArrayList<>(keyspaceNames.size() * 2); + // Finish can't actually finish with one set of repairs when migrating from Paxos -> Accord + // and it's async when the next invocation will see TCM updates from the repair that will correctly determine + // the next set of repairs needed. If we spin we will issue redundant repairs. + // It's also pretty involved not to return handles on the repairs since there is already a lot of plumbing + // leveraging monitoring in progress repairs. + output.out.println("Starting first round of repairs"); + for (String keyspace : keyspaceNames) + { + repairCmds.add(new FinishMigrationRepairCommand(probe, keyspace, maybeTableNames, maybeRangesStr, ConsensusMigrationTarget.paxos)); + repairCmds.add(new FinishMigrationRepairCommand(probe, keyspace, maybeTableNames, maybeRangesStr, ConsensusMigrationTarget.accord)); + } + try + { + probe.startAndBlockOnAsyncRepairs(probe.output().out, repairCmds); + } + catch (IOException e) + { + throw new RuntimeException("Error occurred attempting to finish migration for keyspace(s) " + keyspaceNames + " tables " + maybeTableNames + " and ranges " + maybeRangesStr, e); + } + // The repair should have at least committed the TCM change to the node we asked to coordinate the repair + // so calling finishedConsensusMigration a second time should trigger any needed 2nd phase repairs + // or does nothing if none are needed + output.out.println("Starting second round of repairs (may do nothing if migrating from Accord to Paxos)"); + repairCmds.clear(); + for (String keyspace : keyspaceNames) + { + repairCmds.add(new FinishMigrationRepairCommand(probe, keyspace, maybeTableNames, maybeRangesStr, ConsensusMigrationTarget.accord)); + } + try + { + probe.startAndBlockOnAsyncRepairs(probe.output().out, repairCmds); + } + catch (IOException e) + { + throw new RuntimeException("Error occurred attempting to finish migration for keyspace(s) " + keyspaceNames + " tables " + maybeTableNames + " and ranges " + maybeRangesStr, e); + } + probe.output().out.printf("Finished consensus migration range (%s) of keyspaces %s and tables %s%n", maybeRangesStr, keyspaceNames, maybeTableNames); + } + } +} diff --git a/src/java/org/apache/cassandra/tools/nodetool/GetAutoRepairConfig.java b/src/java/org/apache/cassandra/tools/nodetool/GetAutoRepairConfig.java new file mode 100644 index 000000000000..9744498de757 --- /dev/null +++ b/src/java/org/apache/cassandra/tools/nodetool/GetAutoRepairConfig.java @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.tools.nodetool; + +import java.io.PrintStream; + +import com.google.common.annotations.VisibleForTesting; + +import io.airlift.airline.Command; +import org.apache.cassandra.tools.NodeProbe; +import org.apache.cassandra.tools.NodeTool.NodeToolCmd; + +/** + * Prints all the configurations for AutoRepair through nodetool. + */ +@Command(name = "getautorepairconfig", description = "Print autorepair configurations") +public class GetAutoRepairConfig extends NodeToolCmd +{ + @VisibleForTesting + protected static PrintStream out = System.out; + + @Override + public void execute(NodeProbe probe) + { + if (probe.isAutoRepairDisabled()) + out.println("Auto-repair is not enabled"); + else + out.println(probe.autoRepairConfiguration()); + } +} diff --git a/src/java/org/apache/cassandra/tools/nodetool/Repair.java b/src/java/org/apache/cassandra/tools/nodetool/Repair.java index c66992acc9a8..ad7342cf677b 100644 --- a/src/java/org/apache/cassandra/tools/nodetool/Repair.java +++ b/src/java/org/apache/cassandra/tools/nodetool/Repair.java @@ -17,13 +17,6 @@ */ package org.apache.cassandra.tools.nodetool; -import static com.google.common.collect.Lists.newArrayList; -import static org.apache.commons.lang3.StringUtils.EMPTY; -import io.airlift.airline.Arguments; -import io.airlift.airline.Cli; -import io.airlift.airline.Command; -import io.airlift.airline.Option; - import java.util.ArrayList; import java.util.HashMap; import java.util.List; @@ -32,14 +25,22 @@ import java.util.function.Supplier; import com.google.common.collect.Sets; +import org.apache.commons.lang3.StringUtils; -import org.apache.cassandra.schema.SchemaConstants; -import org.apache.cassandra.streaming.PreviewKind; +import io.airlift.airline.Arguments; +import io.airlift.airline.Cli; +import io.airlift.airline.Command; +import io.airlift.airline.Option; import org.apache.cassandra.repair.RepairParallelism; import org.apache.cassandra.repair.messages.RepairOption; +import org.apache.cassandra.schema.SchemaConstants; +import org.apache.cassandra.streaming.PreviewKind; import org.apache.cassandra.tools.NodeProbe; import org.apache.cassandra.tools.NodeTool.NodeToolCmd; -import org.apache.commons.lang3.StringUtils; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.collect.Lists.newArrayList; +import static org.apache.commons.lang3.StringUtils.EMPTY; @Command(name = "repair", description = "Repair one or more tables") public class Repair extends NodeToolCmd @@ -105,6 +106,13 @@ public class Repair extends NodeToolCmd @Option(title = "paxos-only", name = {"-paxos-only", "--paxos-only"}, description = "If the --paxos-only flag is included, no table data is repaired, only paxos operations..") private boolean paxosOnly = false; + @Option(title = "accord-only", name = {"-accord-only", "--accord-only"}, description = "If the --accord-only flag is included, no table data is repaired, only accord operations..") + private boolean accordOnly = false; + + @Option(title = "skip-accord", name = {"-skip-accord", "--skip-accord"}, description = "If the --skip-accord flag is included, the Accord repair step is skipped. Accord repair is also skipped for preview repairs.") + private boolean skipAccord = false; + + @Option(title = "ignore_unreplicated_keyspaces", name = {"-iuk","--ignore-unreplicated-keyspaces"}, description = "Use --ignore-unreplicated-keyspaces to ignore keyspaces which are not replicated, otherwise the repair will fail") private boolean ignoreUnreplicatedKeyspaces = false; @@ -134,7 +142,8 @@ else if (preview) @Override public void execute(NodeProbe probe) { - List keyspaces = parseOptionalKeyspace(args, probe, KeyspaceSet.NON_LOCAL_STRATEGY); + KeyspaceSet keyspaceSet = KeyspaceSet.NON_LOCAL_STRATEGY; + List keyspaces = parseOptionalKeyspace(args, probe, keyspaceSet); String[] cfnames = parseOptionalTables(args); if (primaryRange && (!specificDataCenters.isEmpty() || !specificHosts.isEmpty())) @@ -187,9 +196,33 @@ else if (dcParallel) options.put(RepairOption.PREVIEW, getPreviewKind().toString()); options.put(RepairOption.OPTIMISE_STREAMS_KEY, Boolean.toString(optimiseStreams)); options.put(RepairOption.IGNORE_UNREPLICATED_KS, Boolean.toString(ignoreUnreplicatedKeyspaces)); - options.put(RepairOption.REPAIR_PAXOS_KEY, Boolean.toString(!skipPaxos && getPreviewKind() == PreviewKind.NONE)); - options.put(RepairOption.PAXOS_ONLY_KEY, Boolean.toString(paxosOnly && getPreviewKind() == PreviewKind.NONE)); options.put(RepairOption.NO_TOMBSTONE_PURGING, Boolean.toString(dontPurgeTombstones)); + checkArgument(!(paxosOnly && accordOnly), "Can't specify both paxos-only and accord-only"); + checkArgument(!(skipPaxos && paxosOnly), "Can't specify both skip-paxos and paxos-only"); + boolean repairPaxos = !skipPaxos && !accordOnly && getPreviewKind() == PreviewKind.NONE; + options.put(RepairOption.REPAIR_PAXOS_KEY, Boolean.toString(repairPaxos)); + checkArgument(!(skipAccord && accordOnly), "Can't specify both skip-accord and accord-only"); + boolean repairAccord = !skipAccord && !paxosOnly && getPreviewKind() == PreviewKind.NONE; + options.put(RepairOption.REPAIR_ACCORD_KEY, Boolean.toString(repairAccord)); + boolean repairData = false; + if (getPreviewKind() == PreviewKind.NONE) + { + // Paxos only historically doesn't do a repair, but Accord sticks to repairing at ALL + // unless --force is specified. + // If repair is incremental we need to do the repair to get the sstables created in the repaired set + if (accordOnly) + repairData = !fullRepair; + // Default if not Paxos/Accord only is to repair data + else if (!paxosOnly) + repairData = true; + } + else + { + // Preview also "repairs" data + repairData = true; + } + // Incremental repair always needs a data repair to actually do the incremental repair and move the sstables + options.put(RepairOption.REPAIR_DATA_KEY, Boolean.toString(repairData)); if (!startToken.isEmpty() || !endToken.isEmpty()) { diff --git a/src/java/org/apache/cassandra/tools/nodetool/SSTableRepairedSet.java b/src/java/org/apache/cassandra/tools/nodetool/SSTableRepairedSet.java new file mode 100644 index 000000000000..2a7b56732ac9 --- /dev/null +++ b/src/java/org/apache/cassandra/tools/nodetool/SSTableRepairedSet.java @@ -0,0 +1,108 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.tools.nodetool; + +import java.io.PrintStream; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + + +import io.airlift.airline.Arguments; +import io.airlift.airline.Command; +import io.airlift.airline.Option; +import org.apache.cassandra.exceptions.InvalidRequestException; +import org.apache.cassandra.tools.NodeProbe; +import org.apache.cassandra.tools.NodeTool; + +/** + * Provides a way to set the repaired state of SSTables without any downtime through nodetool. + */ +@Command(name = "sstablerepairedset", description = "Set the repaired state of SSTables for given keyspace/tables") +public class SSTableRepairedSet extends NodeTool.NodeToolCmd +{ + @Arguments(usage = "[ ]", description = "Optional keyspace followed by zero or more tables") + protected List args = new ArrayList<>(); + + @Option(title = "really-set", + name = { "--really-set" }, + description = "Really set the repaired state of SSTables. If not set, only print SSTables that would be affected.") + protected boolean reallySet = false; + + @Option(title = "is-repaired", + name = { "--is-repaired" }, + description = "Set SSTables to repaired state.") + protected boolean isRepaired = false; + + @Option(title = "is-unrepaired", + name = { "--is-unrepaired" }, + description = "Set SSTables to unrepaired state.") + protected boolean isUnrepaired = false; + + @Override + public void execute(NodeProbe probe) + { + PrintStream out = probe.output().out; + + if (isRepaired == isUnrepaired) + { + out.println("Exactly one of --is-repaired or --is-unrepaired must be provided."); + return; + } + + String message; + if (reallySet) + message = "Mutating repaired state of SSTables for"; + else + message = "Previewing repaired state mutation of SSTables for"; + + List keyspaces = parseOptionalKeyspace(args, probe, KeyspaceSet.NON_LOCAL_STRATEGY); + List tables = new ArrayList<>(Arrays.asList(parseOptionalTables(args))); + + if (args.isEmpty()) + message += " all keyspaces"; + else + message += tables.isEmpty() ? " all tables" : " tables " + String.join(", ", tables) + + " in keyspace " + keyspaces.get(0); + message += " to " + (isRepaired ? "repaired" : "unrepaired"); + out.println(message); + + List sstableList = new ArrayList<>(); + for (String keyspace : keyspaces) + { + try + { + sstableList.addAll(probe.mutateSSTableRepairedState(isRepaired, !reallySet, keyspace, + tables.isEmpty() + ? probe.getAutoRepairTablesForKeyspace(keyspace) // mutate all tables + : tables)); // mutate specific tables + } + catch (InvalidRequestException e) + { + out.println(e.getMessage()); + } + } + if (!reallySet) + out.println("The following SSTables would be mutated:"); + else + out.println("The following SSTables were mutated:"); + for (String sstable : sstableList) + out.println(sstable); + } +} diff --git a/src/java/org/apache/cassandra/tools/nodetool/SetAutoRepairConfig.java b/src/java/org/apache/cassandra/tools/nodetool/SetAutoRepairConfig.java new file mode 100644 index 000000000000..cc00cabd0633 --- /dev/null +++ b/src/java/org/apache/cassandra/tools/nodetool/SetAutoRepairConfig.java @@ -0,0 +1,178 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.tools.nodetool; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Splitter; + +import io.airlift.airline.Arguments; +import io.airlift.airline.Command; +import io.airlift.airline.Option; +import org.apache.cassandra.tools.NodeProbe; +import org.apache.cassandra.tools.NodeTool.NodeToolCmd; + +import java.io.PrintStream; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Objects; +import java.util.Set; + +import static com.google.common.base.Preconditions.checkArgument; + +/** + * Allows to set AutoRepair configuration through nodetool. + */ +@Command(name = "setautorepairconfig", description = "sets the autorepair configuration") +public class SetAutoRepairConfig extends NodeToolCmd +{ + @VisibleForTesting + @Arguments(title = " ", usage = " ", + description = "autorepair param and value.\nPossible autorepair parameters are as following: " + + "[start_scheduler|number_of_repair_threads|min_repair_interval|sstable_upper_threshold" + + "|enabled|table_max_repair_time|priority_hosts|forcerepair_hosts|ignore_dcs" + + "|history_clear_delete_hosts_buffer_interval|repair_primary_token_range_only" + + "|parallel_repair_count|parallel_repair_percentage" + + "|allow_parallel_replica_repair|allow_parallel_repair_across_schedules" + + "|materialized_view_repair_enabled|repair_max_retries" + + "|repair_retry_backoff|repair_session_timeout|min_repair_task_duration" + + "|repair_by_keyspace|token_range_splitter.]", + required = true) + protected List args = new ArrayList<>(); + + @VisibleForTesting + @Option(title = "repair type", name = { "-t", "--repair-type" }, description = "Repair type") + protected String repairTypeStr; + + @VisibleForTesting + protected PrintStream out = System.out; + + private static final String TOKEN_RANGE_SPLITTER_PROPERTY_PREFIX = "token_range_splitter."; + + @Override + public void execute(NodeProbe probe) + { + checkArgument(args.size() == 2, "setautorepairconfig requires param-type, and value args."); + String paramType = args.get(0); + String paramVal = args.get(1); + + if (probe.isAutoRepairDisabled() && !paramType.equalsIgnoreCase("start_scheduler")) + { + out.println("Auto-repair is not enabled"); + return; + } + + // options that do not require --repair-type option + switch (paramType) + { + case "start_scheduler": + if (Boolean.parseBoolean(paramVal)) + { + probe.startAutoRepairScheduler(); + } + return; + case "history_clear_delete_hosts_buffer_interval": + probe.setAutoRepairHistoryClearDeleteHostsBufferDuration(paramVal); + return; + case "min_repair_task_duration": + probe.setAutoRepairMinRepairTaskDuration(paramVal); + return; + default: + // proceed to options that require --repair-type option + break; + } + + // options below require --repair-type option + Objects.requireNonNull(repairTypeStr, "--repair-type is required for this parameter."); + + if(paramType.startsWith(TOKEN_RANGE_SPLITTER_PROPERTY_PREFIX)) + { + final String key = paramType.replace(TOKEN_RANGE_SPLITTER_PROPERTY_PREFIX, ""); + probe.setAutoRepairTokenRangeSplitterParameter(repairTypeStr, key, paramVal); + return; + } + + switch (paramType) + { + case "enabled": + probe.setAutoRepairEnabled(repairTypeStr, Boolean.parseBoolean(paramVal)); + break; + case "number_of_repair_threads": + probe.setAutoRepairThreads(repairTypeStr, Integer.parseInt(paramVal)); + break; + case "min_repair_interval": + probe.setAutoRepairMinInterval(repairTypeStr, paramVal); + break; + case "sstable_upper_threshold": + probe.setAutoRepairSSTableCountHigherThreshold(repairTypeStr, Integer.parseInt(paramVal)); + break; + case "table_max_repair_time": + probe.setAutoRepairTableMaxRepairTime(repairTypeStr, paramVal); + break; + case "priority_hosts": + if (paramVal!= null && !paramVal.isEmpty()) + { + probe.setAutoRepairPriorityForHosts(repairTypeStr, paramVal); + } + break; + case "forcerepair_hosts": + probe.setAutoRepairForceRepairForHosts(repairTypeStr, paramVal); + break; + case "ignore_dcs": + Set ignoreDCs = new HashSet<>(); + for (String dc : Splitter.on(',').split(paramVal)) + { + ignoreDCs.add(dc); + } + probe.setAutoRepairIgnoreDCs(repairTypeStr, ignoreDCs); + break; + case "repair_primary_token_range_only": + probe.setAutoRepairPrimaryTokenRangeOnly(repairTypeStr, Boolean.parseBoolean(paramVal)); + break; + case "parallel_repair_count": + probe.setAutoRepairParallelRepairCount(repairTypeStr, Integer.parseInt(paramVal)); + break; + case "parallel_repair_percentage": + probe.setAutoRepairParallelRepairPercentage(repairTypeStr, Integer.parseInt(paramVal)); + break; + case "allow_parallel_replica_repair": + probe.setAutoRepairAllowParallelReplicaRepair(repairTypeStr, Boolean.parseBoolean(paramVal)); + break; + case "allow_parallel_replica_repair_across_schedules": + probe.setAutoRepairAllowParallelReplicaRepairAcrossSchedules(repairTypeStr, Boolean.parseBoolean(paramVal)); + break; + case "materialized_view_repair_enabled": + probe.setAutoRepairMaterializedViewRepairEnabled(repairTypeStr, Boolean.parseBoolean(paramVal)); + break; + case "repair_session_timeout": + probe.setAutoRepairSessionTimeout(repairTypeStr, paramVal); + break; + case "repair_by_keyspace": + probe.setAutoRepairRepairByKeyspace(repairTypeStr, Boolean.parseBoolean(paramVal)); + break; + case "repair_max_retries": + probe.setAutoRepairMaxRetriesCount(repairTypeStr, Integer.parseInt(paramVal)); + break; + case "repair_retry_backoff": + probe.setAutoRepairRetryBackoff(repairTypeStr, paramVal); + break; + default: + throw new IllegalArgumentException("Unknown parameter: " + paramType); + } + } +} diff --git a/src/java/org/apache/cassandra/transport/Client.java b/src/java/org/apache/cassandra/transport/Client.java index 96fea832a857..6c60f6508c16 100644 --- a/src/java/org/apache/cassandra/transport/Client.java +++ b/src/java/org/apache/cassandra/transport/Client.java @@ -45,9 +45,9 @@ public class Client extends SimpleClient { private final SimpleEventHandler eventHandler = new SimpleEventHandler(); - public Client(String host, int port, ProtocolVersion version, EncryptionOptions encryptionOptions) + public Client(String host, int port, ProtocolVersion version, EncryptionOptions.ClientEncryptionOptions encryptionOptions) { - super(host, port, version, version.isBeta(), new EncryptionOptions(encryptionOptions).applyConfig()); + super(host, port, version, version.isBeta(), encryptionOptions.applyConfig()); setEventHandler(eventHandler); } @@ -260,7 +260,7 @@ public static void main(String[] args) throws Exception int port = Integer.parseInt(args[1]); ProtocolVersion version = args.length == 3 ? ProtocolVersion.decode(Integer.parseInt(args[2]), DatabaseDescriptor.getNativeTransportAllowOlderProtocols()) : ProtocolVersion.CURRENT; - EncryptionOptions encryptionOptions = new EncryptionOptions().applyConfig(); + EncryptionOptions.ClientEncryptionOptions encryptionOptions = new EncryptionOptions.ClientEncryptionOptions().applyConfig(); System.out.println("CQL binary protocol console " + host + "@" + port + " using native protocol version " + version); try (Client client = new Client(host, port, version, encryptionOptions)) diff --git a/src/java/org/apache/cassandra/transport/Dispatcher.java b/src/java/org/apache/cassandra/transport/Dispatcher.java index f701434d0068..6a3818482904 100644 --- a/src/java/org/apache/cassandra/transport/Dispatcher.java +++ b/src/java/org/apache/cassandra/transport/Dispatcher.java @@ -151,6 +151,11 @@ public static RequestTime forImmediateExecution() return new RequestTime(MonotonicClock.Global.preciseTime.now()); } + public RequestTime withStartedAt(long startedAtNanos) + { + return new RequestTime(enqueuedAtNanos, startedAtNanos); + } + public long startedAtNanos() { return startedAtNanos; @@ -426,7 +431,7 @@ private static Message.Response processRequest(ServerConnection connection, Mess connection.applyStateTransition(request.type, response.type); return response; } - + /** * Note: this method may be executed on the netty event loop. */ diff --git a/src/java/org/apache/cassandra/transport/SimpleClient.java b/src/java/org/apache/cassandra/transport/SimpleClient.java index f86b128be0ff..dea3535757f6 100644 --- a/src/java/org/apache/cassandra/transport/SimpleClient.java +++ b/src/java/org/apache/cassandra/transport/SimpleClient.java @@ -74,7 +74,7 @@ public class SimpleClient implements Closeable public final String host; public final int port; - private final EncryptionOptions encryptionOptions; + private final EncryptionOptions.ClientEncryptionOptions encryptionOptions; private final int largeMessageThreshold; protected final ResponseHandler responseHandler = new ResponseHandler(); @@ -92,7 +92,7 @@ public static class Builder { private final String host; private final int port; - private EncryptionOptions encryptionOptions = new EncryptionOptions(); + private EncryptionOptions.ClientEncryptionOptions encryptionOptions = new EncryptionOptions.ClientEncryptionOptions(); private ProtocolVersion version = ProtocolVersion.CURRENT; private boolean useBeta = false; private int largeMessageThreshold = FrameEncoder.Payload.MAX_SIZE; @@ -103,7 +103,7 @@ private Builder(String host, int port) this.port = port; } - public Builder encryption(EncryptionOptions options) + public Builder encryption(EncryptionOptions.ClientEncryptionOptions options) { this.encryptionOptions = options; return this; @@ -149,22 +149,22 @@ private SimpleClient(Builder builder) this.largeMessageThreshold = builder.largeMessageThreshold; } - public SimpleClient(String host, int port, ProtocolVersion version, EncryptionOptions encryptionOptions) + public SimpleClient(String host, int port, ProtocolVersion version, EncryptionOptions.ClientEncryptionOptions encryptionOptions) { this(host, port, version, false, encryptionOptions); } - public SimpleClient(String host, int port, EncryptionOptions encryptionOptions) + public SimpleClient(String host, int port, EncryptionOptions.ClientEncryptionOptions encryptionOptions) { this(host, port, ProtocolVersion.CURRENT, encryptionOptions); } public SimpleClient(String host, int port, ProtocolVersion version) { - this(host, port, version, new EncryptionOptions()); + this(host, port, version, new EncryptionOptions.ClientEncryptionOptions()); } - public SimpleClient(String host, int port, ProtocolVersion version, boolean useBeta, EncryptionOptions encryptionOptions) + public SimpleClient(String host, int port, ProtocolVersion version, boolean useBeta, EncryptionOptions.ClientEncryptionOptions encryptionOptions) { this.host = host; this.port = port; @@ -172,7 +172,7 @@ public SimpleClient(String host, int port, ProtocolVersion version, boolean useB throw new IllegalArgumentException(String.format("Beta version of server used (%s), but USE_BETA flag is not set", version)); this.version = version; - this.encryptionOptions = new EncryptionOptions(encryptionOptions).applyConfig(); + this.encryptionOptions = encryptionOptions.applyConfig(); this.largeMessageThreshold = FrameEncoder.Payload.MAX_SIZE - Math.max(FrameEncoderCrc.HEADER_AND_TRAILER_LENGTH, FrameEncoderLZ4.HEADER_AND_TRAILER_LENGTH); @@ -180,7 +180,7 @@ public SimpleClient(String host, int port, ProtocolVersion version, boolean useB public SimpleClient(String host, int port) { - this(host, port, new EncryptionOptions()); + this(host, port, new EncryptionOptions.ClientEncryptionOptions()); } public SimpleClient connect(boolean useCompression) throws IOException diff --git a/src/java/org/apache/cassandra/transport/messages/BatchMessage.java b/src/java/org/apache/cassandra/transport/messages/BatchMessage.java index 8a87efc781fa..d45105f109ce 100644 --- a/src/java/org/apache/cassandra/transport/messages/BatchMessage.java +++ b/src/java/org/apache/cassandra/transport/messages/BatchMessage.java @@ -188,7 +188,7 @@ protected Message.Response execute(QueryState state, Dispatcher.RequestTime requ { p = QueryProcessor.parseAndPrepare((String) query, state.getClientState().cloneWithKeyspaceIfSet(options.getKeyspace()), - false); + false, false); } else { diff --git a/src/java/org/apache/cassandra/triggers/TriggerExecutor.java b/src/java/org/apache/cassandra/triggers/TriggerExecutor.java index 1a0d39e1adb4..c3fe269559e6 100644 --- a/src/java/org/apache/cassandra/triggers/TriggerExecutor.java +++ b/src/java/org/apache/cassandra/triggers/TriggerExecutor.java @@ -24,8 +24,8 @@ import com.google.common.collect.ArrayListMultimap; import com.google.common.collect.Iterables; -import com.google.common.collect.Lists; import com.google.common.collect.ListMultimap; +import com.google.common.collect.Lists; import com.google.common.collect.Maps; import org.slf4j.Logger; @@ -34,7 +34,10 @@ import org.apache.cassandra.config.Config; import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.cql3.QueryProcessor; -import org.apache.cassandra.db.*; +import org.apache.cassandra.db.CounterMutation; +import org.apache.cassandra.db.DecoratedKey; +import org.apache.cassandra.db.IMutation; +import org.apache.cassandra.db.Mutation; import org.apache.cassandra.db.partitions.PartitionUpdate; import org.apache.cassandra.exceptions.CassandraException; import org.apache.cassandra.exceptions.InvalidRequestException; @@ -120,7 +123,7 @@ public PartitionUpdate execute(PartitionUpdate updates) throws InvalidRequestExc * @throws InvalidRequestException if additional mutations were generated, but * the initial mutations contains counter updates */ - public Collection execute(Collection mutations) throws InvalidRequestException + public List execute(Collection mutations) throws InvalidRequestException { boolean hasCounters = false; List augmentedMutations = null; @@ -156,7 +159,7 @@ public Collection execute(Collection mutations) t return mergeMutations(Iterables.concat(originalMutations, augmentedMutations)); } - private Collection mergeMutations(Iterable mutations) + private List mergeMutations(Iterable mutations) { ListMultimap, Mutation> groupedMutations = ArrayListMultimap.create(); diff --git a/src/java/org/apache/cassandra/utils/AbstractBiMultiValMap.java b/src/java/org/apache/cassandra/utils/AbstractBiMultiValMap.java new file mode 100644 index 000000000000..e29427d386a5 --- /dev/null +++ b/src/java/org/apache/cassandra/utils/AbstractBiMultiValMap.java @@ -0,0 +1,134 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.utils; + +import java.util.Collection; +import java.util.Map; +import java.util.Objects; +import java.util.Set; + +import com.google.common.collect.Multimap; +import com.google.common.collect.Multimaps; + +public abstract class AbstractBiMultiValMap implements Map +{ + protected abstract Map forwardDelegate(); + protected abstract Multimap reverseDelegate(); + + public Multimap inverse() + { + return Multimaps.unmodifiableMultimap(reverseDelegate()); + } + + public void clear() + { + forwardDelegate().clear(); + reverseDelegate().clear(); + } + + public boolean containsKey(Object key) + { + return forwardDelegate().containsKey(key); + } + + public boolean containsValue(Object value) + { + return reverseDelegate().containsKey(value); + } + + public Set> entrySet() + { + return forwardDelegate().entrySet(); + } + + public V get(Object key) + { + return forwardDelegate().get(key); + } + + public boolean isEmpty() + { + return forwardDelegate().isEmpty(); + } + + public Set keySet() + { + return forwardDelegate().keySet(); + } + + public V put(K key, V value) + { + V oldVal = forwardDelegate().put(key, value); + if (oldVal != null) + reverseDelegate().remove(oldVal, key); + reverseDelegate().put(value, key); + return oldVal; + } + + public void putAll(Map m) + { + for (Map.Entry entry : m.entrySet()) + put(entry.getKey(), entry.getValue()); + } + + public V remove(Object key) + { + V oldVal = forwardDelegate().remove(key); + reverseDelegate().remove(oldVal, key); + return oldVal; + } + + public Collection removeValue(V value) + { + Collection keys = reverseDelegate().removeAll(value); + for (K key : keys) + forwardDelegate().remove(key); + return keys; + } + + public int size() + { + return forwardDelegate().size(); + } + + public Collection values() + { + return reverseDelegate().keys(); + } + + public Collection valueSet() + { + return reverseDelegate().keySet(); + } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (!(o instanceof AbstractBiMultiValMap)) return false; + AbstractBiMultiValMap that = (AbstractBiMultiValMap) o; + return forwardDelegate().equals(that.forwardDelegate()) && reverseDelegate().equals(that.reverseDelegate()); + } + + @Override + public int hashCode() + { + return Objects.hash(forwardDelegate(), reverseDelegate()); + } +} diff --git a/src/java/org/apache/cassandra/utils/ArraySerializers.java b/src/java/org/apache/cassandra/utils/ArraySerializers.java new file mode 100644 index 000000000000..cad27d68fe1d --- /dev/null +++ b/src/java/org/apache/cassandra/utils/ArraySerializers.java @@ -0,0 +1,130 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.utils; + +import java.io.IOException; +import java.util.function.IntFunction; + +import org.apache.cassandra.io.AsymmetricVersionedSerializer; +import org.apache.cassandra.io.IVersionedSerializer; +import org.apache.cassandra.io.ParameterisedVersionedSerializer; +import org.apache.cassandra.io.UnversionedSerializer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; + +import static org.apache.cassandra.db.TypeSizes.sizeofUnsignedVInt; + +public class ArraySerializers +{ + public static void serializeArray(T[] items, DataOutputPlus out, UnversionedSerializer serializer) throws IOException + { + out.writeUnsignedVInt32(items.length); + for (T item : items) + serializer.serialize(item, out); + } + + public static void serializeArray(T[] items, DataOutputPlus out, int version, IVersionedSerializer serializer) throws IOException + { + out.writeUnsignedVInt32(items.length); + for (T item : items) + serializer.serialize(item, out, version); + } + + public static void serializeArray(T[] items, DataOutputPlus out, Version version, AsymmetricVersionedSerializer serializer) throws IOException + { + out.writeUnsignedVInt32(items.length); + for (T item : items) + serializer.serialize(item, out, version); + } + + public static void serializeArray(T[] items, P p, DataOutputPlus out, Version version, ParameterisedVersionedSerializer serializer) throws IOException + { + out.writeUnsignedVInt32(items.length); + for (T item : items) + serializer.serialize(item, p, out, version); + } + + public static T[] deserializeArray(DataInputPlus in, UnversionedSerializer serializer, IntFunction arrayFactory) throws IOException + { + int size = in.readUnsignedVInt32(); + T[] items = arrayFactory.apply(size); + for (int i = 0; i < size; i++) + items[i] = serializer.deserialize(in); + return items; + } + + public static T[] deserializeArray(DataInputPlus in, int version, IVersionedSerializer serializer, IntFunction arrayFactory) throws IOException + { + int size = in.readUnsignedVInt32(); + T[] items = arrayFactory.apply(size); + for (int i = 0; i < size; i++) + items[i] = serializer.deserialize(in, version); + return items; + } + + public static T[] deserializeArray(DataInputPlus in, Version version, AsymmetricVersionedSerializer serializer, IntFunction arrayFactory) throws IOException + { + int size = in.readUnsignedVInt32(); + T[] items = arrayFactory.apply(size); + for (int i = 0; i < size; i++) + items[i] = serializer.deserialize(in, version); + return items; + } + + public static T[] deserializeArray(P p, DataInputPlus in, Version version, ParameterisedVersionedSerializer serializer, IntFunction arrayFactory) throws IOException + { + int size = in.readUnsignedVInt32(); + T[] items = arrayFactory.apply(size); + for (int i = 0; i < size; i++) + items[i] = serializer.deserialize(p, in, version); + return items; + } + + public static long serializedArraySize(T[] array, UnversionedSerializer serializer) + { + long size = sizeofUnsignedVInt(array.length); + for (T item : array) + size += serializer.serializedSize(item); + return size; + } + + public static long serializedArraySize(T[] array, int version, IVersionedSerializer serializer) + { + long size = sizeofUnsignedVInt(array.length); + for (T item : array) + size += serializer.serializedSize(item, version); + return size; + } + + public static long serializedArraySize(T[] array, Version version, AsymmetricVersionedSerializer serializer) + { + long size = sizeofUnsignedVInt(array.length); + for (T item : array) + size += serializer.serializedSize(item, version); + return size; + } + + public static long serializedArraySize(T[] array, P p, Version version, ParameterisedVersionedSerializer serializer) + { + long size = sizeofUnsignedVInt(array.length); + for (T item : array) + size += serializer.serializedSize(item, p, version); + return size; + } +} diff --git a/src/java/org/apache/cassandra/utils/Backoff.java b/src/java/org/apache/cassandra/utils/Backoff.java deleted file mode 100644 index 7974dbf346fb..000000000000 --- a/src/java/org/apache/cassandra/utils/Backoff.java +++ /dev/null @@ -1,133 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.cassandra.utils; - -import java.util.concurrent.TimeUnit; -import java.util.function.DoubleSupplier; - -import org.apache.cassandra.config.RetrySpec; -import org.apache.cassandra.repair.SharedContext; -import org.apache.cassandra.tcm.Retry; - -public interface Backoff -{ - boolean mayRetry(int attempt); - long computeWaitTime(int attempt); - TimeUnit unit(); - - static Backoff fromRetry(Retry retry) - { - return new Backoff() - { - @Override - public boolean mayRetry(int attempt) - { - return !retry.reachedMax(); - } - - @Override - public long computeWaitTime(int retryCount) - { - return retry.computeSleepFor(); - } - - @Override - public TimeUnit unit() - { - return TimeUnit.MILLISECONDS; - } - }; - } - - static Backoff fromConfig(SharedContext ctx, RetrySpec spec) - { - if (!spec.isEnabled()) - return Backoff.None.INSTANCE; - return new Backoff.ExponentialBackoff(spec.maxAttempts.value, spec.baseSleepTime.toMilliseconds(), spec.maxSleepTime.toMilliseconds(), ctx.random().get()::nextDouble); - } - - enum None implements Backoff - { - INSTANCE; - - @Override - public boolean mayRetry(int attempt) - { - return false; - } - - @Override - public long computeWaitTime(int retryCount) - { - throw new UnsupportedOperationException(); - } - - @Override - public TimeUnit unit() - { - throw new UnsupportedOperationException(); - } - } - - class ExponentialBackoff implements Backoff - { - private final int maxAttempts; - private final long baseSleepTimeMillis; - private final long maxSleepMillis; - private final DoubleSupplier randomSource; - - public ExponentialBackoff(int maxAttempts, long baseSleepTimeMillis, long maxSleepMillis, DoubleSupplier randomSource) - { - this.maxAttempts = maxAttempts; - this.baseSleepTimeMillis = baseSleepTimeMillis; - this.maxSleepMillis = maxSleepMillis; - this.randomSource = randomSource; - } - - public int maxAttempts() - { - return maxAttempts; - } - - @Override - public boolean mayRetry(int attempt) - { - return attempt < maxAttempts; - } - - @Override - public long computeWaitTime(int retryCount) - { - long baseTimeMillis = baseSleepTimeMillis * (1L << retryCount); - // it's possible that this overflows, so fall back to max; - if (baseTimeMillis <= 0) - baseTimeMillis = maxSleepMillis; - // now make sure this is capped to target max - baseTimeMillis = Math.min(baseTimeMillis, maxSleepMillis); - - return (long) (baseTimeMillis * (randomSource.getAsDouble() + 0.5)); - } - - @Override - public TimeUnit unit() - { - return TimeUnit.MILLISECONDS; - } - } -} diff --git a/src/java/org/apache/cassandra/utils/BiMultiValMap.java b/src/java/org/apache/cassandra/utils/BiMultiValMap.java index f439c5c496fd..2859e6964bc2 100644 --- a/src/java/org/apache/cassandra/utils/BiMultiValMap.java +++ b/src/java/org/apache/cassandra/utils/BiMultiValMap.java @@ -17,15 +17,11 @@ */ package org.apache.cassandra.utils; -import java.util.Collection; import java.util.HashMap; import java.util.Map; -import java.util.Objects; -import java.util.Set; import com.google.common.collect.HashMultimap; import com.google.common.collect.Multimap; -import com.google.common.collect.Multimaps; /** * @@ -35,7 +31,7 @@ * @param * @param */ -public class BiMultiValMap implements Map +public class BiMultiValMap extends AbstractBiMultiValMap { protected final Map forwardMap; protected final Multimap reverseMap; @@ -59,104 +55,15 @@ public BiMultiValMap(BiMultiValMap map) reverseMap.putAll(map.inverse()); } - public Multimap inverse() - { - return Multimaps.unmodifiableMultimap(reverseMap); - } - - public void clear() - { - forwardMap.clear(); - reverseMap.clear(); - } - - public boolean containsKey(Object key) - { - return forwardMap.containsKey(key); - } - - public boolean containsValue(Object value) - { - return reverseMap.containsKey(value); - } - - public Set> entrySet() - { - return forwardMap.entrySet(); - } - - public V get(Object key) - { - return forwardMap.get(key); - } - - public boolean isEmpty() - { - return forwardMap.isEmpty(); - } - - public Set keySet() - { - return forwardMap.keySet(); - } - - public V put(K key, V value) - { - V oldVal = forwardMap.put(key, value); - if (oldVal != null) - reverseMap.remove(oldVal, key); - reverseMap.put(value, key); - return oldVal; - } - - public void putAll(Map m) - { - for (Map.Entry entry : m.entrySet()) - put(entry.getKey(), entry.getValue()); - } - - public V remove(Object key) - { - V oldVal = forwardMap.remove(key); - reverseMap.remove(oldVal, key); - return oldVal; - } - - public Collection removeValue(V value) - { - Collection keys = reverseMap.removeAll(value); - for (K key : keys) - forwardMap.remove(key); - return keys; - } - - public int size() - { - return forwardMap.size(); - } - - public Collection values() - { - return reverseMap.keys(); - } - - public Collection valueSet() - { - return reverseMap.keySet(); - } - @Override - public boolean equals(Object o) + protected Map forwardDelegate() { - if (this == o) return true; - if (!(o instanceof BiMultiValMap)) return false; - BiMultiValMap that = (BiMultiValMap) o; - return forwardMap.equals(that.forwardMap) && reverseMap.equals(that.reverseMap); + return forwardMap; } @Override - public int hashCode() + protected Multimap reverseDelegate() { - return Objects.hash(forwardMap, reverseMap); + return reverseMap; } } diff --git a/src/java/org/apache/cassandra/utils/Blocking.java b/src/java/org/apache/cassandra/utils/Blocking.java new file mode 100644 index 000000000000..e04e53b090e2 --- /dev/null +++ b/src/java/org/apache/cassandra/utils/Blocking.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.utils; + +import java.util.concurrent.TimeUnit; + +public interface Blocking +{ + default void sleep(long millis) throws InterruptedException + { + sleep(millis, TimeUnit.MILLISECONDS); + } + + void sleep(long value, TimeUnit unit) throws InterruptedException; + + enum Default implements Blocking + { + instance; + + @Override + public void sleep(long value, TimeUnit unit) throws InterruptedException + { + unit.sleep(value); + } + } +} diff --git a/src/java/org/apache/cassandra/utils/BooleanSerializer.java b/src/java/org/apache/cassandra/utils/BooleanSerializer.java index 1fe77020682d..f130edea155d 100644 --- a/src/java/org/apache/cassandra/utils/BooleanSerializer.java +++ b/src/java/org/apache/cassandra/utils/BooleanSerializer.java @@ -20,24 +20,26 @@ import java.io.IOException; import org.apache.cassandra.io.IVersionedSerializer; +import org.apache.cassandra.io.UnversionedSerializer; import org.apache.cassandra.io.util.DataInputPlus; import org.apache.cassandra.io.util.DataOutputPlus; -public class BooleanSerializer implements IVersionedSerializer +public class BooleanSerializer implements UnversionedSerializer { public static BooleanSerializer serializer = new BooleanSerializer(); + public static IVersionedSerializer messagingSerializer = IVersionedSerializer.from(serializer); - public void serialize(Boolean b, DataOutputPlus out, int version) throws IOException + public void serialize(Boolean b, DataOutputPlus out) throws IOException { out.writeBoolean(b); } - public Boolean deserialize(DataInputPlus in, int version) throws IOException + public Boolean deserialize(DataInputPlus in) throws IOException { return in.readBoolean(); } - public long serializedSize(Boolean aBoolean, int version) + public long serializedSize(Boolean aBoolean) { return 1; } diff --git a/src/java/org/apache/cassandra/utils/ByteBufferUtil.java b/src/java/org/apache/cassandra/utils/ByteBufferUtil.java index 73725a6ce3df..6bdea57d681a 100644 --- a/src/java/org/apache/cassandra/utils/ByteBufferUtil.java +++ b/src/java/org/apache/cassandra/utils/ByteBufferUtil.java @@ -45,6 +45,7 @@ import java.util.UUID; import java.util.stream.Collectors; +import net.nicoulaj.compilecommand.annotations.DontInline; import net.nicoulaj.compilecommand.annotations.Inline; import org.apache.cassandra.db.TypeSizes; import org.apache.cassandra.db.marshal.BooleanType; @@ -53,6 +54,7 @@ import org.apache.cassandra.db.marshal.MapType; import org.apache.cassandra.db.marshal.SetType; import org.apache.cassandra.db.marshal.TimestampType; +import org.apache.cassandra.io.UnversionedSerializer; import org.apache.cassandra.io.compress.BufferType; import org.apache.cassandra.io.util.DataInputPlus; import org.apache.cassandra.io.util.DataOutputPlus; @@ -102,6 +104,8 @@ public class ByteBufferUtil /** Represents an unset value in bound variables */ public static final ByteBuffer UNSET_BYTE_BUFFER = ByteBuffer.wrap(new byte[]{}); + public static final long EMPTY_SIZE_ON_HEAP = ObjectSizes.measureDeep(ByteBufferUtil.EMPTY_BYTE_BUFFER); + public static final ByteBuffer[] EMPTY_ARRAY = new ByteBuffer[0]; @Inline @@ -214,6 +218,25 @@ public static byte[] getArray(ByteBuffer buffer, int position, int length) return bytes; } + /** + * You should almost never use this. Instead, use the write* methods to avoid copies. + */ + public static byte[] getArrayUnsafe(ByteBuffer buffer) + { + return getArrayUnsafe(buffer, buffer.position(), buffer.remaining()); + } + + /** + * You should almost never use this. Instead, use the write* methods to avoid copies. + */ + public static byte[] getArrayUnsafe(ByteBuffer buffer, int position, int length) + { + if (buffer.hasArray() && position == 0 && buffer.arrayOffset() == 0 && length == buffer.capacity()) + return buffer.array(); + + return getArray(buffer, position, length); + } + /** * ByteBuffer adaptation of org.apache.commons.lang3.ArrayUtils.lastIndexOf method * @@ -369,6 +392,17 @@ public static void writeWithVIntLength(ByteBuffer bytes, DataOutputPlus out) thr out.writeUnsignedVInt32(bytes.remaining()); out.write(bytes); } + public static void writeWithVIntLengthAndNull(ByteBuffer bytes, DataOutputPlus out) throws IOException + { + if (bytes == null) + { + out.writeVInt32(-1); + return; + } + + out.writeVInt32(bytes.remaining()); + out.write(bytes); + } public static void writeWithShortLength(ByteBuffer buffer, DataOutputPlus out) throws IOException { @@ -399,16 +433,15 @@ public static ByteBuffer readWithVIntLength(DataInputPlus in) throws IOException return ByteBufferUtil.read(in, length); } - public static int serializedSizeWithLength(ByteBuffer buffer) + public static int serializedSizeWithVIntLength(ByteBuffer buffer) { int size = buffer.remaining(); - return TypeSizes.sizeof(size) + size; + return TypeSizes.sizeofUnsignedVInt(size) + size; } - public static int serializedSizeWithVIntLength(ByteBuffer buffer) + public static long estimatedSizeOnHeap(ByteBuffer buffer) { - int size = buffer.remaining(); - return TypeSizes.sizeofUnsignedVInt(size) + size; + return EMPTY_SIZE_ON_HEAP + buffer.remaining(); } public static void skipWithVIntLength(DataInputPlus in) throws IOException @@ -752,7 +785,7 @@ public static ByteBuffer bytes(UUID uuid) public static ByteBuffer bytes(TimeUUID uuid) { - return bytes(uuid.asUUID()); + return ByteBuffer.wrap(UUIDGen.decompose(uuid.msb(), uuid.lsb())); } // Returns whether {@code prefix} is a prefix of {@code value}. @@ -985,4 +1018,125 @@ public static void readFully(FileChannel channel, ByteBuffer dst, long position) position += read; } } + + public static void writeLeastSignificantBytes(long register, int bytes, ByteBuffer out) + { + writeMostSignificantBytes(register << ((8 - bytes)*8), bytes, out); + } + + public static void writeMostSignificantBytes(long register, int bytes, ByteBuffer out) + { + int position = out.position(); + int limit = out.limit(); + if (limit - position < Long.BYTES) + { + writeMostSignificantBytesSlow(register, bytes, out); + } + else + { + out.putLong(position, register); + out.position(position + bytes); + } + } + + @DontInline + private static void writeMostSignificantBytesSlow(long register, int bytes, ByteBuffer out) + { + switch (bytes) + { + case 0: + break; + case 1: + out.put((byte)(register >>> 56)); + break; + case 2: + out.putShort((short)(register >> 48)); + break; + case 3: + out.putShort((short)(register >> 48)); + out.put((byte)(register >> 40)); + break; + case 4: + out.putInt((int)(register >> 32)); + break; + case 5: + out.putInt((int)(register >> 32)); + out.put((byte)(register >> 24)); + break; + case 6: + out.putInt((int)(register >> 32)); + out.putShort((short)(register >> 16)); + break; + case 7: + out.putInt((int)(register >> 32)); + out.putShort((short)(register >> 16)); + out.put((byte)(register >> 8)); + break; + case 8: + out.putLong(register); + break; + default: + throw new IllegalArgumentException(); + } + } + + public static long readLeastSignificantBytes(int bytes, ByteBuffer in) + { + if (bytes == 0) + return 0L; + + int position = in.position(); + int limit = in.limit(); + if (limit - position < Long.BYTES) + { + return readLeastSignificantBytesSlow(bytes, in); + } + else + { + long result = in.getLong(position); + in.position(position + bytes); + return result >>> (64 - 8*bytes); + } + } + + @DontInline + private static long readLeastSignificantBytesSlow(int bytes, ByteBuffer out) + { + switch (bytes) + { + case 0: return 0; + case 1: return out.get() & 0xffL; + case 2: return out.getShort() & 0xffffL; + case 3: return ((out.getShort() & 0xffffL) << 8) | (out.get() & 0xffL); + case 4: return out.getInt() & 0xffffffffL; + case 5: return ((out.getInt() & 0xffffffffL) << 8) | (out.get() & 0xffL); + case 6: return ((out.getInt() & 0xffffffffL) << 16) | (out.getShort() & 0xffffL); + case 7: return ((out.getInt() & 0xffffffffL) << 24) | ((out.getShort() & 0xffffL) << 8) | (out.get() & 0xffL); + case 8: return out.getLong(); + default: throw new IllegalArgumentException(); + } + } + + public static final UnversionedSerializer byteBufferSerializer = new UnversionedSerializer() + { + @Override + public void serialize(ByteBuffer bytes, DataOutputPlus out) throws IOException + { + writeWithVIntLength(bytes, out); + } + + @Override + public ByteBuffer deserialize(DataInputPlus in) throws IOException + { + return readWithVIntLength(in); + } + + @Override + public long serializedSize(ByteBuffer bytes) + { + return serializedSizeWithVIntLength(bytes); + } + }; + + public static final UnversionedSerializer nullableByteBufferSerializer = NullableSerializer.wrap(byteBufferSerializer); } \ No newline at end of file diff --git a/src/java/org/apache/cassandra/utils/CastingSerializer.java b/src/java/org/apache/cassandra/utils/CastingSerializer.java new file mode 100644 index 000000000000..a180aa24e328 --- /dev/null +++ b/src/java/org/apache/cassandra/utils/CastingSerializer.java @@ -0,0 +1,109 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.utils; + +import java.io.IOException; + +import org.apache.cassandra.io.UnversionedSerializer; +import org.apache.cassandra.io.VersionedSerializer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; + +/** + * Utility for serializing/deserializing from/into generic interface fields where we know (and require) the + * generic fields to be implementation specific classes + */ +public class CastingSerializer +{ + public static VersionedSerializer create(Class specificClass, VersionedSerializer specificSerializer) + { + return new Versioned<>(specificClass, specificSerializer); + } + + public static UnversionedSerializer create(Class specificClass, UnversionedSerializer specificSerializer) + { + return new Unversioned<>(specificClass, specificSerializer); + } + + private static final class Versioned implements VersionedSerializer + { + private final Class specificClass; + private final VersionedSerializer specificSerializer; + + private Versioned(Class specificClass, VersionedSerializer specificSerializer) + { + this.specificClass = specificClass; + this.specificSerializer = specificSerializer; + } + + @Override + public void serialize(Generic generic, DataOutputPlus out, Version version) throws IOException + { + specificSerializer.serialize(specificClass.cast(generic), out, version); + } + + @Override + public Generic deserialize(DataInputPlus in, Version version) throws IOException + { + Generic result = specificSerializer.deserialize(in, version); + if (result != null && !specificClass.isInstance(result)) + throw new IllegalStateException("Expected instance of " + specificClass.getName()); + return result; + } + + @Override + public long serializedSize(Generic generic, Version version) + { + return specificSerializer.serializedSize(specificClass.cast(generic), version); + } + } + + private static final class Unversioned implements UnversionedSerializer + { + private final Class specificClass; + private final UnversionedSerializer specificSerializer; + + private Unversioned(Class specificClass, UnversionedSerializer specificSerializer) + { + this.specificClass = specificClass; + this.specificSerializer = specificSerializer; + } + + @Override + public void serialize(Generic generic, DataOutputPlus out) throws IOException + { + specificSerializer.serialize(specificClass.cast(generic), out); + } + + @Override + public Generic deserialize(DataInputPlus in) throws IOException + { + Generic result = specificSerializer.deserialize(in); + if (result != null && !specificClass.isInstance(result)) + throw new IllegalStateException("Expected instance of " + specificClass.getName()); + return result; + } + + @Override + public long serializedSize(Generic generic) + { + return specificSerializer.serializedSize(specificClass.cast(generic)); + } + } +} diff --git a/src/java/org/apache/cassandra/utils/Clock.java b/src/java/org/apache/cassandra/utils/Clock.java index c8ba785cab9f..8c3fb609c2b6 100644 --- a/src/java/org/apache/cassandra/utils/Clock.java +++ b/src/java/org/apache/cassandra/utils/Clock.java @@ -107,6 +107,14 @@ public static long currentTimeMillis() { return instance.currentTimeMillis(); } + + /** + * Semantically equivalent to {@link FBUtilities#nowInSeconds()} + */ + public static long nowInSeconds() + { + return instance.nowInSeconds(); + } } public static class Default implements Clock diff --git a/src/java/org/apache/cassandra/utils/CloseableIterator.java b/src/java/org/apache/cassandra/utils/CloseableIterator.java index 32de799ba93f..634629f4bed1 100644 --- a/src/java/org/apache/cassandra/utils/CloseableIterator.java +++ b/src/java/org/apache/cassandra/utils/CloseableIterator.java @@ -66,5 +66,4 @@ public T next() } }; } - } diff --git a/src/java/org/apache/cassandra/utils/CollectionSerializer.java b/src/java/org/apache/cassandra/utils/CollectionSerializer.java deleted file mode 100644 index 9de64509bb26..000000000000 --- a/src/java/org/apache/cassandra/utils/CollectionSerializer.java +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.cassandra.utils; - -import java.io.IOException; -import java.util.Collection; -import java.util.Collections; -import java.util.List; -import java.util.Map; -import java.util.RandomAccess; -import java.util.Set; -import java.util.function.IntFunction; - -import com.google.common.collect.Maps; -import com.google.common.collect.Sets; - -import org.apache.cassandra.db.TypeSizes; -import org.apache.cassandra.io.IVersionedSerializer; -import org.apache.cassandra.io.util.DataInputPlus; -import org.apache.cassandra.io.util.DataOutputPlus; - -public class CollectionSerializer -{ - - public static void serializeCollection(IVersionedSerializer valueSerializer, Collection values, DataOutputPlus out, int version) throws IOException - { - out.writeUnsignedVInt32(values.size()); - for (V value : values) - valueSerializer.serialize(value, out, version); - } - - public static & RandomAccess> void serializeList(IVersionedSerializer valueSerializer, L values, DataOutputPlus out, int version) throws IOException - { - int size = values.size(); - out.writeUnsignedVInt32(size); - for (int i = 0 ; i < size ; ++i) - valueSerializer.serialize(values.get(i), out, version); - } - - public static void serializeMap(IVersionedSerializer keySerializer, IVersionedSerializer valueSerializer, Map map, DataOutputPlus out, int version) throws IOException - { - out.writeUnsignedVInt32(map.size()); - for (Map.Entry e : map.entrySet()) - { - keySerializer.serialize(e.getKey(), out, version); - valueSerializer.serialize(e.getValue(), out, version); - } - } - - public static > C deserializeCollection(IVersionedSerializer serializer, IntFunction factory, DataInputPlus in, int version) throws IOException - { - int size = in.readUnsignedVInt32(); - C result = factory.apply(size); - while (size-- > 0) - result.add(serializer.deserialize(in, version)); - return result; - } - - public static > M deserializeMap(IVersionedSerializer keySerializer, IVersionedSerializer valueSerializer, IntFunction factory, DataInputPlus in, int version) throws IOException - { - int size = in.readUnsignedVInt32(); - M result = factory.apply(size); - while (size-- > 0) - { - K key = keySerializer.deserialize(in, version); - V value = valueSerializer.deserialize(in, version); - result.put(key, value); - } - return result; - } - - public static long serializedSizeCollection(IVersionedSerializer valueSerializer, Collection values, int version) - { - long size = TypeSizes.sizeofUnsignedVInt(values.size()); - for (V value : values) - size += valueSerializer.serializedSize(value, version); - return size; - } - - public static & RandomAccess> long serializedSizeList(IVersionedSerializer valueSerializer, L values, int version) throws IOException - { - int items = values.size(); - long size = TypeSizes.sizeofUnsignedVInt(items); - for (int i = 0 ; i < items ; ++i) - size += valueSerializer.serializedSize(values.get(i), version); - return size; - } - - - public static long serializedSizeMap(IVersionedSerializer keySerializer, IVersionedSerializer valueSerializer, Map map, int version) - { - long size = TypeSizes.sizeofUnsignedVInt(map.size()); - for (Map.Entry e : map.entrySet()) - size += keySerializer.serializedSize(e.getKey(), version) - + valueSerializer.serializedSize(e.getValue(), version); - return size; - } - - public static IntFunction> newHashSet() - { - return i -> i == 0 ? Collections.emptySet() : Sets.newHashSetWithExpectedSize(i); - } - - public static IntFunction> newHashMap() - { - return i -> i == 0 ? Collections.emptyMap() : Maps.newHashMapWithExpectedSize(i); - } - -} diff --git a/src/java/org/apache/cassandra/utils/CollectionSerializers.java b/src/java/org/apache/cassandra/utils/CollectionSerializers.java new file mode 100644 index 000000000000..81173bdf1e4c --- /dev/null +++ b/src/java/org/apache/cassandra/utils/CollectionSerializers.java @@ -0,0 +1,687 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.utils; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.function.IntFunction; +import javax.annotation.Nonnull; + +import com.google.common.collect.Maps; +import com.google.common.collect.Sets; + +import accord.utils.SortedArrays.SortedArrayList; +import org.apache.cassandra.dht.IPartitioner; +import org.apache.cassandra.dht.IPartitionerDependentSerializer; +import org.apache.cassandra.io.AsymmetricParameterisedUnversionedSerializer; +import org.apache.cassandra.io.AsymmetricParameterisedVersionedSerializer; +import org.apache.cassandra.io.AsymmetricVersionedSerializer; +import org.apache.cassandra.io.IVersionedSerializer; +import org.apache.cassandra.io.ParameterisedUnversionedSerializer; +import org.apache.cassandra.io.ParameterisedVersionedSerializer; +import org.apache.cassandra.io.UnversionedSerializer; +import org.apache.cassandra.io.VersionedSerializer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.tcm.serialization.MetadataSerializer; +import org.apache.cassandra.tcm.serialization.Version; + +import static org.apache.cassandra.db.TypeSizes.sizeofUnsignedVInt; + +public class CollectionSerializers +{ + public static void serializeCollection(Collection values, DataOutputPlus out, UnversionedSerializer valueSerializer) throws IOException + { + out.writeUnsignedVInt32(values.size()); + for (V value : values) + valueSerializer.serialize(value, out); + } + + public static void serializeCollection(Collection values, DataOutputPlus out, int version, IVersionedSerializer valueSerializer) throws IOException + { + out.writeUnsignedVInt32(values.size()); + for (V value : values) + valueSerializer.serialize(value, out, version); + } + + public static void serializeCollection(Collection values, DataOutputPlus out, Version version, AsymmetricVersionedSerializer valueSerializer) throws IOException + { + out.writeUnsignedVInt32(values.size()); + for (V value : values) + valueSerializer.serialize(value, out, version); + } + + public static void serializeCollection(Collection values, P p, DataOutputPlus out, Version version, AsymmetricParameterisedVersionedSerializer valueSerializer) throws IOException + { + out.writeUnsignedVInt32(values.size()); + for (V value : values) + valueSerializer.serialize(value, p, out, version); + } + + public static void serializeCollection(Collection values, P p, DataOutputPlus out, AsymmetricParameterisedUnversionedSerializer valueSerializer) throws IOException + { + out.writeUnsignedVInt32(values.size()); + for (V value : values) + valueSerializer.serialize(value, p, out); + } + + public static void serializeCollection(Collection values, DataOutputPlus out, Version version, MetadataSerializer valueSerializer) throws IOException + { + out.writeUnsignedVInt32(values.size()); + for (V value : values) + valueSerializer.serialize(value, out, version); + } + + public static void serializeCollection(Collection values, DataOutputPlus out, int version, IPartitionerDependentSerializer valueSerializer) throws IOException + { + out.writeUnsignedVInt32(values.size()); + for (V value : values) + valueSerializer.serialize(value, out, version); + } + + public static > void serializeList(L values, DataOutputPlus out, UnversionedSerializer valueSerializer) throws IOException + { + int size = values.size(); + out.writeUnsignedVInt32(size); + for (int i = 0 ; i < size ; ++i) + valueSerializer.serialize(values.get(i), out); + } + + public static > void serializeList(L values, P p, DataOutputPlus out, ParameterisedUnversionedSerializer valueSerializer) throws IOException + { + int size = values.size(); + out.writeUnsignedVInt32(size); + for (int i = 0 ; i < size ; ++i) + valueSerializer.serialize(values.get(i), p, out); + } + + public static > void serializeList(L values, P p, DataOutputPlus out, Version version, ParameterisedVersionedSerializer valueSerializer) throws IOException + { + int size = values.size(); + out.writeUnsignedVInt32(size); + for (int i = 0 ; i < size ; ++i) + valueSerializer.serialize(values.get(i), p, out, version); + } + + public static > void serializeList(L values, DataOutputPlus out, int version, IVersionedSerializer valueSerializer) throws IOException + { + int size = values.size(); + out.writeUnsignedVInt32(size); + for (int i = 0 ; i < size ; ++i) + valueSerializer.serialize(values.get(i), out, version); + } + + public static , Version> void serializeList(L values, DataOutputPlus out, Version version, AsymmetricVersionedSerializer valueSerializer) throws IOException + { + int size = values.size(); + out.writeUnsignedVInt32(size); + for (int i = 0 ; i < size ; ++i) + valueSerializer.serialize(values.get(i), out, version); + } + + public static > void serializeList(L values, DataOutputPlus out, Version version, MetadataSerializer valueSerializer) throws IOException + { + int size = values.size(); + out.writeUnsignedVInt32(size); + for (int i = 0 ; i < size ; ++i) + valueSerializer.serialize(values.get(i), out, version); + } + + public static void serializeMap(Map map, DataOutputPlus out, UnversionedSerializer keySerializer, UnversionedSerializer valueSerializer) throws IOException + { + out.writeUnsignedVInt32(map.size()); + for (Map.Entry e : map.entrySet()) + { + keySerializer.serialize(e.getKey(), out); + valueSerializer.serialize(e.getValue(), out); + } + } + + public static void serializeMap(Map map, DataOutputPlus out, int version, IVersionedSerializer keySerializer, IVersionedSerializer valueSerializer) throws IOException + { + out.writeUnsignedVInt32(map.size()); + for (Map.Entry e : map.entrySet()) + { + keySerializer.serialize(e.getKey(), out, version); + valueSerializer.serialize(e.getValue(), out, version); + } + } + + public static void serializeMap(Map map, DataOutputPlus out, int version, IVersionedSerializer keySerializer, IPartitionerDependentSerializer valueSerializer) throws IOException + { + out.writeUnsignedVInt32(map.size()); + for (Map.Entry e : map.entrySet()) + { + keySerializer.serialize(e.getKey(), out, version); + valueSerializer.serialize(e.getValue(), out, version); + } + } + + public static void serializeMap(Map map, DataOutputPlus out, Version version, AsymmetricVersionedSerializer keySerializer, AsymmetricVersionedSerializer valueSerializer) throws IOException + { + out.writeUnsignedVInt32(map.size()); + for (Map.Entry e : map.entrySet()) + { + keySerializer.serialize(e.getKey(), out, version); + valueSerializer.serialize(e.getValue(), out, version); + } + } + + public static void serializeMap(Map map, DataOutputPlus out, Version version, MetadataSerializer keySerializer, MetadataSerializer valueSerializer) throws IOException + { + out.writeUnsignedVInt32(map.size()); + for (Map.Entry e : map.entrySet()) + { + keySerializer.serialize(e.getKey(), out, version); + valueSerializer.serialize(e.getValue(), out, version); + } + } + + public static > SortedArrayList deserializeSortedArrayList(DataInputPlus in, UnversionedSerializer serializer, IntFunction allocator) throws IOException + { + int size = in.readUnsignedVInt32(); + V[] array = allocator.apply(size); + for (int i = 0 ; i < array.length ; ++i) + array[i] = serializer.deserialize(in); + return new SortedArrayList<>(array); + } + + public static List deserializeList(DataInputPlus in, UnversionedSerializer serializer) throws IOException + { + return deserializeCollection(in, serializer, newArrayList()); + } + + public static List deserializeList(DataInputPlus in, int version, IVersionedSerializer serializer) throws IOException + { + return deserializeCollection(in, version, serializer, newArrayList()); + } + + public static List deserializeList(DataInputPlus in, Version version, AsymmetricVersionedSerializer serializer) throws IOException + { + return deserializeCollection(in, version, serializer, newArrayList()); + } + + public static List deserializeList(P p, DataInputPlus in, Version version, AsymmetricParameterisedVersionedSerializer serializer) throws IOException + { + return deserializeCollection(p, in, version, serializer, newArrayList()); + } + + public static List deserializeList(P p, DataInputPlus in, AsymmetricParameterisedUnversionedSerializer serializer) throws IOException + { + return deserializeCollection(p, in, serializer, newArrayList()); + } + + public static List deserializeList(DataInputPlus in, Version version, MetadataSerializer serializer) throws IOException + { + return deserializeCollection(in, version, serializer, newArrayList()); + } + + public static List deserializeList(DataInputPlus in, IPartitioner partitioner, int version, IPartitionerDependentSerializer serializer) throws IOException + { + return deserializeCollection(in, partitioner, version, serializer, newArrayList()); + } + + public static Set deserializeSet(DataInputPlus in, UnversionedSerializer serializer) throws IOException + { + return deserializeCollection(in, serializer, newHashSet()); + } + + public static Set deserializeSet(DataInputPlus in, int version, IVersionedSerializer serializer) throws IOException + { + return deserializeCollection(in, version, serializer, newHashSet()); + } + + public static Set deserializeSet(DataInputPlus in, IPartitioner partitioner, int version, IPartitionerDependentSerializer serializer) throws IOException + { + return deserializeCollection(in, partitioner, version, serializer, newHashSet()); + } + + public static Set deserializeSet(DataInputPlus in, Version version, AsymmetricVersionedSerializer serializer) throws IOException + { + return deserializeCollection(in, version, serializer, newHashSet()); + } + + public static Set deserializeSet(DataInputPlus in, Version version, MetadataSerializer serializer) throws IOException + { + return deserializeCollection(in, version, serializer, newHashSet()); + } + + public static > M deserializeMap(DataInputPlus in, UnversionedSerializer keySerializer, UnversionedSerializer valueSerializer, IntFunction factory) throws IOException + { + int size = in.readUnsignedVInt32(); + M result = factory.apply(size); + while (size-- > 0) + { + K key = keySerializer.deserialize(in); + V value = valueSerializer.deserialize(in); + result.put(key, value); + } + return result; + } + + public static > M deserializeMap(DataInputPlus in, int version, IVersionedSerializer keySerializer, IVersionedSerializer valueSerializer, IntFunction factory) throws IOException + { + int size = in.readUnsignedVInt32(); + M result = factory.apply(size); + while (size-- > 0) + { + K key = keySerializer.deserialize(in, version); + V value = valueSerializer.deserialize(in, version); + result.put(key, value); + } + return result; + } + + public static Map deserializeMap(DataInputPlus in, IPartitioner partitioner, int version, IVersionedSerializer keySerializer, IPartitionerDependentSerializer valueSerializer, IntFunction> factory) throws IOException + { + int size = in.readUnsignedVInt32(); + Map result = factory.apply(size); + while (size-- > 0) + { + K key = keySerializer.deserialize(in, version); + V value = valueSerializer.deserialize(in, partitioner, version); + result.put(key, value); + } + return result; + } + + public static , Version> M deserializeMap(DataInputPlus in, Version version, AsymmetricVersionedSerializer keySerializer, AsymmetricVersionedSerializer valueSerializer, IntFunction factory) throws IOException + { + int size = in.readUnsignedVInt32(); + M result = factory.apply(size); + while (size-- > 0) + { + K key = keySerializer.deserialize(in, version); + V value = valueSerializer.deserialize(in, version); + result.put(key, value); + } + return result; + } + + public static > M deserializeMap(DataInputPlus in, Version version, MetadataSerializer keySerializer, MetadataSerializer valueSerializer, IntFunction factory) throws IOException + { + int size = in.readUnsignedVInt32(); + M result = factory.apply(size); + while (size-- > 0) + { + K key = keySerializer.deserialize(in, version); + V value = valueSerializer.deserialize(in, version); + result.put(key, value); + } + return result; + } + + public static Map deserializeMap(DataInputPlus in, UnversionedSerializer keySerializer, UnversionedSerializer valueSerializer) throws IOException + { + return deserializeMap(in, keySerializer, valueSerializer, Maps::newHashMapWithExpectedSize); + } + + public static Map deserializeMap(DataInputPlus in, int version, IVersionedSerializer keySerializer, IVersionedSerializer valueSerializer) throws IOException + { + return deserializeMap(in, version, keySerializer, valueSerializer, Maps::newHashMapWithExpectedSize); + } + + public static Map deserializeMap(DataInputPlus in, Version version, AsymmetricVersionedSerializer keySerializer, AsymmetricVersionedSerializer valueSerializer) throws IOException + { + return deserializeMap(in, version, keySerializer, valueSerializer, Maps::newHashMapWithExpectedSize); + } + + public static Map deserializeMap(DataInputPlus in, Version version, MetadataSerializer keySerializer, MetadataSerializer valueSerializer) throws IOException + { + return deserializeMap(in, version, keySerializer, valueSerializer, Maps::newHashMapWithExpectedSize); + } + + public static long serializedCollectionSize(Collection values, UnversionedSerializer valueSerializer) + { + long size = sizeofUnsignedVInt(values.size()); + for (V value : values) + size += valueSerializer.serializedSize(value); + return size; + } + + public static long serializedCollectionSize(Collection values, int version, IVersionedSerializer valueSerializer) + { + long size = sizeofUnsignedVInt(values.size()); + for (V value : values) + size += valueSerializer.serializedSize(value, version); + return size; + } + + public static long serializedCollectionSize(Collection values, int version, IPartitionerDependentSerializer valueSerializer) + { + long size = sizeofUnsignedVInt(values.size()); + for (V value : values) + size += valueSerializer.serializedSize(value, version); + return size; + } + + public static long serializedCollectionSize(Collection values, Version version, AsymmetricVersionedSerializer valueSerializer) + { + long size = sizeofUnsignedVInt(values.size()); + for (V value : values) + size += valueSerializer.serializedSize(value, version); + return size; + } + + public static long serializedCollectionSize(P p, Collection values, Version version, AsymmetricParameterisedVersionedSerializer valueSerializer) + { + long size = sizeofUnsignedVInt(values.size()); + for (V value : values) + size += valueSerializer.serializedSize(value, p, version); + return size; + } + + public static long serializedCollectionSize(Collection values, P p, AsymmetricParameterisedUnversionedSerializer valueSerializer) + { + long size = sizeofUnsignedVInt(values.size()); + for (V value : values) + size += valueSerializer.serializedSize(value, p); + return size; + } + + public static long serializedCollectionSize(Collection values, Version version, MetadataSerializer valueSerializer) + { + long size = sizeofUnsignedVInt(values.size()); + for (V value : values) + size += valueSerializer.serializedSize(value, version); + return size; + } + + public static > long serializedListSize(L values, UnversionedSerializer valueSerializer) + { + int items = values.size(); + long size = sizeofUnsignedVInt(items); + for (int i = 0 ; i < items ; ++i) + size += valueSerializer.serializedSize(values.get(i)); + return size; + } + + public static > long serializedListSize(L values, int version, IVersionedSerializer valueSerializer) + { + int items = values.size(); + long size = sizeofUnsignedVInt(items); + for (int i = 0 ; i < items ; ++i) + size += valueSerializer.serializedSize(values.get(i), version); + return size; + } + + public static , Version> long serializedListSize(L values, Version version, AsymmetricVersionedSerializer valueSerializer) + { + int items = values.size(); + long size = sizeofUnsignedVInt(items); + for (int i = 0 ; i < items ; ++i) + size += valueSerializer.serializedSize(values.get(i), version); + return size; + } + + public static , Version> long serializedListSize(L values, P p, Version version, AsymmetricParameterisedVersionedSerializer valueSerializer) + { + int items = values.size(); + long size = sizeofUnsignedVInt(items); + for (int i = 0 ; i < items ; ++i) + size += valueSerializer.serializedSize(values.get(i), p, version); + return size; + } + + public static > long serializedListSize(L values, Version version, MetadataSerializer valueSerializer) + { + int items = values.size(); + long size = sizeofUnsignedVInt(items); + for (int i = 0 ; i < items ; ++i) + size += valueSerializer.serializedSize(values.get(i), version); + return size; + } + + public static long serializedMapSize(Map map, UnversionedSerializer keySerializer, UnversionedSerializer valueSerializer) + { + long size = sizeofUnsignedVInt(map.size()); + for (Map.Entry e : map.entrySet()) + size += keySerializer.serializedSize(e.getKey()) + + valueSerializer.serializedSize(e.getValue()); + return size; + } + + public static long serializedMapSize(Map map, int version, IVersionedSerializer keySerializer, IVersionedSerializer valueSerializer) + { + long size = sizeofUnsignedVInt(map.size()); + for (Map.Entry e : map.entrySet()) + size += keySerializer.serializedSize(e.getKey(), version) + + valueSerializer.serializedSize(e.getValue(), version); + return size; + } + + public static long serializedMapSize(Map map, int version, IVersionedSerializer keySerializer, IPartitionerDependentSerializer valueSerializer) + { + long size = sizeofUnsignedVInt(map.size()); + for (Map.Entry e : map.entrySet()) + size += keySerializer.serializedSize(e.getKey(), version) + + valueSerializer.serializedSize(e.getValue(), version); + return size; + } + + public static long serializedMapSize(Map map, Version version, AsymmetricVersionedSerializer keySerializer, AsymmetricVersionedSerializer valueSerializer) + { + long size = sizeofUnsignedVInt(map.size()); + for (Map.Entry e : map.entrySet()) + size += keySerializer.serializedSize(e.getKey(), version) + + valueSerializer.serializedSize(e.getValue(), version); + return size; + } + + public static long serializedMapSize(Map map, Version version, MetadataSerializer keySerializer, MetadataSerializer valueSerializer) + { + long size = sizeofUnsignedVInt(map.size()); + for (Map.Entry e : map.entrySet()) + size += keySerializer.serializedSize(e.getKey(), version) + + valueSerializer.serializedSize(e.getValue(), version); + return size; + } + + public static IntFunction> newHashSet() + { + return i -> i == 0 ? Collections.emptySet() : Sets.newHashSetWithExpectedSize(i); + } + + public static IntFunction> newHashMap() + { + return i -> i == 0 ? Collections.emptyMap() : Maps.newHashMapWithExpectedSize(i); + } + + public static IntFunction> newArrayList() + { + return i -> i == 0 ? Collections.emptyList() : new ArrayList<>(i); + } + + public static int readCollectionSize(DataInputPlus in, int version) throws IOException + { + return in.readUnsignedVInt32(); + } + + /* + * Private to push auto-complete to the convenience methods + * Feel free to make public if there is a weird collection you want to use + */ + private static > C deserializeCollection(DataInputPlus in, UnversionedSerializer serializer, IntFunction factory) throws IOException + { + int size = in.readUnsignedVInt32(); + C result = factory.apply(size); + while (size-- > 0) + result.add(serializer.deserialize(in)); + return result; + } + + /* + * Private to push auto-complete to the convenience methods + * Feel free to make public if there is a weird collection you want to use + */ + private static > C deserializeCollection(DataInputPlus in, int version, IVersionedSerializer serializer, IntFunction factory) throws IOException + { + int size = in.readUnsignedVInt32(); + C result = factory.apply(size); + while (size-- > 0) + result.add(serializer.deserialize(in, version)); + return result; + } + + private static > C deserializeCollection(DataInputPlus in, IPartitioner partitioner, int version, IPartitionerDependentSerializer serializer, IntFunction factory) throws IOException + { + int size = in.readUnsignedVInt32(); + C result = factory.apply(size); + while (size-- > 0) + result.add(serializer.deserialize(in, partitioner, version)); + return result; + } + + private static , Version> C deserializeCollection(DataInputPlus in, Version version, AsymmetricVersionedSerializer serializer, IntFunction factory) throws IOException + { + int size = in.readUnsignedVInt32(); + C result = factory.apply(size); + while (size-- > 0) + result.add(serializer.deserialize(in, version)); + return result; + } + + private static , Version> C deserializeCollection(P p, DataInputPlus in, Version version, AsymmetricParameterisedVersionedSerializer serializer, IntFunction factory) throws IOException + { + int size = in.readUnsignedVInt32(); + C result = factory.apply(size); + while (size-- > 0) + result.add(serializer.deserialize(p, in, version)); + return result; + } + + private static , Version> C deserializeCollection(P p, DataInputPlus in, AsymmetricParameterisedUnversionedSerializer serializer, IntFunction factory) throws IOException + { + int size = in.readUnsignedVInt32(); + C result = factory.apply(size); + while (size-- > 0) + result.add(serializer.deserialize(p, in)); + return result; + } + + private static > C deserializeCollection(DataInputPlus in, Version version, MetadataSerializer serializer, IntFunction factory) throws IOException + { + int size = in.readUnsignedVInt32(); + C result = factory.apply(size); + while (size-- > 0) + result.add(serializer.deserialize(in, version)); + return result; + } + + public static UnversionedSerializer> newListSerializer(UnversionedSerializer itemSerializer) + { + return new UnversionedSerializer>() + { + @Override + public void serialize(List list, DataOutputPlus out) throws IOException + { + serializeList(list, out, itemSerializer); + } + + @Override + public List deserialize(DataInputPlus in) throws IOException + { + return deserializeList(in, itemSerializer); + } + + @Override + public long serializedSize(List t) + { + return serializedListSize(t, itemSerializer); + } + }; + } + + public static IVersionedSerializer> newListSerializer(IVersionedSerializer itemSerializer) + { + return new IVersionedSerializer>() + { + @Override + public void serialize(List list, DataOutputPlus out, int version) throws IOException + { + serializeList(list, out, version, itemSerializer); + } + + @Override + public List deserialize(DataInputPlus in, int version) throws IOException + { + return deserializeList(in, version, itemSerializer); + } + + @Override + public long serializedSize(List t, int version) + { + return serializedListSize(t, version, itemSerializer); + } + }; + } + + public static VersionedSerializer, Version> newListSerializer(@Nonnull final VersionedSerializer serializer) + { + return new VersionedSerializer, Version>() + { + @Override + public void serialize(List t, DataOutputPlus out, Version version) throws IOException + { + serializeCollection(t, out, version, serializer); + } + + @Override + public List deserialize(DataInputPlus in, Version version) throws IOException + { + return deserializeList(in, version, serializer); + } + + @Override + public long serializedSize(List t, Version version) + { + return serializedCollectionSize(t, version, serializer); + } + }; + } + + public static IPartitionerDependentSerializer> newCollectionSerializer(@Nonnull final IPartitionerDependentSerializer serializer) + { + return new IPartitionerDependentSerializer>() + { + @Override + public void serialize(Collection t, DataOutputPlus out, int version) throws IOException + { + serializeCollection(t, out, version, serializer); + } + + @Override + public Collection deserialize(DataInputPlus in, IPartitioner p, int version) throws IOException + { + return deserializeCollection(in, p, version, serializer, newArrayList()); + } + + @Override + public long serializedSize(Collection t, int version) + { + return serializedCollectionSize(t, version, serializer); + } + }; + } +} diff --git a/src/java/org/apache/cassandra/utils/Collectors3.java b/src/java/org/apache/cassandra/utils/Collectors3.java index c48f16062ac6..3d884fe9df5c 100644 --- a/src/java/org/apache/cassandra/utils/Collectors3.java +++ b/src/java/org/apache/cassandra/utils/Collectors3.java @@ -19,6 +19,8 @@ package org.apache.cassandra.utils; import java.util.Map; +import java.util.SortedSet; +import java.util.TreeSet; import java.util.function.Function; import java.util.stream.Collector; @@ -46,6 +48,8 @@ public class Collectors3 private static final Collector.Characteristics[] SET_CHARACTERISTICS = new Collector.Characteristics[]{ Collector.Characteristics.UNORDERED }; + private static final Collector.Characteristics[] SORTED_SET_CHARACTERISTICS = new Collector.Characteristics[]{ Collector.Characteristics.UNORDERED }; + public static Collector> toImmutableSet() { return Collector.of(ImmutableSet.Builder::new, @@ -75,4 +79,15 @@ public class Collectors3 MAP_CHARACTERISTICS); } + public static Collector, SortedSet> toSortedSet() + { + return Collector.of(() -> new TreeSet(), + (set, value) -> set.add(value), + (set1, set2) -> { + set1.addAll(set2); + return set1; + }, + set -> set, + SORTED_SET_CHARACTERISTICS); + } } diff --git a/src/java/org/apache/cassandra/net/Crc.java b/src/java/org/apache/cassandra/utils/Crc.java similarity index 90% rename from src/java/org/apache/cassandra/net/Crc.java rename to src/java/org/apache/cassandra/utils/Crc.java index 8f63e51a9353..f1a31584f364 100644 --- a/src/java/org/apache/cassandra/net/Crc.java +++ b/src/java/org/apache/cassandra/utils/Crc.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.cassandra.net; +package org.apache.cassandra.utils; import java.io.IOException; import java.nio.ByteBuffer; @@ -48,26 +48,31 @@ public InvalidCrc(int read, int computed) public static CRC32 crc32() { CRC32 crc = crc32.get(); + return initialize(crc); + } + + public static CRC32 initialize(CRC32 crc) + { crc.reset(); - crc.update(initialBytes); + crc.update(initialBytes, 0, initialBytes.length); return crc; } - static int computeCrc32(ByteBuf buffer, int startReaderIndex, int endReaderIndex) + public static int computeCrc32(ByteBuf buffer, int startReaderIndex, int endReaderIndex) { CRC32 crc = crc32(); crc.update(buffer.internalNioBuffer(startReaderIndex, endReaderIndex - startReaderIndex)); return (int) crc.getValue(); } - static int computeCrc32(ByteBuffer buffer, int start, int end) + public static int computeCrc32(ByteBuffer buffer, int start, int end) { CRC32 crc = crc32(); updateCrc32(crc, buffer, start, end); return (int) crc.getValue(); } - static void updateCrc32(CRC32 crc, ByteBuffer buffer, int start, int end) + public static void updateCrc32(CRC32 crc, ByteBuffer buffer, int start, int end) { int savePosition = buffer.position(); int saveLimit = buffer.limit(); @@ -116,7 +121,7 @@ static void updateCrc32(CRC32 crc, ByteBuffer buffer, int start, int end) * @param len the number of bytes, greater than 0 and fewer than 9, to be read from bytes * @return the least-significant bit AND byte order crc24 using the CRC24_POLY polynomial */ - static int crc24(long bytes, int len) + public static int crc24(long bytes, int len) { int crc = CRC24_INIT; while (len-- > 0) diff --git a/src/java/org/apache/cassandra/utils/ExecutorUtils.java b/src/java/org/apache/cassandra/utils/ExecutorUtils.java index 5bb841f32bdd..a28225456c7b 100644 --- a/src/java/org/apache/cassandra/utils/ExecutorUtils.java +++ b/src/java/org/apache/cassandra/utils/ExecutorUtils.java @@ -79,6 +79,39 @@ else if (executor != null) } } + public static boolean shutdownSequentiallyAndWait(Iterable executors, long timeout, TimeUnit unit) + { + long deadline = nanoTime() + unit.toNanos(timeout); + + boolean shutdown = true; + for (Object executor : executors) + { + try + { + if (executor instanceof ExecutorService) + { + ((ExecutorService) executor).shutdown(); + if (!((ExecutorService) executor).awaitTermination(Math.max(0, deadline - nanoTime()), NANOSECONDS)) + shutdown = false; + } + else if (executor instanceof Shutdownable) + { + ((Shutdownable) executor).shutdown(); + if (!((Shutdownable) executor).awaitTermination(Math.max(0, deadline - nanoTime()), NANOSECONDS)) + shutdown = false; + } + else + throw new IllegalArgumentException(executor.toString()); + } + catch (Throwable t) + { + throw new IllegalStateException("Caught interrupt while shutting down " + executor, t); + } + } + + return shutdown; + } + public static void shutdown(ExecutorService ... executors) { shutdown(Arrays.asList(executors)); diff --git a/src/java/org/apache/cassandra/utils/FBUtilities.java b/src/java/org/apache/cassandra/utils/FBUtilities.java index 91b608d18553..a4493f30a201 100644 --- a/src/java/org/apache/cassandra/utils/FBUtilities.java +++ b/src/java/org/apache/cassandra/utils/FBUtilities.java @@ -65,11 +65,13 @@ import com.google.common.base.Preconditions; import com.google.common.base.Suppliers; import com.google.common.collect.ImmutableList; -import com.vdurmont.semver4j.Semver; import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import accord.utils.async.AsyncResult; +import accord.utils.async.AsyncResults; +import com.vdurmont.semver4j.Semver; import org.apache.cassandra.audit.IAuditLogger; import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.db.DecoratedKey; @@ -94,7 +96,6 @@ import org.apache.cassandra.utils.concurrent.UncheckedInterruptedException; import org.objectweb.asm.Opcodes; -import static org.apache.cassandra.config.CassandraRelevantProperties.CASSANDRA_AVAILABLE_PROCESSORS; import static org.apache.cassandra.config.CassandraRelevantProperties.BUILD_DATE; import static org.apache.cassandra.config.CassandraRelevantProperties.GIT_SHA; import static org.apache.cassandra.config.CassandraRelevantProperties.LINE_SEPARATOR; @@ -131,13 +132,11 @@ public class FBUtilities private static volatile String previousReleaseVersionString; - private static int availableProcessors = CASSANDRA_AVAILABLE_PROCESSORS.getInt(DatabaseDescriptor.getAvailableProcessors()); - private static volatile Supplier systemInfoSupplier = Suppliers.memoize(SystemInfo::new); public static void setAvailableProcessors(int value) { - availableProcessors = value; + DatabaseDescriptor.setAvailableProcessors(value); } @VisibleForTesting @@ -148,10 +147,7 @@ public static void setSystemInfoSupplier(Supplier supplier) public static int getAvailableProcessors() { - if (availableProcessors > 0) - return availableProcessors; - else - return Runtime.getRuntime().availableProcessors(); + return DatabaseDescriptor.getAvailableProcessors(); } public static final int MAX_UNSIGNED_SHORT = 0xFFFF; @@ -1168,20 +1164,26 @@ public static String exec(Map env, Duration timeout, int outBufS { process.destroyForcibly(); logger.error("Command {} did not complete in {}, killed forcibly:\noutput:\n{}\n(truncated {} bytes)\nerror:\n{}\n(truncated {} bytes)", - Arrays.toString(cmd), timeout, out.asString(), outOverflow, err.asString(), errOverflow); + Arrays.toString(cmd), timeout, out.asString(), outOverflow, err.asString(), errOverflow); throw new TimeoutException("Command " + Arrays.toString(cmd) + " did not complete in " + timeout); } int r = process.exitValue(); if (r != 0) { logger.error("Command {} failed with exit code {}:\noutput:\n{}\n(truncated {} bytes)\nerror:\n{}\n(truncated {} bytes)", - Arrays.toString(cmd), r, out.asString(), outOverflow, err.asString(), errOverflow); + Arrays.toString(cmd), r, out.asString(), outOverflow, err.asString(), errOverflow); throw new IOException("Command " + Arrays.toString(cmd) + " failed with exit code " + r); } return out.asString(); } } + public static void updateChecksumShort(Checksum checksum, short v) + { + checksum.update((v >>> 8) & 0xFF); + checksum.update((v >>> 0) & 0xFF); + } + public static void updateChecksumInt(Checksum checksum, int v) { checksum.update((v >>> 24) & 0xFF); @@ -1190,6 +1192,12 @@ public static void updateChecksumInt(Checksum checksum, int v) checksum.update((v >>> 0) & 0xFF); } + public static void updateChecksumLong(Checksum checksum, long v) + { + updateChecksumInt(checksum, (int) (v >>> 32)); + updateChecksumInt(checksum, (int) (v & 0xFFFFFFFFL)); + } + /** * Updates checksum with the provided ByteBuffer at the given offset + length. * Resets position and limit back to their original values on return. @@ -1435,4 +1443,33 @@ public static SystemInfo getSystemInfo() { return systemInfoSupplier.get(); } + + public enum Order { LT, EQ, GT } + public static Order compare(T a, T b, Comparator comparator) + { + int rc = comparator.compare(a, b); + if (rc < 0) return Order.LT; + if (rc == 0) return Order.EQ; + return Order.GT; + } + + public static Order compare(A a, B b, AsymmetricOrdering comparator) + { + int rc = comparator.compareAsymmetric(a, b); + if (rc < 0) return Order.LT; + if (rc == 0) return Order.EQ; + return Order.GT; + } + + public static AsyncResult futureToAsyncResult(org.apache.cassandra.utils.concurrent.Future future) + { + AsyncResult.Settable adapter = AsyncResults.settable(); + future.addCallback((value, failure) -> { + if (failure != null) + adapter.tryFailure(failure); + else + adapter.trySuccess(value); + }); + return adapter; + } } \ No newline at end of file diff --git a/src/java/org/apache/cassandra/utils/FastByteOperations.java b/src/java/org/apache/cassandra/utils/FastByteOperations.java index 2a86712951d2..c37ee21a6ed6 100644 --- a/src/java/org/apache/cassandra/utils/FastByteOperations.java +++ b/src/java/org/apache/cassandra/utils/FastByteOperations.java @@ -70,6 +70,21 @@ public static int compareUnsigned(ByteBuffer b1, ByteBuffer b2) return BestHolder.BEST.compare(b1, b2); } + public static int compareWithMemoryUnsigned(ByteBuffer b1, long address2, int length2) + { + return BestHolder.BEST.compare(b1, address2, length2); + } + + public static int compareWithMemoryUnsigned(byte[] b1, int s1, int l1, long address2, int length2) + { + return BestHolder.BEST.compare(b1, s1,l1, address2, length2); + } + + public static int compareMemoryUnsigned(long address1, int length1, long address2, int length2) + { + return BestHolder.BEST.compare(address1, length1, address2, length2); + } + public static int compareUnsigned(byte[] b1, byte[] b2) { return compareUnsigned(b1, 0, b1.length, b2, 0, b2.length); @@ -102,6 +117,12 @@ abstract public int compare(byte[] buffer1, int offset1, int length1, abstract public int compare(ByteBuffer buffer1, byte[] buffer2, int offset2, int length2); + abstract public int compare(ByteBuffer buffer1, long address2, int length2); + + abstract public int compare(long address1, int length1, long address2, int length2); + + abstract public int compare(byte[] buffer1, int offset1, int length1, long address2, int length2); + abstract public int compare(ByteBuffer buffer1, int offset1, int length1, byte[] buffer2, int offset2, int length2); abstract public int compare(ByteBuffer buffer1, ByteBuffer buffer2); @@ -221,6 +242,44 @@ public int compare(ByteBuffer buffer1, byte[] buffer2, int offset2, int length2) return compare(buffer1, buffer1.position(), buffer1.remaining(), buffer2, offset2, length2); } + @Override + public int compare(ByteBuffer buffer1, long address2, int length2) + { + return compare(buffer1, buffer1.position(), buffer1.remaining(), address2, length2); + } + + public int compare(ByteBuffer buffer1, int position1, int length1, long address2, int length2) + { + { + Object obj1; + long offset1; + if (buffer1.hasArray()) + { + obj1 = buffer1.array(); + offset1 = BYTE_ARRAY_BASE_OFFSET + buffer1.arrayOffset() + position1; + } + else + { + obj1 = null; + offset1 = theUnsafe.getLong(buffer1, DIRECT_BUFFER_ADDRESS_OFFSET) + position1; + } + + return compareTo(obj1, offset1, length1, null, address2, length2); + } + } + @Override + public int compare(long address1, int length1, long address2, int length2) + { + return compareTo(null, address1, length1, null, address2, length2); + } + + @Override + public int compare(byte[] buffer1, int offset1, int length1, long address2, int length2) + { + return compareTo(buffer1, BYTE_ARRAY_BASE_OFFSET + offset1, length1, + null, address2, length2); + } + public int compare(ByteBuffer buffer1, int position1, int length1, byte[] buffer2, int offset2, int length2) { Object obj1; @@ -262,7 +321,7 @@ public void copy(byte[] src, int srcPosition, ByteBuffer trg, int trgPosition, i if (trg.hasArray()) System.arraycopy(src, srcPosition, trg.array(), trg.arrayOffset() + trgPosition, length); else - copy(null, srcPosition + theUnsafe.getLong(src, Unsafe.ARRAY_BYTE_BASE_OFFSET), trg, trgPosition, length); + copy((Object) src, (long) srcPosition + Unsafe.ARRAY_BYTE_BASE_OFFSET, trg, trgPosition, length); } public void copy(ByteBuffer srcBuf, int srcPosition, ByteBuffer trgBuf, int trgPosition, int length) @@ -396,9 +455,9 @@ public static int compareTo(Object buffer1, long memoryOffset1, int length1, if (lw != rw) { if (BIG_ENDIAN) - return UnsignedLongs.compare(lw, rw); + return Long.compareUnsigned(lw, rw); - return UnsignedLongs.compare(Long.reverseBytes(lw), Long.reverseBytes(rw)); + return Long.compareUnsigned(Long.reverseBytes(lw), Long.reverseBytes(rw)); } } @@ -465,6 +524,24 @@ public int compare(ByteBuffer buffer1, byte[] buffer2, int offset2, int length2) return compare(buffer1, ByteBuffer.wrap(buffer2, offset2, length2)); } + @Override + public int compare(ByteBuffer b1, long address2, int length2) + { + throw new UnsupportedOperationException("native memory address is an argument, we cannot do it using a pure Java"); + } + + @Override + public int compare(long address1, int length1, long address2, int length2) + { + throw new UnsupportedOperationException("native memory address is an argument, we cannot do it using a pure Java"); + } + + @Override + public int compare(byte[] b1, int s1, int l1, long address2, int length2) + { + throw new UnsupportedOperationException("native memory address is an argument, we cannot do it using a pure Java"); + } + public int compare(ByteBuffer buffer1, ByteBuffer buffer2) { int end1 = buffer1.limit(); diff --git a/src/java/org/apache/cassandra/utils/HeapUtils.java b/src/java/org/apache/cassandra/utils/HeapUtils.java index 38a696990845..3642eb5470a9 100644 --- a/src/java/org/apache/cassandra/utils/HeapUtils.java +++ b/src/java/org/apache/cassandra/utils/HeapUtils.java @@ -24,6 +24,7 @@ import java.lang.management.ManagementFactory; import java.nio.file.FileStore; import java.nio.file.Path; +import java.util.concurrent.TimeUnit; import java.util.concurrent.locks.Lock; import java.util.concurrent.locks.ReentrantLock; import javax.management.MBeanServer; @@ -37,6 +38,7 @@ import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.io.util.File; import org.apache.cassandra.io.util.PathUtils; +import org.apache.cassandra.utils.NoSpamLogger.NoSpamLogStatement; import static org.apache.cassandra.config.CassandraRelevantEnv.JAVA_HOME; import static org.apache.cassandra.utils.Clock.Global.currentTimeMillis; @@ -48,6 +50,7 @@ public final class HeapUtils { private static final Logger logger = LoggerFactory.getLogger(HeapUtils.class); + private static final NoSpamLogStatement disabledStatement = NoSpamLogger.getStatement(logger, "Heap dump creation on uncaught exceptions is disabled.", 1L, TimeUnit.MINUTES); private static final Lock DUMP_LOCK = new ReentrantLock(); @@ -130,7 +133,7 @@ public static String maybeCreateHeapDump() } else { - logger.debug("Heap dump creation on uncaught exceptions is disabled."); + disabledStatement.debug(); } } catch (Throwable e) diff --git a/src/java/org/apache/cassandra/utils/Hex.java b/src/java/org/apache/cassandra/utils/Hex.java index b8044b86e1e7..19d470363f7f 100644 --- a/src/java/org/apache/cassandra/utils/Hex.java +++ b/src/java/org/apache/cassandra/utils/Hex.java @@ -27,7 +27,16 @@ public class Hex { private static final Constructor stringConstructor = getProtectedConstructor(String.class, int.class, int.class, char[].class); private final static byte[] charToByte = new byte[256]; - private static final Logger logger = LoggerFactory.getLogger(Hex.class); + + private static class LoggerHandle + { + private static final Logger logger = LoggerFactory.getLogger(Hex.class); + } + + private static Logger logger() + { + return LoggerHandle.logger; + } // package protected for use by ByteBufferUtil. Do not modify this array !! static final char[] byteToChar = new char[16]; @@ -123,7 +132,7 @@ public static String wrapCharArray(char[] c) { // The underlying constructor failed. Unwrapping the exception. Throwable cause = ite.getCause(); - logger.error("Underlying string constructor threw an error: {}", + logger().error("Underlying string constructor threw an error: {}", cause == null ? ite.getMessage() : cause.getMessage()); } catch (Exception e) diff --git a/src/java/org/apache/cassandra/utils/Int32Serializer.java b/src/java/org/apache/cassandra/utils/Int32Serializer.java index 731f5aa038a9..d264d6ebcafe 100644 --- a/src/java/org/apache/cassandra/utils/Int32Serializer.java +++ b/src/java/org/apache/cassandra/utils/Int32Serializer.java @@ -22,25 +22,44 @@ import org.apache.cassandra.db.TypeSizes; import org.apache.cassandra.io.IVersionedSerializer; +import org.apache.cassandra.io.UnversionedSerializer; import org.apache.cassandra.io.util.DataInputPlus; import org.apache.cassandra.io.util.DataOutputPlus; -public class Int32Serializer implements IVersionedSerializer +public class Int32Serializer implements UnversionedSerializer, IVersionedSerializer { public static final Int32Serializer serializer = new Int32Serializer(); - public void serialize(Integer t, DataOutputPlus out, int version) throws IOException + public void serialize(Integer t, DataOutputPlus out) throws IOException { out.writeInt(t); } - public Integer deserialize(DataInputPlus in, int version) throws IOException + @Override + public void serialize(Integer t, DataOutputPlus out, int version) throws IOException + { + serialize(t, out); + } + + public Integer deserialize(DataInputPlus in) throws IOException { return in.readInt(); } - public long serializedSize(Integer t, int version) + @Override + public Integer deserialize(DataInputPlus in, int version) throws IOException + { + return deserialize(in); + } + + public long serializedSize(Integer t) { return TypeSizes.sizeof(t.intValue()); } + + @Override + public long serializedSize(Integer t, int version) + { + return serializedSize(t); + } } diff --git a/src/java/org/apache/cassandra/utils/IntervalTree.java b/src/java/org/apache/cassandra/utils/IntervalTree.java index bde80c5dfe0f..03848a414736 100644 --- a/src/java/org/apache/cassandra/utils/IntervalTree.java +++ b/src/java/org/apache/cassandra/utils/IntervalTree.java @@ -17,14 +17,19 @@ */ package org.apache.cassandra.utils; -import java.util.ArrayDeque; +import java.util.AbstractCollection; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.Collections; -import java.util.Deque; import java.util.Iterator; import java.util.List; +import java.util.function.BiPredicate; +import java.util.function.Consumer; +import java.util.function.Predicate; +import java.util.stream.Stream; +import java.util.stream.StreamSupport; +import javax.annotation.Nullable; import com.google.common.base.Joiner; import com.google.common.collect.Iterables; @@ -34,26 +39,45 @@ import org.apache.cassandra.utils.AsymmetricOrdering.Op; +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkNotNull; import static com.google.common.base.Preconditions.checkState; import static org.apache.cassandra.config.CassandraRelevantProperties.TEST_INTERVAL_TREE_EXPENSIVE_CHECKS; public class IntervalTree, D extends Comparable, I extends Interval> implements Iterable { public static final boolean EXPENSIVE_CHECKS = TEST_INTERVAL_TREE_EXPENSIVE_CHECKS.getBoolean(); + private static final int REBUILD_AT_MOD_COUNT = 20; private static final Logger logger = LoggerFactory.getLogger(IntervalTree.class); + @SuppressWarnings("rawtypes") public static final Interval[] EMPTY_ARRAY = new Interval[0]; - @SuppressWarnings("unchecked") + @SuppressWarnings({"unchecked", "rawtypes"}) private static final IntervalTree EMPTY_TREE = new IntervalTree(null); private final IntervalNode head; + + /** + * Add can potentially unbalance the interval tree each time so force a rebuild after a certain number + * of adds to bound how unbalanced the worst path in the tree can become. + * + * In practice it's likely the tree will have been rebuilt anyways long before it hits mod count, but it's not + * good to leave it unbounded. + * + * Napkin math is a 100k interval tree is a large tree and lg2(100k) is 16 (lg2(1million) is 20) so by bounding it at 20 then + * the worst possible imbalance is a bit more than double a balanced tree. + */ + protected final int modCount; + private final I[] intervalsByMinOrder; private final I[] intervalsByMaxOrder; + @SuppressWarnings("unchecked") protected IntervalTree(Collection intervals) { + this.modCount = 0; if (intervals == null || intervals.isEmpty()) { this.head = null; @@ -72,14 +96,25 @@ else if (intervals.size() == 1) Arrays.sort(intervalsByMaxOrder, Interval.maxOrdering()); this.head = new IntervalNode(Arrays.asList(intervalsByMinOrder), Arrays.asList(intervalsByMaxOrder)); } + if (EXPENSIVE_CHECKS) + { + if (intervalsByMinOrder.length > 1) + for (int i = 1; i < intervalsByMinOrder.length; i++) + checkState(Interval.minOrdering().compare(intervalsByMinOrder[i - 1], intervalsByMinOrder[i]) <= 0, "%s and %s out of order", intervalsByMinOrder[i-1], intervalsByMinOrder[i]); + if (intervalsByMaxOrder.length > 1) + for (int i = 1; i < intervalsByMaxOrder.length; i++) + checkState(Interval.maxOrdering().compare(intervalsByMaxOrder[i - 1], intervalsByMaxOrder[i]) <= 0, "%s and %s out of order", intervalsByMaxOrder[i-1], intervalsByMaxOrder[i]); + } } /** * This constructor will not modify minSortedIntervals and maxSortedIntervals, but it also won't * make defensive copies and will keep the originals. */ + @SuppressWarnings("unchecked") protected IntervalTree(I[] minSortedIntervals, I[] maxSortedIntervals) { + this.modCount = 0; if (minSortedIntervals == null || minSortedIntervals.length == 0) { this.head = null; @@ -97,11 +132,50 @@ else if (minSortedIntervals.length == 1) intervalsByMaxOrder = maxSortedIntervals; this.head = new IntervalNode(Arrays.asList(minSortedIntervals), Arrays.asList(maxSortedIntervals)); } + + if (EXPENSIVE_CHECKS) + { + if (intervalsByMinOrder.length > 1) + for (int i = 1; i < intervalsByMinOrder.length; i++) + checkState(Interval.minOrdering().compare(intervalsByMinOrder[i - 1], intervalsByMinOrder[i]) < 0, "%s and %s out of order", intervalsByMinOrder[i-1], intervalsByMinOrder[i]); + if (intervalsByMaxOrder.length > 1) + for (int i = 1; i < intervalsByMaxOrder.length; i++) + checkState(Interval.maxOrdering().compare(intervalsByMaxOrder[i - 1], intervalsByMaxOrder[i]) < 0, "%s and %s out of order", intervalsByMaxOrder[i-1], intervalsByMaxOrder[i]); + } + } + + protected IntervalTree(IntervalNode head, int modCount, I[] minSortedIntervals, I[] maxSortedIntervals) + { + checkNotNull(minSortedIntervals, "minSortedIntervals is null"); + checkNotNull(maxSortedIntervals, "maxSortedIntervals is null"); + this.head = head; + this.modCount = modCount; + this.intervalsByMinOrder = minSortedIntervals; + this.intervalsByMaxOrder = maxSortedIntervals; + if (EXPENSIVE_CHECKS) + { + if (intervalsByMinOrder.length > 1) + for (int i = 1; i < intervalsByMinOrder.length; i++) + checkState(Interval.minOrdering().compare(intervalsByMinOrder[i - 1], intervalsByMinOrder[i]) < 0, "%s and %s out of order", intervalsByMinOrder[i-1], intervalsByMinOrder[i]); + if (intervalsByMaxOrder.length > 1) + for (int i = 1; i < intervalsByMaxOrder.length; i++) + checkState(Interval.maxOrdering().compare(intervalsByMaxOrder[i - 1], intervalsByMaxOrder[i]) < 0, "%s and %s out of order", intervalsByMaxOrder[i-1], intervalsByMaxOrder[i]); + } + } + + protected IntervalTree create(IntervalNode head, int modCount, @Nullable I[] minSortedIntervals, @Nullable I[] maxSortedIntervals) + { + return new IntervalTree<>(head, modCount, minSortedIntervals, maxSortedIntervals); } protected IntervalTree create(I[] minOrder, I[] maxOrder) { - return new IntervalTree(minOrder, maxOrder); + return new IntervalTree<>(minOrder, maxOrder); + } + + protected IntervalTree create(Collection intervals) + { + return new IntervalTree<>(intervals); } public static , D extends Comparable, I extends Interval> IntervalTree build(Collection intervals) @@ -118,6 +192,16 @@ public static , D extends Comparable, return EMPTY_TREE; } + public static , D extends Comparable, I extends Interval> Builder builder() + { + return new Builder<>(); + } + + public Builder unbuild() + { + return new Builder().addAll(this); + } + public int intervalCount() { return intervalsByMinOrder.length; @@ -144,77 +228,53 @@ public C min() return head.low; } - public List search(Interval searchInterval) + public List> matches(Interval searchInterval) { if (head == null) - return Collections.emptyList(); + return Collections.emptyList(); - List results = new ArrayList(); - head.searchInternal(searchInterval, results); + List> results = new ArrayList<>(); + head.searchInternal(searchInterval, results::add); return results; } - public List search(C point) + public List> matches(C point) { - return search(Interval.create(point, point, null)); + return matches(Interval.create(point, point, null)); } - /** - * The input arrays aren't defensively copied and will be sorted. The update method doesn't allow duplicates or elements to be removed - * to be missing and this differs from the constructor which does not duplicate checking at all. - * - * It made more sense for update to be stricter because it is tracking removals and additions explicitly instead of building - * a list from scratch and in the targeted use case of a list of SSTables there are no duplicates. At a given point in time - * an sstable represents exactly one interval (although it may switch via removal and addition as in early open). - */ - public IntervalTree update(I[] removals, I[] additions) + public List search(Interval searchInterval) { - if (removals == null) - removals = (I[])EMPTY_ARRAY; - if (additions == null) - additions = (I[])EMPTY_ARRAY; - - if (removals.length == 0 && additions.length == 0) - { - return this; - } - - Arrays.sort(removals, Interval.minOrdering()); - Arrays.sort(additions, Interval.minOrdering()); - - for (int i = 1; i < additions.length; i++) - checkState( Interval.minOrdering().compare(additions[i], additions[i-1]) != 0, "Duplicate interval in additions %s", additions[i]); - - I[] newByMin = buildUpdatedArray( - intervalsByMinOrder, - removals, - additions, - Interval.minOrdering() - ); - - Arrays.sort(removals, Interval.maxOrdering()); - Arrays.sort(additions, Interval.maxOrdering()); + if (head == null) + return Collections.emptyList(); - I[] newByMax = buildUpdatedArray( - intervalsByMaxOrder, - removals, - additions, - Interval.maxOrdering() - ); + List results = new ArrayList<>(); + head.searchInternal(searchInterval, i -> results.add(i.data)); + return results; + } - return create(newByMin, newByMax); + public List search(C point) + { + return search(Interval.create(point, point, null)); } @SuppressWarnings("unchecked") - private I[] buildUpdatedArray(I[] existingSorted, - I[] removalsSorted, - I[] additionsSorted, - AsymmetricOrdering, C> cmp) + private I[] buildUpdatedArrayForUpdate(I[] existingSorted, + I[] removalsSorted, + I[] additionsSorted, + AsymmetricOrdering, C> cmp) { + if (EXPENSIVE_CHECKS) + { + if (existingSorted.length > 1) + for (int i = 1; i < existingSorted.length; i++) + checkState(cmp.compare(existingSorted[i - 1], existingSorted[i]) < 0, "%s and %s out of order", existingSorted[i-1], existingSorted[i]); + } + int finalSize = existingSorted.length + additionsSorted.length - removalsSorted.length; I[] result = (I[]) new Interval[finalSize]; - int existingIndex = 0; + int existingIndex = 0; int removalsIndex = 0; int additionsIndex = 0; int resultIndex = 0; @@ -233,6 +293,7 @@ private I[] buildUpdatedArray(I[] existingSorted, } else { + checkState(removalsSorted[removalsIndex].data == currentExisting.data, "Comparator does not implement identity"); existingIndex++; removalsIndex++; @@ -242,7 +303,7 @@ private I[] buildUpdatedArray(I[] existingSorted, } } - if (existingIndex >= existingSorted.length ) + if (existingIndex >= existingSorted.length) break; while (additionsIndex < additionsSorted.length) @@ -276,26 +337,161 @@ else if (additionCmp < 0) return result; } + /** + * The input arrays aren't defensively copied and will be sorted. This update method doesn't allow duplicates or elements to be removed + * to be missing and this differs from creating the tree from scratch using {@link #build(Collection) build(Collection<I>)} method which allows duplicates. + * + * There is also the requirement that D will implement Comparable<D> and that comparator will implement identity + * which is not part of the normal contract of Comparable<D>. That means that if a.compareTo(b) == 0 then a == b; + * + * It made more sense for update to be stricter because it is tracking removals and additions explicitly instead of building + * a list from scratch and in the targeted use case of a list of SSTables there are no duplicates. At a given point in time + * an sstable represents exactly one interval (although it may switch via removal and addition as in early open). + */ + @SuppressWarnings("unchecked") + public IntervalTree update(I[] removals, I[] additions) + { + if ((removals == null || removals.length == 0) && (additions == null || additions.length == 0)) + return this; + + if (removals == null) + removals = (I[])EMPTY_ARRAY; + if (additions == null) + additions = (I[])EMPTY_ARRAY; + + Arrays.sort(removals, Interval.minOrdering()); + Arrays.sort(additions, Interval.minOrdering()); + + if (EXPENSIVE_CHECKS) + { + for (int i = 1; i < additions.length; i++) + checkState(Interval.minOrdering().compare(additions[i], additions[i - 1]) != 0, "Duplicate interval in additions %s", additions[i]); + } + + I[] newByMin = buildUpdatedArrayForUpdate( + intervalsByMinOrder, + removals, + additions, + Interval.minOrdering() + ); + + Arrays.sort(removals, Interval.maxOrdering()); + Arrays.sort(additions, Interval.maxOrdering()); + + I[] newByMax = buildUpdatedArrayForUpdate( + intervalsByMaxOrder, + removals, + additions, + Interval.maxOrdering() + ); + + return create(newByMin, newByMax); + } + + /** + * The in practice use case here is flush which only adds one interval so do binary search + */ + @SuppressWarnings("unchecked") + private I[] buildUpdatedArrayForAdd(I[] addIntervals, I[] existingIntervals, AsymmetricOrdering, C> ordering) + { + int newSize = existingIntervals.length + addIntervals.length; + Arrays.sort(addIntervals, ordering); + I[] newIntervals = (I[])new Interval[newSize]; + int newIndex = 0; + int existingIndex = 0; + + int i = 0; + for (; i < addIntervals.length; i++) + { + if (existingIndex >= existingIntervals.length) + break; + I addInterval = addIntervals[i]; + int insertionPoint = Arrays.binarySearch(existingIntervals, addInterval, ordering); + checkState(insertionPoint < 0, "Interval being added should not already be present"); + insertionPoint = -1 - insertionPoint; + if (insertionPoint > existingIndex) + { + int toCopy = insertionPoint - existingIndex; + System.arraycopy(existingIntervals, existingIndex, newIntervals, newIndex, toCopy); + newIndex += toCopy; + existingIndex += toCopy; + } + newIntervals[newIndex++] = addInterval; + } + + if (i < addIntervals.length) + System.arraycopy(addIntervals, i, newIntervals, newIndex, addIntervals.length - i); + + if (existingIndex < existingIntervals.length) + System.arraycopy(existingIntervals, existingIndex, newIntervals, newIndex, existingIntervals.length - existingIndex); + + return newIntervals; + } + + public IntervalTree add(I[] intervals) + { + if (head == null) + return create(Arrays.asList(intervals)); + if (intervals.length == 0) + return this; + if (modCount + 1 >= REBUILD_AT_MOD_COUNT) + { + return create(new AbstractCollection<>() + { + @Override + public Iterator iterator() + { + return Iterators.concat(IntervalTree.this.iterator(), Iterators.forArray(intervals)); + } + + @Override + public int size() + { + return intervalsByMinOrder.length + intervals.length; + } + }); + } + + // Add does not preserve iteration order, not even by interval bounds, so it's necessary to compute the arrays that preserve the minOrder + // Or pay to sort and build them later + I[] sortableIntervals = Arrays.copyOf(intervals, intervals.length); + I[] newIntervalsByMinOrder = buildUpdatedArrayForAdd(sortableIntervals, + intervalsByMinOrder, + Interval.minOrdering()); + I[] newIntervalsByMaxOrder = buildUpdatedArrayForAdd(sortableIntervals, + intervalsByMaxOrder, + Interval.maxOrdering()); + + return create(head.add(Arrays.asList(intervals)), modCount + 1, newIntervalsByMinOrder, newIntervalsByMaxOrder); + } + + @Override public Iterator iterator() { if (head == null) return Collections.emptyIterator(); - return new TreeIterator(head); + return Iterators.forArray(intervalsByMinOrder); + } + + public Stream stream() + { + return StreamSupport.stream(spliterator(), false); } @Override public String toString() { - return "<" + Joiner.on(", ").join(Iterables.limit(this, 100)) + ">"; + return '<' + Joiner.on(", ").join(Iterables.limit(this, 100)) + '>'; } + @SuppressWarnings("unchecked") @Override public boolean equals(Object o) { if(!(o instanceof IntervalTree)) return false; - IntervalTree that = (IntervalTree)o; + IntervalTree that = (IntervalTree)o; return Iterators.elementsEqual(iterator(), that.iterator()); } @@ -308,6 +504,116 @@ public final int hashCode() return result; } + private I[] buildUpdatedArrayForReplace(I[] existingSorted, + List> replacements, + AsymmetricOrdering, C> cmp) + { + I[] replacementArray = Arrays.copyOf(existingSorted, existingSorted.length); + for (Pair replacement : replacements) + { + I existingInterval = replacement.left; + I newInterval = replacement.right; + + int removalIdx = Arrays.binarySearch(replacementArray, existingInterval, cmp); + if (removalIdx < 0) + throw new IllegalStateException("Interval to replace not found in the existing tree: " + existingInterval); + checkState(existingInterval.data == replacementArray[removalIdx].data, "Comparator does not implement identity"); + + int insertionIdx = Arrays.binarySearch(replacementArray, newInterval, cmp); + checkState(insertionIdx < 0, "Value to be inserted already exists"); + insertionIdx = -1 - insertionIdx; + + if (insertionIdx > removalIdx) + { + // Shift everything from insertionIdx and left down to removalIdx + System.arraycopy(replacementArray, removalIdx + 1, replacementArray, removalIdx, insertionIdx - removalIdx - 1); + replacementArray[insertionIdx - 1] = newInterval; + } + else if (insertionIdx < removalIdx) + { + // Shift everything from insertionIdx and onward right to removalIdx + System.arraycopy(replacementArray, insertionIdx, replacementArray, insertionIdx + 1, removalIdx - insertionIdx); + replacementArray[Math.min(replacementArray.length, insertionIdx)] = newInterval; + } + else + { + replacementArray[insertionIdx] = newInterval; + } + } + + if (EXPENSIVE_CHECKS) + { + if (replacementArray.length > 1) + for (int i = 1; i < replacementArray.length; i++) + checkState(cmp.compare(replacementArray[i - 1], replacementArray[i]) < 0, "%s and %s out of order", replacementArray[i-1], replacementArray[i]); + } + + return replacementArray; + } + + /** + * This replace method doesn't work correctly with duplicates. If the tree already has duplicates each replacement (or duplicate replacement) + * will replace one instance in the tree. + * + * There is also the requirement that D will implement Comparable<D> and that comparator will implement identity + * which is not part of the normal contract of Comparable<D>. That means that if a.compareTo(b) == 0 then a == b; + */ + public IntervalTree replace(List> replacements) + { + if (head == null) + { + checkArgument(replacements.isEmpty(), "Can't replace intervals in an empty tree"); + return this; + } + + if (replacements.isEmpty()) + return this; + + List> sortableReplacements = new ArrayList<>(replacements); + I[] newIntervalsByMinOrder = buildUpdatedArrayForReplace(intervalsByMinOrder, sortableReplacements, Interval.minOrdering()); + I[] newIntervalsByMaxOrder = buildUpdatedArrayForReplace(intervalsByMaxOrder, sortableReplacements, Interval.maxOrdering()); + + checkState(newIntervalsByMinOrder.length == newIntervalsByMaxOrder.length); + if (EXPENSIVE_CHECKS) + { + boolean[] foundMinOrderReplacement = new boolean[replacements.size()]; + boolean[] foundMaxOrderReplacement = new boolean[replacements.size()]; + for (int i = 0; i < newIntervalsByMinOrder.length; i++) + { + for (int j = 0; j < replacements.size(); j++) + { + Pair replacement = replacements.get(j); + if (newIntervalsByMinOrder[i].min.equals(replacement.left.min) && newIntervalsByMinOrder[i].max.equals(replacement.right.max)) + { + checkState(newIntervalsByMinOrder[i].data != replacement.left.data); + if (newIntervalsByMinOrder[i].data == replacement.right.data) + { + checkState(!foundMinOrderReplacement[j], "Replacement value appears more than once"); + foundMinOrderReplacement[j] = true; + } + } + + if (newIntervalsByMaxOrder[i].min.equals(replacement.left.min) && newIntervalsByMaxOrder[i].max.equals(replacement.right.max)) + { + checkState(newIntervalsByMaxOrder[i].data != replacement.left.data); + if (newIntervalsByMaxOrder[i].data == replacement.right.data) + { + checkState(!foundMaxOrderReplacement[j], "Replacement value appears more than once"); + foundMaxOrderReplacement[j] = true; + } + } + } + } + for (int i = 0; i < foundMaxOrderReplacement.length; i++) + checkState(foundMinOrderReplacement[i] && foundMaxOrderReplacement[i], "Didn't find replacement value that should be present"); + } + + return create(head.replace(head, replacements), + modCount, + newIntervalsByMinOrder, + newIntervalsByMaxOrder); + } + protected class IntervalNode { final C center; @@ -375,7 +681,7 @@ public IntervalNode(List minOrder, List maxOrder) if (EXPENSIVE_CHECKS) { - List allEndpoints = new ArrayList(minOrder.size() * 2); + List allEndpoints = new ArrayList<>(minOrder.size() * 2); for (I interval : minOrder) { allEndpoints.add(interval.min); @@ -389,12 +695,12 @@ public IntervalNode(List minOrder, List maxOrder) // Separate interval in intersecting center, left of center and right of center int initialIntersectionSize = i - j + 1; - intersectsLeft = new ArrayList(initialIntersectionSize); - intersectsRight = new ArrayList(initialIntersectionSize); + intersectsLeft = new ArrayList<>(initialIntersectionSize); + intersectsRight = new ArrayList<>(initialIntersectionSize); int initialChildSize = Math.min(i, j); - List leftSegmentMinOrder = new ArrayList(initialChildSize); + List leftSegmentMinOrder = new ArrayList<>(initialChildSize); List leftSegmentMaxOrder = new ArrayList<>(initialChildSize); - List rightSegmentMinOrder = new ArrayList(initialChildSize); + List rightSegmentMinOrder = new ArrayList<>(initialChildSize); List rightSegmentMaxOrder = new ArrayList<>(initialChildSize); for (I candidate : minOrder) @@ -422,14 +728,24 @@ else if (candidate.min.compareTo(center) > 0) assert (intersectsLeft.size() == intersectsRight.size()); assert (intersectsLeft.size() + leftSegmentMinOrder.size() + rightSegmentMinOrder.size()) == minOrder.size() : - "intersects (" + String.valueOf(intersectsLeft.size()) + - ") + leftSegment (" + String.valueOf(leftSegmentMinOrder.size()) + - ") + rightSegment (" + String.valueOf(rightSegmentMinOrder.size()) + - ") != toBisect (" + String.valueOf(minOrder.size()) + ")"; + "intersects (" + intersectsLeft.size() + + ") + leftSegment (" + leftSegmentMinOrder.size() + + ") + rightSegment (" + rightSegmentMinOrder.size() + + ") != toBisect (" + minOrder.size() + ')'; } + public IntervalNode(C center, C low, C high, List intersectsLeft, List intersectsRight, IntervalNode left, IntervalNode right) + { + this.center = center; + this.low = low; + this.high = high; + this.intersectsLeft = intersectsLeft; + this.intersectsRight = intersectsRight; + this.left = left; + this.right = right; + } - void searchInternal(Interval searchInterval, List results) + void searchInternal(Interval searchInterval, Consumer> results) { if (center.compareTo(searchInterval.min) < 0) { @@ -438,7 +754,7 @@ void searchInternal(Interval searchInterval, List results) return; while (i < intersectsRight.size()) - results.add(intersectsRight.get(i++).data); + results.accept(intersectsRight.get(i++)); if (right != null) right.searchInternal(searchInterval, results); @@ -450,7 +766,7 @@ else if (center.compareTo(searchInterval.max) > 0) return; for (int i = 0 ; i < j ; i++) - results.add(intersectsLeft.get(i).data); + results.accept(intersectsLeft.get(i)); if (left != null) left.searchInternal(searchInterval, results); @@ -460,7 +776,7 @@ else if (center.compareTo(searchInterval.max) > 0) // Adds every interval contained in this node to the result set then search left and right for further // overlapping intervals for (Interval interval : intersectsLeft) - results.add(interval.data); + results.accept(interval); if (left != null) left.searchInternal(searchInterval, results); @@ -468,46 +784,191 @@ else if (center.compareTo(searchInterval.max) > 0) right.searchInternal(searchInterval, results); } } + + + private IntervalNode replace(IntervalNode node, List> replacements) + { + if (node == null || replacements.isEmpty()) + return node; + + List> leftSegment = new ArrayList<>(); + List> rightSegment = new ArrayList<>(); + List newIntersectsLeft = null; + List newIntersectsRight = null; + int updated = 0; + + for (Pair entry : replacements) + { + I intervalToRemove = entry.left; + I intervalToAdd = entry.right; + if (node.center.compareTo(intervalToRemove.min) < 0) + { + rightSegment.add(entry); + } + else if (node.center.compareTo(intervalToRemove.max) > 0) + { + leftSegment.add(entry); + } + else + { + // only init once if any interval resides in current node + if (newIntersectsLeft == null) + { + newIntersectsLeft = new ArrayList<>(node.intersectsLeft); + newIntersectsRight = new ArrayList<>(node.intersectsRight); + } + boolean leftUpdated = false; + boolean rightUpdated = false; + + int i = Interval.minOrdering().binarySearchAsymmetric(node.intersectsLeft, intervalToRemove.min, Op.CEIL); + while (i < node.intersectsLeft.size()) + { + if (node.intersectsLeft.get(i).equals(intervalToRemove)) + { + newIntersectsLeft.set(i, intervalToAdd); + leftUpdated = true; + break; + } + i++; + } + + int j = Interval.maxOrdering().binarySearchAsymmetric(node.intersectsRight, intervalToRemove.max, Op.CEIL); + while (j < node.intersectsRight.size()) + { + if (node.intersectsRight.get(j).equals(intervalToRemove)) + { + newIntersectsRight.set(j, intervalToAdd); + rightUpdated = true; + break; + } + j++; + } + assert leftUpdated && rightUpdated : "leftupdated = " + leftUpdated + ", rightupdated = " + rightUpdated; + updated++; + } + } + + assert leftSegment.size() + rightSegment.size() + updated == replacements.size() : + "leftSegment size (" + leftSegment.size() + ") + rightSegment size (" + rightSegment.size() + + ") + updated (" + updated + ") != replacementMap size (" + replacements.size() + ')'; + return new IntervalNode(node.center, + node.low, + node.high, + newIntersectsLeft != null ? newIntersectsLeft : node.intersectsLeft, + newIntersectsRight != null ? newIntersectsRight : node.intersectsRight, + replace(node.left, leftSegment), + replace(node.right, rightSegment)); + } + + private IntervalNode add(Collection intervals) + { + return add(this, intervals); + } + + private IntervalNode add(IntervalNode root, Collection intervals) + { + if (intervals.isEmpty()) + return root; + + if (root == null) + { + List minSortedIntervals = new ArrayList<>(intervals); + Collections.sort(minSortedIntervals, Interval.minOrdering()); + List maxSortedIntervals = new ArrayList<>(intervals); + Collections.sort(maxSortedIntervals, Interval.maxOrdering()); + return new IntervalNode(minSortedIntervals, maxSortedIntervals); + } + + List leftSegment = new ArrayList<>(); + List rightSegment = new ArrayList<>(); + C newLow = root.low; + C newHigh = root.high; + List newIntersectsLeft = null; + List newIntersectsRight = null; + for (I i : intervals) + { + newLow = newLow.compareTo(i.min) < 0 ? newLow : i.min; + newHigh = newHigh.compareTo(i.max) > 0 ? newHigh : i.max; + if (i.max.compareTo(root.center) < 0) + { + leftSegment.add(i); + } + else if (i.min.compareTo(root.center) > 0) + { + rightSegment.add(i); + } + else + { + if (newIntersectsLeft == null) + { + newIntersectsLeft = new ArrayList<>(root.intersectsLeft); + newIntersectsRight = new ArrayList<>(root.intersectsRight); + } + int leftIdx = Collections.binarySearch(newIntersectsLeft, i, Interval.minOrdering()); + checkState(leftIdx < 0, "Should not add the same interval twice"); + leftIdx = -1 - leftIdx; + newIntersectsLeft.add(leftIdx, i); + + int rightIdx = Collections.binarySearch(newIntersectsRight, i, Interval.maxOrdering()); + checkState(rightIdx < 0, "Should not add the same interval twice"); + rightIdx = -1 - rightIdx; + newIntersectsRight.add(rightIdx, i); + } + } + + return new IntervalNode(root.center, + newLow, + newHigh, + newIntersectsLeft != null ? newIntersectsLeft : root.intersectsLeft, + newIntersectsRight != null ? newIntersectsRight : root.intersectsRight, + add(root.left, leftSegment), + add(root.right, rightSegment)); + } } - private class TreeIterator extends AbstractIterator + public static class Builder, D extends Comparable, I extends Interval> { - private final Deque stack = new ArrayDeque(); - private Iterator current; + private final List intervals = new ArrayList<>(); - TreeIterator(IntervalNode node) + public Builder addAll(IntervalTree other) { - super(); - gotoMinOf(node); + other.forEach(intervals::add); + return this; } - protected I computeNext() + public Builder add(I interval) { - while (true) - { - if (current != null && current.hasNext()) - return current.next(); + intervals.add(interval); + return this; + } - IntervalNode node = stack.pollFirst(); - if (node == null) - return endOfData(); + public Builder removeIf(TriPredicate predicate) + { + intervals.removeIf(i -> predicate.test(i.min, i.max, i.data)); + return this; + } - current = node.intersectsLeft.iterator(); + public Builder removeIf(BiPredicate predicate) + { + intervals.removeIf(i -> predicate.test(i.min, i.max)); + return this; + } - // We know this is the smaller not returned yet, but before doing - // its parent, we must do everyone on it's right. - gotoMinOf(node.right); - } + public Builder removeIf(Predicate predicate) + { + intervals.removeIf(i -> predicate.test(i.data)); + return this; } - private void gotoMinOf(IntervalNode node) + public IntervalTree build() { - while (node != null) - { - stack.offerFirst(node); - node = node.left; - } + return IntervalTree.build(intervals); + } + @Override + public String toString() + { + return intervals.toString(); } } } \ No newline at end of file diff --git a/src/java/org/apache/cassandra/utils/JVMStabilityInspector.java b/src/java/org/apache/cassandra/utils/JVMStabilityInspector.java index fdd678efd04b..220c7ff9fd71 100644 --- a/src/java/org/apache/cassandra/utils/JVMStabilityInspector.java +++ b/src/java/org/apache/cassandra/utils/JVMStabilityInspector.java @@ -30,10 +30,6 @@ import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.ImmutableSet; -import org.apache.cassandra.exceptions.UnrecoverableIllegalStateException; -import org.apache.cassandra.metrics.StorageMetrics; -import org.apache.cassandra.service.DiskErrorsHandlerService; -import org.apache.cassandra.tracing.Tracing; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -41,9 +37,14 @@ import org.apache.cassandra.concurrent.ScheduledExecutors; import org.apache.cassandra.config.Config; import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.exceptions.UnrecoverableIllegalStateException; import org.apache.cassandra.io.FSError; import org.apache.cassandra.io.sstable.CorruptSSTableException; +import org.apache.cassandra.journal.Params.FailurePolicy; +import org.apache.cassandra.service.DiskErrorsHandlerService; +import org.apache.cassandra.metrics.StorageMetrics; import org.apache.cassandra.service.StorageService; +import org.apache.cassandra.tracing.Tracing; import org.apache.cassandra.utils.concurrent.UncheckedInterruptedException; import static org.apache.cassandra.config.CassandraRelevantProperties.PRINT_HEAP_HISTOGRAM_ON_OUT_OF_MEMORY_ERROR; @@ -94,6 +95,11 @@ public static void inspectCommitLogThrowable(Throwable t) inspectThrowable(t, ex -> DiskErrorsHandlerService.get().inspectCommitLogError(ex)); } + public static void inspectJournalThrowable(Throwable t, String journalName, FailurePolicy failurePolicy) + { + inspectThrowable(t, th -> inspectJournalError(th, journalName, failurePolicy)); + } + public static void inspectThrowable(Throwable t, Consumer fn) throws OutOfMemoryError { boolean isUnstable = false; @@ -128,7 +134,14 @@ else if (t instanceof UnrecoverableIllegalStateException) } // Anything other than an OOM, we should try and heap dump to capture what's going on if configured to do so - HeapUtils.maybeCreateHeapDump(); + try + { + HeapUtils.maybeCreateHeapDump(); + } + catch (Throwable sub) + { + t.addSuppressed(sub); + } if (t instanceof InterruptedException) throw new UncheckedInterruptedException((InterruptedException) t); @@ -189,6 +202,19 @@ private static void forceHeapSpaceOomMaybe(OutOfMemoryError oom) } } + private static void inspectJournalError(Throwable t, String journalName, FailurePolicy failurePolicy) + { + if (!StorageService.instance.isDaemonSetupCompleted()) + { + logger.error("Exiting due to error while processing journal {} during initialization.", journalName, t); + killer.killCurrentJVM(t, true); + } + else if (failurePolicy == FailurePolicy.DIE) + { + killer.killCurrentJVM(t); + } + } + public static void killCurrentJVM(Throwable t, boolean quiet) { killer.killCurrentJVM(t, quiet); diff --git a/src/java/org/apache/cassandra/utils/MergeIterator.java b/src/java/org/apache/cassandra/utils/MergeIterator.java index 1dd1f7833bd1..fdbae53d7f7e 100644 --- a/src/java/org/apache/cassandra/utils/MergeIterator.java +++ b/src/java/org/apache/cassandra/utils/MergeIterator.java @@ -19,6 +19,8 @@ import java.util.*; +import accord.utils.Invariants; + /** Merges sorted input iterators which individually contain unique items. */ public abstract class MergeIterator extends AbstractIterator implements IMergeIterator { @@ -44,6 +46,33 @@ public static MergeIterator get(List> return new ManyToOne<>(sources, comparator, reducer); } + public static > MergeIterator get(List> sources) + { + return get(sources, Comparator.naturalOrder(), new Reducer<>() + { + private E first = null; + @Override + protected void onKeyChange() + { + first = null; + } + + @Override + public void reduce(int idx, E current) + { + if (first == null) + first = current; + } + + @Override + protected E getReduced() + { + Invariants.require(first != null); + return first; + } + }); + } + public Iterable> iterators() { return iterators; @@ -421,6 +450,23 @@ public boolean needsAdvance() /** Accumulator that collects values of type A, and outputs a value of type B. */ public static abstract class Reducer { + public static class Trivial extends Reducer + { + private T reduced = null; + + @Override + public boolean trivialReduceIsTrivial() { return true; } + + @Override + public void reduce(int idx, T current) { reduced = current; } + + @Override + protected T getReduced() { return reduced; } + + @Override + protected void onKeyChange() { reduced = null; } + } + /** * @return true if Out is the same as In for the case of a single source iterator */ diff --git a/src/java/org/apache/cassandra/utils/MerkleTree.java b/src/java/org/apache/cassandra/utils/MerkleTree.java index 2646057c0428..cadc47ed2736 100644 --- a/src/java/org/apache/cassandra/utils/MerkleTree.java +++ b/src/java/org/apache/cassandra/utils/MerkleTree.java @@ -749,7 +749,7 @@ private static ByteBuffer allocate(int innerNodeCount, IPartitioner partitioner) int size = offHeapBufferSize(innerNodeCount, partitioner); logger.debug("Allocating direct buffer of size {} for an off-heap merkle tree", size); ByteBuffer buffer = ByteBuffer.allocateDirect(size); - if (Ref.DEBUG_ENABLED) + if (Ref.TRACE_ENABLED) MemoryUtil.setAttachment(buffer, new Ref.DirectBufferRef<>(null, null)); return buffer; } diff --git a/src/java/org/apache/cassandra/utils/MutableEntry.java b/src/java/org/apache/cassandra/utils/MutableEntry.java new file mode 100644 index 000000000000..9ae0c17177cf --- /dev/null +++ b/src/java/org/apache/cassandra/utils/MutableEntry.java @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.utils; + +import java.util.Map; +import java.util.Objects; + +public class MutableEntry implements Map.Entry +{ + private final K k; + private V v; + + public MutableEntry(K k, V v) + { + this.k = k; + this.v = v; + } + + @Override + public K getKey() + { + return k; + } + + @Override + public V getValue() + { + return v; + } + + @Override + public V setValue(V value) + { + V previous = v; + v = Objects.requireNonNull(value); + return previous; + } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || !(o instanceof Map.Entry)) return false; + Map.Entry that = (Map.Entry) o; + return Objects.equals(k, that.getKey()) && Objects.equals(v, that.getValue()); + } + + @Override + public int hashCode() + { + return Objects.hash(k, v); + } + + @Override + public String toString() + { + return k + "=" + v; + } +} diff --git a/src/java/org/apache/cassandra/utils/NoSpamLogger.java b/src/java/org/apache/cassandra/utils/NoSpamLogger.java index 0a13f6b2a5ae..94c5e7d103d8 100644 --- a/src/java/org/apache/cassandra/utils/NoSpamLogger.java +++ b/src/java/org/apache/cassandra/utils/NoSpamLogger.java @@ -21,11 +21,10 @@ import java.util.concurrent.atomic.AtomicLong; import java.util.function.Supplier; +import com.google.common.annotations.VisibleForTesting; import org.cliffc.high_scale_lib.NonBlockingHashMap; import org.slf4j.Logger; -import com.google.common.annotations.VisibleForTesting; - import static org.apache.cassandra.utils.Clock.Global; /** @@ -47,7 +46,7 @@ public class NoSpamLogger */ public enum Level { - INFO, WARN, ERROR + DEBUG, INFO, WARN, ERROR } @VisibleForTesting @@ -78,7 +77,7 @@ public NoSpamLogStatement(String statement, long minIntervalNanos) this.minIntervalNanos = minIntervalNanos; } - private boolean shouldLog(long nowNanos) + public boolean shouldLog(long nowNanos) { long expected = get(); return nowNanos >= expected && compareAndSet(expected, nowNanos + minIntervalNanos); @@ -100,6 +99,9 @@ private boolean logNoCheck(Level l, Object... objects) { switch (l) { + case DEBUG: + wrapped.debug(statement, objects); + break; case INFO: wrapped.info(statement, objects); break; @@ -115,6 +117,16 @@ private boolean logNoCheck(Level l, Object... objects) return true; } + public boolean debug(long nowNanos, Object... objects) + { + return NoSpamLogStatement.this.log(Level.DEBUG, nowNanos, objects); + } + + public boolean debug(Object... objects) + { + return NoSpamLogStatement.this.debug(CLOCK.nanoTime(), objects); + } + public boolean info(long nowNanos, Object... objects) { return NoSpamLogStatement.this.log(Level.INFO, nowNanos, objects); @@ -217,6 +229,11 @@ private NoSpamLogger(Logger wrapped, long minInterval, TimeUnit timeUnit) minIntervalNanos = timeUnit.toNanos(minInterval); } + public static NoSpamLogger wrap(Logger wrapped, long minInterval, TimeUnit timeUnit) + { + return new NoSpamLogger(wrapped, minInterval, timeUnit); + } + public boolean info(long nowNanos, String s, Object... objects) { return NoSpamLogger.this.log( Level.INFO, s, nowNanos, objects); diff --git a/src/java/org/apache/cassandra/utils/NullableSerializer.java b/src/java/org/apache/cassandra/utils/NullableSerializer.java index 67e2d6a0a925..8392bf19dc85 100644 --- a/src/java/org/apache/cassandra/utils/NullableSerializer.java +++ b/src/java/org/apache/cassandra/utils/NullableSerializer.java @@ -22,49 +22,134 @@ import org.apache.cassandra.db.TypeSizes; import org.apache.cassandra.io.IVersionedSerializer; +import org.apache.cassandra.io.UnversionedSerializer; +import org.apache.cassandra.io.VersionedSerializer; import org.apache.cassandra.io.util.DataInputPlus; import org.apache.cassandra.io.util.DataOutputPlus; public class NullableSerializer { + public static void serializeNullable(T value, DataOutputPlus out, UnversionedSerializer serializer) throws IOException + { + out.writeBoolean(value != null); + if (value != null) + serializer.serialize(value, out); + } + + public static void serializeNullable(T value, DataOutputPlus out, int version, IVersionedSerializer serializer) throws IOException + { + out.writeBoolean(value != null); + if (value != null) + serializer.serialize(value, out, version); + } - public static void serializeNullable(IVersionedSerializer serializer, T value, DataOutputPlus out, int version) throws IOException + public static void serializeNullable(T value, DataOutputPlus out, Version version, VersionedSerializer serializer) throws IOException { out.writeBoolean(value != null); if (value != null) serializer.serialize(value, out, version); } - public static T deserializeNullable(IVersionedSerializer serializer, DataInputPlus in, int version) throws IOException + public static T deserializeNullable(DataInputPlus in, UnversionedSerializer serializer) throws IOException + { + return in.readBoolean() ? serializer.deserialize(in) : null; + } + + public static T deserializeNullable(DataInputPlus in, int version, IVersionedSerializer serializer) throws IOException + { + return in.readBoolean() ? serializer.deserialize(in, version) : null; + } + + public static T deserializeNullable(DataInputPlus in, Version version, VersionedSerializer serializer) throws IOException { return in.readBoolean() ? serializer.deserialize(in, version) : null; } - public static long serializedSizeNullable(IVersionedSerializer serializer, T value, int version) + public static long serializedNullableSize(T value, UnversionedSerializer serializer) + { + return value != null + ? TypeSizes.sizeof(true) + serializer.serializedSize(value) + : TypeSizes.sizeof(false); + } + + public static long serializedNullableSize(T value, int version, IVersionedSerializer serializer) { return value != null ? TypeSizes.sizeof(true) + serializer.serializedSize(value, version) : TypeSizes.sizeof(false); } + public static long serializedNullableSize(T value, Version version, VersionedSerializer serializer) + { + return value != null + ? TypeSizes.sizeof(true) + serializer.serializedSize(value, version) + : TypeSizes.sizeof(false); + } + + public static UnversionedSerializer wrap(UnversionedSerializer wrap) + { + return new UnversionedSerializer<>() + { + @Override + public void serialize(T t, DataOutputPlus out) throws IOException + { + serializeNullable(t, out, wrap); + } + + @Override + public T deserialize(DataInputPlus in) throws IOException + { + return deserializeNullable(in, wrap); + } + + @Override + public long serializedSize(T t) + { + return serializedNullableSize(t, wrap); + } + }; + } + public static IVersionedSerializer wrap(IVersionedSerializer wrap) { return new IVersionedSerializer() { public void serialize(T t, DataOutputPlus out, int version) throws IOException { - serializeNullable(wrap, t, out, version); + serializeNullable(t, out, version, wrap); } public T deserialize(DataInputPlus in, int version) throws IOException { - return deserializeNullable(wrap, in, version); + return deserializeNullable(in, version, wrap); } public long serializedSize(T t, int version) { - return serializedSizeNullable(wrap, t, version); + return serializedNullableSize(t, version, wrap); } }; } + public static VersionedSerializer wrap(VersionedSerializer wrap) + { + return new VersionedSerializer<>() { + @Override + public void serialize(T t, DataOutputPlus out, Version version) throws IOException + { + serializeNullable(t, out, version, wrap); + } + + @Override + public T deserialize(DataInputPlus in, Version version) throws IOException + { + return deserializeNullable(in, version, wrap); + } + + @Override + public long serializedSize(T t, Version version) + { + return serializedNullableSize(t, version, wrap); + } + }; + } } diff --git a/src/java/org/apache/cassandra/utils/PojoToString.java b/src/java/org/apache/cassandra/utils/PojoToString.java new file mode 100644 index 000000000000..369a386cdce9 --- /dev/null +++ b/src/java/org/apache/cassandra/utils/PojoToString.java @@ -0,0 +1,183 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.utils; + +import java.util.Collection; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableSet; + +import com.fasterxml.jackson.core.JsonProcessingException; +import org.yaml.snakeyaml.DumperOptions; +import org.yaml.snakeyaml.DumperOptions.FlowStyle; +import org.yaml.snakeyaml.Yaml; + +import static com.google.common.base.Preconditions.checkArgument; +import static org.apache.cassandra.utils.JsonUtils.JSON_OBJECT_MAPPER; +import static org.apache.cassandra.utils.LocalizeString.toUpperCaseLocalized; + +/** + * Helper to format POJOs that are easy to convert (primitives, nonnull, and built in collections) + * in various human + machine readable formats. Useful for JMX and nodetool. + */ +public class PojoToString +{ + public static final Integer VERSION_50 = 0; + + public static final Integer CURRENT_VERSION = VERSION_50; + + enum Format + { + YAML, + MINIFIED_YAML(true), + JSON, + MINIFIED_JSON(true); + + boolean minified; + + Format() + { + this(false); + } + + Format(boolean minified) + { + this.minified = minified; + } + + public boolean isYaml() + { + return this == YAML || this == MINIFIED_YAML; + } + + public static Format fromString(String formatString) + { + formatString = toUpperCaseLocalized(formatString); + switch (formatString) + { + case "YAML": + return YAML; + case "MINIFIED-YAML": + return MINIFIED_YAML; + case "JSON": + return JSON; + case "MINIFIED-JSON": + return MINIFIED_JSON; + default: throw new IllegalArgumentException("Unsupported format " + formatString + + " supported formats are YAML, MINIFIED-YAML, JSON, MINIFIED-JSON"); + } + } + } + private static final Set> ALLOWED_PRIMITIVES = ImmutableSet.of( + String.class, + Double.class, + Float.class, + Long.class, + Integer.class, + Short.class, + Byte.class + ); + + private static final List> ALLOWED_COLLECTIONS = ImmutableList.of( + List.class, + Set.class + ); + + /** + * Helper to convert POJOs from a restricted set (primitive Java types and collections) to a human/machine readable + * format that is specified by the format parameter. + * + * This doesn't enforce what objects are serialized so you can get error or messy output if you try and serialize + * things that aren't primitive or collections. + * + * The map must contain a 'version' key set to CURRENT_VERSION + * @param map Map POJO that must be restricted to easily representable types (map, set , list, primitives), and contains the 'version' key set to CURRENT_VERSION + * @param formatString Human/machine readable format name, can be YAML or JSON, prefix with MINIFIED- to get a minified version + * @return The map formatted in the requested format + * @throws IllegalArgumentException If the 'version' key is not present and set to CURRENT_VERSION + */ + public static String pojoMapToString(Map map, String formatString) + { + checkArgument(CURRENT_VERSION.equals(map.get("version"))); + return pojoToString(map, formatString); + } + + private static String pojoToString(Object obj, String formatString) + { + validateAllowedTypes(obj); + Format format = Format.fromString(formatString); + if (format.isYaml()) + { + DumperOptions dumperOptions = new DumperOptions(); + if (format.minified) + { + dumperOptions.setDefaultFlowStyle(FlowStyle.FLOW); + dumperOptions.setIndent(1); + dumperOptions.setWidth(Integer.MAX_VALUE); + dumperOptions.setSplitLines(false); + } + // TODO How do you get snake yaml to produce minified output? + return new Yaml(dumperOptions).dump(obj); + } + else + { + try + { + if (format.minified) + return JSON_OBJECT_MAPPER.writeValueAsString(obj); + else + return JSON_OBJECT_MAPPER.writerWithDefaultPrettyPrinter().writeValueAsString(obj); + } + catch (JsonProcessingException e) + { + throw new RuntimeException(e); + } + } + } + + private static void validateAllowedTypes(Object o) + { + if (o == null) + throw new NullPointerException("Null objects are unsupported"); + if (o instanceof Map) + { + for (Map.Entry entry : ((Map)o).entrySet()) + { + Object key = entry.getKey(); + if (!(key instanceof String | key instanceof Long | key instanceof Integer)) + throw new IllegalArgumentException("Map has entry with key " + entry.getKey() + " of " + + key.getClass() + " which is unsupported, only String is supported for map keys"); + validateAllowedTypes(entry.getValue()); + } + } + else if (o instanceof Collection) + { + if (!(o instanceof Set | o instanceof List)) + throw new IllegalArgumentException("Collection " + o + " with " + o.getClass() + " is not in allow list " + ALLOWED_COLLECTIONS); + for (Object element : ((Collection)o)) + validateAllowedTypes(element); + } + else if (!ALLOWED_PRIMITIVES.contains(o.getClass())) + throw new IllegalArgumentException("Scalar " + o + " with " + o.getClass() + " is not in allow list " + ALLOWED_PRIMITIVES); + + } +} diff --git a/src/java/org/apache/cassandra/utils/RTree.java b/src/java/org/apache/cassandra/utils/RTree.java new file mode 100644 index 000000000000..5b7affe373d9 --- /dev/null +++ b/src/java/org/apache/cassandra/utils/RTree.java @@ -0,0 +1,535 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.utils; + +import java.util.ArrayList; +import java.util.Comparator; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.function.Consumer; +import java.util.function.Function; +import java.util.function.Predicate; +import java.util.stream.Stream; +import java.util.stream.StreamSupport; +import javax.annotation.CheckForNull; + +import com.google.common.collect.AbstractIterator; + +public class RTree implements RangeTree +{ + /** + * Tuning size target can be tricky as it is based on expected access patterns and expected matche sizes. There is also + * a memory cost to account for as large tree sizes will have far more nodes with a small target than a large target. + * + * If matching most of the data then larger sizes leads to fewer hops + * If matching few elements then tree depth maters the most, if walking a long tree is more costly than walking the + * element list, then shrinking depth (by having larger size target) can improve performance. + */ + private static final int DEFAULT_SIZE_TARGET = 1 << 7; + private static final int DEFAULT_NUMBER_OF_CHILDREN = 6; + + private final Comparator comparator; + private final Accessor accessor; + private final int sizeTarget; + private final int numChildren; + private Node node = new Node(); + + public RTree(Comparator comparator, Accessor accessor) + { + this(comparator, accessor, DEFAULT_SIZE_TARGET, DEFAULT_NUMBER_OF_CHILDREN); + } + + public RTree(Comparator comparator, Accessor accessor, int sizeTarget, int numChildren) + { + if (sizeTarget <= 1) + throw new IllegalArgumentException("size target must be 2 or more"); + if (numChildren <= 1) + throw new IllegalArgumentException("Number of children must be 2 or more"); + if (sizeTarget < numChildren) + throw new IllegalArgumentException("Size target (" + sizeTarget + ") was less than number of children (" + numChildren + ")"); + this.comparator = comparator; + this.accessor = accessor; + this.sizeTarget = sizeTarget; + this.numChildren = numChildren; + } + + public static , Range, Value> RTree create(Accessor accessor) + { + return new RTree<>(Comparator.naturalOrder(), accessor); + } + + @Override + public List get(Range range) + { + List matches = new ArrayList<>(); + get(range, e -> matches.add(e.getValue())); + return matches; + } + + @Override + public void get(Range range, Consumer> onMatch) + { + node.search(range, onMatch, e -> e.getKey().equals(range), Function.identity()); + } + + @Override + public List> search(Range range) + { + List> matches = new ArrayList<>(); + search(range, matches::add); + return matches; + } + + @Override + public void search(Range range, Consumer> onMatch) + { + node.search(range, onMatch, ignore -> true, Function.identity()); + } + + public List find(Range range) + { + List matches = new ArrayList<>(); + find(range, matches::add); + return matches; + } + + public void find(Range range, Consumer onMatch) + { + node.search(range, onMatch, ignore -> true, Map.Entry::getValue); + } + + @Override + public List> searchToken(Token token) + { + List> matches = new ArrayList<>(); + searchToken(token, matches::add); + return matches; + } + + @Override + public void searchToken(Token token, Consumer> onMatch) + { + node.searchToken(token, onMatch, ignore -> true, Function.identity()); + } + + public List findToken(Token token) + { + List matches = new ArrayList<>(); + findToken(token, matches::add); + return matches; + } + + public void findToken(Token token, Consumer onMatch) + { + node.searchToken(token, onMatch, ignore -> true, Map.Entry::getValue); + } + + @Override + public boolean add(Range key, Value value) + { + node.add(key, value); + return true; + } + + @Override + public int remove(Range key) + { + return node.removeIf(e -> e.getKey().equals(key)); + } + + public int remove(Range key, Value value) + { + Map.Entry match = Map.entry(key, value); + return node.removeIf(match::equals); + } + + @Override + public void clear() + { + node = new Node(); + } + + @Override + public int size() + { + return node.size; + } + + @Override + public boolean isEmpty() + { + return node.size == 0; + } + + public String displayTree() + { + StringBuilder sb = new StringBuilder(); + node.displayTree(0, sb); + return sb.toString(); + } + + @Override + public Iterator> iterator() + { + return node.iterator(); + } + + @Override + public Stream> stream() + { + return StreamSupport.stream(spliterator(), false); + } + + private class Node implements Iterable> + { + private List> values = new ArrayList<>(); + private List children = null; + private int size = 0; + private Token minStart, maxStart, minEnd, maxEnd; + + int removeIf(Predicate> condition) + { + if (minStart == null) + return 0; + if (children != null) + { + int sum = 0; + for (Node node : children) + sum += node.removeIf(condition); + size -= sum; + return sum; + } + class Counter {int value;} + Counter counter = new Counter(); + values.removeIf(e -> { + if (condition.test(e)) + { + counter.value++; + return true; + } + return false; + }); + size -= counter.value; + if (values.isEmpty()) + minStart = maxStart = minEnd = maxEnd = null; + return counter.value; + } + + void add(Range range, Value value) + { + size++; + if (minStart == null) + { + minStart = maxStart = accessor.start(range); + minEnd = maxEnd = accessor.end(range); + } + else + { + Token start = accessor.start(range); + minStart = min(minStart, start); + maxStart = max(maxStart, start); + Token end = accessor.end(range); + minEnd = min(minEnd, end); + maxEnd = max(maxEnd, end); + } + if (children != null) + { + findBestMatch(range).add(range, value); + return; + } + values.add(new MutableEntry(range, value)); + if (shouldSplit()) + split(); + } + + private Node findBestMatch(Range range) + { + int topIdx = 0; + Node node = children.get(0); + int topScore = node.score(range); + int size = node.size; + for (int i = 1; i < children.size(); i++) + { + node = children.get(i); + int score = node.score(range); + if (score > topScore || (score == topScore && size > node.size)) + { + topIdx = i; + size = node.size; + } + } + return children.get(topIdx); + } + + private int score(Range range) + { + if (minStart == null) + return 0; + if (!intersects(range)) + return -10; + int score = 5; // overlapps + if (values != null) // is leaf + score += 5; + + int startScore = 0; + if (comparator.compare(maxStart, accessor.start(range)) <= 0) + startScore += 10; + else if (comparator.compare(minStart, accessor.start(range)) <= 0) + startScore += 5; + + int endScore = 0; + if (comparator.compare(minEnd, accessor.end(range)) >= 0) + endScore += 10; + else if (comparator.compare(maxEnd, accessor.end(range)) >= 0) + endScore += 5; + // if fully contained, then add the scores: 10 for largest bounds, 20 for smallest bounds + if (!(startScore == 0 || endScore == 0)) + score += startScore + endScore; + return score; + } + + boolean shouldSplit() + { + return values.size() > sizeTarget + // if the same range is used over and over again, splitting doesn't do much + && !(comparator.compare(minStart, maxStart) == 0 + && comparator.compare(minEnd, maxEnd) == 0); + } + + List>> partitionByEnd() + { + List allEndpoints = new ArrayList<>(values.size() * 2); + for (Map.Entry a : values) + { + allEndpoints.add(accessor.start(a.getKey())); + allEndpoints.add(accessor.end(a.getKey())); + } + allEndpoints.sort(comparator); + List maxToken = new ArrayList<>(numChildren); + int tick = allEndpoints.size() / numChildren; + int offset = tick; + for (int i = 0; i < numChildren; i++) + { + maxToken.add(allEndpoints.get(offset)); + offset += tick; + if (offset >= allEndpoints.size()) + { + maxToken.add(allEndpoints.get(allEndpoints.size() - 1)); + break; + } + } + + List>> partitions = new ArrayList<>(numChildren); + for (int i = 0; i < numChildren; i++) + partitions.add(new ArrayList<>()); + + for (Map.Entry a : values) + { + Token end = accessor.end(a.getKey()); + List> selected = null; + for (int i = 0; i < numChildren; i++) + { + if (comparator.compare(end, maxToken.get(i)) < 0) + { + selected = partitions.get(i); + break; + } + } + if (selected == null) + selected = partitions.get(partitions.size() - 1); + selected.add(a); + } + int[] sizes = partitions.stream().mapToInt(List::size).toArray(); + return goodEnough(sizes) ? partitions : null; + } + + private boolean goodEnough(int[] sizes) + { + double sum = 0.0; + for (int i : sizes) + sum += i; + double mean = sum / sizes.length; + double stddev = 0.0; + for (int i : sizes) + stddev += Math.pow(i - mean, 2); + stddev = Math.sqrt(stddev / sizes.length); + return stddev < 1.5; + } + + void split() + { + children = new ArrayList<>(numChildren); + for (int i = 0; i < numChildren; i++) + children.add(new Node()); + + List>> partitions = partitionByEnd(); + if (partitions == null) + partitions = partitionEven(); + for (int i = 0; i < children.size(); i++) + { + Node c = children.get(i); + List> entries = partitions.get(i); + entries.forEach(e -> c.add(e.getKey(), e.getValue())); + } + + values.clear(); + values = null; + } + + private List>> partitionEven() + { + values.sort((a, b) -> { + Range left = a.getKey(); + Range right = b.getKey(); + int rc = comparator.compare(accessor.start(left), accessor.start(right)); + if (rc == 0) + rc = comparator.compare(accessor.end(left), accessor.end(right)); + return rc; + }); + List>> partition = new ArrayList<>(numChildren); + int size = Math.max(1, values.size() / numChildren); + int offset = 0; + for (int i = 0; i < numChildren - 1; i++) + { + int total = size; + partition.add(new ArrayList<>(values.subList(offset, offset + total))); + offset += total; + } + partition.add(new ArrayList<>(values.subList(offset, values.size()))); + return partition; + } + + void search(Range range, Consumer matches, Predicate> predicate, Function, T> transformer) + { + if (minStart == null) + return; + if (!intersects(range)) + return; + if (children != null) + { + children.forEach(n -> n.search(range, matches, predicate, transformer)); + return; + } + values.forEach(e -> { + if (accessor.intersects(e.getKey(), range) && predicate.test(e)) + matches.accept(transformer.apply(e)); + }); + } + + void searchToken(Token token, Consumer matches, Predicate> predicate, Function, T> transformer) + { + if (minStart == null) + return; + if (!contains(minStart, maxEnd, token)) + return; + if (children != null) + { + for (int i = 0, size = children.size(); i < size; i++) + { + Node node = children.get(i); + node.searchToken(token, matches, predicate, transformer); + } + return; + } + values.forEach(e -> { + if (accessor.contains(e.getKey(), token) && predicate.test(e)) + matches.accept(transformer.apply(e)); + }); + } + + boolean intersects(Range range) + { + return accessor.intersects(range, minStart, maxEnd); + } + + boolean contains(Token start, Token end, Token value) + { + return accessor.contains(start, end, value); + } + + private void displayTree(int level, StringBuilder sb) + { + for (int i = 0; i < level; i++) + sb.append('\t'); + sb.append("start:(").append(minStart).append(", ").append(maxStart).append("), end:(").append(minEnd).append(", ").append(maxEnd).append("):"); + if (children != null) + { + sb.append('\n'); + children.forEach(n -> n.displayTree(level + 1, sb)); + } + else + { + sb.append(' ').append(size).append('\n'); + } + } + + @Override + public String toString() + { + return "Node{" + + "minStart=" + minStart + + ", maxStart=" + maxStart + + ", minEnd=" + minEnd + + ", maxEnd=" + maxEnd + + ", values=" + values + + ", children=" + children + + '}'; + } + + private Token min(Token a, Token b) + { + return comparator.compare(a, b) < 0 ? a : b; + } + + private Token max(Token a, Token b) + { + return comparator.compare(a, b) < 0 ? b : a; + } + + @Override + public Iterator> iterator() + { + if (values != null) + return values.iterator(); + return new AbstractIterator<>() + { + private int index = 0; + private Iterator> it = null; + @CheckForNull + @Override + protected Map.Entry computeNext() + { + while (true) + { + if (it == null) + { + if (index == children.size()) + return endOfData(); + it = children.get(index++).iterator(); + } + if (it.hasNext()) + return it.next(); + it = null; + } + } + }; + } + } +} diff --git a/src/java/org/apache/cassandra/utils/RangeTree.java b/src/java/org/apache/cassandra/utils/RangeTree.java new file mode 100644 index 000000000000..4fb0b8696acb --- /dev/null +++ b/src/java/org/apache/cassandra/utils/RangeTree.java @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.utils; + +import java.util.List; +import java.util.Map; +import java.util.function.Consumer; +import java.util.stream.Stream; +import java.util.stream.StreamSupport; + +public interface RangeTree extends Iterable> +{ + void searchToken(Token token, Consumer> onMatch); + + boolean add(Range key, Value value); + + List get(Range range); + + void get(Range range, Consumer> onMatch); + + List> search(Range range); + + void search(Range range, Consumer> onMatch); + + List> searchToken(Token token); + + int remove(Range key); + + void clear(); + + int size(); + + boolean isEmpty(); + + default Stream> stream() + { + return StreamSupport.stream(spliterator(), false); + } + + interface Accessor + { + Token start(Range range); + Token end(Range range); + boolean contains(Token start, Token end, Token token); + default boolean contains(Range range, Token token) + { + return contains(start(range), end(range), token); + } + boolean intersects(Range range, Token start, Token end); + default boolean intersects(Range left, Range right) + { + return intersects(left, start(right), end(right)); + } + } +} diff --git a/src/java/org/apache/cassandra/utils/Sortable.java b/src/java/org/apache/cassandra/utils/Sortable.java new file mode 100644 index 000000000000..145967cb24f6 --- /dev/null +++ b/src/java/org/apache/cassandra/utils/Sortable.java @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.utils; + +import java.util.Comparator; + +public interface Sortable> extends Iterable +{ + int size(); + + S sorted(Comparator comparator); +} diff --git a/src/java/org/apache/cassandra/utils/SortedBiMultiValMap.java b/src/java/org/apache/cassandra/utils/SortedBiMultiValMap.java index 44ac0a01c0b4..1b0d8541ee43 100644 --- a/src/java/org/apache/cassandra/utils/SortedBiMultiValMap.java +++ b/src/java/org/apache/cassandra/utils/SortedBiMultiValMap.java @@ -18,17 +18,21 @@ package org.apache.cassandra.utils; import java.util.Collection; -import java.util.SortedMap; +import java.util.NavigableMap; import java.util.TreeMap; import com.google.common.collect.SortedSetMultimap; import com.google.common.collect.TreeMultimap; -public class SortedBiMultiValMap extends BiMultiValMap +public class SortedBiMultiValMap extends AbstractBiMultiValMap { - protected SortedBiMultiValMap(SortedMap forwardMap, SortedSetMultimap reverseMap) + protected final NavigableMap forwardMap; + protected final SortedSetMultimap reverseMap; + + protected SortedBiMultiValMap(NavigableMap forwardMap, SortedSetMultimap reverseMap) { - super(forwardMap, reverseMap); + this.forwardMap = forwardMap; + this.reverseMap = reverseMap; } public static , V extends Comparable> SortedBiMultiValMap create() @@ -36,10 +40,10 @@ public static , V extends Comparable> SortedBiMultiVa return new SortedBiMultiValMap(new TreeMap(), TreeMultimap.create()); } - public static , V extends Comparable> SortedBiMultiValMap create(BiMultiValMap map) + public static , V extends Comparable, M extends AbstractBiMultiValMap> SortedBiMultiValMap create(M map) { SortedBiMultiValMap newMap = SortedBiMultiValMap.create(); - newMap.forwardMap.putAll(map.forwardMap); + newMap.forwardMap.putAll(map.forwardDelegate()); // Put each individual TreeSet instead of Multimap#putAll(Multimap) to get linear complexity // See CASSANDRA-14660 for (Entry> entry : map.inverse().asMap().entrySet()) @@ -47,4 +51,15 @@ public static , V extends Comparable> SortedBiMultiVa return newMap; } + @Override + protected NavigableMap forwardDelegate() + { + return forwardMap; + } + + @Override + protected SortedSetMultimap reverseDelegate() + { + return reverseMap; + } } diff --git a/src/java/org/apache/cassandra/utils/Throwables.java b/src/java/org/apache/cassandra/utils/Throwables.java index 242c7688089a..df9ad111380b 100644 --- a/src/java/org/apache/cassandra/utils/Throwables.java +++ b/src/java/org/apache/cassandra/utils/Throwables.java @@ -39,6 +39,8 @@ import org.apache.cassandra.io.util.File; import org.apache.cassandra.utils.concurrent.UncheckedInterruptedException; +import static com.google.common.base.Throwables.getStackTraceAsString; + public final class Throwables { public enum FileOpType { READ, WRITE } @@ -48,6 +50,23 @@ public interface DiscreteAction void perform() throws E; } + public interface ThrowingRunnable + { + void run() throws Exception; + } + + public static void runUnchecked(ThrowingRunnable runnable) + { + try + { + runnable.run(); + } + catch (Exception e) + { + throwAsUncheckedException(e); + } + } + public static boolean isCausedBy(Throwable t, Predicate cause) { return cause.test(t) || (t.getCause() != null && cause.test(t.getCause())); @@ -340,4 +359,16 @@ public static void assertAnyCause(Throwable err, Class... c if (Arrays.stream(causeClasses).noneMatch(c -> anyCauseMatches(err, c::isInstance))) throw new AssertionError("The exception is not caused by any of " + Arrays.toString(causeClasses), err); } + + public static Object getStackTraceAsToString(Throwable t) + { + return new Object() + { + @Override + public String toString() + { + return getStackTraceAsString(t); + } + }; + } } diff --git a/src/java/org/apache/cassandra/utils/TimeUUID.java b/src/java/org/apache/cassandra/utils/TimeUUID.java index 6f4281c7ca64..a0b969884894 100644 --- a/src/java/org/apache/cassandra/utils/TimeUUID.java +++ b/src/java/org/apache/cassandra/utils/TimeUUID.java @@ -46,6 +46,7 @@ import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.db.marshal.ValueAccessor; import org.apache.cassandra.io.IVersionedSerializer; +import org.apache.cassandra.io.UnversionedSerializer; import org.apache.cassandra.io.util.DataInputPlus; import org.apache.cassandra.io.util.DataOutputPlus; import org.apache.cassandra.locator.InetAddressAndPort; @@ -143,7 +144,12 @@ public static TimeUUID fromBytes(long msb, long lsb) public static TimeUUID deserialize(ByteBuffer buffer) { - return fromBytes(buffer.getLong(buffer.position()), buffer.getLong(buffer.position() + 8)); + return deserialize(buffer, buffer.position()); + } + + public static TimeUUID deserialize(ByteBuffer buffer, int position) + { + return fromBytes(buffer.getLong(position), buffer.getLong(position + 8)); } public static TimeUUID deserialize(DataInput in) throws IOException @@ -237,6 +243,11 @@ public static long unixMicrosToRawTimestamp(long unixMicros) return unixMicros * 10 - (UUID_EPOCH_UNIX_MILLIS * 10000); } + public static long unixMicrosToMsb(long unixMicros) + { + return TimeUUID.rawTimestampToMsb(TimeUUID.unixMicrosToRawTimestamp(unixMicros)); + } + public static long msbToRawTimestamp(long msb) { assert (UUID_VERSION_BITS_IN_MSB & msb) == TIMESTAMP_UUID_VERSION_IN_MSB; @@ -351,7 +362,7 @@ public ByteBuffer serialize(T value) } } - public static class Serializer extends AbstractSerializer implements IVersionedSerializer + public static class Serializer extends AbstractSerializer implements IVersionedSerializer, UnversionedSerializer { public static final Serializer instance = new Serializer(); @@ -371,17 +382,35 @@ public void serialize(TimeUUID t, DataOutputPlus out, int version) throws IOExce t.serialize(out); } + @Override + public void serialize(TimeUUID t, DataOutputPlus out) throws IOException + { + t.serialize(out); + } + @Override public TimeUUID deserialize(DataInputPlus in, int version) throws IOException { return TimeUUID.deserialize(in); } + @Override + public TimeUUID deserialize(DataInputPlus in) throws IOException + { + return TimeUUID.deserialize(in); + } + @Override public long serializedSize(TimeUUID t, int version) { return 16; } + + @Override + public long serializedSize(TimeUUID t) + { + return 16; + } } public static class Generator diff --git a/src/java/org/apache/cassandra/utils/TriPredicate.java b/src/java/org/apache/cassandra/utils/TriPredicate.java new file mode 100644 index 000000000000..a443d823f3f2 --- /dev/null +++ b/src/java/org/apache/cassandra/utils/TriPredicate.java @@ -0,0 +1,24 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.utils; + +public interface TriPredicate +{ + boolean test(A a, B b, C c); +} diff --git a/src/java/org/apache/cassandra/utils/UUIDGen.java b/src/java/org/apache/cassandra/utils/UUIDGen.java index 14ab23083b5f..b6bdba2cd4d6 100644 --- a/src/java/org/apache/cassandra/utils/UUIDGen.java +++ b/src/java/org/apache/cassandra/utils/UUIDGen.java @@ -30,7 +30,12 @@ public class UUIDGen /** creates a type 1 uuid from raw bytes. */ public static UUID getUUID(ByteBuffer raw) { - return new UUID(raw.getLong(raw.position()), raw.getLong(raw.position() + 8)); + return getUUID(raw.getLong(raw.position()), raw.getLong(raw.position() + 8)); + } + + public static UUID getUUID(long mostSigBits, long leastSigBits) + { + return new UUID(mostSigBits, leastSigBits); } public static ByteBuffer toByteBuffer(UUID uuid) @@ -45,13 +50,16 @@ public static ByteBuffer toByteBuffer(UUID uuid) /** decomposes a uuid into raw bytes. */ public static byte[] decompose(UUID uuid) { - long most = uuid.getMostSignificantBits(); - long least = uuid.getLeastSignificantBits(); + return decompose(uuid.getMostSignificantBits(), uuid.getLeastSignificantBits()); + } + + public static byte[] decompose(long msb, long lsb) + { byte[] b = new byte[16]; for (int i = 0; i < 8; i++) { - b[i] = (byte)(most >>> ((7-i) * 8)); - b[8+i] = (byte)(least >>> ((7-i) * 8)); + b[i] = (byte)(msb >>> ((7-i) * 8)); + b[8+i] = (byte)(lsb >>> ((7-i) * 8)); } return b; } diff --git a/src/java/org/apache/cassandra/utils/UUIDSerializer.java b/src/java/org/apache/cassandra/utils/UUIDSerializer.java index 47b6f8c565a2..f1cd26d15f6b 100644 --- a/src/java/org/apache/cassandra/utils/UUIDSerializer.java +++ b/src/java/org/apache/cassandra/utils/UUIDSerializer.java @@ -22,25 +22,47 @@ import org.apache.cassandra.db.TypeSizes; import org.apache.cassandra.io.IVersionedSerializer; +import org.apache.cassandra.io.UnversionedSerializer; import org.apache.cassandra.io.util.DataInputPlus; import org.apache.cassandra.io.util.DataOutputPlus; -public class UUIDSerializer implements IVersionedSerializer +public class UUIDSerializer implements IVersionedSerializer, UnversionedSerializer { public static UUIDSerializer serializer = new UUIDSerializer(); + @Override public void serialize(UUID uuid, DataOutputPlus out, int version) throws IOException + { + serialize(uuid, out); + } + + @Override + public void serialize(UUID uuid, DataOutputPlus out) throws IOException { out.writeLong(uuid.getMostSignificantBits()); out.writeLong(uuid.getLeastSignificantBits()); } + @Override public UUID deserialize(DataInputPlus in, int version) throws IOException + { + return deserialize(in); + } + + @Override + public UUID deserialize(DataInputPlus in) throws IOException { return new UUID(in.readLong(), in.readLong()); } + @Override public long serializedSize(UUID uuid, int version) + { + return serializedSize(uuid); + } + + @Override + public long serializedSize(UUID uuid) { return TypeSizes.sizeof(uuid.getMostSignificantBits()) + TypeSizes.sizeof(uuid.getLeastSignificantBits()); } diff --git a/src/java/org/apache/cassandra/utils/btree/AbstractBTreeMap.java b/src/java/org/apache/cassandra/utils/btree/AbstractBTreeMap.java index 3d0baf08f178..7ef33b417a0d 100644 --- a/src/java/org/apache/cassandra/utils/btree/AbstractBTreeMap.java +++ b/src/java/org/apache/cassandra/utils/btree/AbstractBTreeMap.java @@ -24,6 +24,7 @@ import java.util.Comparator; import java.util.Iterator; import java.util.Map; +import java.util.NavigableSet; import java.util.Set; import com.google.common.collect.Iterators; @@ -32,11 +33,13 @@ public abstract class AbstractBTreeMap extends AbstractMap { protected final Object[] tree; protected final KeyComparator comparator; + protected final AsymmetricKeyComparator asymmetricComparator; - protected AbstractBTreeMap(Object[] tree, KeyComparator comparator) + protected AbstractBTreeMap(Object[] tree, KeyComparator comparator, AsymmetricKeyComparator asymmetricComparator) { this.tree = tree; this.comparator = comparator; + this.asymmetricComparator = asymmetricComparator; } /** @@ -92,15 +95,15 @@ public V get(Object key) { if (key == null) throw new NullPointerException(); - Entry entry = BTree.find(tree, comparator, new Entry<>((K)key, null)); + Entry entry = (Entry) BTree.find(tree, asymmetricComparator, key); if (entry != null) return entry.getValue(); return null; } - private Set keySet = null; + private NavigableSet keySet = null; @Override - public Set keySet() + public NavigableSet keySet() { if (keySet == null) keySet = BTreeSet.wrap(BTree.transformAndFilter(tree, (entry) -> ((Map.Entry)entry).getKey()), comparator.keyComparator); @@ -160,6 +163,22 @@ public int compare(Map.Entry o1, Map.Entry o2) } } + protected static class AsymmetricKeyComparator implements Comparator + { + protected final Comparator keyComparator; + + protected AsymmetricKeyComparator(Comparator keyComparator) + { + this.keyComparator = keyComparator; + } + + @Override + public int compare(Object o1, Object o2) + { + return keyComparator.compare(((Map.Entry)o1).getKey(), (K)o2); + } + } + static class Entry extends AbstractMap.SimpleEntry { public Entry(K key, V value) diff --git a/src/java/org/apache/cassandra/utils/btree/BTree.java b/src/java/org/apache/cassandra/utils/btree/BTree.java index 8674d714daf8..ad01bb2c1cce 100644 --- a/src/java/org/apache/cassandra/utils/btree/BTree.java +++ b/src/java/org/apache/cassandra/utils/btree/BTree.java @@ -29,6 +29,7 @@ import com.google.common.collect.Iterators; import com.google.common.collect.Ordering; +import accord.utils.Invariants; import org.apache.cassandra.utils.BiLongAccumulator; import org.apache.cassandra.utils.BulkIterator; import org.apache.cassandra.utils.LongAccumulator; @@ -140,6 +141,13 @@ public static Object[] build(BulkIterator sourc return buildRoot(source, size, updateF); } + public static Object[] unsafeAllocateNonEmptyLeaf(int size) + { + Invariants.requireArgument(size > 0, "size should be non-zero"); + Invariants.requireArgument(size <= MAX_KEYS, "size (%s) should be no more than %s", size, MAX_KEYS); + return new Object[size | 1]; + } + /** * Build a leaf with {@code size} elements taken in bulk from {@code insert}, and apply {@code updateF} to these elements */ @@ -2393,7 +2401,7 @@ final boolean mustRedistribute() /** * Are we empty, i.e. we have no contents in either {@link #buffer} or {@link #savedBuffer} */ - final boolean isEmpty() + public final boolean isEmpty() { return count == 0 && savedNextKey == null; } @@ -3326,7 +3334,7 @@ public void close() } @Override - void reset() + public void reset() { Arrays.fill(leaf().buffer, null); leaf().count = 0; diff --git a/src/java/org/apache/cassandra/utils/btree/BTreeBiMap.java b/src/java/org/apache/cassandra/utils/btree/BTreeBiMap.java index 1e1984e635c7..3480b8dbaa42 100644 --- a/src/java/org/apache/cassandra/utils/btree/BTreeBiMap.java +++ b/src/java/org/apache/cassandra/utils/btree/BTreeBiMap.java @@ -29,16 +29,32 @@ public class BTreeBiMap extends AbstractBTreeMap implements BiMap valueComparator; + private final AsymmetricKeyComparator asymmetricValueComparator; protected static BTreeBiMap withComparators(Object[] tree, Object [] inverse, Comparator comparator, Comparator valueComparator) { - return new BTreeBiMap<>(tree, inverse, new KeyComparator<>(comparator), new KeyComparator<>(valueComparator)); + KeyComparator keyComparator = new KeyComparator<>(comparator); + AsymmetricKeyComparator asymmetricKeyComparator = new AsymmetricKeyComparator<>(comparator); + KeyComparator valueKeyComparator; + AsymmetricKeyComparator asymmetricValueComparator; + if (comparator == valueComparator) + { + valueKeyComparator = (KeyComparator) keyComparator; + asymmetricValueComparator = (AsymmetricKeyComparator) asymmetricKeyComparator; + } + else + { + valueKeyComparator = new KeyComparator<>(valueComparator); + asymmetricValueComparator = new AsymmetricKeyComparator<>(valueComparator); + } + return new BTreeBiMap<>(tree, inverse, keyComparator, asymmetricKeyComparator, valueKeyComparator, asymmetricValueComparator); } - private BTreeBiMap(Object[] tree, Object [] inverse, KeyComparator comparator, KeyComparator valueComparator) + private BTreeBiMap(Object[] tree, Object [] inverse, KeyComparator comparator, AsymmetricKeyComparator asymmetricKeyComparator, KeyComparator valueComparator, AsymmetricKeyComparator asymmetricValueComparator) { - super(tree, comparator); + super(tree, comparator, asymmetricKeyComparator); this.valueComparator = valueComparator; + this.asymmetricValueComparator = asymmetricValueComparator; this.inverse = inverse; } @@ -55,7 +71,7 @@ public static , V extends Comparable> BTreeBiMap inverse() { - return new BTreeBiMap<>(inverse, tree, valueComparator, comparator); + return new BTreeBiMap<>(inverse, tree, valueComparator, asymmetricValueComparator, comparator, asymmetricComparator); } @Override @@ -72,8 +88,8 @@ public BTreeBiMap with(K key, V value) return new BTreeBiMap<>(BTree.update(tree, new Object[]{ entry }, comparator, UpdateFunction.noOp()), BTree.update(inverse, new Object[] { new AbstractBTreeMap.Entry<>(value, key) }, valueComparator, UpdateFunction.noOp()), - comparator, - valueComparator); + comparator, asymmetricComparator, + valueComparator, asymmetricValueComparator); } @Override @@ -92,7 +108,7 @@ public BTreeBiMap without(K key) Object[] newTree = BTreeRemoval.remove(tree, comparator, new AbstractBTreeMap.Entry<>(key, null)); Object[] newInverse = BTreeRemoval.remove(inverse, valueComparator, new AbstractBTreeMap.Entry<>(existingEntry.getValue(), null)); - return new BTreeBiMap<>(newTree, newInverse, comparator, valueComparator); + return new BTreeBiMap<>(newTree, newInverse, comparator, asymmetricComparator, valueComparator, asymmetricValueComparator); } public Set values() diff --git a/src/java/org/apache/cassandra/utils/btree/BTreeMap.java b/src/java/org/apache/cassandra/utils/btree/BTreeMap.java index 2d8e92a298a3..c27cf2125b53 100644 --- a/src/java/org/apache/cassandra/utils/btree/BTreeMap.java +++ b/src/java/org/apache/cassandra/utils/btree/BTreeMap.java @@ -33,12 +33,12 @@ public class BTreeMap extends AbstractBTreeMap implements NavigableM { protected static BTreeMap withComparator(Object[] tree, Comparator comparator) { - return new BTreeMap<>(tree, new KeyComparator<>(comparator)); + return new BTreeMap<>(tree, new KeyComparator<>(comparator), new AsymmetricKeyComparator<>(comparator)); } - protected BTreeMap(Object[] tree, KeyComparator comparator) + protected BTreeMap(Object[] tree, KeyComparator comparator, AsymmetricKeyComparator asymmetricComparator) { - super(tree, comparator); + super(tree, comparator, asymmetricComparator); } public static BTreeMap empty(Comparator comparator) @@ -61,7 +61,7 @@ public BTreeMap with(K key, V value) AbstractBTreeMap.Entry existing; if ((existing = BTree.find(tree, comparator, entry)) != null && !existing.equals(entry)) throw new IllegalStateException("Map already contains " + key); - return new BTreeMap<>(BTree.update(tree, new Object[]{ entry }, comparator, UpdateFunction.noOp()), comparator); + return new BTreeMap<>(BTree.update(tree, BTree.singleton(entry), comparator, UpdateFunction.noOp()), comparator, asymmetricComparator); } public BTreeMap withForce(K key, V value) @@ -69,7 +69,7 @@ public BTreeMap withForce(K key, V value) if (key == null || value == null) throw new NullPointerException(); AbstractBTreeMap.Entry entry = new AbstractBTreeMap.Entry<>(key, value); - return new BTreeMap<>(BTree.update(tree, new Object[] { entry }, comparator, UpdateFunction.Simple.of((a, b) -> b)), comparator); + return new BTreeMap<>(BTree.update(tree, BTree.singleton(entry), comparator, UpdateFunction.Simple.of((a, b) -> b)), comparator, asymmetricComparator); } public BTreeMap without(K key) @@ -77,13 +77,14 @@ public BTreeMap without(K key) if (key == null) throw new NullPointerException(); - return new BTreeMap<>(BTreeRemoval.remove(tree, comparator, new AbstractBTreeMap.Entry<>(key, null)), comparator); + return new BTreeMap<>(BTreeRemoval.remove(tree, asymmetricComparator, key), comparator, asymmetricComparator); } @Override public Map.Entry lowerEntry(K key) { - return BTree.lower(tree, comparator, new AbstractBTreeMap.Entry<>(key, null)); + //noinspection unchecked + return (Map.Entry) BTree.lower(tree, asymmetricComparator, key); } @Override @@ -96,7 +97,8 @@ public K lowerKey(K key) @Override public Map.Entry floorEntry(K key) { - return BTree.floor(tree, comparator, new AbstractBTreeMap.Entry<>(key, null)); + //noinspection unchecked + return (Map.Entry) BTree.floor(tree, asymmetricComparator, key); } @Override @@ -109,7 +111,8 @@ public K floorKey(K key) @Override public Map.Entry ceilingEntry(K key) { - return BTree.ceil(tree, comparator, new AbstractBTreeMap.Entry<>(key, null)); + //noinspection unchecked + return (Map.Entry) BTree.ceil(tree, asymmetricComparator, key); } @Override @@ -122,7 +125,8 @@ public K ceilingKey(K key) @Override public Map.Entry higherEntry(K key) { - return BTree.higher(tree, comparator, new AbstractBTreeMap.Entry<>(key, null)); + //noinspection unchecked + return (Map.Entry) BTree.higher(tree, asymmetricComparator, key); } @Override @@ -151,8 +155,9 @@ public Map.Entry lastEntry() @Override public NavigableMap descendingMap() { + Comparator reversed = comparator.keyComparator.reversed(); return new BTreeMap<>(BTree.build(BulkIterator.of(BTree.iterable(tree, BTree.Dir.DESC).iterator()), BTree.size(tree), UpdateFunction.noOp), - new KeyComparator<>(comparator.keyComparator.reversed())); + new KeyComparator<>(reversed), new AsymmetricKeyComparator<>(reversed)); } @Override diff --git a/src/java/org/apache/cassandra/utils/btree/BTreeSet.java b/src/java/org/apache/cassandra/utils/btree/BTreeSet.java index b9d9ed71729b..20ee7cf0440b 100644 --- a/src/java/org/apache/cassandra/utils/btree/BTreeSet.java +++ b/src/java/org/apache/cassandra/utils/btree/BTreeSet.java @@ -38,7 +38,6 @@ import static org.apache.cassandra.utils.btree.BTree.findIndex; - public class BTreeSet extends AbstractSet implements NavigableSet, List { protected final Comparator comparator; @@ -239,6 +238,8 @@ public boolean containsAll(Collection c) return false; return true; } + + @Override public int hashCode() { // we can't just delegate to Arrays.deepHashCode(), diff --git a/src/java/org/apache/cassandra/utils/concurrent/AsyncFuture.java b/src/java/org/apache/cassandra/utils/concurrent/AsyncFuture.java index 89cea2dbe3c4..3e5052e2dc8a 100644 --- a/src/java/org/apache/cassandra/utils/concurrent/AsyncFuture.java +++ b/src/java/org/apache/cassandra/utils/concurrent/AsyncFuture.java @@ -19,14 +19,18 @@ package org.apache.cassandra.utils.concurrent; import java.util.concurrent.Executor; -import java.util.concurrent.atomic.AtomicReferenceFieldUpdater; +import java.util.concurrent.locks.LockSupport; import java.util.function.Function; import javax.annotation.Nullable; import com.google.common.util.concurrent.AsyncFunction; import com.google.common.util.concurrent.ListenableFuture; // checkstyle: permit this import +import accord.utils.Invariants; import io.netty.util.concurrent.GenericFutureListener; +import org.apache.cassandra.utils.concurrent.ListenerList.Waiting; + +import static org.apache.cassandra.utils.Clock.Global.nanoTime; /** * Our default {@link Future} implementation, with all state being managed without locks (except those used by the JVM). @@ -45,11 +49,6 @@ */ public class AsyncFuture extends AbstractFuture { - @SuppressWarnings({ "rawtypes" }) - private static final AtomicReferenceFieldUpdater waitingUpdater = AtomicReferenceFieldUpdater.newUpdater(AsyncFuture.class, WaitQueue.class, "waiting"); - @SuppressWarnings({ "unused" }) - private volatile WaitQueue waiting; - public AsyncFuture() { super(); @@ -100,10 +99,7 @@ boolean trySet(Object v) if (resultUpdater.compareAndSet(this, current, v)) { if (v != UNCANCELLABLE) - { ListenerList.notify(listenersUpdater, this); - AsyncAwaitable.signalAll(waitingUpdater, this); - } return true; } } @@ -122,6 +118,17 @@ void appendListener(ListenerList newListener) ListenerList.notify(listenersUpdater, this); } + /** + * Logically append {@code newListener} to {@link #listeners} + * (at this stage it is a stack, so we actually prepend) + * + * @param newListener must be either a {@link ListenerList} or {@link GenericFutureListener} + */ + boolean appendListenerIfNotNotifying(ListenerList newListener) + { + return ListenerList.pushIfNotNotifying(listenersUpdater, this, newListener); + } + /** * Support {@link com.google.common.util.concurrent.Futures#transform} natively * @@ -161,14 +168,64 @@ public Future andThenAsync(Function> andTh @Override public AsyncFuture await() throws InterruptedException { - //noinspection unchecked - return AsyncAwaitable.await(waitingUpdater, Future::isDone, this); + if (isDone()) + return this; + + Waiting waiting = new Waiting<>(); + if (!appendListenerIfNotNotifying(waiting)) + { + Invariants.require(isDone()); + return this; + } + + while (true) + { + if (isDone()) + return this; + + LockSupport.park(); + + if (Thread.interrupted()) + { + waiting.cancel(); + throw new InterruptedException(); + } + } } @Override public boolean awaitUntil(long nanoTimeDeadline) throws InterruptedException { - return AsyncAwaitable.awaitUntil(waitingUpdater, Future::isDone, this, nanoTimeDeadline); + if (isDone()) + return true; + + Waiting waiting = new Waiting<>(); + if (!appendListenerIfNotNotifying(waiting)) + { + Invariants.require(isDone()); + return true; + } + + while (true) + { + if (isDone()) + return true; + + long wait = nanoTimeDeadline - nanoTime(); + if (wait <= 0) + { + waiting.cancel(); + return false; + } + + LockSupport.parkNanos(wait); + + if (Thread.interrupted()) + { + waiting.cancel(); + throw new InterruptedException(); + } + } } } diff --git a/src/java/org/apache/cassandra/utils/concurrent/ConcurrentLinkedStack.java b/src/java/org/apache/cassandra/utils/concurrent/ConcurrentLinkedStack.java new file mode 100644 index 000000000000..bebe1c7434de --- /dev/null +++ b/src/java/org/apache/cassandra/utils/concurrent/ConcurrentLinkedStack.java @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.utils.concurrent; + +import java.util.concurrent.atomic.AtomicReferenceFieldUpdater; +import java.util.function.BiConsumer; +import java.util.function.Consumer; + +public class ConcurrentLinkedStack +{ + static final class Node extends IntrusiveStack> + { + final T value; + Node(T value) + { + this.value = value; + } + } + + private volatile Node head; + private static final AtomicReferenceFieldUpdater headUpdater = AtomicReferenceFieldUpdater.newUpdater(ConcurrentLinkedStack.class, Node.class, "head"); + + public void push(T value) + { + IntrusiveStack.getAndPush(headUpdater, this, (Node)new Node<>(value)); + } + + public boolean isEmpty() + { + return head == null; + } + + public void drain(Consumer forEach, boolean reverse) + { + if (isEmpty()) + return; + + Node head = headUpdater.getAndSet(this, null); + if (reverse) head = IntrusiveStack.reverse(head); + IntrusiveStack.forEach(head, n -> n.value, forEach); + } + + public

    void drain(BiConsumer forEach, P param, boolean reverse) + { + if (isEmpty()) + return; + + Node head = headUpdater.getAndSet(this, null); + if (reverse) head = IntrusiveStack.reverse(head); + IntrusiveStack.forEach(head, n -> n.value, forEach, param); + } +} diff --git a/src/java/org/apache/cassandra/utils/concurrent/FutureCombiner.java b/src/java/org/apache/cassandra/utils/concurrent/FutureCombiner.java index b01ce0b59469..709e5da60811 100644 --- a/src/java/org/apache/cassandra/utils/concurrent/FutureCombiner.java +++ b/src/java/org/apache/cassandra/utils/concurrent/FutureCombiner.java @@ -20,6 +20,7 @@ import io.netty.util.concurrent.GenericFutureListener; import io.netty.util.concurrent.GlobalEventExecutor; +import java.util.Arrays; import java.util.Collection; import java.util.Collections; import java.util.List; @@ -56,6 +57,7 @@ private interface ListenerFactory */ private static class Listener extends AtomicInteger implements GenericFutureListener> { + private static final long serialVersionUID = 0; // for simulator support Supplier onSuccess; // non-final so we can release resources immediately when failing fast final FutureCombiner complete; @@ -241,6 +243,11 @@ public static Future> allOf(Collection(futures, () -> futures.stream().map(f -> f.getNow()).collect(Collectors.toList()), FailFastListener::new); } + public static Future> allOf(io.netty.util.concurrent.Future... futures) + { + return allOf(Arrays.asList(futures)); + } + /** * Waits for all futures to complete, returning a list containing values of all successful input futures. This * emulates Guava's Futures::successfulAsList in that results will be in the same order as inputs and any diff --git a/src/java/org/apache/cassandra/utils/concurrent/IntrusiveStack.java b/src/java/org/apache/cassandra/utils/concurrent/IntrusiveStack.java index e61d56545fb0..d27ed598f090 100644 --- a/src/java/org/apache/cassandra/utils/concurrent/IntrusiveStack.java +++ b/src/java/org/apache/cassandra/utils/concurrent/IntrusiveStack.java @@ -20,6 +20,7 @@ import java.util.Iterator; import java.util.concurrent.atomic.AtomicReferenceFieldUpdater; +import java.util.function.BiConsumer; import java.util.function.BiFunction; import java.util.function.Consumer; import java.util.function.Function; @@ -64,39 +65,51 @@ public T next() T next; @Inline - protected static > T push(AtomicReferenceFieldUpdater headUpdater, O owner, T prepend) + protected static > T getAndPush(AtomicReferenceFieldUpdater headUpdater, O owner, T prepend) { - return push(headUpdater, owner, prepend, (prev, next) -> { + return getAndPush(headUpdater, owner, prepend, (prev, next) -> { next.next = prev; return next; }); } - protected static > T push(AtomicReferenceFieldUpdater headUpdater, O owner, T prepend, BiFunction combine) + protected static > T getAndPush(AtomicReferenceFieldUpdater headUpdater, O owner, T prepend, BiFunction combine) { while (true) { T head = headUpdater.get(owner); - if (headUpdater.compareAndSet(owner, head, combine.apply(head, prepend))) + T newHead = combine.apply(head, prepend); + if (headUpdater.compareAndSet(owner, head, newHead)) return head; } } + protected static > T pushAndGet(AtomicReferenceFieldUpdater headUpdater, O owner, T prepend, BiFunction combine) + { + while (true) + { + T head = headUpdater.get(owner); + T newHead = combine.apply(head, prepend); + if (head == newHead || headUpdater.compareAndSet(owner, head, newHead)) + return newHead; + } + } + protected interface Setter { public boolean compareAndSet(O owner, T expect, T update); } @Inline - protected static > T push(Function getter, Setter setter, O owner, T prepend) + protected static > T getAndPush(Function getter, Setter setter, O owner, T prepend) { - return push(getter, setter, owner, prepend, (prev, next) -> { + return getAndPush(getter, setter, owner, prepend, (prev, next) -> { next.next = prev; return next; }); } - protected static > T push(Function getter, Setter setter, O owner, T prepend, BiFunction combine) + protected static > T getAndPush(Function getter, Setter setter, O owner, T prepend, BiFunction combine) { while (true) { @@ -189,10 +202,25 @@ public void forEach(Consumer forEach) } protected static > void forEach(T list, Consumer forEach) + { + forEach(list, Function.identity(), forEach); + } + + protected static , P> void forEach(T list, BiConsumer forEach, P param) + { + forEach(list, Function.identity(), forEach, param); + } + + protected static , V> void forEach(T list, Function getter, Consumer forEach) + { + forEach(list, getter, Consumer::accept, forEach); + } + + protected static , V> void forEach(T list, Function getter, BiConsumer forEach, P param) { while (list != null) { - forEach.accept(list); + forEach.accept(param, getter.apply(list)); list = list.next; } } diff --git a/src/java/org/apache/cassandra/utils/concurrent/ListenerList.java b/src/java/org/apache/cassandra/utils/concurrent/ListenerList.java index 40b908b4e10e..1ba3f83d50a0 100644 --- a/src/java/org/apache/cassandra/utils/concurrent/ListenerList.java +++ b/src/java/org/apache/cassandra/utils/concurrent/ListenerList.java @@ -20,6 +20,7 @@ import java.util.concurrent.Executor; import java.util.concurrent.atomic.AtomicReferenceFieldUpdater; +import java.util.concurrent.locks.LockSupport; import java.util.function.BiConsumer; import java.util.function.Consumer; import javax.annotation.Nullable; @@ -50,13 +51,22 @@ static ListenerList pushHead(ListenerList prev, ListenerList next) { Notifying result = new Notifying(); result.next = next; - next.next = prev == NOTIFYING ? null : prev; + next.next = prev.next; return result; } next.next = prev; return next; } + static ListenerList pushHeadIfNotNotifying(ListenerList prev, ListenerList next) + { + if (prev instanceof Notifying) + return prev; + + next.next = prev; + return next; + } + /** * Logically append {@code newListener} to {@link #listeners} * (at this stage it is a stack, so we actually prepend) @@ -66,7 +76,12 @@ static ListenerList pushHead(ListenerList prev, ListenerList next) @Inline static void push(AtomicReferenceFieldUpdater updater, T in, ListenerList newListener) { - IntrusiveStack.push(updater, in, newListener, ListenerList::pushHead); + IntrusiveStack.getAndPush(updater, in, newListener, ListenerList::pushHead); + } + + static boolean pushIfNotNotifying(AtomicReferenceFieldUpdater updater, T in, ListenerList newListener) + { + return newListener == IntrusiveStack.pushAndGet(updater, in, newListener, ListenerList::pushHeadIfNotNotifying); } /** @@ -91,13 +106,22 @@ static > void notify(AtomicReferenceFieldUpdater future) static class Notifying extends ListenerList { static final Notifying NOTIFYING = new Notifying(); + static final Notifying DONE = new Notifying(); @Override void notifySelf(Executor notifyExecutor, Future future) @@ -361,6 +386,27 @@ void notifySelf(Executor notifyExecutor, Future future) } } + static class Waiting extends ListenerList + { + volatile Thread waiting = Thread.currentThread(); + + @Override + void notifySelf(Executor notifyExecutor, Future future) + { + Thread thread = waiting; + if (thread != null) + { + LockSupport.unpark(thread); + waiting = null; + } + } + + void cancel() + { + waiting = null; + } + } + /** * @return true iff the invoking thread is executing {@code executor} */ diff --git a/src/java/org/apache/cassandra/utils/concurrent/LockWithAsyncSignal.java b/src/java/org/apache/cassandra/utils/concurrent/LockWithAsyncSignal.java new file mode 100644 index 000000000000..2f358d93856f --- /dev/null +++ b/src/java/org/apache/cassandra/utils/concurrent/LockWithAsyncSignal.java @@ -0,0 +1,268 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.utils.concurrent; + +import java.util.concurrent.ConcurrentSkipListSet; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicIntegerFieldUpdater; +import java.util.concurrent.atomic.AtomicLongFieldUpdater; +import java.util.concurrent.atomic.AtomicReferenceFieldUpdater; +import java.util.concurrent.locks.Condition; +import java.util.concurrent.locks.Lock; +import java.util.concurrent.locks.LockSupport; + +import accord.utils.Invariants; + +// WARNING: experimental - needs more testing +public class LockWithAsyncSignal implements Lock +{ + interface AwaitFunction + { + T await(LockWithAsyncSignal lock, Waiter waiter) throws T; + } + + private static final Waiter AWAITING_LOCK = new Waiter(0, null); + + private volatile Thread owner; + private static final AtomicReferenceFieldUpdater ownerUpdater = AtomicReferenceFieldUpdater.newUpdater(LockWithAsyncSignal.class, Thread.class, "owner"); + private int depth; + + // TODO (desired): better combined queue + final ConcurrentSkipListSet waiters = new ConcurrentSkipListSet<>(); + + static class Waiter implements Comparable + { + final long ticket; + final Thread thread; + + Waiter(long ticket, Thread thread) + { + this.ticket = ticket; + this.thread = thread; + } + + @Override + public int compareTo(Waiter that) + { + return Long.compare(this.ticket, that.ticket); + } + } + + volatile int signal; + private static final AtomicIntegerFieldUpdater signalUpdater = AtomicIntegerFieldUpdater.newUpdater(LockWithAsyncSignal.class, "signal"); + + volatile long ticket; + private static final AtomicLongFieldUpdater ticketUpdater = AtomicLongFieldUpdater.newUpdater(LockWithAsyncSignal.class, "ticket"); + + public void lock() + { + lockInternal(LockWithAsyncSignal::awaitUninterruptibly); + } + + public void lockInterruptibly() throws InterruptedException + { + lockInternal(LockWithAsyncSignal::awaitThrows); + } + + private void lockInternal(AwaitFunction await) throws T + { + Thread thread = Thread.currentThread(); + if (ownerUpdater.compareAndSet(this, null, thread) || owner == thread) + { + ++depth; + } + else + { + awaitLock(false, thread, 1, await); + } + } + + public boolean tryLock() + { + Thread thread = Thread.currentThread(); + if (!ownerUpdater.compareAndSet(this, null, thread) && owner != thread) + return false; + + ++depth; + return true; + } + + public void await() throws InterruptedException + { + await(LockWithAsyncSignal::awaitDeferThrow); + } + + public void awaitUninterruptibly() + { + await(LockWithAsyncSignal::awaitUninterruptibly); + } + + private void await(AwaitFunction await) throws T + { + Thread thread = Thread.currentThread(); + int restoreDepth = depth; + Invariants.require(owner == thread); + + depth = 0; + owner = null; + + awaitLock(true, thread, restoreDepth, await); + } + + public void unlock() + { + Invariants.require(owner == Thread.currentThread()); + if (--depth > 0) + return; + + owner = null; + wakeOne(); + } + + private void awaitLock(boolean awaitingSignal, Thread thread, int restoreDepth, AwaitFunction await) throws T + { + T pending = null; + while (true) + { + Waiter waiter = register(awaitingSignal, thread); + if (awaitingSignal && signal == 0) + { + if (owner == null) + wakeOne(false); // will not wake ourselves as we only signal pure lock waiters + } + else if (ownerUpdater.compareAndSet(this, null, thread)) + { + depth = restoreDepth; + waiters.remove(waiter); + if (pending != null) + throw pending; + return; + } + pending = firstNonNull(pending, await.await(this, waiter)); + awaitingSignal &= pending == null; + } + } + + private static T firstNonNull(T cur, T next) + { + return cur != null ? cur : next; + } + + public void signal() + { + if (signalUpdater.compareAndSet(this, 0, 1) && owner == null) + wakeOne(true); + } + + public void clearSignal() + { + signal = 0; + } + + public boolean isOwner(Thread thread) + { + return thread == owner; + } + + private Waiter register(boolean awaitingSignal, Thread thread) + { + long ticket = ticketUpdater.updateAndGet(this, v -> v == Long.MAX_VALUE ? 1 : v + 1); + if (awaitingSignal) + ticket = -ticket; + Waiter waiter = new Waiter(ticket, thread); + waiters.add(waiter); + return waiter; + } + + private InterruptedException awaitDeferThrow(Waiter waiter) + { + while (waiters.contains(waiter)) + { + if (Thread.interrupted()) + { + waiters.remove(waiter); + return new InterruptedException(); + } + LockSupport.park(); + } + return null; + } + + private InterruptedException awaitThrows(Waiter waiter) throws InterruptedException + { + while (waiters.contains(waiter)) + { + if (Thread.interrupted()) + { + if (!waiters.remove(waiter)) + wakeOne(waiter.ticket < 0 || signal > 0); + + throw new InterruptedException(); + } + LockSupport.park(); + } + return null; + } + + private RuntimeException awaitUninterruptibly(Waiter waiter) + { + while (waiters.contains(waiter)) + LockSupport.park(); + return null; + } + + private void wakeOne() + { + wakeOne(signal > 0); + } + + private void wakeOne(boolean awaitingSignal) + { + Waiter wake; + if (awaitingSignal) + { + wake = waiters.pollFirst(); + if (wake == null) + return; + } + else + { + do + { + wake = waiters.ceiling(AWAITING_LOCK); + if (wake == null) + return; + } while (!waiters.remove(wake)); + } + + LockSupport.unpark(wake.thread); + } + + @Override + public boolean tryLock(long time, TimeUnit unit) throws InterruptedException + { + throw new UnsupportedOperationException(); + } + + @Override + public Condition newCondition() + { + throw new UnsupportedOperationException(); + } +} diff --git a/src/java/org/apache/cassandra/utils/concurrent/Ref.java b/src/java/org/apache/cassandra/utils/concurrent/Ref.java index e268f5fd73c2..6719948e3343 100644 --- a/src/java/org/apache/cassandra/utils/concurrent/Ref.java +++ b/src/java/org/apache/cassandra/utils/concurrent/Ref.java @@ -61,6 +61,7 @@ import static org.apache.cassandra.concurrent.ExecutorFactory.Global.executorFactory; import static org.apache.cassandra.concurrent.InfiniteLoopExecutor.SimulatorSafe.UNSAFE; import static org.apache.cassandra.config.CassandraRelevantProperties.TEST_DEBUG_REF_COUNT; +import static org.apache.cassandra.config.CassandraRelevantProperties.TEST_DEBUG_REF_EVENTS; import static org.apache.cassandra.utils.Shared.Scope.SIMULATION; import static org.apache.cassandra.utils.Throwables.maybeFail; import static org.apache.cassandra.utils.Throwables.merge; @@ -99,7 +100,8 @@ public final class Ref implements RefCounted { static final Logger logger = LoggerFactory.getLogger(Ref.class); - public static final boolean DEBUG_ENABLED = TEST_DEBUG_REF_COUNT.getBoolean(); + public static final boolean TRACE_ENABLED = TEST_DEBUG_REF_COUNT.getBoolean(); + public static final boolean DEBUG_EVENTS_ENABLED = TEST_DEBUG_REF_EVENTS.getBoolean(); static OnLeak ON_LEAK; @Shared(scope = SIMULATION) @@ -154,6 +156,11 @@ public T get() return referent; } + public Tidy tidier() + { + return state.globalState.tidy; + } + public Ref tryRef() { return state.globalState.ref() ? new Ref<>(referent, state.globalState) : null; @@ -170,10 +177,10 @@ public Ref ref() public String printDebugInfo() { - if (DEBUG_ENABLED) + if (TRACE_ENABLED) { - state.debug.log(state.toString()); - return "Memory was freed by " + state.debug.deallocateThread; + ((Debug)state.debug).log(state.toString()); + return "Memory was freed by " + ((Debug)state.debug).deallocateThread; } return "Memory was freed"; } @@ -191,7 +198,7 @@ public int globalCount() // ensures it is only released once, and that it is always released static final class State extends PhantomReference { - final Debug debug = DEBUG_ENABLED ? new Debug() : null; + final Object debug = TRACE_ENABLED ? new Debug() : DEBUG_EVENTS_ENABLED ? new ArrayList<>() : null; final GlobalState globalState; private volatile int released; @@ -206,8 +213,8 @@ static final class State extends PhantomReference void assertNotReleased() { - if (DEBUG_ENABLED && released == 1) - debug.log(toString()); + if (TRACE_ENABLED && released == 1) + ((Debug)debug).log(toString()); assert released == 0; } @@ -216,8 +223,8 @@ Throwable ensureReleased(Throwable accumulate) if (releasedUpdater.getAndSet(this, 1) == 0) { accumulate = globalState.release(this, accumulate); - if (DEBUG_ENABLED) - debug.deallocate(); + if (TRACE_ENABLED) + ((Debug)debug).deallocate(); } return accumulate; } @@ -230,8 +237,8 @@ void release(boolean leak) { String id = this.toString(); logger.error("BAD RELEASE: attempted to release a reference ({}) that has already been released", id); - if (DEBUG_ENABLED) - debug.log(id); + if (TRACE_ENABLED) + ((Debug)debug).log(id); throw new IllegalStateException("Attempted to release a reference that has already been released"); } return; @@ -240,16 +247,16 @@ void release(boolean leak) if (leak) { String id = this.toString(); - logger.error("LEAK DETECTED: a reference ({}) to {} was not released before the reference was garbage collected", id, globalState); - if (DEBUG_ENABLED) - debug.log(id); + logger.error("LEAK DETECTED: a reference ({}) to {} was not released before the reference was garbage collected{}", id, globalState, (DEBUG_EVENTS_ENABLED ? "(debug: " + debug + ')' : "")); + if (TRACE_ENABLED) + ((Debug)debug).log(id); OnLeak onLeak = ON_LEAK; if (onLeak != null) onLeak.onLeak(this); } - else if (DEBUG_ENABLED) + else if (TRACE_ENABLED) { - debug.deallocate(); + ((Debug)debug).deallocate(); } if (fail != null) logger.error("Error when closing {}", globalState, fail); @@ -299,6 +306,12 @@ String print(String thread, StackTraceElement[] trace) } } + public void debug(String event) + { + if (DEBUG_EVENTS_ENABLED) + ((List)state.debug).add(event); + } + // the object that manages the actual cleaning up; this does not reference the target object // so that we can detect when references are lost to the resource itself, and still cleanup afterwards // the Tidy object MUST NOT contain any references to the object we are managing @@ -383,10 +396,10 @@ public String toString() private static final Set globallyExtant = Collections.newSetFromMap(new ConcurrentHashMap<>()); static final ReferenceQueue referenceQueue = new ReferenceQueue<>(); private static final Shutdownable EXEC = executorFactory().infiniteLoop("Reference-Reaper", Ref::reapOneReference, UNSAFE); - static final ScheduledExecutorService STRONG_LEAK_DETECTOR = !DEBUG_ENABLED ? null : executorFactory().scheduled("Strong-Reference-Leak-Detector"); + static final ScheduledExecutorService STRONG_LEAK_DETECTOR = !TRACE_ENABLED ? null : executorFactory().scheduled("Strong-Reference-Leak-Detector"); static { - if (DEBUG_ENABLED) + if (TRACE_ENABLED) { STRONG_LEAK_DETECTOR.scheduleAtFixedRate(new Visitor(), 1, 15, TimeUnit.MINUTES); STRONG_LEAK_DETECTOR.scheduleAtFixedRate(new StrongLeakDetector(), 2, 15, TimeUnit.MINUTES); @@ -593,6 +606,8 @@ void traverse(final RefCounted.Tidy rootObject) InProgressVisit inProgress = null; while (inProgress != null || !path.isEmpty()) { + if (Thread.currentThread().isInterrupted()) + throw new UncheckedInterruptedException(new InterruptedException()); //If necessary fetch the next object to start tracing if (inProgress == null) inProgress = path.pollLast(); diff --git a/src/java/org/apache/cassandra/utils/concurrent/Semaphore.java b/src/java/org/apache/cassandra/utils/concurrent/Semaphore.java index 01c52c5d9343..a0ac316f2906 100644 --- a/src/java/org/apache/cassandra/utils/concurrent/Semaphore.java +++ b/src/java/org/apache/cassandra/utils/concurrent/Semaphore.java @@ -23,6 +23,7 @@ import org.apache.cassandra.utils.Intercept; import org.apache.cassandra.utils.Shared; +import static org.apache.cassandra.utils.Clock.Global.nanoTime; import static org.apache.cassandra.utils.Shared.Scope.SIMULATION; @Shared(scope = SIMULATION) @@ -99,6 +100,7 @@ public static Semaphore newFairSemaphore(int permits) public static class Standard extends java.util.concurrent.Semaphore implements Semaphore { + private static final long serialVersionUID = 0; // for simulator support public Standard(int permits) { this(permits, false); @@ -138,7 +140,7 @@ public int waiting() */ public boolean tryAcquireUntil(int acquire, long nanoTimeDeadline) throws InterruptedException { - long wait = nanoTimeDeadline - System.nanoTime(); + long wait = nanoTimeDeadline - nanoTime(); return tryAcquire(acquire, Math.max(0, wait), TimeUnit.NANOSECONDS); } diff --git a/src/java/org/apache/cassandra/utils/jmx/AbstractJmxSocketFactory.java b/src/java/org/apache/cassandra/utils/jmx/AbstractJmxSocketFactory.java index 3990c64ee7d4..ef68f524861e 100644 --- a/src/java/org/apache/cassandra/utils/jmx/AbstractJmxSocketFactory.java +++ b/src/java/org/apache/cassandra/utils/jmx/AbstractJmxSocketFactory.java @@ -82,7 +82,7 @@ public Map configure(InetAddress serverAddress, JMXServerOptions.setJmxSystemProperties(jmxEncryptionOptions.getAcceptedProtocols(), jmxEncryptionOptions.getCipherSuites()); logger.info("Enabling JMX SSL using jmx_encryption_options"); - boolean requireClientAuth = jmxEncryptionOptions.getClientAuth() == EncryptionOptions.ClientAuth.REQUIRED; + boolean requireClientAuth = jmxEncryptionOptions.getClientAuth() == EncryptionOptions.ClientEncryptionOptions.ClientAuth.REQUIRED; String[] ciphers = jmxEncryptionOptions.cipherSuitesArray(); String[] protocols = jmxEncryptionOptions.acceptedProtocolsArray(); SSLContext sslContext = jmxEncryptionOptions.sslContextFactoryInstance.createJSSESslContext(jmxEncryptionOptions.getClientAuth()); diff --git a/src/java/org/apache/cassandra/utils/logging/AbstractVirtualTableAppender.java b/src/java/org/apache/cassandra/utils/logging/AbstractVirtualTableAppender.java new file mode 100644 index 000000000000..7becbc13fcd5 --- /dev/null +++ b/src/java/org/apache/cassandra/utils/logging/AbstractVirtualTableAppender.java @@ -0,0 +1,144 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.utils.logging; + +import java.util.LinkedList; +import java.util.List; +import java.util.Optional; + +import ch.qos.logback.classic.spi.LoggingEvent; +import ch.qos.logback.core.AppenderBase; +import org.apache.cassandra.db.virtual.AbstractLoggerVirtualTable; +import org.apache.cassandra.db.virtual.SlowQueriesTable; +import org.apache.cassandra.db.virtual.VirtualKeyspace; +import org.apache.cassandra.db.virtual.VirtualKeyspaceRegistry; +import org.apache.cassandra.db.virtual.VirtualTable; + +import static org.apache.cassandra.schema.SchemaConstants.VIRTUAL_VIEWS; + +public abstract class AbstractVirtualTableAppender extends AppenderBase +{ + private final int defaultRows; + + protected AbstractVirtualTableAppender(int defaultRows) + { + this.defaultRows = defaultRows; + } + + // for holding messages until virtual registry contains logs virtual table + // as it takes some time during startup of a node to initialise virtual tables but messages are + // logged already + protected final List messageBuffer = new LinkedList<>(); + + protected T getVirtualTable(Class vtableClass, String tableName) + { + VirtualKeyspace keyspace = VirtualKeyspaceRegistry.instance.getKeyspaceNullable(VIRTUAL_VIEWS); + + if (keyspace == null) + return null; + + Optional virtualTable = keyspace.tables() + .stream() + .filter(vt -> vt.name().equals(tableName)) + .findFirst(); + + if (virtualTable.isEmpty()) + return null; + + VirtualTable vt = virtualTable.get(); + + if (!vt.getClass().equals(vtableClass)) + throw new IllegalStateException(String.format("Virtual table %s.%s is not backed by an instance of %s but by %s", + VIRTUAL_VIEWS, + tableName, + vtableClass.getName(), + vt.getClass().getName())); + + return (T) vt; + } + + /** + * This method adds an event to virtual table, when present. + * When vtable is null, we will attempt to find it among registered ones. Then not found, we add it to internal + * buffer for later processing. This might happen e.g. for logging tables when log events + * were appended via logging framework sooner than registration of virtual tables was done so after they are registered, + * they would miss logging events happened before being so. + * + * @param vtable vtable to append to + * @param event event to append to + * @param tableName table name of virtual table to append to + * @return vtable or when null, found vtable + */ + protected AbstractLoggerVirtualTable appendToVirtualTable(AbstractLoggerVirtualTable vtable, LoggingEvent event, String tableName) + { + AbstractLoggerVirtualTable foundVtable; + if (vtable == null) + { + foundVtable = getVirtualTable(SlowQueriesTable.class, tableName); + if (foundVtable == null) + addToBuffer(event); + else + foundVtable.add(event); + } + else + { + foundVtable = vtable; + vtable.add(event); + } + + return foundVtable; + } + + @Override + public void stop() + { + synchronized (messageBuffer) + { + messageBuffer.clear(); + super.stop(); + } + } + + /** + * Flushes all log entries which were appended before virtual table was registered. + * + * @see org.apache.cassandra.service.CassandraDaemon#setupVirtualKeyspaces + */ + public void flushBuffer(Class> vtableClass, String tableName) + { + synchronized (messageBuffer) + { + Optional.ofNullable(getVirtualTable(vtableClass, tableName)).ifPresent(vtable -> { + messageBuffer.forEach(vtable::add); + messageBuffer.clear(); + }); + } + } + + protected void addToBuffer(LoggingEvent eventObject) + { + synchronized (messageBuffer) + { + // we restrict how many logging events we can put into buffer, + // so we are not growing without any bound when things go south + if (messageBuffer.size() < defaultRows) + messageBuffer.add(eventObject); + } + } +} diff --git a/src/java/org/apache/cassandra/utils/logging/ClassNameFilter.java b/src/java/org/apache/cassandra/utils/logging/ClassNameFilter.java new file mode 100644 index 000000000000..ef0ca2c5ca4b --- /dev/null +++ b/src/java/org/apache/cassandra/utils/logging/ClassNameFilter.java @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.utils.logging; + +import ch.qos.logback.classic.spi.ILoggingEvent; +import ch.qos.logback.core.filter.AbstractMatcherFilter; +import ch.qos.logback.core.spi.FilterReply; + +public class ClassNameFilter extends AbstractMatcherFilter +{ + String loggerName; + + public void setLoggerName(String loggerName) + { + this.loggerName = loggerName; + } + + @Override + public FilterReply decide(ILoggingEvent event) + { + if (!isStarted()) return FilterReply.NEUTRAL; + if (event.getLoggerName().equals(loggerName)) return onMatch; + return onMismatch; + } + + @Override + public void start() + { + if (loggerName != null) super.start(); + } +} diff --git a/src/java/org/apache/cassandra/utils/logging/LogbackLoggingSupport.java b/src/java/org/apache/cassandra/utils/logging/LogbackLoggingSupport.java index e710d44dd1dc..d8f83116bbcd 100644 --- a/src/java/org/apache/cassandra/utils/logging/LogbackLoggingSupport.java +++ b/src/java/org/apache/cassandra/utils/logging/LogbackLoggingSupport.java @@ -18,7 +18,6 @@ package org.apache.cassandra.utils.logging; -import java.lang.management.ManagementFactory; import java.security.AccessControlException; import java.util.ArrayList; import java.util.Iterator; @@ -26,25 +25,21 @@ import java.util.Map; import java.util.Optional; -import javax.management.JMX; -import javax.management.ObjectName; - -import org.apache.cassandra.security.ThreadAwareSecurityManager; +import com.google.common.collect.Maps; import org.apache.commons.lang3.StringUtils; import org.slf4j.LoggerFactory; -import com.google.common.collect.Maps; - import ch.qos.logback.classic.Level; import ch.qos.logback.classic.Logger; import ch.qos.logback.classic.LoggerContext; -import ch.qos.logback.classic.jmx.JMXConfiguratorMBean; import ch.qos.logback.classic.spi.ILoggingEvent; import ch.qos.logback.classic.spi.TurboFilterList; import ch.qos.logback.classic.turbo.ReconfigureOnChangeFilter; import ch.qos.logback.classic.turbo.TurboFilter; +import ch.qos.logback.classic.util.ContextInitializer; import ch.qos.logback.core.Appender; -import ch.qos.logback.core.hook.DelayingShutdownHook; +import ch.qos.logback.core.hook.DefaultShutdownHook; +import org.apache.cassandra.security.ThreadAwareSecurityManager; /** * Encapsulates all logback-specific implementations in a central place. @@ -60,7 +55,8 @@ public class LogbackLoggingSupport implements LoggingSupport @Override public void onStartup() { - checkOnlyOneVirtualTableAppender(); + checkOnlyOneVirtualTableAppender(VirtualTableAppender.class); + checkOnlyOneVirtualTableAppender(SlowQueriesAppender.class); // The default logback configuration in conf/logback.xml allows reloading the // configuration when the configuration file has changed (every 60 seconds by default). @@ -92,7 +88,7 @@ public void onStartup() @Override public void onShutdown() { - DelayingShutdownHook logbackHook = new DelayingShutdownHook(); + DefaultShutdownHook logbackHook = new DefaultShutdownHook(); logbackHook.setContext((LoggerContext) LoggerFactory.getILoggerFactory()); logbackHook.run(); } @@ -105,10 +101,9 @@ public void setLoggingLevel(String classQualifier, String rawLevel) throws Excep // if both classQualifier and rawLevel are empty, reload from configuration if (StringUtils.isBlank(classQualifier) && StringUtils.isBlank(rawLevel)) { - JMXConfiguratorMBean jmxConfiguratorMBean = JMX.newMBeanProxy(ManagementFactory.getPlatformMBeanServer(), - new ObjectName("ch.qos.logback.classic:Name=default,Type=ch.qos.logback.classic.jmx.JMXConfigurator"), - JMXConfiguratorMBean.class); - jmxConfiguratorMBean.reloadDefaultConfiguration(); + LoggerContext lc = (LoggerContext) LoggerFactory.getILoggerFactory(); + lc.reset(); + new ContextInitializer(lc).autoConfig(); return; } // classQualifier is set, but blank level given @@ -138,7 +133,20 @@ public Map getLoggingLevels() } @Override - public Optional> getAppender(Class appenderClass, String name) + public Optional getLogger(String loggerName) + { + LoggerContext lc = (LoggerContext) LoggerFactory.getILoggerFactory(); + for (Logger logBackLogger : lc.getLoggerList()) + { + if (logBackLogger.getName().equals(loggerName)) + return Optional.of(logBackLogger); + } + + return Optional.empty(); + } + + @Override + public > Optional getAppender(Class appenderClass, String appenderName) { LoggerContext lc = (LoggerContext) LoggerFactory.getILoggerFactory(); for (Logger logBackLogger : lc.getLoggerList()) @@ -146,15 +154,15 @@ public Optional> getAppender(Class appenderClass, String name) for (Iterator> iterator = logBackLogger.iteratorForAppenders(); iterator.hasNext();) { Appender appender = iterator.next(); - if (appender.getClass() == appenderClass && appender.getName().equals(name)) - return Optional.of(appender); + if (appender.getClass() == appenderClass && appender.getName().equals(appenderName)) + return Optional.of(appenderClass.cast(appender)); } } return Optional.empty(); } - private void checkOnlyOneVirtualTableAppender() + private void checkOnlyOneVirtualTableAppender(Class appenderClass) { int count = 0; LoggerContext lc = (LoggerContext) LoggerFactory.getILoggerFactory(); @@ -164,7 +172,7 @@ private void checkOnlyOneVirtualTableAppender() for (Iterator> iterator = logBackLogger.iteratorForAppenders(); iterator.hasNext();) { Appender appender = iterator.next(); - if (appender instanceof VirtualTableAppender) + if (appenderClass.isAssignableFrom(appender.getClass())) { virtualAppenderNames.add(appender.getName()); count += 1; @@ -174,7 +182,7 @@ private void checkOnlyOneVirtualTableAppender() if (count > 1) throw new IllegalStateException(String.format("There are multiple appenders of class %s of names %s. There is only one appender of such class allowed.", - VirtualTableAppender.class.getName(), String.join(",", virtualAppenderNames))); + appenderClass.getName(), String.join(",", virtualAppenderNames))); } private boolean hasAppenders(Logger logBackLogger) diff --git a/src/java/org/apache/cassandra/utils/logging/LoggingSupport.java b/src/java/org/apache/cassandra/utils/logging/LoggingSupport.java index 35e11975f922..00b40cb966de 100644 --- a/src/java/org/apache/cassandra/utils/logging/LoggingSupport.java +++ b/src/java/org/apache/cassandra/utils/logging/LoggingSupport.java @@ -21,6 +21,7 @@ import java.util.Map; import java.util.Optional; +import ch.qos.logback.classic.Logger; import ch.qos.logback.core.Appender; /** @@ -53,7 +54,12 @@ default void onShutdown() {} */ Map getLoggingLevels(); - default Optional> getAppender(Class appenderClass, String appenderName) + default > Optional getAppender(Class appenderClass, String appenderName) + { + return Optional.empty(); + } + + default Optional getLogger(String loggerName) { return Optional.empty(); } diff --git a/src/java/org/apache/cassandra/utils/logging/SlowQueriesAppender.java b/src/java/org/apache/cassandra/utils/logging/SlowQueriesAppender.java new file mode 100644 index 000000000000..4af2e383077b --- /dev/null +++ b/src/java/org/apache/cassandra/utils/logging/SlowQueriesAppender.java @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.utils.logging; + +import ch.qos.logback.classic.spi.LoggingEvent; +import org.apache.cassandra.db.virtual.AbstractLoggerVirtualTable; +import org.apache.cassandra.db.virtual.SlowQueriesTable; + +public final class SlowQueriesAppender extends AbstractVirtualTableAppender +{ + public static final String APPENDER_NAME = "SLOW_QUERIES_APPENDER"; + + private AbstractLoggerVirtualTable slowQueries; + + public SlowQueriesAppender() + { + super(SlowQueriesTable.LOGS_VIRTUAL_TABLE_DEFAULT_ROWS); + } + + @Override + protected void append(LoggingEvent eventObject) + { + // slowQueries will be null as long as virtual tables + // are not registered, and we already try to put queries there. + // As soon as vtable is registered (as part of node's startup / initialisation), + // slow queries will never be null again + slowQueries = appendToVirtualTable(slowQueries, eventObject, SlowQueriesTable.TABLE_NAME); + } +} diff --git a/src/java/org/apache/cassandra/utils/logging/VirtualTableAppender.java b/src/java/org/apache/cassandra/utils/logging/VirtualTableAppender.java index 2820b2936f4a..03a142004afd 100644 --- a/src/java/org/apache/cassandra/utils/logging/VirtualTableAppender.java +++ b/src/java/org/apache/cassandra/utils/logging/VirtualTableAppender.java @@ -18,111 +18,35 @@ package org.apache.cassandra.utils.logging; -import java.util.LinkedList; -import java.util.List; -import java.util.Optional; import java.util.Set; import com.google.common.collect.ImmutableSet; import ch.qos.logback.classic.spi.LoggingEvent; -import ch.qos.logback.core.AppenderBase; import org.apache.cassandra.audit.FileAuditLogger; +import org.apache.cassandra.db.virtual.AbstractLoggerVirtualTable; import org.apache.cassandra.db.virtual.LogMessagesTable; -import org.apache.cassandra.db.virtual.VirtualKeyspace; -import org.apache.cassandra.db.virtual.VirtualKeyspaceRegistry; -import org.apache.cassandra.db.virtual.VirtualTable; - -import static org.apache.cassandra.db.virtual.LogMessagesTable.LOGS_VIRTUAL_TABLE_DEFAULT_ROWS; -import static org.apache.cassandra.db.virtual.LogMessagesTable.TABLE_NAME; -import static org.apache.cassandra.schema.SchemaConstants.VIRTUAL_VIEWS; /** * Appends Cassandra logs to virtual table system_views.system_logs */ -public final class VirtualTableAppender extends AppenderBase +public final class VirtualTableAppender extends AbstractVirtualTableAppender { public static final String APPENDER_NAME = "CQLLOG"; private static final Set forbiddenLoggers = ImmutableSet.of(FileAuditLogger.class.getName()); - private LogMessagesTable logs; - - // for holding messages until virtual registry contains logs virtual table - // as it takes some time during startup of a node to initialise virtual tables but messages are - // logged already - private final List messageBuffer = new LinkedList<>(); + private AbstractLoggerVirtualTable logs; - @Override - protected void append(LoggingEvent eventObject) + public VirtualTableAppender() { - if (!forbiddenLoggers.contains(eventObject.getLoggerName())) - { - if (logs == null) - { - logs = getVirtualTable(); - if (logs == null) - addToBuffer(eventObject); - else - logs.add(eventObject); - } - else - logs.add(eventObject); - } + super(LogMessagesTable.LOGS_VIRTUAL_TABLE_DEFAULT_ROWS); } @Override - public void stop() - { - messageBuffer.clear(); - super.stop(); - } - - /** - * Flushes all logs which were appended before virtual table was registered. - * - * @see org.apache.cassandra.service.CassandraDaemon#setupVirtualKeyspaces - */ - public void flushBuffer() - { - Optional.ofNullable(getVirtualTable()).ifPresent(vtable -> { - messageBuffer.forEach(vtable::add); - messageBuffer.clear(); - }); - } - - private LogMessagesTable getVirtualTable() - { - VirtualKeyspace keyspace = VirtualKeyspaceRegistry.instance.getKeyspaceNullable(VIRTUAL_VIEWS); - - if (keyspace == null) - return null; - - Optional logsTable = keyspace.tables() - .stream() - .filter(vt -> vt.name().equals(TABLE_NAME)) - .findFirst(); - - if (!logsTable.isPresent()) - return null; - - VirtualTable vt = logsTable.get(); - - if (!(vt instanceof LogMessagesTable)) - throw new IllegalStateException(String.format("Virtual table %s.%s is not backed by an instance of %s but by %s", - VIRTUAL_VIEWS, - TABLE_NAME, - LogMessagesTable.class.getName(), - vt.getClass().getName())); - - return (LogMessagesTable) vt; - } - - private void addToBuffer(LoggingEvent eventObject) + protected void append(LoggingEvent eventObject) { - // we restrict how many logging events we can put into buffer, - // so we are not growing without any bound when things go south - if (messageBuffer.size() < LOGS_VIRTUAL_TABLE_DEFAULT_ROWS) - messageBuffer.add(eventObject); + if (!forbiddenLoggers.contains(eventObject.getLoggerName())) + logs = appendToVirtualTable(logs, eventObject, LogMessagesTable.TABLE_NAME); } } diff --git a/src/java/org/apache/cassandra/utils/memory/BigEndianMemoryUtil.java b/src/java/org/apache/cassandra/utils/memory/BigEndianMemoryUtil.java new file mode 100644 index 000000000000..7641dcb5b2da --- /dev/null +++ b/src/java/org/apache/cassandra/utils/memory/BigEndianMemoryUtil.java @@ -0,0 +1,146 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.utils.memory; + + +import java.nio.ByteBuffer; +import java.nio.ByteOrder; + +import com.google.common.annotations.VisibleForTesting; + +import org.apache.cassandra.utils.Architecture; + +public class BigEndianMemoryUtil extends MemoryUtil +{ + public static int getUnsignedShort(long address) + { + if (Architecture.IS_UNALIGNED || (address & 0b1) == 0L) + return (Architecture.BIG_ENDIAN ? unsafe.getShort(address) : Short.reverseBytes(unsafe.getShort(address))) & 0xffff; + else + return getShortByByte(address) & 0xffff; + } + + public static int getInt(long address) + { + if (Architecture.IS_UNALIGNED || (address & 0b11) == 0L) + return Architecture.BIG_ENDIAN ? unsafe.getInt(address) : Integer.reverseBytes(unsafe.getInt(address)); + else + return getIntByByte(address); + } + + public static long getLong(long address) + { + if (Architecture.IS_UNALIGNED || (address & 0b111) == 0L) + return Architecture.BIG_ENDIAN ? unsafe.getLong(address) : Long.reverseBytes(unsafe.getLong(address)); + else + return getLongByByte(address); + } + + public static void setShort(long address, short s) + { + if (Architecture.IS_UNALIGNED || (address & 0b1) == 0L) + unsafe.putShort(address, Architecture.BIG_ENDIAN ? s : Short.reverseBytes(s)); + else + putShortByByte(address, s); + } + + public static void setInt(long address, int l) + { + if (Architecture.IS_UNALIGNED || (address & 0b11) == 0L) + unsafe.putInt(address, Architecture.BIG_ENDIAN ? l : Integer.reverseBytes(l)); + else + putIntByByte(address, l); + } + + public static void setLong(long address, long l) + { + if (Architecture.IS_UNALIGNED || (address & 0b111) == 0L) + unsafe.putLong(address, Architecture.BIG_ENDIAN ? l : Long.reverseBytes(l)); + else + putLongByByte(address, l); + } + + @VisibleForTesting + static long getLongByByte(long address) + { + return (((long) unsafe.getByte(address ) ) << 56) | + (((long) unsafe.getByte(address + 1) & 0xff) << 48) | + (((long) unsafe.getByte(address + 2) & 0xff) << 40) | + (((long) unsafe.getByte(address + 3) & 0xff) << 32) | + (((long) unsafe.getByte(address + 4) & 0xff) << 24) | + (((long) unsafe.getByte(address + 5) & 0xff) << 16) | + (((long) unsafe.getByte(address + 6) & 0xff) << 8) | + (((long) unsafe.getByte(address + 7) & 0xff) ); + } + + @VisibleForTesting + static int getIntByByte(long address) + { + return (((int) unsafe.getByte(address ) ) << 24) | + (((int) unsafe.getByte(address + 1) & 0xff) << 16) | + (((int) unsafe.getByte(address + 2) & 0xff) << 8) | + (((int) unsafe.getByte(address + 3) & 0xff) ); + } + + @VisibleForTesting + static int getShortByByte(long address) + { + return (((int) unsafe.getByte(address ) ) << 8) | + (((int) unsafe.getByte(address + 1) & 0xff) ); + } + + @VisibleForTesting + static void putLongByByte(long address, long value) + { + unsafe.putByte(address , (byte) (value >> 56)); + unsafe.putByte(address + 1, (byte) (value >> 48)); + unsafe.putByte(address + 2, (byte) (value >> 40)); + unsafe.putByte(address + 3, (byte) (value >> 32)); + unsafe.putByte(address + 4, (byte) (value >> 24)); + unsafe.putByte(address + 5, (byte) (value >> 16)); + unsafe.putByte(address + 6, (byte) (value >> 8)); + unsafe.putByte(address + 7, (byte) (value )); + } + + @VisibleForTesting + static void putIntByByte(long address, int value) + { + unsafe.putByte(address , (byte) (value >> 24)); + unsafe.putByte(address + 1, (byte) (value >> 16)); + unsafe.putByte(address + 2, (byte) (value >> 8)); + unsafe.putByte(address + 3, (byte) (value )); + } + + @VisibleForTesting + static void putShortByByte(long address, short value) + { + unsafe.putByte(address , (byte) (value >> 8)); + unsafe.putByte(address + 1, (byte) (value )); + } + + public static ByteBuffer getByteBuffer(long address, int length) + { + return getByteBuffer(address, length, ByteOrder.BIG_ENDIAN); + } + + public static ByteBuffer getHollowDirectByteBuffer() + { + return getHollowDirectByteBuffer(ByteOrder.BIG_ENDIAN); + } +} diff --git a/src/java/org/apache/cassandra/utils/memory/BufferPool.java b/src/java/org/apache/cassandra/utils/memory/BufferPool.java index cddfc8fe6122..90c95228f9fe 100644 --- a/src/java/org/apache/cassandra/utils/memory/BufferPool.java +++ b/src/java/org/apache/cassandra/utils/memory/BufferPool.java @@ -130,6 +130,7 @@ public class BufferPool public static final int TINY_CHUNK_SIZE = NORMAL_ALLOCATION_UNIT; public static final int TINY_ALLOCATION_UNIT = TINY_CHUNK_SIZE / 64; public static final int TINY_ALLOCATION_LIMIT = TINY_CHUNK_SIZE / 2; + private static final boolean REF_TRACE_ENABLED = Ref.TRACE_ENABLED; private static final Logger logger = LoggerFactory.getLogger(BufferPool.class); private static final NoSpamLogger noSpamLogger = NoSpamLogger.getLogger(logger, 15L, TimeUnit.MINUTES); @@ -1330,7 +1331,7 @@ static Chunk getParentChunk(ByteBuffer buffer) void setAttachment(ByteBuffer buffer) { - if (Ref.DEBUG_ENABLED) + if (REF_TRACE_ENABLED) MemoryUtil.setAttachment(buffer, new DirectBufferRef<>(this, null)); else MemoryUtil.setAttachment(buffer, this); @@ -1342,7 +1343,7 @@ boolean releaseAttachment(ByteBuffer buffer) if (attachment == null) return false; - if (Ref.DEBUG_ENABLED) + if (REF_TRACE_ENABLED) ((DirectBufferRef) attachment).release(); return true; diff --git a/src/java/org/apache/cassandra/utils/memory/LittleEndianMemoryUtil.java b/src/java/org/apache/cassandra/utils/memory/LittleEndianMemoryUtil.java new file mode 100644 index 000000000000..2553b9314984 --- /dev/null +++ b/src/java/org/apache/cassandra/utils/memory/LittleEndianMemoryUtil.java @@ -0,0 +1,146 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.utils.memory; + + +import java.nio.ByteBuffer; +import java.nio.ByteOrder; + +import com.google.common.annotations.VisibleForTesting; + +import org.apache.cassandra.utils.Architecture; + +public class LittleEndianMemoryUtil extends MemoryUtil +{ + public static int getUnsignedShort(long address) + { + if (Architecture.IS_UNALIGNED || (address & 0b1) == 0L) + return (Architecture.BIG_ENDIAN ? Short.reverseBytes(unsafe.getShort(address)) : unsafe.getShort(address)) & 0xffff; + else + return getShortByByte(address) & 0xffff; + } + + public static int getInt(long address) + { + if (Architecture.IS_UNALIGNED || (address & 0b11) == 0L) + return Architecture.BIG_ENDIAN ? Integer.reverseBytes(unsafe.getInt(address)) : unsafe.getInt(address); + else + return getIntByByte(address); + } + + public static long getLong(long address) + { + if (Architecture.IS_UNALIGNED || (address & 0b111) == 0L) + return Architecture.BIG_ENDIAN ? Long.reverseBytes(unsafe.getLong(address)) : unsafe.getLong(address); + else + return getLongByByte(address); + } + + public static void setShort(long address, short s) + { + if (Architecture.IS_UNALIGNED || (address & 0b1) == 0L) + unsafe.putShort(address, Architecture.BIG_ENDIAN ? Short.reverseBytes(s) : s); + else + putShortByByte(address, s); + } + + public static void setInt(long address, int l) + { + if (Architecture.IS_UNALIGNED || (address & 0b11) == 0L) + unsafe.putInt(address, Architecture.BIG_ENDIAN ? Integer.reverseBytes(l) : l); + else + putIntByByte(address, l); + } + + public static void setLong(long address, long l) + { + if (Architecture.IS_UNALIGNED || (address & 0b111) == 0L) + unsafe.putLong(address, Architecture.BIG_ENDIAN ? Long.reverseBytes(l) : l); + else + putLongByByte(address, l); + } + + @VisibleForTesting + static long getLongByByte(long address) + { + return (((long) unsafe.getByte(address + 7) ) << 56) | + (((long) unsafe.getByte(address + 6) & 0xff) << 48) | + (((long) unsafe.getByte(address + 5) & 0xff) << 40) | + (((long) unsafe.getByte(address + 4) & 0xff) << 32) | + (((long) unsafe.getByte(address + 3) & 0xff) << 24) | + (((long) unsafe.getByte(address + 2) & 0xff) << 16) | + (((long) unsafe.getByte(address + 1) & 0xff) << 8) | + (((long) unsafe.getByte(address ) & 0xff) ); + } + + @VisibleForTesting + static int getIntByByte(long address) + { + return (((int) unsafe.getByte(address + 3) ) << 24) | + (((int) unsafe.getByte(address + 2) & 0xff) << 16) | + (((int) unsafe.getByte(address + 1) & 0xff) << 8) | + (((int) unsafe.getByte(address ) & 0xff) ); + } + + @VisibleForTesting + static int getShortByByte(long address) + { + return (((int) unsafe.getByte(address + 1) ) << 8) | + (((int) unsafe.getByte(address ) & 0xff) ); + } + + @VisibleForTesting + static void putLongByByte(long address, long value) + { + unsafe.putByte(address + 7, (byte) (value >> 56)); + unsafe.putByte(address + 6, (byte) (value >> 48)); + unsafe.putByte(address + 5, (byte) (value >> 40)); + unsafe.putByte(address + 4, (byte) (value >> 32)); + unsafe.putByte(address + 3, (byte) (value >> 24)); + unsafe.putByte(address + 2, (byte) (value >> 16)); + unsafe.putByte(address + 1, (byte) (value >> 8)); + unsafe.putByte(address , (byte) (value )); + } + + @VisibleForTesting + static void putIntByByte(long address, int value) + { + unsafe.putByte(address + 3, (byte) (value >> 24)); + unsafe.putByte(address + 2, (byte) (value >> 16)); + unsafe.putByte(address + 1, (byte) (value >> 8)); + unsafe.putByte(address , (byte) (value )); + } + + @VisibleForTesting + static void putShortByByte(long address, short value) + { + unsafe.putByte(address + 1, (byte) (value >> 8)); + unsafe.putByte(address , (byte) (value )); + } + + public static ByteBuffer getByteBuffer(long address, int length) + { + return getByteBuffer(address, length, ByteOrder.LITTLE_ENDIAN); + } + + public static ByteBuffer getHollowDirectByteBuffer() + { + return getHollowDirectByteBuffer(ByteOrder.LITTLE_ENDIAN); + } +} diff --git a/src/java/org/apache/cassandra/utils/memory/MemoryUtil.java b/src/java/org/apache/cassandra/utils/memory/MemoryUtil.java index 453f3eda1ba3..724c673f883f 100644 --- a/src/java/org/apache/cassandra/utils/memory/MemoryUtil.java +++ b/src/java/org/apache/cassandra/utils/memory/MemoryUtil.java @@ -24,24 +24,20 @@ import com.sun.jna.Native; -import org.apache.cassandra.utils.Architecture; - import sun.misc.Unsafe; public abstract class MemoryUtil { private static final long UNSAFE_COPY_THRESHOLD = 1024 * 1024L; // copied from java.nio.Bits - private static final Unsafe unsafe; + protected static final Unsafe unsafe; private static final Class DIRECT_BYTE_BUFFER_CLASS, RO_DIRECT_BYTE_BUFFER_CLASS; private static final long DIRECT_BYTE_BUFFER_ADDRESS_OFFSET; private static final long DIRECT_BYTE_BUFFER_CAPACITY_OFFSET; private static final long DIRECT_BYTE_BUFFER_LIMIT_OFFSET; private static final long DIRECT_BYTE_BUFFER_POSITION_OFFSET; private static final long DIRECT_BYTE_BUFFER_ATTACHMENT_OFFSET; - private static final Class BYTE_BUFFER_CLASS; - private static final long BYTE_BUFFER_OFFSET_OFFSET; - private static final long BYTE_BUFFER_HB_OFFSET; + protected static final Class BYTE_BUFFER_CLASS; private static final long BYTE_ARRAY_BASE_OFFSET; static @@ -61,8 +57,6 @@ public abstract class MemoryUtil RO_DIRECT_BYTE_BUFFER_CLASS = ByteBuffer.allocateDirect(0).asReadOnlyBuffer().getClass(); clazz = ByteBuffer.allocate(0).getClass(); - BYTE_BUFFER_OFFSET_OFFSET = unsafe.objectFieldOffset(ByteBuffer.class.getDeclaredField("offset")); - BYTE_BUFFER_HB_OFFSET = unsafe.objectFieldOffset(ByteBuffer.class.getDeclaredField("hb")); BYTE_BUFFER_CLASS = clazz; BYTE_ARRAY_BASE_OFFSET = unsafe.arrayBaseOffset(byte[].class); @@ -104,56 +98,11 @@ public static void setByte(long address, int count, byte b) unsafe.setMemory(address, count, b); } - public static void setShort(long address, short s) - { - unsafe.putShort(address, Architecture.BIG_ENDIAN ? Short.reverseBytes(s) : s); - } - - public static void setInt(long address, int l) - { - if (Architecture.IS_UNALIGNED) - unsafe.putInt(address, Architecture.BIG_ENDIAN ? Integer.reverseBytes(l) : l); - else - putIntByByte(address, l); - } - - public static void setLong(long address, long l) - { - if (Architecture.IS_UNALIGNED) - unsafe.putLong(address, Architecture.BIG_ENDIAN ? Long.reverseBytes(l) : l); - else - putLongByByte(address, l); - } - public static byte getByte(long address) { return unsafe.getByte(address); } - public static int getShort(long address) - { - if (Architecture.IS_UNALIGNED) - return (Architecture.BIG_ENDIAN ? Short.reverseBytes(unsafe.getShort(address)) : unsafe.getShort(address)) & 0xffff; - else - return getShortByByte(address) & 0xffff; - } - - public static int getInt(long address) - { - if (Architecture.IS_UNALIGNED) - return Architecture.BIG_ENDIAN ? Integer.reverseBytes(unsafe.getInt(address)) : unsafe.getInt(address); - else - return getIntByByte(address); - } - - public static long getLong(long address) - { - if (Architecture.IS_UNALIGNED) - return Architecture.BIG_ENDIAN ? Long.reverseBytes(unsafe.getLong(address)) : unsafe.getLong(address); - else - return getLongByByte(address); - } - public static ByteBuffer getByteBuffer(long address, int length) { return getByteBuffer(address, length, ByteOrder.nativeOrder()); @@ -186,21 +135,6 @@ public static ByteBuffer getHollowDirectByteBuffer(ByteOrder order) return instance; } - public static ByteBuffer getHollowByteBuffer() - { - ByteBuffer instance; - try - { - instance = (ByteBuffer) unsafe.allocateInstance(BYTE_BUFFER_CLASS); - } - catch (InstantiationException e) - { - throw new AssertionError(e); - } - instance.order(ByteOrder.nativeOrder()); - return instance; - } - public static boolean isExactlyDirect(ByteBuffer buffer) { return buffer.getClass() == DIRECT_BYTE_BUFFER_CLASS; @@ -250,107 +184,24 @@ public static void setByteBufferCapacity(ByteBuffer instance, int capacity) unsafe.putInt(instance, DIRECT_BYTE_BUFFER_CAPACITY_OFFSET, capacity); } - public static long getLongByByte(long address) - { - if (Architecture.BIG_ENDIAN) - { - return (((long) unsafe.getByte(address ) ) << 56) | - (((long) unsafe.getByte(address + 1) & 0xff) << 48) | - (((long) unsafe.getByte(address + 2) & 0xff) << 40) | - (((long) unsafe.getByte(address + 3) & 0xff) << 32) | - (((long) unsafe.getByte(address + 4) & 0xff) << 24) | - (((long) unsafe.getByte(address + 5) & 0xff) << 16) | - (((long) unsafe.getByte(address + 6) & 0xff) << 8) | - (((long) unsafe.getByte(address + 7) & 0xff) ); - } - else - { - return (((long) unsafe.getByte(address + 7) ) << 56) | - (((long) unsafe.getByte(address + 6) & 0xff) << 48) | - (((long) unsafe.getByte(address + 5) & 0xff) << 40) | - (((long) unsafe.getByte(address + 4) & 0xff) << 32) | - (((long) unsafe.getByte(address + 3) & 0xff) << 24) | - (((long) unsafe.getByte(address + 2) & 0xff) << 16) | - (((long) unsafe.getByte(address + 1) & 0xff) << 8) | - (((long) unsafe.getByte(address ) & 0xff) ); - } - } - - public static int getIntByByte(long address) - { - if (Architecture.BIG_ENDIAN) - { - return (((int) unsafe.getByte(address ) ) << 24) | - (((int) unsafe.getByte(address + 1) & 0xff) << 16) | - (((int) unsafe.getByte(address + 2) & 0xff) << 8 ) | - (((int) unsafe.getByte(address + 3) & 0xff) ); - } - else - { - return (((int) unsafe.getByte(address + 3) ) << 24) | - (((int) unsafe.getByte(address + 2) & 0xff) << 16) | - (((int) unsafe.getByte(address + 1) & 0xff) << 8) | - (((int) unsafe.getByte(address ) & 0xff) ); - } - } - - - public static int getShortByByte(long address) + /** + * Transfers count bytes to Memory starting at memoryOffset from ByteBuffer starting at bufferOffset + * + * @param targetAddress target start offset in the memory + * @param sourceBuffer the source data buffer + * @param bufferOffset start offset of the buffer + * @param count number of bytes to transfer + */ + public static void setBytes(long targetAddress, ByteBuffer sourceBuffer, int bufferOffset, int count) { - if (Architecture.BIG_ENDIAN) - { - return (((int) unsafe.getByte(address ) ) << 8) | - (((int) unsafe.getByte(address + 1) & 0xff) ); - } - else - { - return (((int) unsafe.getByte(address + 1) ) << 8) | - (((int) unsafe.getByte(address ) & 0xff) ); - } - } + if (count == 0) + return; + int start = sourceBuffer.position() + bufferOffset; - public static void putLongByByte(long address, long value) - { - if (Architecture.BIG_ENDIAN) - { - unsafe.putByte(address, (byte) (value >> 56)); - unsafe.putByte(address + 1, (byte) (value >> 48)); - unsafe.putByte(address + 2, (byte) (value >> 40)); - unsafe.putByte(address + 3, (byte) (value >> 32)); - unsafe.putByte(address + 4, (byte) (value >> 24)); - unsafe.putByte(address + 5, (byte) (value >> 16)); - unsafe.putByte(address + 6, (byte) (value >> 8)); - unsafe.putByte(address + 7, (byte) (value)); - } + if (sourceBuffer.isDirect()) + setBytes(getAddress(sourceBuffer) + start, targetAddress, count); else - { - unsafe.putByte(address + 7, (byte) (value >> 56)); - unsafe.putByte(address + 6, (byte) (value >> 48)); - unsafe.putByte(address + 5, (byte) (value >> 40)); - unsafe.putByte(address + 4, (byte) (value >> 32)); - unsafe.putByte(address + 3, (byte) (value >> 24)); - unsafe.putByte(address + 2, (byte) (value >> 16)); - unsafe.putByte(address + 1, (byte) (value >> 8)); - unsafe.putByte(address, (byte) (value)); - } - } - - public static void putIntByByte(long address, int value) - { - if (Architecture.BIG_ENDIAN) - { - unsafe.putByte(address, (byte) (value >> 24)); - unsafe.putByte(address + 1, (byte) (value >> 16)); - unsafe.putByte(address + 2, (byte) (value >> 8)); - unsafe.putByte(address + 3, (byte) (value)); - } - else - { - unsafe.putByte(address + 3, (byte) (value >> 24)); - unsafe.putByte(address + 2, (byte) (value >> 16)); - unsafe.putByte(address + 1, (byte) (value >> 8)); - unsafe.putByte(address, (byte) (value)); - } + setBytes(targetAddress, sourceBuffer.array(), sourceBuffer.arrayOffset() + start, count); } public static void setBytes(long address, ByteBuffer buffer) @@ -424,4 +275,51 @@ else if (count == 0) unsafe.copyMemory(null, address, buffer, BYTE_ARRAY_BASE_OFFSET + bufferOffset, count); } + + /** + * Transfers count bytes from Memory starting at address to ByteBuffer starting at bufferOffset + * + * @param sourceAddress start offset in the memory + * @param targetBuffer the target data buffer + * @param bufferOffset start offset of the buffer + * @param length number of bytes to transfer + */ + public static void getBytes(long sourceAddress, ByteBuffer targetBuffer, int bufferOffset, int length) + { + if (targetBuffer == null) + throw new NullPointerException(); + else if (length < 0 || length > targetBuffer.remaining()) + throw new IndexOutOfBoundsException(); + else if (length == 0) + return; + + Object obj; + long offset; + if (targetBuffer.hasArray()) + { + obj = targetBuffer.array(); + offset = BYTE_ARRAY_BASE_OFFSET + targetBuffer.arrayOffset(); + } + else + { + obj = null; + offset = unsafe.getLong(targetBuffer, DIRECT_BYTE_BUFFER_ADDRESS_OFFSET); + } + offset += targetBuffer.position(); + offset += bufferOffset; + + unsafe.copyMemory(null, sourceAddress, obj, offset, length); + } + + /** + * Transfers count bytes from Memory starting at address to ByteBuffer + * + * @param sourceAddress start offset in the memory + * @param targetBuffer the target data buffer + * @param length number of bytes to transfer + */ + public static void getBytes(long sourceAddress, ByteBuffer targetBuffer, int length) + { + getBytes(sourceAddress, targetBuffer, 0, length); + } } diff --git a/src/java/org/apache/cassandra/utils/memory/NativeEndianMemoryUtil.java b/src/java/org/apache/cassandra/utils/memory/NativeEndianMemoryUtil.java new file mode 100644 index 000000000000..3cb5edb28b98 --- /dev/null +++ b/src/java/org/apache/cassandra/utils/memory/NativeEndianMemoryUtil.java @@ -0,0 +1,214 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.utils.memory; + + +import java.nio.ByteBuffer; +import java.nio.ByteOrder; + +import com.google.common.annotations.VisibleForTesting; + +import org.apache.cassandra.utils.Architecture; + +/** + * Use this API only for data which are stored in-memory + * and not serialized directly (without converting to Java primitives) to disk and network + */ +public class NativeEndianMemoryUtil extends MemoryUtil +{ + public static int getUnsignedShort(long address) + { + if (Architecture.IS_UNALIGNED || (address & 0b1) == 0L) + return unsafe.getShort(address) & 0xffff; + else + return getShortByByte(address) & 0xffff; + } + + public static int getInt(long address) + { + if (Architecture.IS_UNALIGNED || (address & 0b11) == 0L) + return unsafe.getInt(address); + else + return getIntByByte(address); + } + + public static long getLong(long address) + { + if (Architecture.IS_UNALIGNED || (address & 0b111) == 0L) + return unsafe.getLong(address); + else + return getLongByByte(address); + } + + public static void setShort(long address, short s) + { + if (Architecture.IS_UNALIGNED || (address & 0b1) == 0L) + unsafe.putShort(address, s); + else + putShortByByte(address, s); + } + + public static void setInt(long address, int l) + { + if (Architecture.IS_UNALIGNED || (address & 0b11) == 0L) + unsafe.putInt(address, l); + else + putIntByByte(address, l); + } + + public static void setLong(long address, long l) + { + if (Architecture.IS_UNALIGNED || (address & 0b111) == 0L) + unsafe.putLong(address, l); + else + putLongByByte(address, l); + } + + @VisibleForTesting + static long getLongByByte(long address) + { + if (Architecture.BIG_ENDIAN) + { + return (((long) unsafe.getByte(address ) ) << 56) | + (((long) unsafe.getByte(address + 1) & 0xff) << 48) | + (((long) unsafe.getByte(address + 2) & 0xff) << 40) | + (((long) unsafe.getByte(address + 3) & 0xff) << 32) | + (((long) unsafe.getByte(address + 4) & 0xff) << 24) | + (((long) unsafe.getByte(address + 5) & 0xff) << 16) | + (((long) unsafe.getByte(address + 6) & 0xff) << 8) | + (((long) unsafe.getByte(address + 7) & 0xff) ); + } + else + { + return (((long) unsafe.getByte(address + 7) ) << 56) | + (((long) unsafe.getByte(address + 6) & 0xff) << 48) | + (((long) unsafe.getByte(address + 5) & 0xff) << 40) | + (((long) unsafe.getByte(address + 4) & 0xff) << 32) | + (((long) unsafe.getByte(address + 3) & 0xff) << 24) | + (((long) unsafe.getByte(address + 2) & 0xff) << 16) | + (((long) unsafe.getByte(address + 1) & 0xff) << 8) | + (((long) unsafe.getByte(address ) & 0xff) ); + } + } + + @VisibleForTesting + static int getIntByByte(long address) + { + if (Architecture.BIG_ENDIAN) + { + return (((int) unsafe.getByte(address ) ) << 24) | + (((int) unsafe.getByte(address + 1) & 0xff) << 16) | + (((int) unsafe.getByte(address + 2) & 0xff) << 8) | + (((int) unsafe.getByte(address + 3) & 0xff) ); + } + else + { + return (((int) unsafe.getByte(address + 3) ) << 24) | + (((int) unsafe.getByte(address + 2) & 0xff) << 16) | + (((int) unsafe.getByte(address + 1) & 0xff) << 8) | + (((int) unsafe.getByte(address ) & 0xff) ); + } + } + + @VisibleForTesting + static int getShortByByte(long address) + { + if (Architecture.BIG_ENDIAN) + { + return (((int) unsafe.getByte(address ) ) << 8) | + (((int) unsafe.getByte(address + 1) & 0xff) ); + } + else + { + return (((int) unsafe.getByte(address + 1) ) << 8) | + (((int) unsafe.getByte(address ) & 0xff) ); + } + } + + @VisibleForTesting + static void putLongByByte(long address, long value) + { + if (Architecture.BIG_ENDIAN) + { + unsafe.putByte(address , (byte) (value >> 56)); + unsafe.putByte(address + 1, (byte) (value >> 48)); + unsafe.putByte(address + 2, (byte) (value >> 40)); + unsafe.putByte(address + 3, (byte) (value >> 32)); + unsafe.putByte(address + 4, (byte) (value >> 24)); + unsafe.putByte(address + 5, (byte) (value >> 16)); + unsafe.putByte(address + 6, (byte) (value >> 8)); + unsafe.putByte(address + 7, (byte) (value )); + } + else + { + unsafe.putByte(address + 7, (byte) (value >> 56)); + unsafe.putByte(address + 6, (byte) (value >> 48)); + unsafe.putByte(address + 5, (byte) (value >> 40)); + unsafe.putByte(address + 4, (byte) (value >> 32)); + unsafe.putByte(address + 3, (byte) (value >> 24)); + unsafe.putByte(address + 2, (byte) (value >> 16)); + unsafe.putByte(address + 1, (byte) (value >> 8)); + unsafe.putByte(address , (byte) (value )); + } + } + + @VisibleForTesting + static void putIntByByte(long address, int value) + { + if (Architecture.BIG_ENDIAN) + { + unsafe.putByte(address , (byte) (value >> 24)); + unsafe.putByte(address + 1, (byte) (value >> 16)); + unsafe.putByte(address + 2, (byte) (value >> 8)); + unsafe.putByte(address + 3, (byte) (value )); + } + else + { + unsafe.putByte(address + 3, (byte) (value >> 24)); + unsafe.putByte(address + 2, (byte) (value >> 16)); + unsafe.putByte(address + 1, (byte) (value >> 8)); + unsafe.putByte(address , (byte) (value )); + } + } + + @VisibleForTesting + static void putShortByByte(long address, short value) + { + if (Architecture.BIG_ENDIAN) + { + unsafe.putByte(address , (byte) (value >> 8)); + unsafe.putByte(address + 1, (byte) (value )); + } + else + { + unsafe.putByte(address + 1, (byte) (value >> 8)); + unsafe.putByte(address , (byte) (value )); + } + } + + public static ByteBuffer getByteBuffer(long address, int length) + { + return getByteBuffer(address, length, ByteOrder.nativeOrder()); + } + + public static ByteBuffer getHollowDirectByteBuffer() + { + return getHollowDirectByteBuffer(ByteOrder.nativeOrder()); + } +} diff --git a/src/java/org/apache/cassandra/utils/vint/VIntCoding.java b/src/java/org/apache/cassandra/utils/vint/VIntCoding.java index c2bb51004914..a444f4147bc3 100644 --- a/src/java/org/apache/cassandra/utils/vint/VIntCoding.java +++ b/src/java/org/apache/cassandra/utils/vint/VIntCoding.java @@ -49,8 +49,10 @@ import java.io.DataInput; import java.io.IOException; import java.nio.ByteBuffer; +import java.nio.ByteOrder; import io.netty.util.concurrent.FastThreadLocal; +import net.nicoulaj.compilecommand.annotations.DontInline; import net.nicoulaj.compilecommand.annotations.Inline; import org.apache.cassandra.db.marshal.ValueAccessor; import org.apache.cassandra.io.util.DataInputPlus; @@ -109,6 +111,90 @@ public static long readUnsignedVInt(DataInput input) throws IOException return retval; } + @DontInline + private static long readUnsignedVIntSlow(ByteBuffer in, byte firstByte) + { + int size = numberOfExtraBytesToRead(firstByte); + long retval = firstByte & firstByteValueMask(size); + for (int ii = 0; ii < size; ii++) + { + byte b = in.get(); + retval <<= 8; + retval |= b & 0xff; + } + + return retval; + } + + @DontInline + private static long readUnsignedVIntSlow(ByteBuffer in, int position, byte firstByte) + { + int size = numberOfExtraBytesToRead(firstByte); + long retval = firstByte & firstByteValueMask(size); + for (int ii = 0; ii < size; ii++) + { + byte b = in.get(position++); + retval <<= 8; + retval |= b & 0xff; + } + + return retval; + } + + public static long readUnsignedVInt(ByteBuffer in) + { + byte firstByte = in.get(); + if (firstByte >= 0) + return firstByte; + + int position = in.position(); + int limit = in.limit(); + if (limit - position < 8) + return readUnsignedVIntSlow(in, firstByte); + + int extraBytes = VIntCoding.numberOfExtraBytesToRead(firstByte); + int extraBits = extraBytes * 8; + + long retval = in.getLong(position); + if (in.order() == ByteOrder.LITTLE_ENDIAN) + retval = Long.reverseBytes(retval); + in.position(position + extraBytes); + + // truncate the bytes we read in excess of those we needed + retval >>>= 64 - extraBits; + // remove the non-value bits from the first byte + firstByte &= VIntCoding.firstByteValueMask(extraBytes); + // shift the first byte up to its correct position + retval |= (long) firstByte << extraBits; + return retval; + } + + public static long readUnsignedVInt(ByteBuffer in, int position) + { + byte firstByte = in.get(position++); + if (firstByte >= 0) + return firstByte; + + int limit = in.limit(); + if (limit - position < 8) + return readUnsignedVIntSlow(in, position, firstByte); + + int extraBytes = VIntCoding.numberOfExtraBytesToRead(firstByte); + int extraBits = extraBytes * 8; + + long retval = in.getLong(position); + if (in.order() == ByteOrder.LITTLE_ENDIAN) + retval = Long.reverseBytes(retval); + + // truncate the bytes we read in excess of those we needed + retval >>>= 64 - extraBits; + // remove the non-value bits from the first byte + firstByte &= VIntCoding.firstByteValueMask(extraBytes); + // shift the first byte up to its correct position + retval |= (long) firstByte << extraBits; + return retval; + } + public static void skipUnsignedVInt(DataInputPlus input) throws IOException { int firstByte = input.readByte(); @@ -222,6 +308,17 @@ public static long getUnsignedVInt(V input, ValueAccessor accessor, int r return retval; } + public static int readLengthOfVInt(V input, ValueAccessor accessor, int position) + { + byte firstByte = accessor.getByte(input, position); + if (firstByte >= 0) + return 1; + + int extraBytes = accord.utils.VIntCoding.numberOfExtraBytesToRead(firstByte); + return 1 + extraBytes; + } + + /** * Computes size of an unsigned vint that starts at readerIndex of the provided ByteBuf. * @@ -245,6 +342,11 @@ public static long readVInt(DataInput input) throws IOException return decodeZigZag64(readUnsignedVInt(input)); } + public static long readVInt(ByteBuffer input) + { + return decodeZigZag64(readUnsignedVInt(input)); + } + /** * Read up to a signed 32-bit integer back. * @@ -271,6 +373,34 @@ public static int readUnsignedVInt32(DataInput input) throws IOException return checkedCast(readUnsignedVInt(input)); } + /** + * Read up to a 32-bit integer. + * + * This method assumes the original integer was written using {@link #writeUnsignedVInt32(int, DataOutputPlus)} + * or similar that doesn't zigzag encodes the vint. + * + * @throws VIntOutOfRangeException If the vint doesn't fit into a 32-bit integer + */ + public static int readUnsignedVInt32(ByteBuffer input) + { + return checkedCast(readUnsignedVInt(input)); + } + + public static int readUnsignedVInt32(ByteBuffer input, int position) + { + return checkedCast(readUnsignedVInt(input, position)); + } + + public static int readLengthOfVInt(ByteBuffer in, int position) + { + byte firstByte = in.get(position); + if (firstByte >= 0) + return 1; + + int extraBytes = numberOfExtraBytesToRead(firstByte); + return 1 + extraBytes; + } + // & this with the first byte to give the value part for a given extraBytesToRead encoded in the byte public static int firstByteValueMask(int extraBytesToRead) { diff --git a/test/burn/org/apache/cassandra/service/accord/AccordExecutorBurnTest.java b/test/burn/org/apache/cassandra/service/accord/AccordExecutorBurnTest.java new file mode 100644 index 000000000000..88a785d9a92c --- /dev/null +++ b/test/burn/org/apache/cassandra/service/accord/AccordExecutorBurnTest.java @@ -0,0 +1,91 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import java.util.concurrent.atomic.AtomicInteger; +import java.util.function.Function; + +import org.junit.Test; + +import org.apache.cassandra.concurrent.ExecutorPlus; +import org.apache.cassandra.utils.concurrent.Semaphore; +import org.apache.cassandra.utils.concurrent.UncheckedInterruptedException; + +import static org.apache.cassandra.concurrent.ExecutorFactory.Global.executorFactory; + +public class AccordExecutorBurnTest +{ + static class State + { + final AccordExecutor executor; + final AccordCommandStore[] commandStores; + final ExecutorPlus loadGen; + final int generators; + final int targetCount; + final Semaphore permits; + final AtomicInteger submitted = new AtomicInteger(); + final AtomicInteger completed = new AtomicInteger(); + + State(AccordExecutor executor, Function storeFactory, + int taskCount, int concurrency, int generators, int commandStores) + { + this.executor = executor; + this.targetCount = taskCount; + this.permits = Semaphore.newSemaphore(concurrency); + this.generators = generators; + this.loadGen = executorFactory().pooled("loadgen", generators); + this.commandStores = new AccordCommandStore[commandStores]; + for (int i = 0 ; i < commandStores ; ++i) + this.commandStores[i] = storeFactory.apply(executor); + } + + void start() + { + for (int i = 0 ; i < generators ; ++i) + loadGen.execute(this::run); + } + + void run() + { + while (true) + { + int slot = submitted.get(); + if (slot >= targetCount) + return; + if (!submitted.compareAndSet(slot, slot + 1)) + continue; + + try { permits.acquire(1); } + catch (InterruptedException e) { throw new UncheckedInterruptedException(e); } + submitSomething(); + } + } + + private void submitSomething() + { + } + } + + @Test + public void test() + { + + } + +} diff --git a/test/burn/org/apache/cassandra/transport/SimpleClientBurnTest.java b/test/burn/org/apache/cassandra/transport/SimpleClientBurnTest.java index ed3406d8bc04..3247051197ae 100644 --- a/test/burn/org/apache/cassandra/transport/SimpleClientBurnTest.java +++ b/test/burn/org/apache/cassandra/transport/SimpleClientBurnTest.java @@ -138,11 +138,11 @@ public int encodedSize(QueryMessage queryMessage, ProtocolVersion version) Arrays.asList( () -> new SimpleClient(address.getHostAddress(), port, ProtocolVersion.V5, true, - new EncryptionOptions()) + new EncryptionOptions.ClientEncryptionOptions()) .connect(false), () -> new SimpleClient(address.getHostAddress(), port, ProtocolVersion.V4, false, - new EncryptionOptions()) + new EncryptionOptions.ClientEncryptionOptions()) .connect(false) ); diff --git a/test/burn/org/apache/cassandra/transport/SimpleClientPerfTest.java b/test/burn/org/apache/cassandra/transport/SimpleClientPerfTest.java index 4417b7cb8957..ca5adfff02da 100644 --- a/test/burn/org/apache/cassandra/transport/SimpleClientPerfTest.java +++ b/test/burn/org/apache/cassandra/transport/SimpleClientPerfTest.java @@ -102,7 +102,7 @@ public void measureSmall() throws Throwable new SizeCaps(10, 20, 5, 10), () -> new SimpleClient(address.getHostAddress(), port, version, true, - new EncryptionOptions()) + new EncryptionOptions.ClientEncryptionOptions()) .connect(false), version); } @@ -114,7 +114,7 @@ public void measureSmallWithCompression() throws Throwable new SizeCaps(10, 20, 5, 10), () -> new SimpleClient(address.getHostAddress(), port, version, true, - new EncryptionOptions()) + new EncryptionOptions.ClientEncryptionOptions()) .connect(true), version); } @@ -126,7 +126,7 @@ public void measureLarge() throws Throwable new SizeCaps(1000, 2000, 5, 150), () -> new SimpleClient(address.getHostAddress(), port, version, true, - new EncryptionOptions()) + new EncryptionOptions.ClientEncryptionOptions()) .connect(false), version); } @@ -138,7 +138,7 @@ public void measureLargeWithCompression() throws Throwable new SizeCaps(1000, 2000, 5, 150), () -> new SimpleClient(address.getHostAddress(), port, version, true, - new EncryptionOptions()) + new EncryptionOptions.ClientEncryptionOptions()) .connect(true), version); } diff --git a/test/conf/cassandra-jmx-disabled-sslconfig.yaml b/test/conf/cassandra-jmx-disabled-sslconfig.yaml index 12baefb75f88..4708909c96a8 100644 --- a/test/conf/cassandra-jmx-disabled-sslconfig.yaml +++ b/test/conf/cassandra-jmx-disabled-sslconfig.yaml @@ -147,3 +147,9 @@ stream_throughput_outbound: 24MiB/s sasi_indexes_enabled: true materialized_views_enabled: true file_cache_enabled: true + +accord: + enabled: true + journal_directory: build/test/cassandra/accord_journal + queue_shard_count: 2 + command_store_shard_count: 4 \ No newline at end of file diff --git a/test/conf/cassandra-jmx-pem-sslconfig.yaml b/test/conf/cassandra-jmx-pem-sslconfig.yaml index 55adfd87dd26..0e8f204791ee 100644 --- a/test/conf/cassandra-jmx-pem-sslconfig.yaml +++ b/test/conf/cassandra-jmx-pem-sslconfig.yaml @@ -147,3 +147,9 @@ stream_throughput_outbound: 24MiB/s sasi_indexes_enabled: true materialized_views_enabled: true file_cache_enabled: true + +accord: + enabled: true + journal_directory: build/test/cassandra/accord_journal + queue_shard_count: 2 + command_store_shard_count: 4 \ No newline at end of file diff --git a/test/conf/cassandra-jmx-sslconfig-with-passwordfile.yaml b/test/conf/cassandra-jmx-sslconfig-with-passwordfile.yaml index 0b495485d128..6f66b04bbc67 100644 --- a/test/conf/cassandra-jmx-sslconfig-with-passwordfile.yaml +++ b/test/conf/cassandra-jmx-sslconfig-with-passwordfile.yaml @@ -72,3 +72,9 @@ stream_throughput_outbound: 24MiB/s sasi_indexes_enabled: true materialized_views_enabled: true file_cache_enabled: true + +accord: + enabled: true + journal_directory: build/test/cassandra/accord_journal + queue_shard_count: 2 + command_store_shard_count: 4 \ No newline at end of file diff --git a/test/conf/cassandra-jmx-sslconfig.yaml b/test/conf/cassandra-jmx-sslconfig.yaml index 1a6ef9a9457f..cd2f9fb0365f 100644 --- a/test/conf/cassandra-jmx-sslconfig.yaml +++ b/test/conf/cassandra-jmx-sslconfig.yaml @@ -72,3 +72,9 @@ stream_throughput_outbound: 24MiB/s sasi_indexes_enabled: true materialized_views_enabled: true file_cache_enabled: true + +accord: + enabled: true + journal_directory: build/test/cassandra/accord_journal + queue_shard_count: 2 + command_store_shard_count: 4 \ No newline at end of file diff --git a/test/conf/cassandra-mtls.yaml b/test/conf/cassandra-mtls.yaml index d6f1b3e52c6b..e80c2ac296e4 100644 --- a/test/conf/cassandra-mtls.yaml +++ b/test/conf/cassandra-mtls.yaml @@ -87,3 +87,5 @@ authenticator: class_name : org.apache.cassandra.auth.MutualTlsAuthenticator parameters : validator_class_name: org.apache.cassandra.auth.SpiffeCertificateValidator +accord: + journal_directory: build/test/cassandra/accord_journal diff --git a/test/conf/cassandra-murmur.yaml b/test/conf/cassandra-murmur.yaml index 2e5828fb56a0..75a208e5fa61 100644 --- a/test/conf/cassandra-murmur.yaml +++ b/test/conf/cassandra-murmur.yaml @@ -42,3 +42,5 @@ user_defined_functions_enabled: true scripted_user_defined_functions_enabled: false sasi_indexes_enabled: true materialized_views_enabled: true +accord: + journal_directory: build/test/cassandra/accord_journal diff --git a/test/conf/cassandra-old.yaml b/test/conf/cassandra-old.yaml index b8c3b028c519..000df1148ff4 100644 --- a/test/conf/cassandra-old.yaml +++ b/test/conf/cassandra-old.yaml @@ -56,3 +56,5 @@ internode_send_buff_size_in_bytes: 5 internode_recv_buff_size_in_bytes: 5 max_hint_window_in_ms: 10800000 cache_load_timeout_seconds: 35 +accord: + journal_directory: build/test/cassandra/accord_journal diff --git a/test/conf/cassandra-pem-jks-sslcontextfactory.yaml b/test/conf/cassandra-pem-jks-sslcontextfactory.yaml index 0bd034d5ae9a..5c27c7b9f951 100644 --- a/test/conf/cassandra-pem-jks-sslcontextfactory.yaml +++ b/test/conf/cassandra-pem-jks-sslcontextfactory.yaml @@ -149,3 +149,5 @@ stream_throughput_outbound: 24MiB/s sasi_indexes_enabled: true materialized_views_enabled: true file_cache_enabled: true +accord: + journal_directory: build/test/cassandra/accord_journal diff --git a/test/conf/cassandra-pem-sslcontextfactory-invalidconfiguration.yaml b/test/conf/cassandra-pem-sslcontextfactory-invalidconfiguration.yaml index 16cffb58bd8b..58c2e9293a45 100644 --- a/test/conf/cassandra-pem-sslcontextfactory-invalidconfiguration.yaml +++ b/test/conf/cassandra-pem-sslcontextfactory-invalidconfiguration.yaml @@ -146,3 +146,5 @@ stream_throughput_outbound: 24MiB/s sasi_indexes_enabled: true materialized_views_enabled: true file_cache_enabled: true +accord: + journal_directory: build/test/cassandra/accord_journal diff --git a/test/conf/cassandra-pem-sslcontextfactory.yaml b/test/conf/cassandra-pem-sslcontextfactory.yaml index 229a0b03fdc3..ef68105e94c8 100644 --- a/test/conf/cassandra-pem-sslcontextfactory.yaml +++ b/test/conf/cassandra-pem-sslcontextfactory.yaml @@ -150,3 +150,5 @@ stream_throughput_outbound: 24MiB/s sasi_indexes_enabled: true materialized_views_enabled: true file_cache_enabled: true +accord: + journal_directory: build/test/cassandra/accord_journal diff --git a/test/conf/cassandra-seeds.yaml b/test/conf/cassandra-seeds.yaml index 53f82dd6ecd7..1049e27fa891 100644 --- a/test/conf/cassandra-seeds.yaml +++ b/test/conf/cassandra-seeds.yaml @@ -41,3 +41,5 @@ row_cache_class_name: org.apache.cassandra.cache.OHCProvider row_cache_size: 16MiB user_defined_functions_enabled: true scripted_user_defined_functions_enabled: false +accord: + journal_directory: build/test/cassandra/accord_journal diff --git a/test/conf/cassandra-sslcontextfactory-invalidconfiguration.yaml b/test/conf/cassandra-sslcontextfactory-invalidconfiguration.yaml index 6b7488336076..26db768f5cb7 100644 --- a/test/conf/cassandra-sslcontextfactory-invalidconfiguration.yaml +++ b/test/conf/cassandra-sslcontextfactory-invalidconfiguration.yaml @@ -80,3 +80,5 @@ stream_throughput_outbound: 23841858MiB/s sasi_indexes_enabled: true materialized_views_enabled: true file_cache_enabled: true +accord: + journal_directory: build/test/cassandra/accord_journal diff --git a/test/conf/cassandra-sslcontextfactory.yaml b/test/conf/cassandra-sslcontextfactory.yaml index a20d26e59bee..153d2f924532 100644 --- a/test/conf/cassandra-sslcontextfactory.yaml +++ b/test/conf/cassandra-sslcontextfactory.yaml @@ -83,3 +83,5 @@ stream_throughput_outbound: 23841858MiB/s sasi_indexes_enabled: true materialized_views_enabled: true file_cache_enabled: true +accord: + journal_directory: build/test/cassandra/accord_journal diff --git a/test/conf/cassandra.yaml b/test/conf/cassandra.yaml index e9ba02c4415e..20c1d2fd70dd 100644 --- a/test/conf/cassandra.yaml +++ b/test/conf/cassandra.yaml @@ -68,6 +68,12 @@ local_read_size_fail_threshold: 8192KiB row_index_read_size_warn_threshold: 4096KiB row_index_read_size_fail_threshold: 8192KiB +accord: + enabled: true + journal_directory: build/test/cassandra/accord_journal + queue_shard_count: 2 + command_store_shard_count: 4 + memtable: configurations: skiplist: diff --git a/test/conf/logback-burntest.xml b/test/conf/logback-burntest.xml index e1e48a9d3fae..f2ade83f2e26 100644 --- a/test/conf/logback-burntest.xml +++ b/test/conf/logback-burntest.xml @@ -20,7 +20,7 @@ - + @@ -36,7 +36,7 @@ - %-5level [%thread] ${instance_id} %date{ISO8601} %msg%n + %-5level [%thread] ${instance_id} %date{"yyyy-MM-dd'T'HH:mm:ss,SSS", UTC} %msg%n false @@ -51,7 +51,7 @@ - %-5level [%thread] ${instance_id} %date{ISO8601} %F:%L - %msg%n + %-5level [%thread] ${instance_id} %date{"yyyy-MM-dd'T'HH:mm:ss,SSS", UTC} %F:%L - %msg%n @@ -63,4 +63,7 @@ + + + diff --git a/test/conf/logback-dtest-quiet.xml b/test/conf/logback-dtest-quiet.xml new file mode 100644 index 000000000000..8f1f1f15fd99 --- /dev/null +++ b/test/conf/logback-dtest-quiet.xml @@ -0,0 +1,56 @@ + + + + + + + + + + + ./build/test/logs/${cassandra.testtag}/${suitename}/${cluster_id}/${instance_id}/system.log + + %-5level [%thread] ${instance_id} %date{"yyyy-MM-dd'T'HH:mm:ss,SSS", UTC} %F:%L - %msg%n + + + INFO + + true + + + + + %-5level %date{"HH:mm:ss,SSS"} %msg%n + + + ERROR + + + + + + + + + + diff --git a/test/conf/logback-dtest.xml b/test/conf/logback-dtest.xml index 48d9859b67e3..22d2e9faa4a6 100644 --- a/test/conf/logback-dtest.xml +++ b/test/conf/logback-dtest.xml @@ -22,19 +22,19 @@ - + ./build/test/logs/${cassandra.testtag}/${suitename}/${cluster_id}/${instance_id}/system.log - %-5level [%thread] ${instance_id} %date{ISO8601} %F:%L - %msg%n + %-5level [%thread] ${instance_id} %date{"yyyy-MM-dd'T'HH:mm:ss,SSS", UTC} %F:%L - %msg%n true - %-5level %date{HH:mm:ss,SSS} %msg%n + %-5level %date{"HH:mm:ss,SSS"} %msg%n WARN @@ -43,7 +43,7 @@ - %-5level [%thread] ${instance_id} %date{ISO8601} %F:%L - %msg%n + %-5level [%thread] ${instance_id} %date{"yyyy-MM-dd'T'HH:mm:ss,SSS", UTC} %F:%L - %msg%n DEBUG @@ -53,6 +53,11 @@ + + diff --git a/test/conf/logback-dtest_with_slow_query_appender.xml b/test/conf/logback-dtest_with_slow_query_appender.xml new file mode 100644 index 000000000000..62d112d8008b --- /dev/null +++ b/test/conf/logback-dtest_with_slow_query_appender.xml @@ -0,0 +1,63 @@ + + + + + + + + + + + ./build/test/logs/${cassandra.testtag}/${suitename}/${cluster_id}/${instance_id}/system.log + + %-5level [%thread] ${instance_id} %date{"yyyy-MM-dd'T'HH:mm:ss,SSS", UTC} %msg%n + + true + + + + + %-5level %date{"HH:mm:ss,SSS"} %msg%n + + + WARN + + + + + + %-5level [%thread] ${instance_id} %date{"yyyy-MM-dd'T'HH:mm:ss,SSS", UTC} %F:%L - %msg%n + + + DEBUG + + + + + + + + + + + + + + + diff --git a/test/conf/logback-dtest_with_slow_query_appender_invalid.xml b/test/conf/logback-dtest_with_slow_query_appender_invalid.xml new file mode 100644 index 000000000000..1f7f58e86193 --- /dev/null +++ b/test/conf/logback-dtest_with_slow_query_appender_invalid.xml @@ -0,0 +1,73 @@ + + + + + + + + + + + ./build/test/logs/${cassandra.testtag}/${suitename}/${cluster_id}/${instance_id}/system.log + + %-5level [%thread] ${instance_id} %date{"yyyy-MM-dd'T'HH:mm:ss,SSS", UTC} %msg%n + + true + + + + + %-5level %date{"HH:mm:ss,SSS"} %msg%n + + + WARN + + + + + + %-5level [%thread] ${instance_id} %date{"yyyy-MM-dd'T'HH:mm:ss,SSS", UTC} %F:%L - %msg%n + + + DEBUG + + + + + + + + + INFO + + + + + + + + + + + + + + + + diff --git a/test/conf/logback-dtest_with_vtable_appender.xml b/test/conf/logback-dtest_with_vtable_appender.xml index c9fd108c77d8..726c46d524d8 100644 --- a/test/conf/logback-dtest_with_vtable_appender.xml +++ b/test/conf/logback-dtest_with_vtable_appender.xml @@ -21,19 +21,19 @@ - + ./build/test/logs/${cassandra.testtag}/${suitename}/${cluster_id}/${instance_id}/system.log - %-5level [%thread] ${instance_id} %date{ISO8601} %msg%n + %-5level [%thread] ${instance_id} %date{"yyyy-MM-dd'T'HH:mm:ss,SSS", UTC} %msg%n true - %-5level %date{HH:mm:ss,SSS} %msg%n + %-5level %date{"yyyy-MM-dd'T'HH:mm:ss,SSS", UTC} %msg%n WARN @@ -42,7 +42,7 @@ - %-5level [%thread] ${instance_id} %date{ISO8601} %F:%L - %msg%n + %-5level [%thread] ${instance_id} %date{"yyyy-MM-dd'T'HH:mm:ss,SSS", UTC} %F:%L - %msg%n DEBUG diff --git a/test/conf/logback-dtest_with_vtable_appender_invalid.xml b/test/conf/logback-dtest_with_vtable_appender_invalid.xml index 1b30c141c2a7..257f85753498 100644 --- a/test/conf/logback-dtest_with_vtable_appender_invalid.xml +++ b/test/conf/logback-dtest_with_vtable_appender_invalid.xml @@ -21,19 +21,19 @@ - + ./build/test/logs/${cassandra.testtag}/${suitename}/${cluster_id}/${instance_id}/system.log - %-5level [%thread] ${instance_id} %date{ISO8601} %msg%n + %-5level [%thread] ${instance_id} %date{"yyyy-MM-dd'T'HH:mm:ss,SSS", UTC} %msg%n true - %-5level %date{HH:mm:ss,SSS} %msg%n + %-5level %date{"yyyy-MM-dd'T'HH:mm:ss,SSS", UTC} %msg%n WARN @@ -42,7 +42,7 @@ - %-5level [%thread] ${instance_id} %date{ISO8601} %F:%L - %msg%n + %-5level [%thread] ${instance_id} %date{"yyyy-MM-dd'T'HH:mm:ss,SSS", UTC} %F:%L - %msg%n DEBUG diff --git a/test/conf/logback-jmh.xml b/test/conf/logback-jmh.xml index 4138f19c72bc..1f9bb3fd1b86 100644 --- a/test/conf/logback-jmh.xml +++ b/test/conf/logback-jmh.xml @@ -23,7 +23,6 @@ appender reference in the root level section below. --> - @@ -42,7 +41,7 @@ appender reference in the root level section below. 5GB - %-5level [%thread] %date{ISO8601} %F:%L - %msg%n + %-5level [%thread] %date{"yyyy-MM-dd'T'HH:mm:ss,SSS", UTC} %F:%L - %msg%n @@ -59,7 +58,7 @@ appender reference in the root level section below. 5GB - %-5level [%thread] %date{ISO8601} %F:%L - %msg%n + %-5level [%thread] %date{"yyyy-MM-dd'T'HH:mm:ss,SSS", UTC} %F:%L - %msg%n @@ -79,7 +78,7 @@ appender reference in the root level section below. INFO - %-5level [%thread] %date{ISO8601} %F:%L - %msg%n + %-5level [%thread] %date{"yyyy-MM-dd'T'HH:mm:ss,SSS", UTC} %F:%L - %msg%n diff --git a/test/conf/logback-simulator.xml b/test/conf/logback-simulator.xml index d0082d43fa1d..87d2ae327bc8 100644 --- a/test/conf/logback-simulator.xml +++ b/test/conf/logback-simulator.xml @@ -16,24 +16,39 @@ ~ limitations under the License. --> - - - + + + + + - + + + + ./build/test/logs/simulator/${run_start}-${run_seed}/history.log + + %msg%n + + true + + org.apache.cassandra.simulator.paxos.LoggingHistoryValidator + ACCEPT + DENY + + - ./build/test/logs/${cassandra.testtag}/${suitename}/${cluster_id}/${instance_id}/system.log + ./build/test/logs/simulator/${run_start}-${run_seed}/cluster-${cluster_id}/${instance_id}/system.log - %-5level [%thread] ${instance_id} %date{ISO8601} %msg%n + %-5level [%thread] ${instance_id} %replace(CS:%X{command_store} ){'CS\:\s+', ''}%replace(OP:%X{async_op} ){'OP\:\s+', ''}%date{"yyyy-MM-dd'T'HH:mm:ss,SSS", UTC} %msg%n true - %-5level [%thread] ${instance_id} %date{ISO8601} %F:%L - %msg%n + %-5level [%thread] ${instance_id} %replace(CS:%X{command_store} ){'CS\:\s+', ''}%replace(OP:%X{async_op} ){'OP\:\s+', ''}%date{"yyyy-MM-dd'T'HH:mm:ss,SSS", UTC} %F:%L - %msg%n WARN @@ -49,10 +64,12 @@ + + diff --git a/test/conf/logback-test.xml b/test/conf/logback-test.xml index 3e3349fd82f0..757806e35f2b 100644 --- a/test/conf/logback-test.xml +++ b/test/conf/logback-test.xml @@ -19,7 +19,7 @@ - + @@ -38,14 +38,14 @@ - %-5level [%thread] %date{ISO8601} %msg%n + %-5level [%thread] %date{"yyyy-MM-dd'T'HH:mm:ss,SSS", UTC} %msg%n false - %-5level [%thread] %date{ISO8601} %F:%L - %msg%n + %-5level [%thread] %date{"yyyy-MM-dd'T'HH:mm:ss,SSS", UTC} %F:%L - %msg%n DEBUG diff --git a/test/data/config/version=5.0-alpha1.yml b/test/data/config/version=5.0-alpha1.yml index 8dad0f60acc2..19995ce52b88 100644 --- a/test/data/config/version=5.0-alpha1.yml +++ b/test/data/config/version=5.0-alpha1.yml @@ -407,7 +407,7 @@ max_concurrent_automatic_sstable_upgrades: "java.lang.Integer" maximum_replication_factor_warn_threshold: "java.lang.Integer" denylist_reads_enabled: "java.lang.Boolean" permissions_cache_active_update: "java.lang.Boolean" -available_processors: "java.lang.Integer" +available_processors: "org.apache.cassandra.config.OptionaldPositiveInt" file_cache_round_up: "java.lang.Boolean" secondary_indexes_per_table_warn_threshold: "java.lang.Integer" tables_warn_threshold: "java.lang.Integer" diff --git a/test/data/config/version=5.1-alpha1.yml b/test/data/config/version=5.1-alpha1.yml index e730adcdb9bf..de5801c30249 100644 --- a/test/data/config/version=5.1-alpha1.yml +++ b/test/data/config/version=5.1-alpha1.yml @@ -408,7 +408,7 @@ max_concurrent_automatic_sstable_upgrades: "java.lang.Integer" maximum_replication_factor_warn_threshold: "java.lang.Integer" denylist_reads_enabled: "java.lang.Boolean" permissions_cache_active_update: "java.lang.Boolean" -available_processors: "java.lang.Integer" +available_processors: "org.apache.cassandra.config.OptionaldPositiveInt" file_cache_round_up: "java.lang.Boolean" secondary_indexes_per_table_warn_threshold: "java.lang.Integer" tables_warn_threshold: "java.lang.Integer" diff --git a/test/data/legacy-sstables/da/legacy_tables/legacy_da_clust_be_index_summary/da-500-bti-CompressionInfo.db b/test/data/legacy-sstables/da/legacy_tables/legacy_da_clust_be_index_summary/da-500-bti-CompressionInfo.db new file mode 100644 index 000000000000..aa6d6e6a102d Binary files /dev/null and b/test/data/legacy-sstables/da/legacy_tables/legacy_da_clust_be_index_summary/da-500-bti-CompressionInfo.db differ diff --git a/test/data/legacy-sstables/da/legacy_tables/legacy_da_clust_be_index_summary/da-500-bti-Data.db b/test/data/legacy-sstables/da/legacy_tables/legacy_da_clust_be_index_summary/da-500-bti-Data.db new file mode 100644 index 000000000000..c24c3fae61a4 Binary files /dev/null and b/test/data/legacy-sstables/da/legacy_tables/legacy_da_clust_be_index_summary/da-500-bti-Data.db differ diff --git a/test/data/legacy-sstables/da/legacy_tables/legacy_da_clust_be_index_summary/da-500-bti-Digest.crc32 b/test/data/legacy-sstables/da/legacy_tables/legacy_da_clust_be_index_summary/da-500-bti-Digest.crc32 new file mode 100644 index 000000000000..c84b9595a002 --- /dev/null +++ b/test/data/legacy-sstables/da/legacy_tables/legacy_da_clust_be_index_summary/da-500-bti-Digest.crc32 @@ -0,0 +1 @@ +1026070592 \ No newline at end of file diff --git a/test/data/legacy-sstables/da/legacy_tables/legacy_da_clust_be_index_summary/da-500-bti-Filter.db b/test/data/legacy-sstables/da/legacy_tables/legacy_da_clust_be_index_summary/da-500-bti-Filter.db new file mode 100644 index 000000000000..8868e5c18008 Binary files /dev/null and b/test/data/legacy-sstables/da/legacy_tables/legacy_da_clust_be_index_summary/da-500-bti-Filter.db differ diff --git a/test/data/legacy-sstables/da/legacy_tables/legacy_da_clust_be_index_summary/da-500-bti-Partitions.db b/test/data/legacy-sstables/da/legacy_tables/legacy_da_clust_be_index_summary/da-500-bti-Partitions.db new file mode 100644 index 000000000000..6e4273411ff0 Binary files /dev/null and b/test/data/legacy-sstables/da/legacy_tables/legacy_da_clust_be_index_summary/da-500-bti-Partitions.db differ diff --git a/test/data/legacy-sstables/da/legacy_tables/legacy_da_clust_be_index_summary/da-500-bti-Rows.db b/test/data/legacy-sstables/da/legacy_tables/legacy_da_clust_be_index_summary/da-500-bti-Rows.db new file mode 100644 index 000000000000..d2ba4d639b61 Binary files /dev/null and b/test/data/legacy-sstables/da/legacy_tables/legacy_da_clust_be_index_summary/da-500-bti-Rows.db differ diff --git a/test/data/legacy-sstables/da/legacy_tables/legacy_da_clust_be_index_summary/da-500-bti-Statistics.db b/test/data/legacy-sstables/da/legacy_tables/legacy_da_clust_be_index_summary/da-500-bti-Statistics.db new file mode 100644 index 000000000000..fb646831d00a Binary files /dev/null and b/test/data/legacy-sstables/da/legacy_tables/legacy_da_clust_be_index_summary/da-500-bti-Statistics.db differ diff --git a/test/data/legacy-sstables/da/legacy_tables/legacy_da_clust_be_index_summary/da-500-bti-TOC.txt b/test/data/legacy-sstables/da/legacy_tables/legacy_da_clust_be_index_summary/da-500-bti-TOC.txt new file mode 100644 index 000000000000..298910cfdc58 --- /dev/null +++ b/test/data/legacy-sstables/da/legacy_tables/legacy_da_clust_be_index_summary/da-500-bti-TOC.txt @@ -0,0 +1,8 @@ +Data.db +Statistics.db +Digest.crc32 +TOC.txt +CompressionInfo.db +Filter.db +Partitions.db +Rows.db diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_be_index_summary/ma-306-big-CompressionInfo.db b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_be_index_summary/ma-306-big-CompressionInfo.db new file mode 100644 index 000000000000..49ebec126f4a Binary files /dev/null and b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_be_index_summary/ma-306-big-CompressionInfo.db differ diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_be_index_summary/ma-306-big-Data.db b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_be_index_summary/ma-306-big-Data.db new file mode 100644 index 000000000000..4b0fa7ef3507 Binary files /dev/null and b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_be_index_summary/ma-306-big-Data.db differ diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_be_index_summary/ma-306-big-Digest.crc32 b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_be_index_summary/ma-306-big-Digest.crc32 new file mode 100644 index 000000000000..5f0b313cc8a5 --- /dev/null +++ b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_be_index_summary/ma-306-big-Digest.crc32 @@ -0,0 +1 @@ +1371588035 \ No newline at end of file diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_be_index_summary/ma-306-big-Filter.db b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_be_index_summary/ma-306-big-Filter.db new file mode 100644 index 000000000000..2e1d5d29ca06 Binary files /dev/null and b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_be_index_summary/ma-306-big-Filter.db differ diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_be_index_summary/ma-306-big-Index.db b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_be_index_summary/ma-306-big-Index.db new file mode 100644 index 000000000000..ad88ef6efc77 Binary files /dev/null and b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_be_index_summary/ma-306-big-Index.db differ diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_be_index_summary/ma-306-big-Statistics.db b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_be_index_summary/ma-306-big-Statistics.db new file mode 100644 index 000000000000..2c3a57350c2c Binary files /dev/null and b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_be_index_summary/ma-306-big-Statistics.db differ diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_be_index_summary/ma-306-big-Summary.db b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_be_index_summary/ma-306-big-Summary.db new file mode 100644 index 000000000000..0c575b7c1bf6 Binary files /dev/null and b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_be_index_summary/ma-306-big-Summary.db differ diff --git a/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_be_index_summary/ma-306-big-TOC.txt b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_be_index_summary/ma-306-big-TOC.txt new file mode 100644 index 000000000000..dde00207af51 --- /dev/null +++ b/test/data/legacy-sstables/ma/legacy_tables/legacy_ma_clust_be_index_summary/ma-306-big-TOC.txt @@ -0,0 +1,8 @@ +CompressionInfo.db +Data.db +Digest.crc32 +Summary.db +Index.db +Statistics.db +TOC.txt +Filter.db diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_be_index_summary/mb-307-big-CompressionInfo.db b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_be_index_summary/mb-307-big-CompressionInfo.db new file mode 100644 index 000000000000..589a4160f5ff Binary files /dev/null and b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_be_index_summary/mb-307-big-CompressionInfo.db differ diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_be_index_summary/mb-307-big-Data.db b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_be_index_summary/mb-307-big-Data.db new file mode 100644 index 000000000000..4b4fe735f53f Binary files /dev/null and b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_be_index_summary/mb-307-big-Data.db differ diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_be_index_summary/mb-307-big-Digest.crc32 b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_be_index_summary/mb-307-big-Digest.crc32 new file mode 100644 index 000000000000..cc0ff47a6beb --- /dev/null +++ b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_be_index_summary/mb-307-big-Digest.crc32 @@ -0,0 +1 @@ +849406636 \ No newline at end of file diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_be_index_summary/mb-307-big-Filter.db b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_be_index_summary/mb-307-big-Filter.db new file mode 100644 index 000000000000..2e1d5d29ca06 Binary files /dev/null and b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_be_index_summary/mb-307-big-Filter.db differ diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_be_index_summary/mb-307-big-Index.db b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_be_index_summary/mb-307-big-Index.db new file mode 100644 index 000000000000..f1bccb20c571 Binary files /dev/null and b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_be_index_summary/mb-307-big-Index.db differ diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_be_index_summary/mb-307-big-Statistics.db b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_be_index_summary/mb-307-big-Statistics.db new file mode 100644 index 000000000000..7510a11124cd Binary files /dev/null and b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_be_index_summary/mb-307-big-Statistics.db differ diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_be_index_summary/mb-307-big-Summary.db b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_be_index_summary/mb-307-big-Summary.db new file mode 100644 index 000000000000..0c575b7c1bf6 Binary files /dev/null and b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_be_index_summary/mb-307-big-Summary.db differ diff --git a/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_be_index_summary/mb-307-big-TOC.txt b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_be_index_summary/mb-307-big-TOC.txt new file mode 100644 index 000000000000..6cd09785abec --- /dev/null +++ b/test/data/legacy-sstables/mb/legacy_tables/legacy_mb_clust_be_index_summary/mb-307-big-TOC.txt @@ -0,0 +1,8 @@ +Index.db +Filter.db +TOC.txt +Digest.crc32 +Summary.db +Data.db +Statistics.db +CompressionInfo.db diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_be_index_summary/mc-3113-big-CompressionInfo.db b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_be_index_summary/mc-3113-big-CompressionInfo.db new file mode 100644 index 000000000000..8a0a148b64c0 Binary files /dev/null and b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_be_index_summary/mc-3113-big-CompressionInfo.db differ diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_be_index_summary/mc-3113-big-Data.db b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_be_index_summary/mc-3113-big-Data.db new file mode 100644 index 000000000000..5208946c1d29 Binary files /dev/null and b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_be_index_summary/mc-3113-big-Data.db differ diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_be_index_summary/mc-3113-big-Digest.crc32 b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_be_index_summary/mc-3113-big-Digest.crc32 new file mode 100644 index 000000000000..3f9e06243713 --- /dev/null +++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_be_index_summary/mc-3113-big-Digest.crc32 @@ -0,0 +1 @@ +1373250029 \ No newline at end of file diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_be_index_summary/mc-3113-big-Filter.db b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_be_index_summary/mc-3113-big-Filter.db new file mode 100644 index 000000000000..2e1d5d29ca06 Binary files /dev/null and b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_be_index_summary/mc-3113-big-Filter.db differ diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_be_index_summary/mc-3113-big-Index.db b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_be_index_summary/mc-3113-big-Index.db new file mode 100644 index 000000000000..f1bccb20c571 Binary files /dev/null and b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_be_index_summary/mc-3113-big-Index.db differ diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_be_index_summary/mc-3113-big-Statistics.db b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_be_index_summary/mc-3113-big-Statistics.db new file mode 100644 index 000000000000..dba6ce739c22 Binary files /dev/null and b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_be_index_summary/mc-3113-big-Statistics.db differ diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_be_index_summary/mc-3113-big-Summary.db b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_be_index_summary/mc-3113-big-Summary.db new file mode 100644 index 000000000000..0c575b7c1bf6 Binary files /dev/null and b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_be_index_summary/mc-3113-big-Summary.db differ diff --git a/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_be_index_summary/mc-3113-big-TOC.txt b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_be_index_summary/mc-3113-big-TOC.txt new file mode 100644 index 000000000000..19154fb1404d --- /dev/null +++ b/test/data/legacy-sstables/mc/legacy_tables/legacy_mc_clust_be_index_summary/mc-3113-big-TOC.txt @@ -0,0 +1,8 @@ +Summary.db +CompressionInfo.db +Digest.crc32 +Statistics.db +Index.db +Data.db +Filter.db +TOC.txt diff --git a/test/data/legacy-sstables/md/legacy_tables/legacy_md_clust_be_index_summary/md-31110-big-CompressionInfo.db b/test/data/legacy-sstables/md/legacy_tables/legacy_md_clust_be_index_summary/md-31110-big-CompressionInfo.db new file mode 100644 index 000000000000..e79d9784589a Binary files /dev/null and b/test/data/legacy-sstables/md/legacy_tables/legacy_md_clust_be_index_summary/md-31110-big-CompressionInfo.db differ diff --git a/test/data/legacy-sstables/md/legacy_tables/legacy_md_clust_be_index_summary/md-31110-big-Data.db b/test/data/legacy-sstables/md/legacy_tables/legacy_md_clust_be_index_summary/md-31110-big-Data.db new file mode 100644 index 000000000000..a6a94e6b5ec3 Binary files /dev/null and b/test/data/legacy-sstables/md/legacy_tables/legacy_md_clust_be_index_summary/md-31110-big-Data.db differ diff --git a/test/data/legacy-sstables/md/legacy_tables/legacy_md_clust_be_index_summary/md-31110-big-Digest.crc32 b/test/data/legacy-sstables/md/legacy_tables/legacy_md_clust_be_index_summary/md-31110-big-Digest.crc32 new file mode 100644 index 000000000000..44c7c2710e14 --- /dev/null +++ b/test/data/legacy-sstables/md/legacy_tables/legacy_md_clust_be_index_summary/md-31110-big-Digest.crc32 @@ -0,0 +1 @@ +2461228597 \ No newline at end of file diff --git a/test/data/legacy-sstables/md/legacy_tables/legacy_md_clust_be_index_summary/md-31110-big-Filter.db b/test/data/legacy-sstables/md/legacy_tables/legacy_md_clust_be_index_summary/md-31110-big-Filter.db new file mode 100644 index 000000000000..2e1d5d29ca06 Binary files /dev/null and b/test/data/legacy-sstables/md/legacy_tables/legacy_md_clust_be_index_summary/md-31110-big-Filter.db differ diff --git a/test/data/legacy-sstables/md/legacy_tables/legacy_md_clust_be_index_summary/md-31110-big-Index.db b/test/data/legacy-sstables/md/legacy_tables/legacy_md_clust_be_index_summary/md-31110-big-Index.db new file mode 100644 index 000000000000..0860005fb9d8 Binary files /dev/null and b/test/data/legacy-sstables/md/legacy_tables/legacy_md_clust_be_index_summary/md-31110-big-Index.db differ diff --git a/test/data/legacy-sstables/md/legacy_tables/legacy_md_clust_be_index_summary/md-31110-big-Statistics.db b/test/data/legacy-sstables/md/legacy_tables/legacy_md_clust_be_index_summary/md-31110-big-Statistics.db new file mode 100644 index 000000000000..9288af336e4c Binary files /dev/null and b/test/data/legacy-sstables/md/legacy_tables/legacy_md_clust_be_index_summary/md-31110-big-Statistics.db differ diff --git a/test/data/legacy-sstables/md/legacy_tables/legacy_md_clust_be_index_summary/md-31110-big-Summary.db b/test/data/legacy-sstables/md/legacy_tables/legacy_md_clust_be_index_summary/md-31110-big-Summary.db new file mode 100644 index 000000000000..0c575b7c1bf6 Binary files /dev/null and b/test/data/legacy-sstables/md/legacy_tables/legacy_md_clust_be_index_summary/md-31110-big-Summary.db differ diff --git a/test/data/legacy-sstables/md/legacy_tables/legacy_md_clust_be_index_summary/md-31110-big-TOC.txt b/test/data/legacy-sstables/md/legacy_tables/legacy_md_clust_be_index_summary/md-31110-big-TOC.txt new file mode 100644 index 000000000000..fa38f567c160 --- /dev/null +++ b/test/data/legacy-sstables/md/legacy_tables/legacy_md_clust_be_index_summary/md-31110-big-TOC.txt @@ -0,0 +1,8 @@ +Data.db +Statistics.db +Filter.db +Summary.db +CompressionInfo.db +Index.db +TOC.txt +Digest.crc32 diff --git a/test/data/legacy-sstables/me/legacy_tables/legacy_me_clust_be_index_summary/me-31111-big-CompressionInfo.db b/test/data/legacy-sstables/me/legacy_tables/legacy_me_clust_be_index_summary/me-31111-big-CompressionInfo.db new file mode 100644 index 000000000000..f807a43f3e96 Binary files /dev/null and b/test/data/legacy-sstables/me/legacy_tables/legacy_me_clust_be_index_summary/me-31111-big-CompressionInfo.db differ diff --git a/test/data/legacy-sstables/me/legacy_tables/legacy_me_clust_be_index_summary/me-31111-big-Data.db b/test/data/legacy-sstables/me/legacy_tables/legacy_me_clust_be_index_summary/me-31111-big-Data.db new file mode 100644 index 000000000000..6fb056218cc9 Binary files /dev/null and b/test/data/legacy-sstables/me/legacy_tables/legacy_me_clust_be_index_summary/me-31111-big-Data.db differ diff --git a/test/data/legacy-sstables/me/legacy_tables/legacy_me_clust_be_index_summary/me-31111-big-Digest.crc32 b/test/data/legacy-sstables/me/legacy_tables/legacy_me_clust_be_index_summary/me-31111-big-Digest.crc32 new file mode 100644 index 000000000000..4d40edafc7ad --- /dev/null +++ b/test/data/legacy-sstables/me/legacy_tables/legacy_me_clust_be_index_summary/me-31111-big-Digest.crc32 @@ -0,0 +1 @@ +3525076442 \ No newline at end of file diff --git a/test/data/legacy-sstables/me/legacy_tables/legacy_me_clust_be_index_summary/me-31111-big-Filter.db b/test/data/legacy-sstables/me/legacy_tables/legacy_me_clust_be_index_summary/me-31111-big-Filter.db new file mode 100644 index 000000000000..2e1d5d29ca06 Binary files /dev/null and b/test/data/legacy-sstables/me/legacy_tables/legacy_me_clust_be_index_summary/me-31111-big-Filter.db differ diff --git a/test/data/legacy-sstables/me/legacy_tables/legacy_me_clust_be_index_summary/me-31111-big-Index.db b/test/data/legacy-sstables/me/legacy_tables/legacy_me_clust_be_index_summary/me-31111-big-Index.db new file mode 100644 index 000000000000..2fd4d9b85fe5 Binary files /dev/null and b/test/data/legacy-sstables/me/legacy_tables/legacy_me_clust_be_index_summary/me-31111-big-Index.db differ diff --git a/test/data/legacy-sstables/me/legacy_tables/legacy_me_clust_be_index_summary/me-31111-big-Statistics.db b/test/data/legacy-sstables/me/legacy_tables/legacy_me_clust_be_index_summary/me-31111-big-Statistics.db new file mode 100644 index 000000000000..2482df5a8151 Binary files /dev/null and b/test/data/legacy-sstables/me/legacy_tables/legacy_me_clust_be_index_summary/me-31111-big-Statistics.db differ diff --git a/test/data/legacy-sstables/me/legacy_tables/legacy_me_clust_be_index_summary/me-31111-big-Summary.db b/test/data/legacy-sstables/me/legacy_tables/legacy_me_clust_be_index_summary/me-31111-big-Summary.db new file mode 100644 index 000000000000..0c575b7c1bf6 Binary files /dev/null and b/test/data/legacy-sstables/me/legacy_tables/legacy_me_clust_be_index_summary/me-31111-big-Summary.db differ diff --git a/test/data/legacy-sstables/me/legacy_tables/legacy_me_clust_be_index_summary/me-31111-big-TOC.txt b/test/data/legacy-sstables/me/legacy_tables/legacy_me_clust_be_index_summary/me-31111-big-TOC.txt new file mode 100644 index 000000000000..b91824d28955 --- /dev/null +++ b/test/data/legacy-sstables/me/legacy_tables/legacy_me_clust_be_index_summary/me-31111-big-TOC.txt @@ -0,0 +1,8 @@ +Index.db +CompressionInfo.db +Statistics.db +TOC.txt +Summary.db +Digest.crc32 +Data.db +Filter.db diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_be_index_summary/na-40-big-CompressionInfo.db b/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_be_index_summary/na-40-big-CompressionInfo.db new file mode 100644 index 000000000000..8ded333178f7 Binary files /dev/null and b/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_be_index_summary/na-40-big-CompressionInfo.db differ diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_be_index_summary/na-40-big-Data.db b/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_be_index_summary/na-40-big-Data.db new file mode 100644 index 000000000000..0bbfe1acd569 Binary files /dev/null and b/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_be_index_summary/na-40-big-Data.db differ diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_be_index_summary/na-40-big-Digest.crc32 b/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_be_index_summary/na-40-big-Digest.crc32 new file mode 100644 index 000000000000..7ffa3bc3c73b --- /dev/null +++ b/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_be_index_summary/na-40-big-Digest.crc32 @@ -0,0 +1 @@ +309317098 \ No newline at end of file diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_be_index_summary/na-40-big-Filter.db b/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_be_index_summary/na-40-big-Filter.db new file mode 100644 index 000000000000..8868e5c18008 Binary files /dev/null and b/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_be_index_summary/na-40-big-Filter.db differ diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_be_index_summary/na-40-big-Index.db b/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_be_index_summary/na-40-big-Index.db new file mode 100644 index 000000000000..e04d3dbda92e Binary files /dev/null and b/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_be_index_summary/na-40-big-Index.db differ diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_be_index_summary/na-40-big-Statistics.db b/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_be_index_summary/na-40-big-Statistics.db new file mode 100644 index 000000000000..804ccf248f87 Binary files /dev/null and b/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_be_index_summary/na-40-big-Statistics.db differ diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_be_index_summary/na-40-big-Summary.db b/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_be_index_summary/na-40-big-Summary.db new file mode 100644 index 000000000000..0c575b7c1bf6 Binary files /dev/null and b/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_be_index_summary/na-40-big-Summary.db differ diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_be_index_summary/na-40-big-TOC.txt b/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_be_index_summary/na-40-big-TOC.txt new file mode 100644 index 000000000000..203013437a0b --- /dev/null +++ b/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_be_index_summary/na-40-big-TOC.txt @@ -0,0 +1,8 @@ +TOC.txt +Data.db +Index.db +Statistics.db +Digest.crc32 +CompressionInfo.db +Filter.db +Summary.db diff --git a/test/data/legacy-sstables/nb/legacy_tables/legacy_nb_clust_be_index_summary/nb-400-big-CompressionInfo.db b/test/data/legacy-sstables/nb/legacy_tables/legacy_nb_clust_be_index_summary/nb-400-big-CompressionInfo.db new file mode 100644 index 000000000000..c34e6728465e Binary files /dev/null and b/test/data/legacy-sstables/nb/legacy_tables/legacy_nb_clust_be_index_summary/nb-400-big-CompressionInfo.db differ diff --git a/test/data/legacy-sstables/nb/legacy_tables/legacy_nb_clust_be_index_summary/nb-400-big-Data.db b/test/data/legacy-sstables/nb/legacy_tables/legacy_nb_clust_be_index_summary/nb-400-big-Data.db new file mode 100644 index 000000000000..becb2d28fbd6 Binary files /dev/null and b/test/data/legacy-sstables/nb/legacy_tables/legacy_nb_clust_be_index_summary/nb-400-big-Data.db differ diff --git a/test/data/legacy-sstables/nb/legacy_tables/legacy_nb_clust_be_index_summary/nb-400-big-Digest.crc32 b/test/data/legacy-sstables/nb/legacy_tables/legacy_nb_clust_be_index_summary/nb-400-big-Digest.crc32 new file mode 100644 index 000000000000..b0e4cbb28e6d --- /dev/null +++ b/test/data/legacy-sstables/nb/legacy_tables/legacy_nb_clust_be_index_summary/nb-400-big-Digest.crc32 @@ -0,0 +1 @@ +3874015080 \ No newline at end of file diff --git a/test/data/legacy-sstables/nb/legacy_tables/legacy_nb_clust_be_index_summary/nb-400-big-Filter.db b/test/data/legacy-sstables/nb/legacy_tables/legacy_nb_clust_be_index_summary/nb-400-big-Filter.db new file mode 100644 index 000000000000..8868e5c18008 Binary files /dev/null and b/test/data/legacy-sstables/nb/legacy_tables/legacy_nb_clust_be_index_summary/nb-400-big-Filter.db differ diff --git a/test/data/legacy-sstables/nb/legacy_tables/legacy_nb_clust_be_index_summary/nb-400-big-Index.db b/test/data/legacy-sstables/nb/legacy_tables/legacy_nb_clust_be_index_summary/nb-400-big-Index.db new file mode 100644 index 000000000000..d3b366dd185c Binary files /dev/null and b/test/data/legacy-sstables/nb/legacy_tables/legacy_nb_clust_be_index_summary/nb-400-big-Index.db differ diff --git a/test/data/legacy-sstables/nb/legacy_tables/legacy_nb_clust_be_index_summary/nb-400-big-Statistics.db b/test/data/legacy-sstables/nb/legacy_tables/legacy_nb_clust_be_index_summary/nb-400-big-Statistics.db new file mode 100644 index 000000000000..f49e76531f64 Binary files /dev/null and b/test/data/legacy-sstables/nb/legacy_tables/legacy_nb_clust_be_index_summary/nb-400-big-Statistics.db differ diff --git a/test/data/legacy-sstables/nb/legacy_tables/legacy_nb_clust_be_index_summary/nb-400-big-Summary.db b/test/data/legacy-sstables/nb/legacy_tables/legacy_nb_clust_be_index_summary/nb-400-big-Summary.db new file mode 100644 index 000000000000..0c575b7c1bf6 Binary files /dev/null and b/test/data/legacy-sstables/nb/legacy_tables/legacy_nb_clust_be_index_summary/nb-400-big-Summary.db differ diff --git a/test/data/legacy-sstables/nb/legacy_tables/legacy_nb_clust_be_index_summary/nb-400-big-TOC.txt b/test/data/legacy-sstables/nb/legacy_tables/legacy_nb_clust_be_index_summary/nb-400-big-TOC.txt new file mode 100644 index 000000000000..dc0664391ebb --- /dev/null +++ b/test/data/legacy-sstables/nb/legacy_tables/legacy_nb_clust_be_index_summary/nb-400-big-TOC.txt @@ -0,0 +1,8 @@ +Data.db +TOC.txt +Digest.crc32 +Summary.db +Index.db +Filter.db +CompressionInfo.db +Statistics.db diff --git a/test/data/legacy-sstables/oa/legacy_tables/legacy_oa_clust_be_index_summary/oa-500-big-CompressionInfo.db b/test/data/legacy-sstables/oa/legacy_tables/legacy_oa_clust_be_index_summary/oa-500-big-CompressionInfo.db new file mode 100644 index 000000000000..3d328b087561 Binary files /dev/null and b/test/data/legacy-sstables/oa/legacy_tables/legacy_oa_clust_be_index_summary/oa-500-big-CompressionInfo.db differ diff --git a/test/data/legacy-sstables/oa/legacy_tables/legacy_oa_clust_be_index_summary/oa-500-big-Data.db b/test/data/legacy-sstables/oa/legacy_tables/legacy_oa_clust_be_index_summary/oa-500-big-Data.db new file mode 100644 index 000000000000..7d422681cb7a Binary files /dev/null and b/test/data/legacy-sstables/oa/legacy_tables/legacy_oa_clust_be_index_summary/oa-500-big-Data.db differ diff --git a/test/data/legacy-sstables/oa/legacy_tables/legacy_oa_clust_be_index_summary/oa-500-big-Digest.crc32 b/test/data/legacy-sstables/oa/legacy_tables/legacy_oa_clust_be_index_summary/oa-500-big-Digest.crc32 new file mode 100644 index 000000000000..e86ec54b54e0 --- /dev/null +++ b/test/data/legacy-sstables/oa/legacy_tables/legacy_oa_clust_be_index_summary/oa-500-big-Digest.crc32 @@ -0,0 +1 @@ +1158768921 \ No newline at end of file diff --git a/test/data/legacy-sstables/oa/legacy_tables/legacy_oa_clust_be_index_summary/oa-500-big-Filter.db b/test/data/legacy-sstables/oa/legacy_tables/legacy_oa_clust_be_index_summary/oa-500-big-Filter.db new file mode 100644 index 000000000000..8868e5c18008 Binary files /dev/null and b/test/data/legacy-sstables/oa/legacy_tables/legacy_oa_clust_be_index_summary/oa-500-big-Filter.db differ diff --git a/test/data/legacy-sstables/oa/legacy_tables/legacy_oa_clust_be_index_summary/oa-500-big-Index.db b/test/data/legacy-sstables/oa/legacy_tables/legacy_oa_clust_be_index_summary/oa-500-big-Index.db new file mode 100644 index 000000000000..46ab61e61f37 Binary files /dev/null and b/test/data/legacy-sstables/oa/legacy_tables/legacy_oa_clust_be_index_summary/oa-500-big-Index.db differ diff --git a/test/data/legacy-sstables/oa/legacy_tables/legacy_oa_clust_be_index_summary/oa-500-big-Statistics.db b/test/data/legacy-sstables/oa/legacy_tables/legacy_oa_clust_be_index_summary/oa-500-big-Statistics.db new file mode 100644 index 000000000000..a548e130d2e9 Binary files /dev/null and b/test/data/legacy-sstables/oa/legacy_tables/legacy_oa_clust_be_index_summary/oa-500-big-Statistics.db differ diff --git a/test/data/legacy-sstables/oa/legacy_tables/legacy_oa_clust_be_index_summary/oa-500-big-Summary.db b/test/data/legacy-sstables/oa/legacy_tables/legacy_oa_clust_be_index_summary/oa-500-big-Summary.db new file mode 100644 index 000000000000..0c575b7c1bf6 Binary files /dev/null and b/test/data/legacy-sstables/oa/legacy_tables/legacy_oa_clust_be_index_summary/oa-500-big-Summary.db differ diff --git a/test/data/legacy-sstables/oa/legacy_tables/legacy_oa_clust_be_index_summary/oa-500-big-TOC.txt b/test/data/legacy-sstables/oa/legacy_tables/legacy_oa_clust_be_index_summary/oa-500-big-TOC.txt new file mode 100644 index 000000000000..576c5e0598c5 --- /dev/null +++ b/test/data/legacy-sstables/oa/legacy_tables/legacy_oa_clust_be_index_summary/oa-500-big-TOC.txt @@ -0,0 +1,8 @@ +Data.db +Statistics.db +Digest.crc32 +TOC.txt +CompressionInfo.db +Filter.db +Index.db +Summary.db diff --git a/test/data/serialization/5.0/service.SyncComplete.bin b/test/data/serialization/5.0/service.SyncComplete.bin index 7c775cef6601..d8465dbf61e8 100644 Binary files a/test/data/serialization/5.0/service.SyncComplete.bin and b/test/data/serialization/5.0/service.SyncComplete.bin differ diff --git a/test/data/serialization/5.0/service.ValidationComplete.bin b/test/data/serialization/5.0/service.ValidationComplete.bin index f0c502458a5f..c595fbcc0f6c 100644 Binary files a/test/data/serialization/5.0/service.ValidationComplete.bin and b/test/data/serialization/5.0/service.ValidationComplete.bin differ diff --git a/test/data/serialization/5.1/service.SyncComplete.bin b/test/data/serialization/5.1/service.SyncComplete.bin index b5f3633e7b69..5e27e345fdd5 100644 Binary files a/test/data/serialization/5.1/service.SyncComplete.bin and b/test/data/serialization/5.1/service.SyncComplete.bin differ diff --git a/test/distributed/org/apache/cassandra/distributed/Cluster.java b/test/distributed/org/apache/cassandra/distributed/Cluster.java index 4effc4e9c49a..07e64041c68b 100644 --- a/test/distributed/org/apache/cassandra/distributed/Cluster.java +++ b/test/distributed/org/apache/cassandra/distributed/Cluster.java @@ -76,6 +76,11 @@ public Builder() } } + public void forEach(IIsolatedExecutor.SerializableRunnable runnable) + { + forEach(i -> i.runOnInstance(runnable)); + } + public void enableMessageLogging() { filters().allVerbs().inbound().messagesMatching((from, to, msg) -> { diff --git a/test/distributed/org/apache/cassandra/distributed/api/ICoordinator.java b/test/distributed/org/apache/cassandra/distributed/api/ICoordinator.java index aee6aeaeb8df..4adaff4c4d0e 100644 --- a/test/distributed/org/apache/cassandra/distributed/api/ICoordinator.java +++ b/test/distributed/org/apache/cassandra/distributed/api/ICoordinator.java @@ -18,11 +18,12 @@ package org.apache.cassandra.distributed.api; +import org.apache.cassandra.distributed.shared.FutureUtils; + import java.util.Iterator; import java.util.UUID; import java.util.concurrent.Future; - -import org.apache.cassandra.distributed.shared.FutureUtils; +import java.util.function.BiConsumer; // The cross-version API requires that a Coordinator can be constructed without any constructor arguments public interface ICoordinator @@ -60,6 +61,8 @@ default Object[][] execute(String query, ConsistencyLevel serialConsistencyLevel } SimpleQueryResult executeWithResult(String query, ConsistencyLevel consistencyLevel, Object... boundValues); + Future executeWithResult(BiConsumer callback, String query, ConsistencyLevel consistencyLevel, Object... boundValues); + Future executeWithResult(BiConsumer callback, String query, ConsistencyLevel serialConsistencyLevel, ConsistencyLevel commitConsistencyLevel, Object... boundValues); default SimpleQueryResult executeWithResult(String query, ConsistencyLevel serialConsistencyLevel, ConsistencyLevel commitConsistencyLevel, Object... boundValues) { @@ -79,6 +82,7 @@ default Future asyncExecuteWithTracing(UUID sessionId, String query, } Future asyncExecuteWithTracingWithResult(UUID sessionId, String query, ConsistencyLevel consistencyLevel, Object... boundValues); + Future asyncExecuteWithResult(String query, ConsistencyLevel consistencyLevel, Object... boundValues); default Object[][] executeWithTracing(UUID sessionId, String query, ConsistencyLevel consistencyLevel, Object... boundValues) { diff --git a/test/distributed/org/apache/cassandra/distributed/api/QueryResults.java b/test/distributed/org/apache/cassandra/distributed/api/QueryResults.java index 081d06a525eb..46ce489dd1ae 100644 --- a/test/distributed/org/apache/cassandra/distributed/api/QueryResults.java +++ b/test/distributed/org/apache/cassandra/distributed/api/QueryResults.java @@ -25,6 +25,11 @@ import java.util.NoSuchElementException; import java.util.Objects; import java.util.function.Predicate; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import com.google.common.base.Preconditions; +import com.google.common.collect.AbstractIterator; public final class QueryResults { @@ -69,6 +74,34 @@ public static QueryResult filter(QueryResult result, Predicate fn) return new FilterQueryResult(result, fn); } + public static Iterable> stringify(SimpleQueryResult qr) + { + return stringify(qr, -1); + } + + public static Iterable> stringify(SimpleQueryResult qr, int maxColumnSize) + { + Preconditions.checkArgument(maxColumnSize == -1 || maxColumnSize > 0, "max column size must be positive or -1 (disabled); given %s", maxColumnSize); + qr.mark(); + return () -> { + qr.reset(); + return new AbstractIterator<>() + { + @Override + protected List computeNext() + { + if (!qr.hasNext()) + return endOfData(); + Row next = qr.next(); + Stream stream = Stream.of(next.toObjectArray()).map(Objects::toString); + if (maxColumnSize != -1) + stream = stream.map(s -> s.length() > maxColumnSize ? s.substring(0, maxColumnSize) + "..." : s); + return stream.collect(Collectors.toList()); + } + }; + }; + } + public static Builder builder() { return new Builder(); diff --git a/test/distributed/org/apache/cassandra/distributed/api/Row.java b/test/distributed/org/apache/cassandra/distributed/api/Row.java index 9d08cb2859c5..711627881c60 100644 --- a/test/distributed/org/apache/cassandra/distributed/api/Row.java +++ b/test/distributed/org/apache/cassandra/distributed/api/Row.java @@ -21,6 +21,7 @@ import java.util.Arrays; import java.util.Date; import java.util.HashMap; +import java.util.List; import java.util.Map; import java.util.NoSuchElementException; import java.util.Objects; @@ -85,6 +86,17 @@ public T get(int index) return (T) results[index]; } + public T get(int index, T defaultValue) + { + checkAccess(); + if (index < 0 || index >= results.length) + throw new NoSuchElementException("by index: " + index); + T result = (T) results[index]; + if (result == null) + return defaultValue; + return result; + } + public T get(String name) { checkAccess(); @@ -94,66 +106,158 @@ public T get(String name) return (T) results[idx]; } + public T get(String name, T defaultValue) + { + checkAccess(); + int idx = findIndex(name); + if (idx == NOT_FOUND) + throw new NoSuchElementException("by name: " + name); + T result = (T) results[idx]; + if (result == null) + return defaultValue; + return result; + } + + public Boolean getBoolean(int index) + { + return get(index); + } + + public Boolean getBoolean(int index, Boolean defaultValue) + { + return get(index, defaultValue); + } + + public Boolean getBoolean(String name) + { + return get(name); + } + + public Boolean getBoolean(String name, Boolean defaultValue) + { + return get(name, defaultValue); + } + public Short getShort(int index) { return get(index); } + public Short getShort(int index, Short defaultValue) + { + return get(index, defaultValue); + } + public Short getShort(String name) { return get(name); } + public Short getShort(String name, Short defaultValue) + { + return get(name, defaultValue); + } + public Integer getInteger(int index) { return get(index); } + public Integer getInteger(int index, Integer defaultValue) + { + return get(index, defaultValue); + } + public Integer getInteger(String name) { return get(name); } + public Integer getInteger(String name, Integer defaultValue) + { + return get(name, defaultValue); + } + public Long getLong(int index) { return get(index); } + public Long getLong(int index, Long defaultValue) + { + return get(index, defaultValue); + } + public Long getLong(String name) { return get(name); } + public Long getLong(String name, Long defaultValue) + { + return get(name, defaultValue); + } + public Float getFloat(int index) { return get(index); } + public Float getFloat(int index, Float defaultValue) + { + return get(index, defaultValue); + } + public Float getFloat(String name) { return get(name); } + public Float getFloat(String name, Float defaultValue) + { + return get(name, defaultValue); + } + public Double getDouble(int index) { return get(index); } + public Double getDouble(int index, Double defaultValue) + { + return get(index, defaultValue); + } + public Double getDouble(String name) { return get(name); } + public Double getDouble(String name, Double defaultValue) + { + return get(name, defaultValue); + } + public String getString(int index) { return get(index); } + public String getString(int index, String defaultValue) + { + return get(index, defaultValue); + } + public String getString(String name) { return get(name); } + public String getString(String name, String defaultValue) + { + return get(name, defaultValue); + } + public UUID getUUID(int index) { Object uuid = get(index); @@ -162,6 +266,14 @@ public UUID getUUID(int index) return (UUID) uuid; } + public UUID getUUID(int index, UUID defaultValue) + { + Object uuid = get(index, defaultValue); + if (uuid instanceof TimeUUID) + return ((TimeUUID) uuid).asUUID(); + return (UUID) uuid; + } + public UUID getUUID(String name) { Object uuid = get(name); @@ -170,26 +282,74 @@ public UUID getUUID(String name) return (UUID) uuid; } + public UUID getUUID(String name, UUID defaultValue) + { + Object uuid = get(name, defaultValue); + if (uuid instanceof TimeUUID) + return ((TimeUUID) uuid).asUUID(); + return (UUID) uuid; + } + public Date getTimestamp(int index) { return get(index); } + public Date getTimestamp(int index, Date defaultValue) + { + return get(index, defaultValue); + } + public Date getTimestamp(String name) { return get(name); } + public Date getTimestamp(String name, Date defaultValue) + { + return get(name, defaultValue); + } + public Set getSet(int index) { return get(index); } + public Set getSet(int index, Set defaultValue) + { + return get(index, defaultValue); + } + public Set getSet(String name) { return get(name); } + public Set getSet(String name, Set defaultValue) + { + return get(name, defaultValue); + } + + public List getList(int index) + { + return get(index); + } + + public List getList(int index, List defaultValue) + { + return get(index, defaultValue); + } + + public List getList(String name) + { + return get(name); + } + + public List getList(String name, List defaultValue) + { + return get(name, defaultValue); + } + /** * Get the row as a array. */ diff --git a/test/distributed/org/apache/cassandra/distributed/api/SimpleQueryResult.java b/test/distributed/org/apache/cassandra/distributed/api/SimpleQueryResult.java index 2b71e8b8b17f..d49ed57b2f58 100644 --- a/test/distributed/org/apache/cassandra/distributed/api/SimpleQueryResult.java +++ b/test/distributed/org/apache/cassandra/distributed/api/SimpleQueryResult.java @@ -61,12 +61,15 @@ */ public class SimpleQueryResult implements QueryResult { + private static final int NO_MARK = -1; + private final String[] names; private final Object[][] results; private final List warnings; private final Predicate filter; private final Row row; - private int offset = -1; + private int offset = NO_MARK; + private int mark = NO_MARK; public SimpleQueryResult(String[] names, Object[][] results) { @@ -108,12 +111,18 @@ public SimpleQueryResult filter(Predicate fn) return new SimpleQueryResult(names, results, filter.and(fn), offset); } + public void mark() + { + mark = offset; + } + /** * Reset the cursor to the start of the query result; if the query result has not been iterated, this has no effect. */ public void reset() { - offset = -1; + offset = mark; + mark = NO_MARK; row.setResults(null); } diff --git a/test/distributed/org/apache/cassandra/distributed/impl/AbstractCluster.java b/test/distributed/org/apache/cassandra/distributed/impl/AbstractCluster.java index 7f98f9003d13..6cd9c8e60a80 100644 --- a/test/distributed/org/apache/cassandra/distributed/impl/AbstractCluster.java +++ b/test/distributed/org/apache/cassandra/distributed/impl/AbstractCluster.java @@ -20,6 +20,7 @@ import java.io.IOException; import java.lang.annotation.Annotation; +import java.lang.reflect.Constructor; import java.lang.reflect.Field; import java.lang.reflect.InvocationTargetException; import java.lang.reflect.Method; @@ -29,6 +30,7 @@ import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collection; import java.util.Collections; import java.util.HashMap; @@ -54,14 +56,18 @@ import javax.annotation.concurrent.GuardedBy; import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Throwables; import com.google.common.collect.ImmutableSet; +import com.google.common.collect.LinkedHashMultimap; import org.junit.Assume; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.cassandra.concurrent.ExecutorFactory; import org.apache.cassandra.config.CassandraRelevantProperties; import org.apache.cassandra.dht.IPartitioner; import org.apache.cassandra.dht.Token; +import org.apache.cassandra.distributed.Cluster; import org.apache.cassandra.distributed.Constants; import org.apache.cassandra.distributed.api.ConsistencyLevel; import org.apache.cassandra.distributed.api.Feature; @@ -88,6 +94,7 @@ import org.apache.cassandra.distributed.shared.Versions; import org.apache.cassandra.io.util.PathUtils; import org.apache.cassandra.net.Verb; +import org.apache.cassandra.utils.AssertionUtils; import org.apache.cassandra.utils.FBUtilities; import org.apache.cassandra.utils.Isolated; import org.apache.cassandra.utils.Shared; @@ -165,7 +172,6 @@ public abstract class AbstractCluster implements ICluster ignoreUncaughtThrowable = null; private final List uncaughtExceptions = new CopyOnWriteArrayList<>(); @@ -191,6 +197,29 @@ public static abstract class AbstractBuilder { + try + { + Class ef = classLoader.loadClass(ExecutorFactory.class.getName()); + Class efd = classLoader.loadClass(ExecutorFactory.Default.class.getName()); + Constructor newEfd = efd.getConstructor(ClassLoader.class, ThreadGroup.class, Thread.UncaughtExceptionHandler.class); + Object executorFactory = newEfd.newInstance(classLoader, threadGroup, threadGroup); + Class efg = classLoader.loadClass(ExecutorFactory.Global.class.getName()); + Method setEfg = efg.getMethod("unsafeSet", ef); + setEfg.invoke(null, executorFactory); + } + catch (ClassNotFoundException e) + { + if (this instanceof Cluster.Builder) + throw new RuntimeException(e); + else + logger.info("Unable to set ExecutorFactory for instance {}", i, e); + } + catch (NoSuchMethodException | InvocationTargetException | InstantiationException | IllegalAccessException e) + { + throw new RuntimeException(e); + } + }); } public AbstractBuilder(Factory factory) @@ -304,7 +333,14 @@ private IInvokableInstance newInstance() ++generation; IClassTransformer transformer = classTransformer == null ? null : classTransformer.initialise(); ClassLoader classLoader = new InstanceClassLoader(generation, config.num(), version.classpath, sharedClassLoader, sharedClassPredicate, transformer); - ThreadGroup threadGroup = new ThreadGroup(clusterThreadGroup, "node" + config.num() + (generation > 1 ? "_" + generation : "")); + ThreadGroup threadGroup = new ThreadGroup(clusterThreadGroup, "node" + config.num() + (generation > 1 ? "_" + generation : "")) + { + @Override + public void uncaughtException(Thread t, Throwable e) + { + AbstractCluster.this.uncaughtException(get(config.num()), t, e); + } + }; if (instanceInitializer != null) instanceInitializer.initialise(classLoader, threadGroup, config.num(), generation); @@ -769,11 +805,6 @@ public void run(List> actions, int instanceId, int... moreIn } } - public void forEach(IIsolatedExecutor.SerializableRunnable runnable) - { - forEach(i -> i.sync(runnable)); - } - public void forEach(Consumer consumer) { forEach(instances, consumer); @@ -1032,8 +1063,8 @@ protected String getMonitorTimeoutMessage() public void startup() { - previousHandler = Thread.getDefaultUncaughtExceptionHandler(); - Thread.setDefaultUncaughtExceptionHandler(this::uncaughtExceptions); + // start the JNA cleaner on the system class loader to avoid pinning an instance + com.sun.jna.internal.Cleaner.getCleaner(); try (AllMembersAliveMonitor monitor = new AllMembersAliveMonitor()) { monitor.startPolling(); @@ -1070,22 +1101,34 @@ public void startup() } } - private void uncaughtExceptions(Thread thread, Throwable error) + private void uncaughtException(I instance, Thread thread, Throwable error) { + // should no longer be possible given this is called from a ThreadGroup, but just in case if (!(thread.getContextClassLoader() instanceof InstanceClassLoader)) - { - Thread.UncaughtExceptionHandler handler = previousHandler; - if (null != handler) - handler.uncaughtException(thread, error); return; + + try + { + instance.uncaughtException(thread, error); + } + catch (Throwable t) + { + // mixing ClassLoaders so can't use normal instanceOf check + if (AssertionUtils.isInstanceof(InstanceKiller.InstanceShutdown.class).matches(Throwables.getRootCause(t))) + { + // The exception was handled by JVMStabilityInspector + return; + } + maybeAddUncaughtExceptions(t, instance); } - InstanceClassLoader cl = (InstanceClassLoader) thread.getContextClassLoader(); - get(cl.getInstanceId()).uncaughtException(thread, error); + maybeAddUncaughtExceptions(error, instance); + } + private void maybeAddUncaughtExceptions(Throwable error, I instance) + { BiPredicate ignore = ignoreUncaughtThrowable; - I instance = get(cl.getInstanceId()); - if ((ignore == null || !ignore.test(cl.getInstanceId(), error)) && instance != null && !instance.isShutdown()) + if ((ignore == null || !ignore.test(instance.config().num(), error)) && instance != null && !instance.isShutdown()) uncaughtExceptions.add(error); } @@ -1131,8 +1174,6 @@ public void close() PathUtils.deleteRecursive(root); else logger.error("Not removing directories, as some instances haven't fully stopped."); - Thread.setDefaultUncaughtExceptionHandler(previousHandler); - previousHandler = null; checkAndResetUncaughtExceptions(); //checkForThreadLeaks(); //withThreadLeakCheck(futures); @@ -1161,19 +1202,23 @@ private IllegalStateException checkForThreadLeaks() //This is an alternate version of the thread leak check that just checks to see if any threads are still alive // with the context classloader. Map allThreads = Thread.getAllStackTraces(); - StringBuilder sb = new StringBuilder(); + var groupByStacktrace = LinkedHashMultimap., String>create(); for (Map.Entry e : allThreads.entrySet()) { if (!(e.getKey().getContextClassLoader() instanceof InstanceClassLoader)) continue; e.getKey().setContextClassLoader(null); - sb.append(e.getKey().getName()).append(":\n"); - for (StackTraceElement s : e.getValue()) + groupByStacktrace.put(Arrays.asList(e.getValue()), e.getKey().getName()); + } + if (groupByStacktrace.isEmpty()) return null; + StringBuilder sb = new StringBuilder(); + for (Map.Entry, Collection> e : groupByStacktrace.asMap().entrySet()) + { + sb.append("Threads: ").append(e.getValue()).append(":\n"); + for (StackTraceElement s : e.getKey()) sb.append("\t").append(s).append("\n"); } - return sb.length() > 0 - ? new IllegalStateException("Unterminated threads detected; active threads:\n" + sb) - : null; + return new IllegalStateException("Unterminated threads detected; active threads:\n" + sb); } public List tokens() @@ -1183,7 +1228,7 @@ public List tokens() { try { - IPartitioner partitioner = ((IPartitioner)Class.forName(i.config().getString("partitioner")).newInstance()); + IPartitioner partitioner = FBUtilities.newPartitioner(i.config().getString("partitioner")); return Stream.of(i.config().getString("initial_token").split(",")).map(partitioner.getTokenFactory()::fromString); } catch (Throwable t) diff --git a/test/distributed/org/apache/cassandra/distributed/impl/Coordinator.java b/test/distributed/org/apache/cassandra/distributed/impl/Coordinator.java index 20293ad11996..d623bef85f8b 100644 --- a/test/distributed/org/apache/cassandra/distributed/impl/Coordinator.java +++ b/test/distributed/org/apache/cassandra/distributed/impl/Coordinator.java @@ -25,6 +25,7 @@ import java.util.List; import java.util.UUID; import java.util.concurrent.Future; +import java.util.function.BiConsumer; import com.google.common.collect.Iterators; @@ -41,9 +42,9 @@ import org.apache.cassandra.distributed.api.SimpleQueryResult; import org.apache.cassandra.service.ClientState; import org.apache.cassandra.service.QueryState; +import org.apache.cassandra.tracing.Tracing; import org.apache.cassandra.transport.Dispatcher; import org.apache.cassandra.transport.ProtocolVersion; -import org.apache.cassandra.tracing.Tracing; import org.apache.cassandra.transport.messages.ResultMessage; import org.apache.cassandra.utils.ByteBufferUtil; import org.apache.cassandra.utils.TimeUUID; @@ -62,6 +63,30 @@ public SimpleQueryResult executeWithResult(String query, ConsistencyLevel consis return instance().sync(() -> unsafeExecuteInternal(query, consistencyLevel, boundValues)).call(); } + @Override + public Future executeWithResult(BiConsumer callback, String query, ConsistencyLevel consistencyLevel, Object... boundValues) + { + return executeWithResult(callback, query, null, consistencyLevel, boundValues); + } + + @Override + public Future executeWithResult(BiConsumer callback, String query, ConsistencyLevel serialConsistencyLevel, ConsistencyLevel commitConsistencyLevel, Object... boundValues) + { + return instance().async(cb -> { + SimpleQueryResult result; + try + { + result = CoordinatorHelper.unsafeExecuteInternal(query, serialConsistencyLevel, commitConsistencyLevel, boundValues); + } + catch (Throwable t) + { + callback.accept(null, t); + return; + } + callback.accept(result, null); + }).apply(callback); + } + public Future asyncExecuteWithTracingWithResult(UUID sessionId, String query, ConsistencyLevel consistencyLevelOrigin, Object... boundValues) { return instance.async(() -> { @@ -77,6 +102,12 @@ public Future asyncExecuteWithTracingWithResult(UUID sessionI }).call(); } + @Override + public Future asyncExecuteWithResult(String query, ConsistencyLevel consistencyLevelOrigin, Object... boundValues) + { + return instance.async(() -> unsafeExecuteInternal(query, consistencyLevelOrigin, boundValues)).call(); + } + public static org.apache.cassandra.db.ConsistencyLevel toCassandraCL(ConsistencyLevel cl) { try @@ -127,8 +158,7 @@ public QueryResult executeWithPagingWithResult(String query, ConsistencyLevel co boundBBValues.add(ByteBufferUtil.objectToBytes(boundValue)); prepared.validate(clientState); - Invariants.checkState(prepared instanceof SelectStatement, - "Only SELECT statements can be executed with paging %s", prepared); + Invariants.require(prepared instanceof SelectStatement, "Only SELECT statements can be executed with paging %s", prepared); Dispatcher.RequestTime requestTime = Dispatcher.RequestTime.forImmediateExecution(); SelectStatement selectStatement = (SelectStatement) prepared; @@ -146,7 +176,7 @@ public QueryResult executeWithPagingWithResult(String query, ConsistencyLevel co ResultMessage.Rows initialRows = selectStatement.execute(queryState, initialOptions, requestTime); Iterator iter = new Iterator() { - ResultMessage.Rows rows = selectStatement.execute(queryState, initialOptions, requestTime); + ResultMessage.Rows rows = initialRows; Iterator iter = RowUtil.toIter(rows); public boolean hasNext() diff --git a/test/distributed/org/apache/cassandra/distributed/impl/DirectStreamingConnectionFactory.java b/test/distributed/org/apache/cassandra/distributed/impl/DirectStreamingConnectionFactory.java index 72105d8f403c..bd674b3b1e49 100644 --- a/test/distributed/org/apache/cassandra/distributed/impl/DirectStreamingConnectionFactory.java +++ b/test/distributed/org/apache/cassandra/distributed/impl/DirectStreamingConnectionFactory.java @@ -325,6 +325,12 @@ public synchronized void onClose(Runnable runOnClose) else if (onClose == null) onClose = runOnClose; else { Runnable tmp = onClose; onClose = () -> { tmp.run(); runOnClose.run(); }; } } + + @Override + public int hashCode() + { + return id; + } } private final DirectStreamingChannel outToRecipient, outToOriginator; diff --git a/test/distributed/org/apache/cassandra/distributed/impl/Instance.java b/test/distributed/org/apache/cassandra/distributed/impl/Instance.java index a7cb238119db..575482c5a06c 100644 --- a/test/distributed/org/apache/cassandra/distributed/impl/Instance.java +++ b/test/distributed/org/apache/cassandra/distributed/impl/Instance.java @@ -42,6 +42,7 @@ import java.util.concurrent.TimeoutException; import java.util.concurrent.atomic.AtomicLong; import java.util.stream.Stream; +import javax.annotation.Nullable; import javax.management.ListenerNotFoundException; import javax.management.Notification; import javax.management.NotificationListener; @@ -56,15 +57,16 @@ import org.apache.cassandra.auth.AuthCache; import org.apache.cassandra.batchlog.Batch; import org.apache.cassandra.batchlog.BatchlogManager; -import org.apache.cassandra.concurrent.ExecutorFactory; import org.apache.cassandra.concurrent.ExecutorLocals; import org.apache.cassandra.concurrent.ExecutorPlus; +import org.apache.cassandra.concurrent.ImmediateExecutor; import org.apache.cassandra.concurrent.NamedThreadFactory; import org.apache.cassandra.concurrent.ScheduledExecutors; import org.apache.cassandra.concurrent.SharedExecutorPool; import org.apache.cassandra.concurrent.Stage; import org.apache.cassandra.config.Config; import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.config.DurationSpec; import org.apache.cassandra.config.YamlConfigurationLoader; import org.apache.cassandra.cql3.CQLStatement; import org.apache.cassandra.cql3.QueryHandler; @@ -101,6 +103,7 @@ import org.apache.cassandra.gms.Gossiper; import org.apache.cassandra.hints.DTestSerializer; import org.apache.cassandra.hints.HintsService; +import org.apache.cassandra.index.IndexStatusManager; import org.apache.cassandra.index.SecondaryIndexManager; import org.apache.cassandra.io.IVersionedAsymmetricSerializer; import org.apache.cassandra.io.sstable.format.SSTableReader; @@ -127,6 +130,7 @@ import org.apache.cassandra.service.QueryState; import org.apache.cassandra.service.StorageService; import org.apache.cassandra.service.StorageServiceMBean; +import org.apache.cassandra.service.accord.AccordService; import org.apache.cassandra.service.paxos.PaxosRepair; import org.apache.cassandra.service.paxos.PaxosState; import org.apache.cassandra.service.paxos.uncommitted.UncommittedTableData; @@ -159,6 +163,7 @@ import org.apache.cassandra.utils.JVMStabilityInspector; import org.apache.cassandra.utils.Throwables; import org.apache.cassandra.utils.concurrent.Ref; +import org.apache.cassandra.utils.concurrent.UncheckedInterruptedException; import org.apache.cassandra.utils.logging.LoggingSupportFactory; import org.apache.cassandra.utils.memory.BufferPools; import org.apache.cassandra.utils.progress.jmx.JMXBroadcastExecutor; @@ -194,6 +199,8 @@ public class Instance extends IsolatedExecutor implements IInvokableInstance private volatile boolean internodeMessagingStarted = false; private final AtomicLong startedAt = new AtomicLong(); private IsolatedJmx isolatedJmx; + private static boolean RECEIVE_MESSAGES_ASYNC = false; + public static void setReceiveMessagesAsync(boolean v) {RECEIVE_MESSAGES_ASYNC = v; } /** @deprecated See CASSANDRA-17013 */ @Deprecated(since = "4.1") @@ -209,7 +216,12 @@ public class Instance extends IsolatedExecutor implements IInvokableInstance Instance(IInstanceConfig config, ClassLoader classLoader, FileSystem fileSystem, ShutdownExecutor shutdownExecutor) { - super("node" + config.num(), classLoader, executorFactory().pooled("isolatedExecutor", Integer.MAX_VALUE), shutdownExecutor); + super("node" + config.num(), classLoader, executorFactory().configurePooled("isolatedExecutor", Integer.MAX_VALUE) + // we report uncaught exceptions on node thread pools, but + // we never reported exceptions from this thread pool, and + // tests deliberately produce a lot so it would be a lot of + // work to whitelist the exceptions - volunteers welcome! + .withUncaughtExceptionHandler(JVMStabilityInspector::uncaughtException).build(), shutdownExecutor); this.config = config; if (fileSystem != null) File.unsafeSetFilesystem(fileSystem); @@ -383,19 +395,22 @@ protected void registerOutboundFilter(ICluster cluster) MessagingService.instance().outboundSink.add((message, to) -> { if (isShutdown()) return false; // TODO: Simulator needs this to trigger a failure - IMessage serialzied = serializeMessage(message.from(), to, message); int fromNum = config.num(); // since this instance is sending the message, from will always be this instance IInstance toInstance = cluster.get(fromCassandraInetAddressAndPort(to)); if (toInstance == null) return true; // TODO: Simulator needs this to trigger a failure + if (!cluster.filters().hasInbound() && !cluster.filters().hasOutbound()) + return true; // no filters... nothing to see here int toNum = toInstance.config().num(); + IMessage serialzied = serializeMessage(message.from(), to, message); return cluster.filters().permitOutbound(fromNum, toNum, serialzied); }); } + @Override public void uncaughtException(Thread thread, Throwable throwable) { - sync(JVMStabilityInspector::uncaughtException).accept(thread, throwable); + JVMStabilityInspector.uncaughtException(thread, throwable); } public static IMessage serializeMessage(InetAddressAndPort from, InetAddressAndPort to, Message messageOut) @@ -444,7 +459,7 @@ public static IMessage serializeMessage(InetAddressAndPort from, InetAddressAndP byte[] bytes = out.toByteArray(); if (messageOut.serializedSize(toVersion) != bytes.length) throw new AssertionError(String.format("Message serializedSize(%s) does not match what was written with serialize(out, %s) for verb %s and serializer %s; " + - "expected %s, actual %s ", toVersion, toVersion, messageOut.verb(), Message.serializer.getClass(), + "expected %s, actual %s ", toVersion, toVersion, messageOut.verb(), messageOut.verb().serializer().getClass(), messageOut.serializedSize(toVersion), bytes.length)); return new MessageImpl(messageOut.verb().id, bytes, messageOut.id(), toVersion, messageOut.expiresAtNanos(), fromCassandraInetAddressAndPort(from)); } @@ -503,7 +518,8 @@ public static Message.Header deserializeHeader(IMessage message) @Override public void receiveMessage(IMessage message) { - sync(receiveMessageRunnable(message)).accept(false); + if (RECEIVE_MESSAGES_ASYNC) async(receiveMessageRunnable(message)).apply(false); + else sync(receiveMessageRunnable(message)).accept(false); } @Override @@ -553,6 +569,10 @@ private SerializableConsumer receiveMessageRunnable(IMessage message) inInstancelogger.warn("Dropping message {} due to stage {} being shutdown", messageIn, header.verb.stage); return; } + // This can cause deadlocks when sending messages to self so use Stage.MISC.executor() just to have a + // place for it to run + if (executor == ImmediateExecutor.INSTANCE) + executor = Stage.MISC.executor(); executor.execute(ExecutorLocals.create(state), () -> MessagingService.instance().inboundSink.accept(messageIn)); } }; @@ -604,6 +624,18 @@ public ExecutorPlus executorFor(int verbId) return Verb.fromId(verbId).stage.executor(); } + @Nullable + private DurationSpec startupTimeout() + { + Object c = config.get(Constants.KEY_DTEST_STARTUP_TIMEOUT); + if (c == null) return null; + if (c instanceof String) + return new DurationSpec.LongNanosecondsBound((String) c); + if (c instanceof Number) + return new DurationSpec.LongNanosecondsBound(((Number) c).longValue()); + throw new IllegalArgumentException("Key " + Constants.KEY_DTEST_STARTUP_TIMEOUT + " only allowed to have string/number values, but given " + c + ": " + c.getClass()); + } + @Override public void startup(ICluster cluster) { @@ -613,7 +645,7 @@ public void startup(ICluster cluster) // commit to extend the functionality of the @Shared annotation to app classes. assert startedAt.compareAndSet(0L, System.nanoTime()) : String.format("startedAt on instance %d expected to be 0, but was %d", config().num(), startedAt.get()); - sync(() -> { + Future result = async(() -> { inInstancelogger = LoggerFactory.getLogger(Instance.class); try { @@ -640,14 +672,40 @@ public void startup(ICluster cluster) throw (RuntimeException) t; throw new RuntimeException(t); } - }).run(); + }).call(); + DurationSpec timeout = startupTimeout(); + if (timeout == null) + { + waitOn(result); + } + else + { + try + { + result.get(timeout.quantity(), timeout.unit()); + } + catch (InterruptedException e) + { + Thread.currentThread().interrupt(); + throw new UncheckedInterruptedException(e); + } + catch (TimeoutException | ExecutionException e) + { + throw new RuntimeException(e); + } + } initialized = true; } - private synchronized void startJmx() + private synchronized void setupMbeanWrapper() { this.isolatedJmx = new IsolatedJmx(this, inInstancelogger); + this.isolatedJmx.setupMBeanWrapper(); + } + + private synchronized void startJmx() + { isolatedJmx.startJmx(); } @@ -689,9 +747,6 @@ private static void propagateMessagingVersions(ICluster cluster) protected void partialStartup(ICluster cluster) throws IOException, NoSuchFieldException, IllegalAccessException, ExecutionException, InterruptedException, StartupException { - // org.apache.cassandra.distributed.impl.AbstractCluster.startup sets the exception handler for the thread - // so extract it to populate ExecutorFactory.Global - ExecutorFactory.Global.tryUnsafeSet(new ExecutorFactory.Default(Thread.currentThread().getContextClassLoader(), null, Thread.getDefaultUncaughtExceptionHandler())); if (config.has(GOSSIP)) { // TODO: hacky @@ -707,6 +762,8 @@ protected void partialStartup(ICluster cluster) throws IOException, NoSuchFie assert config.networkTopology().contains(config.broadcastAddress()) : String.format("Network topology %s doesn't contain the address %s", config.networkTopology(), config.broadcastAddress()); DistributedTestInitialLocationProvider.assign(config.networkTopology()); + if (config.has(JMX)) + setupMbeanWrapper(); DatabaseDescriptor.daemonInitialization(); if (config.has(JMX)) startJmx(); @@ -813,6 +870,7 @@ protected void partialStartup(ICluster cluster) throws IOException, NoSuchFie ClusterMetadataService.instance().processor().fetchLogAndWait(); NodeId self = Register.maybeRegister(); RegistrationStatus.instance.onRegistration(); + AccordService.startup(self); boolean joinRing = config.get(Constants.KEY_DTEST_JOIN_RING) == null || (boolean) config.get(Constants.KEY_DTEST_JOIN_RING); if (ClusterMetadata.current().directory.peerState(self) != NodeState.JOINED && joinRing) { @@ -897,6 +955,7 @@ public Future shutdown(boolean runOnExitThreads, boolean shutdownMessaging { Future future = async((ExecutorService executor) -> { Throwable error = null; + inInstancelogger.warn("Shutting down in thread {}", Thread.currentThread().getName()); error = parallelRun(error, executor, SnapshotManager.instance::close); @@ -913,6 +972,11 @@ public Future shutdown(boolean runOnExitThreads, boolean shutdownMessaging error = parallelRun(error, executor, () -> Gossiper.instance.stopShutdownAndWait(1L, MINUTES)); } + else + { + error = parallelRun(error, executor, + () -> Gossiper.instance.shutdownAndWait(1L, MINUTES)); + } error = parallelRun(error, executor, StorageService.instance::disableAutoCompaction); @@ -956,6 +1020,7 @@ public Future shutdown(boolean runOnExitThreads, boolean shutdownMessaging () -> ActiveRepairService.instance().shutdownNowAndWait(1L, MINUTES), () -> EpochAwareDebounce.instance.close(), SnapshotManager.instance::close, + () -> IndexStatusManager.instance.shutdownAndWait(1L, MINUTES), DiskErrorsHandlerService::close ); @@ -972,6 +1037,11 @@ public Future shutdown(boolean runOnExitThreads, boolean shutdownMessaging () -> SharedExecutorPool.SHARED.shutdownAndWait(1L, MINUTES) ); + error = parallelRun(error, executor, () -> { + if (!AccordService.isSetup()) return; + AccordService.instance().shutdownAndWait(1l, MINUTES); + }); + // CommitLog must shut down after Stage, or threads from the latter may attempt to use the former. // (ex. A Mutation stage thread may attempt to add a mutation to the CommitLog.) error = parallelRun(error, executor, CommitLog.instance::shutdownBlocking); @@ -1230,6 +1300,11 @@ private static Throwable parallelRun(Throwable accumulate, ExecutorService runOn } })); } + // This is not used code, but it is here for when you run in a debugger... + // When shutdown gets blocked we need to be able to trace down which future is blocked, so this idx + // helps map the location... the reason we can't leverage here is the timeout logic is higher up, so + // 'idx' really only helps out in a debugger... + int idx = 0; for (Future future : results) { try @@ -1242,6 +1317,7 @@ private static Throwable parallelRun(Throwable accumulate, ExecutorService runOn { accumulate = Throwables.merge(accumulate, t); } + idx++; } return accumulate; } diff --git a/test/distributed/org/apache/cassandra/distributed/impl/InstanceConfig.java b/test/distributed/org/apache/cassandra/distributed/impl/InstanceConfig.java index 8afd90faf78b..45ef68cb81f5 100644 --- a/test/distributed/org/apache/cassandra/distributed/impl/InstanceConfig.java +++ b/test/distributed/org/apache/cassandra/distributed/impl/InstanceConfig.java @@ -28,10 +28,11 @@ import java.util.TreeMap; import java.util.UUID; import java.util.function.Function; -import java.util.stream.Collectors; import com.vdurmont.semver4j.Semver; +import org.apache.cassandra.config.AccordSpec; import org.apache.cassandra.config.CassandraRelevantProperties; +import org.apache.cassandra.config.OptionaldPositiveInt; import org.apache.cassandra.distributed.api.Feature; import org.apache.cassandra.distributed.api.IInstanceConfig; import org.apache.cassandra.distributed.shared.NetworkTopology; @@ -40,6 +41,8 @@ import org.apache.cassandra.locator.NetworkTopologyProximity; import org.apache.cassandra.locator.SimpleSeedProvider; +import static org.apache.cassandra.config.CassandraRelevantProperties.DTEST_ACCORD_ENABLED; + public class InstanceConfig implements IInstanceConfig { public final int num; @@ -73,6 +76,7 @@ private InstanceConfig(int num, String commitlog_directory, String hints_directory, String cdc_raw_directory, + AccordSpec accord, Collection initial_token, int storage_port, int native_transport_port, @@ -83,7 +87,7 @@ private InstanceConfig(int num, this.hostId = new UUID(0x4000L, (1L << 63) | num); // deterministic hostId for simulator //TODO move away from magic strings in favor of constants this .set("num_tokens", initial_token.size()) - .set("initial_token", initial_token.stream().collect(Collectors.joining(","))) + .set("initial_token", String.join(",", initial_token)) .set("broadcast_address", broadcast_address) .set("listen_address", listen_address) .set("broadcast_rpc_address", broadcast_rpc_address) @@ -93,6 +97,12 @@ private InstanceConfig(int num, .set("commitlog_directory", commitlog_directory) .set("hints_directory", hints_directory) .set("cdc_raw_directory", cdc_raw_directory) + .set("accord.enabled", accord.enabled) + .set("accord.journal_directory", accord.journal_directory) + .set("accord.queue_shard_count", accord.queue_shard_count.toString()) + .set("accord.command_store_shard_count", accord.command_store_shard_count.toString()) + .set("accord.expire_txn", accord.expire_txn) + .set("accord.enable_virtual_debug_only_keyspace", "true") .set("partitioner", "org.apache.cassandra.dht.Murmur3Partitioner") .set("start_native_transport", true) .set("concurrent_writes", 2) @@ -316,6 +326,11 @@ public static InstanceConfig generate(int nodeNum, int datadirCount) { int seedNode = provisionStrategy.seedNodeNum(); + AccordSpec accordSpec = new AccordSpec(); + accordSpec.enabled = DTEST_ACCORD_ENABLED.getBoolean(); + accordSpec.journal_directory = String.format("%s/node%d/accord_journal", root, nodeNum); + accordSpec.queue_shard_count = new OptionaldPositiveInt(2); + accordSpec.command_store_shard_count = new OptionaldPositiveInt(4); return new InstanceConfig(nodeNum, networkTopology, provisionStrategy.ipAddress(nodeNum), @@ -329,6 +344,7 @@ public static InstanceConfig generate(int nodeNum, String.format("%s/node%d/commitlog", root, nodeNum), String.format("%s/node%d/hints", root, nodeNum), String.format("%s/node%d/cdc", root, nodeNum), + accordSpec, tokens, provisionStrategy.storagePort(nodeNum), provisionStrategy.nativeTransportPort(nodeNum), diff --git a/test/distributed/org/apache/cassandra/distributed/impl/IsolatedExecutor.java b/test/distributed/org/apache/cassandra/distributed/impl/IsolatedExecutor.java index 68ff1e71c65b..9e84d32df7cf 100644 --- a/test/distributed/org/apache/cassandra/distributed/impl/IsolatedExecutor.java +++ b/test/distributed/org/apache/cassandra/distributed/impl/IsolatedExecutor.java @@ -126,7 +126,7 @@ public IIsolatedExecutor with(ExecutorService executor) public Future shutdown() { - isolatedExecutor.shutdownNow(); + isolatedExecutor.shutdown(); return shutdownExecutor.shutdown(name, classLoader, isolatedExecutor, () -> { // Shutdown logging last - this is not ideal as the logging subsystem is initialized diff --git a/test/distributed/org/apache/cassandra/distributed/impl/IsolatedJmx.java b/test/distributed/org/apache/cassandra/distributed/impl/IsolatedJmx.java index 18eaeb85c532..7ad57a52b6b4 100644 --- a/test/distributed/org/apache/cassandra/distributed/impl/IsolatedJmx.java +++ b/test/distributed/org/apache/cassandra/distributed/impl/IsolatedJmx.java @@ -24,6 +24,7 @@ import java.util.LinkedList; import java.util.List; import java.util.Map; +import java.util.Objects; import java.util.concurrent.TimeUnit; import javax.management.remote.JMXConnector; import javax.management.remote.JMXConnectorServer; @@ -76,6 +77,8 @@ public void startJmx() { try { + Objects.requireNonNull(wrapper, "Must call setupMBeanWrapper before use"); + // Several RMI threads hold references to in-jvm dtest objects, and are, by default, kept // alive for long enough (minutes) to keep classloaders from being collected. // Set these two system properties to a low value to allow cleanup to occur fast enough @@ -88,12 +91,10 @@ public void startJmx() int jmxPort = config.jmxPort(); String hostname = addr.getHostAddress(); - wrapper = new MBeanWrapper.InstanceMBeanWrapper(hostname + ":" + jmxPort); - ((MBeanWrapper.DelegatingMbeanWrapper) MBeanWrapper.instance).setDelegate(wrapper); // CASSANDRA-18508: Sensitive JMX SSL configuration options can be easily exposed Map jmxServerOptionsMap = (Map) config.getParams().get("jmx_server_options"); - EncryptionOptions jmxEncryptionOptions; + EncryptionOptions.ClientEncryptionOptions jmxEncryptionOptions; if (jmxServerOptionsMap == null) { JMXServerOptions parsingSystemProperties = JMXServerOptions.createParsingSystemProperties(); @@ -158,6 +159,15 @@ public void startJmx() } } + public void setupMBeanWrapper() + { + InetAddress addr = config.broadcastAddress().getAddress(); + int jmxPort = config.jmxPort(); + String hostname = addr.getHostAddress(); + wrapper = new MBeanWrapper.InstanceMBeanWrapper(hostname + ':' + jmxPort); + ((MBeanWrapper.DelegatingMbeanWrapper) MBeanWrapper.instance).setDelegate(wrapper); + } + /** * Builds {@code EncryptionOptions} from the map based SSL configuration properties. * @@ -165,7 +175,7 @@ public void startJmx() * @return EncryptionOptions built object */ @SuppressWarnings("unchecked") - private EncryptionOptions getJmxEncryptionOptions(Map jmxServerOptionsMap) + private EncryptionOptions.ClientEncryptionOptions getJmxEncryptionOptions(Map jmxServerOptionsMap) { if (jmxServerOptionsMap == null) return null; @@ -176,34 +186,34 @@ private EncryptionOptions getJmxEncryptionOptions(Map jmxServerO { return null; } - EncryptionOptions jmxEncryptionOptions = new EncryptionOptions(); + EncryptionOptions.ClientEncryptionOptions.Builder jmxEncryptionOptionsBuilder = new EncryptionOptions.ClientEncryptionOptions.Builder(); String[] cipherSuitesArray = (String[]) encryptionOptionsMap.get(EncryptionOptions.ConfigKey.CIPHER_SUITES.toString()); if (cipherSuitesArray != null) { - jmxEncryptionOptions = jmxEncryptionOptions.withCipherSuites(cipherSuitesArray); + jmxEncryptionOptionsBuilder.withCipherSuites(cipherSuitesArray); } List acceptedProtocols = (List) encryptionOptionsMap.get(EncryptionOptions.ConfigKey.ACCEPTED_PROTOCOLS.toString()); if (acceptedProtocols != null) { - jmxEncryptionOptions = jmxEncryptionOptions.withAcceptedProtocols(acceptedProtocols); + jmxEncryptionOptionsBuilder.withAcceptedProtocols(acceptedProtocols); } Boolean requireClientAuthValue = (Boolean) encryptionOptionsMap.get(EncryptionOptions.ConfigKey.REQUIRE_CLIENT_AUTH.toString()); - EncryptionOptions.ClientAuth requireClientAuth = requireClientAuthValue == null ? - EncryptionOptions.ClientAuth.NOT_REQUIRED : - EncryptionOptions.ClientAuth.from(String.valueOf(requireClientAuthValue)); + EncryptionOptions.ClientEncryptionOptions.ClientAuth requireClientAuth = requireClientAuthValue == null ? + EncryptionOptions.ClientEncryptionOptions.ClientAuth.NOT_REQUIRED : + EncryptionOptions.ClientEncryptionOptions.ClientAuth.from(String.valueOf(requireClientAuthValue)); Object enabledOption = encryptionOptionsMap.get(EncryptionOptions.ConfigKey.ENABLED.toString()); boolean enabled = enabledOption != null ? (Boolean) encryptionOptionsMap.get(EncryptionOptions.ConfigKey.ENABLED.toString()) : false; //CASSANDRA-18508 NOTE - We do not populate sslContextFactory configuration here for tests, it could be enhanced - jmxEncryptionOptions = jmxEncryptionOptions - .withKeyStore((String) encryptionOptionsMap.get(EncryptionOptions.ConfigKey.KEYSTORE.toString())) - .withKeyStorePassword((String) encryptionOptionsMap.get(EncryptionOptions.ConfigKey.KEYSTORE_PASSWORD.toString())) - .withTrustStore((String) encryptionOptionsMap.get(EncryptionOptions.ConfigKey.TRUSTSTORE.toString())) - .withTrustStorePassword((String) encryptionOptionsMap.get(EncryptionOptions.ConfigKey.TRUSTSTORE_PASSWORD.toString())) - .withRequireClientAuth(requireClientAuth) - .withEnabled(enabled); - return jmxEncryptionOptions; + return jmxEncryptionOptionsBuilder + .withKeyStore((String) encryptionOptionsMap.get(EncryptionOptions.ConfigKey.KEYSTORE.toString())) + .withKeyStorePassword((String) encryptionOptionsMap.get(EncryptionOptions.ConfigKey.KEYSTORE_PASSWORD.toString())) + .withTrustStore((String) encryptionOptionsMap.get(EncryptionOptions.ConfigKey.TRUSTSTORE.toString())) + .withTrustStorePassword((String) encryptionOptionsMap.get(EncryptionOptions.ConfigKey.TRUSTSTORE_PASSWORD.toString())) + .withRequireClientAuth(requireClientAuth) + .withEnabled(enabled) + .build(); } private void waitForJmxAvailability(Map env) diff --git a/test/distributed/org/apache/cassandra/distributed/impl/MessageImpl.java b/test/distributed/org/apache/cassandra/distributed/impl/MessageImpl.java index 758d41358342..fc1dbcf0de59 100644 --- a/test/distributed/org/apache/cassandra/distributed/impl/MessageImpl.java +++ b/test/distributed/org/apache/cassandra/distributed/impl/MessageImpl.java @@ -21,11 +21,13 @@ import java.net.InetSocketAddress; import org.apache.cassandra.distributed.api.IMessage; +import org.apache.cassandra.net.Verb; import org.apache.cassandra.utils.ByteArrayUtil; // a container for simplifying the method signature for per-instance message handling/delivery public class MessageImpl implements IMessage { + private static final long serialVersionUID = 0; // for simulator support public final int verb; public final byte[] bytes; public final long id; @@ -77,7 +79,7 @@ public InetSocketAddress from() public String toString() { return "MessageImpl{" + - "verb=" + verb + + "verb=" + Verb.fromId(verb) + ", bytes=" + ByteArrayUtil.bytesToHex(bytes) + ", id=" + id + ", version=" + version + diff --git a/test/distributed/org/apache/cassandra/distributed/impl/Query.java b/test/distributed/org/apache/cassandra/distributed/impl/Query.java index 82933e735a62..26428034e682 100644 --- a/test/distributed/org/apache/cassandra/distributed/impl/Query.java +++ b/test/distributed/org/apache/cassandra/distributed/impl/Query.java @@ -27,6 +27,7 @@ import org.apache.cassandra.cql3.QueryProcessor; import org.apache.cassandra.db.ConsistencyLevel; import org.apache.cassandra.distributed.api.IIsolatedExecutor; +import org.apache.cassandra.distributed.api.SimpleQueryResult; import org.apache.cassandra.service.ClientState; import org.apache.cassandra.service.ClientWarn; import org.apache.cassandra.service.QueryState; @@ -36,15 +37,15 @@ import org.apache.cassandra.utils.ByteBufferUtil; import org.apache.cassandra.utils.FBUtilities; -public class Query implements IIsolatedExecutor.SerializableCallable +public class Query implements IIsolatedExecutor.SerializableCallable { private static final long serialVersionUID = 1L; - final String query; + public final String query; final long timestamp; final org.apache.cassandra.distributed.api.ConsistencyLevel commitConsistencyOrigin; final org.apache.cassandra.distributed.api.ConsistencyLevel serialConsistencyOrigin; - final Object[] boundValues; + public final Object[] boundValues; public Query(String query, long timestamp, org.apache.cassandra.distributed.api.ConsistencyLevel commitConsistencyOrigin, org.apache.cassandra.distributed.api.ConsistencyLevel serialConsistencyOrigin, Object[] boundValues) { @@ -55,7 +56,8 @@ public Query(String query, long timestamp, org.apache.cassandra.distributed.api. this.boundValues = boundValues; } - public Object[][] call() + @Override + public SimpleQueryResult call() { ConsistencyLevel commitConsistency = toCassandraCL(commitConsistencyOrigin); ConsistencyLevel serialConsistency = serialConsistencyOrigin == null ? null : toCassandraCL(serialConsistencyOrigin); @@ -88,7 +90,7 @@ public Object[][] call() if (res != null) res.setWarnings(ClientWarn.instance.getWarnings()); - return RowUtil.toQueryResult(res).toObjectArrays(); + return RowUtil.toQueryResult(res); } public String toString() @@ -100,4 +102,4 @@ static org.apache.cassandra.db.ConsistencyLevel toCassandraCL(org.apache.cassand { return org.apache.cassandra.db.ConsistencyLevel.fromCode(cl.ordinal()); } -} \ No newline at end of file +} diff --git a/test/distributed/org/apache/cassandra/distributed/impl/TestChangeListener.java b/test/distributed/org/apache/cassandra/distributed/impl/TestChangeListener.java index a40ee9a96f94..977a515170b8 100644 --- a/test/distributed/org/apache/cassandra/distributed/impl/TestChangeListener.java +++ b/test/distributed/org/apache/cassandra/distributed/impl/TestChangeListener.java @@ -18,10 +18,8 @@ package org.apache.cassandra.distributed.impl; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; -import java.util.function.Predicate; +import java.util.NavigableMap; +import java.util.concurrent.ConcurrentSkipListMap; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -30,7 +28,9 @@ import org.apache.cassandra.tcm.ClusterMetadataService; import org.apache.cassandra.tcm.Epoch; import org.apache.cassandra.tcm.listeners.ChangeListener; +import org.apache.cassandra.utils.concurrent.CountDownLatch; import org.apache.cassandra.utils.concurrent.WaitQueue; +import org.apache.cassandra.utils.concurrent.WaitQueue.Signal; public class TestChangeListener implements ChangeListener { @@ -43,77 +43,68 @@ public static void register() ClusterMetadataService.instance().log().addListener(instance); } - private final List> preCommitPredicates = new ArrayList<>(); - private final List> postCommitPredicates = new ArrayList<>(); + NavigableMap preCommitBarriers = new ConcurrentSkipListMap<>(); + NavigableMap postCommitBarriers = new ConcurrentSkipListMap<>(); private final WaitQueue waiters = WaitQueue.newWaitQueue(); - @Override - public void notifyPreCommit(ClusterMetadata prev, ClusterMetadata next, boolean fromSnapshot) + private class CommitBarrier { - Iterator> iter = preCommitPredicates.iterator(); - while (iter.hasNext()) + private final CountDownLatch waiting = CountDownLatch.newCountDownLatch(1); + private final Runnable onPaused; + private final String desc; + + private CommitBarrier(Runnable onPaused, String desc) + { + this.onPaused = onPaused; + this.desc = desc; + } + + private void await() { - if (iter.next().test(next.epoch)) - { - logger.debug("Epoch matches pre-commit predicate, pausing"); - pause(); - iter.remove(); - } + logger.debug("Notifying paused: {}", desc); + Signal s = waiters.register(); + waiting.decrement(); + onPaused.run(); + s.awaitUninterruptibly(); + logger.debug("Unpaused: {}", desc); } } + @Override + public void notifyPreCommit(ClusterMetadata prev, ClusterMetadata next, boolean fromSnapshot) + { + CommitBarrier commitBarrier = preCommitBarriers.remove(next.epoch); + if (commitBarrier != null) + commitBarrier.await(); + } @Override public void notifyPostCommit(ClusterMetadata prev, ClusterMetadata next, boolean fromSnapshot) { - Iterator> iter = postCommitPredicates.iterator(); - while (iter.hasNext()) - { - if (iter.next().test(next.epoch)) - { - logger.debug("Epoch matches post-commit predicate, pausing"); - pause(); - iter.remove(); - } - } + CommitBarrier commitBarrier = postCommitBarriers.remove(next.epoch); + if (commitBarrier != null) + commitBarrier.await(); } - public void pauseBefore(Epoch epoch, Runnable onMatch) + public void pauseBefore(Epoch epoch, Runnable onPaused) { - logger.debug("Requesting pause before enacting {}", epoch); - preCommitPredicates.add((e) -> { - if (e.is(epoch)) - { - onMatch.run(); - return true; - } - return false; - }); + preCommitBarriers.put(epoch, new CommitBarrier(onPaused, "pre-commit " + epoch)); } - public void pauseAfter(Epoch epoch, Runnable onMatch) + public void pauseAfter(Epoch epoch, Runnable onPaused) { - logger.debug("Requesting pause after enacting {}", epoch); - postCommitPredicates.add((e) -> { - if (e.is(epoch)) - { - onMatch.run(); - return true; - } - return false; - }); + postCommitBarriers.put(epoch, new CommitBarrier(onPaused, "post-commit " + epoch)); } - public void pause() + public void unpause() { - WaitQueue.Signal signal = waiters.register(); - logger.debug("Log follower is paused, waiting..."); - signal.awaitUninterruptibly(); - logger.debug("Resumed log follower..."); + logger.info("Unpausing all precommit and post commit barriers"); + waiters.signalAll(); } - public void unpause() + public void clearAndUnpause() { - logger.debug("Unpausing log follower"); + preCommitBarriers.clear(); + postCommitBarriers.clear(); waiters.signalAll(); } } diff --git a/test/distributed/org/apache/cassandra/distributed/impl/UnsafeGossipHelper.java b/test/distributed/org/apache/cassandra/distributed/impl/UnsafeGossipHelper.java index 377ac198802a..314f6010c169 100644 --- a/test/distributed/org/apache/cassandra/distributed/impl/UnsafeGossipHelper.java +++ b/test/distributed/org/apache/cassandra/distributed/impl/UnsafeGossipHelper.java @@ -49,6 +49,7 @@ public class UnsafeGossipHelper { public static class HostInfo implements Serializable { + private static final long serialVersionUID = 0; // for simulator support final InetSocketAddress address; final UUID hostId; final String tokenString; @@ -267,7 +268,7 @@ public static IIsolatedExecutor.SerializableRunnable markShutdownRunner(InetSock EndpointState state = Gossiper.instance.getEndpointStateForEndpoint(getByAddress(address)); VersionedValue status = new VersionedValue.VersionedValueFactory(partitioner).shutdown(true); state.addApplicationState(ApplicationState.STATUS, status); - state.getHeartBeatState().forceHighestPossibleVersionUnsafe(); + state.forceHighestPossibleVersionUnsafe(); StorageService.instance.onChange(getByAddress(address), ApplicationState.STATUS, status); }); }; diff --git a/test/distributed/org/apache/cassandra/distributed/mock/nodetool/InternalNodeProbe.java b/test/distributed/org/apache/cassandra/distributed/mock/nodetool/InternalNodeProbe.java index bf6a0879cedd..1ab4f843b7fb 100644 --- a/test/distributed/org/apache/cassandra/distributed/mock/nodetool/InternalNodeProbe.java +++ b/test/distributed/org/apache/cassandra/distributed/mock/nodetool/InternalNodeProbe.java @@ -46,6 +46,7 @@ import org.apache.cassandra.service.GCInspector; import org.apache.cassandra.service.StorageProxy; import org.apache.cassandra.service.StorageService; +import org.apache.cassandra.service.accord.AccordOperations; import org.apache.cassandra.service.snapshot.SnapshotManager; import org.apache.cassandra.streaming.StreamManager; import org.apache.cassandra.tcm.CMSOperations; @@ -74,6 +75,7 @@ protected void connect() ssProxy = StorageService.instance; snapshotProxy = SnapshotManager.instance; cmsProxy = CMSOperations.instance; + accordProxy = AccordOperations.instance; msProxy = MessagingService.instance(); streamProxy = StreamManager.instance; compactionProxy = CompactionManager.instance; diff --git a/test/distributed/org/apache/cassandra/distributed/shared/ClusterUtils.java b/test/distributed/org/apache/cassandra/distributed/shared/ClusterUtils.java index cbb6c420463c..bdff5b8900cc 100644 --- a/test/distributed/org/apache/cassandra/distributed/shared/ClusterUtils.java +++ b/test/distributed/org/apache/cassandra/distributed/shared/ClusterUtils.java @@ -25,13 +25,16 @@ import java.util.Arrays; import java.util.Collection; import java.util.HashMap; +import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.Objects; import java.util.Optional; import java.util.Set; +import java.util.UUID; import java.util.concurrent.Callable; import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ExecutionException; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; import java.util.function.BiConsumer; @@ -46,14 +49,28 @@ import com.google.common.base.Strings; import com.google.common.collect.ImmutableList; import com.google.common.util.concurrent.Futures; -import org.apache.cassandra.distributed.api.*; +import org.apache.cassandra.tcm.compatibility.TokenRingUtils; +import org.apache.cassandra.utils.FBUtilities; +import org.apache.cassandra.utils.Shared; import org.assertj.core.api.Assertions; import org.junit.Assert; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import accord.primitives.TxnId; +import org.apache.cassandra.db.virtual.AccordDebugKeyspace; import org.apache.cassandra.dht.Token; import org.apache.cassandra.distributed.Cluster; +import org.apache.cassandra.distributed.api.ConsistencyLevel; +import org.apache.cassandra.distributed.api.Feature; +import org.apache.cassandra.distributed.api.ICluster; +import org.apache.cassandra.distributed.api.IInstance; +import org.apache.cassandra.distributed.api.IInstanceConfig; +import org.apache.cassandra.distributed.api.IInvokableInstance; +import org.apache.cassandra.distributed.api.IMessageFilters; +import org.apache.cassandra.distributed.api.NodeToolResult; +import org.apache.cassandra.distributed.api.Row; +import org.apache.cassandra.distributed.api.SimpleQueryResult; import org.apache.cassandra.distributed.impl.AbstractCluster; import org.apache.cassandra.distributed.impl.InstanceConfig; import org.apache.cassandra.distributed.impl.TestChangeListener; @@ -67,7 +84,11 @@ import org.apache.cassandra.net.RequestCallback; import org.apache.cassandra.net.Verb; import org.apache.cassandra.schema.KeyspaceMetadata; +import org.apache.cassandra.schema.Schema; +import org.apache.cassandra.schema.SchemaConstants; +import org.apache.cassandra.schema.TableId; import org.apache.cassandra.service.StorageService; +import org.apache.cassandra.service.accord.AccordService; import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.tcm.ClusterMetadataService; import org.apache.cassandra.tcm.Commit; @@ -80,12 +101,12 @@ import org.apache.cassandra.utils.concurrent.CountDownLatch; import static com.google.common.util.concurrent.Uninterruptibles.sleepUninterruptibly; +import static org.assertj.core.api.Assertions.assertThat; import static org.apache.cassandra.config.CassandraRelevantProperties.BOOTSTRAP_SCHEMA_DELAY_MS; import static org.apache.cassandra.config.CassandraRelevantProperties.BROADCAST_INTERVAL_MS; import static org.apache.cassandra.config.CassandraRelevantProperties.REPLACE_ADDRESS_FIRST_BOOT; import static org.apache.cassandra.config.CassandraRelevantProperties.RING_DELAY; import static org.apache.cassandra.distributed.impl.TestEndpointCache.toCassandraInetAddressAndPort; -import static org.assertj.core.api.Assertions.assertThat; /** * Utilities for working with jvm-dtest clusters. @@ -470,9 +491,14 @@ public static void clearEntryFilters(IInvokableInstance instance) instance.runOnInstance(() -> ClusterMetadataService.instance().log().clearFilters()); } + public static Callable pauseBeforeEnacting(IInvokableInstance instance, long epoch) + { + return pauseBeforeEnacting(instance, Epoch.create(epoch), 30, TimeUnit.SECONDS); + } + public static Callable pauseBeforeEnacting(IInvokableInstance instance, Epoch epoch) { - return pauseBeforeEnacting(instance, epoch, 10, TimeUnit.SECONDS); + return pauseBeforeEnacting(instance, epoch, 30, TimeUnit.SECONDS); } protected static Callable pauseBeforeEnacting(IInvokableInstance instance, @@ -483,7 +509,10 @@ protected static Callable pauseBeforeEnacting(IInvokableInstance instance, return instance.callOnInstance(() -> { TestChangeListener listener = TestChangeListener.instance; AsyncPromise promise = new AsyncPromise<>(); - listener.pauseBefore(epoch, () -> promise.setSuccess(null)); + listener.pauseBefore(epoch, () -> { + logger.info("Notifying waiter of pausing for pauseBeforeEnacting epoch {}", epoch); + promise.setSuccess(null); + }); return () -> { try { @@ -514,7 +543,10 @@ protected static Callable pauseAfterEnacting(IInvokableInstance instance, return instance.callOnInstance(() -> { TestChangeListener listener = TestChangeListener.instance; AsyncPromise promise = new AsyncPromise<>(); - listener.pauseAfter(epoch, () -> promise.setSuccess(null)); + listener.pauseAfter(epoch, () -> { + logger.info("Notifying waiter of pausing for pauseAfterEnacting epoch {}", epoch); + promise.setSuccess(null); + }); return () -> { try { @@ -600,6 +632,11 @@ public static void unpauseEnactment(IInvokableInstance instance) instance.runOnInstance(() -> TestChangeListener.instance.unpause()); } + public static void clearAndUnpause(IInvokableInstance instance) + { + instance.runOnInstance(() -> TestChangeListener.instance.clearAndUnpause()); + } + public static boolean isMigrating(IInvokableInstance instance) { return instance.callOnInstance(() -> ClusterMetadataService.instance().isMigrating()); @@ -630,7 +667,7 @@ public String toString() } } - public static void waitForCMSToQuiesce(ICluster cluster, int[] cmsNodes) + public static void waitForCMSToQuiesce(ICluster cluster, int... cmsNodes) { // first step; find the largest epoch waitForCMSToQuiesce(cluster, maxEpoch(cluster, cmsNodes)); @@ -1550,5 +1587,103 @@ public static void assertModeJoined(IInvokableInstance inst) .describedAs("Unexpected StorageService operation mode") .isEqualTo(StorageService.Mode.NORMAL); } -} + public static LinkedHashMap queryTxnState(AbstractCluster cluster, TxnId txnId, int... nodes) + { + String cql = String.format("SELECT * FROM %s.%s WHERE txn_id=?", SchemaConstants.VIRTUAL_ACCORD_DEBUG, AccordDebugKeyspace.TXN_BLOCKED_BY); + LinkedHashMap map = new LinkedHashMap<>(); + Iterable it = nodes.length == 0 ? cluster : cluster.get(nodes); + for (T i : it) + { + if (i.isShutdown()) + continue; + SimpleQueryResult result = i.executeInternalWithResult(cql, txnId.toString()); + map.put(i.toString(), result); + } + return map; + } + + public static String queryTxnStateAsString(AbstractCluster cluster, TxnId txnId, int... nodes) + { + StringBuilder sb = new StringBuilder(); + queryTxnStateAsString(sb, cluster, txnId, nodes); + return sb.toString(); + } + + public static void queryTxnStateAsString(StringBuilder sb, AbstractCluster cluster, TxnId txnId, int... nodes) + { + LinkedHashMap map = queryTxnState(cluster, txnId, nodes); + for (var e : map.entrySet()) + { + sb.append(e.getKey()).append(":\n"); + SimpleQueryResult result = e.getValue(); + if (!result.names().isEmpty()) + sb.append(result.names()).append('\n'); + while (result.hasNext()) + { + var row = result.next(); + sb.append(Arrays.asList(row.toObjectArray())).append('\n'); + } + } + } + + public static TableId tableId(Cluster cluster, String ks, String table) + { + String str = cluster.getFirstRunningInstance().callOnInstance(() -> Schema.instance.getKeyspaceInstance(ks).getColumnFamilyStore(table).getTableId().toString()); + return TableId.fromUUID(UUID.fromString(str)); + } + + public static void awaitAccordEpochReady(Cluster cluster, long epoch) + { + cluster.forEach(i -> { + if (i.isShutdown()) return; + i.runOnInstance(() -> { + try + { + AccordService.instance().epochReady(Epoch.create(epoch)).get(); + } + catch (InterruptedException | ExecutionException e) + { + throw new RuntimeException(e); + } + }); + }); + } + + @Shared + public static class Range implements Serializable + { + public final String left, right; + + public Range(String left, String right) + { + this.left = left; + this.right = right; + } + + public Range(long left, long right) + { + this(Long.toString(left), Long.toString(right)); + } + + public long left() + { + return Long.parseLong(left); + } + + public long right() + { + return Long.parseLong(right); + } + } + + public static List getPrimaryRanges(IInvokableInstance instance, String keyspace) + { + return instance.callOnInstance(() -> { + var ranges = TokenRingUtils.getPrimaryRangesForEndpoint(keyspace, FBUtilities.getBroadcastAddressAndPort()); + return ranges.stream() + .flatMap(r -> r.unwrap().stream().map(r2 -> new Range(r2.left.toString(), r2.right.toString()))) + .collect(Collectors.toList()); + }); + } +} diff --git a/test/distributed/org/apache/cassandra/distributed/shared/VersionedApplicationState.java b/test/distributed/org/apache/cassandra/distributed/shared/VersionedApplicationState.java index fd3e40a70d69..97d202b697f1 100644 --- a/test/distributed/org/apache/cassandra/distributed/shared/VersionedApplicationState.java +++ b/test/distributed/org/apache/cassandra/distributed/shared/VersionedApplicationState.java @@ -22,6 +22,7 @@ public class VersionedApplicationState implements Serializable { + private static final long serialVersionUID = 0; // for simulator support public final int applicationState; public final String value; public final int version; diff --git a/test/distributed/org/apache/cassandra/distributed/test/AbstractEncryptionOptionsImpl.java b/test/distributed/org/apache/cassandra/distributed/test/AbstractEncryptionOptionsImpl.java index 25e9e64b6326..7c3b91127ae0 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/AbstractEncryptionOptionsImpl.java +++ b/test/distributed/org/apache/cassandra/distributed/test/AbstractEncryptionOptionsImpl.java @@ -56,7 +56,7 @@ import org.apache.cassandra.security.SSLFactory; import static java.util.concurrent.TimeUnit.SECONDS; -import static org.apache.cassandra.config.EncryptionOptions.ClientAuth.REQUIRED; +import static org.apache.cassandra.config.EncryptionOptions.ClientEncryptionOptions.ClientAuth.REQUIRED; import static org.apache.cassandra.distributed.test.AbstractEncryptionOptionsImpl.ConnectResult.CONNECTING; import static org.apache.cassandra.distributed.test.AbstractEncryptionOptionsImpl.ConnectResult.UNINITIALIZED; import static org.apache.cassandra.utils.concurrent.Condition.newOneTimeCondition; @@ -118,10 +118,13 @@ public class TlsConnection final int port; final List acceptedProtocols; final List cipherSuites; - final EncryptionOptions encryptionOptions = new EncryptionOptions() - .withEnabled(true) - .withKeyStore(validKeyStorePath).withKeyStorePassword(validKeyStorePassword) - .withTrustStore(validTrustStorePath).withTrustStorePassword(validTrustStorePassword); + final EncryptionOptions.ClientEncryptionOptions encryptionOptions = new EncryptionOptions.ClientEncryptionOptions.Builder() + .withEnabled(true) + .withKeyStore(validKeyStorePath) + .withKeyStorePassword(validKeyStorePassword) + .withTrustStore(validTrustStorePath) + .withTrustStorePassword(validTrustStorePassword) + .build(); private Throwable lastThrowable; private String lastProtocol; private String lastCipher; @@ -202,7 +205,7 @@ ConnectResult connect() throws Throwable setProtocolAndCipher(null, null); SslContext sslContext = SSLFactory.getOrCreateSslContext( - encryptionOptions.withAcceptedProtocols(acceptedProtocols).withCipherSuites(cipherSuites), + new EncryptionOptions.ClientEncryptionOptions.Builder(encryptionOptions).withAcceptedProtocols(acceptedProtocols).withCipherSuites(cipherSuites).build(), REQUIRED, ISslContextFactory.SocketType.CLIENT, "test"); EventLoopGroup workerGroup = new NioEventLoopGroup(); diff --git a/test/distributed/org/apache/cassandra/distributed/test/AbstractVirtualLogsTableTest.java b/test/distributed/org/apache/cassandra/distributed/test/AbstractVirtualLogsTableTest.java new file mode 100644 index 000000000000..c8bb32fe7280 --- /dev/null +++ b/test/distributed/org/apache/cassandra/distributed/test/AbstractVirtualLogsTableTest.java @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.distributed.test; + +import org.junit.Ignore; + +import static java.lang.String.format; + +@Ignore +public abstract class AbstractVirtualLogsTableTest extends TestBaseImpl +{ + public String query(String template) + { + return format(template, getTableName()); + } + + public abstract String getTableName(); +} diff --git a/test/distributed/org/apache/cassandra/distributed/test/AuthTest.java b/test/distributed/org/apache/cassandra/distributed/test/AuthTest.java index f424691eacce..cd63a45ac9e1 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/AuthTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/AuthTest.java @@ -80,6 +80,8 @@ public void testZeroTimestampForDefaultRoleCreation() throws Exception .withTokenSupplier(TokenSupplier.evenlyDistributedTokens(2, 1)) .withConfig(config -> config.with(NETWORK, GOSSIP, NATIVE_PROTOCOL) .set("authenticator", "PasswordAuthenticator") + // Test drops all TCM communication, which precludes topology discovery + .set("accord.enabled", false) .set("credentials_validity", "2s")) // revert to OSS default .start()) { diff --git a/test/distributed/org/apache/cassandra/distributed/test/CASAddTest.java b/test/distributed/org/apache/cassandra/distributed/test/CASAddTest.java index 50ddd091580f..37e44f7794b6 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/CASAddTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/CASAddTest.java @@ -72,13 +72,13 @@ public void testAdditionNotExists() throws Throwable assertRows(cluster.coordinator(1).execute("SELECT * FROM " + KEYSPACE + ".tbl WHERE pk = 1", ConsistencyLevel.SERIAL), row(1, null, null)); // this section is testing current limitations... if they start to fail due to the limitations going away... update this test to include those cases - Assertions.assertThatThrownBy(() -> cluster.coordinator(1).execute(batch( + Assertions.assertThatThrownBy(() -> cluster.coordinator(1).execute(unloggedBatch( "INSERT INTO " + KEYSPACE + ".tbl (pk, a, b) VALUES (1, 0, '') IF NOT EXISTS", "UPDATE " + KEYSPACE + ".tbl SET a = a + 1, b = b + 'success' WHERE pk = 1 IF EXISTS" ), ConsistencyLevel.QUORUM)) .is(AssertionUtils.is(InvalidRequestException.class)) .hasMessage("Cannot mix IF EXISTS and IF NOT EXISTS conditions for the same row"); - Assertions.assertThatThrownBy(() -> cluster.coordinator(1).execute(batch( + Assertions.assertThatThrownBy(() -> cluster.coordinator(1).execute(unloggedBatch( "INSERT INTO " + KEYSPACE + ".tbl (pk, a, b) VALUES (1, 0, '') IF NOT EXISTS", "UPDATE " + KEYSPACE + ".tbl SET a = a + 1, b = b + 'success' WHERE pk = 1" diff --git a/test/distributed/org/apache/cassandra/distributed/test/CASMultiDCTest.java b/test/distributed/org/apache/cassandra/distributed/test/CASMultiDCTest.java index 382cb543c70e..8fa3b0fe94fd 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/CASMultiDCTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/CASMultiDCTest.java @@ -21,6 +21,7 @@ import java.util.concurrent.atomic.AtomicInteger; import java.util.function.Consumer; +import org.junit.AfterClass; import org.junit.Assert; import org.junit.Before; import org.junit.BeforeClass; @@ -36,7 +37,10 @@ import org.apache.cassandra.service.paxos.PaxosCommit; import org.apache.cassandra.utils.ByteBufferUtil; -import static org.apache.cassandra.distributed.api.ConsistencyLevel.*; +import static org.apache.cassandra.distributed.api.ConsistencyLevel.LOCAL_QUORUM; +import static org.apache.cassandra.distributed.api.ConsistencyLevel.LOCAL_SERIAL; +import static org.apache.cassandra.distributed.api.ConsistencyLevel.QUORUM; +import static org.apache.cassandra.distributed.api.ConsistencyLevel.SERIAL; public class CASMultiDCTest { @@ -75,6 +79,12 @@ public static void beforeClass() throws Throwable })); } + @AfterClass + public static void afterClass() throws Throwable + { + CLUSTER.close(); + } + @Before public void setUp() { diff --git a/test/distributed/org/apache/cassandra/distributed/test/CASTestBase.java b/test/distributed/org/apache/cassandra/distributed/test/CASTestBase.java index 58b47cc9b79e..1c441ded3ef7 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/CASTestBase.java +++ b/test/distributed/org/apache/cassandra/distributed/test/CASTestBase.java @@ -20,6 +20,7 @@ import java.util.Collections; import java.util.UUID; +import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; import org.junit.Assert; @@ -185,6 +186,8 @@ public static void addToRing(boolean bootstrapping, IInstance peer) public static void assertVisibleInRing(IInstance peer) { InetAddressAndPort endpoint = InetAddressAndPort.getByAddress(peer.broadcastAddress()); + long deadline = System.nanoTime() + TimeUnit.SECONDS.toNanos(30); + while (System.nanoTime() < deadline && !Gossiper.instance.isAlive(endpoint)); Assert.assertTrue(Gossiper.instance.isAlive(endpoint)); } diff --git a/test/distributed/org/apache/cassandra/distributed/test/ColumnConstraintsTest.java b/test/distributed/org/apache/cassandra/distributed/test/ColumnConstraintsTest.java index 77b72ccca1d2..4abdd780aa0b 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/ColumnConstraintsTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/ColumnConstraintsTest.java @@ -58,7 +58,7 @@ public void testInvalidConstraintsExceptions() throws IOException "org.apache.cassandra.db.marshal.IntegerType, org.apache.cassandra.db.marshal.LongType, " + "org.apache.cassandra.db.marshal.ShortType] but it was class org.apache.cassandra.db.marshal.UTF8Type"); - assertThrowsInvalidConstraintException(cluster, String.format("CREATE TABLE %s (pk int, ck1 int CHECK LENGTH(ck1) < 100, ck2 int, v int, " + + assertThrowsInvalidConstraintException(cluster, String.format("CREATE TABLE %s (pk int, ck1 int CHECK LENGTH() < 100, ck2 int, v int, " + "PRIMARY KEY ((pk), ck1, ck2));", tableName), "Column should be of type class org.apache.cassandra.db.marshal.UTF8Type or " + "class org.apache.cassandra.db.marshal.AsciiType but got class org.apache.cassandra.db.marshal.Int32Type"); @@ -213,7 +213,7 @@ public void testLengthTableLevelConstraint() throws IOException for (Map.Entry relation : RELATIONS_MAP.entrySet()) { String tableName = String.format(KEYSPACE + ".%s_tbl1_%s", type, relation.getKey()); - String createTableStatementSmallerThan = "CREATE TABLE " + tableName + " (pk " + type + " CHECK LENGTH(pk) " + relation.getValue() + " 4, ck1 int, ck2 int, v int, PRIMARY KEY ((pk), ck1, ck2));"; + String createTableStatementSmallerThan = "CREATE TABLE " + tableName + " (pk " + type + " CHECK LENGTH() " + relation.getValue() + " 4, ck1 int, ck2 int, v int, PRIMARY KEY ((pk), ck1, ck2));"; cluster.schemaChange(createTableStatementSmallerThan); } } @@ -299,7 +299,7 @@ public void testNotNullTableLevelConstraint() throws IOException try (Cluster cluster = init(Cluster.build(1).start())) { String tableName = String.format(KEYSPACE + ".%s_tbl1_%s", type, "st"); - String createTableNotNullValue = "CREATE TABLE " + tableName + " (pk int, value int CHECK NOT_NULL(value), PRIMARY KEY (pk));"; + String createTableNotNullValue = "CREATE TABLE " + tableName + " (pk int, value int CHECK NOT NULL, PRIMARY KEY (pk));"; cluster.schemaChange(createTableNotNullValue); Assertions.assertThatThrownBy(() -> cluster.coordinator(1).execute(String.format("INSERT INTO " + tableName + " (pk, value) VALUES (1, null)"), ConsistencyLevel.ALL)) diff --git a/test/distributed/org/apache/cassandra/distributed/test/CompactionDiskSpaceTest.java b/test/distributed/org/apache/cassandra/distributed/test/CompactionDiskSpaceTest.java index 6cc0fdfe4970..d6418e696bab 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/CompactionDiskSpaceTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/CompactionDiskSpaceTest.java @@ -58,7 +58,7 @@ public void testNoSpaceLeft() throws IOException cluster.schemaChange("create table "+KEYSPACE+".tbl (id int primary key, x int) with compaction = {'class':'SizeTieredCompactionStrategy'}"); cluster.coordinator(1).execute("insert into "+KEYSPACE+".tbl (id, x) values (1,1)", ConsistencyLevel.ALL); cluster.get(1).flush(KEYSPACE); - cluster.setUncaughtExceptionsFilter((t) -> t.getMessage() != null && t.getMessage().contains("Not enough space for compaction")); + cluster.setUncaughtExceptionsFilter((t) -> t.getMessage() != null && t.getMessage().contains("Not enough space for compaction") && (t.getMessage().contains(KEYSPACE+".tbl") || t.getMessage().contains("system_"))); cluster.get(1).runOnInstance(() -> { ColumnFamilyStore cfs = Keyspace.open(KEYSPACE).getColumnFamilyStore("tbl"); BB.estimatedRemaining.set(2000); diff --git a/test/distributed/org/apache/cassandra/distributed/test/EphemeralSnapshotTest.java b/test/distributed/org/apache/cassandra/distributed/test/EphemeralSnapshotTest.java index f4d423cf23f8..4a4d9ccf8da5 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/EphemeralSnapshotTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/EphemeralSnapshotTest.java @@ -18,13 +18,22 @@ package org.apache.cassandra.distributed.test; +import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import javax.management.openmbean.CompositeDataSupport; +import javax.management.openmbean.TabularData; +import javax.management.openmbean.TabularDataSupport; import com.google.common.util.concurrent.Futures; import org.junit.Test; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.db.ColumnFamilyStore; import org.apache.cassandra.db.Keyspace; import org.apache.cassandra.distributed.Cluster; import org.apache.cassandra.distributed.api.IInvokableInstance; @@ -32,6 +41,8 @@ import org.apache.cassandra.io.util.File; import org.apache.cassandra.service.snapshot.SnapshotManager; import org.apache.cassandra.service.snapshot.SnapshotManifest; +import org.apache.cassandra.service.snapshot.SnapshotOptions; +import org.apache.cassandra.service.snapshot.SnapshotType; import org.apache.cassandra.utils.Pair; import static java.lang.String.format; @@ -40,6 +51,8 @@ import static org.apache.cassandra.distributed.api.Feature.GOSSIP; import static org.apache.cassandra.distributed.api.Feature.NATIVE_PROTOCOL; import static org.apache.cassandra.distributed.api.Feature.NETWORK; +import static org.apache.cassandra.schema.SchemaConstants.LOCAL_SYSTEM_KEYSPACE_NAMES; +import static org.apache.cassandra.schema.SchemaConstants.REPLICATED_SYSTEM_KEYSPACE_NAMES; import static org.awaitility.Awaitility.await; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; @@ -58,7 +71,7 @@ public void testStartupRemovesEphemeralSnapshotOnEphemeralFlagInManifest() throw .withConfig(config -> config.with(GOSSIP, NETWORK, NATIVE_PROTOCOL)) .start())) { - Pair initialisationData = initialise(c); + Pair initialisationData = initialise(c, tableName); rewriteManifestToEphemeral(initialisationData.left, initialisationData.right); @@ -76,7 +89,7 @@ public void testStartupRemovesEphemeralSnapshotOnMarkerFile() throws Exception .withConfig(config -> config.with(GOSSIP, NETWORK, NATIVE_PROTOCOL)) .start())) { - Pair initialisationData = initialise(c); + Pair initialisationData = initialise(c, tableName); String tableId = initialisationData.left; String[] dataDirs = initialisationData.right; @@ -106,7 +119,7 @@ public void testEphemeralSnapshotIsNotClearableFromNodetool() throws Exception { IInvokableInstance instance = c.get(1); - Pair initialisationData = initialise(c); + Pair initialisationData = initialise(c, tableName); rewriteManifestToEphemeral(initialisationData.left, initialisationData.right); c.get(1).runOnInstance((IIsolatedExecutor.SerializableRunnable) () -> SnapshotManager.instance.restart(true)); @@ -130,7 +143,7 @@ public void testClearingAllSnapshotsFromNodetoolWillKeepEphemeralSnaphotsIntact( { IInvokableInstance instance = c.get(1); - Pair initialisationData = initialise(c); + Pair initialisationData = initialise(c, tableName); rewriteManifestToEphemeral(initialisationData.left, initialisationData.right); @@ -142,13 +155,104 @@ public void testClearingAllSnapshotsFromNodetoolWillKeepEphemeralSnaphotsIntact( } } - private Pair initialise(Cluster c) + /** + * @see CASSANDRA-20490 + */ + @Test + public void testForceEphemeralSnapshotWhenAlreadyExists() throws Exception + { + try (Cluster c = init(builder().withNodes(1) + .withConfig(config -> config.with(GOSSIP, NETWORK, NATIVE_PROTOCOL)) + .start())) + { + IInvokableInstance instance = c.get(1); + + c.schemaChange(withKeyspace("CREATE TABLE IF NOT EXISTS %s." + tableName + " (cityid int PRIMARY KEY, name text)")); + c.coordinator(1).execute(withKeyspace("INSERT INTO %s." + tableName + "(cityid, name) VALUES (1, 'Canberra');"), ONE); + + instance.flush(KEYSPACE); + + takeEphemeralSnapshotForcibly(c, KEYSPACE, tableName, snapshotName); + assertTrue(instance.nodetoolResult("listsnapshots", "-e").getStdout().contains(snapshotName)); + float firstSnapshotSize = getSnapshotSizeOnDisk(c, KEYSPACE, tableName, snapshotName); + + SnapshotManifest snapshotManifest = SnapshotManifest.deserializeFromJsonFile(new File(findManifest(getDataDirs(c), getTableId(c, KEYSPACE, tableName)))); + assertEquals(1, snapshotManifest.getFiles().size()); + + // list sstables + List snapshotFilesAfterFirstSnapshot = getSnapshotFiles(c, snapshotName); + assertFalse(snapshotFilesAfterFirstSnapshot.isEmpty()); + + // add more data + insertData(c, tableName); + + takeEphemeralSnapshotForcibly(c, KEYSPACE, tableName, snapshotName); + assertTrue(instance.nodetoolResult("listsnapshots", "-e").getStdout().contains(snapshotName)); + SnapshotManifest secondSnapshotManifest = SnapshotManifest.deserializeFromJsonFile(new File(findManifest(getDataDirs(c), getTableId(c, KEYSPACE, tableName)))); + assertEquals(2, secondSnapshotManifest.getFiles().size()); + + List snapshotFilesAfterSecondSnapshot = getSnapshotFiles(c, snapshotName); + assertFalse(snapshotFilesAfterSecondSnapshot.isEmpty()); + + // list again and check it is superset of previous listing + assertTrue(snapshotFilesAfterSecondSnapshot.size() > snapshotFilesAfterFirstSnapshot.size()); + assertTrue(snapshotFilesAfterSecondSnapshot.containsAll(snapshotFilesAfterFirstSnapshot)); + assertTrue(secondSnapshotManifest.getFiles().containsAll(snapshotManifest.getFiles())); + + float secondSnapshotSize = getSnapshotSizeOnDisk(c, KEYSPACE, tableName, snapshotName); + + assertTrue(secondSnapshotSize > firstSnapshotSize); + } + } + + private Float getSnapshotSizeOnDisk(Cluster c, String keyspace, String table, String snapshotName) + { + return c.get(1).applyOnInstance((IIsolatedExecutor.SerializableTriFunction) (ks, tb, name) -> { + + Map stringTabularDataMap = SnapshotManager.instance.listSnapshots(Map.of("include_ephemeral", "true")); + + TabularDataSupport tabularData = (TabularDataSupport) stringTabularDataMap.get(name); + for (Object value : tabularData.values()) + { + CompositeDataSupport cds = (CompositeDataSupport) value; + return Float.parseFloat(((String) cds.get("Size on disk")).split(" ")[0]); + } + + return 0F; + }, keyspace, table, snapshotName); + } + + private void takeEphemeralSnapshotForcibly(Cluster c, String keyspace, String table, String snapshotName) + { + c.get(1).applyOnInstance((IIsolatedExecutor.SerializableTriFunction) (ks, tb, name) -> + { + ColumnFamilyStore cfs = Keyspace.getValidKeyspace(ks).getColumnFamilyStore(tb); + try + { + SnapshotManager.instance.takeSnapshot(SnapshotOptions.systemSnapshot(name, SnapshotType.REPAIR, (sstable) -> true, cfs.getKeyspaceTableName()) + .ephemeral() + .build()); + } + catch (Throwable t) + { + throw new RuntimeException(t.getMessage()); + } + return null; + }, keyspace, table, snapshotName); + } + + private void insertData(Cluster c, String tableName) { c.schemaChange(withKeyspace("CREATE TABLE IF NOT EXISTS %s." + tableName + " (cityid int PRIMARY KEY, name text)")); c.coordinator(1).execute(withKeyspace("INSERT INTO %s." + tableName + "(cityid, name) VALUES (1, 'Canberra');"), ONE); IInvokableInstance instance = c.get(1); - instance.flush(KEYSPACE); + } + + private Pair initialise(Cluster c, String tableName) + { + insertData(c, tableName); + IInvokableInstance instance = c.get(1); assertEquals(0, instance.nodetool("snapshot", "-kt", withKeyspace("%s." + tableName), "-t", snapshotName)); waitForSnapshot(instance, snapshotName); @@ -158,15 +262,17 @@ private Pair initialise(Cluster c) assertEquals(0, instance.nodetool("snapshot", "-kt", withKeyspace("%s." + tableName), "-t", snapshotName2)); waitForSnapshot(instance, snapshotName2); - String tableId = instance.callOnInstance((IIsolatedExecutor.SerializableCallable) () -> { - return Keyspace.open(KEYSPACE).getMetadata().tables.get(tableName).get().id.asUUID().toString().replaceAll("-", ""); - }); + String tableId = getTableId(c, KEYSPACE, tableName); - String[] dataDirs = (String[]) instance.config().get("data_file_directories"); + String[] dataDirs = getDataDirs(c); return Pair.create(tableId, dataDirs); } + private String[] getDataDirs(Cluster c) + { + return (String[]) c.get(1).config().get("data_file_directories"); + } private void verify(IInvokableInstance instance) { @@ -220,4 +326,45 @@ private Path findManifest(String[] dataDirs, String tableId) throw new IllegalStateException("Unable to find manifest!"); } + + private List getSnapshotFiles(Cluster cluster, String snapshotName) + { + return cluster.get(1).applyOnInstance((IIsolatedExecutor.SerializableFunction>) (name) -> { + List result = new ArrayList<>(); + + for (Keyspace keyspace : Keyspace.all()) + { + if (LOCAL_SYSTEM_KEYSPACE_NAMES.contains(keyspace.getName()) || REPLICATED_SYSTEM_KEYSPACE_NAMES.contains(keyspace.getName())) + continue; + + for (ColumnFamilyStore cfs : keyspace.getColumnFamilyStores()) + { + for (String dataDir : DatabaseDescriptor.getAllDataFileLocations()) + { + File snapshotDir = new File(dataDir, format("%s/%s-%s/snapshots/%s", keyspace.getName(), cfs.name, cfs.metadata().id.toHexString(), name)); + if (snapshotDir.exists()) + { + try + { + Files.list(snapshotDir.toPath()).forEach(p -> result.add(p.toString())); + } + catch (IOException e) + { + throw new RuntimeException("Unable to list " + snapshotDir.toPath(), e); + } + } + } + } + } + + return result; + }, snapshotName); + } + + private String getTableId(Cluster c, String keyspace, String tableName) + { + return c.get(1).applyOnInstance((IIsolatedExecutor.SerializableBiFunction) (ks, tb) -> { + return Keyspace.open(ks).getMetadata().tables.get(tb).get().id.asUUID().toString().replaceAll("-", ""); + }, keyspace, tableName); + } } diff --git a/test/distributed/org/apache/cassandra/distributed/test/ForBenchmarks.java b/test/distributed/org/apache/cassandra/distributed/test/ForBenchmarks.java new file mode 100644 index 000000000000..fc4dc57a7b82 --- /dev/null +++ b/test/distributed/org/apache/cassandra/distributed/test/ForBenchmarks.java @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.distributed.test; + +import org.apache.cassandra.distributed.Cluster; +import org.apache.cassandra.distributed.api.Feature; + +import java.io.IOException; + +public class ForBenchmarks extends TestBaseImpl { + public static void main(String[] args) throws IOException, InterruptedException { + try (Cluster cluster = Cluster.build(3) + .withConfig(c -> c.with(Feature.values())) + .start()) { + cluster.get(1).nodetoolResult("cms", "reconfigure", "3").asserts().success(); + + Thread.currentThread().join(); + } + } +} diff --git a/test/distributed/org/apache/cassandra/distributed/test/HungBootstrapDoesNotHangTest.java b/test/distributed/org/apache/cassandra/distributed/test/HungBootstrapDoesNotHangTest.java new file mode 100644 index 000000000000..474c555ab491 --- /dev/null +++ b/test/distributed/org/apache/cassandra/distributed/test/HungBootstrapDoesNotHangTest.java @@ -0,0 +1,139 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.distributed.test; + +import java.io.IOException; +import java.util.Collection; +import java.util.concurrent.ForkJoinPool; + +import org.junit.Test; + +import net.bytebuddy.ByteBuddy; +import net.bytebuddy.dynamic.loading.ClassLoadingStrategy; +import net.bytebuddy.implementation.MethodDelegation; +import org.apache.cassandra.dht.Token; +import org.apache.cassandra.distributed.Cluster; +import org.apache.cassandra.distributed.Constants; +import org.apache.cassandra.distributed.api.Feature; +import org.apache.cassandra.distributed.api.IInvokableInstance; +import org.apache.cassandra.distributed.api.TokenSupplier; +import org.apache.cassandra.distributed.shared.ClusterUtils; +import org.apache.cassandra.locator.InetAddressAndPort; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.tcm.ownership.MovementMap; +import org.apache.cassandra.tcm.sequences.BootstrapAndJoin; +import org.apache.cassandra.utils.Shared; +import org.apache.cassandra.utils.concurrent.CountDownLatch; +import org.apache.cassandra.utils.concurrent.UncheckedInterruptedException; +import org.assertj.core.api.Assertions; + +import static net.bytebuddy.matcher.ElementMatchers.named; +import static net.bytebuddy.matcher.ElementMatchers.takesArguments; + +/** + * When bootstrap hangs it can hang forever, but this can be a problem in CI as the test reports as "timeout" and all logs and history is lost. This test makes sure that JVM-DTest instances do shutdown properly even in this case + */ +public class HungBootstrapDoesNotHangTest extends TestBaseImpl +{ + @Test + public void test() throws IOException + { + TokenSupplier tokenSupplier = TokenSupplier.evenlyDistributedTokens(2); + try (Cluster cluster = Cluster.build(1) + .withTokenSupplier(tokenSupplier) + .withConfig(c -> c.set("auto_bootstrap", true).with(Feature.values())) + .withInstanceInitializer(BBHelper::install) + .createWithoutStarting()) + { + cluster.get(1).startup(cluster); // should work fine + IInvokableInstance node2 = ClusterUtils.addInstance(cluster, c -> c.set(Constants.KEY_DTEST_STARTUP_TIMEOUT, "1m") + .set(Constants.KEY_DTEST_API_STARTUP_FAILURE_AS_SHUTDOWN, false)); + ForkJoinPool.commonPool().execute(() -> { + node2.startup(); // should hang and never reach the next line + State.notBlocked(); + }); + State.awaitBlocked(); + + Assertions.assertThat(State.wasBlocked()).describedAs("node2 was supposed to get blocked by ByteBuddy but didnt").isEqualTo(true); + + // node1 is up, node2 is blocked in bootstrap... now let the cluster close + } + } + + @Shared + public static class State + { + private static final CountDownLatch blocked = CountDownLatch.newCountDownLatch(1); + private static volatile boolean wasBlocked = true; + + public static void blocked() + { + blocked.decrement(); + } + + public static void notBlocked() + { + wasBlocked = false; + blocked(); + } + + public static void awaitBlocked() + { + blocked.awaitThrowUncheckedOnInterrupt(); + } + + public static boolean wasBlocked() + { + return wasBlocked; + } + } + + public static class BBHelper + { + public static void install(ClassLoader cl, int id) + { + if (id != 2) return; + new ByteBuddy().rebase(BootstrapAndJoin.class) + .method(named("bootstrap").and(takesArguments(6))) + .intercept(MethodDelegation.to(BBHelper.class)) + .make() + .load(cl, ClassLoadingStrategy.Default.INJECTION); + } + + public static boolean bootstrap(final Collection tokens, + long bootstrapTimeoutMillis, + ClusterMetadata metadata, + InetAddressAndPort beingReplaced, + MovementMap movements, + MovementMap strictMovements) + { + try + { + State.blocked(); + Thread.currentThread().join(); + return false; + } + catch (InterruptedException e) + { + throw new UncheckedInterruptedException(e); + } + } + + } +} diff --git a/test/distributed/org/apache/cassandra/distributed/test/IPMembershipTest.java b/test/distributed/org/apache/cassandra/distributed/test/IPMembershipTest.java index 9f242b0212d0..c51e8163a6fe 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/IPMembershipTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/IPMembershipTest.java @@ -18,6 +18,7 @@ package org.apache.cassandra.distributed.test; +import java.io.File; import java.io.IOException; import java.util.Arrays; import java.util.Set; @@ -33,7 +34,6 @@ import org.apache.cassandra.distributed.api.IInvokableInstance; import org.apache.cassandra.distributed.impl.InstanceConfig; import org.apache.cassandra.distributed.shared.ClusterUtils; -import org.apache.cassandra.io.util.FileUtils; import org.apache.cassandra.locator.NoOpProximity; import org.apache.cassandra.locator.SimpleLocationProvider; import org.apache.cassandra.tools.ToolRunner; @@ -46,6 +46,25 @@ public class IPMembershipTest extends TestBaseImpl { + + private static void deleteRecursiveNoStaticInit(File file) + { + if (file.isDirectory()) + { + for (File entry : file.listFiles()) + deleteRecursiveNoStaticInit(entry); + } + else + { + file.delete(); + } + } + + private static void deleteRecursiveNoStaticInit(org.apache.cassandra.io.util.File file) + { + deleteRecursiveNoStaticInit(new File(file.absolutePath())); + } + /** * Port of replace_address_test.py::fail_without_replace_test to jvm-dtest */ @@ -64,7 +83,7 @@ public void sameIPFailWithoutReplace() throws IOException for (boolean auto_bootstrap : Arrays.asList(true, false)) { stopUnchecked(nodeToReplace); - getDirectories(nodeToReplace).forEach(FileUtils::deleteRecursive); + getDirectories(nodeToReplace).forEach(IPMembershipTest::deleteRecursiveNoStaticInit); nodeToReplace.config().set("auto_bootstrap", auto_bootstrap); diff --git a/test/distributed/org/apache/cassandra/distributed/test/IntegrationTestBase.java b/test/distributed/org/apache/cassandra/distributed/test/IntegrationTestBase.java index 16f4e58f304d..e05b52e06d66 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/IntegrationTestBase.java +++ b/test/distributed/org/apache/cassandra/distributed/test/IntegrationTestBase.java @@ -44,7 +44,7 @@ public static void before() throws Throwable protected static void init(int nodes, Consumer cfg) throws Throwable { - Invariants.checkState(!initialized); + Invariants.require(!initialized); cluster = Cluster.build() .withNodes(nodes) .withConfig(cfg) diff --git a/test/distributed/org/apache/cassandra/distributed/test/MessageFiltersTest.java b/test/distributed/org/apache/cassandra/distributed/test/MessageFiltersTest.java index e1a0a91e32b9..ac4e952fe626 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/MessageFiltersTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/MessageFiltersTest.java @@ -45,6 +45,7 @@ import org.apache.cassandra.net.MessagingService; import org.apache.cassandra.net.NoPayload; import org.apache.cassandra.net.Verb; +import org.assertj.core.api.Assertions; import static org.apache.cassandra.distributed.api.Feature.GOSSIP; import static org.apache.cassandra.distributed.api.Feature.NETWORK; @@ -214,7 +215,7 @@ public void testMessageMatching() throws Throwable Message decoded = Instance.deserializeMessage(msg); return (Integer) decoded.verb().id; }).call(); - Assert.assertTrue(verbs.contains(id)); + Assertions.assertThat(verbs).describedAs("Unexpected verb %s", Verb.fromId(id)).contains(id); counter.incrementAndGet(); return false; }).drop(); @@ -274,7 +275,8 @@ public void hintSerializationTest() throws Exception try (Cluster cluster = init(builder().withNodes(3) .withConfig(config -> config.with(GOSSIP) .with(NETWORK) - .set("hinted_handoff_enabled", true)) + .set("hinted_handoff_enabled", true) + .set("accord.enabled", false)) .start())) { cluster.schemaChange(withKeyspace("CREATE TABLE %s.tbl (k int PRIMARY KEY, v int)")); diff --git a/test/distributed/org/apache/cassandra/distributed/test/MixedModeFuzzTest.java b/test/distributed/org/apache/cassandra/distributed/test/MixedModeFuzzTest.java index 1f7f388d3481..8b2d58ac6ddd 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/MixedModeFuzzTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/MixedModeFuzzTest.java @@ -49,7 +49,7 @@ import net.bytebuddy.dynamic.loading.ClassLoadingStrategy; import net.bytebuddy.implementation.MethodDelegation; import org.apache.cassandra.cql3.CQLStatement; -import org.apache.cassandra.cql3.QueryHandler; +import org.apache.cassandra.cql3.QueryHandler.Prepared; import org.apache.cassandra.cql3.QueryProcessor; import org.apache.cassandra.db.SystemKeyspace; import org.apache.cassandra.distributed.api.ConsistencyLevel; @@ -271,9 +271,10 @@ public void mixedModeFuzzTest() throws Throwable c.get(nodeWithFix.get() ? 1 : 2).runOnInstance(() -> { SystemKeyspace.loadPreparedStatements((id, query, keyspace) -> { + Prepared prepared = QueryProcessor.instance.getPrepared(id); if (rng.nextBoolean()) QueryProcessor.instance.evictPrepared(id); - return true; + return prepared; }); }); break; @@ -453,7 +454,7 @@ public static ResultMessage.Prepared prepare(String queryString, ClientState cli if (existing != null) return existing; - QueryHandler.Prepared prepared = QueryProcessor.parseAndPrepare(queryString, clientState, false); + Prepared prepared = QueryProcessor.parseAndPrepare(queryString, clientState, false); CQLStatement statement = prepared.statement; int boundTerms = statement.getBindVariables().size(); diff --git a/test/distributed/org/apache/cassandra/distributed/test/OptimiseStreamsRepairTest.java b/test/distributed/org/apache/cassandra/distributed/test/OptimiseStreamsRepairTest.java index 63a2f95f166d..cec5fdb2e5c2 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/OptimiseStreamsRepairTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/OptimiseStreamsRepairTest.java @@ -19,7 +19,6 @@ package org.apache.cassandra.distributed.test; import java.io.IOException; - import java.net.UnknownHostException; import java.util.ArrayList; import java.util.HashMap; @@ -29,8 +28,9 @@ import java.util.Random; import java.util.Set; import java.util.concurrent.Callable; - import java.util.concurrent.TimeoutException; +import java.util.concurrent.atomic.AtomicInteger; + import org.junit.Test; import net.bytebuddy.ByteBuddy; @@ -44,8 +44,8 @@ import org.apache.cassandra.distributed.api.NodeToolResult; import org.apache.cassandra.locator.InetAddressAndPort; import org.apache.cassandra.repair.AsymmetricRemoteSyncTask; -import org.apache.cassandra.repair.LocalSyncTask; import org.apache.cassandra.repair.RepairJob; +import org.apache.cassandra.repair.LocalSyncTask; import org.apache.cassandra.repair.SyncTask; import org.apache.cassandra.repair.TreeResponse; @@ -59,6 +59,8 @@ public class OptimiseStreamsRepairTest extends TestBaseImpl { + static final AtomicInteger createOptimizedSyncCount = new AtomicInteger(); + @Test public void testBasic() throws Exception { @@ -97,6 +99,7 @@ public void testBasic() throws Exception res = cluster.get(1).nodetoolResult("repair", KEYSPACE, "--preview", "--full"); res.asserts().success(); res.asserts().notificationContains("Previewed data was in sync"); + assertTrue(cluster.get(1).callOnInstance(() -> createOptimizedSyncCount.get()) > 0); } } @@ -114,6 +117,7 @@ public static void install(ClassLoader cl, int id) public static List createOptimisedSyncingSyncTasks(List trees, @SuperCall Callable> zuperCall) { + createOptimizedSyncCount.incrementAndGet(); List tasks = null; try { diff --git a/test/distributed/org/apache/cassandra/distributed/test/PaxosRepair2Test.java b/test/distributed/org/apache/cassandra/distributed/test/PaxosRepair2Test.java index 175f70c7973f..6f0dc8e7149d 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/PaxosRepair2Test.java +++ b/test/distributed/org/apache/cassandra/distributed/test/PaxosRepair2Test.java @@ -28,12 +28,6 @@ import com.google.common.collect.Iterators; import com.google.common.collect.Sets; - -import org.apache.cassandra.distributed.shared.WithProperties; -import org.apache.cassandra.repair.SharedContext; -import org.apache.cassandra.service.paxos.cleanup.PaxosCleanupLocalCoordinator; -import org.apache.cassandra.service.paxos.cleanup.PaxosCleanupResponse; -import org.apache.cassandra.utils.Shared; import org.awaitility.Awaitility; import org.junit.Assert; import org.junit.Test; @@ -71,11 +65,13 @@ import org.apache.cassandra.distributed.api.Feature; import org.apache.cassandra.distributed.api.IInvokableInstance; import org.apache.cassandra.distributed.shared.ClusterUtils; +import org.apache.cassandra.distributed.shared.WithProperties; import org.apache.cassandra.exceptions.CasWriteTimeoutException; import org.apache.cassandra.gms.FailureDetector; import org.apache.cassandra.locator.InetAddressAndPort; import org.apache.cassandra.net.Verb; import org.apache.cassandra.repair.RepairParallelism; +import org.apache.cassandra.repair.SharedContext; import org.apache.cassandra.repair.messages.RepairOption; import org.apache.cassandra.schema.ColumnMetadata; import org.apache.cassandra.schema.Schema; @@ -88,6 +84,8 @@ import org.apache.cassandra.service.paxos.Commit; import org.apache.cassandra.service.paxos.Paxos; import org.apache.cassandra.service.paxos.PaxosState; +import org.apache.cassandra.service.paxos.cleanup.PaxosCleanupLocalCoordinator; +import org.apache.cassandra.service.paxos.cleanup.PaxosCleanupResponse; import org.apache.cassandra.service.paxos.uncommitted.PaxosKeyState; import org.apache.cassandra.service.paxos.uncommitted.PaxosRows; import org.apache.cassandra.service.paxos.uncommitted.PaxosUncommittedTracker; @@ -98,6 +96,7 @@ import org.apache.cassandra.utils.CloseableIterator; import org.apache.cassandra.utils.FBUtilities; import org.apache.cassandra.utils.Pair; +import org.apache.cassandra.utils.Shared; import static java.util.concurrent.TimeUnit.SECONDS; import static org.apache.cassandra.config.CassandraRelevantProperties.AUTO_REPAIR_FREQUENCY_SECONDS; @@ -150,8 +149,9 @@ private static void repair(Cluster cluster, String keyspace, String table, boole options.put(RepairOption.FORCE_REPAIR_KEY, Boolean.toString(force)); options.put(RepairOption.PREVIEW, PreviewKind.NONE.toString()); options.put(RepairOption.IGNORE_UNREPLICATED_KS, Boolean.toString(false)); + options.put(RepairOption.REPAIR_DATA_KEY, Boolean.toString(false)); options.put(RepairOption.REPAIR_PAXOS_KEY, Boolean.toString(true)); - options.put(RepairOption.PAXOS_ONLY_KEY, Boolean.toString(true)); + options.put(RepairOption.REPAIR_ACCORD_KEY, Boolean.toString(false)); cluster.get(1).runOnInstance(() -> { int cmd = StorageService.instance.repairAsync(keyspace, options); diff --git a/test/distributed/org/apache/cassandra/distributed/test/PaxosRepairTest.java b/test/distributed/org/apache/cassandra/distributed/test/PaxosRepairTest.java index a2af4a213731..febffe529042 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/PaxosRepairTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/PaxosRepairTest.java @@ -19,7 +19,12 @@ package org.apache.cassandra.distributed.test; import java.net.InetSocketAddress; -import java.util.*; +import java.util.Collection; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.UUID; import java.util.concurrent.CountDownLatch; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; @@ -38,15 +43,23 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.cassandra.config.*; +import org.apache.cassandra.config.CassandraRelevantProperties; +import org.apache.cassandra.config.Config; +import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.cql3.QueryOptions; import org.apache.cassandra.cql3.QueryProcessor; import org.apache.cassandra.cql3.statements.SelectStatement; -import org.apache.cassandra.db.*; +import org.apache.cassandra.db.ColumnFamilyStore; +import org.apache.cassandra.db.DecoratedKey; +import org.apache.cassandra.db.Keyspace; +import org.apache.cassandra.db.ReadExecutionController; +import org.apache.cassandra.db.ReadQuery; +import org.apache.cassandra.db.SystemKeyspace; import org.apache.cassandra.db.compaction.CompactionManager; import org.apache.cassandra.db.marshal.Int32Type; import org.apache.cassandra.db.partitions.PartitionIterator; -import org.apache.cassandra.db.rows.*; +import org.apache.cassandra.db.rows.Row; +import org.apache.cassandra.db.rows.RowIterator; import org.apache.cassandra.distributed.Cluster; import org.apache.cassandra.distributed.Constants; import org.apache.cassandra.distributed.api.ConsistencyLevel; @@ -68,7 +81,9 @@ import org.apache.cassandra.service.ActiveRepairService; import org.apache.cassandra.service.ClientState; import org.apache.cassandra.service.StorageService; -import org.apache.cassandra.service.paxos.*; +import org.apache.cassandra.service.paxos.Ballot; +import org.apache.cassandra.service.paxos.Commit; +import org.apache.cassandra.service.paxos.PaxosState; import org.apache.cassandra.service.paxos.cleanup.PaxosCleanup; import org.apache.cassandra.service.paxos.uncommitted.PaxosRows; import org.apache.cassandra.streaming.PreviewKind; @@ -78,7 +93,12 @@ import org.apache.cassandra.tcm.membership.NodeVersion; import org.apache.cassandra.tcm.transformations.CustomTransformation; import org.apache.cassandra.tcm.transformations.ForceSnapshot; -import org.apache.cassandra.utils.*; +import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.CassandraVersion; +import org.apache.cassandra.utils.ExecutorUtils; +import org.apache.cassandra.utils.FBUtilities; +import org.apache.cassandra.utils.Pair; +import org.apache.cassandra.utils.TimeUUID; import static java.util.concurrent.TimeUnit.SECONDS; import static org.apache.cassandra.distributed.shared.AssertUtils.assertRows; @@ -167,8 +187,9 @@ private static void repair(Cluster cluster, String keyspace, String table, boole options.put(RepairOption.FORCE_REPAIR_KEY, Boolean.toString(force)); options.put(RepairOption.PREVIEW, PreviewKind.NONE.toString()); options.put(RepairOption.IGNORE_UNREPLICATED_KS, Boolean.toString(false)); + options.put(RepairOption.REPAIR_DATA_KEY, Boolean.toString(false)); options.put(RepairOption.REPAIR_PAXOS_KEY, Boolean.toString(true)); - options.put(RepairOption.PAXOS_ONLY_KEY, Boolean.toString(true)); + options.put(RepairOption.REPAIR_ACCORD_KEY, Boolean.toString(false)); cluster.get(1).runOnInstance(() -> { int cmd = StorageService.instance.repairAsync(keyspace, options); diff --git a/test/distributed/org/apache/cassandra/distributed/test/QueriesTableTest.java b/test/distributed/org/apache/cassandra/distributed/test/QueriesTableTest.java index b0c3902ad152..e0805620a745 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/QueriesTableTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/QueriesTableTest.java @@ -15,23 +15,26 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - package org.apache.cassandra.distributed.test; import java.io.IOException; +import java.util.Arrays; import java.util.concurrent.Callable; import java.util.concurrent.CyclicBarrier; -import java.util.concurrent.TimeUnit; -import net.bytebuddy.ByteBuddy; -import net.bytebuddy.dynamic.loading.ClassLoadingStrategy; -import net.bytebuddy.implementation.MethodDelegation; -import net.bytebuddy.implementation.bind.annotation.SuperCall; import org.junit.AfterClass; import org.junit.BeforeClass; import org.junit.Test; +import accord.impl.progresslog.DefaultProgressLogs; +import com.datastax.driver.core.ConsistencyLevel; import com.datastax.driver.core.Session; +import com.datastax.driver.core.SimpleStatement; +import com.datastax.driver.core.Statement; +import net.bytebuddy.ByteBuddy; +import net.bytebuddy.dynamic.loading.ClassLoadingStrategy; +import net.bytebuddy.implementation.MethodDelegation; +import net.bytebuddy.implementation.bind.annotation.SuperCall; import org.apache.cassandra.db.Keyspace; import org.apache.cassandra.db.Mutation; import org.apache.cassandra.db.ReadCommand; @@ -39,10 +42,15 @@ import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator; import org.apache.cassandra.distributed.Cluster; import org.apache.cassandra.distributed.api.Feature; +import org.apache.cassandra.distributed.api.IInvokableInstance; import org.apache.cassandra.distributed.api.Row; import org.apache.cassandra.distributed.api.SimpleQueryResult; +import org.apache.cassandra.service.consensus.TransactionalMode; import org.apache.cassandra.utils.Throwables; +import org.awaitility.Awaitility; +import static com.google.common.base.Preconditions.checkState; +import static java.util.concurrent.TimeUnit.SECONDS; import static net.bytebuddy.matcher.ElementMatchers.named; import static net.bytebuddy.matcher.ElementMatchers.takesArguments; import static org.junit.Assert.assertTrue; @@ -57,7 +65,9 @@ public class QueriesTableTest extends TestBaseImpl public static void createCluster() throws IOException { SHARED_CLUSTER = init(Cluster.build(1).withInstanceInitializer(QueryDelayHelper::install) - .withConfig(c -> c.with(Feature.NATIVE_PROTOCOL, Feature.GOSSIP)).start()); + .withConfig(c -> c.with(Feature.NATIVE_PROTOCOL, Feature.GOSSIP) + .set("write_request_timeout", "10s")).start()); + DRIVER_CLUSTER = JavaDriverUtils.create(SHARED_CLUSTER); SESSION = DRIVER_CLUSTER.connect(); } @@ -79,19 +89,31 @@ public static void closeCluster() public void shouldExposeReadsAndWrites() throws Throwable { SHARED_CLUSTER.schemaChange("CREATE TABLE " + KEYSPACE + ".tbl (k int primary key, v int)"); - - boolean readVisible = false; - boolean coordinatorReadVisible = false; - boolean writeVisible = false; - boolean coordinatorWriteVisible = false; - SESSION.executeAsync("INSERT INTO " + KEYSPACE + ".tbl (k, v) VALUES (0, 0)"); SESSION.executeAsync("SELECT * FROM " + KEYSPACE + ".tbl WHERE k = 0"); // Wait until the coordinator/local read and write are visible: + Awaitility.await() + .atMost(60, SECONDS) + .pollInterval(1, SECONDS) + .dontCatchUncaughtExceptions() + .untilAsserted(QueriesTableTest::assertReadsAndWritesVisible); + + // Issue another read and write to unblock the original queries in progress: + SESSION.execute("INSERT INTO " + KEYSPACE + ".tbl (k, v) VALUES (0, 0)"); + SESSION.execute("SELECT * FROM " + KEYSPACE + ".tbl WHERE k = 0"); + + waitForQueriesToFinish(); + } + + private static void assertReadsAndWritesVisible() + { SimpleQueryResult result = SHARED_CLUSTER.get(1).executeInternalWithResult("SELECT * FROM system_views.queries"); - while (result.toObjectArrays().length < 4) - result = SHARED_CLUSTER.get(1).executeInternalWithResult("SELECT * FROM system_views.queries"); + + boolean readVisible = false; + boolean coordinatorReadVisible = false; + boolean writeVisible = false; + boolean coordinatorWriteVisible = false; while (result.hasNext()) { @@ -105,32 +127,37 @@ public void shouldExposeReadsAndWrites() throws Throwable coordinatorWriteVisible |= threadId.contains("Native-Transport-Requests") && task.contains("INSERT"); } - // Issue another read and write to unblock the original queries in progress: - SESSION.execute("INSERT INTO " + KEYSPACE + ".tbl (k, v) VALUES (0, 0)"); - SESSION.execute("SELECT * FROM " + KEYSPACE + ".tbl WHERE k = 0"); - assertTrue(readVisible); assertTrue(coordinatorReadVisible); assertTrue(writeVisible); assertTrue(coordinatorWriteVisible); - - waitForQueriesToFinish(); } @Test public void shouldExposeCAS() throws Throwable { SHARED_CLUSTER.schemaChange("CREATE TABLE " + KEYSPACE + ".cas_tbl (k int primary key, v int)"); - - boolean readVisible = false; - boolean coordinatorUpdateVisible = false; - SESSION.executeAsync("UPDATE " + KEYSPACE + ".cas_tbl SET v = 10 WHERE k = 0 IF v = 0"); // Wait until the coordinator update and local read required by the CAS operation are visible: + Awaitility.await() + .atMost(60, SECONDS) + .pollInterval(1, SECONDS) + .dontCatchUncaughtExceptions() + .untilAsserted(QueriesTableTest::assertCasVisible); + + // Issue a read to unblock the read generated by the original CAS operation: + SESSION.executeAsync("SELECT * FROM " + KEYSPACE + ".cas_tbl WHERE k = 0"); + + waitForQueriesToFinish(); + } + + private static void assertCasVisible() + { SimpleQueryResult result = SHARED_CLUSTER.get(1).executeInternalWithResult("SELECT * FROM system_views.queries"); - while (result.toObjectArrays().length < 2) - result = SHARED_CLUSTER.get(1).executeInternalWithResult("SELECT * FROM system_views.queries"); + + boolean readVisible = false; + boolean coordinatorUpdateVisible = false; while (result.hasNext()) { @@ -142,26 +169,77 @@ public void shouldExposeCAS() throws Throwable coordinatorUpdateVisible |= threadId.contains("Native-Transport-Requests") && task.contains("UPDATE"); } - // Issue a read to unblock the read generated by the original CAS operation: - SESSION.executeAsync("SELECT * FROM " + KEYSPACE + ".cas_tbl WHERE k = 0"); - assertTrue(readVisible); assertTrue(coordinatorUpdateVisible); + } + + @Test + public void shouldExposeTransaction() throws Throwable + { + SHARED_CLUSTER.schemaChange("CREATE TABLE " + KEYSPACE + ".accord_tbl (k int primary key, v int) WITH " + TransactionalMode.mixed_reads.asCqlParam()); + + // Disable recovery to make sure only one local read occurs: + for (IInvokableInstance instance : SHARED_CLUSTER) + instance.runOnInstance(() -> DefaultProgressLogs.unsafePauseForTesting(true)); + + String update = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT * FROM " + KEYSPACE + ".accord_tbl WHERE k = 0);\n" + + " SELECT row1.k, row1.v;\n" + + " IF row1.v = 0 THEN\n" + + " UPDATE " + KEYSPACE + ".accord_tbl SET v = 10 WHERE k = 0;\n" + + " END IF\n" + + "COMMIT TRANSACTION"; + + Statement statement = new SimpleStatement(update); + statement.setConsistencyLevel(ConsistencyLevel.QUORUM); + SESSION.executeAsync(statement); + + // Wait until the coordinator update and local read required by the CAS operation are visible: + Awaitility.await() + .atMost(60, SECONDS) + .pollInterval(1, SECONDS) + .dontCatchUncaughtExceptions() + .untilAsserted(QueriesTableTest::assertTransactionVisible); + + // Issue a read to unblock the read generated by the original CAS operation: + SESSION.executeAsync("SELECT * FROM " + KEYSPACE + ".accord_tbl WHERE k = 0"); waitForQueriesToFinish(); } + private static void assertTransactionVisible() + { + SimpleQueryResult queries = SHARED_CLUSTER.get(1).executeInternalWithResult("SELECT * FROM system_views.queries"); + + boolean readVisible = false; + boolean coordinatorTxnVisible = false; + + while (queries.hasNext()) + { + Row row = queries.next(); + String threadId = row.get("thread_id").toString(); + String task = row.get("task").toString(); + + readVisible |= threadId.contains("Read") && task.contains("SELECT"); + coordinatorTxnVisible |= threadId.contains("Native-Transport-Requests") && task.contains("BEGIN TRANSACTION"); + } + + assertTrue(readVisible); + assertTrue(coordinatorTxnVisible); + } + private static void waitForQueriesToFinish() throws InterruptedException { // Continue to query the "queries" table until nothing is in progress... SimpleQueryResult result = SHARED_CLUSTER.get(1).executeInternalWithResult("SELECT * FROM system_views.queries"); while (result.hasNext()) { - TimeUnit.SECONDS.sleep(1); + SECONDS.sleep(1); result = SHARED_CLUSTER.get(1).executeInternalWithResult("SELECT * FROM system_views.queries"); } } + @SuppressWarnings("resource") public static class QueryDelayHelper { private static final CyclicBarrier readBarrier = new CyclicBarrier(2); @@ -169,12 +247,14 @@ public static class QueryDelayHelper static void install(ClassLoader cl, int nodeNumber) { + checkState(Arrays.stream(Mutation.class.getDeclaredMethods()).anyMatch(method -> method.getName().equals("apply") && method.getParameterCount() == 3)); new ByteBuddy().rebase(Mutation.class) .method(named("apply").and(takesArguments(3))) .intercept(MethodDelegation.to(QueryDelayHelper.class)) .make() .load(cl, ClassLoadingStrategy.Default.INJECTION); + checkState(Arrays.stream(ReadCommand.class.getDeclaredMethods()).anyMatch(method -> method.getName().equals("executeLocally") && method.getParameterCount() == 1)); new ByteBuddy().rebase(ReadCommand.class) .method(named("executeLocally").and(takesArguments(1))) .intercept(MethodDelegation.to(QueryDelayHelper.class)) diff --git a/test/distributed/org/apache/cassandra/distributed/test/ReadRepairTest.java b/test/distributed/org/apache/cassandra/distributed/test/ReadRepairTest.java index 6470a4d0c5d1..e9db6a9235f1 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/ReadRepairTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/ReadRepairTest.java @@ -23,9 +23,12 @@ import java.util.Map; import java.util.concurrent.Callable; import java.util.concurrent.ExecutionException; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import com.google.common.util.concurrent.FutureCallback; import org.junit.Assert; +import org.junit.Before; import org.junit.Ignore; import org.junit.Test; @@ -47,10 +50,13 @@ import org.apache.cassandra.distributed.api.Feature; import org.apache.cassandra.distributed.api.ICoordinator; import org.apache.cassandra.distributed.api.IInstanceConfig; +import org.apache.cassandra.distributed.api.IMessageFilters.Filter; import org.apache.cassandra.distributed.api.TokenSupplier; import org.apache.cassandra.distributed.shared.NetworkTopology; +import org.apache.cassandra.distributed.test.accord.AccordTestBase; import org.apache.cassandra.locator.Replica; import org.apache.cassandra.locator.ReplicaPlan; +import org.apache.cassandra.service.consensus.TransactionalMode; import org.apache.cassandra.service.reads.repair.BlockingReadRepair; import org.apache.cassandra.service.reads.repair.ReadRepairStrategy; import org.apache.cassandra.utils.concurrent.Condition; @@ -58,7 +64,6 @@ import org.checkerframework.checker.nullness.qual.Nullable; import static net.bytebuddy.matcher.ElementMatchers.named; - import static org.apache.cassandra.config.CassandraRelevantProperties.ALLOW_ALTER_RF_DURING_RANGE_MOVEMENT; import static org.apache.cassandra.db.Keyspace.open; import static org.apache.cassandra.distributed.api.ConsistencyLevel.ALL; @@ -71,17 +76,34 @@ import static org.apache.cassandra.net.Verb.READ_REQ; import static org.apache.cassandra.utils.Clock.Global.currentTimeMillis; import static org.apache.cassandra.utils.concurrent.Condition.newOneTimeCondition; +import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; public class ReadRepairTest extends TestBaseImpl { + private static int tableNum = 0; + private String tableName; + + private void incrementTableName() + { + tableName = "tbl" + tableNum++; + } + + @Before + public void setup() + { + incrementTableName(); + } + /** * Tests basic behaviour of read repair with {@code BLOCKING} read repair strategy. */ @Test public void testBlockingReadRepair() throws Throwable { - testReadRepair(ReadRepairStrategy.BLOCKING); + testReadRepair(ReadRepairStrategy.BLOCKING, false); + incrementTableName(); + testReadRepair(ReadRepairStrategy.BLOCKING, true); } /** * @@ -95,14 +117,19 @@ public void testNoneReadRepair() throws Throwable private void testReadRepair(ReadRepairStrategy strategy) throws Throwable { - try (Cluster cluster = init(Cluster.create(3))) - { - cluster.schemaChange(withKeyspace("CREATE TABLE %s.t (k int, c int, v int, PRIMARY KEY (k, c)) " + - String.format("WITH read_repair='%s'", strategy))); + testReadRepair(strategy, false); + } + + private void testReadRepair(ReadRepairStrategy strategy, boolean brrThroughAccord) throws Throwable { + try (Cluster cluster = init(Cluster.create(3, c -> c.with(Feature.GOSSIP, Feature.NETWORK)))) { + TransactionalMode transactionalMode = brrThroughAccord ? TransactionalMode.test_unsafe_writes : TransactionalMode.off; + cluster.schemaChange(withKeyspace("CREATE TABLE %s." + tableName + " (k int, c int, v int, PRIMARY KEY (k, c)) WITH transactional_mode='" + transactionalMode.toString().toLowerCase() + '\'' + + String.format(" AND read_repair='%s'", strategy))); + AccordTestBase.ensureTableIsAccordManaged(cluster, KEYSPACE, "t"); Object[] row = row(1, 1, 1); - String insertQuery = withKeyspace("INSERT INTO %s.t (k, c, v) VALUES (?, ?, ?)"); - String selectQuery = withKeyspace("SELECT * FROM %s.t WHERE k=1"); + String insertQuery = withKeyspace("INSERT INTO %s." + tableName + " (k, c, v) VALUES (?, ?, ?)"); + String selectQuery = withKeyspace("SELECT * FROM %s." + tableName + " WHERE k=1"); // insert data in two nodes, simulating a quorum write that has missed one node cluster.get(1).executeInternal(insertQuery, row); @@ -111,8 +138,11 @@ private void testReadRepair(ReadRepairStrategy strategy) throws Throwable // verify that the third node doesn't have the row assertRows(cluster.get(3).executeInternal(selectQuery)); - // read with CL=QUORUM to trigger read repair + // read with CL=QUORUM to trigger read repair, force 3 to be involved in the read so that read repair + // will occur + Filter blockReadFromOne = cluster.filters().inbound().from(3).to(1).verbs(READ_REQ.id).drop(); assertRows(cluster.coordinator(3).execute(selectQuery, QUORUM), row); + blockReadFromOne.off(); // verify whether the coordinator has the repaired row depending on the read repair strategy if (strategy == ReadRepairStrategy.NONE) @@ -125,33 +155,32 @@ private void testReadRepair(ReadRepairStrategy strategy) throws Throwable @Test public void readRepairTimeoutTest() throws Throwable { - final long reducedReadTimeout = 3000L; - try (Cluster cluster = init(builder().withNodes(3).start())) - { + try (Cluster cluster = init(Cluster.create(3, c -> c.with(Feature.GOSSIP, Feature.NETWORK)))) { + final long reducedReadTimeout = 3000L; cluster.forEach(i -> i.runOnInstance(() -> DatabaseDescriptor.setReadRpcTimeout(reducedReadTimeout))); - cluster.schemaChange("CREATE TABLE " + KEYSPACE + ".tbl (pk int, ck int, v int, PRIMARY KEY (pk, ck)) WITH read_repair='blocking'"); - cluster.get(1).executeInternal("INSERT INTO " + KEYSPACE + ".tbl (pk, ck, v) VALUES (1, 1, 1)"); - cluster.get(2).executeInternal("INSERT INTO " + KEYSPACE + ".tbl (pk, ck, v) VALUES (1, 1, 1)"); - assertRows(cluster.get(3).executeInternal("SELECT * FROM " + KEYSPACE + ".tbl WHERE pk = 1")); + cluster.schemaChange("CREATE TABLE " + KEYSPACE + "." + tableName + " (pk int, ck int, v int, PRIMARY KEY (pk, ck)) WITH read_repair='blocking'"); + cluster.get(1).executeInternal("INSERT INTO " + KEYSPACE + "." + tableName + " (pk, ck, v) VALUES (1, 1, 1)"); + cluster.get(2).executeInternal("INSERT INTO " + KEYSPACE + "." + tableName + " (pk, ck, v) VALUES (1, 1, 1)"); + assertRows(cluster.get(3).executeInternal("SELECT * FROM " + KEYSPACE + "." + tableName + " WHERE pk = 1")); cluster.verbs(READ_REPAIR_RSP).to(1).drop(); final long start = currentTimeMillis(); try { - cluster.coordinator(1).execute("SELECT * FROM " + KEYSPACE + ".tbl WHERE pk = 1", ConsistencyLevel.ALL); + cluster.coordinator(1).execute("SELECT * FROM " + KEYSPACE + "." + tableName + " WHERE pk = 1", ConsistencyLevel.ALL); fail("Read timeout expected but it did not occur"); } catch (Exception ex) { // the containing exception class was loaded by another class loader. Comparing the message as a workaround to assert the exception - Assert.assertTrue(ex.getClass().toString().contains("ReadTimeoutException")); + assertTrue(ex.getClass().toString().contains("ReadTimeoutException")); long actualTimeTaken = currentTimeMillis() - start; long magicDelayAmount = 100L; // it might not be the best way to check if the time taken is around the timeout value. // Due to the delays, the actual time taken from client perspective is slighly more than the timeout value - Assert.assertTrue(actualTimeTaken > reducedReadTimeout); + assertTrue(actualTimeTaken > reducedReadTimeout); // But it should not exceed too much - Assert.assertTrue(actualTimeTaken < reducedReadTimeout + magicDelayAmount); - assertRows(cluster.get(3).executeInternal("SELECT * FROM " + KEYSPACE + ".tbl WHERE pk = 1"), - row(1, 1, 1)); // the partition happened when the repaired node sending back ack. The mutation should be in fact applied. + assertTrue(actualTimeTaken < reducedReadTimeout + magicDelayAmount); + assertRows(cluster.get(3).executeInternal("SELECT * FROM " + KEYSPACE + "." + tableName + " WHERE pk = 1"), + row(1, 1, 1)); // the partition happened when the repaired node sending back ack. The mutation should be in fact applied. } } } @@ -161,20 +190,20 @@ public void failingReadRepairTest() throws Throwable { try (Cluster cluster = init(builder().withNodes(3).start())) { - cluster.schemaChange("CREATE TABLE " + KEYSPACE + ".tbl (pk int, ck int, v int, PRIMARY KEY (pk, ck)) WITH read_repair='blocking'"); + cluster.schemaChange("CREATE TABLE " + KEYSPACE + "." + tableName + " (pk int, ck int, v int, PRIMARY KEY (pk, ck)) WITH read_repair='blocking'"); for (int i = 1 ; i <= 2 ; ++i) - cluster.get(i).executeInternal("INSERT INTO " + KEYSPACE + ".tbl (pk, ck, v) VALUES (1, 1, 1)"); + cluster.get(i).executeInternal("INSERT INTO " + KEYSPACE + "." + tableName + " (pk, ck, v) VALUES (1, 1, 1)"); - assertRows(cluster.get(3).executeInternal("SELECT * FROM " + KEYSPACE + ".tbl WHERE pk = 1")); + assertRows(cluster.get(3).executeInternal("SELECT * FROM " + KEYSPACE + "." + tableName + " WHERE pk = 1")); cluster.filters().verbs(READ_REPAIR_REQ.id).to(3).drop(); - assertRows(cluster.coordinator(1).execute("SELECT * FROM " + KEYSPACE + ".tbl WHERE pk = 1", + assertRows(cluster.coordinator(1).execute("SELECT * FROM " + KEYSPACE + "." + tableName + " WHERE pk = 1", ConsistencyLevel.QUORUM), row(1, 1, 1)); // Data was not repaired - assertRows(cluster.get(3).executeInternal("SELECT * FROM " + KEYSPACE + ".tbl WHERE pk = 1")); + assertRows(cluster.get(3).executeInternal("SELECT * FROM " + KEYSPACE + "." + tableName + " WHERE pk = 1")); } } @@ -354,6 +383,8 @@ private void testRangeSliceQueryWithTombstones(boolean flush) throws Throwable @Test public void readRepairRTRangeMovementTest() throws IOException { + if (true) + return; ExecutorPlus es = ExecutorFactory.Global.executorFactory().sequential("query-executor"); String key = "test1"; try (Cluster cluster = init(Cluster.build() @@ -423,8 +454,11 @@ public void onFailure(Throwable t) {} catch (ExecutionException e) { Throwable cause = e.getCause(); - Assert.assertTrue("Expected a different error message, but got " + cause.getMessage(), - cause.getMessage().contains("INVALID_ROUTING from /127.0.0.2:7012")); + Matcher matcher = Pattern.compile("Operation failed - received (\\d+) responses and 1 failures: INVALID_ROUTING from /127.0.0.2:7012").matcher(cause.getMessage()); + assertTrue("Expected a different error message, but got " + cause.getMessage(), + matcher.matches()); + int responses = Integer.valueOf(matcher.group(1)); + assertTrue(responses >= 1 && responses <= 3); } catch (InterruptedException e) { diff --git a/test/distributed/org/apache/cassandra/distributed/test/ReadSpeculationTest.java b/test/distributed/org/apache/cassandra/distributed/test/ReadSpeculationTest.java index 445315f34388..98b98aa1e5af 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/ReadSpeculationTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/ReadSpeculationTest.java @@ -53,6 +53,7 @@ import org.apache.cassandra.locator.ReplicaPlan; import org.apache.cassandra.locator.ReplicaPlans; import org.apache.cassandra.service.CassandraDaemon; +import org.apache.cassandra.service.reads.ReadCoordinator; import org.apache.cassandra.transport.Dispatcher; import static net.bytebuddy.matcher.ElementMatchers.named; @@ -85,8 +86,9 @@ public void speculateTest() throws Throwable Keyspace keyspace = Keyspace.openIfExists(KEYSPACE); ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(TABLE); DecoratedKey dk = cfs.decorateKey(bytes(PK_VALUE)); - ReplicaPlan.ForTokenRead plan = ReplicaPlans.forRead(keyspace, dk.getToken(), null, - QUORUM, cfs.metadata().params.speculativeRetry); + ReplicaPlan.ForTokenRead plan = ReplicaPlans.forRead(keyspace, cfs.getTableId(), dk.getToken(), null, + QUORUM, cfs.metadata().params.speculativeRetry, + ReadCoordinator.DEFAULT); return plan.contacts().endpointList().stream().map(InetSocketAddress::getAddress).collect(Collectors.toList()); }, null); logger.info("Replicas provided in a read plan contacts: {}", readPlanEndpoints); diff --git a/test/distributed/org/apache/cassandra/distributed/test/RepairDigestTrackingTest.java b/test/distributed/org/apache/cassandra/distributed/test/RepairDigestTrackingTest.java index 1e5773c67a05..2e1cb4b16da5 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/RepairDigestTrackingTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/RepairDigestTrackingTest.java @@ -30,44 +30,45 @@ import com.google.common.util.concurrent.Uninterruptibles; import org.junit.Assert; - -import org.apache.cassandra.Util; -import org.apache.cassandra.concurrent.SEPExecutor; -import org.apache.cassandra.config.CassandraRelevantProperties; -import org.apache.cassandra.dht.Token; -import org.apache.cassandra.distributed.shared.WithProperties; -import org.apache.cassandra.locator.AbstractReplicationStrategy; -import org.apache.cassandra.locator.EndpointsForToken; -import org.apache.cassandra.locator.InetAddressAndPort; -import org.apache.cassandra.locator.ReplicaLayout; -import org.apache.cassandra.locator.ReplicaUtils; -import org.apache.cassandra.service.snapshot.SnapshotManager; -import org.apache.cassandra.tcm.ClusterMetadata; -import org.apache.cassandra.utils.Throwables; import org.junit.Test; import net.bytebuddy.ByteBuddy; import net.bytebuddy.dynamic.loading.ClassLoadingStrategy; import net.bytebuddy.implementation.MethodDelegation; import net.bytebuddy.implementation.bind.annotation.SuperCall; +import org.apache.cassandra.Util; +import org.apache.cassandra.concurrent.SEPExecutor; +import org.apache.cassandra.config.CassandraRelevantProperties; import org.apache.cassandra.db.ColumnFamilyStore; import org.apache.cassandra.db.Keyspace; import org.apache.cassandra.db.ReadCommand; import org.apache.cassandra.db.ReadExecutionController; import org.apache.cassandra.db.SinglePartitionReadCommand; import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator; +import org.apache.cassandra.dht.Token; import org.apache.cassandra.distributed.Cluster; import org.apache.cassandra.distributed.api.ConsistencyLevel; import org.apache.cassandra.distributed.api.IInvokableInstance; import org.apache.cassandra.distributed.api.IIsolatedExecutor; +import org.apache.cassandra.distributed.shared.WithProperties; import org.apache.cassandra.io.sstable.Descriptor; import org.apache.cassandra.io.sstable.format.SSTableReader; import org.apache.cassandra.io.sstable.format.StatsComponent; import org.apache.cassandra.io.sstable.metadata.StatsMetadata; +import org.apache.cassandra.locator.AbstractReplicationStrategy; +import org.apache.cassandra.locator.EndpointsForToken; +import org.apache.cassandra.locator.InetAddressAndPort; +import org.apache.cassandra.locator.ReplicaLayout; +import org.apache.cassandra.locator.ReplicaUtils; +import org.apache.cassandra.schema.TableId; import org.apache.cassandra.service.ActiveRepairService; import org.apache.cassandra.service.StorageProxy; import org.apache.cassandra.service.StorageProxy.LocalReadRunnable; +import org.apache.cassandra.service.reads.ReadCoordinator; +import org.apache.cassandra.service.snapshot.SnapshotManager; +import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.utils.DiagnosticSnapshotService; +import org.apache.cassandra.utils.Throwables; import static net.bytebuddy.matcher.ElementMatchers.named; import static net.bytebuddy.matcher.ElementMatchers.takesArguments; @@ -380,7 +381,7 @@ else if (ccAfterPartitionRead != ccBefore) * local reads triggered by read repair (after speculative reads) execute at roughly the same time. * * This test depends on whether node1 gets a data or a digest request first, we force it to be a digest request - * in the forTokenReadLiveSorted ByteBuddy rule below. + * in the forTokenReadSorted ByteBuddy rule below. */ @Test public void testLocalDataAndRemoteRequestConcurrency() throws Exception @@ -440,7 +441,7 @@ public static void install(ClassLoader classLoader, Integer num) .load(classLoader, ClassLoadingStrategy.Default.INJECTION); new ByteBuddy().rebase(ReplicaLayout.class) - .method(named("forTokenReadLiveSorted").and(takesArguments(ClusterMetadata.class, Keyspace.class, AbstractReplicationStrategy.class, Token.class))) + .method(named("forTokenReadSorted").and(takesArguments(ClusterMetadata.class, Keyspace.class, AbstractReplicationStrategy.class, TableId.class, Token.class, ReadCoordinator.class))) .intercept(MethodDelegation.to(BBHelper.class)) .make() .load(classLoader, ClassLoadingStrategy.Default.INJECTION); @@ -475,7 +476,7 @@ public static UnfilteredPartitionIterator executeLocally(ReadExecutionController } @SuppressWarnings({ "unused" }) - public static ReplicaLayout.ForTokenRead forTokenReadLiveSorted(ClusterMetadata metadata, Keyspace keyspace, AbstractReplicationStrategy replicationStrategy, Token token) + public static ReplicaLayout.ForTokenRead forTokenReadSorted(ClusterMetadata metadata, Keyspace keyspace, AbstractReplicationStrategy replicationStrategy, TableId tableId, Token token, org.apache.cassandra.service.reads.ReadCoordinator coordinator) { try { diff --git a/test/distributed/org/apache/cassandra/distributed/test/ReprepareFuzzTest.java b/test/distributed/org/apache/cassandra/distributed/test/ReprepareFuzzTest.java index f56847f68c9a..9988234743aa 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/ReprepareFuzzTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/ReprepareFuzzTest.java @@ -43,6 +43,7 @@ import net.bytebuddy.dynamic.DynamicType; import net.bytebuddy.dynamic.loading.ClassLoadingStrategy; import net.bytebuddy.implementation.MethodDelegation; +import org.apache.cassandra.cql3.QueryHandler.Prepared; import org.apache.cassandra.cql3.QueryProcessor; import org.apache.cassandra.db.SystemKeyspace; import org.apache.cassandra.distributed.api.ConsistencyLevel; @@ -226,9 +227,10 @@ public void fuzzTest() throws Throwable case CLEAR_CACHES: c.get(1).runOnInstance(() -> { SystemKeyspace.loadPreparedStatements((id, query, keyspace) -> { + Prepared prepared = QueryProcessor.instance.getPrepared(id); if (rng.nextBoolean()) QueryProcessor.instance.evictPrepared(id); - return true; + return prepared; }); }); break; diff --git a/test/distributed/org/apache/cassandra/distributed/test/ResourceLeakTest.java b/test/distributed/org/apache/cassandra/distributed/test/ResourceLeakTest.java index 03cb62d5cba6..b0dd6c7baa0b 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/ResourceLeakTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/ResourceLeakTest.java @@ -282,6 +282,7 @@ public void looperJmxTest() throws Throwable * Depending on the type of leak, we may need to actually exercise functionality like JMX or Native * beyond just enabling the feature, so we use the "everything" test even though it may take longer to run. */ + @Ignore @Test public void looperEverythingTest() throws Throwable { diff --git a/test/distributed/org/apache/cassandra/distributed/test/SSTableIdGenerationTest.java b/test/distributed/org/apache/cassandra/distributed/test/SSTableIdGenerationTest.java index c45108eea3a8..75b0806899d9 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/SSTableIdGenerationTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/SSTableIdGenerationTest.java @@ -36,6 +36,7 @@ import org.apache.cassandra.cql3.UntypedResultSet; import org.apache.cassandra.db.ColumnFamilyStore; +import org.apache.cassandra.db.Keyspace; import org.apache.cassandra.db.SystemKeyspace; import org.apache.cassandra.db.compaction.AbstractCompactionStrategy; import org.apache.cassandra.db.compaction.LeveledCompactionStrategy; @@ -144,8 +145,14 @@ public void testRestartWithUUIDDisabled() throws IOException .withConfig(config -> config.set(ENABLE_UUID_FIELD_NAME, true)) .start())) { - cluster.disableAutoCompaction(KEYSPACE); cluster.schemaChange(createTableStmt(KEYSPACE, "tbl", null)); + for (IInvokableInstance instance : cluster) + { + instance.runOnInstance(() -> { + for (ColumnFamilyStore cs : Keyspace.open(KEYSPACE).getColumnFamilyStores()) + cs.disableAutoCompaction(); + }); + } createSSTables(cluster.get(1), KEYSPACE, "tbl", 1, 2); assertSSTablesCount(cluster.get(1), 0, 2, KEYSPACE, "tbl"); verfiySSTableActivity(cluster, false); diff --git a/test/distributed/org/apache/cassandra/distributed/test/ShortReadProtectionTest.java b/test/distributed/org/apache/cassandra/distributed/test/ShortReadProtectionTest.java index 2e26659243a6..2586531a4fec 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/ShortReadProtectionTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/ShortReadProtectionTest.java @@ -28,6 +28,7 @@ import java.util.function.Function; import java.util.stream.IntStream; +import com.google.common.collect.ImmutableList; import org.junit.After; import org.junit.AfterClass; import org.junit.Before; @@ -41,12 +42,14 @@ import org.apache.cassandra.distributed.api.ConsistencyLevel; import org.apache.cassandra.distributed.api.IInvokableInstance; import org.apache.cassandra.distributed.shared.AssertUtils; +import org.apache.cassandra.service.consensus.TransactionalMode; import org.apache.cassandra.utils.ByteBufferUtil; import static com.google.common.collect.Iterators.toArray; import static java.lang.String.format; import static org.apache.cassandra.distributed.api.ConsistencyLevel.ALL; import static org.apache.cassandra.distributed.api.ConsistencyLevel.QUORUM; +import static org.apache.cassandra.distributed.api.ConsistencyLevel.SERIAL; import static org.apache.cassandra.distributed.shared.AssertUtils.row; /** @@ -81,23 +84,32 @@ public class ShortReadProtectionTest extends TestBaseImpl @Parameterized.Parameter(2) public boolean paging; - @Parameterized.Parameters(name = "{index}: read_cl={0} flush={1} paging={2}") + @Parameterized.Parameter(3) + public TransactionalMode transactionalMode; + + @Parameterized.Parameters(name = "{index}: read_cl={0} flush={1} paging={2}, transactionalMode={3}") public static Collection data() { List result = new ArrayList<>(); - for (ConsistencyLevel readConsistencyLevel : Arrays.asList(ALL, QUORUM)) - for (boolean flush : BOOLEANS) - for (boolean paging : BOOLEANS) - result.add(new Object[]{ readConsistencyLevel, flush, paging }); + for (TransactionalMode mode : ImmutableList.of(TransactionalMode.test_interop_read, TransactionalMode.off)) + for (ConsistencyLevel readConsistencyLevel : Arrays.asList(ALL, QUORUM, SERIAL)) + for (boolean flush : BOOLEANS) + for (boolean paging : BOOLEANS) + result.add(new Object[]{ readConsistencyLevel, flush, paging, mode}); return result; } @BeforeClass public static void setupCluster() throws IOException { + // TODO this blocks some of the original testing of SRP invoking BRR since it is BRRing through Accord + // but maybe that is out of scope and is covered by the dedicated BRR tests? cluster = init(Cluster.build() .withNodes(NUM_NODES) - .withConfig(config -> config.set("hinted_handoff_enabled", false)) + .withConfig(config -> + config.set("hinted_handoff_enabled", false) + .set("accord.shard_durability_target_splits", 4) + ) .start()); } @@ -111,7 +123,7 @@ public static void teardownCluster() @Before public void setupTester() { - tester = new Tester(readConsistencyLevel, flush, paging); + tester = new Tester(readConsistencyLevel, flush, paging, transactionalMode); } @After @@ -416,24 +428,35 @@ private static class Tester private final ConsistencyLevel readConsistencyLevel; private final boolean flush, paging; + private final String table; private final String qualifiedTableName; + private final TransactionalMode transactionalMode; private boolean flushed = false; - private Tester(ConsistencyLevel readConsistencyLevel, boolean flush, boolean paging) + private Tester(ConsistencyLevel readConsistencyLevel, boolean flush, boolean paging, TransactionalMode transactionalMode) { this.readConsistencyLevel = readConsistencyLevel; this.flush = flush; this.paging = paging; - qualifiedTableName = KEYSPACE + ".t_" + seqNumber.getAndIncrement(); + this.table = "t_" + seqNumber.getAndIncrement(); + qualifiedTableName = KEYSPACE + '.' + table; + this.transactionalMode = transactionalMode; - assert readConsistencyLevel == ALL || readConsistencyLevel == QUORUM + assert readConsistencyLevel == ALL || readConsistencyLevel == QUORUM || readConsistencyLevel == SERIAL : "Only ALL and QUORUM consistency levels are supported"; } private Tester createTable(String query) { - cluster.schemaChange(format(query) + " WITH read_repair='NONE'"); + String formattedQuery = format(query) + " WITH read_repair='NONE'"; + if (transactionalMode != TransactionalMode.off) + { + // For test purposes we create the table and in an interop mode that forces interop reads so + // testing short reads is trivial + formattedQuery = formattedQuery + " AND " + transactionalMode.asCqlParam(); + } + cluster.schemaChange(formattedQuery); return this; } @@ -485,12 +508,12 @@ private Tester toNode3(String... queries) /** * Internally runs the specified write queries in the specified node. If the {@link #readConsistencyLevel} is - * QUORUM the write will also be internally done in the next replica in the ring, to simulate a QUORUM write. + * QUORUM/SERIAL the write will also be internally done in the next replica in the ring, to simulate a QUORUM/SERIAL write. */ private Tester toNode(int node, String... queries) { IInvokableInstance replica = cluster.get(node); - IInvokableInstance nextReplica = readConsistencyLevel == QUORUM + IInvokableInstance nextReplica = (readConsistencyLevel == QUORUM || readConsistencyLevel == SERIAL) ? cluster.get(node == NUM_NODES ? 1 : node + 1) : null; diff --git a/test/distributed/org/apache/cassandra/distributed/test/SlowQueriesAppenderTest.java b/test/distributed/org/apache/cassandra/distributed/test/SlowQueriesAppenderTest.java new file mode 100644 index 000000000000..d8b6b3f00f40 --- /dev/null +++ b/test/distributed/org/apache/cassandra/distributed/test/SlowQueriesAppenderTest.java @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.distributed.test; + +import org.junit.Test; + +import org.apache.cassandra.db.virtual.SlowQueriesTable; +import org.apache.cassandra.distributed.Cluster; +import org.apache.cassandra.distributed.Constants; +import org.apache.cassandra.distributed.api.Feature; +import org.apache.cassandra.distributed.shared.WithProperties; +import org.apache.cassandra.schema.SchemaConstants; +import org.apache.cassandra.utils.logging.SlowQueriesAppender; + +import static java.lang.String.format; +import static org.apache.cassandra.config.CassandraRelevantProperties.LOGBACK_CONFIGURATION_FILE; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.fail; + +/** + * It is inherently tricky / flaky to make some queries to be slow so we just test + * the invalid configuration otherwise the table as such is tested in {@link org.apache.cassandra.db.virtual.SlowQueriesTableTest}. + */ +public class SlowQueriesAppenderTest extends AbstractVirtualLogsTableTest +{ + @Test + public void testMultipleAppendersFailToStartNode() throws Throwable + { + LOGBACK_CONFIGURATION_FILE.setString("test/conf/logback-dtest_with_slow_query_appender_invalid.xml"); + + // NOTE: Because cluster startup is expected to fail in this case, and can leave things in a weird state + // for the next state, create without starting, and set failure as shutdown to false, + // so the try-with-resources can close instances properly. + try (WithProperties properties = new WithProperties().set(LOGBACK_CONFIGURATION_FILE, "test/conf/logback-dtest_with_slow_query_appender_invalid.xml"); + Cluster cluster = Cluster.build(1) + .withConfig(c -> c.with(Feature.values()) + .set(Constants.KEY_DTEST_API_STARTUP_FAILURE_AS_SHUTDOWN, false)) + .createWithoutStarting()) + { + cluster.startup(); + fail("Node should not start as there is supposed to be invalid logback configuration file."); + } + catch (IllegalStateException ex) + { + assertEquals(format("There are multiple appenders of class %s " + + "of names SLOW_QUERIES_APPENDER,SLOW_QUERIES_APPENDER_2. There is only one appender of such class allowed.", + SlowQueriesAppender.class.getName()), + ex.getMessage()); + } + } + + @Override + public String getTableName() + { + return format("%s.%s", SchemaConstants.VIRTUAL_VIEWS, SlowQueriesTable.TABLE_NAME); + } +} diff --git a/test/distributed/org/apache/cassandra/distributed/test/SlowQueryDeserTest.java b/test/distributed/org/apache/cassandra/distributed/test/SlowQueryDeserTest.java new file mode 100644 index 000000000000..ae5bcc966cf1 --- /dev/null +++ b/test/distributed/org/apache/cassandra/distributed/test/SlowQueryDeserTest.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.distributed.test; + +import java.util.Collection; +import java.util.List; + +import org.junit.Test; + +import org.apache.cassandra.db.monitoring.MonitorableImpl; +import org.apache.cassandra.db.monitoring.MonitoringTask; +import org.apache.cassandra.db.monitoring.MonitoringTask.SlowOperation; +import org.apache.cassandra.utils.Clock; + +public class SlowQueryDeserTest +{ + @Test + public void testSlowQueryDeser() throws Throwable + { + SlowOperation slowOperation = new SlowOperation(new MonitorableImpl() + { + @Override + public String name() + { + return String.format("select * from %s.%s where id = 5", monitoredOnKeyspace(), monitoredOnTable()); + } + + @Override + public String monitoredOnKeyspace() + { + return "ks"; + } + + @Override + public String monitoredOnTable() + { + return "tb"; + } + + @Override + public boolean isCrossNode() + { + return true; + } + }, Clock.Global.currentTimeMillis()); + + String serialize = MonitoringTask.Operation.serialize(List.of(slowOperation)); + Collection deserialize = MonitoringTask.Operation.deserialize(serialize); + } +} diff --git a/test/distributed/org/apache/cassandra/distributed/test/SnapshotsTest.java b/test/distributed/org/apache/cassandra/distributed/test/SnapshotsTest.java index 7227d6613f85..4d2c42bcb604 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/SnapshotsTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/SnapshotsTest.java @@ -24,6 +24,7 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.List; +import java.util.UUID; import java.util.regex.Pattern; import org.junit.After; @@ -43,6 +44,9 @@ import org.apache.cassandra.distributed.shared.WithProperties; import org.apache.cassandra.io.util.File; import org.apache.cassandra.service.snapshot.SnapshotManager; +import org.apache.cassandra.service.snapshot.SnapshotManifest; +import org.apache.cassandra.service.snapshot.SnapshotOptions; +import org.apache.cassandra.service.snapshot.SnapshotType; import org.apache.cassandra.utils.Clock; import org.apache.cassandra.utils.FBUtilities; @@ -199,7 +203,7 @@ public void testMissingManifestIsCreatedOnStartupWithEnrichmentEnabled() cluster.get(1).shutdown(true); // remove manifest only in the first data dir - removeAllManifests(new String[] {dataDirs[0]}, paths); + removeAllManifests(new String[]{ dataDirs[0]}, paths); // they will be still created for that first dir cluster.get(1).startup(); @@ -555,6 +559,90 @@ public void testListingOfSnapshotsByKeyspaceAndTable() assertTrue(snapshots.get(0).contains("tagks1tbl")); } + @Test + public void testForcedSnapshot() throws Throwable + { + try (Cluster cluster = init(Cluster.build(1) + .withDataDirCount(3) // 3 dirs to disperse SSTables among different dirs + .start())) + { + cluster.schemaChange("CREATE TABLE " + KEYSPACE + ".tbl (pk uuid primary key)"); + + cluster.get(1).runOnInstance((IIsolatedExecutor.SerializableRunnable) () -> { + Keyspace.open("distributed_test_keyspace").getColumnFamilyStore("tbl").disableAutoCompaction(); + }); + + for (int i = 0; i < 10; i++) + { + cluster.get(1).executeInternal("INSERT INTO " + KEYSPACE + ".tbl (pk) values (?)", UUID.randomUUID()); + cluster.get(1).flush(KEYSPACE); + } + + takeEphemeralSnapshotWithSameName(cluster); + List manifests1 = getManifests(cluster); + List ssTablesFromManifest1 = getSSTablesFromManifest(manifests1.get(0)); + + for (int i = 0; i < 10; i++) + { + cluster.get(1).executeInternal("INSERT INTO " + KEYSPACE + ".tbl (pk) values (?)", UUID.randomUUID()); + cluster.get(1).flush(KEYSPACE); + } + takeEphemeralSnapshotWithSameName(cluster); + List manifests2 = getManifests(cluster); + List ssTablesFromManifest2 = getSSTablesFromManifest(manifests2.get(0)); + + assertEquals(manifests1, manifests2); + assertTrue(ssTablesFromManifest1.size() < ssTablesFromManifest2.size()); + assertTrue(ssTablesFromManifest2.containsAll(ssTablesFromManifest1)); + } + } + + private List getSSTablesFromManifest(File manifest) throws Throwable + { + SnapshotManifest snapshotManifest = SnapshotManifest.deserializeFromJsonFile(manifest); + return snapshotManifest.getFiles(); + } + + private List getManifests(Cluster cluster) + { + List manifestsPaths = cluster.get(1).callOnInstance((SerializableCallable>) () -> { + ColumnFamilyStore cfs = Keyspace.open("distributed_test_keyspace").getColumnFamilyStore("tbl"); + + List allManifests = new ArrayList<>(); + for (File file : cfs.getDirectories().getSnapshotDirsWithoutCreation("a_snapshot")) + { + File maybeManifest = new File(file, "manifest.json"); + if (maybeManifest.exists()) + allManifests.add(maybeManifest.absolutePath()); + } + + assertEquals(3, allManifests.size()); // 3 because 3 data dirs + return allManifests; + }); + + List manifests = new ArrayList<>(); + for (String manifest : manifestsPaths) + manifests.add(new File(manifest)); + + return manifests; + } + + private void takeEphemeralSnapshotWithSameName(Cluster cluster) + { + cluster.get(1).runOnInstance((IIsolatedExecutor.SerializableRunnable) () -> { + try + { + SnapshotManager.instance.takeSnapshot(SnapshotOptions.systemSnapshot("a_snapshot", SnapshotType.REPAIR, (r) -> true, "distributed_test_keyspace.tbl") + .ephemeral() + .build()); + } + catch (Throwable t) + { + throw new RuntimeException(t); + } + }); + } + private void populate(Cluster cluster) { for (int i = 0; i < 100; i++) diff --git a/test/distributed/org/apache/cassandra/distributed/test/TableLevelIncrementalBackupsTest.java b/test/distributed/org/apache/cassandra/distributed/test/TableLevelIncrementalBackupsTest.java index 2bf6c5f35068..833b9484d051 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/TableLevelIncrementalBackupsTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/TableLevelIncrementalBackupsTest.java @@ -141,12 +141,6 @@ private void flush(Cluster cluster, String keyspace) cluster.get(i).flush(keyspace); } - private void disableCompaction(Cluster cluster, String keyspace, String table) - { - for (int i = 1; i < cluster.size() + 1; i++) - cluster.get(i).nodetool("disableautocompaction", keyspace, table); - } - private static void assertBackupSSTablesCount(Cluster cluster, int expectedTablesCount, boolean enable, String ks, String... tableNames) { for (int i = 1; i < cluster.size() + 1; i++) diff --git a/test/distributed/org/apache/cassandra/distributed/test/TestBaseImpl.java b/test/distributed/org/apache/cassandra/distributed/test/TestBaseImpl.java index f093410a6280..4a0b54c386d1 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/TestBaseImpl.java +++ b/test/distributed/org/apache/cassandra/distributed/test/TestBaseImpl.java @@ -21,6 +21,7 @@ import java.math.BigDecimal; import java.math.BigInteger; import java.net.InetAddress; +import java.net.InetSocketAddress; import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.Arrays; @@ -29,16 +30,24 @@ import java.util.List; import java.util.Map; import java.util.UUID; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.atomic.AtomicLong; import java.util.function.Function; import java.util.stream.Collectors; import com.google.common.collect.ImmutableSet; +import com.google.common.util.concurrent.ListenableFuture; +import com.google.common.util.concurrent.ListenableFutureTask; import org.junit.After; import org.junit.BeforeClass; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.apache.cassandra.config.CassandraRelevantProperties; import org.apache.cassandra.cql3.Duration; import org.apache.cassandra.db.marshal.AbstractType; import org.apache.cassandra.db.marshal.BooleanType; +import org.apache.cassandra.db.marshal.ByteBufferAccessor; import org.apache.cassandra.db.marshal.ByteType; import org.apache.cassandra.db.marshal.BytesType; import org.apache.cassandra.db.marshal.DecimalType; @@ -56,31 +65,74 @@ import org.apache.cassandra.db.marshal.UUIDType; import org.apache.cassandra.distributed.Cluster; import org.apache.cassandra.distributed.api.ICluster; +import org.apache.cassandra.distributed.api.ICoordinator; +import org.apache.cassandra.distributed.api.IInstance; import org.apache.cassandra.distributed.api.IInstanceConfig; import org.apache.cassandra.distributed.api.IInvokableInstance; +import org.apache.cassandra.distributed.api.IMessage; +import org.apache.cassandra.distributed.api.IMessageSink; +import org.apache.cassandra.distributed.api.NodeToolResult; import org.apache.cassandra.distributed.shared.DistributedTestBase; +import org.apache.cassandra.net.Verb; +import org.apache.cassandra.service.accord.AccordCache; +import static java.lang.System.currentTimeMillis; +import static java.util.concurrent.TimeUnit.MILLISECONDS; import static org.apache.cassandra.config.CassandraRelevantProperties.JOIN_RING; import static org.apache.cassandra.config.CassandraRelevantProperties.RESET_BOOTSTRAP_PROGRESS; import static org.apache.cassandra.config.CassandraRelevantProperties.SKIP_GC_INSPECTOR; import static org.apache.cassandra.distributed.action.GossipHelper.withProperty; +import static org.assertj.core.api.Assertions.fail; // checkstyle: suppress below 'blockSystemPropertyUsage' public class TestBaseImpl extends DistributedTestBase { + private static final Logger logger = LoggerFactory.getLogger(TestBaseImpl.class); + public static final Object[][] EMPTY_ROWS = new Object[0][]; public static final boolean[] BOOLEANS = new boolean[]{ false, true }; + private static final AtomicLong ZERO = new AtomicLong(); + protected static final Map messageCounts = new ConcurrentHashMap<>(); + + protected static class MessageCountingSink implements IMessageSink + { + private final Cluster cluster; + + public MessageCountingSink(Cluster cluster) + { + this.cluster = cluster; + } + + @Override + public void accept(InetSocketAddress to, IMessage message) + { + Verb verb = Verb.fromId(message.verb()); + logger.debug("verb {} to {} message {}", verb, to, message); + messageCounts.computeIfAbsent(verb, ignored -> new AtomicLong()).incrementAndGet(); + cluster.get(to).receiveMessage(message); + } + } + + // Only works if MessageCountingSink is set on the cluster + public static long messageCount(Verb verb) + { + return messageCounts.getOrDefault(verb, ZERO).get(); + } + @After public void afterEach() { super.afterEach(); + messageCounts.clear(); } @BeforeClass public static void beforeClass() throws Throwable { + CassandraRelevantProperties.SIMULATOR_STARTED.setString(Long.toString(MILLISECONDS.toSeconds(currentTimeMillis()))); ICluster.setup(); SKIP_GC_INSPECTOR.setBoolean(true); + AccordCache.validateLoadOnEvict(true); } @Override @@ -130,13 +182,20 @@ public static ByteBuffer tuple(Object... values) bbs.add(value == null ? null : type.decompose(value)); } TupleType tupleType = new TupleType(types); - return tupleType.pack(bbs); + return tupleType.pack(bbs, ByteBufferAccessor.instance); + } + + public static String unloggedBatch(String... queries) + { + return batch(false, queries); } - public static String batch(String... queries) + public static String batch(boolean logged, String... queries) { StringBuilder sb = new StringBuilder(); - sb.append("BEGIN UNLOGGED BATCH\n"); + sb.append("BEGIN "); + sb.append(logged ? "" : "UNLOGGED "); + sb.append("BATCH\n"); for (String q : queries) sb.append(q).append(";\n"); sb.append("APPLY BATCH;"); @@ -240,4 +299,65 @@ public static void fixDistributedSchemas(Cluster cluster) // in real live repair is needed in this case, but in the test case it doesn't matter if the tables loose // anything, so ignoring repair to speed up the tests. } + + protected static void disableCompaction(Cluster cluster, String keyspace, String table) + { + for (int i = 1; i < cluster.size() + 1; i++) + cluster.get(i).nodetool("disableautocompaction", keyspace, table); + } + + public static String nodetool(IInstance instance, String... commandAndArgs) + { + NodeToolResult nodetoolResult = instance.nodetoolResult(commandAndArgs); + if (!nodetoolResult.getStdout().isEmpty()) + System.out.println(nodetoolResult.getStdout()); + if (!nodetoolResult.getStderr().isEmpty()) + System.err.println(nodetoolResult.getStderr()); + if (nodetoolResult.getError() != null) + fail("Failed nodetool " + Arrays.asList(commandAndArgs), nodetoolResult.getError()); + // TODO why does standard out end up in stderr in nodetool? + return nodetoolResult.getStdout(); + } + + public static String nodetool(ICoordinator coordinator, String... commandAndArgs) + { + return nodetool(coordinator.instance(), commandAndArgs); + } + + public static ListenableFuture nodetoolAsync(IInstance instance, String... commandAndArgs) + { + return nodetoolAsync(instance.coordinator(), commandAndArgs); + } + + public static ListenableFuture nodetoolAsync(ICoordinator coordinator, String... commandAndArgs) + { + ListenableFutureTask task = ListenableFutureTask.create(() -> nodetool(coordinator, commandAndArgs)); + Thread asyncThread = new Thread(task, "NodeTool: " + Arrays.asList(commandAndArgs)); + asyncThread.setDaemon(true); + asyncThread.start(); + return task; + } + + /** + * @see org.apache.cassandra.cql3.CQLTester#wrapInTxn(String...) + */ + protected static String wrapInTxn(String... stmts) + { + return wrapInTxn(Arrays.asList(stmts)); + } + + protected static String wrapInTxn(List stmts) + { + StringBuilder sb = new StringBuilder(); + sb.append("BEGIN TRANSACTION\n"); + for (String stmt : stmts) + { + sb.append('\t').append(stmt); + if (!stmt.endsWith(";")) + sb.append(';'); + sb.append('\n'); + } + sb.append("COMMIT TRANSACTION"); + return sb.toString(); + } } diff --git a/test/distributed/org/apache/cassandra/distributed/test/VirtualTableLogsTest.java b/test/distributed/org/apache/cassandra/distributed/test/VirtualTableLogsTest.java index 71ef4dbe7899..bf9f58123e83 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/VirtualTableLogsTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/VirtualTableLogsTest.java @@ -25,7 +25,6 @@ import ch.qos.logback.classic.Level; import org.apache.cassandra.db.virtual.LogMessagesTable; -import org.apache.cassandra.db.virtual.LogMessagesTable.LogMessage; import org.apache.cassandra.distributed.Cluster; import org.apache.cassandra.distributed.Constants; import org.apache.cassandra.distributed.api.Feature; @@ -47,8 +46,14 @@ import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; -public class VirtualTableLogsTest extends TestBaseImpl +public class VirtualTableLogsTest extends AbstractVirtualLogsTableTest { + @Override + public String getTableName() + { + return format("%s.%s", SchemaConstants.VIRTUAL_VIEWS, LogMessagesTable.TABLE_NAME); + } + @Test public void testVTableOutput() throws Throwable { @@ -56,9 +61,9 @@ public void testVTableOutput() throws Throwable Cluster cluster = Cluster.build(1) .withConfig(c -> c.with(Feature.values())) .start(); - ) + ) { - List rows = getRows(cluster); + List rows = getRows(cluster); assertFalse(rows.isEmpty()); rows.forEach(message -> assertTrue(Level.toLevel(message.level).isGreaterOrEqual(Level.INFO))); @@ -91,39 +96,36 @@ public void testMultipleAppendersFailToStartNode() throws Throwable } } - private List getRows(Cluster cluster) + private List getRows(Cluster cluster) { SimpleQueryResult simpleQueryResult = cluster.coordinator(1).executeWithResult(query("select * from %s"), ONE); - List rows = new ArrayList<>(); + List rows = new ArrayList<>(); simpleQueryResult.forEachRemaining(row -> { long timestamp = row.getTimestamp(TIMESTAMP_COLUMN_NAME).getTime(); String logger = row.getString(LOGGER_COLUMN_NAME); String level = row.getString(LEVEL_COLUMN_NAME); String message = row.getString(MESSAGE_COLUMN_NAME); int order = row.getInteger(ORDER_IN_MILLISECOND_COLUMN_NAME); - TestingLogMessage logMessage = new TestingLogMessage(timestamp, logger, level, message, order); + LogMessage logMessage = new LogMessage(timestamp, logger, level, message, order); rows.add(logMessage); }); return rows; } - private String query(String template) - { - return format(template, getTableName()); - } - - private String getTableName() - { - return format("%s.%s", SchemaConstants.VIRTUAL_VIEWS, LogMessagesTable.TABLE_NAME); - } - - private static class TestingLogMessage extends LogMessage + private static class LogMessage { - private int order; + public final long timestamp; + public final String logger; + public final String level; + public final String message; + public final int order; - public TestingLogMessage(long timestamp, String logger, String level, String message, int order) + public LogMessage(long timestamp, String logger, String level, String message, int order) { - super(timestamp, logger, level, message); + this.timestamp = timestamp; + this.logger = logger; + this.level = level; + this.message = message; this.order = order; } } diff --git a/test/distributed/org/apache/cassandra/distributed/test/accord/AccordBootstrapTest.java b/test/distributed/org/apache/cassandra/distributed/test/accord/AccordBootstrapTest.java new file mode 100644 index 000000000000..30f4346cf80e --- /dev/null +++ b/test/distributed/org/apache/cassandra/distributed/test/accord/AccordBootstrapTest.java @@ -0,0 +1,496 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.distributed.test.accord; + +import java.net.InetAddress; +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.TimeUnit; +import java.util.function.Consumer; + +import org.junit.Assert; +import org.junit.Test; + +import accord.local.PreLoadContext; +import accord.primitives.Timestamp; +import accord.topology.TopologyManager; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.cql3.QueryProcessor; +import org.apache.cassandra.cql3.UntypedResultSet; +import org.apache.cassandra.db.DecoratedKey; +import org.apache.cassandra.dht.IPartitioner; +import org.apache.cassandra.dht.Range; +import org.apache.cassandra.dht.Token; +import org.apache.cassandra.distributed.Cluster; +import org.apache.cassandra.distributed.api.IInstanceConfig; +import org.apache.cassandra.distributed.api.IInvokableInstance; +import org.apache.cassandra.distributed.api.TokenSupplier; +import org.apache.cassandra.distributed.shared.NetworkTopology; +import org.apache.cassandra.distributed.test.TestBaseImpl; +import org.apache.cassandra.schema.Schema; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.service.StorageService; +import org.apache.cassandra.service.accord.AccordConfigurationService; +import org.apache.cassandra.service.accord.AccordConfigurationService.EpochSnapshot; +import org.apache.cassandra.service.accord.AccordSafeCommandStore; +import org.apache.cassandra.service.accord.AccordService; +import org.apache.cassandra.service.accord.api.PartitionKey; +import org.apache.cassandra.streaming.StreamManager; +import org.apache.cassandra.streaming.StreamResultFuture; +import org.apache.cassandra.streaming.StreamSession; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.tcm.ClusterMetadataService; +import org.apache.cassandra.tcm.Epoch; +import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.FBUtilities; +import org.assertj.core.api.Assertions; + +import static accord.utils.async.AsyncChains.awaitUninterruptiblyAndRethrow; +import static com.google.common.collect.Iterables.getOnlyElement; +import static org.apache.cassandra.distributed.api.Feature.GOSSIP; +import static org.apache.cassandra.distributed.api.Feature.NETWORK; + +public class AccordBootstrapTest extends TestBaseImpl +{ + private static DecoratedKey dk(int key) + { + IPartitioner partitioner = DatabaseDescriptor.getPartitioner(); + return partitioner.decorateKey(ByteBufferUtil.bytes(key)); + } + + private static PartitionKey pk(int key, String keyspace, String table) + { + TableId tid = Schema.instance.getTableMetadata(keyspace, table).id; + return new PartitionKey(tid, dk(key)); + } + + protected void bootstrapAndJoinNode(Cluster cluster) + { + IInstanceConfig config = cluster.newInstanceConfig(); + config.set("auto_bootstrap", true); + config.set("accord.shard_durability_target_splits", "1"); + config.set("accord.shard_durability_cycle", "20s"); + IInvokableInstance newInstance = cluster.bootstrap(config); + newInstance.startup(cluster); + // todo: re-add once we fix write survey/join ring = false mode +// withProperty(BOOTSTRAP_SCHEMA_DELAY_MS.getKey(), Integer.toString(90 * 1000), +// () -> withProperty("cassandra.join_ring", false, () -> newInstance.startup(cluster))); +// newInstance.nodetoolResult("join").asserts().success(); + newInstance.nodetoolResult("cms", "describe").asserts().success(); // just make sure we're joined, remove later + } + + private static AccordService service() + { + return (AccordService) AccordService.instance(); + } + + private static void awaitEpoch(long epoch) + { + try + { + boolean completed = service().epochReady(Epoch.create(epoch)).await(60, TimeUnit.SECONDS); + Assertions.assertThat(completed) + .describedAs("Epoch %s did not become ready within timeout on %s -> %s", + epoch, FBUtilities.getBroadcastAddressAndPort(), + service().configService().getEpochSnapshot(epoch)) + .isTrue(); + } + catch (InterruptedException e) + { + throw new RuntimeException(e); + } + } + + private static void awaitLocalSyncNotification(long epoch) + { + try + { + AccordConfigurationService configService = service().configService(); + boolean completed = configService.unsafeLocalSyncNotified(epoch).await(30, TimeUnit.SECONDS); + Assert.assertTrue(String.format("Local sync notification for epoch %s did not become ready within timeout on %s", + epoch, FBUtilities.getBroadcastAddressAndPort()), completed); + } + catch (InterruptedException e) + { + throw new RuntimeException(e); + } + } + + private static long maxEpoch(Cluster cluster) + { + return cluster.stream().mapToLong(node -> node.callOnInstance(() -> ClusterMetadata.current().epoch.getEpoch())).max().getAsLong(); + } + + private static class StreamListener implements StreamManager.StreamListener + { + private static boolean isRegistered = false; + private static final StreamListener listener = new StreamListener(); + + private final List registered = new ArrayList<>(); + + static synchronized void register() + { + if (isRegistered) + return; + StreamManager.instance.addListener(listener); + isRegistered = true; + } + + public synchronized void onRegister(StreamResultFuture result) + { + registered.add(result); + } + + public synchronized void forSession(Consumer consumer) + { + registered.forEach(future -> { + future.getCoordinator().getAllStreamSessions().forEach(consumer); + }); + } + } + + @Test + public void bootstrapTest() throws Throwable + { + int originalNodeCount = 2; + int expandedNodeCount = originalNodeCount + 1; + + try (Cluster cluster = Cluster.build().withNodes(originalNodeCount) + .withoutVNodes() + .withTokenSupplier(TokenSupplier.evenlyDistributedTokens(expandedNodeCount)) + .withNodeIdTopology(NetworkTopology.singleDcNetworkTopology(expandedNodeCount, "dc0", "rack0")) + .withConfig(config -> config.set("accord.command_store_shard_count", 2) + .set("accord.queue_shard_count", 2) + .set("accord.shard_durability_cycle", "20s") + .set("accord.shard_durability_target_splits", "1") + .with(NETWORK, GOSSIP)) + .start()) + { + cluster.schemaChange("CREATE KEYSPACE ks WITH REPLICATION={'class':'SimpleStrategy', 'replication_factor':2}"); + cluster.schemaChange("CREATE TABLE ks.tbl (k int, c int, v int, primary key(k, c)) WITH transactional_mode='full'"); + + long initialMax = maxEpoch(cluster); + + for (IInvokableInstance node : cluster) + { + + node.runOnInstance(() -> { + Assert.assertEquals(initialMax, ClusterMetadata.current().epoch.getEpoch()); + System.out.println("Awaiting " + initialMax); + awaitEpoch(initialMax); + AccordConfigurationService configService = service().configService(); + long minEpoch = configService.minEpoch(); + + Assert.assertEquals(initialMax, configService.maxEpoch()); + + for (long epoch = minEpoch; epoch < initialMax; epoch++) + { + awaitEpoch(epoch); + Assert.assertEquals(EpochSnapshot.completed(epoch), configService.getEpochSnapshot(epoch)); + } + + awaitLocalSyncNotification(initialMax); + Assert.assertEquals(EpochSnapshot.completed(initialMax), configService.getEpochSnapshot(initialMax)); + }); + } + + for (IInvokableInstance node : cluster) + { + node.runOnInstance(StreamListener::register); + } + + long schemaChangeMax = maxEpoch(cluster); + for (IInvokableInstance node : cluster) + { + node.runOnInstance(() -> { + ClusterMetadataService.instance().fetchLogFromCMS(Epoch.create(schemaChangeMax)); + awaitEpoch(schemaChangeMax); + AccordConfigurationService configService = service().configService(); + + for (long epoch = initialMax + 1; epoch <= schemaChangeMax; epoch++) + { + awaitLocalSyncNotification(epoch); + Assert.assertEquals(EpochSnapshot.completed(epoch), configService.getEpochSnapshot(epoch)); + } + }); + } + + for (int key = 0; key < 100; key++) + { + String query = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT * FROM ks.tbl WHERE k = " + key + " AND c = 0);\n" + + " SELECT row1.v;\n" + + " IF row1 IS NULL THEN\n" + + " INSERT INTO ks.tbl (k, c, v) VALUES (" + key + ", " + key + ", " + key + ");\n" + + " END IF\n" + + "COMMIT TRANSACTION"; + AccordTestBase.executeWithRetry(cluster, query); + } + + for (IInvokableInstance node : cluster) + { + node.runOnInstance(() -> { + Assert.assertTrue(StreamListener.listener.registered.isEmpty()); + }); + } + + bootstrapAndJoinNode(cluster); + long bootstrapMax = maxEpoch(cluster); + for (IInvokableInstance node : cluster) + { + node.runOnInstance(() -> { + ClusterMetadataService.instance().fetchLogFromCMS(Epoch.create(bootstrapMax)); + Assert.assertEquals(bootstrapMax, ClusterMetadata.current().epoch.getEpoch()); + AccordService service = (AccordService) AccordService.instance(); + awaitEpoch(bootstrapMax); + AccordConfigurationService configService = service.configService(); + + awaitLocalSyncNotification(bootstrapMax); + Assert.assertEquals(EpochSnapshot.completed(bootstrapMax), configService.getEpochSnapshot(bootstrapMax)); + }); + } + + InetAddress node3Addr = cluster.get(3).broadcastAddress().getAddress(); + for (IInvokableInstance node : cluster.get(1, 2)) + { + node.runOnInstance(() -> { + + StreamListener.listener.forSession(session -> { + Assert.assertEquals(node3Addr, session.peer.getAddress()); + Assert.assertEquals(0, session.getNumRequests()); + Assert.assertTrue(session.getNumKeyspaceTransfers() > 0); + }); + + awaitUninterruptiblyAndRethrow(service().node().commandStores().forEach(safeStore -> { + AccordSafeCommandStore ss = (AccordSafeCommandStore) safeStore; + Assert.assertEquals(Timestamp.NONE, getOnlyElement(ss.bootstrapBeganAt().keySet())); + Assert.assertEquals(Timestamp.NONE, getOnlyElement(ss.safeToReadAt().keySet())); +// +// Assert.assertTrue(commandStore.maxBootstrapEpoch() > 0); +// Assert.assertTrue(commandStore.bootstrapBeganAt().isEmpty()); +// Assert.assertTrue(commandStore.safeToRead().isEmpty()); + })); + }); + } + + cluster.get(3).runOnInstance(() -> { + List> ranges = StorageService.instance.getLocalRanges("ks"); + TopologyManager topologyManager = service().node().topology(); + for (long epoch = topologyManager.minEpoch() ; epoch <= topologyManager.epoch() ; ++epoch) + { + CountDownLatch latch = new CountDownLatch(1); + topologyManager.epochReady(epoch).data.invokeIfSuccess(latch::countDown); + while (true) + { + try + { + if (latch.await(1L, TimeUnit.SECONDS)) + break; + } + catch (InterruptedException e) + { + throw new RuntimeException(e); + } + } + } + + for (int key = 0; key < 100; key++) + { + UntypedResultSet result = QueryProcessor.executeInternal("SELECT * FROM ks.tbl WHERE k=?", key); + PartitionKey partitionKey = pk(key, "ks", "tbl"); + if (ranges.stream().anyMatch(range -> range.contains(partitionKey.token()))) + { + UntypedResultSet.Row row = getOnlyElement(result); + Assert.assertEquals(key, row.getInt("c")); + Assert.assertEquals(key, row.getInt("v")); + + awaitUninterruptiblyAndRethrow(service().node().commandStores().forEach(safeStore -> { + if (safeStore.ranges().currentRanges().contains(partitionKey)) + { + AccordSafeCommandStore ss = (AccordSafeCommandStore) safeStore; + Assert.assertFalse(ss.bootstrapBeganAt().isEmpty()); + Assert.assertFalse(ss.safeToReadAt().isEmpty()); + + Assert.assertEquals(1, ss.bootstrapBeganAt().entrySet().stream() + .filter(entry -> entry.getValue().contains(partitionKey)) + .map(entry -> { + Assert.assertTrue(entry.getKey().compareTo(Timestamp.NONE) > 0); + return entry; + }).count()); + Assert.assertEquals(1, ss.safeToReadAt().entrySet().stream() + .filter(entry -> entry.getValue().contains(partitionKey)) + .map(entry -> { + Assert.assertTrue(entry.getKey().compareTo(Timestamp.NONE) > 0); + return entry; + }).count()); + } + })); + } + else + { + Assert.assertTrue(result.isEmpty()); + } + } + }); + } + } + + @Test + public void moveTest() throws Throwable + { + try (Cluster cluster = Cluster.build().withNodes(3) + .withoutVNodes() + .withTokenSupplier(TokenSupplier.evenlyDistributedTokens(3)) + .withNodeIdTopology(NetworkTopology.singleDcNetworkTopology(3, "dc0", "rack0")) + .withConfig(config -> config + .set("accord.shard_durability_target_splits", "1") + .set("accord.shard_durability_cycle", "20s") + .with(NETWORK, GOSSIP)) + .start()) + { + cluster.schemaChange("CREATE KEYSPACE ks WITH REPLICATION={'class':'SimpleStrategy', 'replication_factor':2}"); + cluster.schemaChange("CREATE TABLE ks.tbl (k int, c int, v int, primary key(k, c)) WITH transactional_mode='full'"); + + long initialMax = maxEpoch(cluster); + long[] tokens = new long[3]; + for (int i=0; i<3; i++) + { + tokens[i] = cluster.get(i+1).callOnInstance(() -> Long.valueOf(getOnlyElement(StorageService.instance.getTokens()))); + } + + for (IInvokableInstance node : cluster) + { + + node.runOnInstance(() -> { + Assert.assertEquals(initialMax, ClusterMetadata.current().epoch.getEpoch()); + awaitEpoch(initialMax); + AccordConfigurationService configService = service().configService(); + long minEpoch = configService.minEpoch(); + + Assert.assertEquals(initialMax, configService.maxEpoch()); + + for (long epoch = minEpoch; epoch < initialMax; epoch++) + { + awaitEpoch(epoch); + Assert.assertEquals(EpochSnapshot.completed(epoch), configService.getEpochSnapshot(epoch)); + } + + awaitLocalSyncNotification(initialMax); + Assert.assertEquals(EpochSnapshot.completed(initialMax), configService.getEpochSnapshot(initialMax)); + }); + } + + long schemaChangeMax = maxEpoch(cluster); + for (IInvokableInstance node : cluster) + { + node.runOnInstance(() -> { + Assert.assertEquals(schemaChangeMax, ClusterMetadata.current().epoch.getEpoch()); + AccordService service = (AccordService) AccordService.instance(); + awaitEpoch(schemaChangeMax); + AccordConfigurationService configService = service.configService(); + + for (long epoch = initialMax + 1; epoch <= schemaChangeMax; epoch++) + { + awaitLocalSyncNotification(epoch); + Assert.assertEquals(EpochSnapshot.completed(epoch), configService.getEpochSnapshot(epoch)); + } + }); + } + + for (int key = 0; key < 100; key++) + { + String query = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT * FROM ks.tbl WHERE k = " + key + " AND c = 0);\n" + + " SELECT row1.v;\n" + + " IF row1 IS NULL THEN\n" + + " INSERT INTO ks.tbl (k, c, v) VALUES (" + key + ", " + key + ", " + key + ");\n" + + " END IF\n" + + "COMMIT TRANSACTION"; + AccordTestBase.executeWithRetry(cluster, query); + } + + long token = ((tokens[1] - tokens[0]) / 2) + tokens[0]; + long preMove = maxEpoch(cluster); + + cluster.get(1).runOnInstance(() -> StorageService.instance.move(Long.toString(token))); + + long moveMax = maxEpoch(cluster); + for (IInvokableInstance node : cluster) + { + node.runOnInstance(() -> { + ClusterMetadataService.instance().fetchLogFromCMS(Epoch.create(moveMax)); + Assert.assertEquals(moveMax, ClusterMetadata.current().epoch.getEpoch()); + AccordService service = (AccordService) AccordService.instance(); + awaitEpoch(moveMax); + AccordConfigurationService configService = service.configService(); + + awaitLocalSyncNotification(moveMax); + Assert.assertEquals(EpochSnapshot.completed(moveMax), configService.getEpochSnapshot(moveMax)); + }); + } + + for (IInvokableInstance node : cluster) + { + node.runOnInstance(() -> { + // validate streaming + List> ranges = StorageService.instance.getLocalRanges("ks"); + TableId tableId = Schema.instance.getTableMetadata("ks", "tbl").id; + for (int key = 0; key < 100; key++) + { + DecoratedKey dk = dk(key); + UntypedResultSet result = QueryProcessor.executeInternal("SELECT * FROM ks.tbl WHERE k=?", key); + if (ranges.stream().anyMatch(range -> range.contains(dk.getToken()))) + { + UntypedResultSet.Row row = getOnlyElement(result); + Assert.assertEquals(key, row.getInt("c")); + Assert.assertEquals(key, row.getInt("v")); + + PartitionKey partitionKey = new PartitionKey(tableId, dk); + + awaitUninterruptiblyAndRethrow(service().node().commandStores().forEach(PreLoadContext.contextFor(partitionKey.toUnseekable()), + partitionKey.toUnseekable(), moveMax, moveMax, + safeStore -> { + if (!safeStore.ranges().allAt(preMove).contains(partitionKey)) + { + AccordSafeCommandStore ss = (AccordSafeCommandStore) safeStore; + Assert.assertFalse(ss.bootstrapBeganAt().isEmpty()); + Assert.assertFalse(ss.safeToReadAt().isEmpty()); + + Assert.assertEquals(1, ss.bootstrapBeganAt().entrySet().stream() + .filter(entry -> entry.getValue().contains(partitionKey)) + .map(entry -> { + Assert.assertTrue(entry.getKey().compareTo(Timestamp.NONE) > 0); + return entry; + }).count()); + Assert.assertEquals(1, ss.safeToReadAt().entrySet().stream() + .filter(entry -> entry.getValue().contains(partitionKey)) + .map(entry -> { + Assert.assertTrue(entry.getKey().compareTo(Timestamp.NONE) > 0); + return entry; + }).count()); + } + })); + } + } + }); + } + } + } +} diff --git a/test/distributed/org/apache/cassandra/distributed/test/accord/AccordCQLTestBase.java b/test/distributed/org/apache/cassandra/distributed/test/accord/AccordCQLTestBase.java new file mode 100644 index 000000000000..caa2f6da69ef --- /dev/null +++ b/test/distributed/org/apache/cassandra/distributed/test/accord/AccordCQLTestBase.java @@ -0,0 +1,3224 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.distributed.test.accord; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Random; +import java.util.Set; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; +import java.util.stream.Collectors; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; +import org.junit.BeforeClass; +import org.junit.Ignore; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import accord.primitives.Unseekables; +import accord.topology.Topologies; +import org.apache.cassandra.config.Config.PaxosVariant; +import org.apache.cassandra.cql3.CQLTester; +import org.apache.cassandra.cql3.functions.types.utils.Bytes; +import org.apache.cassandra.cql3.statements.TransactionStatement; +import org.apache.cassandra.db.marshal.Int32Type; +import org.apache.cassandra.db.marshal.ListType; +import org.apache.cassandra.db.marshal.MapType; +import org.apache.cassandra.db.marshal.SetType; +import org.apache.cassandra.db.marshal.UTF8Type; +import org.apache.cassandra.dht.Murmur3Partitioner; +import org.apache.cassandra.distributed.Cluster; +import org.apache.cassandra.distributed.api.ConsistencyLevel; +import org.apache.cassandra.distributed.api.ICoordinator; +import org.apache.cassandra.distributed.api.QueryResults; +import org.apache.cassandra.distributed.api.SimpleQueryResult; +import org.apache.cassandra.distributed.shared.AssertUtils; +import org.apache.cassandra.distributed.test.sai.SAIUtil; +import org.apache.cassandra.exceptions.InvalidRequestException; +import org.apache.cassandra.service.accord.AccordService; +import org.apache.cassandra.service.accord.AccordTestUtils; +import org.apache.cassandra.service.consensus.TransactionalMode; +import org.apache.cassandra.service.consensus.migration.TransactionalMigrationFromMode; +import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.FailingConsumer; +import org.apache.cassandra.utils.Pair; +import org.assertj.core.api.Assertions; + +import static java.lang.String.format; +import static java.util.Collections.singletonList; +import static org.apache.cassandra.cql3.CQLTester.row; +import static org.apache.cassandra.cql3.statements.schema.AlterTableStatement.ACCORD_COUNTER_COLUMN_UNSUPPORTED; +import static org.apache.cassandra.cql3.statements.schema.AlterTableStatement.ACCORD_COUNTER_TABLES_UNSUPPORTED; +import static org.apache.cassandra.distributed.api.ConsistencyLevel.QUORUM; +import static org.apache.cassandra.distributed.api.Feature.GOSSIP; +import static org.apache.cassandra.distributed.api.Feature.NATIVE_PROTOCOL; +import static org.apache.cassandra.distributed.api.Feature.NETWORK; +import static org.apache.cassandra.distributed.util.QueryResultUtil.assertThat; +import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + +public abstract class AccordCQLTestBase extends AccordTestBase +{ + private static final Logger logger = LoggerFactory.getLogger(AccordCQLTestBase.class); + + protected AccordCQLTestBase(TransactionalMode transactionalMode) { + super(transactionalMode); + } + + @Override + protected Logger logger() + { + return logger; + } + + @BeforeClass + public static void setupClass() throws IOException + { + AccordTestBase.setupCluster(builder -> builder.appendConfig(config -> config.with(GOSSIP, NETWORK, NATIVE_PROTOCOL) + .set("paxos_variant", PaxosVariant.v2.name())), 2); + SHARED_CLUSTER.schemaChange("CREATE TYPE " + KEYSPACE + ".person (height int, age int)"); + } + + @Test + public void testRejectTransactionStatement() throws Throwable + { + test("CREATE TABLE " + qualifiedAccordTableName + " (k int, c int, v int, primary key (k, c))", cluster -> { + ICoordinator coordinator = cluster.coordinator(1); + String readQuery = "BEGIN TRANSACTION\n" + + " SELECT * FROM " + qualifiedAccordTableName + " WHERE k = 0;\n" + + "COMMIT TRANSACTION;"; + String writeQuery = "BEGIN TRANSACTION\n" + + " INSERT INTO " + qualifiedAccordTableName + " (k, c, v) VALUES (42, 43, 44);\n" + + "COMMIT TRANSACTION;"; + + // Not enabled on table or migrating/migrated + try + { + coordinator.execute(readQuery, ConsistencyLevel.ALL); + fail("Expected exception"); + } + catch (Throwable t) + { + assertEquals(InvalidRequestException.class.getName(), t.getClass().getName()); + assertEquals(t.getMessage(), format(TransactionStatement.TRANSACTIONS_DISABLED_ON_TABLE_MESSAGE, "SELECT", "at [2:3]")); + } + + try + { + coordinator.execute(writeQuery, ConsistencyLevel.ALL); + fail("Expected exception"); + } + catch (Throwable t) + { + assertEquals(InvalidRequestException.class.getName(), t.getClass().getName()); + assertEquals(t.getMessage(), format(TransactionStatement.TRANSACTIONS_DISABLED_ON_TABLE_MESSAGE, "INSERT", "at [2:3]")); + } + + // Enabled on table but not migrating/migrated + coordinator.execute("ALTER TABLE " + qualifiedAccordTableName + " WITH transactional_mode = '" + transactionalMode.name() + "';", ConsistencyLevel.ALL); + try + { + cluster.coordinator(1).execute(readQuery, ConsistencyLevel.ALL); + fail("Expected exception"); + } + catch (Throwable t) + { + assertEquals(InvalidRequestException.class.getName(), t.getClass().getName()); + assertEquals(t.getMessage(), TransactionStatement.UNSUPPORTED_MIGRATION); + } + + // Blind writes are allowed because Accord does know how to execute them correctly via interop + coordinator.execute(writeQuery, ConsistencyLevel.ALL); + + // Enabled on table but migrating + nodetool(coordinator, "consensus_admin", "begin-migration", KEYSPACE, accordTableName); + try + { + coordinator.execute(readQuery, ConsistencyLevel.ALL); + fail("Expected exception"); + } + catch (Throwable t) + { + assertEquals(InvalidRequestException.class.getName(), t.getClass().getName()); + assertEquals(t.getMessage(), TransactionStatement.UNSUPPORTED_MIGRATION); + } + + // Write query should succeed even if Accord can't read + coordinator.execute(writeQuery, ConsistencyLevel.ALL); + // Should also work as a non-SERIAL insert + coordinator.execute("INSERT INTO " + qualifiedAccordTableName + " (k, c, v) VALUES (42, 43, 44);", ConsistencyLevel.ALL); + // And CAS + coordinator.execute("INSERT INTO " + qualifiedAccordTableName + " (k, c, v) VALUES (42, 43, 44) IF NOT EXISTS;", ConsistencyLevel.SERIAL, ConsistencyLevel.ALL); + + // Enabled on table and migration has done data repair + nodetool(coordinator, "repair", "-skip-accord", "-skip-paxos", KEYSPACE, accordTableName); + coordinator.execute(readQuery, ConsistencyLevel.ALL); + coordinator.execute(writeQuery, ConsistencyLevel.ALL); + }); + } + + @Test + public void testCounterCreateTableTransactionalModeFails() throws Exception + { + try + { + test("CREATE TABLE " + qualifiedAccordTableName + " (k int, c int, v counter, primary key (k, c)) WITH " + transactionalMode.asCqlParam(), cluster -> {}); + fail("Expected exception"); + } + catch (Throwable t) + { + assertEquals(IllegalStateException.class.getName(), t.getClass().getName()); + assertEquals(format(ACCORD_COUNTER_TABLES_UNSUPPORTED, KEYSPACE, accordTableName), t.getMessage()); + } + } + + @Test + public void testCounterCreateTableTransactionalMigrationFromModeFails() throws Exception + { + try + { + test("CREATE TABLE " + qualifiedAccordTableName + " (k int, c int, v counter, primary key (k, c)) WITH transactional_migration_from = '" + transactionalMode.name() + "'", cluster -> {}); + fail("Expected exception"); + } + catch (Throwable t) + { + assertEquals(IllegalStateException.class.getName(), t.getClass().getName()); + assertEquals(format(ACCORD_COUNTER_TABLES_UNSUPPORTED, KEYSPACE, accordTableName), t.getMessage()); + } + } + + @Test + public void testCounterAlterTableTransactionalModeFails() throws Exception + { + test("CREATE TABLE " + qualifiedAccordTableName + " (k int, c int, v counter, primary key (k, c))", cluster -> { + try + { + cluster.coordinator(1).execute("ALTER TABLE " + qualifiedAccordTableName + " WITH transactional_mode = '" + transactionalMode.name() + "';", ConsistencyLevel.ALL); + fail("Expected exception"); + } + catch (Throwable t) + { + assertEquals(InvalidRequestException.class.getName(), t.getClass().getName()); + assertEquals(format(ACCORD_COUNTER_TABLES_UNSUPPORTED, KEYSPACE, accordTableName), t.getMessage()); + } + }); + } + + @Test + public void testCounterAlterTableTransactionalMigrationFromModeFails() throws Exception + { + test("CREATE TABLE " + qualifiedAccordTableName + " (k int, c int, v counter, primary key (k, c))", cluster -> { + try + { + cluster.coordinator(1).execute("ALTER TABLE " + qualifiedAccordTableName + " WITH transactional_migration_from = '" + transactionalMode.name() + "';", ConsistencyLevel.ALL); + fail("Expected exception"); + } + catch (Throwable t) + { + assertEquals(InvalidRequestException.class.getName(), t.getClass().getName()); + assertEquals(format(ACCORD_COUNTER_TABLES_UNSUPPORTED, KEYSPACE, accordTableName), t.getMessage()); + } + }); + } + + @Test + public void testCounterAddColumnFailsWithAccord() throws Exception + { + test("CREATE TABLE " + qualifiedAccordTableName + " (k int, c int, s int static, v int, primary key (k, c)) WITH " + transactionalMode.asCqlParam(), cluster -> { + try + { + cluster.coordinator(1).execute("ALTER TABLE " + qualifiedAccordTableName + " ADD (v2 counter);", ConsistencyLevel.ALL); + fail("Expected exception"); + } + catch (Throwable t) + { + assertEquals(InvalidRequestException.class.getName(), t.getClass().getName()); + assertEquals(format(ACCORD_COUNTER_COLUMN_UNSUPPORTED, KEYSPACE, accordTableName, transactionalMode, TransactionalMigrationFromMode.none), t.getMessage()); + } + }); + } + + @Test + public void testCounterAddColumnFailsWithMigration() throws Exception + { + test("CREATE TABLE " + qualifiedAccordTableName + " (k int, c int, s int static, v int, primary key (k, c)) WITH " + transactionalMode.asCqlParam(), cluster -> { + try + { + cluster.coordinator(1).execute("ALTER TABLE " + qualifiedAccordTableName + " WITH transactional_mode = '" + TransactionalMode.off + "';", ConsistencyLevel.ALL); + cluster.coordinator(1).execute("ALTER TABLE " + qualifiedAccordTableName + " ADD (v2 counter);", ConsistencyLevel.ALL); + fail("Expected exception"); + } + catch (Throwable t) + { + assertEquals(InvalidRequestException.class.getName(), t.getClass().getName()); + assertEquals(format(ACCORD_COUNTER_COLUMN_UNSUPPORTED, KEYSPACE, accordTableName, TransactionalMode.off, transactionalMode), t.getMessage()); + } + }); + } + + @Override + protected void test(FailingConsumer fn) throws Exception + { + test("CREATE TABLE " + qualifiedAccordTableName + " (k int, c int, v int, primary key (k, c)) WITH " + transactionalMode.asCqlParam(), fn); + } + + @Test + public void testPartitionMultiRowReturn() throws Exception + { + test(cluster -> { + for (int i = 0; i < 3; i++) + cluster.coordinator(1).execute(wrapInTxn("INSERT INTO " + qualifiedAccordTableName + " (k, c, v) VALUES (?, ?, ?)"), ConsistencyLevel.ALL, 42, 43 + i, 44 + i); + + String txn = "BEGIN TRANSACTION " + + "SELECT * " + + "FROM " + qualifiedAccordTableName + " " + + "WHERE k = 42;" + + "COMMIT TRANSACTION;"; + SimpleQueryResult result = cluster.coordinator(1).executeWithResult(txn, ConsistencyLevel.SERIAL); + assertThat(result).hasSize(3) + .contains(42, 43, 44) + .contains(42, 44, 45) + .contains(42, 45, 46); + }); + } + + @Test + public void testSaiMultiRowReturn() throws Exception + { + test(cluster -> { + cluster.schemaChange("CREATE INDEX ON " + qualifiedAccordTableName + "(v) USING 'sai';"); + SAIUtil.waitForIndexQueryable(cluster, KEYSPACE); + for (int i = 0; i < 3; i++) + cluster.coordinator(1).execute(wrapInTxn("INSERT INTO " + qualifiedAccordTableName + " (k, c, v) VALUES (?, ?, ?)"), ConsistencyLevel.ALL, 42, 43 + i, 44 + i); + + String txn = "BEGIN TRANSACTION " + + "SELECT * " + + "FROM " + qualifiedAccordTableName + " " + + "WHERE k = 42 AND v = 45;" + + "COMMIT TRANSACTION;"; + SimpleQueryResult result = cluster.coordinator(1).executeWithResult(txn, ConsistencyLevel.SERIAL); + assertThat(result).hasSize(1) + .contains(42, 44, 45); + }); + } + + // This fails and it is expected, mostly just here as documentation until it is fixed + @Test + public void testSasiMultiRowReturn() throws Exception + { + test(cluster -> { + cluster.schemaChange("CREATE INDEX ON " + qualifiedAccordTableName + "(v) USING 'org.apache.cassandra.index.sasi.SASIIndex';"); + SAIUtil.waitForIndexQueryable(cluster, KEYSPACE); + for (int i = 0; i < 3; i++) + cluster.coordinator(1).execute(wrapInTxn("INSERT INTO " + qualifiedAccordTableName + " (k, c, v) VALUES (?, ?, ?)"), ConsistencyLevel.ALL, 42, 43 + i, 44 + i); + + String txn = "BEGIN TRANSACTION " + + "SELECT * " + + "FROM " + qualifiedAccordTableName + " " + + "WHERE k = 42 AND v = 45;" + + "COMMIT TRANSACTION;"; + SimpleQueryResult result = cluster.coordinator(1).executeWithResult(txn, ConsistencyLevel.SERIAL); + assertThat(result).hasSize(1) + .contains(42, 44, 45); + }); + } + + @Test + public void testLegacy2iMultiRowReturn() throws Exception + { + test(cluster -> { + cluster.schemaChange("CREATE INDEX ON " + qualifiedAccordTableName + "(v);"); + SAIUtil.waitForIndexQueryable(cluster, KEYSPACE); + for (int i = 0; i < 3; i++) + cluster.coordinator(1).execute(wrapInTxn("INSERT INTO " + qualifiedAccordTableName + " (k, c, v) VALUES (?, ?, ?)"), ConsistencyLevel.ALL, 42, 43 + i, 44 + i); + + String txn = "BEGIN TRANSACTION " + + "SELECT * " + + "FROM " + qualifiedAccordTableName + " " + + "WHERE k = 42 AND v = 45;" + + "COMMIT TRANSACTION;"; + SimpleQueryResult result = cluster.coordinator(1).executeWithResult(txn, ConsistencyLevel.SERIAL); + assertThat(result).hasSize(1) + .contains(42, 44, 45); + }); + } + + @Test + public void testNonExistingKeyWithStaticUpdate() throws Exception + { + test("CREATE TABLE " + qualifiedAccordTableName + " (k int, c int, s int static, v int, primary key (k, c)) WITH " + transactionalMode.asCqlParam(), cluster -> { + for (int i = 0; i < 10; i++) + cluster.coordinator(1).execute(wrapInTxn("UPDATE " + qualifiedAccordTableName + " SET v += ?, s=? WHERE k=? AND c=?"), ConsistencyLevel.ANY, 1, i, 0, i); + + SimpleQueryResult result = cluster.coordinator(1).executeWithResult(wrapInTxn("SELECT * FROM " + qualifiedAccordTableName + " WHERE k=? LIMIT 1"), ConsistencyLevel.ANY, 0); + AssertUtils.assertRows(result, QueryResults.builder() + .columns("k", "c", "s", "v") + .row(0, null, 9, null) + .build()); + }); + } + + @Test + public void testRangeReadPageOne() throws Exception + { + testRangeRead(1); + } + + @Test + public void testRangeReadSmallPage() throws Exception + { + testRangeRead(2); + } + + @Test + public void testRangeReadExactPage() throws Exception + { + testRangeRead(100); + } + + @Test + public void testRangeReadLargePage() throws Exception + { + testRangeRead(200); + } + + @Test + public void testRangeReadClosePageLT() throws Exception + { + testRangeRead(99); + } + + @Test + public void testRangeReadClosePageGT() throws Exception + { + testRangeRead(101); + } + + private void testRangeRead(int pageSize) throws Exception + { + test(cluster -> { + Random r = new Random(0); + Map, Object[]> insertedRows = new HashMap<>(); + for (int i = 0; i < 10; i++) + { + int k = r.nextInt(); + for (int j = 0; j < 10; j++) + { + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + "(k, c, v) VALUES (?, ?, ?);", ConsistencyLevel.ALL, k, j, i + j); + insertedRows.put(Pair.create(k, j), new Object[] {k, j, i + j}); + } + } + + Iterator iterator = cluster.coordinator(1).executeWithPaging("SELECT * FROM " + qualifiedAccordTableName + " WHERE TOKEN(k) > " + Long.MIN_VALUE + " AND TOKEN(k) < " + Long.MAX_VALUE, ConsistencyLevel.ALL, pageSize); + List resultRows = ImmutableList.copyOf(iterator); + resultRows.forEach(row -> System.out.println(Arrays.toString(row))); + Integer lastPartitionKey = null; + int currentRowKey = 0; + for (Object[] row : resultRows) + { + assertEquals(currentRowKey, row[1]); + + if (lastPartitionKey == null) + lastPartitionKey = (Integer)row[0]; + else + assertEquals(lastPartitionKey, row[0]); + + if (currentRowKey == 9) + { + currentRowKey = 0; + lastPartitionKey = null; + } + else + currentRowKey++; + + Object[] expected = insertedRows.remove(Pair.create(row[0], row[1])); + assertEquals(expected, row); + } + assertTrue(insertedRows.isEmpty()); + }); + } + + @Test + public void testRangeReadSingleToken() throws Throwable + { + test(cluster -> + { + // This single partition read happens to execute as a range read (at least when this test was created) + // and that exposed a problem with single token range reads + ICoordinator node = cluster.coordinator(1); + cluster.schemaChange(withKeyspace("CREATE TABLE %s.testRangeReadSingleToken (pk0 int, ck0 int, static0 int, regular0 int, PRIMARY KEY (pk0, ck0)) WITH " + transactionalMode.asCqlParam() + " AND CLUSTERING ORDER BY (ck0 ASC);")); + cluster.schemaChange(withKeyspace("CREATE INDEX ck0_sai_idx ON %s.testRangeReadSingleToken (ck0) USING 'sai';")); + SAIUtil.waitForIndexQueryable(cluster, KEYSPACE); + node.executeWithResult(withKeyspace("INSERT INTO %s.testRangeReadSingleToken (pk0, ck0, static0, regular0) VALUES (?, ?, ?, ?)"), QUORUM, 42, 43, 44, 45); + assertThat(node.executeWithResult(withKeyspace("SELECT pk0, ck0, static0, regular0 FROM %s.testRangeReadSingleToken WHERE pk0 = ? AND ck0 = ? AND static0 <= ? AND regular0 >= ? ALLOW FILTERING;"), ConsistencyLevel.ALL, 42, 43, 44, 45)) + .isEqualTo(42, 43, 44, 45); + + // This one is a little more explicit about trying to force a range read of a single token + cluster.schemaChange(withKeyspace("CREATE TABLE %s.testRangeReadSingleToken2 (pk blob primary key) WITH " + transactionalMode.asCqlParam())); + long token = 42; + ByteBuffer keyForToken = Murmur3Partitioner.LongToken.keyForToken(token); + node.executeWithResult(withKeyspace("INSERT INTO %s.testRangeReadSingleToken2 (pk) VALUES (?)"), QUORUM, keyForToken); + assertThat(node.executeWithResult(withKeyspace("SELECT * FROM %s.testRangeReadSingleToken2 WHERE token(pk) >= token(?) AND token(pk) <= token(?)"), QUORUM, Murmur3Partitioner.LongToken.keyForToken(token), keyForToken)) + .isEqualTo(keyForToken); + assertThat(node.executeWithResult(withKeyspace("SELECT * FROM %s.testRangeReadSingleToken2 WHERE token(pk) = token(?)"), QUORUM, keyForToken)) + .isEqualTo(keyForToken); + assertThat(node.executeWithResult(withKeyspace("SELECT * FROM %s.testRangeReadSingleToken2 WHERE token(pk) between token(?) AND token(?)"), QUORUM, Murmur3Partitioner.LongToken.keyForToken(0), keyForToken)) + .isEqualTo(keyForToken); + assertThat(node.executeWithResult(withKeyspace("SELECT * FROM %s.testRangeReadSingleToken2 WHERE token(pk) between token(?) AND token(?)"), QUORUM, keyForToken, keyForToken)) + .isEqualTo(keyForToken); + assertThat(node.executeWithResult(withKeyspace("SELECT * FROM %s.testRangeReadSingleToken2 WHERE token(pk) between token(?) AND token(?)"), QUORUM, Murmur3Partitioner.LongToken.keyForToken(0), Murmur3Partitioner.LongToken.keyForToken(43))) + .isEqualTo(keyForToken); + }); + } + + @Test + public void testRangeReadRightMin() throws Throwable + { + test(cluster -> + { + ICoordinator node = cluster.coordinator(1); + cluster.schemaChange(withKeyspace("CREATE TABLE %s.testRangeReadRightMin (pk blob primary key) WITH " + transactionalMode.asCqlParam())); + long token = Long.MIN_VALUE; + ByteBuffer keyForToken = Murmur3Partitioner.LongToken.keyForToken(token); + node.executeWithResult(withKeyspace("INSERT INTO %s.testRangeReadRightMin (pk) VALUES (?)"), QUORUM, keyForToken); + assertThat(node.executeWithResult(withKeyspace("SELECT * FROM %s.testRangeReadRightMin WHERE token(pk) >= token(?)"), QUORUM, keyForToken)) + .isEqualTo(keyForToken); + assertThat(node.executeWithResult(withKeyspace("SELECT * FROM %s.testRangeReadRightMin WHERE token(pk) = token(?)"), QUORUM, keyForToken)) + .isEqualTo(keyForToken); + assertThat(node.executeWithResult(withKeyspace("SELECT * FROM %s.testRangeReadRightMin WHERE token(pk) > token(?)"), QUORUM, keyForToken)) + .isEmpty(); + assertThat(node.executeWithResult(withKeyspace("SELECT * FROM %s.testRangeReadRightMin WHERE token(pk) > token(?) AND token(pk) < token(?)"), QUORUM, Murmur3Partitioner.LongToken.keyForToken(0), keyForToken)) + .isEmpty(); + assertThat(node.executeWithResult(withKeyspace("SELECT * FROM %s.testRangeReadRightMin WHERE token(pk) > token(?) AND token(pk) <= token(?)"), QUORUM, Murmur3Partitioner.LongToken.keyForToken(0), keyForToken)) + .isEqualTo(keyForToken); + assertThat(node.executeWithResult(withKeyspace("SELECT * FROM %s.testRangeReadRightMin WHERE token(pk) between token(?) AND token(?)"), QUORUM, Murmur3Partitioner.LongToken.keyForToken(0), keyForToken)) + .isEqualTo(keyForToken); + assertThat(node.executeWithResult(withKeyspace("SELECT * FROM %s.testRangeReadRightMin WHERE token(pk) between token(?) AND token(?)"), QUORUM, keyForToken, keyForToken)) + .isEqualTo(keyForToken); + }); + } + + @Test + public void testRangeReadAllowFiltering() throws Throwable + { + test(cluster -> + { + ICoordinator node = cluster.coordinator(1); + cluster.schemaChange(withKeyspace("CREATE TABLE %s.testRangeReadAllowFiltering (pk int primary key, foo text) WITH " + transactionalMode.asCqlParam())); + long token = Long.MIN_VALUE; + ByteBuffer keyForToken = Murmur3Partitioner.LongToken.keyForToken(token); + node.executeWithResult(withKeyspace("INSERT INTO %s.testRangeReadAllowFiltering (pk, foo) VALUES (?, ?)"), QUORUM, 42, "ba"); + assertThat(node.executeWithResult(withKeyspace("SELECT * FROM %s.testRangeReadAllowFiltering WHERE foo < 'bar' ALLOW FILTERING"), QUORUM)) + .isEqualTo(42, "ba"); + }); + } + + @Test + public void testIN() throws Exception + { + test(cluster -> { + Random r = new Random(0); + Map, Object[]> insertedRows = new HashMap<>(); + List partitionKeys = new ArrayList<>(); + for (int i = 0; i < 10; i++) + { + int k = r.nextInt(); + for (int j = 0; j < 10; j++) + { + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + "(k, c, v) VALUES (?, ?, ?);", ConsistencyLevel.ALL, k, j, i + j); + insertedRows.put(Pair.create(k, j), new Object[] {k, j, i + j}); + partitionKeys.add(k); + } + } + + String query = "SELECT * FROM " + qualifiedAccordTableName + " WHERE k IN ("; + for (Integer key : partitionKeys) + { + query = query + key + ", "; + } + query = query.substring(0, query.length() - 2); + query = query + ")"; + Iterator iterator = cluster.coordinator(1).executeWithPaging(query, ConsistencyLevel.ALL, 2); + List resultRows = ImmutableList.copyOf(iterator); + resultRows.forEach(row -> System.out.println(Arrays.toString(row))); + Integer lastPartitionKey = null; + int currentRowKey = 0; + for (Object[] row : resultRows) + { + assertEquals(currentRowKey, row[1]); + + if (lastPartitionKey == null) + lastPartitionKey = (Integer)row[0]; + else + assertEquals(lastPartitionKey, row[0]); + + if (currentRowKey == 9) + { + currentRowKey = 0; + lastPartitionKey = null; + } + else + currentRowKey++; + + Object[] expected = insertedRows.remove(Pair.create(row[0], row[1])); + assertEquals(expected, row); + } + assertTrue(insertedRows.isEmpty()); + }); + } + + @Test + public void testMultiPartitionReturn() throws Exception + { + test(cluster -> { + for (int i = 0; i < 10; i++) + { + for (int j = 0; j < 10; j++) + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + "(k, c, v) VALUES (?, ?, ?);", ConsistencyLevel.ALL, i, j, i + j); + } + // multi row + String cql = "BEGIN TRANSACTION\n" + + " SELECT * FROM " + qualifiedAccordTableName + " WHERE k=? AND c IN (?, ?);\n" + + "COMMIT TRANSACTION"; + SimpleQueryResult result = cluster.coordinator(1).executeWithResult(cql, ConsistencyLevel.ANY, 0, 0, 1); + assertThat(result).isEqualTo(QueryResults.builder() + .columns("k", "c", "v") + .row(0, 0, 0) + .row(0, 1, 1) + .build()); + // Results should be in Partiton/Clustering order, so make sure + // multi partition + cql = "BEGIN TRANSACTION\n" + + " SELECT * FROM " + qualifiedAccordTableName + " WHERE k IN (?, ?) AND c = ?;\n" + + "COMMIT TRANSACTION"; + for (boolean asc : Arrays.asList(true, false)) + { + Object[] binds = asc ? row(0, 1, 0) : row(1, 0, 0); + result = cluster.coordinator(1).executeWithResult(cql, ConsistencyLevel.ANY, binds); + assertThat(result).isEqualTo(QueryResults.builder() + .columns("k", "c", "v") + .row(0, 0, 0) + .row(1, 0, 1) + .build()); + } + + // multi-partition, multi-clustering + cql = "BEGIN TRANSACTION\n" + + " SELECT * FROM " + qualifiedAccordTableName + " WHERE k IN (?, ?) AND c IN (?, ?);\n" + + "COMMIT TRANSACTION"; + for (boolean asc : Arrays.asList(true, false)) + { + Object[] binds = asc ? row(0, 1, 0, 1) : row(1, 0, 1, 0); + result = cluster.coordinator(1).executeWithResult(cql, ConsistencyLevel.ANY, binds); + assertThat(result).isEqualTo(QueryResults.builder() + .columns("k", "c", "v") + .row(0, 0, 0) + .row(0, 1, 1) + .row(1, 0, 1) + .row(1, 1, 2) + .build()); + } + }); + } + + @Test + public void testMultipleShards() throws Exception + { + String keyspace = "multipleShards"; + String currentTable = keyspace + ".tbl"; + List ddls = Arrays.asList("DROP KEYSPACE IF EXISTS " + keyspace + ";", + "CREATE KEYSPACE " + keyspace + " WITH REPLICATION={'class':'SimpleStrategy', 'replication_factor': 1}", + "CREATE TABLE " + currentTable + " (k blob, c int, v int, primary key (k, c)) WITH transactional_mode='" + transactionalMode + "'"); + List tokens = tokens(); + List keys = tokensToKeys(tokens); + List keyStrings = keys.stream().map(bb -> "0x" + ByteBufferUtil.bytesToHex(bb)).collect(Collectors.toList()); + StringBuilder query = new StringBuilder("BEGIN TRANSACTION\n"); + + for (int i = 0; i < keys.size(); i++) + query.append(" LET row" + i + " = (SELECT * FROM " + currentTable + " WHERE k=" + keyStrings.get(i) + " AND c=0);\n"); + + query.append(" SELECT row0.v;\n") + .append(" IF "); + + for (int i = 0; i < keyStrings.size(); i++) + query.append((i > 0 ? " AND row" : "row") + i + " IS NULL"); + + query.append(" THEN\n"); + + for (int i = 0; i < keyStrings.size(); i++) + query.append(" INSERT INTO " + currentTable + " (k, c, v) VALUES (" + keyStrings.get(i) + ", 0, " + i +");\n"); + + query.append(" END IF\n"); + query.append("COMMIT TRANSACTION"); + + test(ddls, cluster -> { + // row0.v shouldn't have existed when the txn's SELECT was executed + assertRowEqualsWithPreemptedRetry(cluster, new Object[]{ null }, query.toString()); + + cluster.get(1).runOnInstance(() -> { + StringBuilder sb = new StringBuilder("BEGIN TRANSACTION\n"); + for (int i = 0; i < keyStrings.size() - 1; i++) + sb.append(format("LET row%d = (SELECT * FROM %s WHERE k=%s AND c=0);\n", i, currentTable, keyStrings.get(i))); + sb.append(format("SELECT * FROM %s WHERE k=%s AND c=0;\n", currentTable, keyStrings.get(keyStrings.size() - 1))); + sb.append("COMMIT TRANSACTION"); + + Unseekables routables = AccordTestUtils.createTxn(sb.toString()).keys().toParticipants(); + long epoch = AccordService.instance().topology().epoch(); + Topologies topology = AccordService.instance().topology().withUnsyncedEpochs(routables, epoch, epoch); + // we don't detect out-of-bounds read/write yet, so use this to validate we reach different shards + Assertions.assertThat(topology.totalShards()).isEqualTo(2); + }); + + String check = "BEGIN TRANSACTION\n" + + " SELECT * FROM " + currentTable + " WHERE k = ? AND c = ?;\n" + + "COMMIT TRANSACTION"; + + for (int i = 0; i < keys.size(); i++) + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { keys.get(i), 0, i}, check, keys.get(i), 0); + }); + } + + @Test + public void testScalarBindVariables() throws Throwable + { + test(cluster -> + { + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + " (k, c, v) VALUES (1, 0, 3);", ConsistencyLevel.ALL); + + String query = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT v FROM " + qualifiedAccordTableName + " WHERE k = ? AND c = ?);\n" + + " LET row2 = (SELECT v FROM " + qualifiedAccordTableName + " WHERE k = ? AND c = ?);\n" + + " SELECT v FROM " + qualifiedAccordTableName + " WHERE k = ? AND c = ?;\n" + + " IF row1 IS NULL AND row2.v = ? THEN\n" + + " INSERT INTO " + qualifiedAccordTableName + " (k, c, v) VALUES (?, ?, ?);\n" + + " END IF\n" + + "COMMIT TRANSACTION"; + + Object[][] result = cluster.coordinator(1).execute(query, + ConsistencyLevel.ANY, + 0, 0, + 1, 0, + 1, 0, + 3, + 0, 0, 1); + assertEquals(3, result[0][0]); + + String check = "BEGIN TRANSACTION\n" + + " SELECT * FROM " + qualifiedAccordTableName + " WHERE k=0 AND c=0;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { 0, 0, 1 }, check); + }); + } + + @Test + public void testRegularScalarIsNull() throws Throwable + { + testScalarIsNull("CREATE TABLE " + qualifiedAccordTableName + " (k int, c int, v int, primary key (k, c)) WITH transactional_mode='" + transactionalMode + "'"); + } + + @Test + public void testStaticScalarIsNull() throws Throwable + { + testScalarIsNull("CREATE TABLE " + qualifiedAccordTableName + " (k int, c int, v int static, primary key (k, c)) WITH transactional_mode='" + transactionalMode + "'"); + } + + private void testScalarIsNull(String tableDDL) throws Exception { + test(tableDDL, + cluster -> + { + String insertNull = "BEGIN TRANSACTION\n" + + " LET row0 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k = 0 LIMIT 1);\n" + + " SELECT row0.k, row0.v;\n" + + " IF row0.v IS NULL THEN\n" + + " INSERT INTO " + qualifiedAccordTableName + " (k, c, v) VALUES (?, ?, null);\n" + + " END IF\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { null, null }, insertNull, 0, 0); + + String insert = "BEGIN TRANSACTION\n" + + " LET row0 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k = 0 LIMIT 1);\n" + + " SELECT row0.k, row0.v;\n" + + " IF row0.v IS NULL THEN\n" + + " INSERT INTO " + qualifiedAccordTableName + " (k, c, v) VALUES (?, ?, ?);\n" + + " END IF\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { 0, null }, insert, 0, 0, 1); + + String check = "BEGIN TRANSACTION\n" + + " SELECT k, c, v FROM " + qualifiedAccordTableName + " WHERE k=0 AND c=0;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { 0, 0, 1 }, check); + }); + } + + @Test + public void testQueryStaticColumn() throws Exception + { + test("CREATE TABLE " + qualifiedAccordTableName + " (k int, c int, s int static, v int, primary key (k, c)) WITH transactional_mode='" + transactionalMode + "'", + cluster -> + { + // select partition key, clustering key and static column, restrict on partition and clustering + testQueryStaticColumn(cluster, + "LET row0 = (SELECT k, c, s, v FROM " + qualifiedAccordTableName + " WHERE k = ? AND c = 0);\n" + + "SELECT row0.k, row0.c, row0.s, row0.v;\n", + + "SELECT k, c, s, v FROM " + qualifiedAccordTableName + " WHERE k = ? AND c = 0"); + + // select partition key, clustering key and static column, restrict on partition and limit to 1 row + testQueryStaticColumn(cluster, + "LET row0 = (SELECT k, c, s, v FROM " + qualifiedAccordTableName + " WHERE k = ? LIMIT 1);\n" + + "SELECT row0.k, row0.c, row0.s, row0.v;\n", + + "SELECT k, c, s, v FROM " + qualifiedAccordTableName + " WHERE k = ? LIMIT 1"); + + // select static column and regular column, restrict on partition and clustering + testQueryStaticColumn(cluster, + "LET row0 = (SELECT s, v FROM " + qualifiedAccordTableName + " WHERE k = ? AND c = 0);\n" + + "SELECT row0.s, row0.v;\n", + + "SELECT s, v FROM " + qualifiedAccordTableName + " WHERE k = ? AND c = 0"); + + // select just static column, restrict on partition and limit to 1 row + testQueryStaticColumn(cluster, + "LET row0 = (SELECT s FROM " + qualifiedAccordTableName + " WHERE k = ? LIMIT 1);\n" + + "SELECT row0.s;\n", + + "SELECT s FROM " + qualifiedAccordTableName + " WHERE k = ? LIMIT 1"); + }); + } + + private void testQueryStaticColumn(Cluster cluster, String accordReadQuery, String simpleReadQuery) + { + logger().info("Empty table"); + int key = 10; + assertResultsFromAccordMatches(cluster, accordReadQuery, simpleReadQuery, key++); + + cluster.get(1).coordinator().execute("INSERT INTO " + qualifiedAccordTableName + " (k, s) VALUES (?, null);", ConsistencyLevel.ALL, key); + logger().info("null -> static column"); + assertResultsFromAccordMatches(cluster, accordReadQuery, simpleReadQuery, key++); + + cluster.get(1).coordinator().execute("INSERT INTO " + qualifiedAccordTableName + " (k, s) VALUES (?, 1);", ConsistencyLevel.ALL, key); + logger().info("Inserted 1 -> static column"); + assertResultsFromAccordMatches(cluster, accordReadQuery, simpleReadQuery, key++); + + cluster.get(1).coordinator().execute("INSERT INTO " + qualifiedAccordTableName + " (k, c) VALUES (?, 0);", ConsistencyLevel.ALL, key); + logger().info("Inserted 0 -> clustering"); + assertResultsFromAccordMatches(cluster, accordReadQuery, simpleReadQuery, key); + } + + @Test + public void testUpdateStaticColumn() throws Exception { + test("CREATE TABLE " + qualifiedAccordTableName + " (k int, c int, s int static, v int, primary key (k, c)) WITH transactional_mode='" + transactionalMode + '\'', + cluster -> + { + checkUpdateStatic(cluster, "SET s=1 WHERE k=?", 101, "[[101, null, 1, null]]", "[]"); + checkUpdateStatic(cluster, "SET s=1, v=11 WHERE k=? AND c=0", 101, "[[101, 0, 1, 11]]", "[[101, 0, 1, 11]]"); + + // commented out until org.apache.cassandra.cql3.statements.ModificationStatement.createSelectForTxn is fixed + // checkUpdateStatic(cluster, "SET s+=1 WHERE k=?", 101, "[]", "[]"); + + checkUpdateStatic(cluster, "SET s+=1, v+=11 WHERE k=? AND c=0", 101, "[]", "[]"); + }); + } + + private void checkUpdateStatic(Cluster cluster, String update, int key, String expPart, String expClust) + { + Object[][] r1, r2, r3, r4, r; + r = cluster.get(1).coordinator().execute("UPDATE " + qualifiedAccordTableName + " " + update + " IF s = NULL;", QUORUM, key); + Assertions.assertThat(Arrays.deepToString(r)).isEqualTo("[[true]]"); + r1 = cluster.get(1).coordinator().execute("SELECT * FROM " + qualifiedAccordTableName + " WHERE k = ? LIMIT 1;", ConsistencyLevel.SERIAL, key); + r2 = cluster.get(1).coordinator().execute("SELECT * FROM " + qualifiedAccordTableName + " WHERE k = ? AND c = 0;", ConsistencyLevel.SERIAL, key); + cluster.get(1).coordinator().execute("TRUNCATE " + qualifiedAccordTableName, ConsistencyLevel.ALL); + + executeAsTxn(cluster, "UPDATE " + qualifiedAccordTableName + " " + update + ";", key); + r3 = executeAsTxn(cluster, "SELECT * FROM " + qualifiedAccordTableName + " WHERE k = ? LIMIT 1;", key).toObjectArrays(); + r4 = executeAsTxn(cluster, "SELECT * FROM " + qualifiedAccordTableName + " WHERE k = ? AND c = 0;", key).toObjectArrays(); + cluster.get(1).coordinator().execute("TRUNCATE " + qualifiedAccordTableName, ConsistencyLevel.ALL); + + Assertions.assertThat(Arrays.deepToString(r1)).isEqualTo(expPart); + Assertions.assertThat(Arrays.deepToString(r2)).isEqualTo(expClust); + Assertions.assertThat(Arrays.deepToString(r3)).isEqualTo(expPart); + Assertions.assertThat(Arrays.deepToString(r4)).isEqualTo(expClust); + } + + private void assertResultsFromAccordMatches(Cluster cluster, String accordRead, String simpleRead, int key) + { + accordRead = wrapInTxn(accordRead); + Object[][] simpleReadResult; + if (transactionalMode.ignoresSuppliedCommitCL()) + // With accord non-SERIAL write strategy the commit CL is effectively ANY so we need to read at SERIAL + simpleReadResult = cluster.coordinator(1).execute(simpleRead, ConsistencyLevel.SERIAL, key); + else + simpleReadResult = cluster.get(1).executeInternal(simpleRead, key); + Object[][] accordReadResult = executeWithRetry(cluster, accordRead, key).toObjectArrays(); + + Assertions.assertThat(withRemovedNullOnlyRows(accordReadResult)).isEqualTo(withRemovedNullOnlyRows(simpleReadResult)); + } + + private static Object[][] withRemovedNullOnlyRows(Object[][] results) + { + return Arrays.stream(results) + .filter(row -> !Arrays.stream(row).allMatch(Objects::isNull)) + .toArray(Object[][]::new); + } + + @Test + public void testScalarEQ() throws Throwable + { + testScalarCondition(3, "=", 3, "="); + } + + @Test + public void testScalarNEQ() throws Throwable + { + testScalarCondition(3, "!=", 4, "!="); + } + + @Test + public void testScalarLt() throws Throwable + { + testScalarCondition(3, "<", 4, ">"); + } + + @Test + public void testScalarLte() throws Throwable + { + testScalarCondition(3, "<=", 3, ">="); + setup(); + testScalarCondition(3, "<=", 4, ">="); + } + + @Test + public void testScalarGt() throws Throwable + { + testScalarCondition(4, ">", 3, "<"); + } + + @Test + public void testScalarGte() throws Throwable + { + testScalarCondition(4, ">=", 3, "<="); + setup(); + testScalarCondition(4, ">=", 4, "<="); + } + + @Test + public void testStaticScalarEQ() throws Throwable + { + testScalarCondition("CREATE TABLE " + qualifiedAccordTableName + " (k int, c int, v int static, primary key (k, c)) WITH transactional_mode='" + transactionalMode + "'", 3, "=", 3, "="); + } + + private void testScalarCondition(int lhs, String operator, int rhs, String reversedOperator) throws Exception + { + testScalarCondition("CREATE TABLE " + qualifiedAccordTableName + " (k int, c int, v int, primary key (k, c)) WITH transactional_mode='" + transactionalMode + "'", lhs, operator, rhs, reversedOperator); + } + + private void testScalarCondition(String tableDDL, int lhs, String operator, int rhs, String reversedOperator) throws Exception + { + test(tableDDL, + cluster -> + { + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + " (k, c, v) VALUES (0, 0, " + lhs + ");", ConsistencyLevel.ALL); + + String query = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT v FROM " + qualifiedAccordTableName + " WHERE k = ? LIMIT 1);\n" + + " SELECT row1.v;\n" + + " IF row1.v " + operator + " ? THEN\n" + + " INSERT INTO " + qualifiedAccordTableName + " (k, c, v) VALUES (?, ?, ?);\n" + + " END IF\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { lhs }, query, 0, rhs, 1, 0, 1); + + String check = "BEGIN TRANSACTION\n" + + " SELECT * FROM " + qualifiedAccordTableName + " WHERE k = ? AND c = ?;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { 1, 0, 1 }, check, 1, 0); + + String queryWithReversed = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT v FROM " + qualifiedAccordTableName + " WHERE k = ? LIMIT 1);\n" + + " SELECT row1.v;\n" + + " IF ? " + reversedOperator + " row1.v THEN\n" + + " INSERT INTO " + qualifiedAccordTableName + " (k, c, v) VALUES (?, ?, ?);\n" + + " END IF\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { lhs }, queryWithReversed, 0, rhs, 2, 0, 1); + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { 2, 0, 1 }, check, 2, 0); + }); + } + + @Test + public void testReadOnlyTx() throws Exception + { + test(cluster -> + { + String query = "BEGIN TRANSACTION\n" + + " SELECT * FROM " + qualifiedAccordTableName + " WHERE k=0 AND c=0;\n" + + "COMMIT TRANSACTION"; + SimpleQueryResult result = cluster.coordinator(1).executeWithResult(query, ConsistencyLevel.ANY); + assertFalse(result.hasNext()); + }); + } + + @Test + public void testWriteOnlyTx() throws Exception + { + test(cluster -> + { + String query = "BEGIN TRANSACTION\n" + + " INSERT INTO " + qualifiedAccordTableName + " (k, c, v) VALUES (?, ?, ?);\n" + + "COMMIT TRANSACTION"; + SimpleQueryResult result = cluster.coordinator(1).executeWithResult(query, ConsistencyLevel.ANY, 0, 0, 1); + assertFalse(result.hasNext()); + + String check = "BEGIN TRANSACTION\n" + + " SELECT * FROM " + qualifiedAccordTableName + " WHERE k=? AND c=?;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] {0, 0, 1}, check, 0, 0); + }); + } + + @Test + public void testReturningLetReferences() throws Throwable + { + test(cluster -> + { + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + " (k, c, v) VALUES (1, 0, 3);", ConsistencyLevel.ALL); + + String query = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k = ? AND c = ?);\n" + + " LET row2 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k = ? AND c = ?);\n" + + " SELECT row1.v, row2.k, row2.c, row2.v;\n" + + " IF row1 IS NULL AND row2.v = ? THEN\n" + + " INSERT INTO " + qualifiedAccordTableName + " (k, c, v) VALUES (?, ?, ?);\n" + + " END IF\n" + + "COMMIT TRANSACTION"; + SimpleQueryResult result = cluster.coordinator(1).executeWithResult(query, ConsistencyLevel.ANY, 0, 0, 1, 0, 3, 0, 0, 1); + assertEquals(ImmutableList.of("row1.v", "row2.k", "row2.c", "row2.v"), result.names()); + assertThat(result).hasSize(1).contains(null, 1, 0, 3); + + String check = "BEGIN TRANSACTION\n" + + " SELECT * FROM " + qualifiedAccordTableName + " WHERE k=0 AND c=0;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] {0, 0, 1}, check); + }); + } + + @Test + public void testFailedConditionWithCompleteInsert() throws Throwable + { + test(cluster -> + { + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + " (k, c, v) VALUES (1, 0, 3);", ConsistencyLevel.ALL); + + String query = "BEGIN TRANSACTION\n" + + " LET row0 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k = ? AND c = ?);\n" + + " LET row1 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k = ? AND c = ?);\n" + + " SELECT row1.v;\n" + + " IF row0 IS NULL AND row1.v = ? THEN\n" + + " INSERT INTO " + qualifiedAccordTableName + " (k, c, v) VALUES (?, ?, ?);\n" + + " END IF\n" + + "COMMIT TRANSACTION"; + SimpleQueryResult result = cluster.coordinator(1).executeWithResult(query, ConsistencyLevel.ANY, 0, 0, 1, 0, 2, 0, 0, 1); + assertEquals(ImmutableList.of("row1.v"), result.names()); + assertThat(result).hasSize(1).contains(3); + + String check = "BEGIN TRANSACTION\n" + + " SELECT * FROM " + qualifiedAccordTableName + " WHERE k=0 AND c=0;\n" + + "COMMIT TRANSACTION"; + assertEmptyWithPreemptedRetry(cluster, check); + }); + } + + @Test + public void testReversedClusteringReference() throws Exception + { + test("CREATE TABLE " + qualifiedAccordTableName + " (k int, c int, v int, PRIMARY KEY (k, c)) WITH CLUSTERING ORDER BY (c DESC) AND transactional_mode='" + transactionalMode + "'", + cluster -> + { + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + " (k, c, v) VALUES (1, 1, 1)", ConsistencyLevel.ALL); + + String update = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k = 1 AND c = 1);\n" + + " SELECT row1.k, row1.c, row1.v;\n" + + " IF row1.c = 1 THEN\n" + + " UPDATE " + qualifiedAccordTableName + " SET v += row1.c WHERE k=1 AND c=1;\n" + + " END IF\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[]{1, 1, 1}, update); + + String check = "BEGIN TRANSACTION\n" + + " SELECT * FROM " + qualifiedAccordTableName + " WHERE k = 1 AND c = 1;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[]{1, 1, 2}, check); + }); + } + + @Test + public void testScalarShorthandAddition() throws Exception + { + testScalarShorthandOperation(1, "+=", 2); + } + + @Test + public void testScalarShorthandSubtraction() throws Exception + { + testScalarShorthandOperation(3, "-=", 2); + } + + private void testScalarShorthandOperation(int startingValue, String operation, int endingvalue) throws Exception + { + test("CREATE TABLE " + qualifiedAccordTableName + " (k int PRIMARY KEY, v int) WITH transactional_mode='" + transactionalMode + "'", + cluster -> + { + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + " (k, v) VALUES (1, ?)", ConsistencyLevel.ALL, startingValue); + + String update = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k = 1);\n" + + " SELECT row1.v;\n" + + " UPDATE " + qualifiedAccordTableName + " SET v " + operation + " 1 WHERE k = 1;\n" + + "COMMIT TRANSACTION"; + assertRowEquals(cluster, new Object[] { startingValue }, update); + + String check = "BEGIN TRANSACTION\n" + + " SELECT v FROM " + qualifiedAccordTableName + " WHERE k = 1;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { 2 }, check); + }); + } + + @Test + public void testConstantNonStaticRowReadBeforeUpdate() throws Exception + { + test("CREATE TABLE " + qualifiedAccordTableName + " (k int, c int, v int, PRIMARY KEY (k, c)) WITH transactional_mode='" + transactionalMode + "'", + cluster -> + { + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + " (k, c, v) VALUES (1, 2, ?)", ConsistencyLevel.ALL, 3); + + String update = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k = 1 AND c = 2);\n" + + " SELECT row1.v;\n" + + " UPDATE " + qualifiedAccordTableName + " SET v += 1 WHERE k = 1 AND c = 2;\n" + + "COMMIT TRANSACTION"; + assertRowEquals(cluster, new Object[] { 3 }, update); + + String check = "BEGIN TRANSACTION\n" + + " SELECT v FROM " + qualifiedAccordTableName + " WHERE k = 1 AND c = 2;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { 4 }, check); + }); + } + + @Test + public void testRangeDeletion() throws Exception + { + test("CREATE TABLE " + qualifiedAccordTableName + " (k int, c int, v int, PRIMARY KEY (k, c)) WITH transactional_mode='" + transactionalMode + "'", + cluster -> + { + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + " (k, c, v) VALUES (1, 2, ?)", ConsistencyLevel.ALL, 3); + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + " (k, c, v) VALUES (1, 3, ?)", ConsistencyLevel.ALL, 4); + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + " (k, c, v) VALUES (1, 4, ?)", ConsistencyLevel.ALL, 5); + + String update = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k = 1 AND c = 2);\n" + + " SELECT row1.v;\n" + + " DELETE FROM " + qualifiedAccordTableName + " WHERE k = 1 AND c >=3 AND c <= 4;\n" + + "COMMIT TRANSACTION"; + assertRowEquals(cluster, new Object[] { 3 }, update); + + Object[][] check = cluster.coordinator(1).execute("SELECT * FROM " + qualifiedAccordTableName + " WHERE k = 1;", ConsistencyLevel.SERIAL); + assertArrayEquals(new Object[] { 1, 2, 3 }, check[0]); + assertEquals(1, check.length); + }); + } + + + @Test + public void testPartitionKeyReferenceCondition() throws Exception + { + test("CREATE TABLE " + qualifiedAccordTableName + " (k INT, c INT, v INT, PRIMARY KEY (k, c)) WITH CLUSTERING ORDER BY (c DESC) AND transactional_mode='" + transactionalMode + "'", + cluster -> + { + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + " (k, c, v) VALUES (1, 1, 1)", ConsistencyLevel.ALL); + + String update = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k = 1 AND c = 1);\n" + + " SELECT row1.k, row1.c, row1.v;\n" + + " IF row1.k = 1 THEN\n" + + " UPDATE " + qualifiedAccordTableName + " SET v += row1.k WHERE k=1 AND c=1;\n" + + " END IF\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[]{1, 1, 1}, update); + + String check = "BEGIN TRANSACTION\n" + + " SELECT * FROM " + qualifiedAccordTableName + " WHERE k = 1 AND c = 1;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[]{1, 1, 2}, check); + }); + } + + @Test + public void testMultiPartitionKeyReferenceCondition() throws Exception + { + test("CREATE TABLE " + qualifiedAccordTableName + " (pk1 INT, pk2 INT, c INT, v INT, PRIMARY KEY ((pk1, pk2), c)) WITH CLUSTERING ORDER BY (c DESC) AND transactional_mode='" + transactionalMode + "'", + cluster -> + { + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + " (pk1, pk2, c, v) VALUES (1, 1, 1, 1)", ConsistencyLevel.ALL); + + String update = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE pk1 = 1 AND pk2 = 1 AND c = 1);\n" + + " SELECT row1.pk1, row1.pk2, row1.c, row1.v;\n" + + " IF row1.pk1 = 1 THEN\n" + + " UPDATE " + qualifiedAccordTableName + " SET v += row1.pk2 WHERE pk1 = 1 AND pk2 = 1 AND c=1;\n" + + " END IF\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[]{1, 1, 1, 1}, update); + + String check = "BEGIN TRANSACTION\n" + + " SELECT * FROM " + qualifiedAccordTableName + " WHERE pk1 = 1 AND pk2 = 1 AND c = 1;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[]{1, 1, 1, 2}, check); + }); + } + + @Test + public void testMultiCellListEqCondition() throws Exception + { + testListEqCondition("CREATE TABLE " + qualifiedAccordTableName + " (k int PRIMARY KEY, int_list list) WITH transactional_mode='" + transactionalMode + "'"); + } + + @Test + public void testFrozenListEqCondition() throws Exception + { + testListEqCondition("CREATE TABLE " + qualifiedAccordTableName + " (k int PRIMARY KEY, int_list frozen>) WITH transactional_mode='" + transactionalMode + "'"); + } + + private void testListEqCondition(String ddl) throws Exception + { + test(ddl, + cluster -> + { + ListType listType = ListType.getInstance(Int32Type.instance, true); + List initialList = Arrays.asList(1, 2); + ByteBuffer initialListBytes = listType.getSerializer().serialize(initialList); + + String insert = "BEGIN TRANSACTION\n" + + " INSERT INTO " + qualifiedAccordTableName + " (k, int_list) VALUES (?, ?);\n" + + "COMMIT TRANSACTION"; + SimpleQueryResult result = cluster.coordinator(1).executeWithResult(insert, ConsistencyLevel.ANY, 0, initialListBytes); + assertFalse(result.hasNext()); + + List updatedList = Arrays.asList(1, 2, 3); + ByteBuffer updatedListBytes = listType.getSerializer().serialize(updatedList); + + String update = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k = ?);\n" + + " SELECT row1.int_list;\n" + + " IF row1.int_list = ? THEN\n" + + " UPDATE " + qualifiedAccordTableName + " SET int_list = ? WHERE k = ?;\n" + + " END IF\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] {initialList}, update, 0, initialListBytes, updatedListBytes, 0); + + String check = "BEGIN TRANSACTION\n" + + " SELECT * FROM " + qualifiedAccordTableName + " WHERE k = ?;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] {0, updatedList}, check, 0); + } + ); + } + + @Test + public void testMultiCellSetEqCondition() throws Exception + { + testSetEqCondition("CREATE TABLE " + qualifiedAccordTableName + " (k int PRIMARY KEY, int_set set) WITH transactional_mode='" + transactionalMode + "'"); + } + + @Test + public void testFrozenSetEqCondition() throws Exception + { + testSetEqCondition("CREATE TABLE " + qualifiedAccordTableName + " (k int PRIMARY KEY, int_set frozen>) WITH transactional_mode='" + transactionalMode + "'"); + } + + private void testSetEqCondition(String ddl) throws Exception + { + test(ddl, + cluster -> + { + SetType setType = SetType.getInstance(Int32Type.instance, true); + Set initialSet = ImmutableSet.of(1, 2); + ByteBuffer initialSetBytes = setType.getSerializer().serialize(initialSet); + + String insert = "BEGIN TRANSACTION\n" + + " INSERT INTO " + qualifiedAccordTableName + " (k, int_set) VALUES (?, ?);\n" + + "COMMIT TRANSACTION"; + SimpleQueryResult result = cluster.coordinator(1).executeWithResult(insert, ConsistencyLevel.ANY, 0, initialSetBytes); + assertFalse(result.hasNext()); + + Set updatedSet = ImmutableSet.of(1, 2, 3); + ByteBuffer updatedSetBytes = setType.getSerializer().serialize(updatedSet); + + String update = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k = ?);\n" + + " SELECT row1.int_set;\n" + + " IF row1.int_set = ? THEN\n" + + " UPDATE " + qualifiedAccordTableName + " SET int_set = ? WHERE k = ?;\n" + + " END IF\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] {initialSet}, update, 0, initialSetBytes, updatedSetBytes, 0); + + String check = "BEGIN TRANSACTION\n" + + " SELECT * FROM " + qualifiedAccordTableName + " WHERE k = ?;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] {0, updatedSet}, check, 0); + } + ); + } + + @Test + public void testMultiCellMapEqCondition() throws Exception + { + testMapEqCondition("CREATE TABLE " + qualifiedAccordTableName + " (k int PRIMARY KEY, int_map map) WITH transactional_mode='" + transactionalMode + "'", true); + } + + @Test + public void testFrozenMapEqCondition() throws Exception + { + testMapEqCondition("CREATE TABLE " + qualifiedAccordTableName + " (k int PRIMARY KEY, int_map frozen>) WITH transactional_mode='" + transactionalMode + "'", false); + } + + private void testMapEqCondition(String ddl, boolean isMultiCell) throws Exception + { + test(ddl, + cluster -> + { + MapType mapType = MapType.getInstance(UTF8Type.instance, Int32Type.instance, isMultiCell); + Map initialMap = ImmutableMap.of("one", 1, "two", 2); + ByteBuffer initialMapBytes = mapType.getSerializer().serialize(initialMap); + + String insert = "BEGIN TRANSACTION\n" + + " INSERT INTO " + qualifiedAccordTableName + " (k, int_map) VALUES (?, ?);\n" + + "COMMIT TRANSACTION"; + SimpleQueryResult result = cluster.coordinator(1).executeWithResult(insert, ConsistencyLevel.ANY, 0, initialMapBytes); + assertFalse(result.hasNext()); + + Map updatedMap = ImmutableMap.of("one", 1, "two", 2, "three", 3); + ByteBuffer updatedMapBytes = mapType.getSerializer().serialize(updatedMap); + + String update = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k = ?);\n" + + " SELECT row1.int_map;\n" + + " IF row1.int_map = ? THEN\n" + + " UPDATE " + qualifiedAccordTableName + " SET int_map = ? WHERE k = ?;\n" + + " END IF\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { initialMap }, update, 0, initialMapBytes, updatedMapBytes, 0); + + String check = "BEGIN TRANSACTION\n" + + " SELECT * FROM " + qualifiedAccordTableName + " WHERE k = ?;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { 0, updatedMap }, check, 0); + } + ); + } + + @Test + public void testMultiCellUDTEqCondition() throws Exception + { + testUDTEqCondition("CREATE TABLE " + qualifiedAccordTableName + " (k int PRIMARY KEY, customer person) WITH transactional_mode='" + transactionalMode + "'"); + } + + @Test + public void testFrozenUDTEqCondition() throws Exception + { + testUDTEqCondition("CREATE TABLE " + qualifiedAccordTableName + " (k int PRIMARY KEY, customer frozen) WITH transactional_mode='" + transactionalMode + "'"); + } + + private void testUDTEqCondition(String tableDDL) throws Exception + { + test(tableDDL, + cluster -> + { + Object initialPersonValue = CQLTester.userType("height", 74, "age", 37); + ByteBuffer initialPersonBuffer = CQLTester.makeByteBuffer(initialPersonValue, null); + + String insert = "BEGIN TRANSACTION\n" + + " INSERT INTO " + qualifiedAccordTableName + " (k, customer) VALUES (?, ?);\n" + + "COMMIT TRANSACTION"; + SimpleQueryResult result = cluster.coordinator(1).executeWithResult(insert, ConsistencyLevel.ANY, 0, initialPersonBuffer); + assertFalse(result.hasNext()); + + Object updatedPersonValue = CQLTester.userType("height", 73, "age", 40); + ByteBuffer updatedPersonBuffer = CQLTester.makeByteBuffer(updatedPersonValue, null); + + String update = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k = ?);\n" + + " SELECT row1.customer;\n" + + " IF row1.customer = ? THEN\n" + + " UPDATE " + qualifiedAccordTableName + " SET customer = ? WHERE k = ?;\n" + + " END IF\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { initialPersonBuffer }, update, 0, initialPersonBuffer, updatedPersonBuffer, 0); + + String check = "BEGIN TRANSACTION\n" + + " SELECT * FROM " + qualifiedAccordTableName + " WHERE k = ?;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { 0, updatedPersonBuffer }, check, 0); + } + ); + } + + @Test + public void testTupleEqCondition() throws Exception + { + test("CREATE TABLE " + qualifiedAccordTableName + " (k int PRIMARY KEY, pair tuple) WITH transactional_mode='" + transactionalMode + "'", + cluster -> + { + Object initialTupleValue = CQLTester.tuple("age", 37); + ByteBuffer initialTupleBuffer = CQLTester.makeByteBuffer(initialTupleValue, null); + + String insert = "BEGIN TRANSACTION\n" + + " INSERT INTO " + qualifiedAccordTableName + " (k, pair) VALUES (?, ?);\n" + + "COMMIT TRANSACTION"; + SimpleQueryResult result = cluster.coordinator(1).executeWithResult(insert, ConsistencyLevel.ANY, 0, initialTupleBuffer); + assertFalse(result.hasNext()); + + Object updatedTupleValue = CQLTester.userType("age", 40); + ByteBuffer updatedTupleBuffer = CQLTester.makeByteBuffer(updatedTupleValue, null); + + String update = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k = ?);\n" + + " SELECT row1.pair;\n" + + " IF row1.pair = ? THEN\n" + + " UPDATE " + qualifiedAccordTableName + " SET pair = ? WHERE k = ?;\n" + + " END IF\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { initialTupleBuffer }, update, 0, initialTupleBuffer, updatedTupleBuffer, 0); + + String check = "BEGIN TRANSACTION\n" + + " SELECT * FROM " + qualifiedAccordTableName + " WHERE k = ?;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { 0, updatedTupleBuffer }, check, 0); + } + ); + } + + @Test + public void testIsNullWithComplexDeletion() throws Exception + { + test("CREATE TABLE " + qualifiedAccordTableName + " (k int, c int, int_list list, PRIMARY KEY (k, c)) WITH transactional_mode='" + transactionalMode + "'", + cluster -> + { + ListType listType = ListType.getInstance(Int32Type.instance, true); + List initialList = Arrays.asList(1, 2); + ByteBuffer initialListBytes = listType.getSerializer().serialize(initialList); + + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + " (k, c, int_list) VALUES (0, 0, ?);", ConsistencyLevel.ALL, initialListBytes); + cluster.forEach(i -> i.flush(KEYSPACE)); + cluster.coordinator(1).execute("DELETE int_list FROM " + qualifiedAccordTableName + " WHERE k = 0 AND c = 0;", ConsistencyLevel.ALL); + + List updatedList = Arrays.asList(1, 2, 3); + ByteBuffer updatedListBytes = listType.getSerializer().serialize(updatedList); + + String insert = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k = ? AND c = ?);\n" + + " SELECT row1.int_list;\n" + + " IF row1.int_list IS NULL THEN\n" + + " INSERT INTO " + qualifiedAccordTableName + " (k, c, int_list) VALUES (?, ?, ?);\n" + + " END IF\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { null }, insert, 0, 0, 0, 0, updatedListBytes); + + String check = "BEGIN TRANSACTION\n" + + " SELECT * FROM " + qualifiedAccordTableName + " WHERE k = ? AND c = ?;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { 0, 0, updatedList }, check, 0, 0); + } + ); + } + + @Test + public void testNullMultiCellListConditions() throws Exception + { + testNullListConditions("CREATE TABLE " + qualifiedAccordTableName + " (k int PRIMARY KEY, int_list list) WITH transactional_mode='" + transactionalMode + "'"); + } + + @Test + public void testNullFrozenListConditions() throws Exception + { + testNullListConditions("CREATE TABLE " + qualifiedAccordTableName + " (k int PRIMARY KEY, int_list frozen>) WITH transactional_mode='" + transactionalMode + "'"); + } + + private void testNullListConditions(String ddl) throws Exception + { + test(ddl, + cluster -> + { + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + " (k, int_list) VALUES (0, null);", ConsistencyLevel.ALL); + + ListType listType = ListType.getInstance(Int32Type.instance, true); + List initialList = Arrays.asList(1, 2); + ByteBuffer initialListBytes = listType.getSerializer().serialize(initialList); + + String insert = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k = ?);\n" + + " SELECT row1.int_list;\n" + + " IF row1.int_list IS NULL THEN\n" + + " INSERT INTO " + qualifiedAccordTableName + " (k, int_list) VALUES (?, ?);\n" + + " END IF\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] {null}, insert, 0, 0, initialListBytes); + + String check = "BEGIN TRANSACTION\n" + + " SELECT * FROM " + qualifiedAccordTableName + " WHERE k = ?;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] {0, initialList}, check, 0); + + String update = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k = ?);\n" + + " SELECT row1.int_list;\n" + + " IF row1.int_list IS NOT NULL THEN\n" + + " UPDATE " + qualifiedAccordTableName + " SET int_list = ? WHERE k = ?;\n" + + " END IF\n" + + "COMMIT TRANSACTION"; + + List updatedList = Arrays.asList(1, 2, 3); + ByteBuffer updatedListBytes = listType.getSerializer().serialize(updatedList); + assertRowEqualsWithPreemptedRetry(cluster, new Object[] {initialList}, update, 0, updatedListBytes, 0); + } + ); + } + + @Test + public void testNullMultiCellSetConditions() throws Exception + { + testNullSetConditions("CREATE TABLE " + qualifiedAccordTableName + " (k int PRIMARY KEY, int_set set) WITH transactional_mode='" + transactionalMode + "'"); + } + + @Test + public void testNullFrozenSetConditions() throws Exception + { + testNullSetConditions("CREATE TABLE " + qualifiedAccordTableName + " (k int PRIMARY KEY, int_set frozen>) WITH transactional_mode='" + transactionalMode + "'"); + } + + private void testNullSetConditions(String ddl) throws Exception + { + test(ddl, + cluster -> + { + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + " (k, int_set) VALUES (0, null);", ConsistencyLevel.ALL); + + SetType setType = SetType.getInstance(Int32Type.instance, true); + Set initialSet = ImmutableSet.of(1, 2); + ByteBuffer initialSetBytes = setType.getSerializer().serialize(initialSet); + + String insert = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k = ?);\n" + + " SELECT row1.int_set;\n" + + " IF row1.int_set IS NULL THEN\n" + + " INSERT INTO " + qualifiedAccordTableName + " (k, int_set) VALUES (?, ?);\n" + + " END IF\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] {null}, insert, 0, 0, initialSetBytes); + + String check = "BEGIN TRANSACTION\n" + + " SELECT * FROM " + qualifiedAccordTableName + " WHERE k = ?;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] {0, initialSet}, check, 0); + + String update = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k = ?);\n" + + " SELECT row1.int_set;\n" + + " IF row1.int_set IS NOT NULL THEN\n" + + " UPDATE " + qualifiedAccordTableName + " SET int_set = ? WHERE k = ?;\n" + + " END IF\n" + + "COMMIT TRANSACTION"; + + Set updatedSet = ImmutableSet.of(1, 2, 3); + ByteBuffer updatedSetBytes = setType.getSerializer().serialize(updatedSet); + assertRowEqualsWithPreemptedRetry(cluster, new Object[] {initialSet}, update, 0, updatedSetBytes, 0); + } + ); + } + + @Test + public void testNullMultiCellMapConditions() throws Exception + { + testNullMapConditions("CREATE TABLE " + qualifiedAccordTableName + " (k int PRIMARY KEY, int_map map) WITH transactional_mode='" + transactionalMode + "'", true); + } + + @Test + public void testNullFrozenMapConditions() throws Exception + { + testNullMapConditions("CREATE TABLE " + qualifiedAccordTableName + " (k int PRIMARY KEY, int_map frozen>) WITH transactional_mode='" + transactionalMode + "'", false); + } + + private void testNullMapConditions(String ddl, boolean isMultiCell) throws Exception + { + test(ddl, + cluster -> + { + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + " (k, int_map) VALUES (0, null);", ConsistencyLevel.ALL); + + MapType mapType = MapType.getInstance(UTF8Type.instance, Int32Type.instance, isMultiCell); + Map initialMap = ImmutableMap.of("one", 1, "two", 2); + ByteBuffer initialMapBytes = mapType.getSerializer().serialize(initialMap); + + String insert = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k = ?);\n" + + " SELECT row1.int_map;\n" + + " IF row1.int_map IS NULL THEN\n" + + " INSERT INTO " + qualifiedAccordTableName + " (k, int_map) VALUES (?, ?);\n" + + " END IF\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { null }, insert, 0, 0, initialMapBytes); + + String check = "BEGIN TRANSACTION\n" + + " SELECT * FROM " + qualifiedAccordTableName + " WHERE k = ?;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { 0, initialMap }, check, 0); + + String update = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k = ?);\n" + + " SELECT row1.int_map;\n" + + " IF row1.int_map IS NOT NULL THEN\n" + + " UPDATE " + qualifiedAccordTableName + " SET int_map = ? WHERE k = ?;\n" + + " END IF\n" + + "COMMIT TRANSACTION"; + + Map updatedMap = ImmutableMap.of("one", 1, "two", 2, "three", 3); + ByteBuffer updatedMapBytes = mapType.getSerializer().serialize(updatedMap); + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { initialMap }, update, 0, updatedMapBytes, 0); + + String checkUpdate = "BEGIN TRANSACTION\n" + + " SELECT * FROM " + qualifiedAccordTableName + " WHERE k = ?;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { 0, updatedMap }, checkUpdate, 0); + } + ); + } + + @Test + public void testNullMultiCellUDTCondition() throws Exception + { + testNullUDTCondition("CREATE TABLE " + qualifiedAccordTableName + " (k int PRIMARY KEY, customer person) WITH transactional_mode='" + transactionalMode + "'"); + } + + @Test + public void testNullFrozenUDTCondition() throws Exception + { + testNullUDTCondition("CREATE TABLE " + qualifiedAccordTableName + " (k int PRIMARY KEY, customer frozen) WITH transactional_mode='" + transactionalMode + "'"); + } + + private void testNullUDTCondition(String tableDDL) throws Exception + { + test(tableDDL, + cluster -> + { + Object initialPersonValue = CQLTester.userType("height", 74, "age", 37); + ByteBuffer initialPersonBuffer = CQLTester.makeByteBuffer(initialPersonValue, null); + + String insert = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k = ?);\n" + + " SELECT row1.customer;\n" + + " IF row1.customer IS NULL THEN\n" + + " INSERT INTO " + qualifiedAccordTableName + " (k, customer) VALUES (?, ?);\n" + + " END IF\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { null }, insert, 0, 0, initialPersonBuffer); + + String check = "BEGIN TRANSACTION\n" + + " SELECT * FROM " + qualifiedAccordTableName + " WHERE k = ?;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { 0, initialPersonBuffer }, check, 0); + + String update = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k = ?);\n" + + " SELECT row1.customer;\n" + + " IF row1.customer IS NOT NULL THEN\n" + + " UPDATE " + qualifiedAccordTableName + " SET customer = ? WHERE k = ?;\n" + + " END IF\n" + + "COMMIT TRANSACTION"; + + Object updatedPersonValue = CQLTester.userType("height", 73, "age", 40); + ByteBuffer updatedPersonBuffer = CQLTester.makeByteBuffer(updatedPersonValue, null); + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { initialPersonBuffer }, update, 0, updatedPersonBuffer, 0); + + String checkUpdate = "BEGIN TRANSACTION\n" + + " SELECT * FROM " + qualifiedAccordTableName + " WHERE k = ?;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { 0, updatedPersonBuffer }, checkUpdate, 0); + } + ); + } + + @Test + public void testNullMultiCellSetElementConditions() throws Exception + { + testNullSetElementConditions("CREATE TABLE " + qualifiedAccordTableName + " (k int PRIMARY KEY, int_set set) WITH transactional_mode='" + transactionalMode + "'"); + } + + @Test + public void testNullFrozenSetElementConditions() throws Exception + { + testNullSetElementConditions("CREATE TABLE " + qualifiedAccordTableName + " (k int PRIMARY KEY, int_set frozen>) WITH transactional_mode='" + transactionalMode + "'"); + } + + private void testNullSetElementConditions(String ddl) throws Exception + { + test(ddl, + cluster -> + { + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + " (k, int_set) VALUES (0, {1});", ConsistencyLevel.ALL); + + SetType setType = SetType.getInstance(Int32Type.instance, true); + Set initialSet = ImmutableSet.of(1, 2); + ByteBuffer initialSetBytes = setType.getSerializer().serialize(initialSet); + + String insert = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k = ?);\n" + + " SELECT row1.int_set[2];\n" + + " IF row1.int_set[2] IS NULL THEN\n" + + " INSERT INTO " + qualifiedAccordTableName + " (k, int_set) VALUES (?, ?);\n" + + " END IF\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] {null}, insert, 0, 0, initialSetBytes); + + String check = "BEGIN TRANSACTION\n" + + " SELECT * FROM " + qualifiedAccordTableName + " WHERE k = ?;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] {0, initialSet}, check, 0); + + String update = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k = ?);\n" + + " SELECT row1.int_set;\n" + + " IF row1.int_set[2] IS NOT NULL THEN\n" + + " UPDATE " + qualifiedAccordTableName + " SET int_set = ? WHERE k = ?;\n" + + " END IF\n" + + "COMMIT TRANSACTION"; + + Set updatedSet = ImmutableSet.of(1, 2, 3); + ByteBuffer updatedSetBytes = setType.getSerializer().serialize(updatedSet); + assertRowEqualsWithPreemptedRetry(cluster, new Object[] {initialSet}, update, 0, updatedSetBytes, 0); + } + ); + } + + @Test + public void testNullMultiCellMapElementConditions() throws Exception + { + testNullMapElementConditions("CREATE TABLE " + qualifiedAccordTableName + " (k int PRIMARY KEY, int_map map) WITH transactional_mode='" + transactionalMode + "'", true); + } + + @Test + public void testNullFrozenMapElementConditions() throws Exception + { + testNullMapElementConditions("CREATE TABLE " + qualifiedAccordTableName + " (k int PRIMARY KEY, int_map frozen>) WITH transactional_mode='" + transactionalMode + "'", false); + } + + private void testNullMapElementConditions(String ddl, boolean isMultiCell) throws Exception + { + test(ddl, + cluster -> + { + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + " (k, int_map) VALUES (0, null);", ConsistencyLevel.ALL); + + MapType mapType = MapType.getInstance(UTF8Type.instance, Int32Type.instance, isMultiCell); + Map initialMap = ImmutableMap.of("one", 1, "two", 2); + ByteBuffer initialMapBytes = mapType.getSerializer().serialize(initialMap); + + String insert = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k = ?);\n" + + " SELECT row1.int_map;\n" + + " IF row1.int_map[?] IS NULL THEN\n" + + " INSERT INTO " + qualifiedAccordTableName + " (k, int_map) VALUES (?, ?);\n" + + " END IF\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { null }, insert, 0, "one", 0, initialMapBytes); + + String check = "BEGIN TRANSACTION\n" + + " SELECT * FROM " + qualifiedAccordTableName + " WHERE k = ?;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { 0, initialMap }, check, 0); + + String update = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k = ?);\n" + + " SELECT row1.int_map;\n" + + " IF row1.int_map[?] IS NOT NULL THEN\n" + + " UPDATE " + qualifiedAccordTableName + " SET int_map = ? WHERE k = ?;\n" + + " END IF\n" + + "COMMIT TRANSACTION"; + + Map updatedMap = ImmutableMap.of("one", 1, "two", 2, "three", 3); + ByteBuffer updatedMapBytes = mapType.getSerializer().serialize(updatedMap); + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { initialMap }, update, 0, "two", updatedMapBytes, 0); + + String checkUpdate = "BEGIN TRANSACTION\n" + + " SELECT * FROM " + qualifiedAccordTableName + " WHERE k = ?;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { 0, updatedMap }, checkUpdate, 0); + } + ); + } + + @Test + public void testNullMultiCellUDTFieldCondition() throws Exception + { + testNullUDTFieldCondition("CREATE TABLE " + qualifiedAccordTableName + " (k int PRIMARY KEY, customer person) WITH transactional_mode='" + transactionalMode + "'"); + } + + @Test + public void testNullFrozenUDTFieldCondition() throws Exception + { + testNullUDTFieldCondition("CREATE TABLE " + qualifiedAccordTableName + " (k int PRIMARY KEY, customer frozen) WITH transactional_mode='" + transactionalMode + "'"); + } + + private void testNullUDTFieldCondition(String tableDDL) throws Exception + { + test(tableDDL, + cluster -> + { + Object initialPersonValue = CQLTester.userType("height", 74, "age", 37); + ByteBuffer initialPersonBuffer = CQLTester.makeByteBuffer(initialPersonValue, null); + + String insert = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k = ?);\n" + + " SELECT row1.customer;\n" + + " IF row1.customer.age IS NULL THEN\n" + + " INSERT INTO " + qualifiedAccordTableName + " (k, customer) VALUES (?, ?);\n" + + " END IF\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { null }, insert, 0, 0, initialPersonBuffer); + + String check = "BEGIN TRANSACTION\n" + + " SELECT * FROM " + qualifiedAccordTableName + " WHERE k = ?;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { 0, initialPersonBuffer }, check, 0); + + String update = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k = ?);\n" + + " SELECT row1.customer;\n" + + " IF row1.customer.age IS NOT NULL THEN\n" + + " UPDATE " + qualifiedAccordTableName + " SET customer = ? WHERE k = ?;\n" + + " END IF\n" + + "COMMIT TRANSACTION"; + + Object updatedPersonValue = CQLTester.userType("height", 73, "age", 40); + ByteBuffer updatedPersonBuffer = CQLTester.makeByteBuffer(updatedPersonValue, null); + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { initialPersonBuffer }, update, 0, updatedPersonBuffer, 0); + + String checkUpdate = "BEGIN TRANSACTION\n" + + " SELECT * FROM " + qualifiedAccordTableName + " WHERE k = ?;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { 0, updatedPersonBuffer }, checkUpdate, 0); + } + ); + } + + @Test + public void testMultiCellListSubstitution() throws Exception + { + testListSubstitution("CREATE TABLE " + qualifiedAccordTableName + " (k int PRIMARY KEY, int_list list) WITH transactional_mode='" + transactionalMode + "'", true); + } + + @Test + public void testFrozenListSubstitution() throws Exception + { + testListSubstitution("CREATE TABLE " + qualifiedAccordTableName + " (k int PRIMARY KEY, int_list frozen>) WITH transactional_mode='" + transactionalMode + "'", false); + } + + private void testListSubstitution(String ddl, boolean isMultiCell) throws Exception + { + test(ddl, + cluster -> + { + ListType listType = ListType.getInstance(Int32Type.instance, isMultiCell); + List initialList = Arrays.asList(1, 2); + ByteBuffer initialListBytes = listType.getSerializer().serialize(initialList); + + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + " (k, int_list) VALUES (0, ?);", ConsistencyLevel.ALL, initialListBytes); + + String insert = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k = ?);\n" + + " SELECT row1.int_list;\n" + + " IF row1.int_list IS NOT NULL THEN\n" + + " INSERT INTO " + qualifiedAccordTableName + " (k, int_list) VALUES (?, row1.int_list);\n" + + " END IF\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { initialList }, insert, 0, 1); + + String check = "BEGIN TRANSACTION\n" + + " SELECT * FROM " + qualifiedAccordTableName + " WHERE k = ?;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { 1, initialList }, check, 1); + } + ); + } + + @Test + public void testMultiCellSetSubstitution() throws Exception + { + testSetSubstitution("CREATE TABLE " + qualifiedAccordTableName + " (k int PRIMARY KEY, int_set set) WITH transactional_mode='" + transactionalMode + "'", true); + } + + @Test + public void testFrozenSetSubstitution() throws Exception + { + testSetSubstitution("CREATE TABLE " + qualifiedAccordTableName + " (k int PRIMARY KEY, int_set frozen>) WITH transactional_mode='" + transactionalMode + "'", false); + } + + private void testSetSubstitution(String ddl, boolean isMultiCell) throws Exception + { + test(ddl, + cluster -> + { + SetType setType = SetType.getInstance(Int32Type.instance, isMultiCell); + Set initialSet = ImmutableSet.of(1, 2); + ByteBuffer initialSetBytes = setType.getSerializer().serialize(initialSet); + + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + " (k, int_set) VALUES (0, ?);", ConsistencyLevel.ALL, initialSetBytes); + + String insert = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k = ?);\n" + + " SELECT row1.int_set;\n" + + " IF row1.int_set IS NOT NULL THEN\n" + + " INSERT INTO " + qualifiedAccordTableName + " (k, int_set) VALUES (?, row1.int_set);\n" + + " END IF\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { initialSet }, insert, 0, 1); + + String check = "BEGIN TRANSACTION\n" + + " SELECT * FROM " + qualifiedAccordTableName + " WHERE k = ?;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { 1, initialSet }, check, 1); + } + ); + } + + @Test + public void testMultiCellMapSubstitution() throws Exception + { + testMapSubstitution("CREATE TABLE " + qualifiedAccordTableName + " (k int PRIMARY KEY, int_map map) WITH transactional_mode='" + transactionalMode + "'", true); + } + + @Test + public void testFrozenMapSubstitution() throws Exception + { + testMapSubstitution("CREATE TABLE " + qualifiedAccordTableName + " (k int PRIMARY KEY, int_map frozen>) WITH transactional_mode='" + transactionalMode + "'", false); + } + + private void testMapSubstitution(String ddl, boolean isMultiCell) throws Exception + { + test(ddl, + cluster -> + { + MapType mapType = MapType.getInstance(UTF8Type.instance, Int32Type.instance, isMultiCell); + Map initialMap = ImmutableMap.of("one", 1, "two", 2); + ByteBuffer initialMapBytes = mapType.getSerializer().serialize(initialMap); + + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + " (k, int_map) VALUES (0, ?);", ConsistencyLevel.ALL, initialMapBytes); + + String insert = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k = ?);\n" + + " SELECT row1.int_map;\n" + + " IF row1.int_map IS NOT NULL THEN\n" + + " INSERT INTO " + qualifiedAccordTableName + " (k, int_map) VALUES (?, row1.int_map);\n" + + " END IF\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[]{ initialMap }, insert, 0, 1); + + String check = "BEGIN TRANSACTION\n" + + " SELECT * FROM " + qualifiedAccordTableName + " WHERE k = ?;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { 1, initialMap }, check, 1); + } + ); + } + + @Test + public void testMultiCellUDTSubstitution() throws Exception + { + testUDTSubstitution("CREATE TABLE " + qualifiedAccordTableName + " (k int PRIMARY KEY, customer person) WITH transactional_mode='" + transactionalMode + "'"); + } + + @Test + public void testFrozenUDTSubstitution() throws Exception + { + testUDTSubstitution("CREATE TABLE " + qualifiedAccordTableName + " (k int PRIMARY KEY, customer frozen) WITH transactional_mode='" + transactionalMode + "'"); + } + + private void testUDTSubstitution(String tableDDL) throws Exception + { + test(tableDDL, + cluster -> + { + Object initialPersonValue = CQLTester.userType("height", 74, "age", 37); + ByteBuffer initialPersonBuffer = CQLTester.makeByteBuffer(initialPersonValue, null); + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + " (k, customer) VALUES (0, ?);", ConsistencyLevel.ALL, initialPersonBuffer); + + String insert = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k = ?);\n" + + " SELECT row1.customer;\n" + + " IF row1.customer IS NOT NULL THEN\n" + + " INSERT INTO " + qualifiedAccordTableName + " (k, customer) VALUES (?, row1.customer);\n" + + " END IF\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[]{ initialPersonBuffer }, insert, 0, 1); + + String check = "BEGIN TRANSACTION\n" + + " SELECT * FROM " + qualifiedAccordTableName + " WHERE k = ?;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { 1, initialPersonBuffer }, check, 1); + } + ); + } + + @Test + public void testTupleSubstitution() throws Exception + { + test("CREATE TABLE " + qualifiedAccordTableName + " (k int PRIMARY KEY, pair tuple) WITH transactional_mode='" + transactionalMode + "'", + cluster -> + { + Object initialTupleValue = CQLTester.tuple("age", 37); + ByteBuffer initialTupleBuffer = CQLTester.makeByteBuffer(initialTupleValue, null); + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + " (k, pair) VALUES (0, ?);", ConsistencyLevel.ALL, initialTupleBuffer); + + String insert = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k = ?);\n" + + " SELECT row1.pair;\n" + + " IF row1.pair IS NOT NULL THEN\n" + + " INSERT INTO " + qualifiedAccordTableName + " (k, pair) VALUES (?, row1.pair);\n" + + " END IF\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { initialTupleBuffer }, insert, 0, 1); + + String check = "BEGIN TRANSACTION\n" + + " SELECT * FROM " + qualifiedAccordTableName + " WHERE k = ?;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { 1, initialTupleBuffer }, check, 1); + } + ); + } + + @Test + public void testMultiCellListReplacement() throws Exception + { + testListReplacement("CREATE TABLE " + qualifiedAccordTableName + " (k int PRIMARY KEY, int_list list) WITH transactional_mode='" + transactionalMode + "'"); + } + + @Test + public void testFrozenListReplacement() throws Exception + { + testListReplacement("CREATE TABLE " + qualifiedAccordTableName + " (k int PRIMARY KEY, int_list frozen>) WITH transactional_mode='" + transactionalMode + "'"); + } + + private void testListReplacement(String ddl) throws Exception + { + test(ddl, + cluster -> + { + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + " (k, int_list) VALUES (0, [1, 2]);", ConsistencyLevel.ALL); + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + " (k, int_list) VALUES (1, [3, 4]);", ConsistencyLevel.ALL); + + String update = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k = 1);\n" + + " SELECT row1.int_list;\n" + + " IF row1.int_list = [3, 4] THEN\n" + + " UPDATE " + qualifiedAccordTableName + " SET int_list = row1.int_list WHERE k=0;\n" + + " END IF\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] {Arrays.asList(3, 4)}, update); + + String check = "BEGIN TRANSACTION\n" + + " SELECT * FROM " + qualifiedAccordTableName + " WHERE k = 0;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] {0, Arrays.asList(3, 4)}, check); + } + ); + } + + @Test + public void testMultiCellSetReplacement() throws Exception + { + testSetReplacement("CREATE TABLE " + qualifiedAccordTableName + " (k int PRIMARY KEY, int_set set) WITH transactional_mode='" + transactionalMode + "'"); + } + + @Test + public void testFrozenSetReplacement() throws Exception + { + testSetReplacement("CREATE TABLE " + qualifiedAccordTableName + " (k int PRIMARY KEY, int_set frozen>) WITH transactional_mode='" + transactionalMode + "'"); + } + + private void testSetReplacement(String ddl) throws Exception + { + test(ddl, + cluster -> + { + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + " (k, int_set) VALUES (0, {1, 2});", ConsistencyLevel.ALL); + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + " (k, int_set) VALUES (1, {3, 4});", ConsistencyLevel.ALL); + + String update = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k = 1);\n" + + " SELECT row1.int_set;\n" + + " IF row1.int_set = {3, 4} THEN\n" + + " UPDATE " + qualifiedAccordTableName + " SET int_set = row1.int_set WHERE k=0;\n" + + " END IF\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { ImmutableSet.of(3, 4) }, update); + + String check = "BEGIN TRANSACTION\n" + + " SELECT * FROM " + qualifiedAccordTableName + " WHERE k = 0;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { 0, ImmutableSet.of(3, 4) }, check); + } + ); + } + + @Test + public void testListAppendFromReference() throws Exception + { + test("CREATE TABLE " + qualifiedAccordTableName + " (k int PRIMARY KEY, int_list list) WITH transactional_mode='" + transactionalMode + "'", + cluster -> + { + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + " (k, int_list) VALUES (0, [1, 2]);", ConsistencyLevel.ALL); + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + " (k, int_list) VALUES (1, [3, 4]);", ConsistencyLevel.ALL); + + String update = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k = 1);\n" + + " SELECT row1.int_list;\n" + + " IF row1.int_list = [3, 4] THEN\n" + + " UPDATE " + qualifiedAccordTableName + " SET int_list += row1.int_list WHERE k=0;\n" + + " END IF\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] {Arrays.asList(3, 4)}, update); + + String check = "BEGIN TRANSACTION\n" + + " SELECT * FROM " + qualifiedAccordTableName + " WHERE k = 0;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] {0, Arrays.asList(1, 2, 3, 4)}, check); + } + ); + } + + @Test + public void testSetByIndexFromMultiCellListElement() throws Exception + { + testListSetByIndexFromListElement("CREATE TABLE " + qualifiedAccordTableName + " (k int PRIMARY KEY, src_int_list list, dest_int_list list) WITH transactional_mode='" + transactionalMode + "'"); + } + + @Test + public void testSetByIndexFromFrozenListElement() throws Exception + { + testListSetByIndexFromListElement("CREATE TABLE " + qualifiedAccordTableName + " (k int PRIMARY KEY, src_int_list frozen>, dest_int_list list) WITH transactional_mode='" + transactionalMode + "'"); + } + + private void testListSetByIndexFromListElement(String ddl) throws Exception + { + test(ddl, + cluster -> + { + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + " (k, dest_int_list) VALUES (0, [1, 2]);", ConsistencyLevel.ALL); + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + " (k, src_int_list) VALUES (1, [3, 4]);", ConsistencyLevel.ALL); + + String update = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k = 1);\n" + + " SELECT row1.src_int_list;\n" + + " UPDATE " + qualifiedAccordTableName + " SET dest_int_list[0] = row1.src_int_list[0] WHERE k = 0;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] {Arrays.asList(3, 4)}, update); + + String check = "BEGIN TRANSACTION\n" + + " SELECT dest_int_list FROM " + qualifiedAccordTableName + " WHERE k = 0;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] {Arrays.asList(3, 2)}, check); + } + ); + } + + @Test + public void testListSetByIndexFromScalar() throws Exception + { + test("CREATE TABLE " + qualifiedAccordTableName + " (k int PRIMARY KEY, int_list list) WITH transactional_mode='" + transactionalMode + "'", + cluster -> + { + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + " (k, int_list) VALUES (0, [1, 2]);", ConsistencyLevel.ALL); + + String update = "BEGIN TRANSACTION\n" + + " LET row0 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k = 0);\n" + + " SELECT row0.int_list;\n" + + " UPDATE " + qualifiedAccordTableName + " SET int_list[0] = 2 WHERE k = 0;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] {Arrays.asList(1, 2)}, update); + + String check = "BEGIN TRANSACTION\n" + + " SELECT int_list FROM " + qualifiedAccordTableName + " WHERE k = 0;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] {Arrays.asList(2, 2)}, check); + } + ); + } + + @Test + public void testAutoReadSelectionConstruction() throws Exception + { + test("CREATE TABLE " + qualifiedAccordTableName + " (k int, c int, counter int, other_counter int, PRIMARY KEY (k, c)) WITH transactional_mode='" + transactionalMode + "'", + cluster -> + { + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + " (k, c, counter, other_counter) VALUES (0, 0, 1, 1);", ConsistencyLevel.ALL); + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + " (k, c, counter, other_counter) VALUES (0, 1, 1, 1);", ConsistencyLevel.ALL); + + String update = "BEGIN TRANSACTION\n" + + " LET row0 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k = 0 AND c = 0);\n" + + " SELECT row0.counter, row0.other_counter;\n" + + " UPDATE " + qualifiedAccordTableName + " SET other_counter += 1, counter += row0.counter WHERE k = 0 AND c = 1;\n" + + "COMMIT TRANSACTION"; + assertRowEquals(cluster, new Object[] { 1, 1 }, update); + + String check = "BEGIN TRANSACTION\n" + + " SELECT counter, other_counter FROM " + qualifiedAccordTableName + " WHERE k = 0 AND c = 1;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { 2, 2 }, check); + } + ); + } + + @Test + public void testMultiMutationsSameKey() throws Exception + { + test("CREATE TABLE " + qualifiedAccordTableName + " (k int, c int, counter int, int_list list, PRIMARY KEY (k, c)) WITH transactional_mode='" + transactionalMode + "'", + cluster -> + { + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + " (k, c, counter, int_list) VALUES (0, 0, 0, [1, 2]);", ConsistencyLevel.ALL); + + String update = "BEGIN TRANSACTION\n" + + " LET row0 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k = 0 AND c = 0);\n" + + " SELECT row0.counter, row0.int_list;\n" + + " UPDATE " + qualifiedAccordTableName + " SET int_list[0] = 42 WHERE k = 0 AND c = 0;\n" + + " UPDATE " + qualifiedAccordTableName + " SET counter += 1 WHERE k = 0 AND c = 0;\n" + + "COMMIT TRANSACTION"; + assertRowEquals(cluster, new Object[] { 0, Arrays.asList(1, 2) }, update); + + String check = "BEGIN TRANSACTION\n" + + " SELECT counter, int_list FROM " + qualifiedAccordTableName + " WHERE k = 0 AND c = 0;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] {1, Arrays.asList(42, 2)}, check); + } + ); + } + + @Test + public void testLetLargerThanOneWithPK() throws Exception + { + test(cluster -> { + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + " (k, c, v) VALUES (0, 0, 0);", ConsistencyLevel.ALL); + + String cql = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k=0 AND c=0 LIMIT 2);\n" + + " SELECT row1.v;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[]{ 0 }, cql, 1); + }); + } + + @Test + public void testLetLimitUsingBind() throws Exception + { + test(cluster -> { + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + " (k, c, v) VALUES (0, 0, 0);", ConsistencyLevel.ALL); + + String cql = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k = 0 LIMIT ?);\n" + + " SELECT row1.v;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { 0 }, cql, 1); + }); + } + + @Test + public void testListSetByIndexMultiRow() throws Exception + { + test("CREATE TABLE " + qualifiedAccordTableName + " (k int, c int, int_list list, PRIMARY KEY (k, c)) WITH transactional_mode='" + transactionalMode + "'", + cluster -> + { + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + " (k, c, int_list) VALUES (0, 0, [1, 2]);", ConsistencyLevel.ALL); + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + " (k, c, int_list) VALUES (0, 1, [3, 4]);", ConsistencyLevel.ALL); + + String update = "BEGIN TRANSACTION\n" + + " LET row0 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k = 0 AND c = 0);\n" + + " LET row1 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k = 0 AND c = 1);\n" + + " SELECT row0.int_list;\n" + + " UPDATE " + qualifiedAccordTableName + " SET int_list[0] = row1.int_list[0] WHERE k = 0 AND c = 0;\n" + + " UPDATE " + qualifiedAccordTableName + " SET int_list[0] = row0.int_list[0] WHERE k = 0 AND c = 1;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { Arrays.asList(1, 2) }, update); + + String check = "BEGIN TRANSACTION\n" + + " LET row0 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k = 0 AND c = 0);\n" + + " LET row1 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k = 0 AND c = 1);\n" + + " SELECT row0.int_list, row1.int_list;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] {Arrays.asList(3, 2), Arrays.asList(1, 4)}, check); + } + ); + } + + @Test + public void testSetAppend() throws Exception + { + test("CREATE TABLE " + qualifiedAccordTableName + " (k int PRIMARY KEY, int_set set) WITH transactional_mode='" + transactionalMode + "'", + cluster -> + { + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + " (k, int_set) VALUES (0, {1, 2});", ConsistencyLevel.ALL); + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + " (k, int_set) VALUES (1, {3, 4});", ConsistencyLevel.ALL); + + String update = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k = 1);\n" + + " SELECT row1.int_set;\n" + + " UPDATE " + qualifiedAccordTableName + " SET int_set += row1.int_set WHERE k=0;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { ImmutableSet.of(3, 4) }, update); + + String check = "BEGIN TRANSACTION\n" + + " SELECT * FROM " + qualifiedAccordTableName + " WHERE k = 0;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { 0, ImmutableSet.of(1, 2, 3, 4) }, check); + } + ); + } + + @Test + public void testAssignmentFromMultiCellSetElement() throws Exception + { + testAssignmentFromSetElement("CREATE TABLE " + qualifiedAccordTableName + " (k int PRIMARY KEY, v int, int_set set) WITH transactional_mode='" + transactionalMode + "'"); + } + + @Test + public void testAssignmentFromFrozenSetElement() throws Exception + { + testAssignmentFromSetElement("CREATE TABLE " + qualifiedAccordTableName + " (k int PRIMARY KEY, v int, int_set frozen>) WITH transactional_mode='" + transactionalMode + "'"); + } + + private void testAssignmentFromSetElement(String ddl) throws Exception + { + test(ddl, + cluster -> + { + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + " (k, v, int_set) VALUES (0, 0, {1, 2});", ConsistencyLevel.ALL); + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + " (k, v, int_set) VALUES (1, 0, {3, 4});", ConsistencyLevel.ALL); + + String update = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k = 1);\n" + + " SELECT row1.int_set;\n" + + " UPDATE " + qualifiedAccordTableName + " SET v = row1.int_set[4] WHERE k=0;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { ImmutableSet.of(3, 4) }, update); + + String check = "BEGIN TRANSACTION\n" + + " SELECT v FROM " + qualifiedAccordTableName + " WHERE k = 0;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { 4 }, check); + } + ); + } + + @Test + public void testMapAppend() throws Exception + { + test("CREATE TABLE " + qualifiedAccordTableName + " (k int PRIMARY KEY, int_map map) WITH transactional_mode='" + transactionalMode + "'", + cluster -> + { + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + " (k, int_map) VALUES (0, {'one': 2});", ConsistencyLevel.ALL); + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + " (k, int_map) VALUES (1, {'three': 4});", ConsistencyLevel.ALL); + + String update = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k = 1);\n" + + " SELECT row1.int_map;\n" + + " UPDATE " + qualifiedAccordTableName + " SET int_map += row1.int_map WHERE k=0;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { ImmutableMap.of("three", 4) }, update); + + String check = "BEGIN TRANSACTION\n" + + " SELECT * FROM " + qualifiedAccordTableName + " WHERE k = 0;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { 0, ImmutableMap.of("one", 2, "three", 4) }, check); + } + ); + } + + @Test + public void testAssignmentFromMultiCellMapElement() throws Exception + { + testAssignmentFromMapElement("CREATE TABLE " + qualifiedAccordTableName + " (k int PRIMARY KEY, v int, int_map map) WITH transactional_mode='" + transactionalMode + "'"); + } + + @Test + public void testAssignmentFromFrozenMapElement() throws Exception + { + testAssignmentFromMapElement("CREATE TABLE " + qualifiedAccordTableName + " (k int PRIMARY KEY, v int, int_map frozen>) WITH transactional_mode='" + transactionalMode + "'"); + } + + private void testAssignmentFromMapElement(String ddl) throws Exception + { + test(ddl, + cluster -> + { + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + " (k, v, int_map) VALUES (0, 0, {'one': 2});", ConsistencyLevel.ALL); + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + " (k, v, int_map) VALUES (1, 0, {'three': 4});", ConsistencyLevel.ALL); + + String update = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k = 1);\n" + + " SELECT row1.int_map;\n" + + " UPDATE " + qualifiedAccordTableName + " SET v = row1.int_map[?] WHERE k=0;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { ImmutableMap.of("three", 4) }, update, "three"); + + String check = "BEGIN TRANSACTION\n" + + " SELECT v FROM " + qualifiedAccordTableName + " WHERE k = 0;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { 4 }, check); + } + ); + } + + @Test + public void testAssignmentFromMultiCellUDTField() throws Exception + { + testAssignmentFromUDTField("CREATE TABLE " + qualifiedAccordTableName + " (k int PRIMARY KEY, v int, customer person) WITH transactional_mode='" + transactionalMode + "'"); + } + + @Test + public void testAssignmentFromFrozenUDTField() throws Exception + { + testAssignmentFromUDTField("CREATE TABLE " + qualifiedAccordTableName + " (k int PRIMARY KEY, v int, customer frozen) WITH transactional_mode='" + transactionalMode + "'"); + } + + private void testAssignmentFromUDTField(String tableDDL) throws Exception + { + test(tableDDL, + cluster -> + { + Object initialPersonValue = CQLTester.userType("height", 74, "age", 37); + ByteBuffer initialPersonBuffer = CQLTester.makeByteBuffer(initialPersonValue, null); + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + " (k, v, customer) VALUES (0, 0, null);", ConsistencyLevel.ALL); + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + " (k, v, customer) VALUES (1, 0, ?);", ConsistencyLevel.ALL, initialPersonBuffer); + + String update = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k = 1);\n" + + " SELECT row1.customer;\n" + + " UPDATE " + qualifiedAccordTableName + " SET v = row1.customer.age WHERE k=0;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { initialPersonBuffer }, update); + + String check = "BEGIN TRANSACTION\n" + + " SELECT v FROM " + qualifiedAccordTableName + " WHERE k = 0;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { 37 }, check); + } + ); + } + + @Test + public void testSetMapElementFromMapElementReference() throws Exception + { + test("CREATE TABLE " + qualifiedAccordTableName + " (k int PRIMARY KEY, int_map map) WITH transactional_mode='" + transactionalMode + "'", + cluster -> + { + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + " (k, int_map) VALUES (0, {'one': 2});", ConsistencyLevel.ALL); + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + " (k, int_map) VALUES (1, {'three': 4});", ConsistencyLevel.ALL); + + String update = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k = 1);\n" + + " SELECT row1.int_map;\n" + + " UPDATE " + qualifiedAccordTableName + " SET int_map[?] = row1.int_map[?] WHERE k=0;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { ImmutableMap.of("three", 4) }, update, "one", "three"); + + String check = "BEGIN TRANSACTION\n" + + " SELECT int_map[?] FROM " + qualifiedAccordTableName + " WHERE k = 0;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { 4 }, check, "one"); + } + ); + } + + @Test + public void testSetUDTFieldFromUDTFieldReference() throws Exception + { + test("CREATE TABLE " + qualifiedAccordTableName + " (k int PRIMARY KEY, customer person) WITH transactional_mode='" + transactionalMode + "'", + cluster -> + { + Object youngPerson = CQLTester.userType("height", 58, "age", 9); + ByteBuffer youngPersonBuffer = CQLTester.makeByteBuffer(youngPerson, null); + Object adultPerson = CQLTester.userType("height", 74, "age", 37); + ByteBuffer adultPersonBuffer = CQLTester.makeByteBuffer(adultPerson, null); + + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + " (k, customer) VALUES (0, ?);", ConsistencyLevel.ALL, youngPersonBuffer); + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + " (k, customer) VALUES (1, ?);", ConsistencyLevel.ALL, adultPersonBuffer); + + String update = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k = 1);\n" + + " SELECT row1.customer;\n" + + " UPDATE " + qualifiedAccordTableName + " SET customer.age = row1.customer.age WHERE k = 0;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { adultPersonBuffer }, update); + + String check = "BEGIN TRANSACTION\n" + + " SELECT customer.height, customer.age FROM " + qualifiedAccordTableName + " WHERE k = 0;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { 58, 37 }, check); + } + ); + } + + @Test + public void testMultiCellListElementCondition() throws Exception + { + testListElementCondition("CREATE TABLE " + qualifiedAccordTableName + " (k int PRIMARY KEY, int_list list) WITH transactional_mode='" + transactionalMode + "'"); + } + + @Test + public void testFrozenListElementCondition() throws Exception + { + testListElementCondition("CREATE TABLE " + qualifiedAccordTableName + " (k int PRIMARY KEY, int_list frozen>) WITH transactional_mode='" + transactionalMode + "'"); + } + + private void testListElementCondition(String ddl) throws Exception + { + test(ddl, + cluster -> + { + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + " (k, int_list) VALUES (0, [1, 2]);", ConsistencyLevel.ALL); + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + " (k, int_list) VALUES (1, [3, 4]);", ConsistencyLevel.ALL); + + String update = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k = 1);\n" + + " SELECT row1.int_list;\n" + + " IF row1.int_list[1] = 4 THEN\n" + + " UPDATE " + qualifiedAccordTableName + " SET int_list = [3, 4] WHERE k = 0;\n" + + " END IF\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { ImmutableList.of(3, 4) }, update); + + String check = "BEGIN TRANSACTION\n" + + " SELECT * FROM " + qualifiedAccordTableName + " WHERE k = 0;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { 0, ImmutableList.of(3, 4) }, check); + } + ); + } + + @Test + public void testMultiCellMapElementCondition() throws Exception + { + testMapElementCondition("CREATE TABLE " + qualifiedAccordTableName + " (k int PRIMARY KEY, int_map map) WITH transactional_mode='" + transactionalMode + "'"); + } + + @Test + public void testFrozenMapElementCondition() throws Exception + { + testMapElementCondition("CREATE TABLE " + qualifiedAccordTableName + " (k int PRIMARY KEY, int_map frozen>) WITH transactional_mode='" + transactionalMode + "'"); + } + + private void testMapElementCondition(String ddl) throws Exception + { + test(ddl, + cluster -> + { + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + " (k, int_map) VALUES (0, {'one': 2});", ConsistencyLevel.ALL); + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + " (k, int_map) VALUES (1, {'three': 4});", ConsistencyLevel.ALL); + + String update = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k = 1);\n" + + " SELECT row1.int_map;\n" + + " IF row1.int_map[?] = 4 THEN\n" + + " UPDATE " + qualifiedAccordTableName + " SET int_map = {'three': 4} WHERE k = 0;\n" + + " END IF\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { ImmutableMap.of("three", 4) }, update, "three"); + + String check = "BEGIN TRANSACTION\n" + + " SELECT * FROM " + qualifiedAccordTableName + " WHERE k = 0;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { 0, ImmutableMap.of("three", 4) }, check); + } + ); + } + + @Test + public void testMultiCellUDTFieldCondition() throws Exception + { + testUDTFieldCondition("CREATE TABLE " + qualifiedAccordTableName + " (k int PRIMARY KEY, customer person) WITH transactional_mode='" + transactionalMode + "'"); + } + + @Test + public void testFrozenUDTFieldCondition() throws Exception + { + testUDTFieldCondition("CREATE TABLE " + qualifiedAccordTableName + " (k int PRIMARY KEY, customer frozen) WITH transactional_mode='" + transactionalMode + "'"); + } + + private void testUDTFieldCondition(String tableDDL) throws Exception + { + test(tableDDL, + cluster -> + { + Object initialPersonValue = CQLTester.userType("height", 74, "age", 37); + ByteBuffer initialPersonBuffer = CQLTester.makeByteBuffer(initialPersonValue, null); + + String insert = "BEGIN TRANSACTION\n" + + " INSERT INTO " + qualifiedAccordTableName + " (k, customer) VALUES (?, ?);\n" + + "COMMIT TRANSACTION"; + SimpleQueryResult result = cluster.coordinator(1).executeWithResult(insert, ConsistencyLevel.ANY, 0, initialPersonBuffer); + assertFalse(result.hasNext()); + + String check = "BEGIN TRANSACTION\n" + + " SELECT * FROM " + qualifiedAccordTableName + " WHERE k = ?;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { 0, initialPersonBuffer }, check, 0); + + String update = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k = ?);\n" + + " SELECT row1.customer;\n" + + " IF row1.customer.age = 37 THEN\n" + + " UPDATE " + qualifiedAccordTableName + " SET customer = ? WHERE k = ?;\n" + + " END IF\n" + + "COMMIT TRANSACTION"; + + Object updatedPersonValue = CQLTester.userType("height", 73, "age", 40); + ByteBuffer updatedPersonBuffer = CQLTester.makeByteBuffer(updatedPersonValue, null); + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { initialPersonBuffer }, update, 0, updatedPersonBuffer, 0); + + String checkUpdate = "BEGIN TRANSACTION\n" + + " SELECT * FROM " + qualifiedAccordTableName + " WHERE k = ?;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { 0, updatedPersonBuffer }, checkUpdate, 0); + } + ); + } + + @Test + public void testListSubtraction() throws Exception + { + test("CREATE TABLE " + qualifiedAccordTableName + " (k int PRIMARY KEY, int_list list) WITH transactional_mode='" + transactionalMode + "'", + cluster -> + { + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + " (k, int_list) VALUES (0, [1, 2, 3, 4]);", ConsistencyLevel.ALL); + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + " (k, int_list) VALUES (1, [3, 4]);", ConsistencyLevel.ALL); + + String update = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k = 1);\n" + + " SELECT row1.int_list;\n" + + " IF row1.int_list = [3, 4] THEN\n" + + " UPDATE " + qualifiedAccordTableName + " SET int_list -= row1.int_list WHERE k=0;\n" + + " END IF\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] {Arrays.asList(3, 4)}, update); + + String check = "BEGIN TRANSACTION\n" + + " SELECT * FROM " + qualifiedAccordTableName + " WHERE k = 0;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] {0, Arrays.asList(1, 2)}, check); + } + ); + } + + @Test + public void testSetSubtraction() throws Exception + { + test("CREATE TABLE " + qualifiedAccordTableName + " (k int PRIMARY KEY, int_set set) WITH transactional_mode='" + transactionalMode + "'", + cluster -> + { + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + " (k, int_set) VALUES (0, {1, 2, 3, 4});", ConsistencyLevel.ALL); + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + " (k, int_set) VALUES (1, {3, 4});", ConsistencyLevel.ALL); + + String update = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k = 1);\n" + + " SELECT row1.int_set;\n" + + " IF row1.int_set = {3, 4} THEN\n" + + " UPDATE " + qualifiedAccordTableName + " SET int_set -= row1.int_set WHERE k=0;\n" + + " END IF\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { ImmutableSet.of(3, 4) }, update); + + String check = "BEGIN TRANSACTION\n" + + " SELECT * FROM " + qualifiedAccordTableName + " WHERE k = 0;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { 0, ImmutableSet.of(1, 2) }, check); + } + ); + } + + @Test + public void testMultiCellMapSubtraction() throws Exception + { + testMapSubtraction("CREATE TABLE " + qualifiedAccordTableName + " (k int PRIMARY KEY, int_map map, int_set set) WITH transactional_mode='" + transactionalMode + "'"); + } + + @Test + public void testFrozenMapSubtraction() throws Exception + { + testMapSubtraction("CREATE TABLE " + qualifiedAccordTableName + " (k int PRIMARY KEY, int_map map, int_set frozen>) WITH transactional_mode='" + transactionalMode + "'"); + } + + private void testMapSubtraction(String ddl) throws Exception + { + test(ddl, + cluster -> + { + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + " (k, int_map) VALUES (0, { 'one': 2, 'three': 4 });", ConsistencyLevel.ALL); + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + " (k, int_set) VALUES (1, { 'three' });", ConsistencyLevel.ALL); + + String update = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k = 1);\n" + + " SELECT row1.int_set;\n" + + " IF row1.int_set = { 'three' } THEN\n" + + " UPDATE " + qualifiedAccordTableName + " SET int_map -= row1.int_set WHERE k=0;\n" + + " END IF\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { ImmutableSet.of("three") }, update); + + String check = "BEGIN TRANSACTION\n" + + " SELECT * FROM " + qualifiedAccordTableName + " WHERE k = 0;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { 0, ImmutableMap.of("one", 2), null}, check); + } + ); + } + + @Test + public void testMultiCellListSelection() throws Exception + { + testListSelection("CREATE TABLE " + qualifiedAccordTableName + " (k int PRIMARY KEY, int_list list) WITH transactional_mode='" + transactionalMode + "'"); + } + + @Test + public void testFrozenListSelection() throws Exception + { + testListSelection("CREATE TABLE " + qualifiedAccordTableName + " (k int PRIMARY KEY, int_list frozen>) WITH transactional_mode='" + transactionalMode + "'"); + } + + private void testListSelection(String ddl) throws Exception + { + test(ddl, + cluster -> + { + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + " (k, int_list) VALUES (1, [10, 20, 30, 40]);", ConsistencyLevel.ALL); + + String selectEntireSet = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k = 1);\n" + + " SELECT row1.int_list;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { ImmutableList.of(10, 20, 30, 40) }, selectEntireSet); + + String selectSingleElement = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k = 1);\n" + + " SELECT row1.int_list[0];\n" + + "COMMIT TRANSACTION"; + + SimpleQueryResult result = executeWithRetry(cluster, selectSingleElement); + // TODO: Improve user frieldliness of the hex key name here... + Assertions.assertThat(result.names()).contains("row1.int_list[0x00000000]"); + Assertions.assertThat(result.toObjectArrays()).isEqualTo(new Object[] { new Object[] { 10 } }); + } + ); + } + + @Test + public void testMultiCellSetSelection() throws Exception + { + testSetSelection("CREATE TABLE " + qualifiedAccordTableName + " (k int PRIMARY KEY, int_set set) WITH transactional_mode='" + transactionalMode + "'"); + } + + @Test + public void testFrozenSetSelection() throws Exception + { + testSetSelection("CREATE TABLE " + qualifiedAccordTableName + " (k int PRIMARY KEY, int_set frozen>) WITH transactional_mode='" + transactionalMode + "'"); + } + + private void testSetSelection(String ddl) throws Exception + { + test(ddl, + cluster -> + { + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + " (k, int_set) VALUES (1, {10, 20, 30, 40});", ConsistencyLevel.ALL); + + String selectEntireSet = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k = 1);\n" + + " SELECT row1.int_set;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { ImmutableSet.of(10, 20, 30, 40) }, selectEntireSet); + + String selectSingleElement = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k = 1);\n" + + " SELECT row1.int_set[10];\n" + + "COMMIT TRANSACTION"; + + SimpleQueryResult result = executeWithRetry(cluster, selectSingleElement); + // TODO: Improve user frieldliness of the hex key name here... + Assertions.assertThat(result.names()).contains("row1.int_set[0x0000000a]"); + Assertions.assertThat(result.toObjectArrays()).isEqualTo(new Object[] { new Object[] { 10 } }); + } + ); + } + + @Test + public void testMultiCellMapSelection() throws Exception + { + testMapSelection("CREATE TABLE " + qualifiedAccordTableName + " (k int PRIMARY KEY, int_map map) WITH transactional_mode='" + transactionalMode + "'"); + } + + @Test + public void testFrozenMapSelection() throws Exception + { + testMapSelection("CREATE TABLE " + qualifiedAccordTableName + " (k int PRIMARY KEY, int_map frozen>) WITH transactional_mode='" + transactionalMode + "'"); + } + + private void testMapSelection(String ddl) throws Exception + { + test(ddl, + cluster -> + { + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + " (k, int_map) VALUES (1, { 'ten': 20, 'thirty': 40 });", ConsistencyLevel.ALL); + + String selectEntireMap = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k = 1);\n" + + " SELECT row1.int_map;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { ImmutableMap.of("ten", 20, "thirty", 40) }, selectEntireMap); + + String selectSingleElement = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k = 1);\n" + + " SELECT row1.int_map['ten'];\n" + + "COMMIT TRANSACTION"; + + SimpleQueryResult result = executeWithRetry(cluster, selectSingleElement); + Assertions.assertThat(result.names()).contains("row1.int_map[" + Bytes.toHexString("ten".getBytes()) + ']'); + Assertions.assertThat(result.toObjectArrays()).isEqualTo(new Object[] { new Object[] { 20 } }); + } + ); + } + + @Test + public void testScalarUpdateSubstitution() + { + String KEYSPACE = "ks" + System.currentTimeMillis(); + SHARED_CLUSTER.schemaChange("CREATE KEYSPACE " + KEYSPACE + " WITH REPLICATION={'class':'SimpleStrategy', 'replication_factor': 2}"); + SHARED_CLUSTER.schemaChange("CREATE TABLE " + qualifiedAccordTableName + "1 (k int, c int, v int, primary key (k, c)) WITH transactional_mode='" + transactionalMode + "'"); + SHARED_CLUSTER.schemaChange("CREATE TABLE " + qualifiedAccordTableName + "2 (k int, c int, v int, primary key (k, c)) WITH transactional_mode='" + transactionalMode + "'"); + SHARED_CLUSTER.forEach(node -> node.runOnInstance(() -> AccordService.instance().setCacheSize(0))); + SHARED_CLUSTER.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + "1 (k, c, v) VALUES (1, 2, 3);", ConsistencyLevel.ALL); + SHARED_CLUSTER.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + "2 (k, c, v) VALUES (2, 2, 4);", ConsistencyLevel.ALL); + + String query = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT * FROM " + qualifiedAccordTableName + "1 WHERE k=1 AND c=2);\n" + + " LET row2 = (SELECT * FROM " + qualifiedAccordTableName + "2 WHERE k=2 AND c=2);\n" + + " SELECT v FROM " + qualifiedAccordTableName + "1 WHERE k=1 AND c=2;\n" + + " IF row1.v = 3 AND row2.v = 4 THEN\n" + + " UPDATE " + qualifiedAccordTableName + "1 SET v = row2.v WHERE k=1 AND c=2;\n" + + " END IF\n" + + "COMMIT TRANSACTION"; + + Object[][] result = SHARED_CLUSTER.coordinator(1).execute(query, ConsistencyLevel.ANY); + assertEquals(3, result[0][0]); + + String check = "BEGIN TRANSACTION\n" + + " SELECT * FROM " + qualifiedAccordTableName + "1 WHERE k=1 AND c=2;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(SHARED_CLUSTER, new Object[]{1, 2, 4}, check); + } + + @Test + public void testRegularScalarInsertSubstitution() throws Exception + { + testScalarInsertSubstitution("CREATE TABLE " + qualifiedAccordTableName + " (k int, c int, v int, PRIMARY KEY (k, c)) WITH transactional_mode='" + transactionalMode + "'"); + } + + @Test + public void testStaticScalarInsertSubstitution() throws Exception + { + testScalarInsertSubstitution("CREATE TABLE " + qualifiedAccordTableName + " (k int, c int, v int static, PRIMARY KEY (k, c)) WITH transactional_mode='" + transactionalMode + "'"); + } + + private void testScalarInsertSubstitution(String tableDDL) throws Exception + { + test(tableDDL, + cluster -> + { + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + " (k, c, v) VALUES (0, 0, 1);", ConsistencyLevel.ALL); + + String insert = "BEGIN TRANSACTION\n" + + " LET row0 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k = 0 LIMIT 1);\n" + + " SELECT row0.v;\n" + + " IF row0.v IS NOT NULL THEN\n" + + " INSERT INTO " + qualifiedAccordTableName + " (k, c, v) VALUES (0, 1, row0.v);\n" + + " END IF\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { 1 }, insert); + + String check = "BEGIN TRANSACTION\n" + + " SELECT k, c, v FROM " + qualifiedAccordTableName + " WHERE k = 0 AND c = 1;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { 0, 1, 1 }, check); + } + ); + } + + @Test + public void testSelectMultiCellUDTReference() throws Exception + { + testSelectUDTReference("CREATE TABLE " + qualifiedAccordTableName + " (k int PRIMARY KEY, customer person) WITH transactional_mode='" + transactionalMode + "'"); + } + + @Test + public void testSelectFrozenUDTReference() throws Exception + { + testSelectUDTReference("CREATE TABLE " + qualifiedAccordTableName + " (k int PRIMARY KEY, customer frozen) WITH transactional_mode='" + transactionalMode + "'"); + } + + private void testSelectUDTReference(String tableDDL) throws Exception + { + test(tableDDL, + cluster -> + { + Object personValue = CQLTester.userType("height", 74, "age", 37); + ByteBuffer personBuffer = CQLTester.makeByteBuffer(personValue, null); + + String insert = "BEGIN TRANSACTION\n" + + " INSERT INTO " + qualifiedAccordTableName + " (k, customer) VALUES (?, ?);\n" + + "COMMIT TRANSACTION"; + SimpleQueryResult result = cluster.coordinator(1).executeWithResult(insert, ConsistencyLevel.ANY, 0, personBuffer); + assertFalse(result.hasNext()); + + String read = "BEGIN TRANSACTION\n" + + " LET row0 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k = ?);\n" + + " SELECT row0.customer;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { personBuffer }, read, 0); + } + ); + } + + @Test + public void testSelectMultiCellUDTFieldReference() throws Exception + { + testSelectUDTFieldReference("CREATE TABLE " + qualifiedAccordTableName + " (k int PRIMARY KEY, customer person) WITH transactional_mode='" + transactionalMode + "'"); + } + + @Test + public void testSelectFrozenUDTFieldReference() throws Exception + { + testSelectUDTFieldReference("CREATE TABLE " + qualifiedAccordTableName + " (k int PRIMARY KEY, customer frozen) WITH transactional_mode='" + transactionalMode + "'"); + } + + private void testSelectUDTFieldReference(String tableDDL) throws Exception + { + test(tableDDL, + cluster -> + { + Object personValue = CQLTester.userType("height", 74, "age", 37); + ByteBuffer personBuffer = CQLTester.makeByteBuffer(personValue, null); + + String insert = "BEGIN TRANSACTION\n" + + " INSERT INTO " + qualifiedAccordTableName + " (k, customer) VALUES (?, ?);\n" + + "COMMIT TRANSACTION"; + SimpleQueryResult result = cluster.coordinator(1).executeWithResult(insert, ConsistencyLevel.ANY, 0, personBuffer); + assertFalse(result.hasNext()); + + String read = "BEGIN TRANSACTION\n" + + " LET row0 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k = ?);\n" + + " SELECT row0.customer.age;\n" + + "COMMIT TRANSACTION"; + result = assertRowEqualsWithPreemptedRetry(cluster, new Object[] { 37 }, read, 0); + // TODO: Improve user frieldliness of the field name here... + assertEquals(ImmutableList.of("row0.customer.0x0001"), result.names()); + } + ); + } + + @Test + public void testMultiKeyQueryAndInsert() throws Throwable + { + test("CREATE TABLE " + qualifiedAccordTableName + " (k int, c int, v int, primary key (k, c)) WITH transactional_mode='" + transactionalMode + "'", + cluster -> + { + String query1 = "BEGIN TRANSACTION\n" + + " LET select1 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k=0 AND c=0);\n" + + " LET select2 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k=1 AND c=0);\n" + + " SELECT v FROM " + qualifiedAccordTableName + " WHERE k=0 AND c=0;\n" + + " IF select1 IS NULL THEN\n" + + " INSERT INTO " + qualifiedAccordTableName + " (k, c, v) VALUES (0, 0, 0);\n" + + " INSERT INTO " + qualifiedAccordTableName + " (k, c, v) VALUES (1, 0, 0);\n" + + " END IF\n" + + "COMMIT TRANSACTION"; + assertEmptyWithPreemptedRetry(cluster, query1); + + String check = "BEGIN TRANSACTION\n" + + " SELECT * FROM " + qualifiedAccordTableName + " WHERE k = ? AND c = ?;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] {0, 0, 0}, check, 0, 0); + assertRowEqualsWithPreemptedRetry(cluster, new Object[] {1, 0, 0}, check, 1, 0); + + String query2 = "BEGIN TRANSACTION\n" + + " LET select1 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k=1 AND c=0);\n" + + " LET select2 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k=2 AND c=0);\n" + + " SELECT v FROM " + qualifiedAccordTableName + " WHERE k=1 AND c=0;\n" + + " IF select1.v = ? THEN\n" + + " INSERT INTO " + qualifiedAccordTableName + " (k, c, v) VALUES (1, 0, 1);\n" + + " INSERT INTO " + qualifiedAccordTableName + " (k, c, v) VALUES (2, 0, 1);\n" + + " END IF\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { 0 }, query2, 0); + + assertRowEqualsWithPreemptedRetry(cluster, new Object[] {0, 0, 0}, check, 0, 0); + assertRowEqualsWithPreemptedRetry(cluster, new Object[] {1, 0, 1}, check, 1, 0); + assertRowEqualsWithPreemptedRetry(cluster, new Object[] {2, 0, 1}, check, 2, 0); + }); + } + + @Test + public void demoTest() throws Throwable + { + SHARED_CLUSTER.schemaChange("DROP KEYSPACE IF EXISTS demo_ks;"); + SHARED_CLUSTER.schemaChange("CREATE KEYSPACE demo_ks WITH REPLICATION={'class':'SimpleStrategy', 'replication_factor':2};"); + SHARED_CLUSTER.schemaChange("CREATE TABLE demo_ks.org_docs ( org_name text, doc_id int, contents_version int static, title text, permissions int, PRIMARY KEY (org_name, doc_id) ) WITH transactional_mode='" + transactionalMode + "';"); + SHARED_CLUSTER.schemaChange("CREATE TABLE demo_ks.org_users ( org_name text, user text, members_version int static, permissions int, PRIMARY KEY (org_name, user) ) WITH transactional_mode='" + transactionalMode + "';"); + SHARED_CLUSTER.schemaChange("CREATE TABLE demo_ks.user_docs ( user text, doc_id int, title text, org_name text, permissions int, PRIMARY KEY (user, doc_id) ) WITH transactional_mode='" + transactionalMode + "';"); + + SHARED_CLUSTER.forEach(node -> node.runOnInstance(() -> AccordService.instance().setCacheSize(0))); + + SHARED_CLUSTER.coordinator(1).execute("INSERT INTO demo_ks.org_users (org_name, user, members_version, permissions) VALUES ('demo', 'blake', 5, 777);\n", ConsistencyLevel.ALL); + SHARED_CLUSTER.coordinator(1).execute("INSERT INTO demo_ks.org_users (org_name, user, members_version, permissions) VALUES ('demo', 'scott', 5, 777);\n", ConsistencyLevel.ALL); + SHARED_CLUSTER.coordinator(1).execute("INSERT INTO demo_ks.org_docs (org_name, doc_id, contents_version, title, permissions) VALUES ('demo', 100, 5, 'README', 644);\n", ConsistencyLevel.ALL); + SHARED_CLUSTER.coordinator(1).execute("INSERT INTO demo_ks.user_docs (user, doc_id, title, org_name, permissions) VALUES ('blake', 1, 'recipes', NULL, 777);\n", ConsistencyLevel.ALL); + SHARED_CLUSTER.coordinator(1).execute("INSERT INTO demo_ks.user_docs (user, doc_id, title, org_name, permissions) VALUES ('blake', 100, 'README', 'demo', 644);\n", ConsistencyLevel.ALL); + SHARED_CLUSTER.coordinator(1).execute("INSERT INTO demo_ks.user_docs (user, doc_id, title, org_name, permissions) VALUES ('scott', 2, 'to do list', NULL, 777);\n", ConsistencyLevel.ALL); + SHARED_CLUSTER.coordinator(1).execute("INSERT INTO demo_ks.user_docs (user, doc_id, title, org_name, permissions) VALUES ('scott', 100, 'README', 'demo', 644);\n", ConsistencyLevel.ALL); + + String addDoc = "BEGIN TRANSACTION\n" + + " LET demo_user = (SELECT * FROM demo_ks.org_users WHERE org_name='demo' LIMIT 1);\n" + + " LET existing = (SELECT * FROM demo_ks.org_docs WHERE org_name='demo' AND doc_id=101);\n" + + " SELECT members_version FROM demo_ks.org_users WHERE org_name='demo' LIMIT 1;\n" + + " IF demo_user.members_version = 5 AND existing IS NULL THEN\n" + + " UPDATE demo_ks.org_docs SET title='slides.key', permissions=777, contents_version = 6 WHERE org_name='demo' AND doc_id=101;\n" + + " UPDATE demo_ks.user_docs SET title='slides.key', permissions=777 WHERE user='blake' AND doc_id=101;\n" + + " UPDATE demo_ks.user_docs SET title='slides.key', permissions=777 WHERE user='scott' AND doc_id=101;\n" + + " END IF\n" + + "COMMIT TRANSACTION"; + assertRowEquals(SHARED_CLUSTER, new Object[] { 5 }, addDoc); + + String addUser = "BEGIN TRANSACTION\n" + + " LET demo_doc = (SELECT * FROM demo_ks.org_docs WHERE org_name='demo' LIMIT 1);\n" + + " LET existing = (SELECT * FROM demo_ks.org_users WHERE org_name='demo' AND user='benedict');\n" + + " SELECT contents_version FROM demo_ks.org_docs WHERE org_name='demo' LIMIT 1;\n" + + " IF demo_doc.contents_version = 6 AND existing IS NULL THEN\n" + + " UPDATE demo_ks.org_users SET permissions=777, members_version += 1 WHERE org_name='demo' AND user='benedict';\n" + + " UPDATE demo_ks.user_docs SET title='README', permissions=644 WHERE user='benedict' AND doc_id=100;\n" + + " UPDATE demo_ks.user_docs SET title='slides.key', permissions=777 WHERE user='benedict' AND doc_id=101;\n" + + " END IF\n" + + "COMMIT TRANSACTION"; + assertRowEquals(SHARED_CLUSTER, new Object[] { 6 }, addUser); + } + + // TODO: Implement support for basic arithmetic on references in INSERT + @Ignore + @Test + public void testReferenceArithmeticInInsert() throws Exception + { + test(cluster -> { + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + " (k, c, v) VALUES (0, 0, 0)", ConsistencyLevel.ALL); + + String cql = "BEGIN TRANSACTION\n" + + " LET a = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k=0 AND c=0);\n" + + " IF a IS NOT NULL THEN\n" + + " INSERT INTO " + qualifiedAccordTableName + " (k, c, v) VALUES (0, 1, a.v + 1);\n" + + " END IF\n" + + "COMMIT TRANSACTION"; + assertEmptyWithPreemptedRetry(cluster, cql); + }); + } + + // TODO: Implement support for basic arithmetic on references in UPDATE + @Ignore + @Test + public void testReferenceArithmeticInUpdate() throws Exception + { + test(cluster -> { + cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + " (k, c, v) VALUES (0, 0, 0)", ConsistencyLevel.ALL); + + String cql = "BEGIN TRANSACTION\n" + + " LET a = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k=0 AND c=0);\n" + + " IF a IS NOT NULL THEN\n" + + " UPDATE " + qualifiedAccordTableName + " SET v = a.v + 1 WHERE k = 0 and c = 1;\n" + + " END IF\n" + + "COMMIT TRANSACTION"; + assertEmptyWithPreemptedRetry(cluster, cql); + }); + } + + @Test + public void testCASAndSerialRead() throws Exception + { + test("CREATE TABLE " + qualifiedAccordTableName + " (id int, c int, v int, s int static, PRIMARY KEY ((id), c)) WITH transactional_mode='" + transactionalMode + "';", + cluster -> { + ICoordinator coordinator = cluster.coordinator(1); + int startingAccordCoordinateCount = getAccordCoordinateCount(); + assertRowEquals(cluster, new Object[]{false}, "UPDATE " + qualifiedAccordTableName + " SET v = 4 WHERE id = 1 AND c = 2 IF EXISTS"); + assertRowEquals(cluster, new Object[]{false}, "UPDATE " + qualifiedAccordTableName + " SET v = 4 WHERE id = 1 AND c = 2 IF v = 3"); + coordinator.execute("INSERT INTO " + qualifiedAccordTableName + " (id, c, v, s) VALUES (1, 2, 3, 5);", ConsistencyLevel.ALL); + assertRowSerial(cluster, "SELECT id, c, v, s FROM " + qualifiedAccordTableName + " WHERE id = 1 AND c = 2", 1, 2, 3, 5); + assertRowEquals(cluster, new Object[]{true}, "UPDATE " + qualifiedAccordTableName + " SET v = 4 WHERE id = 1 AND c = 2 IF v = 3"); + assertRowSerial(cluster, "SELECT id, c, v, s FROM " + qualifiedAccordTableName + " WHERE id = 1 AND c = 2", 1, 2, 4, 5); + assertRowEquals(cluster, new Object[]{ false, 4 }, "UPDATE " + qualifiedAccordTableName + " SET v = 4 WHERE id = 1 AND c = 2 IF v = 3"); + assertRowSerial(cluster, "SELECT id, c, v, s FROM " + qualifiedAccordTableName + " WHERE id = 1 AND c = 2", 1, 2, 4, 5); + + // Test working with a static column + assertRowEquals(cluster, new Object[]{ false, 5 }, "UPDATE " + qualifiedAccordTableName + " SET v = 5 WHERE id = 1 AND c = 2 IF s = 4"); + assertRowSerial(cluster, "SELECT id, c, v, s FROM " + qualifiedAccordTableName + " WHERE id = 1 AND c = 2", 1, 2, 4, 5); + assertRowEquals(cluster, new Object[]{true}, "UPDATE " + qualifiedAccordTableName + " SET v = 5 WHERE id = 1 AND c = 2 IF s = 5"); + assertRowSerial(cluster, "SELECT id, c, v, s FROM " + qualifiedAccordTableName + " WHERE id = 1 AND c = 2", 1, 2, 5, 5); + assertRowEquals(cluster, new Object[]{true}, "UPDATE " + qualifiedAccordTableName + " SET s = 6 WHERE id = 1 IF s = 5"); + assertRowSerial(cluster, "SELECT id, c, v, s FROM " + qualifiedAccordTableName + " WHERE id = 1 AND c = 2", 1, 2, 5, 6); + + // Test that read before write works with CAS + assertRowEquals(cluster, new Object[]{true}, "UPDATE " + qualifiedAccordTableName + " SET s +=1, v += 1 WHERE id = 1 AND c = 2 IF EXISTS"); + assertRowSerial(cluster, "SELECT id, c, v, s FROM " + qualifiedAccordTableName + " WHERE id = 1 AND c = 2", 1, 2, 6, 7); + + // Check range deletion works + coordinator.execute("INSERT INTO " + qualifiedAccordTableName + " (id, c, v, s) VALUES (1, 2, 6, 7);", ConsistencyLevel.ALL); + coordinator.execute("INSERT INTO " + qualifiedAccordTableName + " (id, c, v) VALUES (1, 3, 3);", ConsistencyLevel.ALL); + assertRowEquals(cluster, new Object[]{true}, "BEGIN BATCH \n" + + "UPDATE " + qualifiedAccordTableName + " SET s +=1, v += 1 WHERE id = 1 AND c = 2 IF EXISTS; \n" + + "DELETE FROM " + qualifiedAccordTableName + " WHERE id = 1 AND c > 0 AND c < 10; \n" + + "APPLY BATCH;"); + Object[][] rangeDeletionCheck = coordinator.execute("SELECT id, c, v, s FROM " + qualifiedAccordTableName + " WHERE id = 1", ConsistencyLevel.SERIAL); + assertArrayEquals(new Object[] { 1, 2, 7, 8 }, rangeDeletionCheck[0]); + assertEquals(1, rangeDeletionCheck.length); + + // Make sure all the consensus using queries actually were run on Accord + if (transactionalMode.nonSerialWritesThroughAccord) + assertEquals( 20, getAccordCoordinateCount() - startingAccordCoordinateCount); + else + // Non-serial writes don't go through Accord in these modes + assertEquals( 17, getAccordCoordinateCount() - startingAccordCoordinateCount); + }); + } + + // Reproduces some bugs that simulator finds + @Test + public void testCASSimulatorLite() throws Exception + { + test("CREATE TABLE " + qualifiedAccordTableName + " (pk int, count int, seq1 text, seq2 list, PRIMARY KEY (pk)) WITH transactional_mode='" + transactionalMode + "'", + cluster -> { + ICoordinator coordinator = cluster.coordinator(1); + coordinator.execute("INSERT INTO " + qualifiedAccordTableName + " (pk, count, seq1, seq2) VALUES (1, 0, '', []) USING TIMESTAMP 0", ConsistencyLevel.ALL); + + ListType LIST_TYPE = ListType.getInstance(Int32Type.instance, true); + ExecutorService es = Executors.newCachedThreadPool(); + List> futures = new ArrayList<>(); + for (int ii = 0; ii < 10; ii++) + { + int id = ii; + futures.add(es.submit(() -> coordinator.execute("UPDATE " + qualifiedAccordTableName + " SET count = count + 1, seq1 = seq1 + ?, seq2 = seq2 + ? WHERE pk = ? IF EXISTS", ConsistencyLevel.ALL, id + ",", ByteBufferUtil.getArray(LIST_TYPE.decompose(singletonList(id))), 1))); + } + for (Future f : futures) + f.get(); + + Object[][] result = coordinator.execute("SELECT pk, count, seq1, seq2 FROM " + qualifiedAccordTableName + " WHERE pk = 1", ConsistencyLevel.SERIAL); + + int[] seq1 = Arrays.stream(((String) result[0][2]).split(",")) + .filter(s -> !s.isEmpty()) + .mapToInt(Integer::parseInt) + .toArray(); + int[] seq2 = ((ArrayList) result[0][3]).stream().mapToInt(x -> x).toArray(); + logger.info("String append of ids executed {}", Arrays.toString(seq1)); + logger.info("List append of ids executed {}", Arrays.toString(seq2)); + assertArrayEquals("History doesn't match between the two columns", seq1, seq2); + }); + } + + @Test + public void testTransactionCasSimulatorLite() throws Exception + { + test("CREATE TABLE " + qualifiedAccordTableName + " (pk int, count int, seq1 text, seq2 list, PRIMARY KEY (pk)) WITH transactional_mode='" + transactionalMode + "'", + cluster -> + { + ICoordinator coordinator = cluster.coordinator(1); + coordinator.execute("INSERT INTO " + qualifiedAccordTableName + " (pk, count, seq1, seq2) VALUES (1, 0, '', []) USING TIMESTAMP 0", ConsistencyLevel.ALL); + + ListType LIST_TYPE = ListType.getInstance(Int32Type.instance, true); + ExecutorService es = Executors.newCachedThreadPool(); + List> futures = new ArrayList<>(); + for (int ii = 0; ii < 10; ii++) + { + int id = ii; + String update = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE pk = 1);\n" + + " UPDATE " + qualifiedAccordTableName + " SET count += 1, seq1 = seq1 + ?, seq2 = seq2 + ? WHERE pk=1;\n" + + "COMMIT TRANSACTION"; + futures.add(es.submit(() -> coordinator.executeWithResult(update, ConsistencyLevel.ANY, id + ",", ByteBufferUtil.getArray(LIST_TYPE.decompose(singletonList(id)))))); + } + for (Future f : futures) + f.get(); + + String check = "BEGIN TRANSACTION\n" + + " SELECT * FROM " + qualifiedAccordTableName + " WHERE pk = 1;\n" + + "COMMIT TRANSACTION"; + Object[][] result = coordinator.execute(check, ConsistencyLevel.ALL); + + int[] seq1 = Arrays.stream(((String) result[0][2]).split(",")) + .filter(s -> !s.isEmpty()) + .mapToInt(Integer::parseInt) + .toArray(); + int[] seq2 = ((ArrayList) result[0][3]).stream().mapToInt(x -> x).toArray(); + logger.info("String append of ids executed {}", Arrays.toString(seq1)); + logger.info("List append of ids executed {}", Arrays.toString(seq2)); + assertArrayEquals("History doesn't match between the two columns", seq1, seq2); + } + ); + } + + @Test + public void testSerialReadDescending() throws Throwable + { + test("CREATE TABLE " + qualifiedAccordTableName + " (k int, c int, v int, PRIMARY KEY(k, c)) WITH transactional_mode='" + transactionalMode + "'", + cluster -> { + ICoordinator coordinator = cluster.coordinator(1); + for (int i = 1; i <= 10; i++) + coordinator.execute("INSERT INTO " + qualifiedAccordTableName + " (k, c, v) VALUES (0, ?, ?) USING TIMESTAMP 0;", ConsistencyLevel.ALL, i, i * 10); + assertRowSerial(cluster, "SELECT c, v FROM " + qualifiedAccordTableName + " WHERE k=0 ORDER BY c DESC LIMIT 1", AssertUtils.row(10, 100)); + assertRowSerial(cluster, "SELECT c, v FROM " + qualifiedAccordTableName + " WHERE k=0 ORDER BY c DESC LIMIT 2", AssertUtils.row(10, 100), AssertUtils.row(9, 90)); + assertRowSerial(cluster, "SELECT c, v FROM " + qualifiedAccordTableName + " WHERE k=0 ORDER BY c DESC LIMIT 3", AssertUtils.row(10, 100), AssertUtils.row(9, 90), AssertUtils.row(8, 80)); + assertRowSerial(cluster, "SELECT c, v FROM " + qualifiedAccordTableName + " WHERE k=0 ORDER BY c DESC LIMIT 4", AssertUtils.row(10, 100), AssertUtils.row(9, 90), AssertUtils.row(8, 80), AssertUtils.row(7, 70)); + } + ); + } +} diff --git a/test/distributed/org/apache/cassandra/distributed/test/accord/AccordDropKeyspaceTest.java b/test/distributed/org/apache/cassandra/distributed/test/accord/AccordDropKeyspaceTest.java new file mode 100644 index 000000000000..26392586c0ec --- /dev/null +++ b/test/distributed/org/apache/cassandra/distributed/test/accord/AccordDropKeyspaceTest.java @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.distributed.test.accord; + +import java.io.IOException; + +import org.junit.Test; + +import org.apache.cassandra.distributed.Cluster; +import org.apache.cassandra.schema.TableId; + +import static org.apache.cassandra.distributed.api.Feature.GOSSIP; +import static org.apache.cassandra.distributed.api.Feature.NATIVE_PROTOCOL; +import static org.apache.cassandra.distributed.api.Feature.NETWORK; + +public class AccordDropKeyspaceTest extends AccordDropTableBase +{ + @Test + public void dropKeyspace() throws IOException + { + int examples = 5; + int steps = 5; + try (Cluster cluster = Cluster.build(3) + .withoutVNodes() + .withConfig(c -> c.with(GOSSIP, NETWORK, NATIVE_PROTOCOL) + .set("auto_snapshot", false) + .set("accord.shard_durability_target_splits", 4) + ).start()) + { + fixDistributedSchemas(cluster); + for (int i = 0; i < examples; i++) + { + int j = 0; + try + { + addChaos(cluster, i); + init(cluster); + TableId id = createTable(cluster); + for (j = 0; j < steps; j++) + doTxn(cluster, j); + dropKeyspace(cluster); + validateAccord(cluster, id); + } + catch (Throwable t) + { + throw new AssertionError("Error at example " + i + ", " + j, t); + } + } + } + } +} diff --git a/test/distributed/org/apache/cassandra/distributed/test/accord/AccordDropTableBase.java b/test/distributed/org/apache/cassandra/distributed/test/accord/AccordDropTableBase.java new file mode 100644 index 000000000000..9b67dc4453c9 --- /dev/null +++ b/test/distributed/org/apache/cassandra/distributed/test/accord/AccordDropTableBase.java @@ -0,0 +1,159 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.distributed.test.accord; + +import java.util.UUID; + +import com.google.common.base.Throwables; + +import accord.api.RoutingKey; +import accord.local.CommandStores; +import accord.local.KeyHistory; +import accord.local.PreLoadContext; +import accord.local.cfk.CommandsForKey; +import accord.primitives.Ranges; +import accord.primitives.TxnId; +import accord.utils.async.AsyncChains; +import org.apache.cassandra.distributed.Cluster; +import org.apache.cassandra.distributed.api.ConsistencyLevel; +import org.apache.cassandra.distributed.api.ICoordinator; +import org.apache.cassandra.distributed.api.IInvokableInstance; +import org.apache.cassandra.distributed.shared.ClusterUtils; +import org.apache.cassandra.distributed.test.TestBaseImpl; +import org.apache.cassandra.net.Verb; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.service.accord.AccordCommandStore; +import org.apache.cassandra.service.accord.AccordSafeCommandStore; +import org.apache.cassandra.service.accord.AccordSafeCommandsForKey; +import org.apache.cassandra.service.accord.AccordService; +import org.apache.cassandra.service.accord.TokenRange; +import org.assertj.core.api.Assertions; + +import static org.apache.cassandra.config.DatabaseDescriptor.getPartitioner; + +public class AccordDropTableBase extends TestBaseImpl +{ + protected static void addChaos(Cluster cluster, int example) + { + cluster.filters().reset(); + cluster.filters().verbs(Verb.ACCORD_APPLY_REQ.id).from(1).to(3).drop(); + } + + protected static void doTxn(Cluster cluster, int step) + { + int stepId = step % 3; + int partitionId = step % 10; + int coordinatorId = (step % 2) + 1; // avoid node3 as it can't get applies from node1, so leads to user errors + ICoordinator coordinator = cluster.coordinator(coordinatorId); + switch (stepId) + { + case 0: // insert + retry(3, () -> coordinator.executeWithResult(wrapInTxn(withKeyspace("INSERT INTO %s.tbl(pk, v) VALUES (?, ?);")), ConsistencyLevel.ANY, partitionId, step)); + break; + case 1: // insert + read + retry(3, () -> coordinator.executeWithResult(wrapInTxn(withKeyspace("UPDATE %s.tbl SET v+=1 WHERE pk=?;")), ConsistencyLevel.ANY, partitionId)); + break; + case 2: // read + retry(3, () -> coordinator.executeWithResult(wrapInTxn(withKeyspace("SELECT * FROM %s.tbl WHERE pk=?")), ConsistencyLevel.ANY, partitionId)); + break; + default: + throw new UnsupportedOperationException(); + } + } + + protected static void retry(int maxAttempts, Runnable fn) + { + for (int i = 0; i < maxAttempts; i++) + { + try + { + fn.run(); + } + catch (Throwable t) + { + if (i == (maxAttempts - 1)) + throw t; + } + } + } + + protected static TableId createTable(Cluster cluster) + { + cluster.schemaChange(withKeyspace("CREATE TABLE %s.tbl(pk int PRIMARY KEY, v int) WITH transactional_mode='full'")); + return ClusterUtils.tableId(cluster, KEYSPACE, "tbl"); + } + + protected void dropKeyspace(Cluster cluster) + { + // drop keyspace should be rejected as there is an accord table... so validate that is true then do both + try + { + cluster.schemaChange(withKeyspace("DROP KEYSPACE %s")); + } + catch (Throwable t) + { + Assertions.assertThat(Throwables.getRootCause(t)) + .hasMessage("Cannot drop keyspace 'distributed_test_keyspace' as it contains accord tables. (distributed_test_keyspace.tbl)"); + } + + // now do it for real + dropTable(cluster); + cluster.schemaChange(withKeyspace("DROP KEYSPACE %s")); + } + + protected static void dropTable(Cluster cluster) + { + cluster.schemaChange(withKeyspace("DROP TABLE %s.tbl")); + } + + protected static void validateAccord(Cluster cluster, TableId id) + { + String s = id.toString(); + for (IInvokableInstance inst : cluster) + { + inst.runOnInstance(() -> { + TableId tableId = TableId.fromUUID(UUID.fromString(s)); + AccordService accord = (AccordService) AccordService.instance(); + PreLoadContext ctx = PreLoadContext.contextFor(Ranges.single(TokenRange.fullRange(tableId, getPartitioner())), KeyHistory.SYNC); + CommandStores stores = accord.node().commandStores(); + for (int storeId : stores.ids()) + { + AccordCommandStore store = (AccordCommandStore) stores.forId(storeId); + AsyncChains.getUnchecked(store.submit(ctx, input -> { + AccordSafeCommandStore safe = (AccordSafeCommandStore) input; + for (RoutingKey key : safe.commandsForKeysKeys()) + { + AccordSafeCommandsForKey safeCFK = (AccordSafeCommandsForKey) safe.ifLoadedAndInitialised(key); + if (safeCFK == null) // we read and found a key, but its null at load time... so ignore it + continue; + CommandsForKey cfk = safeCFK.current(); + CommandsForKey.TxnInfo minUndecided = cfk.minUndecided(); + if (minUndecided != null) + throw new AssertionError("Undecided txn: " + minUndecided); + TxnId next = cfk.nextWaitingToApply(); + if (next != null) + throw new AssertionError("Unapplied txn: " + next); + } + return null; + })); + } + }); + } + } +} diff --git a/test/distributed/org/apache/cassandra/distributed/test/accord/AccordDropTableTest.java b/test/distributed/org/apache/cassandra/distributed/test/accord/AccordDropTableTest.java new file mode 100644 index 000000000000..15455978b4b2 --- /dev/null +++ b/test/distributed/org/apache/cassandra/distributed/test/accord/AccordDropTableTest.java @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.distributed.test.accord; + +import java.io.IOException; + +import org.junit.Test; + +import org.apache.cassandra.distributed.Cluster; +import org.apache.cassandra.schema.TableId; + +import static org.apache.cassandra.distributed.api.Feature.GOSSIP; +import static org.apache.cassandra.distributed.api.Feature.NATIVE_PROTOCOL; +import static org.apache.cassandra.distributed.api.Feature.NETWORK; + +public class AccordDropTableTest extends AccordDropTableBase +{ + @Test + public void dropTable() throws IOException + { + int examples = 5; + int steps = 5; + try (Cluster cluster = Cluster.build(3) + .withoutVNodes() + .withConfig(c -> c.with(GOSSIP, NETWORK, NATIVE_PROTOCOL) + .set("accord.shard_durability_target_splits", 4) + .set("auto_snapshot", false)) + .start()) + { + fixDistributedSchemas(cluster); + init(cluster); + for (int i = 0; i < examples; i++) + { + int j = 0; + try + { + addChaos(cluster, i); + TableId id = createTable(cluster); + for (j = 0; j < steps; j++) + doTxn(cluster, j); + dropTable(cluster); + validateAccord(cluster, id); + } + catch (Throwable t) + { + throw new AssertionError("Error at example " + i + ", " + j, t); + } + } + } + } +} diff --git a/test/distributed/org/apache/cassandra/distributed/test/accord/AccordFeatureFlagTest.java b/test/distributed/org/apache/cassandra/distributed/test/accord/AccordFeatureFlagTest.java new file mode 100644 index 000000000000..133d2659a8f8 --- /dev/null +++ b/test/distributed/org/apache/cassandra/distributed/test/accord/AccordFeatureFlagTest.java @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.distributed.test.accord; + +import java.io.IOException; + +import org.junit.Test; + +import org.apache.cassandra.distributed.Cluster; +import org.apache.cassandra.distributed.api.Feature; +import org.apache.cassandra.distributed.test.TestBaseImpl; +import org.apache.cassandra.exceptions.InvalidRequestException; +import org.apache.cassandra.utils.AssertionUtils; +import org.assertj.core.api.Assertions; + +public class AccordFeatureFlagTest extends TestBaseImpl +{ + @Test + public void shouldHideAccordTransactions() throws IOException + { + try (Cluster cluster = init(Cluster.build(1) + .withoutVNodes() + .withConfig(c -> c.with(Feature.NETWORK).set("accord.enabled", "false")) + .start())) + { + Assertions.assertThatThrownBy(() -> cluster.schemaChange("CREATE TABLE " + KEYSPACE + ".tbl (k int, c int, v int, primary key (k, c)) WITH transactional_mode='full'")) + .has(AssertionUtils.isThrowableInstanceof(InvalidRequestException.class)) + .hasMessageContaining("accord.enabled"); + } + } +} diff --git a/test/distributed/org/apache/cassandra/distributed/test/accord/AccordHostReplacementTest.java b/test/distributed/org/apache/cassandra/distributed/test/accord/AccordHostReplacementTest.java new file mode 100644 index 000000000000..3d94fe226db3 --- /dev/null +++ b/test/distributed/org/apache/cassandra/distributed/test/accord/AccordHostReplacementTest.java @@ -0,0 +1,105 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.distributed.test.accord; + +import java.io.IOException; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import org.junit.Test; + +import org.apache.cassandra.distributed.Cluster; +import org.apache.cassandra.distributed.api.ConsistencyLevel; +import org.apache.cassandra.distributed.api.Feature; +import org.apache.cassandra.distributed.api.TokenSupplier; +import org.apache.cassandra.distributed.shared.ClusterUtils; +import org.apache.cassandra.distributed.test.TestBaseImpl; +import org.apache.cassandra.harry.SchemaSpec; +import org.apache.cassandra.harry.dsl.HistoryBuilder; +import org.apache.cassandra.harry.dsl.ReplayingHistoryBuilder; +import org.apache.cassandra.harry.execution.RingAwareInJvmDTestVisitExecutor; +import org.apache.cassandra.harry.gen.Generator; +import org.apache.cassandra.harry.gen.Generators; +import org.apache.cassandra.harry.gen.SchemaGenerators; +import org.apache.cassandra.harry.model.TokenPlacementModel; +import org.apache.cassandra.service.consensus.TransactionalMode; + +import static org.apache.cassandra.distributed.shared.ClusterUtils.stopUnchecked; +import static org.apache.cassandra.distributed.shared.ClusterUtils.waitForCMSToQuiesce; +import static org.apache.cassandra.harry.checker.TestHelper.withRandom; + +public class AccordHostReplacementTest extends TestBaseImpl +{ + private static final Generator transactionalModeGen = Generators.pick(Stream.of(TransactionalMode.values()).filter(t -> t.accordIsEnabled).collect(Collectors.toList())); + + @Test + public void hostReplace() throws IOException + { + // start 3 node cluster, then do a host replacement of one of the nodes + Cluster.Builder clusterBuilder = Cluster.build(3) + .withConfig(c -> c.with(Feature.values()) + .set("accord.command_store_shard_count", "1") + .set("write_request_timeout", "10s") + .set("read_request_timeout", "10s") + .set("accord.queue_shard_count", "1") + ); + TokenSupplier tokenRing = TokenSupplier.evenlyDistributedTokens(3, clusterBuilder.getTokenCount()); + int nodeToReplace = 2; + clusterBuilder = clusterBuilder.withTokenSupplier((TokenSupplier) node -> tokenRing.tokens(node == 4 ? nodeToReplace : node)); + try (Cluster cluster = clusterBuilder.start()) + { + fixDistributedSchemas(cluster); + init(cluster); + + withRandom(rng -> { + Generator schemaGen = SchemaGenerators.schemaSpecGen(KEYSPACE, "host_replace", 1000, + SchemaSpec.optionsBuilder().withTransactionalMode(transactionalModeGen.generate(rng))); + SchemaSpec schema = schemaGen.generate(rng); + Generators.TrackingGenerator pkGen = Generators.tracking(Generators.int32(0, Math.min(schema.valueGenerators.pkPopulation(), 1000))); + + HistoryBuilder history = historyBuilder(schema, cluster); + waitForCMSToQuiesce(cluster, cluster.get(1)); + + for (int i = 0; i < 1000; i++) + history.insert(pkGen.generate(rng)); + for (int pk : pkGen.generated()) + history.selectPartition(pk); + + history.custom(() -> { + stopUnchecked(cluster.get(nodeToReplace)); + ClusterUtils.replaceHostAndStart(cluster, cluster.get(nodeToReplace)); + }, "Replace"); + + for (int pk : pkGen.generated()) + history.selectPartition(pk); + }); + } + } + + private static HistoryBuilder historyBuilder(SchemaSpec schema, Cluster cluster) + { + HistoryBuilder history = new ReplayingHistoryBuilder(schema.valueGenerators, + hb -> RingAwareInJvmDTestVisitExecutor.builder() + .replicationFactor(new TokenPlacementModel.SimpleReplicationFactor(3)) + .consistencyLevel(ConsistencyLevel.ALL) + .build(schema, hb, cluster)); + history.customThrowing(() -> cluster.schemaChange(schema.compile()), "Setup"); + return history; + } +} \ No newline at end of file diff --git a/test/distributed/org/apache/cassandra/distributed/test/accord/AccordIncrementalRepairTest.java b/test/distributed/org/apache/cassandra/distributed/test/accord/AccordIncrementalRepairTest.java new file mode 100644 index 000000000000..89400fbd55e9 --- /dev/null +++ b/test/distributed/org/apache/cassandra/distributed/test/accord/AccordIncrementalRepairTest.java @@ -0,0 +1,443 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.distributed.test.accord; + +import java.util.Collection; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicReference; +import javax.annotation.Nullable; + +import com.google.common.collect.Iterables; +import org.junit.After; +import org.junit.Assert; +import org.junit.BeforeClass; +import org.junit.Ignore; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import accord.impl.progresslog.DefaultProgressLogs; +import accord.local.Node; +import accord.local.PreLoadContext; +import accord.local.SafeCommand; +import accord.local.StoreParticipants; +import accord.local.cfk.CommandsForKey; +import accord.local.cfk.SafeCommandsForKey; +import accord.local.durability.DurabilityService; +import accord.primitives.Keys; +import accord.primitives.Ranges; +import accord.primitives.Status; +import accord.primitives.Timestamp; +import accord.primitives.TxnId; +import accord.utils.async.AsyncChains; +import accord.utils.async.AsyncResult; +import org.apache.cassandra.cql3.QueryProcessor; +import org.apache.cassandra.cql3.UntypedResultSet; +import org.apache.cassandra.db.ColumnFamilyStore; +import org.apache.cassandra.db.Keyspace; +import org.apache.cassandra.distributed.api.Feature; +import org.apache.cassandra.distributed.api.IInvokableInstance; +import org.apache.cassandra.distributed.api.IIsolatedExecutor; +import org.apache.cassandra.gms.FailureDetector; +import org.apache.cassandra.locator.InetAddressAndPort; +import org.apache.cassandra.schema.Schema; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.service.accord.AccordSafeCommandStore; +import org.apache.cassandra.service.accord.AccordService; +import org.apache.cassandra.service.accord.IAccordService; +import org.apache.cassandra.service.accord.IAccordService.DelegatingAccordService; +import org.apache.cassandra.service.accord.api.TokenKey; +import org.apache.cassandra.service.consensus.TransactionalMode; +import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.Clock; +import org.apache.cassandra.utils.concurrent.Future; +import org.apache.cassandra.utils.concurrent.UncheckedInterruptedException; + +import static accord.local.KeyHistory.SYNC; +import static java.lang.String.format; + +public class AccordIncrementalRepairTest extends AccordTestBase +{ + private static final Logger logger = LoggerFactory.getLogger(AccordIncrementalRepairTest.class); + + public static class BarrierRecordingService extends DelegatingAccordService + { + private volatile boolean executedBarriers = false; + + public BarrierRecordingService(IAccordService delegate) + { + super(delegate); + } + + @Override + public AsyncResult sync(Object requestedBy, @Nullable Timestamp onOrAfter, Ranges ranges, @Nullable Collection include, DurabilityService.SyncLocal syncLocal, DurabilityService.SyncRemote syncRemote) + { + return delegate.sync(requestedBy, onOrAfter, ranges, include, syncLocal, syncRemote).map(v -> { + executedBarriers = true; + return v; + }).beginAsResult(); + } + + @Override + public AsyncResult sync(@Nullable Timestamp onOrAfter, Keys keys, DurabilityService.SyncLocal syncLocal, DurabilityService.SyncRemote syncRemote) + { + return delegate.sync(onOrAfter, keys, syncLocal, syncRemote).map(v -> { + executedBarriers = true; + return v; + }).beginAsResult(); + } + + public void reset() + { + executedBarriers = false; + } + } + + static BarrierRecordingService barrierRecordingService() + { + return (BarrierRecordingService) AccordService.instance(); + } + + static IAccordService accordService() + { + return AccordService.instance(); + } + + @Override + protected Logger logger() + { + return logger; + } + + @BeforeClass + public static void setupClass() throws Throwable + { + setupCluster(opt -> opt.withConfig(conf -> conf.with(Feature.NETWORK, Feature.GOSSIP) + .set("accord.recover_txn", "1s") + .set("accord.retry_syncpoint", "1s*attempts") + .set("accord.retry_durability", "1s*attempts") + .set("accord.shard_durability_target_splits", 4) + ), 3); + for (IInvokableInstance instance : SHARED_CLUSTER) + instance.runOnInstance(() -> AccordService.unsafeSetNewAccordService(new BarrierRecordingService(AccordService.instance()))); +// setupCluster(opt -> opt, 3); + } + + @After + public void tearDown() + { + for (IInvokableInstance instance : SHARED_CLUSTER) + instance.runOnInstance(() -> DefaultProgressLogs.unsafePauseForTesting(false)); + SHARED_CLUSTER.filters().reset(); + } + + private static void await(IInvokableInstance instance, IIsolatedExecutor.SerializableCallable check, long duration, TimeUnit unit) + { + instance.runOnInstance(() -> { + long timeout = Clock.Global.currentTimeMillis() + unit.toMillis(duration); + while (Clock.Global.currentTimeMillis() < timeout) + { + if (check.call()) + return; + + try + { + Thread.sleep(1); + } + catch (InterruptedException e) + { + throw new AssertionError(e); + } + } + throw new AssertionError("Timed out waiting for node 3 to become alive"); + }); + } + + private static void awaitEndpointUp(IInvokableInstance instance, IInvokableInstance waitOn) + { + InetAddressAndPort endpoint = InetAddressAndPort.getByAddress(waitOn.broadcastAddress()); + await(instance, () -> FailureDetector.instance.isAlive(endpoint), 1, TimeUnit.MINUTES); + } + + private static void awaitEndpointDown(IInvokableInstance instance, IInvokableInstance waitOn) + { + InetAddressAndPort endpoint = InetAddressAndPort.getByAddress(waitOn.broadcastAddress()); + await(instance, () -> !FailureDetector.instance.isAlive(endpoint), 1, TimeUnit.MINUTES); + } + + private static V getUninterruptibly(Future future, long timeout, TimeUnit units) + { + try + { + return future.get(timeout, units); + } + catch (InterruptedException e) + { + throw new UncheckedInterruptedException(e); + } + catch (ExecutionException | TimeoutException e) + { + throw new RuntimeException(e); + } + } + + private static V getUninterruptibly(Future future) + { + return getUninterruptibly(future, 1, TimeUnit.MINUTES); + } + + private static TxnId awaitLocalApplyOnKey(TableMetadata metadata, int k) + { + return awaitLocalApplyOnKey(new TokenKey(metadata.id, metadata.partitioner.decorateKey(ByteBufferUtil.bytes(k)).getToken())); + } + + private static TxnId awaitLocalApplyOnKey(TokenKey key) + { + Node node = accordService().node(); + AtomicReference waitFor = new AtomicReference<>(null); + AsyncChains.awaitUninterruptibly(node.commandStores().ifLocal(PreLoadContext.contextFor(key, SYNC), key.toUnseekable(), 0, Long.MAX_VALUE, safeStore -> { + AccordSafeCommandStore store = (AccordSafeCommandStore) safeStore; + SafeCommandsForKey safeCfk = store.ifLoadedAndInitialised(key); + if (safeCfk == null) + return; + CommandsForKey cfk = safeCfk.current(); + int size = cfk.size(); + if (size < 1) + return; + // if txnId is an instance of CommandsForKey.TxnInfo, copying it into a + // new txnId instance will prevent any issues related to TxnInfo#hashCode + waitFor.set(new TxnId(cfk.txnId(size - 1))); + })); + Assert.assertNotNull(waitFor.get()); + TxnId txnId = waitFor.get(); + long start = Clock.Global.currentTimeMillis(); + AtomicBoolean applied = new AtomicBoolean(false); + while (!applied.get()) + { + long now = Clock.Global.currentTimeMillis(); + if (now - start > TimeUnit.MINUTES.toMillis(1)) + throw new AssertionError("Timeout"); + AsyncChains.awaitUninterruptibly(node.commandStores().ifLocal(txnId, key.toUnseekable(), 0, Long.MAX_VALUE, safeStore -> { + SafeCommand command = safeStore.get(txnId, StoreParticipants.empty(txnId)); + Assert.assertNotNull(command.current()); + if (command.current().status().hasBeen(Status.Applied)) + applied.set(true); + })); + } + return txnId; + } + + // TODO (required): After conversation with Ariel: it's a known issue that I am not sure we need to fix now. + // The problem is that we don't flush after Accord repair, but before data repair when running incremental + // repair so it doesn't see the repaired sstables it is checking for. + // This hard fails now that incremental repair Accord barriers are at all to account for the missing flushes + @Ignore + @Test + public void txnRepairTest() throws Throwable + { + SHARED_CLUSTER.schemaChange(format("CREATE TABLE %s.%s (k int primary key, v int) WITH transactional_mode='full' AND fast_path={'size':2};", KEYSPACE, accordTableName)); + final String keyspace = KEYSPACE; + final String table = accordTableName; + + SHARED_CLUSTER.filters().allVerbs().to(3).drop(); + awaitEndpointDown(SHARED_CLUSTER.get(1), SHARED_CLUSTER.get(3)); + + executeWithRetry(SHARED_CLUSTER, format("BEGIN TRANSACTION\n" + + "INSERT INTO %s (k, v) VALUES (1, 1);\n" + + "COMMIT TRANSACTION", qualifiedAccordTableName)); + + SHARED_CLUSTER.get(1, 2).forEach(instance -> instance.runOnInstance(() -> { + TableMetadata metadata = Schema.instance.getTableMetadata(keyspace, table); + awaitLocalApplyOnKey(metadata, 1); + })); + + SHARED_CLUSTER.forEach(instance -> instance.runOnInstance(() -> barrierRecordingService().reset())); + + SHARED_CLUSTER.get(1, 2).forEach(instance -> { + instance.runOnInstance(() -> { + ColumnFamilyStore cfs = Keyspace.open(keyspace).getColumnFamilyStore(table); + cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS); + Assert.assertFalse(cfs.getLiveSSTables().isEmpty()); + cfs.getLiveSSTables().forEach(sstable -> { + Assert.assertFalse(sstable.isRepaired()); + Assert.assertFalse(sstable.isPendingRepair()); + }); + }); + }); + SHARED_CLUSTER.get(3).runOnInstance(() -> { + ColumnFamilyStore cfs = Keyspace.open(keyspace).getColumnFamilyStore(table); + cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS); + Assert.assertTrue(cfs.getLiveSSTables().isEmpty()); + }); + + // heal partition and wait for node 1 to see node 3 again + for (IInvokableInstance instance : SHARED_CLUSTER) + instance.runOnInstance(() -> { + DefaultProgressLogs.unsafePauseForTesting(true); + Assert.assertFalse(barrierRecordingService().executedBarriers); + }); + SHARED_CLUSTER.filters().reset(); + awaitEndpointUp(SHARED_CLUSTER.get(1), SHARED_CLUSTER.get(3)); + nodetool(SHARED_CLUSTER.get(1), "repair", KEYSPACE); + + SHARED_CLUSTER.get(1).runOnInstance(() -> { + Assert.assertTrue(barrierRecordingService().executedBarriers); + ColumnFamilyStore cfs = Keyspace.open(keyspace).getColumnFamilyStore(table); + Assert.assertFalse(cfs.getLiveSSTables().isEmpty()); + cfs.getLiveSSTables().forEach(sstable -> { + Assert.assertTrue(sstable.isRepaired() || sstable.isPendingRepair()); + }); + }); + } + + private void testSingleNodeWrite(TransactionalMode mode) + { + SHARED_CLUSTER.schemaChange(format("CREATE TABLE %s.%s (k int primary key, v int) WITH transactional_mode='%s';", KEYSPACE, accordTableName, mode)); + final String keyspace = KEYSPACE; + final String table = accordTableName; + + SHARED_CLUSTER.get(3).runOnInstance(() -> { + QueryProcessor.executeInternal(String.format("INSERT INTO %s.%s (k, v) VALUES (1, 2);", keyspace, table)); + }); + + SHARED_CLUSTER.get(3).runOnInstance(() -> { + UntypedResultSet result = QueryProcessor.executeInternal(format("SELECT * FROM %s.%s WHERE k=1", keyspace, table)); + Assert.assertFalse(result.isEmpty()); + UntypedResultSet.Row row = Iterables.getOnlyElement(result); + Assert.assertEquals(1, row.getInt("k")); + Assert.assertEquals(2, row.getInt("v")); + + + + ColumnFamilyStore cfs = Keyspace.open(keyspace).getColumnFamilyStore(table); + cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS); + Assert.assertFalse(cfs.getLiveSSTables().isEmpty()); + cfs.getLiveSSTables().forEach(sstable -> { + Assert.assertFalse(sstable.isRepaired()); + Assert.assertFalse(sstable.isPendingRepair()); + }); + }); + SHARED_CLUSTER.get(1, 2).forEach(instance -> instance.runOnInstance(() -> { + UntypedResultSet result = QueryProcessor.executeInternal(format("SELECT * FROM %s.%s WHERE k=1", keyspace, table)); + Assert.assertTrue(result.isEmpty()); + + ColumnFamilyStore cfs = Keyspace.open(keyspace).getColumnFamilyStore(table); + cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS); + Assert.assertTrue(cfs.getLiveSSTables().isEmpty()); + })); + SHARED_CLUSTER.forEach(instance -> instance.runOnInstance(() -> { + barrierRecordingService().reset(); + })); + + nodetool(SHARED_CLUSTER.get(1), "repair", KEYSPACE); + SHARED_CLUSTER.get(1).runOnInstance(() -> { + Assert.assertTrue(barrierRecordingService().executedBarriers); + ColumnFamilyStore cfs = Keyspace.open(keyspace).getColumnFamilyStore(table); + Assert.assertFalse(cfs.getLiveSSTables().isEmpty()); + cfs.getLiveSSTables().forEach(sstable -> { + Assert.assertTrue(sstable.isRepaired() || sstable.isPendingRepair()); + }); + + UntypedResultSet result = QueryProcessor.executeInternal(format("SELECT * FROM %s.%s WHERE k=1", keyspace, table)); + Assert.assertFalse(result.isEmpty()); + UntypedResultSet.Row row = Iterables.getOnlyElement(result); + Assert.assertEquals(1, row.getInt("k")); + Assert.assertEquals(2, row.getInt("v")); + }); + } + + /** + * a failed write at txn mode unsafe should be made visible by repair + */ + @Test + public void unsafeRepairTest() + { + testSingleNodeWrite(TransactionalMode.test_unsafe); + } + + /** + * Repair should repair (fully replicate _some_ state) any divergent state between replicas + */ + @Test + public void fullRepairTest() + { + testSingleNodeWrite(TransactionalMode.full); + } + + @Test + public void onlyAccordTest() + { + SHARED_CLUSTER.schemaChange(format("CREATE TABLE %s.%s (k int primary key, v int) WITH transactional_mode='full' AND fast_path={'size':2};", KEYSPACE, accordTableName)); + final String keyspace = KEYSPACE; + final String table = accordTableName; + + executeWithRetry(SHARED_CLUSTER, format("BEGIN TRANSACTION\n" + + "INSERT INTO %s (k, v) VALUES (1, 1);\n" + + "COMMIT TRANSACTION", qualifiedAccordTableName)); + + SHARED_CLUSTER.get(1, 2).forEach(instance -> instance.runOnInstance(() -> { + TableMetadata metadata = Schema.instance.getTableMetadata(keyspace, table); + awaitLocalApplyOnKey(metadata, 1); + })); + + SHARED_CLUSTER.forEach(instance -> instance.runOnInstance(() -> barrierRecordingService().reset())); + + SHARED_CLUSTER.filters().reset(); + awaitEndpointUp(SHARED_CLUSTER.get(1), SHARED_CLUSTER.get(3)); + nodetool(SHARED_CLUSTER.get(1), "repair", "--accord-only", KEYSPACE); + + SHARED_CLUSTER.get(1).runOnInstance(() -> { + Assert.assertTrue(barrierRecordingService().executedBarriers); + }); + } + + @Test + public void onlyAccordWithForceTest() + { + SHARED_CLUSTER.schemaChange(format("CREATE TABLE %s.%s (k int primary key, v int) WITH transactional_mode='full' AND fast_path={'size':2};", KEYSPACE, accordTableName)); + final String keyspace = KEYSPACE; + final String table = accordTableName; + + SHARED_CLUSTER.filters().allVerbs().to(3).drop(); + awaitEndpointDown(SHARED_CLUSTER.get(1), SHARED_CLUSTER.get(3)); + awaitEndpointDown(SHARED_CLUSTER.get(2), SHARED_CLUSTER.get(3)); + + executeWithRetry(SHARED_CLUSTER, format("BEGIN TRANSACTION\n" + + "INSERT INTO %s (k, v) VALUES (1, 1);\n" + + "COMMIT TRANSACTION", qualifiedAccordTableName)); + + SHARED_CLUSTER.get(1, 2).forEach(instance -> instance.runOnInstance(() -> { + TableMetadata metadata = Schema.instance.getTableMetadata(keyspace, table); + awaitLocalApplyOnKey(metadata, 1); + })); + + SHARED_CLUSTER.forEach(instance -> instance.runOnInstance(() -> barrierRecordingService().reset())); + + SHARED_CLUSTER.filters().reset(); + awaitEndpointUp(SHARED_CLUSTER.get(1), SHARED_CLUSTER.get(3)); + nodetool(SHARED_CLUSTER.get(1), "repair", "--force", "--accord-only", KEYSPACE); + + SHARED_CLUSTER.get(1).runOnInstance(() -> { + Assert.assertTrue(barrierRecordingService().executedBarriers); + }); + } +} diff --git a/test/distributed/org/apache/cassandra/distributed/test/accord/AccordIntegrationTest.java b/test/distributed/org/apache/cassandra/distributed/test/accord/AccordIntegrationTest.java new file mode 100644 index 000000000000..144fcdf311e2 --- /dev/null +++ b/test/distributed/org/apache/cassandra/distributed/test/accord/AccordIntegrationTest.java @@ -0,0 +1,130 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.distributed.test.accord; + +import java.io.IOException; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.function.Function; + +import org.junit.BeforeClass; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import accord.impl.progresslog.DefaultProgressLogs; +import org.apache.cassandra.distributed.api.IInvokableInstance; +import org.apache.cassandra.distributed.api.IMessageFilters; +import org.apache.cassandra.net.Verb; + +public class AccordIntegrationTest extends AccordTestBase +{ + private static final Logger logger = LoggerFactory.getLogger(AccordIntegrationTest.class); + + @Override + protected Logger logger() + { + return logger; + } + + @BeforeClass + public static void setUp() throws IOException + { + AccordTestBase.setupCluster(Function.identity(), 2); + } + + @Test + public void testRecovery() throws Exception + { + pauseSimpleProgressLog(); + test(cluster -> { + IMessageFilters.Filter lostApply = cluster.filters().verbs(Verb.ACCORD_APPLY_REQ.id).drop(); + IMessageFilters.Filter lostCommit = cluster.filters().verbs(Verb.ACCORD_COMMIT_REQ.id).to(2).drop(); + + String query = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT v FROM " + qualifiedAccordTableName + " WHERE k=0 AND c=0);\n" + + " SELECT row1.v;\n" + + " IF row1 IS NULL THEN\n" + + " INSERT INTO " + qualifiedAccordTableName + " (k, c, v) VALUES (0, 0, 1);\n" + + " END IF\n" + + "COMMIT TRANSACTION"; + // row1.v shouldn't have existed when the txn's SELECT was executed + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { null }, query); + + lostApply.off(); + lostCommit.off(); + + query = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT v FROM " + qualifiedAccordTableName + " WHERE k=0 AND c=0);\n" + + " SELECT row1.v;\n" + + " IF row1.v = 1 THEN\n" + + " UPDATE " + qualifiedAccordTableName + " SET v=2 WHERE k = 0 AND c = 0;\n" + + " END IF\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { 1 }, query); + + String check = "BEGIN TRANSACTION\n" + + " SELECT * FROM " + qualifiedAccordTableName + " WHERE k = ? AND c = ?;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] {0, 0, 2}, check, 0, 0); + + query = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT v FROM " + qualifiedAccordTableName + " WHERE k=0 AND c=0);\n" + + " SELECT row1.v;\n" + + " IF row1 IS NULL THEN\n" + + " INSERT INTO " + qualifiedAccordTableName + " (k, c, v) VALUES (0, 0, 3);\n" + + " END IF\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { 2 }, query); + + assertRowEqualsWithPreemptedRetry(cluster, new Object[] {0, 0, 2}, check, 0, 0); + }); + } + + @Test + public void testLostCommitReadTriggersFallbackRead() throws Exception + { + pauseSimpleProgressLog(); + test(cluster -> { + // It's expected that the required Read will happen regardless of whether this fails to return a read + final AtomicBoolean droppedOne = new AtomicBoolean(); + cluster.filters().verbs(Verb.ACCORD_COMMIT_REQ.id).messagesMatching((from, to, iMessage) -> !droppedOne.getAndSet(true)).drop(); + + String query = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT * FROM " + qualifiedAccordTableName + " WHERE k = 0 AND c = 0);\n" + + " SELECT row1.v;\n" + + " IF row1 IS NULL THEN\n" + + " INSERT INTO " + qualifiedAccordTableName + " (k, c, v) VALUES (0, 0, 1);\n" + + " END IF\n" + + "COMMIT TRANSACTION"; + + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { null }, query); + + String check = "BEGIN TRANSACTION\n" + + " SELECT * FROM " + qualifiedAccordTableName + " WHERE k = ? AND c = ?;\n" + + "COMMIT TRANSACTION"; + assertRowEqualsWithPreemptedRetry(cluster, new Object[] { 0, 0, 1 }, check, 0, 0); + }); + } + + private void pauseSimpleProgressLog() + { + for (IInvokableInstance instance : SHARED_CLUSTER) + instance.runOnInstance(() -> DefaultProgressLogs.unsafePauseForTesting(true)); + } +} diff --git a/test/distributed/org/apache/cassandra/distributed/test/accord/AccordInteropReadTest.java b/test/distributed/org/apache/cassandra/distributed/test/accord/AccordInteropReadTest.java new file mode 100644 index 000000000000..fb32c0a15f90 --- /dev/null +++ b/test/distributed/org/apache/cassandra/distributed/test/accord/AccordInteropReadTest.java @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.distributed.test.accord; + +import org.junit.Test; + +import org.apache.cassandra.cql3.QueryOptions; +import org.apache.cassandra.cql3.QueryProcessor; +import org.apache.cassandra.cql3.statements.ModificationStatement; +import org.apache.cassandra.cql3.statements.SelectStatement; +import org.apache.cassandra.distributed.Cluster; +import org.apache.cassandra.distributed.api.ConsistencyLevel; +import org.apache.cassandra.distributed.api.SimpleQueryResult; +import org.apache.cassandra.distributed.impl.RowUtil; +import org.apache.cassandra.distributed.test.TestBaseImpl; +import org.apache.cassandra.distributed.util.QueryResultUtil; +import org.apache.cassandra.service.ClientState; +import org.apache.cassandra.service.QueryState; +import org.apache.cassandra.service.consensus.TransactionalMode; + +import static org.apache.cassandra.distributed.api.Feature.GOSSIP; +import static org.apache.cassandra.distributed.api.Feature.NETWORK; + +public class AccordInteropReadTest extends TestBaseImpl +{ + + private static void localWrite(String s) + { + ModificationStatement stmt = (ModificationStatement) QueryProcessor.parseStatement(s).prepare(ClientState.forInternalCalls()); + stmt.executeLocally(QueryState.forInternalCalls(), QueryOptions.DEFAULT); + } + + private static SimpleQueryResult localRead(String s) + { + SelectStatement stmt = (SelectStatement) QueryProcessor.parseStatement(s).prepare(ClientState.forInternalCalls()); + return RowUtil.toQueryResult(stmt.executeLocally(QueryState.forInternalCalls(), QueryOptions.DEFAULT)); + } + + private static Object[] obj(Object... values) + { + return values; + } + + @Test + public void serialReadTest() throws Throwable + { + try (Cluster cluster = builder().withNodes(3) + .withConfig(config -> config.with(GOSSIP).with(NETWORK)).start()) + { + cluster.schemaChange("CREATE KEYSPACE ks WITH REPLICATION={'class':'SimpleStrategy', 'replication_factor':3}"); + cluster.schemaChange("CREATE TABLE ks.tbl (k int, c int, v int, PRIMARY KEY (k, c)) WITH " + TransactionalMode.test_unsafe_writes.asCqlParam()); + + cluster.get(1).runOnInstance(() -> localWrite("INSERT INTO ks.tbl (k, c, v) VALUES (1, 1, 1)")); + cluster.get(2).runOnInstance(() -> localWrite("INSERT INTO ks.tbl (k, c, v) VALUES (1, 1, 2)")); + cluster.get(3).shutdown(); + cluster.get(1).runOnInstance(() -> QueryResultUtil.assertThat(localRead("SELECT * FROM ks.tbl WHERE k=1")).isEqualTo(obj(obj(1, 1, 1)))); + cluster.get(2).runOnInstance(() -> QueryResultUtil.assertThat(localRead("SELECT * FROM ks.tbl WHERE k=1")).isEqualTo(obj(obj(1, 1, 2)))); + + + String query = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT v FROM ks.tbl WHERE k=0 AND c=0);\n" + + " SELECT row1.v;\n" + + " IF row1 IS NULL THEN\n" + + " INSERT INTO ks.tbl (k, c, v) VALUES (0, 0, 1);\n" + + " END IF\n" + + "COMMIT TRANSACTION"; + + SimpleQueryResult result = cluster.coordinator(1).executeWithResult("SELECT * FROM ks.tbl WHERE k=1", ConsistencyLevel.SERIAL); + QueryResultUtil.assertThat(result).isEqualTo(obj(obj(1, 1, 2))); + cluster.get(1).runOnInstance(() -> QueryResultUtil.assertThat(localRead("SELECT * FROM ks.tbl WHERE k=1")).isEqualTo(obj(obj(1, 1, 2)))); + cluster.get(2).runOnInstance(() -> QueryResultUtil.assertThat(localRead("SELECT * FROM ks.tbl WHERE k=1")).isEqualTo(obj(obj(1, 1, 2)))); + } + } +} diff --git a/test/distributed/org/apache/cassandra/distributed/test/accord/AccordInteroperabilityTest.java b/test/distributed/org/apache/cassandra/distributed/test/accord/AccordInteroperabilityTest.java new file mode 100644 index 000000000000..8f072a9782e6 --- /dev/null +++ b/test/distributed/org/apache/cassandra/distributed/test/accord/AccordInteroperabilityTest.java @@ -0,0 +1,142 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.distributed.test.accord; + +import java.io.IOException; +import java.util.function.Function; + +import org.junit.After; +import org.junit.BeforeClass; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.cassandra.db.ConsistencyLevel; +import org.apache.cassandra.distributed.api.ICoordinator; +import org.apache.cassandra.distributed.shared.AssertUtils; +import org.apache.cassandra.exceptions.InvalidRequestException; +import org.apache.cassandra.service.accord.IAccordService; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.fail; + +public class AccordInteroperabilityTest extends AccordTestBase +{ + private static final Logger logger = LoggerFactory.getLogger(AccordInteroperabilityTest.class); + + @Override + protected Logger logger() + { + return logger; + } + + @BeforeClass + public static void setupClass() throws IOException + { + AccordTestBase.setupCluster(builder -> builder, 3); + } + + @After + public void tearDown() + { + SHARED_CLUSTER.setMessageSink(null); + } + + @Test + public void testSerialReadDescending() throws Throwable + { + test("CREATE TABLE " + qualifiedAccordTableName + " (k int, c int, v int, PRIMARY KEY(k, c)) WITH transactional_mode='full'", + cluster -> { + ICoordinator coordinator = cluster.coordinator(1); + for (int i = 1; i <= 10; i++) + coordinator.execute("INSERT INTO " + qualifiedAccordTableName + " (k, c, v) VALUES (0, ?, ?) USING TIMESTAMP 0;", org.apache.cassandra.distributed.api.ConsistencyLevel.ALL, i, i * 10); + assertRowSerial(cluster, "SELECT c, v FROM " + qualifiedAccordTableName + " WHERE k=0 ORDER BY c DESC LIMIT 1", AssertUtils.row(10, 100)); + assertRowSerial(cluster, "SELECT c, v FROM " + qualifiedAccordTableName + " WHERE k=0 ORDER BY c DESC LIMIT 2", AssertUtils.row(10, 100), AssertUtils.row(9, 90)); + assertRowSerial(cluster, "SELECT c, v FROM " + qualifiedAccordTableName + " WHERE k=0 ORDER BY c DESC LIMIT 3", AssertUtils.row(10, 100), AssertUtils.row(9, 90), AssertUtils.row(8, 80)); + assertRowSerial(cluster, "SELECT c, v FROM " + qualifiedAccordTableName + " WHERE k=0 ORDER BY c DESC LIMIT 4", AssertUtils.row(10, 100), AssertUtils.row(9, 90), AssertUtils.row(8, 80), AssertUtils.row(7, 70)); + } + ); + } + + private static Object[][] assertTargetAccordRead(Function query, int coordinatorIndex, int key, int expectedAccordReadCount) + { + int startingReadCount = getAccordReadCount(coordinatorIndex); + Object[][] result = query.apply(key); + assertEquals("Accord reads", expectedAccordReadCount, getAccordReadCount(coordinatorIndex) - startingReadCount); + return result; + } + + private static Object[][] assertTargetAccordWrite(Function query, int coordinatorIndex, int key, int expectedAccordWriteCount) + { + int startingWriteCount = getAccordWriteCount(coordinatorIndex); + Object[][] result = query.apply(key); + assertEquals("Accord writes", expectedAccordWriteCount, getAccordWriteCount(coordinatorIndex) - startingWriteCount); + return result; + } + + @Test + public void testNonSerialReadIsThroughAccordFull() throws Throwable + { + test("CREATE TABLE " + qualifiedAccordTableName + " (k int, c int, v int, PRIMARY KEY(k, c)) WITH transactional_mode='full'", + cluster -> { + for (ConsistencyLevel cl : ConsistencyLevel.values()) + { + try + { + if (cl == ConsistencyLevel.ANY || cl == ConsistencyLevel.NODE_LOCAL) + continue; + assertTargetAccordRead(key -> cluster.coordinator(1).execute("SELECT * FROM " + qualifiedAccordTableName + " WHERE k = ?", org.apache.cassandra.distributed.api.ConsistencyLevel.valueOf(cl.name()), key), 1, 1, 1); + if (!IAccordService.SUPPORTED_READ_CONSISTENCY_LEVELS.contains(cl)) + fail("Unsupported consistency level succeeded"); + + } + catch (Throwable t) + { + assertEquals(InvalidRequestException.class.getName(), t.getClass().getName()); + assertEquals(cl + " is not supported by Accord", t.getMessage()); + } + } + }); + } + + @Test + public void testNonSerialWriteIsThroughAccordFull() throws Throwable + { + test("CREATE TABLE " + qualifiedAccordTableName + " (k int, c int, v int, PRIMARY KEY(k, c)) WITH transactional_mode='full'", + cluster -> { + for (ConsistencyLevel cl : ConsistencyLevel.values()) + { + try + { + assertTargetAccordWrite(key -> cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + " (k, c, v) VALUES (?, 43, 44)", org.apache.cassandra.distributed.api.ConsistencyLevel.valueOf(cl.name()), key), 1, 1, 1); + if (!IAccordService.SUPPORTED_COMMIT_CONSISTENCY_LEVELS.contains(cl)) + fail("Unsupported consistency level succeeded"); + } + catch (Throwable t) + { + assertEquals(InvalidRequestException.class.getName(), t.getClass().getName()); + if (cl == ConsistencyLevel.SERIAL || cl == ConsistencyLevel.LOCAL_SERIAL) + assertEquals("You must use conditional updates for serializable writes", t.getMessage()); + else + assertEquals(cl + " is not supported by Accord", t.getMessage()); + } + } + }); + } +} diff --git a/test/distributed/org/apache/cassandra/distributed/test/accord/AccordJournalIntegrationTest.java b/test/distributed/org/apache/cassandra/distributed/test/accord/AccordJournalIntegrationTest.java new file mode 100644 index 000000000000..f77bfa2016bf --- /dev/null +++ b/test/distributed/org/apache/cassandra/distributed/test/accord/AccordJournalIntegrationTest.java @@ -0,0 +1,162 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.distributed.test.accord; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.concurrent.atomic.AtomicInteger; + +import org.apache.cassandra.distributed.api.IInvokableInstance; +import org.junit.Assert; +import org.junit.Test; + +import org.apache.cassandra.config.CassandraRelevantProperties; +import org.apache.cassandra.distributed.Cluster; +import org.apache.cassandra.distributed.api.ConsistencyLevel; +import org.apache.cassandra.distributed.shared.ClusterUtils; +import org.apache.cassandra.distributed.shared.WithProperties; +import org.apache.cassandra.distributed.test.TestBaseImpl; +import org.apache.cassandra.service.accord.AccordService; +import org.apache.cassandra.utils.concurrent.CountDownLatch; + +import static org.apache.cassandra.distributed.api.Feature.GOSSIP; +import static org.apache.cassandra.distributed.api.Feature.NETWORK; + +public class AccordJournalIntegrationTest extends TestBaseImpl +{ + @Test + public void saveLoadSanityCheck() throws Throwable + { + try (WithProperties wp = new WithProperties().set(CassandraRelevantProperties.DTEST_ACCORD_JOURNAL_SANITY_CHECK_ENABLED, "true"); + Cluster cluster = init(Cluster.build(1) + .withoutVNodes() + .start())) + { + final String TABLE = createTable(cluster); + List threads = new ArrayList<>(); + int numThreads = 10; + CountDownLatch latch = CountDownLatch.newCountDownLatch(numThreads); + AtomicInteger counter = new AtomicInteger(); + for (int i = 0; i < numThreads; i++) + { + int finalI = i; + Thread t = new Thread(() -> { + latch.decrement(); + latch.awaitUninterruptibly(); + try + { + for (int j = 0; j < 100; j++) + { + cluster.coordinator(1).execute("BEGIN TRANSACTION\n" + + "INSERT INTO " + TABLE + "(k, c, v) VALUES (?, ?, ?);\n" + + "INSERT INTO " + TABLE + "(k, c, v) VALUES (?, ?, ?);\n" + + "COMMIT TRANSACTION", + ConsistencyLevel.ALL, + 1, j, finalI * 100 + j, + 2, j, finalI * 100 + j); + counter.incrementAndGet(); + } + } + catch (Throwable throwable) + { + throwable.printStackTrace(); + } + }); + t.start(); + threads.add(t); + } + for (Thread thread : threads) + thread.join(); + + cluster.coordinator(1).execute("SELECT * FROM " + TABLE + " WHERE k = ?;", ConsistencyLevel.SERIAL, 1); + } + } + + @Test + public void memtableStateReloadingTest() throws Throwable + { + try (Cluster cluster = Cluster.build(1) + .withoutVNodes() + .start()) + { + cluster.schemaChange("CREATE KEYSPACE " + KEYSPACE + " WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 1};"); + final String TABLE = createTable(cluster); + + insertData(cluster, TABLE); + + Object[][] before = cluster.coordinator(1).execute("SELECT * FROM " + TABLE + " WHERE k = ?;", ConsistencyLevel.SERIAL, 1); + + cluster.get(1).runOnInstance(() -> { + ((AccordService) AccordService.instance()).journal().closeCurrentSegmentForTestingIfNonEmpty(); + }); + ClusterUtils.stopUnchecked(cluster.get(1)); + cluster.get(1).startup(); + + Object[][] after = cluster.coordinator(1).execute("SELECT * FROM " + TABLE + " WHERE k = ?;", ConsistencyLevel.SERIAL, 1); + Assert.assertEquals(before.length, after.length); + for (int i = 0; i < before.length; i++) + { + Assert.assertTrue(Arrays.equals(before[i], after[i])); + } + } + } + + @Test + public void restartWithEpochChanges() throws IOException + { + try (Cluster cluster = Cluster.build(3).withoutVNodes().withConfig(c -> c.with(GOSSIP).with(NETWORK)).start()) + { + init(cluster); + final String TABLE = createTable(cluster); + cluster.get(1).nodetoolResult("cms", "reconfigure", "3").asserts().success(); + + insertData(cluster, TABLE); + + IInvokableInstance restartNode = cluster.get(1); + ClusterUtils.stopUnchecked(restartNode); + + // make epoch changes + for (int i = 0; i < 10; i++) + cluster.schemaChange("ALTER TABLE " + TABLE + " WITH comment = 'change " + i + "'", true, cluster.get(2)); + + restartNode.startup(); + insertData(cluster, TABLE); + } + } + + private void insertData(Cluster cluster, String TABLE) { + for (int j = 0; j < 1_000; j++) + { + cluster.coordinator(1).execute("BEGIN TRANSACTION\n" + + "INSERT INTO " + TABLE + "(k, c, v) VALUES (?, ?, ?);\n" + + "COMMIT TRANSACTION", + ConsistencyLevel.ALL, + j, j, 1 + ); + } + } + + private String createTable(Cluster cluster) { + final String TABLE = KEYSPACE + ".test_table"; + cluster.schemaChange("CREATE TABLE " + TABLE + " (k int, c int, v int, primary key (k, c)) WITH transactional_mode='full'"); + return TABLE; + } +} \ No newline at end of file diff --git a/test/distributed/org/apache/cassandra/distributed/test/accord/AccordLoadTest.java b/test/distributed/org/apache/cassandra/distributed/test/accord/AccordLoadTest.java new file mode 100644 index 000000000000..2a99a350b58e --- /dev/null +++ b/test/distributed/org/apache/cassandra/distributed/test/accord/AccordLoadTest.java @@ -0,0 +1,278 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.distributed.test.accord; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.BitSet; +import java.util.Comparator; +import java.util.Date; +import java.util.List; +import java.util.Map; +import java.util.Random; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Semaphore; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; + +import com.google.common.util.concurrent.RateLimiter; +import org.junit.BeforeClass; +import org.junit.Ignore; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.cassandra.config.CassandraRelevantProperties; +import org.apache.cassandra.distributed.Cluster; +import org.apache.cassandra.distributed.api.ConsistencyLevel; +import org.apache.cassandra.distributed.api.Feature; +import org.apache.cassandra.distributed.api.ICoordinator; +import org.apache.cassandra.distributed.api.IMessage; +import org.apache.cassandra.distributed.api.IMessageFilters; +import org.apache.cassandra.distributed.shared.DistributedTestBase; +import org.apache.cassandra.net.Verb; +import org.apache.cassandra.service.accord.AccordService; +import org.apache.cassandra.utils.EstimatedHistogram; + +import static java.lang.System.currentTimeMillis; +import static java.util.concurrent.TimeUnit.MILLISECONDS; +import static java.util.concurrent.TimeUnit.NANOSECONDS; +import static java.util.concurrent.TimeUnit.SECONDS; + +public class AccordLoadTest extends AccordTestBase +{ + private static final Logger logger = LoggerFactory.getLogger(AccordLoadTest.class); + + @BeforeClass + public static void setUp() throws IOException + { + CassandraRelevantProperties.SIMULATOR_STARTED.setString(Long.toString(MILLISECONDS.toSeconds(currentTimeMillis()))); +// AccordTestBase.setupCluster(builder -> builder, 3); + AccordTestBase.setupCluster(builder -> builder.withConfig(config -> config + .with(Feature.NETWORK, Feature.GOSSIP) + .set("accord.shard_durability_target_splits", "64") + .set("accord.shard_durability_cycle", "5m") +// .set("accord.ephemeral_read_enabled", "true") + .set("accord.gc_delay", "5s")), 3); + } + + @Ignore + @Test + public void testLoad() throws Exception + { + Cluster cluster = SHARED_CLUSTER; + cluster.schemaChange("CREATE TABLE " + qualifiedAccordTableName + " (k int, v int, PRIMARY KEY(k)) WITH transactional_mode = 'full'"); + + try + { + + final ConcurrentHashMap verbs = new ConcurrentHashMap<>(); + cluster.filters().outbound().messagesMatching(new IMessageFilters.Matcher() + { + @Override + public boolean matches(int i, int i1, IMessage iMessage) + { + verbs.computeIfAbsent(Verb.fromId(iMessage.verb()), ignore -> new AtomicInteger()).incrementAndGet(); + return false; + } + }).drop(); + + ICoordinator coordinator = cluster.coordinator(1); + final int repairInterval = Integer.MAX_VALUE; + // final int repairInterval = 3000; + final int compactionInterval = Integer.MAX_VALUE; +// final int compactionInterval = 3000; + final int flushInterval = Integer.MAX_VALUE; +// final int flushInterval = 1000; + final int compactionPeriodSeconds = 1; + final int restartInterval = 150_000_000; + final int batchSizeLimit = 1000; + final long batchTime = TimeUnit.SECONDS.toNanos(10); + final int concurrency = 100; + final int ratePerSecond = 1000; + final int keyCount = 10_000; + final float readChance = 0.33f; + long nextRepairAt = repairInterval; + long nextCompactionAt = compactionInterval; + long nextFlushAt = flushInterval; + long nextRestartAt = restartInterval; + final ExecutorService restartExecutor = Executors.newSingleThreadExecutor(); + final BitSet initialised = new BitSet(); + + cluster.get(1).nodetoolResult("cms", "reconfigure", "3").asserts().success(); + cluster.forEach(i -> i.runOnInstance(() -> { + if (compactionPeriodSeconds > 0) + ((AccordService) AccordService.instance()).journal().compactor().updateCompactionPeriod(1, SECONDS); + // ((AccordSpec.JournalSpec)((AccordService) AccordService.instance()).journal().configuration()).segmentSize = 128 << 10; + })); + + Random random = new Random(); + // CopyOnWriteArrayList exceptions = new CopyOnWriteArrayList<>(); + final Semaphore inFlight = new Semaphore(concurrency); + final RateLimiter rateLimiter = RateLimiter.create(ratePerSecond); + // long testStart = System.nanoTime(); + // while (NANOSECONDS.toMinutes(System.nanoTime() - testStart) < 10 && exceptions.size() < 10000) + while (true) + { + final EstimatedHistogram histogram = new EstimatedHistogram(200); + long batchStart = System.nanoTime(); + long batchEnd = batchStart + batchTime; + int batchSize = 0; + while (batchSize < batchSizeLimit) + { + inFlight.acquire(); + rateLimiter.acquire(); + long commandStart = System.nanoTime(); + int k = random.nextInt(keyCount); + if (random.nextFloat() < readChance) + { + coordinator.executeWithResult((success, fail) -> { + inFlight.release(); + if (fail == null) histogram.add(NANOSECONDS.toMicros(System.nanoTime() - commandStart)); + // else exceptions.add(fail); + }, "SELECT * FROM " + qualifiedAccordTableName + " WHERE k = ?;", ConsistencyLevel.SERIAL, k); + } + else if (initialised.get(k)) + { + coordinator.executeWithResult((success, fail) -> { + inFlight.release(); + if (fail == null) histogram.add(NANOSECONDS.toMicros(System.nanoTime() - commandStart)); + // else exceptions.add(fail); + }, "UPDATE " + qualifiedAccordTableName + " SET v += 1 WHERE k = ? IF EXISTS;", ConsistencyLevel.SERIAL, ConsistencyLevel.QUORUM, k); + } + else + { + initialised.set(k); + coordinator.executeWithResult((success, fail) -> { + inFlight.release(); + if (fail == null) histogram.add(NANOSECONDS.toMicros(System.nanoTime() - commandStart)); + // else exceptions.add(fail); + }, "UPDATE " + qualifiedAccordTableName + " SET v = 0 WHERE k = ? IF NOT EXISTS;", ConsistencyLevel.SERIAL, ConsistencyLevel.QUORUM, k); + } + batchSize++; + if (System.nanoTime() >= batchEnd) + break; + } + + if ((nextRepairAt -= batchSize) <= 0) + { + nextRepairAt += repairInterval; + System.out.println("repairing..."); + cluster.coordinator(1).instance().nodetool("repair", qualifiedAccordTableName); + } + + if ((nextCompactionAt -= batchSize) <= 0) + { + nextCompactionAt += compactionInterval; + System.out.println("compacting accord..."); + cluster.forEach(i -> { + i.nodetool("compact", "system_accord.journal"); + }); + } + + if ((nextFlushAt -= batchSize) <= 0) + { + nextFlushAt += flushInterval; + System.out.println("flushing journal..."); + cluster.forEach(i -> i.runOnInstance(() -> { + ((AccordService) AccordService.instance()).journal().closeCurrentSegmentForTestingIfNonEmpty(); + })); + } + + if ((nextRestartAt -= batchSize) <= 0) + { + nextRestartAt += restartInterval; + int nodeIdx = random.nextInt(cluster.size()); + + restartExecutor.submit(() -> { + System.out.printf("restarting node %d...\n", nodeIdx); + try + { + cluster.get(nodeIdx).shutdown().get(); + cluster.get(nodeIdx).startup(); + return null; + } + catch (InterruptedException | ExecutionException e) + { + throw new RuntimeException(e); + } + }); + } + + final Date date = new Date(); + System.out.printf("%tT rate: %.2f/s (%d total)\n", date, (((float)batchSizeLimit * 1000) / NANOSECONDS.toMillis(System.nanoTime() - batchStart)), batchSize); + System.out.printf("%tT percentiles: %d %d %d %d\n", date, histogram.percentile(.25)/1000, histogram.percentile(.5)/1000, histogram.percentile(.75)/1000, histogram.percentile(1)/1000); + + class VerbCount + { + final Verb verb; + final int count; + + VerbCount(Verb verb, int count) + { + this.verb = verb; + this.count = count; + } + } + List verbCounts = new ArrayList<>(); + for (Map.Entry e : verbs.entrySet()) + { + int count = e.getValue().getAndSet(0); + if (count != 0) verbCounts.add(new VerbCount(e.getKey(), count)); + } + verbCounts.sort(Comparator.comparing(v -> -v.count)); + + StringBuilder verbSummary = new StringBuilder(); + for (VerbCount vs : verbCounts) + { + { + if (verbSummary.length() > 0) + verbSummary.append(", "); + verbSummary.append(vs.verb); + verbSummary.append(": "); + verbSummary.append(vs.count); + } + } + System.out.printf("%tT verbs: %s\n", date, verbSummary); + } + } + catch (Throwable t) + { + t.printStackTrace(); + } + } + + @Override + protected Logger logger() + { + return logger; + } + + public static void main(String[] args) throws Throwable + { + DistributedTestBase.beforeClass(); + AccordLoadTest.setUp(); + AccordLoadTest test = new AccordLoadTest(); + test.setup(); + test.testLoad(); + } +} diff --git a/test/distributed/org/apache/cassandra/distributed/test/accord/AccordMetricsTest.java b/test/distributed/org/apache/cassandra/distributed/test/accord/AccordMetricsTest.java new file mode 100644 index 000000000000..f0f83cb036ad --- /dev/null +++ b/test/distributed/org/apache/cassandra/distributed/test/accord/AccordMetricsTest.java @@ -0,0 +1,337 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.distributed.test.accord; + +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; +import java.util.concurrent.Executors; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.TimeUnit; +import java.util.function.Function; + +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.db.virtual.AccordDebugKeyspace; +import org.apache.cassandra.distributed.api.ConsistencyLevel; +import org.apache.cassandra.distributed.api.IMessageFilters; +import org.apache.cassandra.distributed.api.Row; +import org.apache.cassandra.distributed.api.SimpleQueryResult; +import org.apache.cassandra.exceptions.ReadTimeoutException; +import org.apache.cassandra.exceptions.WriteTimeoutException; +import org.apache.cassandra.metrics.AccordMetrics; +import org.apache.cassandra.metrics.DefaultNameFactory; +import org.apache.cassandra.metrics.RatioGaugeSet; +import org.apache.cassandra.net.Verb; +import org.apache.cassandra.schema.SchemaConstants; +import org.apache.cassandra.service.accord.AccordService; +import org.apache.cassandra.service.accord.exceptions.AccordReadPreemptedException; +import org.apache.cassandra.service.accord.exceptions.AccordWritePreemptedException; +import org.apache.cassandra.service.consensus.TransactionalMode; +import org.apache.cassandra.utils.AssertionUtils; +import org.assertj.core.api.Assertions; +import org.assertj.core.data.Offset; + +import static java.lang.String.format; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.fail; + + +public class AccordMetricsTest extends AccordTestBase +{ + private static final Logger logger = LoggerFactory.getLogger(AccordMetricsTest.class); + + @Override + protected Logger logger() + { + return logger; + } + + @BeforeClass + public static void setupClass() throws IOException + { + AccordTestBase.setupCluster(Function.identity(), 2); + SHARED_CLUSTER.forEach(node -> node.runOnInstance(() -> AccordService.instance().setCacheSize(0))); + for (int i = 0; i < SHARED_CLUSTER.size(); i++) // initialize metrics + logger.trace(SHARED_CLUSTER.get(i + 1).callOnInstance(() -> AccordMetrics.readMetrics.toString() + AccordMetrics.writeMetrics.toString())); + } + + String writeCql() + { + return "BEGIN TRANSACTION\n" + + " LET val = (SELECT v FROM " + qualifiedAccordTableName + " WHERE k=? AND c=?);\n" + + " SELECT val.v;\n" + + " UPDATE " + qualifiedAccordTableName + " SET v = v + 1 WHERE k=? AND c=?;\n" + + "COMMIT TRANSACTION"; + } + + String readCql() + { + return "BEGIN TRANSACTION\n" + + " LET val1 = (SELECT v FROM " + qualifiedAccordTableName + " WHERE k=? AND c=?);\n" + + " LET val2 = (SELECT v FROM " + qualifiedAccordTableName + " WHERE k=? AND c=?);\n" + + " SELECT val1.v, val2.v;\n" + + "COMMIT TRANSACTION"; + } + + Map> countingMetrics0; + + @Before + public void beforeTest() + { + SHARED_CLUSTER.filters().reset(); + SHARED_CLUSTER.schemaChange("CREATE TABLE " + qualifiedAccordTableName + " (k int, c int, v int, PRIMARY KEY (k, c)) WITH " + TransactionalMode.full.asCqlParam()); + } + + @Test + public void testRegularMetrics() + { + countingMetrics0 = getMetrics(); + assertCoordinatorMetrics(0, "rw", 0, 0, 0, 0, 0); + SHARED_CLUSTER.coordinator(1).executeWithResult(writeCql(), ConsistencyLevel.ALL, 0, 0, 0, 0); + assertCoordinatorMetrics(0, "rw", 1, 0, 0, 0, 0); + assertCoordinatorMetrics(1, "rw", 0, 0, 0, 0, 0); + assertReplicaMetrics(0, "rw", 1, 1, 1); + assertReplicaMetrics(1, "rw", 1, 1, 1); + assertZeroMetrics("ro"); + + countingMetrics0 = getMetrics(); + SHARED_CLUSTER.coordinator(1).executeWithResult(readCql(), ConsistencyLevel.ALL, 0, 0, 1, 1); + assertCoordinatorMetrics(0, "ro", 1, 0, 0, 0, 0); + assertCoordinatorMetrics(1, "ro", 0, 0, 0, 0, 0); + assertReplicaMetrics(0, "ro", 1, 1, 0); + assertReplicaMetrics(1, "ro", 1, 1, 0); + assertZeroMetrics("rw"); + } + + @Test + public void testPreemptionMetrics() + { + ScheduledExecutorService exec = Executors.newScheduledThreadPool(1); + IMessageFilters.Matcher delay = (from, to, m) -> { + exec.schedule(() -> { + System.err.println("Receiving..."); + SHARED_CLUSTER.get(to).receiveMessageWithInvokingThread(m); + }, 10L, TimeUnit.SECONDS); + return true; + }; + IMessageFilters.Filter preacceptDelay = SHARED_CLUSTER.filters().outbound().verbs(Verb.ACCORD_PRE_ACCEPT_REQ.id).from(1).to(1) + .messagesMatching(delay) + .drop(); + + String originalAccordRetryTxnDelay = SHARED_CLUSTER.get(1).callOnInstance(DatabaseDescriptor::getAccordRecoverTxnDelay); + SHARED_CLUSTER.forEach(() -> DatabaseDescriptor.setAccordRecoverTxnDelay("100ms")); + String originalAccordExpireTxnDelay = SHARED_CLUSTER.get(1).callOnInstance(DatabaseDescriptor::getAccordExpireTxnDelay); + SHARED_CLUSTER.forEach(() -> DatabaseDescriptor.setAccordExpireTxnDelay("12s")); + long originalWriteRpcTimeoutMillis = SHARED_CLUSTER.get(1).callOnInstance(() -> DatabaseDescriptor.getWriteRpcTimeout(TimeUnit.MILLISECONDS)); + SHARED_CLUSTER.forEach(() -> DatabaseDescriptor.setWriteRpcTimeout(12_000)); + SHARED_CLUSTER.forEach(() -> DatabaseDescriptor.setReadRpcTimeout(12_000)); + + try + { + countingMetrics0 = getMetrics(); + try + { + SHARED_CLUSTER.coordinator(1).executeWithResult(writeCql(), ConsistencyLevel.ALL, 0, 0, 0, 0); + fail("expected to fail"); + } + catch (RuntimeException ex) + { + Assertions.assertThat(ex).is(AssertionUtils.rootCauseIs(AccordWritePreemptedException.class)); + } + + assertCoordinatorMetrics(0, "rw", 0, 0, 1, 0, 0); + assertCoordinatorMetrics(1, "rw", 0, 0, 0, 0, 0); + assertReplicaMetrics(0, "rw", 0, 0, 0); + assertReplicaMetrics(1, "rw", 0, 0, 0); + + assertZeroMetrics("ro"); + + countingMetrics0 = getMetrics(); + try + { + SHARED_CLUSTER.coordinator(1).executeWithResult(readCql(), ConsistencyLevel.ALL, 0, 0, 1, 1); + fail("expected to fail"); + } + catch (RuntimeException ex) + { + Assertions.assertThat(ex).is(AssertionUtils.rootCauseIs(AccordReadPreemptedException.class)); + } + + assertCoordinatorMetrics(0, "ro", 0, 0, 1, 0, 0); + assertCoordinatorMetrics(1, "ro", 0, 0, 0, 0, 0); + assertReplicaMetrics(0, "ro", 0, 0, 0); + assertReplicaMetrics(1, "ro", 0, 0, 0); + + assertZeroMetrics("rw"); + } + finally + { + SHARED_CLUSTER.forEach(() -> DatabaseDescriptor.setAccordExpireTxnDelay(originalAccordExpireTxnDelay)); + SHARED_CLUSTER.forEach(() -> DatabaseDescriptor.setAccordRecoverTxnDelay(originalAccordRetryTxnDelay)); + SHARED_CLUSTER.forEach(() -> DatabaseDescriptor.setWriteRpcTimeout(originalWriteRpcTimeoutMillis)); + preacceptDelay.off(); + exec.shutdown(); + } + } + + @Test + public void testTimeoutMetrics() + { + IMessageFilters.Filter preAcceptFilter = SHARED_CLUSTER.filters().outbound().verbs(Verb.ACCORD_PRE_ACCEPT_REQ.id).from(1).to(2).drop(); + preAcceptFilter.on(); + + countingMetrics0 = getMetrics(); + try + { + SHARED_CLUSTER.coordinator(1).executeWithResult(readCql(), ConsistencyLevel.ALL, 0, 0, 1, 1); + fail("expected to fail"); + } + catch (RuntimeException ex) + { + Assertions.assertThat(ex).is(AssertionUtils.rootCauseIs(ReadTimeoutException.class)); + } + + assertCoordinatorMetrics(0, "ro", 0, 0, 0, 1, 0); + assertCoordinatorMetrics(1, "ro", 0, 0, 0, 0, 0); + assertReplicaMetrics(0, "ro", 0, 0, 0); + assertReplicaMetrics(1, "ro", 0, 0, 0); + + assertZeroMetrics("rw"); + + countingMetrics0 = getMetrics(); + try + { + SHARED_CLUSTER.coordinator(1).executeWithResult(writeCql(), ConsistencyLevel.ALL, 0, 0, 0, 0); + fail("expected to fail"); + } + catch (RuntimeException ex) + { + Assertions.assertThat(ex).is(AssertionUtils.rootCauseIs(WriteTimeoutException.class)); + } + + assertCoordinatorMetrics(0, "rw", 0, 0, 0, 1, 0); + assertCoordinatorMetrics(1, "rw", 0, 0, 0, 0, 0); + assertReplicaMetrics(0, "rw", 0, 0, 0); + assertReplicaMetrics(1, "rw", 0, 0, 0); + + assertZeroMetrics("ro"); + } + + private void assertZeroMetrics(String scope) + { + for (int i = 0; i < SHARED_CLUSTER.size(); i++) + { + assertCoordinatorMetrics(i, scope, 0, 0, 0, 0, 0); + assertReplicaMetrics(i, scope, 0, 0, 0); + } + } + + private void assertCoordinatorMetrics(int node, String scope, long fastPaths, long slowPaths, long preempts, long timeouts, long recoveries) + { + DefaultNameFactory nameFactory = new DefaultNameFactory(AccordMetrics.ACCORD_COORDINATOR, scope); + Map metrics = diff(countingMetrics0).get(node); + logger.info("Metrics for node {} / {}: {}", node, scope, metrics); + Function metric = n -> metrics.get(nameFactory.createMetricName(n).getMetricName()); + assertThat(metric.apply(AccordMetrics.FAST_PATHS)).isEqualTo(fastPaths); + assertThat(metric.apply(AccordMetrics.SLOW_PATHS)).isEqualTo(slowPaths); + assertThat(metric.apply(AccordMetrics.PREEMPTS)).isEqualTo(preempts); + assertThat(metric.apply(AccordMetrics.TIMEOUTS)).isEqualTo(timeouts); + assertThat(metric.apply(AccordMetrics.RECOVERY_DELAY)).isEqualTo(recoveries); + assertThat(metric.apply(AccordMetrics.RECOVERY_TIME)).isEqualTo(recoveries); + assertThat(metric.apply(AccordMetrics.DEPENDENCIES)).isEqualTo(fastPaths + slowPaths); + + // Verify that coordinator metrics are published to the appropriate virtual table: + SimpleQueryResult res = SHARED_CLUSTER.get(node + 1) + .executeInternalWithResult("SELECT * FROM system_metrics.accord_coordinator_group WHERE scope = ?", scope); + while (res.hasNext()) + { + Row metricRow = res.next(); + String name = metricRow.getString("name"); + assertThat(metrics).containsKey(name); + } + + if ((fastPaths + slowPaths) > 0) + { + String fastPathToTotalName = nameFactory.createMetricName(AccordMetrics.FAST_PATH_TO_TOTAL + "." + RatioGaugeSet.MEAN_RATIO).getMetricName(); + assertThat((double) SHARED_CLUSTER.get(1).metrics().getGauge(fastPathToTotalName)).isEqualTo((double) fastPaths / (double) (fastPaths + slowPaths), Offset.offset(0.01d)); + } + } + + private void assertReplicaMetrics(int node, String scope, long stable, long executions, long applications) + { + DefaultNameFactory nameFactory = new DefaultNameFactory(AccordMetrics.ACCORD_REPLICA, scope); + Map metrics = diff(countingMetrics0).get(node); + Function metric = n -> metrics.get(nameFactory.createMetricName(n).getMetricName()); + assertThat(metric.apply(AccordMetrics.STABLE_LATENCY)).isLessThanOrEqualTo(stable); + assertThat(metric.apply(AccordMetrics.EXECUTE_LATENCY)).isEqualTo(executions); + assertThat(metric.apply(AccordMetrics.APPLY_LATENCY)).isEqualTo(applications); + assertThat(metric.apply(AccordMetrics.APPLY_DURATION)).isEqualTo(applications); + assertThat(metric.apply(AccordMetrics.PARTIAL_DEPENDENCIES)).isEqualTo(executions); + + // Verify that replica metrics are published to the appropriate virtual table: + SimpleQueryResult vtableResults = SHARED_CLUSTER.get(node + 1) + .executeInternalWithResult("SELECT * FROM system_metrics.accord_replica_group WHERE scope = ?", scope); + + while (vtableResults.hasNext()) + { + Row metricRow = vtableResults.next(); + String name = metricRow.getString("name"); + assertThat(metrics).containsKey(name); + } + + // Verify that per-store global cache stats are published to the appropriate virtual table: + SimpleQueryResult storeCacheResults = + SHARED_CLUSTER.get(node + 1).executeInternalWithResult(format("SELECT * FROM %s.%s", SchemaConstants.VIRTUAL_ACCORD_DEBUG, AccordDebugKeyspace.EXECUTOR_CACHE)); + assertThat(storeCacheResults).hasNext(); + } + + private Map> getMetrics() + { + Map> metrics = new HashMap<>(); + for (int i = 0; i < SHARED_CLUSTER.size(); i++) + metrics.put(i, SHARED_CLUSTER.get(i + 1).metrics().getCounters(name -> name.startsWith("org.apache.cassandra.metrics.Accord"))); + return metrics; + } + + private Map> diff(Map> prev) + { + Map> curr = getMetrics(); + Map> diff = new HashMap<>(); + for (int i = 0; i < SHARED_CLUSTER.size(); i++) + { + Map prevNode = prev.get(i); + Map currNode = curr.get(i); + Map diffNode = new HashMap<>(); + for (Map.Entry currEntry : currNode.entrySet()) + { + Long prevVal = prevNode.get(currEntry.getKey()); + if (prevVal != null) + diffNode.put(currEntry.getKey(), currEntry.getValue() - prevVal); + } + diff.put(i, diffNode); + } + return diff; + } +} diff --git a/test/distributed/org/apache/cassandra/distributed/test/accord/AccordMigrationReadRaceTestBase.java b/test/distributed/org/apache/cassandra/distributed/test/accord/AccordMigrationReadRaceTestBase.java new file mode 100644 index 000000000000..a4931c6ab50f --- /dev/null +++ b/test/distributed/org/apache/cassandra/distributed/test/accord/AccordMigrationReadRaceTestBase.java @@ -0,0 +1,646 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.distributed.test.accord; + +import java.io.IOException; +import java.net.InetSocketAddress; +import java.nio.ByteBuffer; +import java.util.ArrayDeque; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Comparator; +import java.util.List; +import java.util.NavigableSet; +import java.util.Queue; +import java.util.Random; +import java.util.Set; +import java.util.TreeSet; +import java.util.concurrent.ConcurrentLinkedQueue; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; +import java.util.function.Consumer; + +import com.google.common.base.Stopwatch; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.Sets; +import org.junit.After; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import accord.primitives.Ranges; +import org.apache.cassandra.ServerTestUtils; +import org.apache.cassandra.Util; +import org.apache.cassandra.config.CassandraRelevantProperties; +import org.apache.cassandra.config.Config.PaxosVariant; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.dht.IPartitioner; +import org.apache.cassandra.dht.Murmur3Partitioner; +import org.apache.cassandra.dht.Murmur3Partitioner.LongToken; +import org.apache.cassandra.dht.Range; +import org.apache.cassandra.dht.Token; +import org.apache.cassandra.distributed.Cluster; +import org.apache.cassandra.distributed.api.ICoordinator; +import org.apache.cassandra.distributed.api.IInstance; +import org.apache.cassandra.distributed.api.IInvokableInstance; +import org.apache.cassandra.distributed.api.IMessage; +import org.apache.cassandra.distributed.api.IMessageSink; +import org.apache.cassandra.distributed.api.SimpleQueryResult; +import org.apache.cassandra.distributed.shared.ClusterUtils; +import org.apache.cassandra.distributed.test.accord.InteropTokenRangeTest.TokenOperator; +import org.apache.cassandra.repair.RepairJobDesc; +import org.apache.cassandra.repair.RepairResult; +import org.apache.cassandra.schema.Schema; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.service.StorageService; +import org.apache.cassandra.service.accord.AccordService; +import org.apache.cassandra.service.accord.TokenRange; +import org.apache.cassandra.service.accord.api.TokenKey; +import org.apache.cassandra.service.consensus.TransactionalMode; +import org.apache.cassandra.service.consensus.migration.ConsensusMigrationRepairResult; +import org.apache.cassandra.service.consensus.migration.ConsensusTableMigration; +import org.apache.cassandra.service.consensus.migration.TransactionalMigrationFromMode; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.tcm.Epoch; +import org.apache.cassandra.utils.FBUtilities; +import org.apache.cassandra.utils.Pair; +import org.apache.cassandra.utils.Throwables.ThrowingRunnable; +import org.assertj.core.api.Assertions; +import org.eclipse.jetty.util.ConcurrentHashSet; + +import static com.google.common.collect.ImmutableList.toImmutableList; +import static java.lang.String.format; +import static org.apache.cassandra.Util.spinAssertEquals; +import static org.apache.cassandra.dht.Murmur3Partitioner.LongToken.keyForToken; +import static org.apache.cassandra.distributed.api.ConsistencyLevel.ALL; +import static org.apache.cassandra.distributed.shared.ClusterUtils.getNextEpoch; +import static org.apache.cassandra.distributed.shared.ClusterUtils.pauseBeforeEnacting; +import static org.apache.cassandra.distributed.shared.ClusterUtils.unpauseEnactment; +import static org.apache.cassandra.distributed.test.accord.InteropTokenRangeTest.TokenOperator.gt; +import static org.apache.cassandra.distributed.test.accord.InteropTokenRangeTest.TokenOperator.gte; +import static org.apache.cassandra.distributed.test.accord.InteropTokenRangeTest.TokenOperator.lt; +import static org.apache.cassandra.distributed.test.accord.InteropTokenRangeTest.TokenOperator.lte; +import static org.apache.cassandra.distributed.util.QueryResultUtil.assertThat; +import static org.apache.cassandra.utils.ByteBufferUtil.bytesToHex; +import static org.junit.Assert.assertEquals; + +/* + * Test that non-transactional read operations migrating to/from a mode where Accord ignores commit consistency levels + * and does aysnc commit are routed correctly. Currently this is just TransactionalMode.full + */ +public abstract class AccordMigrationReadRaceTestBase extends AccordTestBase +{ + private static final Logger logger = LoggerFactory.getLogger(AccordMigrationReadRaceTestBase.class); + private static final int TEST_BOUNDS_CONCURRENCY = 32; + // Set BATCH_INDEX to the failing batch and this to true to find out the query index, then set QUERY_INDEX + private static final boolean EXECUTE_BATCH_QUERIES_SERIALLY = false; + // Specify only a single batch or query to run + private static final Integer BATCH_INDEX = null; + private static final Integer QUERY_INDEX = null; + private static final String TABLE_FMT = "CREATE TABLE %s (pk blob, c int, v int, PRIMARY KEY ((pk), c));"; + + private static IPartitioner partitioner; + + private static Range migratingRange; + + private static ICoordinator coordinator; + + private final static TestMessageSink messageSink = new TestMessageSink(); + private static class TestMessageSink implements IMessageSink + { + private final Queue> messages = new ConcurrentLinkedQueue<>(); + private final Set blackholed = new ConcurrentHashSet<>(); + + public void reset() + { + messages.clear(); + blackholed.clear(); + } + + @Override + public void accept(InetSocketAddress to, IMessage message) { + messages.offer(Pair.create(to,message)); + IInstance i = SHARED_CLUSTER.get(to); + if (blackholed.contains(to) || blackholed.contains(message.from())) + return; + if (i != null) + i.receiveMessage(message); + } + } + + private final boolean migrateAwayFromAccord; + + protected AccordMigrationReadRaceTestBase() + { + this.migrateAwayFromAccord = migratingAwayFromAccord(); + } + + protected abstract boolean migratingAwayFromAccord(); + + @Override + protected Logger logger() + { + return logger; + } + + @BeforeClass + public static void setupClass() throws IOException + { + ServerTestUtils.daemonInitialization(); + // Otherwise repair complains if you don't specify a keyspace + CassandraRelevantProperties.SYSTEM_TRACES_DEFAULT_RF.setInt(3); + AccordTestBase.setupCluster(builder -> builder.appendConfig(config -> config.set("paxos_variant", PaxosVariant.v2.name()) + .set("read_request_timeout", "10s") + .set("range_request_timeout", "10s") + .set("write_request_timeout", "10s") + .set("accord.range_migration", "explicit")), 3); + partitioner = FBUtilities.newPartitioner(SHARED_CLUSTER.get(1).callsOnInstance(() -> DatabaseDescriptor.getPartitioner().getClass().getSimpleName()).call()); + StorageService.instance.setPartitionerUnsafe(partitioner); + ServerTestUtils.prepareServerNoRegister(); + LongToken migrationStart = new LongToken(Long.valueOf(SHARED_CLUSTER.get(2).callOnInstance(() -> DatabaseDescriptor.getInitialTokens().iterator().next()))); + LongToken migrationEnd = new LongToken(Long.valueOf(SHARED_CLUSTER.get(3).callOnInstance(() -> DatabaseDescriptor.getInitialTokens().iterator().next()))); + migratingRange = new Range<>(migrationStart, migrationEnd); + coordinator = SHARED_CLUSTER.coordinator(1); + SHARED_CLUSTER.setMessageSink(messageSink); + buildData(); + } + + private static final int NUM_PARTITIONS = 1000; + private static final int ROWS_PER_PARTITION = 10; + private static final Object[][][] data = new Object[NUM_PARTITIONS][][]; + private static final Object[][] dataFlat = new Object[NUM_PARTITIONS * ROWS_PER_PARTITION][]; + private static ByteBuffer pkeyAccord; + private static int pkeyAccordDataIndex; + + private static void buildData() + { + Random r = new Random(0); + long[] tokens = new long[NUM_PARTITIONS]; + for (int i = 0; i < tokens.length; i++) + tokens[i] = r.nextLong(); + Arrays.sort(tokens); + + for (int i = 0; i < NUM_PARTITIONS; i++) + { + data[i] = new Object[ROWS_PER_PARTITION][]; + ByteBuffer pk = keyForToken(tokens[i]); + for (int j = 0; j < ROWS_PER_PARTITION; j++) + { + int clustering = r.nextInt(); + data[i][j] = new Object[] { pk, clustering, 42 }; + } + Arrays.sort(data[i], Comparator.comparing(row -> (Integer)row[1])); + } + for (int i = 0; i < NUM_PARTITIONS; i++) + { + for (int j = 0; j < ROWS_PER_PARTITION; j++) + { + int idx = i * ROWS_PER_PARTITION + j; + dataFlat[idx] = new Object[] { data[i][j][0], data[i][j][1], data[i][j][2] }; + if (migratingRange.contains(Murmur3Partitioner.instance.getToken((ByteBuffer)data[i][j][0]))) + { + pkeyAccord = (ByteBuffer)data[i][j][0]; + pkeyAccordDataIndex = i; + } + } + } + } + + @AfterClass + public static void tearDownClass() + { + StorageService.instance.resetPartitionerUnsafe(); + } + + @After + public void tearDown() throws Exception + { + super.tearDown(); + messageSink.reset(); + SHARED_CLUSTER.forEach(ClusterUtils::clearAndUnpause); + super.tearDown(); + } + + private void loadData() throws Exception + { + logger.info("Starting data load"); + Stopwatch sw = Stopwatch.createStarted(); + List> inserts = new ArrayList<>(); + for (int i = 0; i < NUM_PARTITIONS; i++) + { + for (int j = 0; j < ROWS_PER_PARTITION; j++) + inserts.add(coordinator.asyncExecuteWithResult(insertCQL(qualifiedAccordTableName, (ByteBuffer)data[i][j][0], (int)data[i][j][1], (int)data[i][j][2]), ALL)); + + if (i % 100 == 0) + { + for (java.util.concurrent.Future insert : inserts) + insert.get(); + inserts.clear(); + } + } + logger.info("Data load took %dms", sw.elapsed(TimeUnit.MILLISECONDS)); + } + + private NavigableSet boundsTokens() + { + long migratingRangeStart = migratingRange.left.getLongValue(); + long migratingRangeEnd = migratingRange.right.getLongValue(); + NavigableSet set = new TreeSet<>(); + set.add(migratingRangeStart - 1); + set.add(migratingRangeStart); + set.add(migratingRangeStart + 1); + set.add(migratingRangeEnd - 1); + set.add(migratingRangeEnd); + set.add(migratingRangeEnd + 1); + set.add(Long.MAX_VALUE); + set.add(Long.MIN_VALUE + 1); + set.add(0L); + return set; + } + + private void loadOverlapData() + { + for (long token : boundsTokens()) + coordinator.executeWithResult(insertCQL(qualifiedAccordTableName, keyForToken(token), 42, 43), ALL); + } + + @Test + public void testKeyRouting() throws Throwable + { + String readCQL = "SELECT * FROM " + qualifiedAccordTableName + " WHERE pk = 0x" + bytesToHex(pkeyAccord); + testSplitAndRetry(readCQL, this::loadData, result -> assertThat(result).isDeepEqualTo(data[pkeyAccordDataIndex])); + } + + @Test + public void testRangeRouting() throws Throwable + { + String cql = "SELECT * FROM " + qualifiedAccordTableName + " WHERE token(pk) > " + Murmur3Partitioner.MINIMUM.token; + testSplitAndRetry(cql, this::loadData, result -> { + assertThat(result).isDeepEqualTo(dataFlat); + }); + } + + @Test + public void testBounds() throws Throwable + { + NavigableSet tokens = boundsTokens(); + Queue queries = new ArrayDeque<>(); + Queue> validations = new ArrayDeque<>(); + Queue retryExpectedQueries = new ArrayDeque<>(); + Queue> retryExpectedValidations = new ArrayDeque<>(); + for (long firstToken : tokens) + { + ByteBuffer pk = keyForToken(firstToken); + for (TokenOperator op : TokenOperator.values()) + { + String cql = "SELECT * FROM %s WHERE " + op.condition; + cql = cql.replace("?", "0x" + bytesToHex(pk)); + NavigableSet expectedTokens = op.expected(firstToken, tokens); + boolean expectRetry = op.intersects(firstToken, migratingRange); + Consumer validation = result -> { + Assertions.assertThat(InteropTokenRangeTest.tokens(result)) + .describedAs("Token %d with operator %s", firstToken, op.condition) + .isEqualTo(expectedTokens); + }; + if (expectRetry) + { + retryExpectedQueries.add(cql); + retryExpectedValidations.add(validation); + } + else + { + queries.add(cql); + validations.add(validation); + } + } + + for (long secondToken : tokens) + { + for (TokenOperator lt : Arrays.asList(lt, lte)) + { + for (TokenOperator gt : Arrays.asList(gt, gte)) + { + ByteBuffer gtPk = keyForToken(secondToken); + String cql = "SELECT * FROM %s WHERE " + lt.condition + " AND " + gt.condition; + cql = cql.replaceFirst("\\?", "0x" + bytesToHex(pk)); + cql = cql.replaceFirst("\\?", "0x" + bytesToHex(gtPk)); + NavigableSet expectedTokens = new TreeSet<>(Sets.intersection(lt.expected(firstToken, tokens), gt.expected(secondToken, tokens))); + Consumer validation = result -> { + Assertions.assertThat(InteropTokenRangeTest.tokens(result)) + .describedAs("LT Token %d GT Token %d with operators %s / %s", firstToken, secondToken, lt.condition, gt.condition) + .isEqualTo(expectedTokens); + }; + boolean expectRetry = lt.intersects(firstToken, migratingRange) && gt.intersects(secondToken, migratingRange); + // This evaluates to no rows without actually executing + if (firstToken == secondToken && (lt == TokenOperator.lt || gt == TokenOperator.gt)) + expectRetry = false; + if (firstToken < secondToken) + expectRetry = false; + if (expectRetry) + { + retryExpectedQueries.add(cql); + retryExpectedValidations.add(validation); + } + else + { + queries.add(cql); + validations.add(validation); + } + } + } + + ByteBuffer rhsPK = keyForToken(secondToken); + String cql = "SELECT * FROM %s WHERE token(pk) BETWEEN token(?) AND token(?)"; + cql = cql.replaceFirst("\\?", "0x" + bytesToHex(pk)); + cql = cql.replaceFirst("\\?", "0x" + bytesToHex(rhsPK)); + NavigableSet expectedTokens = new TreeSet<>(Sets.intersection(gte.expected(firstToken, tokens), lte.expected(secondToken, tokens))); + Consumer validation = result -> { + Assertions.assertThat(InteropTokenRangeTest.tokens(result)) + .describedAs("Between token %d and %d with operator token(pk) BETWEEN token(?) AND token(?)", firstToken, secondToken) + .isEqualTo(expectedTokens); + }; + // Cassandra straight up returns the wrong answer here so until it is fixed skip it + // https://issues.apache.org/jira/browse/CASSANDRA-20154 + if (firstToken > secondToken) + continue; + boolean expectRetry = gte.intersects(firstToken, migratingRange) && lte.intersects(secondToken, migratingRange); + if (expectRetry) + { + retryExpectedQueries.add(cql); + retryExpectedValidations.add(validation); + } + else + { + queries.add(cql); + validations.add(validation); + } + } + } + + testBoundsBatches(queries, validations, false); + testBoundsBatches(retryExpectedQueries, retryExpectedValidations, true); + } + + private void testBoundsBatches(Queue queries, Queue> validations, boolean expectRetry) throws Throwable + { + List queryBatch = new ArrayList<>(); + List> validationBatch = new ArrayList<>(); + int batchCount = 0; + while (!queries.isEmpty()) + { + queryBatch.add(queries.poll()); + validationBatch.add(validations.poll()); + if (queryBatch.size() == TEST_BOUNDS_CONCURRENCY) + { + if (BATCH_INDEX == null || BATCH_INDEX == batchCount) + { + logger.info("Executing batch {}", batchCount); + testBoundsBatch(queryBatch, validationBatch, expectRetry, batchCount); + } + else + { + logger.info("Skipping batch {}", batchCount); + } + batchCount++; + queryBatch.clear(); + validationBatch.clear(); + } + } + + if (!queryBatch.isEmpty()) + { + logger.info("Executing batch " + batchCount); + testBoundsBatch(queryBatch, validationBatch, expectRetry, batchCount); + } + } + + private void testBoundsBatch(List readCQL, List> validation, boolean expectRetry, int batchCount) throws Throwable + { + if (EXECUTE_BATCH_QUERIES_SERIALLY) + { + for (int i = 0; i < readCQL.size(); i++) + { + if (QUERY_INDEX == null || QUERY_INDEX == i) + { + logger.info("Executing query from batch {} query index {}", batchCount, i); + String cql = format(readCQL.get(i), qualifiedAccordTableName); + testSplitAndRetry(ImmutableList.of(cql), this::loadOverlapData, ImmutableList.of(validation.get(i)), expectRetry); + tearDown(); + setup(); + afterEach(); + } + else + { + logger.info("Skipping query from batch {} query index {}", batchCount, i); + } + } + } + else + { + readCQL = readCQL.stream().map(cql -> format(cql, qualifiedAccordTableName)).collect(toImmutableList()); + testSplitAndRetry(readCQL, this::loadOverlapData, validation, expectRetry); + tearDown(); + setup(); + afterEach(); + } + } + + private void testSplitAndRetry(String readCQL, ThrowingRunnable load, Consumer validation) throws Throwable + { + testSplitAndRetry(ImmutableList.of(readCQL), load, ImmutableList.of(validation),true); + } + + private void testSplitAndRetry(List readCQL, ThrowingRunnable load, List> validation, boolean expectRetry) throws Throwable + { + test(createTables(TABLE_FMT, qualifiedAccordTableName), + cluster -> { + load.run(); + // Node 3 is always the out of sync node + IInvokableInstance outOfSyncInstance = setUpOutOfSyncNode(cluster); + ICoordinator coordinator = outOfSyncInstance.coordinator(); + int startMigrationRejectCount = getAccordReadMigrationRejects(3); + int startRetryCount = getReadRetryOnDifferentSystemCount(outOfSyncInstance); + int startRejectedCount = getReadsRejectedOnWrongSystemCount(); + logger.info("Executing reads " + readCQL + " expect retry " + expectRetry); + List> results = readCQL.stream() + .map(read -> coordinator.asyncExecuteWithResult(read, ALL)) + .collect(toImmutableList()); + + if (migrateAwayFromAccord && expectRetry) + { + int expectedTransactions = readCQL.size(); + // Accord will block until we unpause enactment so to test the routing we wait until the transaction + // has started so the epoch it is created in is the old one + Util.spinUntilTrue(() -> outOfSyncInstance.callOnInstance(() -> { + logger.info("Coordinating {}", AccordService.instance().node().coordinating()); + return AccordService.instance().node().coordinating().size() == expectedTransactions; + })); + + logger.info("Accord node is now coordinating something, unpausing so it can continue to execute"); + } + + if (!migrateAwayFromAccord && expectRetry) + spinAssertEquals(readCQL.size() * 2, 10, () -> getReadsRejectedOnWrongSystemCount() - startRejectedCount); + + // Accord can't finish the transaction without unpausing + if (expectRetry || migrateAwayFromAccord) + { + logger.info("Unpausing out of sync instance before waiting on result"); + // Testing read coordination retry loop let coordinator get up to date and retry + unpauseEnactment(outOfSyncInstance); + } + + try + { + for (int i = 0; i < results.size(); i++) + { + SimpleQueryResult result = results.get(i).get(); + logger.info("Result for: " + readCQL.get(i)); + logger.info(result.toString()); + try + { + validation.get(i).accept(result); + } + catch (Throwable t) + { + logger.info("Query index {} failed", i); + throw t; + } + } + } + catch (ExecutionException e) + { + throw e; + } + + if (!expectRetry) + { + logger.info("Unpausing out of sync instance after waiting on result"); + // Testing read coordination retry loop let coordinator get up to date and retry + unpauseEnactment(outOfSyncInstance); + } + + int endRetryCount = getReadRetryOnDifferentSystemCount(outOfSyncInstance); + int endRejectedCount = getReadsRejectedOnWrongSystemCount(); + int endMigrationRejects = getAccordReadMigrationRejects(3); + if (expectRetry) + { + if (migrateAwayFromAccord) + { + assertEquals(readCQL.size(), endRetryCount - startRetryCount); + assertEquals(readCQL.size(), endMigrationRejects - startMigrationRejectCount); + } + else + { + assertEquals(1 * readCQL.size(), endRetryCount - startRetryCount); + // Expect only two nodes to reject since they enacted the new epoch + assertEquals(2 * readCQL.size(), endRejectedCount - startRejectedCount); + } + } + else + { + assertEquals(0, endRetryCount - startRetryCount); + assertEquals(0, endRejectedCount - startRejectedCount); + } + }); + } + + /* + * Set up 3 to be behind and unaware of the migration having progressed to the point where reads need to + * be on a different system while 1 and 2 are aware + */ + private IInvokableInstance setUpOutOfSyncNode(Cluster cluster) throws Throwable + { + IInvokableInstance i1 = cluster.get(1); + IInvokableInstance i2 = cluster.get(2); + IInvokableInstance i3 = cluster.get(3); + + long afterAlter = getNextEpoch(i1).getEpoch(); + logger.info("Epoch after alter {}", afterAlter); + if (migrateAwayFromAccord) + alterTableTransactionalMode(TransactionalMode.off, TransactionalMigrationFromMode.full); + else + alterTableTransactionalMode(TransactionalMode.full); + Util.spinUntilTrue(() -> cluster.stream().allMatch(instance -> instance.callOnInstance(() -> ClusterMetadata.current().epoch.equals(Epoch.create(afterAlter)))), 10); + + long afterMigrationStart = getNextEpoch(i1).getEpoch(); + logger.info("Epoch after migration start {}", afterMigrationStart); + long waitFori1Andi2ToEnact = afterMigrationStart; + // Migrating away from Accord need i3 to pause before enacting + if (migrateAwayFromAccord) + pauseBeforeEnacting(i3, Epoch.create(afterMigrationStart)); + // Reads are allowed until Accord thinks it owns the range and can start doing async commit and ignoring consistency levels + nodetool(coordinator, "consensus_admin", "begin-migration", "-st", migratingRange.left.toString(), "-et", migratingRange.right.toString(), KEYSPACE, accordTableName); + + if (!migrateAwayFromAccord) + { + // Migration to Accord does not have Accord read until the migration has completed a data repair and then an Accord repair + Util.spinUntilTrue(() -> cluster.stream().allMatch(instance -> instance.callOnInstance(() -> ClusterMetadata.current().epoch.equals(Epoch.create(afterMigrationStart)))), 10); + + long afterRepair = getNextEpoch(i1).getEpoch(); + logger.info("Epoch after repair {}", afterRepair); + // First repair only does the data and allows Accord to read, but doesn't require reads to be done through Accord + nodetool(i2, "repair", "-skip-paxos", "-skip-accord", "-st", migratingRange.left.toString(), "-et", migratingRange.right.toString(), KEYSPACE, accordTableName); + Util.spinUntilTrue(() -> cluster.stream().allMatch(instance -> instance.callOnInstance(() -> ClusterMetadata.current().epoch.equals(Epoch.create(afterRepair)))), 10); + + long afterRepairCompletionHandler = getNextEpoch(i1).getEpoch(); + logger.info("Epoch after repair completion handler {}", afterRepairCompletionHandler); + waitFori1Andi2ToEnact = afterRepairCompletionHandler; + // Node 3 will coordinate the query and not be aware that the migration has begun + pauseBeforeEnacting(i3, Epoch.create(afterRepairCompletionHandler)); + + // Unfortunately can't run real repair because it can't complete with i3 not responding because it's stuck waiting + // on TCM so fake the completion of the repair by invoking the completion handler directly + String keyspace = KEYSPACE; + String table = accordTableName; + long migratingTokenStart = migratingRange.left.getLongValue(); + long migratingTokenEnd = migratingRange.right.getLongValue(); + Future result = SHARED_CLUSTER.get(1).asyncRunsOnInstance(() -> + { + Epoch startEpoch = ClusterMetadata.current().epoch; + TableId tableId = Schema.instance.getTableMetadata(keyspace, table).id; + List> ranges = ImmutableList.of(new Range<>(new LongToken(migratingTokenStart), new LongToken(migratingTokenEnd))); + RepairJobDesc desc = new RepairJobDesc(null, null, keyspace, table, ranges); + TokenRange range = TokenRange.create(new TokenKey(tableId, new LongToken(migratingTokenStart)), new TokenKey(tableId, new LongToken(migratingTokenEnd))); + Ranges accordRanges = Ranges.of(range); + ConsensusMigrationRepairResult repairResult = ConsensusMigrationRepairResult.fromRepair(startEpoch, accordRanges, true, true, true, false); + ConsensusTableMigration.completedRepairJobHandler.onSuccess(new RepairResult(desc, null, repairResult)); + }).call(); + result.get(); + } + + long waitFori1Andi2ToEnactFinal = waitFori1Andi2ToEnact; + // Make sure 1 and 2 are up to date + for (int i = 1; i < 3; i++) + { + int instanceIndex = i; + Util.spinUntilTrue(() -> cluster.get(instanceIndex).callOnInstance(() -> ClusterMetadata.current().epoch.equals(Epoch.create(waitFori1Andi2ToEnactFinal))), 10); + } + + return i3; + } + + private static String insertCQL(String qualifiedTableName, ByteBuffer pkey, int clustering, int value) + { + return format("INSERT INTO %s ( pk, c, v ) VALUES ( 0x%s, %d, %d )", qualifiedTableName, bytesToHex(pkey), clustering, value); + } +} diff --git a/test/distributed/org/apache/cassandra/distributed/test/accord/AccordMigrationTest.java b/test/distributed/org/apache/cassandra/distributed/test/accord/AccordMigrationTest.java new file mode 100644 index 000000000000..6dbfdca2aa0a --- /dev/null +++ b/test/distributed/org/apache/cassandra/distributed/test/accord/AccordMigrationTest.java @@ -0,0 +1,840 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.distributed.test.accord; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.UUID; +import java.util.function.Consumer; +import java.util.function.Function; +import java.util.stream.Collectors; +import javax.annotation.Nonnull; + +import com.google.common.collect.ImmutableList; +import org.junit.After; +import org.junit.AfterClass; +import org.junit.Assert; +import org.junit.BeforeClass; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.core.type.TypeReference; +import org.apache.cassandra.ServerTestUtils; +import org.apache.cassandra.config.CassandraRelevantProperties; +import org.apache.cassandra.config.Config.PaxosVariant; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.cql3.QueryProcessor; +import org.apache.cassandra.cql3.UntypedResultSet; +import org.apache.cassandra.db.BufferDecoratedKey; +import org.apache.cassandra.db.ColumnFamilyStore; +import org.apache.cassandra.db.ConsistencyLevel; +import org.apache.cassandra.db.DecoratedKey; +import org.apache.cassandra.db.Mutation; +import org.apache.cassandra.db.Mutation.SimpleBuilder; +import org.apache.cassandra.db.SimpleBuilders.PartitionUpdateBuilder; +import org.apache.cassandra.db.virtual.AccordDebugKeyspace; +import org.apache.cassandra.dht.IPartitioner; +import org.apache.cassandra.dht.Murmur3Partitioner.LongToken; +import org.apache.cassandra.dht.NormalizedRanges; +import org.apache.cassandra.dht.Range; +import org.apache.cassandra.dht.Token; +import org.apache.cassandra.distributed.api.Feature; +import org.apache.cassandra.distributed.api.ICoordinator; +import org.apache.cassandra.distributed.api.IInvokableInstance; +import org.apache.cassandra.distributed.api.IMessageFilters.Filter; +import org.apache.cassandra.distributed.api.Row; +import org.apache.cassandra.distributed.api.SimpleQueryResult; +import org.apache.cassandra.gms.EndpointState; +import org.apache.cassandra.gms.Gossiper; +import org.apache.cassandra.locator.InetAddressAndPort; +import org.apache.cassandra.schema.Schema; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.service.StorageService; +import org.apache.cassandra.service.consensus.TransactionalMode; +import org.apache.cassandra.service.consensus.migration.ConsensusKeyMigrationState; +import org.apache.cassandra.service.consensus.migration.ConsensusMigratedAt; +import org.apache.cassandra.service.consensus.migration.ConsensusMigrationState; +import org.apache.cassandra.service.consensus.migration.ConsensusMigrationTarget; +import org.apache.cassandra.service.consensus.migration.ConsensusRequestRouter; +import org.apache.cassandra.service.consensus.migration.TableMigrationState; +import org.apache.cassandra.service.consensus.migration.TransactionalMigrationFromMode; +import org.apache.cassandra.service.paxos.Ballot; +import org.apache.cassandra.service.paxos.Ballot.Flag; +import org.apache.cassandra.service.paxos.BallotGenerator; +import org.apache.cassandra.service.paxos.Commit.Agreed; +import org.apache.cassandra.service.paxos.Commit.Proposal; +import org.apache.cassandra.service.paxos.PaxosState; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.tcm.Epoch; +import org.apache.cassandra.transport.Dispatcher; +import org.apache.cassandra.utils.ByteArrayUtil; +import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.FBUtilities; +import org.apache.cassandra.utils.JsonUtils; +import org.apache.cassandra.utils.Pair; +import org.apache.cassandra.utils.PojoToString; +import org.yaml.snakeyaml.Yaml; + +import static com.google.common.collect.ImmutableList.toImmutableList; +import static java.lang.String.format; +import static java.util.Collections.emptyList; +import static org.apache.cassandra.Util.dk; +import static org.apache.cassandra.Util.spinUntilSuccess; +import static org.apache.cassandra.db.SystemKeyspace.CONSENSUS_MIGRATION_STATE; +import static org.apache.cassandra.db.SystemKeyspace.PAXOS; +import static org.apache.cassandra.dht.NormalizedRanges.normalizedRanges; +import static org.apache.cassandra.dht.Range.normalize; +import static org.apache.cassandra.distributed.api.ConsistencyLevel.ALL; +import static org.apache.cassandra.distributed.api.ConsistencyLevel.ANY; +import static org.apache.cassandra.distributed.api.ConsistencyLevel.SERIAL; +import static org.apache.cassandra.schema.SchemaConstants.SYSTEM_KEYSPACE_NAME; +import static org.apache.cassandra.schema.SchemaConstants.VIRTUAL_ACCORD_DEBUG; +import static org.apache.cassandra.service.consensus.migration.ConsensusRequestRouter.ConsensusRoutingDecision.paxosV2; +import static org.apache.cassandra.service.paxos.PaxosState.MaybePromise.Outcome.PROMISE; +import static org.assertj.core.api.Fail.fail; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; + +/* + * This test suite is intended to serve as an integration test with some pretty good visibility into actual execution + * that can run quickly, and make sure all the right steps are running during migration. + * + * For correctness related to wrong/right answers we rely on simulator to validate. + */ +public class AccordMigrationTest extends AccordTestBase +{ + private static final Logger logger = LoggerFactory.getLogger(AccordMigrationTest.class); + + private static final int CLUSTERING_VALUE = 2; + + private static final String TABLE_FMT = "CREATE TABLE %s (id int, c int, v int, s int static, PRIMARY KEY ((id), c));"; + + private static final String CAS_FMT = "UPDATE %s SET v = 4 WHERE id = ? AND c = %d IF v = 42"; + + private static IPartitioner partitioner; + + private static Token minToken; + + private static Token maxToken; + + private static Token midToken; + + private static Token upperMidToken; + + private static Token lowerMidToken; + + private static ICoordinator coordinator; + + // To create a precise repair where the repaired range is fully contained in a locally replicated range + // we need to align with this token. The local ranges are (9223372036854775805,-1] and (-1,9223372036854775805] + // No idea why the partitioner creates such an + private final Token maxAlignedWithLocalRanges = new LongToken(9223372036854775805L); + + @Override + protected Logger logger() + { + return logger; + } + + @BeforeClass + public static void setupClass() throws IOException + { + ServerTestUtils.daemonInitialization(); + // Otherwise repair complains if you don't specify a keyspace + CassandraRelevantProperties.SYSTEM_TRACES_DEFAULT_RF.setInt(3); + AccordTestBase.setupCluster(builder -> builder.appendConfig(config -> config.with(Feature.NETWORK).set("paxos_variant", PaxosVariant.v2.name()) + .set("accord.range_migration", "explicit")), 3); + partitioner = FBUtilities.newPartitioner(SHARED_CLUSTER.get(1).callsOnInstance(() -> DatabaseDescriptor.getPartitioner().getClass().getSimpleName()).call()); + StorageService.instance.setPartitionerUnsafe(partitioner); + ServerTestUtils.prepareServerNoRegister(); + minToken = partitioner.getMinimumToken(); + maxToken = partitioner.getMaximumTokenForSplitting(); + midToken = partitioner.midpoint(minToken, maxToken); + upperMidToken = partitioner.midpoint(midToken, maxToken); + lowerMidToken = partitioner.midpoint(minToken, midToken); + coordinator = SHARED_CLUSTER.coordinator(1); + } + + @AfterClass + public static void tearDownClass() + { + StorageService.instance.resetPartitionerUnsafe(); + } + + @After + public void tearDown() throws Exception + { + super.tearDown(); + // Reset migration state + forEach(() -> { + ConsensusRequestRouter.resetInstance(); + ConsensusKeyMigrationState.reset(); + }); + SHARED_CLUSTER.coordinators().forEach(coordinator -> coordinator.execute(format("TRUNCATE TABLE %s.%s", SYSTEM_KEYSPACE_NAME, CONSENSUS_MIGRATION_STATE), ALL)); + SHARED_CLUSTER.coordinators().forEach(coordinator -> coordinator.execute(format("TRUNCATE TABLE %s.%s", SYSTEM_KEYSPACE_NAME, PAXOS), ALL)); + } + + private static int getKeyBetweenTokens(Token left, Token right) + { + return getKeysBetweenTokens(left, right).next(); + } + + private static Iterator getKeysBetweenTokens(Token left, Token right) + { + return new Iterator() + { + int candidate = 0; + @Override + public boolean hasNext() + { + return true; + } + + @Override + public Integer next() + { + for (int i = 0; i < 1_000_000; i++) + { + int value = candidate; + candidate++; + if (partitioner.getToken(ByteBufferUtil.bytes(value)).compareTo(right) < 0 && partitioner.getToken(ByteBufferUtil.bytes(value)).compareTo(left) > 0) + return value; + } + throw new IllegalStateException("Gave up after 1 million attempts"); + } + }; + } + + /* + * Force routing a request to Paxos even after a range has been marked migrating to simulate + * a race between updating cluster metadata and making a routing decision to a specific consensus + * protocol. Paxos should still detect the routing change at two points. After running the promise phase + * (round of messaging might discover a new epoch) and during the accept phase (might not get a majority due + * to rejects caused by acceptors refusing due to migration). + * + * This is used directly to test that begin rejects after discovering a migration, and indirectly in + * PaxosToAccordMigrationNotHappeningUpToAccept. + */ + public static class RoutesToPaxosOnce extends ConsensusRequestRouter + { + boolean routed; + + @Override + protected ConsensusRoutingDecision routeAndMaybeMigrate(ClusterMetadata cm, @Nonnull TableMetadata tmd, @Nonnull DecoratedKey key, ConsistencyLevel consistencyLevel, Dispatcher.RequestTime requestTime, long timeoutNanos, boolean isForWrite) + { + if (routed) + return super.routeAndMaybeMigrate(cm, tmd, key, consistencyLevel, requestTime, timeoutNanos, isForWrite); + routed = true; + return paxosV2; + } + } + + /* + * To allow for testing of Paxos we want to force begin to succeed, but accept to fail + * with a retry on new protocol reject. + */ + public static class PaxosToAccordMigrationNotHappeningUpToBegin extends RoutesToPaxosOnce + { + @Override + public boolean isKeyInMigratingOrMigratedRangeDuringPaxosBegin(TableId tableId, DecoratedKey key) + { + return false; + } + } + + public static class PaxosToAccordMigrationNotHappeningUpToAccept extends PaxosToAccordMigrationNotHappeningUpToBegin + { + @Override + public boolean isKeyInMigratingOrMigratedRangeDuringPaxosAccept(TableId tableId, DecoratedKey key) + { + return false; + } + } + + public static class RoutesToAccordOnce extends ConsensusRequestRouter + { + boolean routed; + + @Override + protected ConsensusRoutingDecision routeAndMaybeMigrate(ClusterMetadata cm, @Nonnull TableMetadata tmd, @Nonnull DecoratedKey key, ConsistencyLevel consistencyLevel, Dispatcher.RequestTime requestTime, long timeoutNanos, boolean isForWrite) + { + if (routed) + return super.routeAndMaybeMigrate(cm, tmd, key, consistencyLevel, requestTime, timeoutNanos, isForWrite); + routed = true; + return ConsensusRoutingDecision.accord; + } + } + + /* + * Helper to invoke a query and assert that the right metrics change indicating the correct + * paths were taken to execute the query during migration + */ + private static void assertTargetAccordWrite(Consumer query, int coordinatorIndex, int key, List> expectedKeyMigrations, int expectedAccordWriteCount, int expectedCasWriteCount, int expectedKeyMigrationCount, int expectedCasBeginRejects, int expectedCasAcceptRejects) + { + int startingWriteCount = getAccordWriteCount(coordinatorIndex); + int startingCasWriteCount = getCasWriteCount(coordinatorIndex); + int startingKeyMigrationCount = getKeyMigrationCount(coordinatorIndex); + int startingCasWriteBeginRejects = getCasWriteBeginRejects(coordinatorIndex); + int startingCasWriteAcceptRejects = getCasWriteAcceptRejects(coordinatorIndex); + query.accept(key); + validateKeyMigrations(expectedKeyMigrations); + assertEquals("Accord writes", expectedAccordWriteCount, getAccordWriteCount(coordinatorIndex) - startingWriteCount); + assertEquals("CAS writes", expectedCasWriteCount, getCasWriteCount(coordinatorIndex) - startingCasWriteCount); + assertEquals("Key Migrations", expectedKeyMigrationCount, getKeyMigrationCount(coordinatorIndex) - startingKeyMigrationCount); + assertEquals("CAS Begin rejects", expectedCasBeginRejects, getCasWriteBeginRejects(coordinatorIndex) - startingCasWriteBeginRejects); + assertEquals("CAS Accept rejects", expectedCasAcceptRejects, getCasWriteAcceptRejects(coordinatorIndex) - startingCasWriteAcceptRejects); + } + + private static Object[][] assertTargetAccordRead(Function query, int coordinatorIndex, int key, List> expectedKeyMigrations, int expectedAccordReadCount, int expectedCasPrepareCount, int expectedKeyMigrationCount, int expectedCasReadBeginRejects, int expectedCasReadAcceptRejects) + { + int startingReadCount = getAccordReadCount(coordinatorIndex); + int startingCasPrepareCount = getCasPrepareCount(coordinatorIndex); + int startingKeyMigrationCount = getKeyMigrationCount(coordinatorIndex); + int startingCasReadBeginRejects = getCasReadBeginRejects(coordinatorIndex); + int startingCasReadAcceptRejects = getCasReadAcceptRejects(coordinatorIndex); + Object[][] result = query.apply(key); + validateKeyMigrations(expectedKeyMigrations); + assertEquals("Accord reads", expectedAccordReadCount, getAccordReadCount(coordinatorIndex) - startingReadCount); + assertEquals("CAS prepares", expectedCasPrepareCount, getCasPrepareCount(coordinatorIndex) - startingCasPrepareCount); + assertEquals("Key Migrations", expectedKeyMigrationCount, getKeyMigrationCount(coordinatorIndex) - startingKeyMigrationCount); + assertEquals("CAS Begin rejects", expectedCasReadBeginRejects, getCasReadBeginRejects(coordinatorIndex) - startingCasReadBeginRejects); + assertEquals("CAS Accept rejects", expectedCasReadAcceptRejects, getCasReadAcceptRejects(coordinatorIndex) - startingCasReadAcceptRejects); + return result; + } + + private static void assertTargetPaxosWrite(Consumer query, int coordinatorIndex, int key, List> expectedKeyMigrations, int expectedAccordWriteCount, int expectedCasWriteCount, int expectedKeyMigrationCount, int expectedMigrationRejects, int expectedSkippedReads) + { + int startingWriteCount = getAccordWriteCount(coordinatorIndex); + int startingCasWriteCount = getCasWriteCount(coordinatorIndex); + int startingKeyMigrationCount = getKeyMigrationCount(coordinatorIndex); + int startingMigrationRejectsCount = getAccordWriteMigrationRejects(coordinatorIndex); + int startingSkippedReadsCount = getAccordMigrationSkippedReads(); + query.accept(key); + validateKeyMigrations(expectedKeyMigrations); + assertEquals("Accord writes", expectedAccordWriteCount, getAccordWriteCount(coordinatorIndex) - startingWriteCount); + assertEquals("CAS writes", expectedCasWriteCount, getCasWriteCount(coordinatorIndex) - startingCasWriteCount); + assertEquals("Key Migrations", expectedKeyMigrationCount, getKeyMigrationCount(coordinatorIndex) - startingKeyMigrationCount); + assertEquals("Accord migration rejects", expectedMigrationRejects, getAccordWriteMigrationRejects(coordinatorIndex) - startingMigrationRejectsCount); + assertEquals("Accord skipped reads", expectedSkippedReads, getAccordMigrationSkippedReads() - startingSkippedReadsCount); + } + + private static void validateKeyMigrations(List> expectedMigrations) + { + spinUntilSuccess(() -> { + try + { + List keys = expectedMigrations.stream().map(p -> p.left.array()).collect(Collectors.toList()); + List intKeys = expectedMigrations.stream().map(p -> ByteBufferUtil.toInt(p.left)).collect(Collectors.toList()); + List tables = expectedMigrations.stream().map(p -> p.right).collect(Collectors.toList()); + // Notification of all replicas that the key was migrated was removed so they will each have to run + // a local barrier first to find out the key was migrated. Not sure if we will add it back somehow. + IInvokableInstance instance = SHARED_CLUSTER.get(1); + instance.runOnInstance(() -> { + Map, ConsensusMigratedAt> cacheMap = ConsensusKeyMigrationState.MIGRATION_STATE_CACHE.asMap(); + String cacheMessage = format("Instance %d Expected %s migrations but found in cache %s", 1, intKeys, cacheMap); + assertEquals(cacheMessage, keys.size(), cacheMap.size()); + for (int j = 0; j < keys.size(); j++) + { + assertTrue(cacheMessage, + cacheMap.containsKey(Pair.create(ByteBuffer.wrap(keys.get(j)), tables.get(j)))); + } + + UntypedResultSet result = QueryProcessor.executeInternal("SELECT * from " + SYSTEM_KEYSPACE_NAME + "." + CONSENSUS_MIGRATION_STATE); + String tableMessage = format("Instance %d Expected %s migrations but found in system table %s", 1, intKeys, result); + assertEquals(tableMessage, keys.size(), result.size()); + Iterator resultIterator = result.iterator(); + for (int j = 0; j < result.size(); j++) + { + UntypedResultSet.Row row = resultIterator.next(); + boolean foundKey = false; + for (byte[] expectedKey : keys) + if (ByteBuffer.wrap(expectedKey).equals(row.getBytes("row_key"))) + foundKey = true; + assertTrue(tableMessage, foundKey); + } + }); + } + catch (Throwable t) + { + // For some reason full stack trace wasn't displayed without rethrowing + throw new AssertionError(t); + } + }); + } + + @Test + public void testPaxosToAccordCAS() throws Exception + { + test(format(TABLE_FMT, qualifiedAccordTableName), + cluster -> { + List> expectedKeyMigrations = new ArrayList<>(); + String table = accordTableName; + UUID tableUUID = cluster.get(1).callOnInstance(() -> ColumnFamilyStore.getIfExists(KEYSPACE, table).getTableId().asUUID()); + cluster.forEach(node -> node.runOnInstance(() -> { + TableMetadata tbl = Schema.instance.getTableMetadata(KEYSPACE, table); + Assert.assertEquals(TransactionalMode.off, tbl.params.transactionalMode); + Assert.assertEquals(TransactionalMigrationFromMode.none, tbl.params.transactionalMigrationFrom); + })); + + cluster.schemaChange(format("ALTER TABLE %s.%s WITH transactional_mode='%s'", KEYSPACE, accordTableName, TransactionalMode.full)); + + cluster.forEach(node -> node.runOnInstance(() -> { + TableMetadata tbl = Schema.instance.getTableMetadata(KEYSPACE, table); + Assert.assertEquals(TransactionalMode.full, tbl.params.transactionalMode); + Assert.assertEquals(TransactionalMigrationFromMode.off, tbl.params.transactionalMigrationFrom); + })); + + String casCQL = format(CAS_FMT, qualifiedAccordTableName, CLUSTERING_VALUE); + Consumer runCasNoApply = key -> assertRowEquals(cluster, new Object[]{false}, casCQL, key); + Consumer runCasApplies = key -> assertRowEquals(cluster, new Object[]{true}, casCQL, key); + Consumer runCasOnSecondNode = key -> assertEquals( "[applied]", cluster.coordinator(2).executeWithResult(casCQL, ANY, key).names().get(0)); + String tableName = qualifiedAccordTableName.split("\\.")[1]; + int migratingKey = getKeyBetweenTokens(upperMidToken, maxToken); + int notMigratingKey = getKeyBetweenTokens(minToken, midToken); + Range migratingRange = new Range(midToken, maxToken); + NormalizedRanges migratingRanges = normalizedRanges(ImmutableList.of(migratingRange)); + + // Not actually migrating yet so should do nothing special + assertTargetAccordWrite(runCasNoApply, 1, migratingKey, expectedKeyMigrations, 0, 1, 0, 0, 0); + + // Mark ranges migrating and check migration state is correct + nodetool(coordinator, "consensus_admin", "begin-migration", "-st", midToken.toString(), "-et", maxToken.toString(), KEYSPACE, tableName); + assertMigrationState(tableName, ConsensusMigrationTarget.accord, emptyList(), migratingRanges, migratingRanges, 1); + + // Should be routed directly to Accord, and perform key migration, as well as key migration read in Accord + addExpectedMigratedKey(expectedKeyMigrations, migratingKey, tableUUID); + // Without data repaired Paxos should continue to run + assertTargetPaxosWrite(runCasNoApply, 1, migratingKey, emptyList(), 0, 1, 0, 0, 0); + nodetool(coordinator, "repair", "-st", upperMidToken.toString(), "-et", maxAlignedWithLocalRanges.toString(), "-skip-accord", "-skip-paxos"); + // With data repaired the write should now key migrated + assertTargetAccordWrite(runCasNoApply, 1, migratingKey, expectedKeyMigrations, 1, 0, 1, 0, 0); + + // Should not repeat key migration, and should still do a migration read in Accord + assertTargetAccordWrite(runCasNoApply, 1, migratingKey, expectedKeyMigrations, 1, 0, 0, 0, 0); + + // Should run on Paxos since it is not in the migrating range + assertTargetAccordWrite(runCasNoApply, 1, notMigratingKey, expectedKeyMigrations, 0, 1, 0, 0, 0); + + // Check that the coordinator on the other node also has saved that the key migration was performed + // and runs the query on Accord immediately without key migration + assertTargetAccordWrite(runCasOnSecondNode, 2, migratingKey, expectedKeyMigrations, 1, 0, 0, 0, 0); + + // Forced repair while a node is down shouldn't work, use repair instead of finish-migration because repair exposes --force + // and regular Cassandra repairs are eligible to drive migration so it's important they check --force and down nodes + InetAddressAndPort secondNodeBroadcastAddress = InetAddressAndPort.getByAddress(cluster.get(2).broadcastAddress()); + Filter blockNode2 = cluster.filters().allVerbs().from(2).drop(); + cluster.get(1).runOnInstance(() -> { + EndpointState endpointState = Gossiper.instance.getEndpointStateForEndpoint(secondNodeBroadcastAddress); + Gossiper.runInGossipStageBlocking(() -> Gossiper.instance.markDead(secondNodeBroadcastAddress, endpointState)); + }); + nodetool(coordinator, "repair", "--force"); + // Data repair was already done for one node's local range + NormalizedRanges alreadyDataRepaired = normalizedRanges(ImmutableList.of(new Range<>(upperMidToken, maxAlignedWithLocalRanges))); + NormalizedRanges remainingPendingDataRepair = migratingRanges.subtract(alreadyDataRepaired); + assertMigrationState(tableName, ConsensusMigrationTarget.accord, emptyList(), remainingPendingDataRepair, migratingRanges, 1); + blockNode2.off(); + cluster.get(1).runOnInstance(() -> { + EndpointState endpointState = Gossiper.instance.getEndpointStateForEndpoint(secondNodeBroadcastAddress); + Gossiper.runInGossipStageBlocking(() -> Gossiper.instance.realMarkAlive(secondNodeBroadcastAddress, endpointState)); + }); + + // Full repair should complete the migration and update the metadata, adding --force when nodes are up should be fine + nodetool(coordinator, "repair", "--force" ); + // Some ranges will be migrated because they were already data repaired + NormalizedRanges alreadyMigrated = alreadyDataRepaired; + assertMigrationState(tableName, ConsensusMigrationTarget.accord, alreadyMigrated, emptyList(), migratingRanges.subtract(alreadyMigrated), 1); + // Need to repair a second time to complete the migration to Accord because we are invoking repair directly, finish would do both for us normally + nodetool(coordinator, "repair", "--force"); + assertMigrationState(tableName, ConsensusMigrationTarget.accord, migratingRanges, emptyList(), emptyList(), 0); + + // Should run on Accord, and not perform key migration nor should it need to perform a migration read in Accord now that it is repaired + assertTargetAccordWrite(runCasNoApply, 1, migratingKey, expectedKeyMigrations, 1, 0, 0, 0, 0); + + // Should run on Paxos, and not perform key migration + assertTargetAccordWrite(runCasNoApply, 1, notMigratingKey, expectedKeyMigrations, 0, 1, 0, 0, 0); + + // Pivot to testing repair with a subrange of the migrating range as well as key migration + // Will use the unmigrated range between lowerMidToken and midToken + nodetool(coordinator, "consensus_admin", "begin-migration", "-st", lowerMidToken.toString(), "-et", midToken.toString(), KEYSPACE, tableName); + + // Generate several keys to test with instead of resetting key state + Iterator testingKeys = getKeysBetweenTokens(lowerMidToken, midToken); + migratingKey = testingKeys.next(); + + // Check that Paxos repair is run and actually repairs a transaction that was accepted, but not committed + String ballotString = BallotGenerator.Global.nextBallot(Flag.GLOBAL).toString(); + saveAcceptedPaxosProposal(tableName, ballotString, migratingKey); + // PaxosRepair will have inserted a condition matching row, so it can apply, demonstrating repair and + // key migration occurred + addExpectedMigratedKey(expectedKeyMigrations, migratingKey, tableUUID); + // Need to data repair for key migration to be possible since otherwise it will just run on Paxos + nodetool(coordinator, "repair", "-st", lowerMidToken.toString(), "-et", "-3074457345618258603", "-skip-accord", "-skip-paxos"); + nodetool(cluster.coordinator(2), "repair", "-st", "-3074457345618258603", "-et", midToken.toString(), "-skip-accord", "-skip-paxos"); + assertTargetAccordWrite(runCasApplies, 1, migratingKey, expectedKeyMigrations, 1, 0, 1, 0, 0); + + // This will force the write to use the normal write patch + cluster.get(1).runOnInstance(() -> ConsensusRequestRouter.setInstance(new PaxosToAccordMigrationNotHappeningUpToBegin())); + // Update inserted row so the condition can apply, if the condition check doesn't apply + // then it won't get to propose/accept + migratingKey = testingKeys.next(); + String keyspace = KEYSPACE; + Integer clusteringValue = CLUSTERING_VALUE; + String mutationTableName = accordTableName; + Consumer makeCASApply = key -> cluster.forEach(instance -> instance.runOnInstance(() -> { + SimpleBuilder mutationBuilder = Mutation.simpleBuilder(keyspace, dk(key)).allowPotentialTxnConflicts(); + mutationBuilder.update(mutationTableName).row(clusteringValue).add("v", 42); + Mutation m = mutationBuilder.build(); + m.applyUnsafe(); + })); + makeCASApply.accept(migratingKey); + + // This will force the request to run on Paxos up to Accept + // and the accept will be rejected at both nodes and we are certain we need to retry the transaction + cluster.get(1).runOnInstance(() -> ConsensusRequestRouter.setInstance(new PaxosToAccordMigrationNotHappeningUpToBegin())); + addExpectedMigratedKey(expectedKeyMigrations, migratingKey, tableUUID); + assertTargetAccordWrite(runCasApplies, 1, migratingKey, expectedKeyMigrations, 1, 1, 1, 0, 1); + + // One node will now accept the other will reject and we are uncertain if we should retry the transaction + // and should surface that as a timeout exception + migratingKey = testingKeys.next(); + makeCASApply.accept(migratingKey); + cluster.get(1).runOnInstance(() -> ConsensusRequestRouter.setInstance(new PaxosToAccordMigrationNotHappeningUpToAccept())); + try + { + cluster.filters().allVerbs().to(3).from(3).drop(); + runCasNoApply.accept(migratingKey); + cluster.filters().reset(); + fail("Should have thrown timeout exception"); + } + catch (Throwable t) + { + if (!t.getClass().getName().equals("org.apache.cassandra.exceptions.CasWriteTimeoutException")) + throw new RuntimeException(t); + } + + // Test that if we find out about a migration from the prepare phase Paxos.begin we + // retry it on Accord + cluster.get(1).runOnInstance(() -> ConsensusRequestRouter.setInstance(new RoutesToPaxosOnce())); + // Should exit Paxos from begin, key migration should occur because it's a new key, and Accord will need to do a migration read + migratingKey = testingKeys.next(); + addExpectedMigratedKey(expectedKeyMigrations, migratingKey, tableUUID); + assertTargetAccordWrite(runCasNoApply, 1, migratingKey, expectedKeyMigrations, 1, 1, 1, 1, 0); + + // Now do two repairs to complete the migration repair, and we are done with black box integration testing + // First repair is a range smack dab in the middle + Token startTokenForRepair = partitioner.midpoint(lowerMidToken, midToken); + Token endTokenForRepair = partitioner.midpoint(startTokenForRepair, midToken); + nodetool(coordinator, "consensus_admin", "finish-migration", "-st", startTokenForRepair.toString(), "-et", endTokenForRepair.toString()); + List> migratedRanges = ImmutableList.of(new Range<>(startTokenForRepair, endTokenForRepair), migratingRange); + List> midMigratingRanges = ImmutableList.of(new Range<>(lowerMidToken, startTokenForRepair), new Range<>(endTokenForRepair, midToken)); + List> migratingAndMigratedRanges = ImmutableList.of(new Range<>(lowerMidToken, maxToken)); + assertMigrationState(tableName, ConsensusMigrationTarget.accord, migratedRanges, emptyList(), midMigratingRanges, 1); + + nodetool(coordinator, "consensus_admin", "finish-migration"); + assertMigrationState(tableName, ConsensusMigrationTarget.accord, migratingAndMigratedRanges, emptyList(), emptyList(), 0); + }); + } + + /* + * Read has a few code paths that are separate from CAS that need to be tested + * such as switching consensus protocol, rejecting read during accept, and throwing + * timeout exception if uncertain about side effects + */ + @Test + public void testPaxosToAccordSerialRead() throws Exception + { + test(format(TABLE_FMT, qualifiedAccordTableName), + cluster -> { + String table = accordTableName; + UUID tableUUID = cluster.get(1).callOnInstance(() -> ColumnFamilyStore.getIfExists(KEYSPACE, table).getTableId().asUUID()); + List> expectedKeyMigrations = new ArrayList<>(); + cluster.schemaChange(format("ALTER TABLE %s.%s WITH transactional_mode='%s'", KEYSPACE, accordTableName, TransactionalMode.full)); + String readCQL = format("SELECT * FROM %s WHERE id = ? and c = %s", qualifiedAccordTableName, CLUSTERING_VALUE); + Function runRead = key -> cluster.coordinator(1).execute(readCQL, SERIAL, key); + Range migratingRange = new Range<>(new LongToken(Long.MIN_VALUE + 1), new LongToken(Long.MIN_VALUE)); + List> migratingRanges = ImmutableList.of(migratingRange); + int key = 0; + + assertTargetAccordRead(runRead, 1, key, expectedKeyMigrations, 0, 1, 0, 0, 0); + // Mark wrap around range as migrating + nodetool(coordinator, "consensus_admin", "begin-migration", "-st", String.valueOf(Long.MIN_VALUE + 1), "-et", String.valueOf(Long.MIN_VALUE), KEYSPACE, accordTableName); + assertMigrationState(accordTableName, ConsensusMigrationTarget.accord, emptyList(), migratingRanges, migratingRanges, 1); + // Need to repair so key migration can occur + for (int i = 1; i <= 3; i++) + nodetool(cluster.coordinator(i), "repair", "-skip-paxos", "-skip-accord"); + // Should run directly on accord, migrate the key, and perform a quorum read fro Accord, Paxos repair will run prepare once + addExpectedMigratedKey(expectedKeyMigrations, key, tableUUID); + assertTargetAccordRead(runRead, 1, key, expectedKeyMigrations, 1, 1, 1, 0, 0); + key++; + + // Should run up to accept with both nodes refusing to accept + savePromisedAndCommittedPaxosProposal(accordTableName, key); + cluster.get(1).runOnInstance(() -> ConsensusRequestRouter.setInstance(new PaxosToAccordMigrationNotHappeningUpToBegin())); + addExpectedMigratedKey(expectedKeyMigrations, key, tableUUID); + assertTargetAccordRead(runRead, 1, key, expectedKeyMigrations, 1, 2, 1, 0, 1); + key++; + }); + } + + private void assertTransactionalModes(String keyspace, String table, TransactionalMode mode, TransactionalMigrationFromMode migration) + { + forEach(() -> { + TableMetadata metadata = Schema.instance.getTableMetadata(keyspace, table); + Assert.assertEquals(mode, metadata.params.transactionalMode); + Assert.assertEquals(migration, metadata.params.transactionalMigrationFrom); + }); + } + + private void assertTransactionalModes(TransactionalMode mode, TransactionalMigrationFromMode migration) + { + assertTransactionalModes(KEYSPACE, accordTableName, mode, migration); + } + + @Test + public void testAccordToPaxos() throws Exception + { + test(format(TABLE_FMT, qualifiedAccordTableName), + cluster -> { + String casCQL = format(CAS_FMT, qualifiedAccordTableName, CLUSTERING_VALUE); + Consumer runCasNoApply = key -> assertRowEquals(cluster, new Object[]{false}, casCQL, key); + String tableName = qualifiedAccordTableName.split("\\.")[1]; + UUID tableUUID = cluster.get(1).callOnInstance(() -> ColumnFamilyStore.getIfExists(KEYSPACE, tableName).getTableId().asUUID()); + + alterTableTransactionalMode(TransactionalMode.mixed_reads); + assertTransactionalModes(TransactionalMode.mixed_reads, TransactionalMigrationFromMode.off); + + // Mark a subrange as migrating and finish migrating half of it + nodetool(coordinator, "consensus_admin", "begin-migration", "-st", midToken.toString(), "-et", maxToken.toString(), KEYSPACE, tableName); + nodetool(coordinator, "consensus_admin", "finish-migration", "-st", midToken.toString(), "-et", "3074457345618258601"); + nodetool(coordinator, "consensus_admin", "finish-migration", "-st", "3074457345618258601", "-et", upperMidToken.toString()); + Range accordMigratedRange = new Range(midToken, upperMidToken); + Range accordMigratingRange = new Range(upperMidToken, maxToken); + assertMigrationState(tableName, ConsensusMigrationTarget.accord, ImmutableList.of(accordMigratedRange), ImmutableList.of(accordMigratingRange), ImmutableList.of(accordMigratingRange), 1); + + // Test that we can reverse the migration and go back to Paxos + alterTableTransactionalMode(TransactionalMode.off); + assertTransactionalModes(TransactionalMode.off, TransactionalMigrationFromMode.mixed_reads); + assertMigrationState(tableName, ConsensusMigrationTarget.paxos, ImmutableList.of(new Range(minToken, midToken), new Range(maxToken, minToken)), emptyList(), ImmutableList.of(accordMigratingRange), 1); + Iterator paxosNonMigratingKeys = getKeysBetweenTokens(minToken, midToken); + Iterator paxosMigratingKeys = getKeysBetweenTokens(upperMidToken, maxToken); + Iterator accordKeys = getKeysBetweenTokens(midToken, upperMidToken); + + List> expectedKeyMigrations = new ArrayList<>(); + + // Paxos non-migrating keys should run on Paxos as per normal + assertTargetPaxosWrite(runCasNoApply, 1, paxosNonMigratingKeys.next(), expectedKeyMigrations, 0, 1, 0, 0, 0); + + Integer nextMigratingKey = paxosMigratingKeys.next(); + addExpectedMigratedKey(expectedKeyMigrations, nextMigratingKey, tableUUID); + // Paxos migrating keys should be key migrated which means a local barrier is run by Paxos during read at each replica + assertTargetPaxosWrite(runCasNoApply, 1, nextMigratingKey, expectedKeyMigrations, 0, 1, 1, 0, 0); + + // A key from a range migrated to Accord is now not migrating/migrated and should be accessed through Accord + assertTargetPaxosWrite(runCasNoApply, 1, accordKeys.next(), expectedKeyMigrations, 1, 0, 0, 0, 0); + + // If an Accord transaction races with cluster metadata updates it should be rejected if the epoch it runs in contains the migration + cluster.get(1).runOnInstance(() -> ConsensusRequestRouter.setInstance(new RoutesToAccordOnce())); + nextMigratingKey = paxosMigratingKeys.next(); + addExpectedMigratedKey(expectedKeyMigrations, nextMigratingKey, tableUUID); + assertTargetPaxosWrite(runCasNoApply, 1, nextMigratingKey, expectedKeyMigrations, 1, 1, 1, 1, 1); + + // Repair the currently migrating range from when targets were switched, but it's not an Accord repair, this is to make sure the wrong repair type doesn't trigger progress + nodetool(coordinator, "repair", "-st", upperMidToken.toString(), "-et", maxAlignedWithLocalRanges.toString(), "--skip-accord"); + assertMigrationState(tableName, ConsensusMigrationTarget.paxos, ImmutableList.of(new Range(minToken, midToken), new Range(maxToken, minToken)), emptyList(), ImmutableList.of(accordMigratingRange), 1); + + // Paxos migrating keys should still need key migration after non-Accord repair + nextMigratingKey = paxosMigratingKeys.next(); + addExpectedMigratedKey(expectedKeyMigrations, nextMigratingKey, tableUUID); + assertTargetPaxosWrite(runCasNoApply, 1, nextMigratingKey, expectedKeyMigrations, 0, 1, 1, 0, 0); + + // Now do it with an Accord repair so key migration shouldn't be necessary + nodetool(coordinator, "consensus_admin", "finish-migration", "-st", upperMidToken.toString(), "-et", maxAlignedWithLocalRanges.toString()); + Range repairedRange = new Range(upperMidToken, maxAlignedWithLocalRanges); + // Sliver remaining because of precise repairs + // TODO This precision isn't needed for Accord repair? Worth lifting that restriction or keep it consistent? + Range remainingRange = new Range(maxAlignedWithLocalRanges, maxToken); + assertMigrationState(tableName, ConsensusMigrationTarget.paxos, ImmutableList.of(new Range(minToken, midToken), repairedRange, new Range(maxToken, minToken)), emptyList(), ImmutableList.of(remainingRange), 1); + + // Paxos migrating keys shouldn't need key migration after Accord repair + assertTargetPaxosWrite(runCasNoApply, 1, paxosMigratingKeys.next(), expectedKeyMigrations, 0, 1, 0, 0, 0); + }); + } + + private static void addExpectedMigratedKey(List> expectedKeyMigrations, Integer nextMigratingKey, UUID tableUUID) + { + + ByteBuffer key = ByteBuffer.allocate(4); + key.putInt(0, nextMigratingKey); + expectedKeyMigrations.add(Pair.create(key, tableUUID)); + } + + private static void assertMigrationState(String tableName, ConsensusMigrationTarget target, List> migratedRanges, List> repairPendingRanges, List> migratingRanges, int numMigratingEpochs) throws Throwable + { + // Validate nodetool consensus admin list output + String yamlResultString = nodetool(SHARED_CLUSTER.coordinator(1), "consensus_admin", "list"); + Map yamlStateMap = new Yaml().load(yamlResultString); + String minifiedYamlResultString = nodetool(SHARED_CLUSTER.coordinator(1), "consensus_admin", "list", "-f", "minified-yaml"); + Map minifiedYamlStateMap = new Yaml().load(minifiedYamlResultString); + String jsonResultString = nodetool(SHARED_CLUSTER.coordinator(1), "consensus_admin", "list", "-f", "json"); + Map jsonStateMap = JsonUtils.JSON_OBJECT_MAPPER.readValue(jsonResultString, new TypeReference>(){}); + String minifiedJsonResultString = nodetool(SHARED_CLUSTER.coordinator(1), "consensus_admin", "list", "-f", "minified-json"); + Map minifiedJsonStateMap = JsonUtils.JSON_OBJECT_MAPPER.readValue(minifiedJsonResultString, new TypeReference>(){}); + + List tableIds = new ArrayList<>(); + for (Map migrationStateMap : ImmutableList.of(yamlStateMap, jsonStateMap, minifiedYamlStateMap, minifiedJsonStateMap)) + { + assertEquals(PojoToString.CURRENT_VERSION, migrationStateMap.get("version")); + assertTrue(Epoch.EMPTY.getEpoch() < ((Number) migrationStateMap.get("lastModifiedEpoch")).longValue()); + + Map tableStateMap = null; + for (Map stateMap : (List>) migrationStateMap.get("tableStates")) + { + Object table = stateMap.get("table"); + Object keyspace = stateMap.get("keyspace"); + if (KEYSPACE.equals(keyspace) && tableName.equals(table)) + { + tableStateMap = stateMap; + break; + } + } + assertNotNull(tableStateMap); + + assertEquals(tableName, tableStateMap.get("table")); + assertEquals(KEYSPACE, tableStateMap.get("keyspace")); + tableIds.add((String) tableStateMap.get("tableId")); + List> migratedRangesFromStateMap = ((List) tableStateMap.get("migratedRanges")).stream().map(Range::fromString).collect(toImmutableList()); + assertEquals(migratedRanges, migratedRangesFromStateMap); + List> repairPendingRangesFromStateMap = ((List) tableStateMap.get("repairPendingRanges")).stream().map(Range::fromString).collect(toImmutableList()); + assertEquals(repairPendingRanges, repairPendingRangesFromStateMap); + Map>> migratingRangesByEpochFromStateMap = new LinkedHashMap<>(); + for (Map.Entry> entry : ((Map>) tableStateMap.get("migratingRangesByEpoch")).entrySet()) + { + long epoch = entry.getKey() instanceof Number ? ((Number)entry.getKey()).longValue() : Long.valueOf((String)entry.getKey()); + migratingRangesByEpochFromStateMap.put(epoch, entry.getValue().stream().map(Range::fromString).collect(toImmutableList())); + } + if (migratingRanges.isEmpty()) + assertEquals(0, migratingRangesByEpochFromStateMap.size()); + else + assertEquals(migratingRanges, migratingRangesByEpochFromStateMap.values().iterator().next()); + } + + // Also check JSON format at least loads without error + // Validate in memory state at each node + List> migratingAndMigratedRanges = normalize(ImmutableList.>builder().addAll(migratedRanges).addAll(migratingRanges).build()); + spinUntilSuccess(() -> { + for (IInvokableInstance instance : SHARED_CLUSTER) + { + ConsensusMigrationState snapshot = getMigrationStateSnapshot(instance); + + for (String tableId : tableIds) + { + TableMigrationState state = snapshot.tableStates.get(TableId.fromString(tableId)); + assertNotNull(state); + + SimpleQueryResult vtableResult = + instance.executeInternalWithResult(format("SELECT * FROM %s.%s WHERE keyspace_name = ? AND table_name = ? ", VIRTUAL_ACCORD_DEBUG, AccordDebugKeyspace.MIGRATION_STATE), + state.keyspaceName, state.tableName); + assertTrue(vtableResult.hasNext()); + + assertEquals(KEYSPACE, state.keyspaceName); + assertEquals(tableName, state.tableName); + assertEquals(target, state.targetProtocol); + assertEquals("Migrated ranges:", migratedRanges, state.migratedRanges); + assertEquals("Migrating ranges:", migratingRanges, state.migratingRanges); + assertEquals("Repair pending ranges:", repairPendingRanges, state.repairPendingRanges); + assertEquals("Migrating and migrated ranges:", migratingAndMigratedRanges, state.migratingAndMigratedRanges); + assertEquals(numMigratingEpochs, state.migratingRangesByEpoch.size()); + if (migratingRanges.isEmpty()) + assertEquals(0, state.migratingRangesByEpoch.size()); + else + assertEquals(migratingRanges, state.migratingRangesByEpoch.values().iterator().next()); + + Row vtableState = vtableResult.next(); + assertVtableState(state, vtableState); + } + } + }); + } + + private static void assertVtableState(TableMigrationState expectedState, Row vtableState) + { + List vtableMigratedRanges = vtableState.getList("migrated_ranges"); + assertEquals(expectedState.migratedRanges, vtableMigratedRanges.stream().map(Range::fromString).collect(Collectors.toList())); + + Map> vtableMigratingByEpoch = vtableState.get("migrating_ranges_by_epoch"); + Map>> pojoMigratingByEpoch = new LinkedHashMap<>(); + + for (Map.Entry> entry : vtableMigratingByEpoch.entrySet()) + pojoMigratingByEpoch.put(entry.getKey(), entry.getValue().stream().map(Range::fromString).collect(toImmutableList())); + + if (expectedState.migratingRanges.isEmpty()) + assertEquals(0, pojoMigratingByEpoch.size()); + else + assertEquals(expectedState.migratingRanges, pojoMigratingByEpoch.values().iterator().next()); + } + + /** + * Save a promise that is after the committed one to make a subsequent read not linearizable + */ + private static void savePromisedAndCommittedPaxosProposal(String tableName, int key) + { + String committedBallotString = BallotGenerator.Global.nextBallot(Flag.GLOBAL).toString(); + String promisedBallotString = BallotGenerator.Global.nextBallot(Flag.GLOBAL).toString(); + forEach(() -> { + TableMetadata metadata = ColumnFamilyStore.getIfExists(KEYSPACE, tableName).metadata(); + ByteBuffer lowMidMigratingKeyBuffer = ByteBuffer.wrap(ByteArrayUtil.bytes(key)); + DecoratedKey dk = new BufferDecoratedKey(DatabaseDescriptor.getPartitioner().getToken(lowMidMigratingKeyBuffer), lowMidMigratingKeyBuffer); + try (PaxosState state = PaxosState.get(dk, metadata)) + { + Ballot ballot = Ballot.fromString(committedBallotString); + PartitionUpdateBuilder updateBuilder = new PartitionUpdateBuilder(metadata, key); + updateBuilder.row(CLUSTERING_VALUE).add("v", 42); + + state.commit(new Agreed(ballot, updateBuilder.build())); + state.promiseIfNewer(Ballot.fromString(promisedBallotString), true); + } + }); + } + + private static void saveAcceptedPaxosProposal(String tableName, String ballotString, int key) + { + forEach(() -> { + TableMetadata metadata = ColumnFamilyStore.getIfExists(KEYSPACE, tableName).metadata(); + ByteBuffer lowMidMigratingKeyBuffer = ByteBuffer.wrap(ByteArrayUtil.bytes(key)); + DecoratedKey dk = new BufferDecoratedKey(DatabaseDescriptor.getPartitioner().getToken(lowMidMigratingKeyBuffer), lowMidMigratingKeyBuffer); + try (PaxosState state = PaxosState.get(dk, metadata)) + { + Ballot ballot = Ballot.fromString(ballotString); + assertEquals( PROMISE, state.promiseIfNewer(ballot, true).outcome()); + PartitionUpdateBuilder updateBuilder = new PartitionUpdateBuilder(metadata, key); + updateBuilder.row(CLUSTERING_VALUE).add("v", 42); + // Set isForRepair to true to force accepting the proposal for testing purposes + assertEquals( null, state.acceptIfLatest(new Proposal(ballot, updateBuilder.build()), true).supersededBy); + } + }); + } +} diff --git a/test/distributed/org/apache/cassandra/distributed/test/accord/AccordMigrationWriteRaceTestBase.java b/test/distributed/org/apache/cassandra/distributed/test/accord/AccordMigrationWriteRaceTestBase.java new file mode 100644 index 000000000000..5e23b58b5753 --- /dev/null +++ b/test/distributed/org/apache/cassandra/distributed/test/accord/AccordMigrationWriteRaceTestBase.java @@ -0,0 +1,832 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.distributed.test.accord; + +import java.io.IOException; +import java.net.InetSocketAddress; +import java.util.List; +import java.util.Map; +import java.util.Queue; +import java.util.Set; +import java.util.concurrent.Callable; +import java.util.concurrent.ConcurrentLinkedQueue; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; +import java.util.function.Consumer; + +import com.google.common.base.Stopwatch; +import com.google.common.collect.ImmutableList; +import com.google.common.util.concurrent.ListenableFuture; +import com.google.common.util.concurrent.ListenableFutureTask; +import org.junit.After; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import accord.api.RoutingKey; +import accord.coordinate.Outcome; +import accord.messages.PreAccept; +import accord.primitives.KeyRoute; +import accord.primitives.Ranges; +import accord.primitives.Routable.Domain; +import accord.primitives.Route; +import accord.primitives.TxnId; +import accord.utils.async.AsyncResult; +import org.apache.cassandra.ServerTestUtils; +import org.apache.cassandra.Util; +import org.apache.cassandra.batchlog.BatchlogManager; +import org.apache.cassandra.config.CassandraRelevantProperties; +import org.apache.cassandra.config.Config.PaxosVariant; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.db.ColumnFamilyStore; +import org.apache.cassandra.db.Mutation; +import org.apache.cassandra.dht.IPartitioner; +import org.apache.cassandra.dht.Murmur3Partitioner.LongToken; +import org.apache.cassandra.dht.Range; +import org.apache.cassandra.dht.Token; +import org.apache.cassandra.distributed.Cluster; +import org.apache.cassandra.distributed.api.ICoordinator; +import org.apache.cassandra.distributed.api.IInstance; +import org.apache.cassandra.distributed.api.IInvokableInstance; +import org.apache.cassandra.distributed.api.IMessage; +import org.apache.cassandra.distributed.api.IMessageSink; +import org.apache.cassandra.distributed.api.QueryResults; +import org.apache.cassandra.distributed.api.SimpleQueryResult; +import org.apache.cassandra.distributed.impl.Instance; +import org.apache.cassandra.distributed.impl.TestChangeListener; +import org.apache.cassandra.distributed.shared.ClusterUtils; +import org.apache.cassandra.exceptions.CoordinatorBehindException; +import org.apache.cassandra.exceptions.RequestFailure; +import org.apache.cassandra.exceptions.RequestFailureReason; +import org.apache.cassandra.exceptions.WriteTimeoutException; +import org.apache.cassandra.hints.HintsService; +import org.apache.cassandra.metrics.AccordClientRequestMetrics; +import org.apache.cassandra.metrics.ClientRequestsMetricsHolder; +import org.apache.cassandra.metrics.HintsServiceMetrics; +import org.apache.cassandra.net.Message; +import org.apache.cassandra.net.Verb; +import org.apache.cassandra.repair.RepairJobDesc; +import org.apache.cassandra.repair.RepairResult; +import org.apache.cassandra.schema.Schema; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.service.StorageService; +import org.apache.cassandra.service.accord.AccordService; +import org.apache.cassandra.service.accord.TokenRange; +import org.apache.cassandra.service.accord.api.TokenKey; +import org.apache.cassandra.service.consensus.TransactionalMode; +import org.apache.cassandra.service.consensus.migration.ConsensusKeyMigrationState; +import org.apache.cassandra.service.consensus.migration.ConsensusMigrationRepairResult; +import org.apache.cassandra.service.consensus.migration.ConsensusRequestRouter; +import org.apache.cassandra.service.consensus.migration.ConsensusTableMigration; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.tcm.Epoch; +import org.apache.cassandra.utils.FBUtilities; +import org.apache.cassandra.utils.Pair; +import org.apache.cassandra.utils.concurrent.AsyncPromise; +import org.apache.cassandra.utils.concurrent.Promise; +import org.eclipse.jetty.util.ConcurrentHashSet; + +import static java.lang.String.format; +import static org.apache.cassandra.Util.expectException; +import static org.apache.cassandra.Util.spinAssertEquals; +import static org.apache.cassandra.config.CassandraRelevantProperties.HINT_DISPATCH_INTERVAL_MS; +import static org.apache.cassandra.distributed.api.ConsistencyLevel.ALL; +import static org.apache.cassandra.distributed.shared.ClusterUtils.getNextEpoch; +import static org.apache.cassandra.distributed.shared.ClusterUtils.pauseBeforeEnacting; +import static org.apache.cassandra.distributed.shared.ClusterUtils.unpauseEnactment; +import static org.apache.cassandra.distributed.test.accord.AccordMigrationWriteRaceTestBase.Scenario.BATCHLOG_FAILED_ROUTING_THEN_HINT; +import static org.apache.cassandra.distributed.test.accord.AccordMigrationWriteRaceTestBase.Scenario.BATCHLOG_FAILED_TIMEOUT_THEN_HINT; +import static org.apache.cassandra.distributed.test.accord.AccordMigrationWriteRaceTestBase.Scenario.BATCHLOG_SUCCESSFUL_ROUTING; +import static org.apache.cassandra.distributed.test.accord.AccordMigrationWriteRaceTestBase.Scenario.HINT; +import static org.apache.cassandra.distributed.test.accord.AccordMigrationWriteRaceTestBase.Scenario.MUTATION; +import static org.apache.cassandra.distributed.util.QueryResultUtil.assertThat; +import static org.apache.cassandra.exceptions.RequestFailureReason.RETRY_ON_DIFFERENT_TRANSACTION_SYSTEM; +import static org.apache.cassandra.utils.Throwables.runUnchecked; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +/* + * Test that non-transactional write operations such as regular mutations, batch log, and hints + * all detect when a migration is in progress, and then retry on the correct system. + * TODO (required): Accord TopologyMismatch means we aren't testing routing failure checks in TxnQuery migrating away from Accord in some test scenarios + * but maybe this doesn't matter becuase we do check the routing + */ +public abstract class AccordMigrationWriteRaceTestBase extends AccordTestBase +{ + private static final Logger logger = LoggerFactory.getLogger(AccordMigrationWriteRaceTestBase.class); + + private static final int CLUSTERING_VALUE = 1; + + private static final String TABLE_FMT = "CREATE TABLE %s (id int, c int, v int, PRIMARY KEY ((id), c));"; + + public static final int PKEY_ACCORD = 3; + public static final int PKEY_NORMAL = 0; + + private static IPartitioner partitioner; + + private static Token minToken; + + private static Token maxToken; + + private static Token midToken; + + private static Token upperMidToken; + + private static Token lowerMidToken; + + private static ICoordinator coordinator; + + private final static TestMessageSink messageSink = new TestMessageSink(); + private static class TestMessageSink implements IMessageSink + { + private final Queue> messages = new ConcurrentLinkedQueue<>(); + private final Set blackholed = new ConcurrentHashSet<>(); + + public void reset() + { + messages.clear(); + blackholed.clear(); + } + + @Override + public void accept(InetSocketAddress to, IMessage message) { + messages.offer(Pair.create(to,message)); + IInstance i = SHARED_CLUSTER.get(to); + if (blackholed.contains(to) || blackholed.contains(message.from())) + return; + if (i != null) + i.receiveMessage(message); + } + } + + enum Scenario + { + // Apply the mutation from the coordinator directly without going through hinting + MUTATION(false, false, false, false, false), + // Hint from the initial mutation coordination + HINT(true, false, true, false, true), + // Apply the mutation from the batchlog directly + BATCHLOG_SUCCESSFUL_ROUTING(false, true, true, true, false), + // Have the batchlog use hints to apply the mutation after failing to route, migrating back from Accord this is a timeout because you can't get Accord to fail at routing + // it either executes correctly in the old epoch or times out waiting for the new one to arrive + BATCHLOG_FAILED_ROUTING_THEN_HINT(false, true, true, true, true), + // Have the batchlog use hints to apply the mutation after a timeout + BATCHLOG_FAILED_TIMEOUT_THEN_HINT(false, true, true, true, true), + ; + + final boolean initiallyEnableHints; + final boolean initiallyEnableBatchlogReplay; + final boolean initiallyBlockTestKeyspaceMutations; + final boolean passesThroughBatchlog; + final boolean deliversViaHint; + + Scenario(boolean initiallyEnableHints, boolean initiallyEnableBatchlogReplay, boolean initiallyBlockTestKeyspaceMutations, boolean passesThroughBatchlog, boolean deliversViaHint) + { + this.initiallyEnableHints = initiallyEnableHints; + this.initiallyEnableBatchlogReplay = initiallyEnableBatchlogReplay; + this.initiallyBlockTestKeyspaceMutations = initiallyBlockTestKeyspaceMutations; + this.passesThroughBatchlog = passesThroughBatchlog; + this.deliversViaHint = deliversViaHint; + } + } + + private final boolean migrateAwayFromAccord; + + protected AccordMigrationWriteRaceTestBase() + { + this.migrateAwayFromAccord = migratingAwayFromAccord(); + } + + protected abstract boolean migratingAwayFromAccord(); + + @Override + protected Logger logger() + { + return logger; + } + + @BeforeClass + public static void setupClass() throws IOException + { + HINT_DISPATCH_INTERVAL_MS.setLong(100); + ServerTestUtils.daemonInitialization(); + // Otherwise repair complains if you don't specify a keyspace + CassandraRelevantProperties.SYSTEM_TRACES_DEFAULT_RF.setInt(3); + AccordTestBase.setupCluster(builder -> builder.appendConfig(config -> config.set("paxos_variant", PaxosVariant.v2.name()) + .set("accord.shard_durability_cycle", "1m") + .set("accord.shard_durability_target_splits", "1") + .set("accord.shard_durability_cycle", "60s") + .set("write_request_timeout", "2s") + .set("accord.range_migration", "explicit")), 3); + partitioner = FBUtilities.newPartitioner(SHARED_CLUSTER.get(1).callsOnInstance(() -> DatabaseDescriptor.getPartitioner().getClass().getSimpleName()).call()); + StorageService.instance.setPartitionerUnsafe(partitioner); + ServerTestUtils.prepareServerNoRegister(); + minToken = partitioner.getMinimumToken(); + maxToken = partitioner.getMaximumTokenForSplitting(); + midToken = partitioner.midpoint(minToken, maxToken); + upperMidToken = partitioner.midpoint(midToken, maxToken); + lowerMidToken = partitioner.midpoint(minToken, midToken); + coordinator = SHARED_CLUSTER.coordinator(1); + SHARED_CLUSTER.setMessageSink(messageSink); + } + + @AfterClass + public static void tearDownClass() + { + StorageService.instance.resetPartitionerUnsafe(); + } + + @After + public void tearDown() throws Exception + { + messageSink.reset(); + forEach(() -> { + BatchlogManager.instance.resumeReplay(); + HintsService.instance.deleteAllHintsUnsafe(); + HintsService.instance.resumeDispatch(); + }); + SHARED_CLUSTER.forEach(ClusterUtils::clearAndUnpause); + // Reset migration state + forEach(() -> { + ConsensusRequestRouter.resetInstance(); + ConsensusKeyMigrationState.reset(); + }); + super.tearDown(); + } + + private ListenableFuture alterTableTransactionalModeAsync(TransactionalMode mode) + { + ListenableFutureTask task = ListenableFutureTask.create(() -> { + coordinator.execute(format("ALTER TABLE %s WITH %s", qualifiedAccordTableName, mode.asCqlParam()), ALL); + }, null); + Thread asyncThread = new Thread(task, "Alter table transaction mode " + mode); + asyncThread.setDaemon(true); + asyncThread.start(); + return task; + } + + @Test + public void testSplitAndRetryNonSerialUnloggedBatchTwoTablesOnePkey() throws Throwable + { + testSplitAndRetryMutationCoordination(twoTableBatchInsert(false, PKEY_ACCORD, PKEY_ACCORD, 1), validateTwoTable(PKEY_ACCORD)); + } + + @Test + public void testSplitAndRetryNonSerialUnloggedBatchTwoTablesOnePkeyHinting() throws Throwable + { + // Accord doesn't hint if a write times out + if (!migrateAwayFromAccord) + testSplitAndRetryHintDelivery(twoTableBatchInsert(false, PKEY_ACCORD, PKEY_ACCORD, 1), validateTwoTable(PKEY_ACCORD)); + } + + @Test + public void testSplitAndRetryNonSerialUnloggedBatchTwoTablesTwoPkey() throws Throwable + { + testSplitAndRetryMutationCoordination(twoTableBatchInsert(false, PKEY_ACCORD, PKEY_NORMAL, 1), validateTwoTable(PKEY_NORMAL)); + } + + @Test + public void testSplitAndRetryNonSerialUnloggedBatchTwoTablesHinting() throws Throwable + { + // Accord doesn't hint if a write times out + if (!migrateAwayFromAccord) + testSplitAndRetryHintDelivery(twoTableBatchInsert(false, PKEY_ACCORD, PKEY_NORMAL, 1), validateTwoTable(PKEY_NORMAL)); + } + + @Test + public void testSplitAndRetryNonSerialUnloggedBatchSingleTable() throws Throwable + { + testSplitAndRetryMutationCoordination(singleTableBatchInsert(false, PKEY_ACCORD, PKEY_NORMAL, 1), this::validateSingleTable); + } + + @Test + public void testSplitAndRetryNonSerialUnloggedBatchSingleTableHinting() throws Throwable + { + // Accord doesn't hint if a write times out + if (!migrateAwayFromAccord) + testSplitAndRetryHintDelivery(singleTableBatchInsert(false, PKEY_ACCORD, PKEY_NORMAL, 1), this::validateSingleTable); + } + + /* + * This doesn't really test much since on top of testSplitAndRetryNonSerialUnloggedBatchTwoTablesOnePkey since it is + * a single table & key and will be converted to an unlogged batch + */ + @Test + public void testSplitAndRetryNonSerialLoggedBatchTwoTablesOnePkey() throws Throwable + { + testSplitAndRetryMutationCoordination(twoTableBatchInsert(true, PKEY_ACCORD, PKEY_ACCORD, 1), validateTwoTable(PKEY_ACCORD)); + } + + @Test + public void testSplitAndRetryNonSerialLoggedBatchTwoTablesTwoPkey() throws Throwable + { + testSplitAndRetryMutationCoordination(twoTableBatchInsert(true, PKEY_ACCORD, PKEY_NORMAL, 1), validateTwoTable(PKEY_NORMAL)); + } + + @Test + public void testSplitAndRetryNonSerialLoggedBatchTwoTablesTwoPkeyDeliverViaBatchLog() throws Throwable + { + testSplitAndRetryBatchlogDelivery(twoTableBatchInsert(true, PKEY_ACCORD, PKEY_NORMAL, 1), validateTwoTable(PKEY_NORMAL)); + } + + @Test + public void testSplitAndRetryNonSerialLoggedBatchTwoTablesTwoPkeyHintedViaBatchLogTimeout() throws Throwable + { + testSplitAndRetryHintDeliveryAfterBatchlogTimeout(twoTableBatchInsert(true, PKEY_ACCORD, PKEY_NORMAL, 1), validateTwoTable(PKEY_NORMAL)); + } + + @Test + public void testSplitAndRetryNonSerialLoggedBatchTwoTablesTwoPkeyHintedViaBatchLogRoutingFailure() throws Throwable + { + testSplitAndRetryHintDeliveryAfterBatchlogRoutingFailure(twoTableBatchInsert(true, PKEY_ACCORD, PKEY_NORMAL, 1), validateTwoTable(PKEY_NORMAL)); + } + + /* + * Test that a logged batch writing to a migrating table and a non-migrating table can + */ + @Test + public void testSplitAndRetryNonSerialLoggedBatchSingleTable() throws Throwable + { + testSplitAndRetryBatchlogDelivery(singleTableBatchInsert(true, PKEY_ACCORD, PKEY_NORMAL, 1), this::validateSingleTable); + } + + @Test + public void testSplitAndRetryNonSerialLoggedBatchSingleTableDeliverViaBatchLog() throws Throwable + { + testSplitAndRetryBatchlogDelivery(singleTableBatchInsert(true, PKEY_ACCORD, PKEY_NORMAL, 1), this::validateSingleTable); + } + + @Test + public void testSplitAndRetryNonSerialLoggedBatchSingleTableHintedViaBatchLogTimeout() throws Throwable + { + testSplitAndRetryHintDeliveryAfterBatchlogTimeout(singleTableBatchInsert(true, PKEY_ACCORD, PKEY_NORMAL, 1), this::validateSingleTable); + } + + @Test + public void testSplitAndRetryNonSerialLoggedBatchSingleTableHintedViaBatchLogRoutingFailure() throws Throwable + { + testSplitAndRetryHintDeliveryAfterBatchlogRoutingFailure(singleTableBatchInsert(true, PKEY_ACCORD, PKEY_NORMAL, 1), this::validateSingleTable); + } + + private void testSplitAndRetryMutationCoordination(String batchCQL, Consumer validation) throws Throwable + { + testSplitAndRetry(batchCQL, validation, MUTATION); + } + + private void testSplitAndRetryBatchlogDelivery(String batchCQL, Consumer validation) throws Throwable + { + testSplitAndRetry(batchCQL, validation, BATCHLOG_SUCCESSFUL_ROUTING); + } + + private void testSplitAndRetryHintDeliveryAfterBatchlogTimeout(String batchCQL, Consumer validation) throws Throwable + { + testSplitAndRetry(batchCQL, validation, BATCHLOG_FAILED_TIMEOUT_THEN_HINT); + } + + private void testSplitAndRetryHintDeliveryAfterBatchlogRoutingFailure(String batchCQL, Consumer validation) throws Throwable + { + testSplitAndRetry(batchCQL, validation, BATCHLOG_FAILED_ROUTING_THEN_HINT); + } + + private void testSplitAndRetryHintDelivery(String batchCQL, Consumer validation) throws Throwable + { + testSplitAndRetry(batchCQL, validation, HINT); + } + + private void validateSingleTable(Cluster cluster) + { + SimpleQueryResult expected = QueryResults.builder() + .columns("id", "c", "v") + .row(PKEY_NORMAL, 1, 1) + .row(PKEY_ACCORD, 1, 1) + .build(); + cluster.forEach(instance -> { + assertThat(instance.executeInternalWithResult("SELECT * FROM " + qualifiedAccordTableName)).isEqualTo(expected); + }); + } + + private Consumer validateTwoTable(int secondPkey) + { + return cluster -> { + SimpleQueryResult expectedAccord = QueryResults.builder() + .columns("id", "c", "v") + .row(PKEY_ACCORD, 1, 1) + .build(); + cluster.forEach(instance -> assertThat(instance.executeInternalWithResult("SELECT * FROM " + qualifiedAccordTableName)).isEqualTo(expectedAccord)); + + SimpleQueryResult expectedNormal = QueryResults.builder() + .columns("id", "c", "v") + .row(secondPkey, 1, 1) + .build(); + cluster.forEach(instance -> assertThat(instance.executeInternalWithResult("SELECT * FROM " + qualifiedRegularTableName)).isEqualTo(expectedNormal)); + }; + } + + /* + * Test if the coordinator is behind that the request can be re-split and routed to the correct systems + * without surfacing an error + */ + private void testSplitAndRetry(String batchCQL, Consumer validation, Scenario scenario) throws Throwable + { + test(createTables(TABLE_FMT, qualifiedRegularTableName, qualifiedAccordTableName), + cluster -> { + // Only enable these when testing it works from a specific instance + forEach(() -> BatchlogManager.instance.pauseReplay()); + forEach(() -> HintsService.instance.pauseDispatch()); + + // Node 3 is always the out of sync node + IInvokableInstance outOfSyncInstance = setUpOutOfSyncNode(cluster, scenario); + + // Need to be able to block writing to the test keyspace forcing batchlog replay + // without also failing writes to the batch log + if (scenario.initiallyBlockTestKeyspaceMutations) + cluster.filters().outbound().messagesMatching((from, to, message) -> { + if (message.verb() == Verb.MUTATION_REQ.id) + { + String keyspace = cluster.get(to).callsOnInstance(() -> ((Message)Instance.deserializeMessage(message)).payload.getKeyspaceName()).call(); + if (keyspace.equals(KEYSPACE)) + return true; + } + if (message.verb() == Verb.ACCORD_PRE_ACCEPT_REQ.id && !migrateAwayFromAccord) + { + boolean drop = cluster.get(to).callsOnInstance(() -> { + PreAccept preAccept = (PreAccept)Instance.deserializeMessage(message).payload; + Route route = preAccept.scope; + if (route.domain() == Domain.Key) + for (RoutingKey key : (KeyRoute)route) + { + TokenKey routingKey = (TokenKey)key; + ColumnFamilyStore cfs = ColumnFamilyStore.getIfExists(routingKey.table()); + if (cfs.getKeyspaceName().equals(KEYSPACE)) + return true; + } + return false; + }).call(); + if (drop) + return true; + } + return false; + }).drop(); + + forEach(() -> BatchlogManager.instance.pauseReplay()); + + // If testing batch log delivery the coordinator needs to be a node other than the node that is behind on + // topology updates so that the batch log writes (and thus replay) can be done on the node that is out of sync + int coordinatorIndex = scenario.initiallyEnableBatchlogReplay ? 2 : 3; + IInvokableInstance instance = cluster.get(coordinatorIndex); + ICoordinator coordinator = instance.coordinator(); + int startRetryCount = getWriteRetryOnDifferentSystemCount(coordinatorIndex); + // If testing routing at mutation coordination then Node 1 and 2 will both rejected the mutation because it is in a migrating range + int startRejectedCount = getMutationsRejectedOnWrongSystemCount(); + logger.info("Executing batch insert"); + Future resultFuture = coordinator.asyncExecuteWithResult(batchCQL, ALL); + + // Testing either batch log delivery or hint delivery via batchlog + if (scenario.initiallyBlockTestKeyspaceMutations) + { + // Expect initial write failure + expectException(() -> { + try + { + return resultFuture.get(); + } + catch (ExecutionException e) + { + throw (Exception) e.getCause(); + } + }, WriteTimeoutException.class); + } + + if (scenario.passesThroughBatchlog) + { + // At this stage we want the batch log to fail because it misrouted the queries to the wrong system + // not because it timed out not getting a response. We only did that with mutations as a quick + // way to populate the batch log. Could almost as easily have constructed the mutation and put it + // in the batch log directly + if (scenario == Scenario.BATCHLOG_FAILED_ROUTING_THEN_HINT || scenario == BATCHLOG_SUCCESSFUL_ROUTING) + cluster.filters().reset(); + + // We only want the batch log to have access to the correct topology if we are testing its + // ability to handle misrouted things + if (scenario == Scenario.BATCHLOG_SUCCESSFUL_ROUTING) + unpauseEnactment(outOfSyncInstance); + + // Unfortunately the batch won't be replayed until some time has passed because the starting time + // for replay is the current time - timeout + // Don't wait here for the batchlog if we need to spin on the creation of the Accord transaction + // and then unpause to test Accord routing failure + boolean unpauseAfterBatchLogCreatesTransaction = migrateAwayFromAccord && scenario == BATCHLOG_FAILED_ROUTING_THEN_HINT; + if (!unpauseAfterBatchLogCreatesTransaction) + Thread.sleep(BatchlogManager.BATCHLOG_REPLAY_TIMEOUT + DatabaseDescriptor.getWriteRpcTimeout(TimeUnit.MILLISECONDS)); + messageSink.reset(); + + // Force batch log delivery (or hint delivery) on the node that was out of sync, but should be in sync once we unpause + // This demonstrates it can split the mutation correctly or forward it to hinting if it fails + outOfSyncInstance.runOnInstance(() -> runUnchecked(() -> { + // We don't want hints for any reason that might apply the mutation and make the test look like it succeeded + assertTrue(HintsService.instance.isDispatchPaused()); + // The failed write will have written hints + HintsService.instance.deleteAllHintsUnsafe(); + assertFalse(hasPendingHints()); + BatchlogManager.instance.resumeReplay(); + + // Unpausing needs to be done async because it waits for the batch log replay + Promise unpaused = new AsyncPromise<>(); + if (unpauseAfterBatchLogCreatesTransaction) + { + logger.info("Creating thread to unpause after batchlog creates Accord transaction"); + new Thread(() -> { + try + { + // Unpause so it can route incorrectly instead of timing out waiting to fetch the epoch, need the transaction to be created first + // otherwise it will just be routed straight to non-Accord. + logger.info("Spinning waiting on a transaction"); + Util.spinUntilTrue(() -> { + Map> txns = AccordService.instance().node().coordinating(); + if (!txns.isEmpty()) + { + logger.info("Found txns {}", txns); + return true; + } + return false; + }, 20); + TestChangeListener.instance.unpause(); + unpaused.trySuccess(null); + } + catch (Throwable t) + { + unpaused.tryFailure(t); + } + }).start(); + } + else + { + // Force replay so mosts tests don't have to wait + BatchlogManager.instance.forceBatchlogReplay(); + unpaused.trySuccess(null); + } + // Fetch errors + unpaused.get(); + // Ensure the batch log did or didn't create pending hints depending on the test scenario + spinAssertEquals(scenario == BATCHLOG_FAILED_TIMEOUT_THEN_HINT || scenario == BATCHLOG_FAILED_ROUTING_THEN_HINT, () -> hasPendingHints(), 20); + })); + } + + // Mutation successfully applied from the coordinator after retrying scenario + if (scenario == MUTATION) + { + // Don't want to mistakenly have hints applying the mutation + forEach(() -> assertTrue(HintsService.instance.isDispatchPaused())); + // Check for the error differently depending on what system should be seeing an error + if (migrateAwayFromAccord) + { + // Accord will block until we unpause enactment so to test the routing we wait until the transaction + // has started so the epoch it is created in is the old one + Util.spinUntilTrue(() -> outOfSyncInstance.callOnInstance(() -> { + Map> coordinating = AccordService.instance().node().coordinating(); + if (!coordinating.isEmpty()) + logger.info("Accord coordinating: " + coordinating); + return !coordinating.isEmpty(); + }), 20); + try + { + validation.accept(cluster); + throw new AssertionError("Expected validation to fail"); + } + catch (AssertionError e) + { + //ignored + } + } + else + { + Stopwatch sw = Stopwatch.createStarted(); + spinAssertEquals(startRejectedCount + 2, 10, () -> getMutationsRejectedOnWrongSystemCount() - startRejectedCount); + logger.info("Took {}ms to get mutations rejected on wrong system", sw.elapsed(TimeUnit.MILLISECONDS)); + } + + logger.info("Unpausing out of sync instance"); + // Testing regular mutation coordination retry loop let coordinator get up to date and retry + unpauseEnactment(outOfSyncInstance); + + try + { + resultFuture.get(); + } + catch (ExecutionException e) + { + // This is expected when inverting the migration + if (migrateAwayFromAccord && e.getCause() instanceof CoordinatorBehindException) + throw e; + throw e; + } + + if (!migrateAwayFromAccord) + { + int endRetryCount = getWriteRetryOnDifferentSystemCount(coordinatorIndex); + int endRejectedCount = getMutationsRejectedOnWrongSystemCount(); + assertEquals(1, endRetryCount - startRetryCount); + // Expect only two nodes to reject since they enacted the new epoch + assertEquals(2, endRejectedCount - startRejectedCount); + } + } + + // Anything related to making sure hints are delivered goes here + if (scenario.deliversViaHint) + { + // Don't want to mistakenly have hints applying the mutation before we enable it on just one instance + forEach(() -> assertTrue(HintsService.instance.isDispatchPaused())); + // The filters wouldn't have been reset yet if they were needed to make the batchlog or original mutation time out + // Need to reset so Hints can use Accord txns + cluster.filters().reset(); + long startingAccordTimeouts = outOfSyncInstance.callOnInstance(() -> ClientRequestsMetricsHolder.accordWriteMetrics.timeouts.getCount()); + long startingAccordPreempted = outOfSyncInstance.callOnInstance(() -> ClientRequestsMetricsHolder.accordWriteMetrics.preempted.getCount()); + long startingAccordMigrationRejects = outOfSyncInstance.callOnInstance(() -> ClientRequestsMetricsHolder.accordWriteMetrics.accordMigrationRejects.getCount()); + long startingHintTimeouts = outOfSyncInstance.callOnInstance(() -> HintsServiceMetrics.hintsTimedOut.getCount()); + long startingTopologyMismatches = outOfSyncInstance.callOnInstance(() -> ClientRequestsMetricsHolder.accordWriteMetrics.topologyMismatches.getCount()); + outOfSyncInstance.runOnInstance(() -> HintsService.instance.resumeDispatch()); + // The initial hinting attempt should fail, unless it's a batchlog routing failure in which + // case the coordinator has already caught up so the hint will succeed on the first try + // Can only really have this case for BATCHLOG_FAILED_TIMEOUT_THEN_HINT because Accord timeouts don't + // write hints so there is nothing to test + if (migrateAwayFromAccord && scenario == BATCHLOG_FAILED_TIMEOUT_THEN_HINT) + { + Callable test = () -> outOfSyncInstance.callOnInstance(() -> { + HintsService.instance.flushAndFsyncBlockingly(); + AccordClientRequestMetrics accordMetrics = ClientRequestsMetricsHolder.accordWriteMetrics; + logger.info("startingAccordTimeouts {}, startingAccordPreempts {}, startingAccordMigrationRejects {}, startingHintTimeouts {}, startingTopoloygMismatches {}, accord timeouts {}, accordPreempts {}, accordMigrationRejects {}, hint timeouts {}, topologyMismatches {}", startingAccordTimeouts, startingAccordPreempted, startingAccordMigrationRejects, startingHintTimeouts, startingTopologyMismatches, accordMetrics.timeouts.getCount(), accordMetrics.preempted.getCount(), accordMetrics.accordMigrationRejects.getCount(), HintsServiceMetrics.hintsTimedOut.getCount(), accordMetrics.topologyMismatches.getCount()); + return accordMetrics.timeouts.getCount() >= (startingAccordTimeouts + 1) && HintsServiceMetrics.hintsTimedOut.getCount() >= (startingHintTimeouts + 1); + }); + Util.spinUntilTrue(test, 40); + } + else if (!migrateAwayFromAccord) + { + // Expect two retry on different system responses when migrating from Paxos to Accord, one from each + // node that knows it is on the wrong system + Util.spinUntilTrue(() -> + { + outOfSyncInstance.runOnInstance(() -> HintsService.instance.flushAndFsyncBlockingly()); + return messageSink.messages.stream().filter(p -> { + if (p.right.verb() != Verb.FAILURE_RSP.id) + return false; + if (!p.left.equals(outOfSyncInstance.broadcastAddress())) + return false; + RequestFailureReason reason = ((RequestFailure) Instance.deserializeMessage(p.right).payload).reason; + if (reason == RETRY_ON_DIFFERENT_TRANSACTION_SYSTEM) + return true; + return false; + }).count() == 2; + }, 20); + } + // After this hints should deliver and the final validation should succeed + // if we unpause enactment + unpauseEnactment(outOfSyncInstance); + long currentEpoch = SHARED_CLUSTER.get(1).callOnInstance(() -> ClusterMetadata.current().epoch.getEpoch()); + logger.info("Spinning waiting for out of sync instance to catch up"); + Util.spinUntilTrue(() -> outOfSyncInstance.callOnInstance(() -> ClusterMetadata.current().epoch.getEpoch() == currentEpoch)); + logger.info("Out of sync instance caught up"); + } + + // Accord commit is async and might take a while, but the data should end up as expected + Util.spinUntilSuccess(() -> validation.accept(cluster), 20); + }); + } + + /* + * Set up 3 to be behind and unaware of the migration while 1 and 2 are aware + */ + private IInvokableInstance setUpOutOfSyncNode(Cluster cluster, Scenario scenario) throws Throwable + { + IInvokableInstance i1 = cluster.get(1); + IInvokableInstance i3 = cluster.get(3); + long afterAlterEpoch = getNextEpoch(i1).getEpoch(); + alterTableTransactionalMode(TransactionalMode.full); + Util.spinUntilTrue(() -> cluster.stream().allMatch(instance -> instance.callOnInstance(() -> ClusterMetadata.current().epoch.equals(Epoch.create(afterAlterEpoch)))), 10); + + long migratingEpoch = getNextEpoch(i1).getEpoch(); + logger.info("Epoch for migrating to Accord is {}", migratingEpoch); + // Node 3 will coordinate the query and not be aware that the migration has begun + Callable pausedBeforeEnacting = pauseBeforeEnacting(i3, migratingEpoch); + ListenableFuture result = nodetoolAsync(coordinator, "consensus_admin", "begin-migration", "-st", midToken.toString(), "-et", maxToken.toString(), KEYSPACE, accordTableName); + // Node 2 coordinates in the batch log case so it has to have caught up + long afterBeginMigrationEpochFinal = migratingEpoch; + Util.spinUntilTrue(() -> cluster.get(2).callOnInstance(() -> ClusterMetadata.current().epoch.equals(Epoch.create(afterBeginMigrationEpochFinal))), 10); + + if (migrateAwayFromAccord) + { + pausedBeforeEnacting.call(); + unpauseEnactment(i3); + result.get(); + long migratingEpochFinal = migratingEpoch; + Util.spinUntilTrue(() -> cluster.stream().allMatch(instance -> instance.callOnInstance(() -> ClusterMetadata.current().epoch.equals(Epoch.create(migratingEpochFinal)))), 10); + migratingEpoch = getNextEpoch(i1).getEpoch(); + logger.info("Epoch for migration away from Accord is {}", migratingEpoch); + pausedBeforeEnacting = pauseBeforeEnacting(i3, migratingEpoch); + // In the reverse direction doing the alter automatically reverses the migration without a need to call begin migration on any ranges + result = alterTableTransactionalModeAsync(TransactionalMode.off); + } + + // Wait for everyone to get to where they are supposed to be + try + { + pausedBeforeEnacting.call(); + } + catch (Throwable t) + { + if (result.isDone()) + { + try + { + result.get(); + } + catch (ExecutionException e) + { + t.addSuppressed(e); + throw t; + } + } + throw t; + } + // Make sure 1 and 2 are up to date + for (int i = 1; i < 3; i++) + { + int instanceIndex = i; + long migratingEpochFinal = migratingEpoch; + Util.spinUntilTrue(() -> cluster.get(instanceIndex).callOnInstance(() -> ClusterMetadata.current().epoch.equals(Epoch.create(migratingEpochFinal))), 10); + } + // nodetool should be able to complete now + result.get(); + + // Need to complete the migration for its eventual execution in the next epoch to be discovered to be misrouted + // now that we continue to write through Accord during migration away from Accord + // Faking the completed repair is the only way to get it in a state where two coordinators know about the new + // epoch and one doesn't + if (migrateAwayFromAccord && scenario.deliversViaHint && scenario.passesThroughBatchlog) //&& scenario != BATCHLOG_FAILED_ROUTING_THEN_HINT) + { + String keyspace = KEYSPACE; + String table = accordTableName; + long midTokenLong = midToken.getLongValue(); + long maxTokenLong = maxToken.getLongValue(); + long afterReverseMigrationEpoch = SHARED_CLUSTER.get(1).callOnInstance(() -> + { + Epoch startEpoch = ClusterMetadata.current().epoch; + Epoch epochAfterRepair = startEpoch.nextEpoch(); + TableId tableId = Schema.instance.getTableMetadata(keyspace, table).id; + List> ranges = ImmutableList.of(new Range<>(new LongToken(midTokenLong), new LongToken(maxTokenLong))); + RepairJobDesc desc = new RepairJobDesc(null, null, keyspace, table, ranges); + TokenRange range = TokenRange.create(new TokenKey(tableId, new LongToken(midTokenLong)), new TokenKey(tableId, new LongToken(maxTokenLong))); + Ranges accordRanges = Ranges.of(range); + ConsensusMigrationRepairResult repairResult = ConsensusMigrationRepairResult.fromRepair(startEpoch, accordRanges, true, true, true, false); + ConsensusTableMigration.completedRepairJobHandler.onSuccess(new RepairResult(desc, null, repairResult)); + return epochAfterRepair.getEpoch(); + }); + // Make sure 1 and 2 are up to date and know the reverse migration happens + for (int i = 1; i < 3; i++) + { + int instanceIndex = i; + Util.spinUntilTrue(() -> cluster.get(instanceIndex).callOnInstance(() -> ClusterMetadata.current().epoch.equals(Epoch.create(afterReverseMigrationEpoch))), 10); + } + } + + return i3; + } + + private String twoTableBatchInsert(boolean logged, int pkey1, int pkey2, int value) + { + return batch(logged, + insertCQL(qualifiedAccordTableName, pkey1, value), + insertCQL(qualifiedRegularTableName, pkey2, value)); + } + + private String singleTableBatchInsert(boolean logged, int pkey1, int pkey2, int value) + { + return batch(logged, + insertCQL(qualifiedAccordTableName, pkey1, value), + insertCQL(qualifiedAccordTableName, pkey2, value)); + } + + private static String insertCQL(String qualifiedTableName, int pkey, int value) + { + return format("INSERT INTO %s ( id, c, v ) VALUES ( %d, %d, %d )", qualifiedTableName, pkey, CLUSTERING_VALUE, value); + } +} diff --git a/test/distributed/org/apache/cassandra/distributed/test/accord/AccordNodetoolTest.java b/test/distributed/org/apache/cassandra/distributed/test/accord/AccordNodetoolTest.java new file mode 100644 index 000000000000..9af94b783dd1 --- /dev/null +++ b/test/distributed/org/apache/cassandra/distributed/test/accord/AccordNodetoolTest.java @@ -0,0 +1,99 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.distributed.test.accord; + +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; + +import com.google.common.collect.ImmutableSet; +import org.junit.Test; + +import accord.local.Node; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.distributed.Cluster; +import org.apache.cassandra.distributed.api.Feature; +import org.apache.cassandra.distributed.test.TestBaseImpl; + + +import static org.junit.Assert.assertEquals; +import static org.apache.cassandra.distributed.shared.ClusterUtils.getNodeId; + +public class AccordNodetoolTest extends TestBaseImpl +{ + @Test + public void testMarkSingleNode() throws Throwable + { + try (Cluster cluster = init(builder().withNodes(3).withConfig((config) -> config.with(Feature.NETWORK, Feature.GOSSIP)).start())) + { + cluster.get(1).nodetoolResult("accord", "mark_stale", "1").asserts().success(); + cluster.get(1).runOnInstance(() -> assertEquals(ImmutableSet.of(new Node.Id(1)), ClusterMetadata.current().accordStaleReplicas.ids())); + cluster.get(1).nodetoolResult("accord", "describe").asserts().stdoutContains("Stale Replicas: 1"); + + // Reject the operation if the target node is already stale: + cluster.get(1).nodetoolResult("accord", "mark_stale", "1").asserts().failure().errorContains("it already is"); + + // Reject the operation if marking the node stale brings us below a quorum of non-stale nodes: + cluster.get(1).nodetoolResult("accord", "mark_stale", "2").asserts().failure().errorContains("that would leave fewer than a quorum"); + + // Reject the operation if the target node doesn't exist: + cluster.get(1).nodetoolResult("accord", "mark_stale", "4").asserts().failure().errorContains("not present in the directory"); + + cluster.get(1).nodetoolResult("accord", "mark_rejoining", "1").asserts().success(); + cluster.get(1).runOnInstance(() -> assertEquals(Collections.emptySet(), ClusterMetadata.current().accordStaleReplicas.ids())); + + cluster.get(1).nodetoolResult("accord", "mark_rejoining", "1").asserts().failure().errorContains("it is not stale"); + cluster.get(1).nodetoolResult("accord", "mark_rejoining", "4").asserts().failure().errorContains("not present in the directory"); + } + } + + @Test + public void testMarkMultipleNodes() throws Throwable + { + try (Cluster cluster = init(builder().withNodes(5).withConfig((config) -> config.with(Feature.NETWORK, Feature.GOSSIP)).start())) + { + // Reject the operation if marking the node stale brings us below a quorum of non-stale nodes: + cluster.get(1).nodetoolResult("accord", "mark_stale", "1", "2", "3").asserts().failure().errorContains("that would leave fewer than a quorum"); + + cluster.get(1).nodetoolResult("accord", "mark_stale", "1", "2").asserts().success(); + cluster.get(1).runOnInstance(() -> assertEquals(ImmutableSet.of(new Node.Id(1), new Node.Id(2)), ClusterMetadata.current().accordStaleReplicas.ids())); + cluster.get(1).nodetoolResult("accord", "describe").asserts().stdoutContains("Stale Replicas: 1,2"); + + // Reject the operation if a target node is already stale: + cluster.get(1).nodetoolResult("accord", "mark_stale", "1", "2").asserts().failure().errorContains("it already is"); + + // Reject the operation if a target node doesn't exist: + cluster.get(1).nodetoolResult("accord", "mark_stale", "4", "6").asserts().failure().errorContains("not present in the directory"); + + Map nodeIdToNode = new HashMap<>(); + for (int i = 1; i <= 5; i++) + nodeIdToNode.put(getNodeId(cluster.get(i)).id(), i); + + // Remove the second stale node, and ensure the set of stale replicas is updated: + cluster.get(nodeIdToNode.get(2)).shutdown().get(); + cluster.get(1).nodetoolResult("removenode", "2", "--force").asserts().success(); + cluster.get(1).nodetoolResult("cms", "unregister", "2").asserts().success(); + cluster.get(1).runOnInstance(() -> assertEquals(ImmutableSet.of(new Node.Id(1)), ClusterMetadata.current().accordStaleReplicas.ids())); + + cluster.get(1).nodetoolResult("accord", "mark_rejoining", "1", "3").asserts().failure().errorContains("it is not stale"); + cluster.get(1).nodetoolResult("accord", "mark_rejoining", "1", "6").asserts().failure().errorContains("not present in the directory"); + cluster.get(1).runOnInstance(() -> assertEquals(ImmutableSet.of(new Node.Id(1)), ClusterMetadata.current().accordStaleReplicas.ids())); + } + } +} diff --git a/test/distributed/org/apache/cassandra/distributed/test/accord/AccordProgressLogTest.java b/test/distributed/org/apache/cassandra/distributed/test/accord/AccordProgressLogTest.java new file mode 100644 index 000000000000..6bbbce6a52ed --- /dev/null +++ b/test/distributed/org/apache/cassandra/distributed/test/accord/AccordProgressLogTest.java @@ -0,0 +1,127 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.distributed.test.accord; + +import java.util.concurrent.Semaphore; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicLong; + +import org.junit.Assert; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.cassandra.distributed.Cluster; +import org.apache.cassandra.distributed.api.ConsistencyLevel; +import org.apache.cassandra.distributed.api.Feature; +import org.apache.cassandra.distributed.api.IMessageFilters; +import org.apache.cassandra.distributed.test.TestBaseImpl; +import org.apache.cassandra.net.Verb; +import org.apache.cassandra.service.consensus.TransactionalMode; + +public class AccordProgressLogTest extends TestBaseImpl +{ + private static final Logger logger = LoggerFactory.getLogger(AccordProgressLogTest.class); + + @Test + public void testRecoveryTimeWindow() throws Throwable + { + try (Cluster cluster = init(Cluster.build(2) + .withoutVNodes() + .withConfig(c -> c.with(Feature.NETWORK) + .set("accord.enabled", "true") + .set("accord.recover_txn", "1s")) + .start())) + { + cluster.schemaChange("CREATE KEYSPACE ks WITH replication={'class':'SimpleStrategy', 'replication_factor': 3}"); + cluster.schemaChange("CREATE TABLE ks.tbl (k int, c int, v int, primary key (k, c)) WITH " + TransactionalMode.full.asCqlParam()); + String query = "BEGIN TRANSACTION\n" + + " INSERT INTO ks.tbl (k, c) VALUES (0, 0);\n" + + "COMMIT TRANSACTION"; + + IMessageFilters.Filter dropPreAccept = cluster.filters().outbound().from(1).to(2).verbs(Verb.ACCORD_PRE_ACCEPT_REQ.id).drop(); + AtomicLong recoveryStartedAt = new AtomicLong(); + Semaphore waitForRecovery = new Semaphore(0); + IMessageFilters.Filter recovery = cluster.filters().outbound().messagesMatching((from, to, message) -> { + if (message.verb() == Verb.ACCORD_BEGIN_RECOVER_RSP.id) + { + recoveryStartedAt.compareAndSet(0, System.nanoTime()); + waitForRecovery.release(); + } + return false; + }).drop(); + + long coordinationStartedAt = System.nanoTime(); + boolean failed = false; + try { cluster.coordinator(1).executeWithResult(query, ConsistencyLevel.ANY); } + catch (Throwable e) { failed = true; } + Assert.assertTrue(failed); + + waitForRecovery.acquire(); + long timeDeltaMillis = TimeUnit.NANOSECONDS.toMillis(recoveryStartedAt.get() - coordinationStartedAt); + Assert.assertTrue("Recovery started in " + timeDeltaMillis + "ms", timeDeltaMillis >= 1000); + Assert.assertTrue("Recovery started in " + timeDeltaMillis + "ms", timeDeltaMillis <= 5000); + } + } + + @Test + public void testFetchTimeWindow() throws Throwable + { + try (Cluster cluster = init(Cluster.build(2) + .withoutVNodes() + .withConfig(c -> c.with(Feature.NETWORK) + .set("accord.enabled", "true")) + .start())) + { + cluster.schemaChange("CREATE KEYSPACE ks WITH replication={'class':'SimpleStrategy', 'replication_factor': 3}"); + cluster.schemaChange("CREATE TABLE ks.tbl (k int, c int, v int, primary key (k, c)) WITH " + TransactionalMode.full.asCqlParam()); + String query = "BEGIN TRANSACTION\n" + + " INSERT INTO ks.tbl (k, c) VALUES (0, 0);\n" + + "COMMIT TRANSACTION"; + + IMessageFilters.Filter dropApply = cluster.filters().outbound().from(1).verbs(Verb.ACCORD_APPLY_REQ.id).drop(); + AtomicLong fetchStartedAt = new AtomicLong(); + Semaphore waitForFetch = new Semaphore(0); + IMessageFilters.Filter fetch = cluster.filters().outbound().messagesMatching((from, to, message) -> { + if (message.verb() == Verb.ACCORD_AWAIT_REQ.id) + { + fetchStartedAt.compareAndSet(0, System.nanoTime()); + waitForFetch.release(); + } + return false; + }).drop(); + + long coordinationStartedAt = System.nanoTime(); + try + { + cluster.coordinator(1).executeWithResult(query, ConsistencyLevel.ANY); + } + catch (Throwable e) + { + } + + waitForFetch.acquire(); + logger.info("Coordinated at {}", coordinationStartedAt); + logger.info("Awaited at {}", fetchStartedAt.get()); + long timeDeltaMillis = TimeUnit.NANOSECONDS.toMillis(fetchStartedAt.get() - coordinationStartedAt); + Assert.assertTrue("Fetch started in " + timeDeltaMillis + "ms", timeDeltaMillis >= 100); + Assert.assertTrue("Fetch started in " + timeDeltaMillis + "ms", timeDeltaMillis <= 2000); + } + } +} diff --git a/test/distributed/org/apache/cassandra/distributed/test/accord/AccordReadInteroperabilityTest.java b/test/distributed/org/apache/cassandra/distributed/test/accord/AccordReadInteroperabilityTest.java new file mode 100644 index 000000000000..a66c79ad8859 --- /dev/null +++ b/test/distributed/org/apache/cassandra/distributed/test/accord/AccordReadInteroperabilityTest.java @@ -0,0 +1,180 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.distributed.test.accord; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; +import javax.annotation.Nonnull; + +import org.junit.After; +import org.junit.BeforeClass; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.cassandra.distributed.api.ConsistencyLevel; +import org.apache.cassandra.net.Verb; +import org.apache.cassandra.service.consensus.TransactionalMode; +import org.apache.cassandra.service.consensus.migration.TransactionalMigrationFromMode; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +@RunWith(Parameterized.class) +public class AccordReadInteroperabilityTest extends AccordTestBase +{ + private static final Logger logger = LoggerFactory.getLogger(AccordInteroperabilityTest.class); + + @Nonnull + private final TransactionalMode mode; + + private final boolean migrated; + + public AccordReadInteroperabilityTest(@Nonnull TransactionalMode mode, boolean migrated) + { + this.mode = mode; + this.migrated = migrated; + } + + @Parameterized.Parameters(name = "transactionalMode={0}, migrated={1}") + public static Collection data() { + List tests = new ArrayList<>(TransactionalMode.values().length * 2); + for (TransactionalMode mode : TransactionalMode.values()) + { + tests.add(new Object[]{ mode, true }); + tests.add(new Object[]{ mode, false }); + } + return tests; + } + + @Override + protected Logger logger() + { + return logger; + } + + @BeforeClass + public static void setupClass() throws IOException + { + AccordTestBase.setupCluster(builder -> builder.withConfig(config -> config.set("accord.range_migration", "auto") + .set("paxos_variant", "v2")), + 3); + } + + @After + public void tearDown() + { + SHARED_CLUSTER.setMessageSink(null); + } + + + private String testTransactionSelect() + { + return "BEGIN TRANSACTION\n" + + " SELECT * FROM " + qualifiedAccordTableName + " WHERE k = 0;\n" + + "COMMIT TRANSACTION"; + } + + private String testSelect() + { + return "SELECT * FROM " + qualifiedAccordTableName + " WHERE k = 0"; + } + + private String testRangeSelect() + { + return "SELECT * FROM " + qualifiedAccordTableName + " WHERE token(k) > token(0)"; + } + + @Test + public void testTransactionStatementReadIsAtQuorum() throws Throwable + { + testReadIsAtQuorum(testTransactionSelect()); + } + + @Test + public void testNonSerialReadIsAtQuorum() throws Throwable + { + testReadIsAtQuorum(testSelect()); + } + + @Test + public void testSerialReadIsAtQuorum() throws Throwable + { + testReadIsAtQuorum(testSelect(), ConsistencyLevel.SERIAL); + } + + @Test + public void testRangeReadIsAtQuorum() throws Throwable + { + testReadIsAtQuorum(testRangeSelect()); + } + + private void testReadIsAtQuorum(String query) throws Throwable + { + testReadIsAtQuorum(query, ConsistencyLevel.QUORUM); + } + + private void testReadIsAtQuorum(String query, org.apache.cassandra.distributed.api.ConsistencyLevel cl) throws Throwable + { + // Transaction statement doesn't work during migration + if (query.equals(testTransactionSelect()) && !migrated) + return; + test("CREATE TABLE " + qualifiedAccordTableName + " (k int, c int, v int, PRIMARY KEY(k, c))" + (migrated ? " WITH " + transactionalMode.asCqlParam() : ""), + cluster -> { + SHARED_CLUSTER.setMessageSink(new MessageCountingSink(SHARED_CLUSTER)); + if (!migrated) + { + String alterCQL = "ALTER TABLE " + qualifiedAccordTableName + " WITH " + transactionalMode.asCqlParam(); + if (transactionalMode == TransactionalMode.off) + alterCQL = alterCQL + " AND " + TransactionalMigrationFromMode.full.asCqlParam(); + cluster.coordinator(1).execute(alterCQL, ConsistencyLevel.ALL); + if (transactionalMode == TransactionalMode.off) + { + nodetool(cluster.coordinator(1), "repair", "-skip-paxos", KEYSPACE, accordTableName); + } + else + { + nodetool(cluster.coordinator(1), "repair", "-skip-paxos", "-skip-accord", KEYSPACE, accordTableName); + nodetool(cluster.coordinator(1), "repair", "-skip-accord", KEYSPACE, accordTableName); + } + } + cluster.coordinator(1).execute(query, cl); + // Transactional modes that write through Accord never have a point where they need to run interop reads + // they go straight from not being able to read to being able to read from a single replica + if (!transactionalMode.ignoresSuppliedReadCL() && !transactionalMode.nonSerialWritesThroughAccord) + { + assertEquals(2, messageCount(Verb.ACCORD_INTEROP_STABLE_THEN_READ_REQ)); + assertEquals(2, messageCount(Verb.ACCORD_INTEROP_READ_RSP)); + } + else + { + // Tricky to check for regular commit because a lot of background Accord things create commits + assertEquals(0, messageCount(Verb.ACCORD_INTEROP_STABLE_THEN_READ_REQ)); + assertEquals(0, messageCount(Verb.ACCORD_INTEROP_READ_REQ)); + assertEquals(0, messageCount(Verb.ACCORD_INTEROP_READ_RSP)); + // Durability scheduling creates a lot of background commits that generate read responses + assertTrue(messageCount(Verb.ACCORD_READ_RSP) > 0); + } + }); + } +} diff --git a/test/distributed/org/apache/cassandra/distributed/test/accord/AccordSimpleFastPathTest.java b/test/distributed/org/apache/cassandra/distributed/test/accord/AccordSimpleFastPathTest.java new file mode 100644 index 000000000000..cf024f43f806 --- /dev/null +++ b/test/distributed/org/apache/cassandra/distributed/test/accord/AccordSimpleFastPathTest.java @@ -0,0 +1,156 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.distributed.test.accord; + +import java.net.UnknownHostException; +import java.util.Comparator; +import java.util.HashSet; +import java.util.Set; + +import org.apache.cassandra.service.accord.AccordFastPath; +import org.apache.cassandra.tcm.ClusterMetadataService; +import org.apache.cassandra.tcm.Epoch; +import org.apache.cassandra.utils.FBUtilities; +import org.apache.cassandra.service.consensus.TransactionalMode; +import org.junit.Assert; +import org.junit.Ignore; +import org.junit.Test; + +import accord.local.Node; +import accord.topology.Topology; +import org.apache.cassandra.distributed.Cluster; +import org.apache.cassandra.distributed.api.ConsistencyLevel; +import org.apache.cassandra.distributed.api.Feature; +import org.apache.cassandra.distributed.test.TestBaseImpl; +import org.apache.cassandra.gms.FailureDetector; +import org.apache.cassandra.locator.InetAddressAndPort; +import org.apache.cassandra.service.accord.AccordConfigurationService; +import org.apache.cassandra.service.accord.AccordService; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class AccordSimpleFastPathTest extends TestBaseImpl +{ + private static final Logger logger = LoggerFactory.getLogger(AccordSimpleFastPathTest.class); + + private static Node.Id id(int i) + { + return new Node.Id(i); + } + + private static Set idSet(int... ids) + { + Set result = new HashSet<>(); + for (int id: ids) + result.add(id(id)); + return result; + } + + private static InetAddressAndPort ep(int i) + { + try + { + return InetAddressAndPort.getByName(String.format("127.0.0.%s:7012", i)); + } + catch (UnknownHostException e) + { + throw new RuntimeException(e); + } + } + + private static Set epSet(int... eps) + { + Set result = new HashSet<>(); + for (int ep: eps) + result.add(ep(ep)); + return result; + } + + @Ignore + @Test + public void downNodesRemovedFromFastPath() throws Throwable + { + try (Cluster cluster = init(Cluster.build(3) + .withoutVNodes() + .withConfig(c -> c.with(Feature.NETWORK).set("accord.enabled", "true")) + .start())) + { + cluster.schemaChange("CREATE KEYSPACE ks WITH replication={'class':'SimpleStrategy', 'replication_factor': 3}"); + cluster.schemaChange("CREATE TABLE ks.tbl (k int, c int, v int, primary key (k, c)) WITH " + TransactionalMode.full.asCqlParam()); + String query = "BEGIN TRANSACTION\n" + + " SELECT * FROM ks.tbl WHERE k=0 AND c=0;\n" + + "COMMIT TRANSACTION"; + cluster.coordinator(1).executeWithResult(query, ConsistencyLevel.ANY); + + InetAddressAndPort node1Addr = InetAddressAndPort.getByAddress(cluster.get(1).broadcastAddress()); + InetAddressAndPort node2Addr = InetAddressAndPort.getByAddress(cluster.get(2).broadcastAddress()); + InetAddressAndPort node3Addr = InetAddressAndPort.getByAddress(cluster.get(3).broadcastAddress()); + int node3Id = cluster.get(3).callOnInstance(() -> ClusterMetadata.current().directory.peerId(FBUtilities.getBroadcastAddressAndPort()).id()); + long preShutDownEpoch = cluster.stream().map(ii -> ii.callOnInstance(() -> { + ClusterMetadata cm = ClusterMetadata.current(); + AccordFastPath accordFastPath = cm.accordFastPath; + Assert.assertEquals(idSet(), accordFastPath.unavailableIds()); + + long epoch = cm.epoch.getEpoch(); + AccordConfigurationService configService = ((AccordService) AccordService.instance()).configService(); + Topology topology = configService.getTopologyForEpoch(epoch); + Assert.assertFalse(topology.shards().isEmpty()); + topology.shards().forEach(shard -> Assert.assertEquals(idSet(1, 2, 3), shard.nodes.without(shard.notInFastPath))); + return cm.epoch.getEpoch(); + })).max(Comparator.naturalOrder()).get(); + + cluster.get(1).runOnInstance(() -> { + FailureDetector.instance.forceConviction(InetAddressAndPort.getByAddress(node3Addr)); + // update is performed in another thread, wait for it to be applied locally before returning + for (int i=0; i<10; i++) + { + if (ClusterMetadata.current().epoch.getEpoch() == preShutDownEpoch) + FBUtilities.sleepQuietly(100); + else + break; + } + assert ClusterMetadata.current().epoch.getEpoch() > preShutDownEpoch; + }); + + cluster.get(1, 2).forEach(ii -> { + logger.info("Checking instance {} -> {}", ii, ii.broadcastAddress()); + ii.runOnInstance(() -> { + ClusterMetadataService.instance().fetchLogFromCMS(Epoch.create(preShutDownEpoch + 1)); + ClusterMetadata cm = ClusterMetadata.current(); + AccordFastPath accordFastPath = cm.accordFastPath; + Assert.assertEquals(preShutDownEpoch + 1, cm.epoch.getEpoch()); + Assert.assertEquals(idSet(node3Id), accordFastPath.unavailableIds()); + }); + + } + ); + + // confirm a duplicate conviction doesn't create a new epoch + cluster.get(2).runOnInstance(() -> { + FailureDetector.instance.forceConviction(InetAddressAndPort.getByAddress(node3Addr)); + }); + + cluster.get(1, 2).forEach(ii -> ii.runOnInstance(() -> { + ClusterMetadata cm = ClusterMetadata.current(); + Assert.assertEquals(preShutDownEpoch + 1, cm.epoch.getEpoch()); + })); + } + } +} diff --git a/test/distributed/org/apache/cassandra/distributed/test/accord/AccordTestBase.java b/test/distributed/org/apache/cassandra/distributed/test/accord/AccordTestBase.java new file mode 100644 index 000000000000..49fb52f1007c --- /dev/null +++ b/test/distributed/org/apache/cassandra/distributed/test/accord/AccordTestBase.java @@ -0,0 +1,701 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.distributed.test.accord; + +import java.io.IOException; +import java.lang.reflect.Method; +import java.nio.ByteBuffer; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.UUID; +import java.util.concurrent.Callable; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.function.Function; +import java.util.stream.Collectors; +import java.util.stream.Stream; +import java.util.stream.StreamSupport; +import javax.annotation.Nullable; + +import com.google.common.base.Splitter; +import com.google.common.collect.ImmutableList; +import com.google.common.primitives.Ints; +import org.junit.After; +import org.junit.AfterClass; +import org.junit.Assert; +import org.junit.Before; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import accord.api.RoutingKey; +import accord.coordinate.Invalidated; +import accord.impl.progresslog.DefaultProgressLogs; +import accord.messages.PreAccept; +import accord.primitives.KeyRoute; +import accord.primitives.Routable.Domain; +import accord.primitives.Route; +import accord.primitives.TxnId; +import net.bytebuddy.ByteBuddy; +import net.bytebuddy.dynamic.loading.ClassLoadingStrategy; +import net.bytebuddy.implementation.MethodDelegation; +import net.bytebuddy.implementation.bind.annotation.SuperCall; +import net.bytebuddy.implementation.bind.annotation.This; +import org.apache.cassandra.Util; +import org.apache.cassandra.batchlog.BatchlogManager; +import org.apache.cassandra.cql3.CQLStatement; +import org.apache.cassandra.cql3.QueryProcessor; +import org.apache.cassandra.cql3.statements.ModificationStatement; +import org.apache.cassandra.cql3.statements.TransactionStatement; +import org.apache.cassandra.cql3.transactions.ReferenceValue; +import org.apache.cassandra.db.ColumnFamilyStore; +import org.apache.cassandra.db.Mutation; +import org.apache.cassandra.db.SystemKeyspace; +import org.apache.cassandra.dht.Murmur3Partitioner; +import org.apache.cassandra.distributed.Cluster; +import org.apache.cassandra.distributed.Cluster.Builder; +import org.apache.cassandra.distributed.api.ConsistencyLevel; +import org.apache.cassandra.distributed.api.Feature; +import org.apache.cassandra.distributed.api.IInstance; +import org.apache.cassandra.distributed.api.IInvokableInstance; +import org.apache.cassandra.distributed.api.IIsolatedExecutor.SerializableRunnable; +import org.apache.cassandra.distributed.api.QueryResults; +import org.apache.cassandra.distributed.api.SimpleQueryResult; +import org.apache.cassandra.distributed.impl.Instance; +import org.apache.cassandra.distributed.shared.AssertUtils; +import org.apache.cassandra.distributed.shared.ClusterUtils; +import org.apache.cassandra.distributed.shared.Metrics; +import org.apache.cassandra.distributed.test.TestBaseImpl; +import org.apache.cassandra.distributed.util.QueryResultUtil; +import org.apache.cassandra.exceptions.ReadTimeoutException; +import org.apache.cassandra.exceptions.WriteTimeoutException; +import org.apache.cassandra.hints.HintsService; +import org.apache.cassandra.io.util.DataInputBuffer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputBuffer; +import org.apache.cassandra.net.Message; +import org.apache.cassandra.net.Verb; +import org.apache.cassandra.schema.Schema; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.service.ClientState; +import org.apache.cassandra.service.accord.AccordService; +import org.apache.cassandra.service.accord.api.TokenKey; +import org.apache.cassandra.service.accord.exceptions.AccordReadPreemptedException; +import org.apache.cassandra.service.accord.exceptions.AccordWritePreemptedException; +import org.apache.cassandra.service.consensus.TransactionalMode; +import org.apache.cassandra.service.consensus.migration.ConsensusMigrationState; +import org.apache.cassandra.service.consensus.migration.TransactionalMigrationFromMode; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.tcm.serialization.Version; +import org.apache.cassandra.utils.AssertionUtils; +import org.apache.cassandra.utils.FailingConsumer; + +import static com.google.common.base.Preconditions.checkState; +import static java.lang.String.format; +import static net.bytebuddy.matcher.ElementMatchers.named; +import static org.apache.cassandra.db.SystemKeyspace.CONSENSUS_MIGRATION_STATE; +import static org.apache.cassandra.db.SystemKeyspace.PAXOS; +import static org.apache.cassandra.distributed.api.ConsistencyLevel.ALL; +import static org.apache.cassandra.schema.SchemaConstants.SYSTEM_KEYSPACE_NAME; +import static org.junit.Assert.assertArrayEquals; + +public abstract class AccordTestBase extends TestBaseImpl +{ + private static final Logger logger = LoggerFactory.getLogger(AccordTestBase.class); + private static final int MAX_RETRIES = 10; + + protected static final AtomicInteger COUNTER = new AtomicInteger(0); + + protected static Cluster SHARED_CLUSTER; + + protected String accordTableName; + protected String qualifiedAccordTableName; + protected String regularTableName; + protected String qualifiedRegularTableName; + + protected final TransactionalMode transactionalMode; + + protected AccordTestBase() + { + this.transactionalMode = TransactionalMode.full; + } + + protected AccordTestBase(TransactionalMode transactionalMode) { + this.transactionalMode = transactionalMode; + } + + public static void setupCluster(Function options, int nodes) throws IOException + { + SHARED_CLUSTER = createCluster(nodes, options); + } + + @AfterClass + public static void teardown() + { + if (SHARED_CLUSTER != null) + SHARED_CLUSTER.close(); + } + + @Before + public void setup() + { + accordTableName = "accordtbl" + COUNTER.getAndIncrement(); + qualifiedAccordTableName = KEYSPACE + '.' + accordTableName; + regularTableName = "regulartbl" + COUNTER.getAndIncrement(); + qualifiedRegularTableName = KEYSPACE + '.' + regularTableName; + } + + @After + public void tearDown() throws Exception + { + for (IInvokableInstance instance : SHARED_CLUSTER) + instance.runOnInstance(() -> DefaultProgressLogs.unsafePauseForTesting(false)); + + truncateSystemTables(); + + ClusterUtils.waitForCMSToQuiesce(SHARED_CLUSTER, 1); + SHARED_CLUSTER.forEach(() -> Util.spinUntilTrue(() -> ClusterMetadata.current().epoch.getEpoch() == + ((AccordService) AccordService.instance()).configService().currentEpoch() && + AccordService.instance().topology().current().epoch() == + ((AccordService) AccordService.instance()).configService().currentEpoch(), + 60)); + } + + protected static void assertRowSerial(Cluster cluster, String query, int k, int c, int v, int s) + { + Object[][] result = cluster.coordinator(1).execute(query, ConsistencyLevel.SERIAL); + assertArrayEquals(new Object[]{new Object[] {k, c, v, s}}, result); + } + + protected static void assertRowSerial(Cluster cluster, String query, Object[]... expected) + { + Object[][] result = cluster.coordinator(1).execute(query, ConsistencyLevel.SERIAL); + AssertUtils.assertRows(result, expected); + } + + protected void test(String tableDDL, FailingConsumer fn) throws Exception + { + test(Collections.singletonList(tableDDL), fn); + } + + protected List createTables(String tableFormat, String... qualifiedTables) + { + ImmutableList.Builder builder = ImmutableList.builder(); + for (String qualifiedTable : qualifiedTables) + builder.add(format(tableFormat, qualifiedTable)); + return builder.build(); + } + + public static void ensureTableIsAccordManaged(Cluster cluster, String ksname, String tableName) + { + cluster.get(1).runOnInstance(() -> { + TableMetadata metadata = Schema.instance.getTableMetadata(ksname, tableName); + if (metadata == null) + return; // bad plumbing from shared utils.... + Assert.assertTrue(metadata.params.transactionalMode.accordIsEnabled); + }); + } + + protected void test(List ddls, FailingConsumer fn) throws Exception + { + for (String ddl : ddls) + SHARED_CLUSTER.schemaChange(ddl); + + // Evict commands from the cache immediately to expose problems loading from disk. +// SHARED_CLUSTER.forEach(node -> node.runOnInstance(() -> AccordService.instance().setCacheSize(0))); + + try + { + fn.accept(SHARED_CLUSTER); + } + finally + { + SHARED_CLUSTER.filters().reset(); + } + } + + protected void test(FailingConsumer fn) throws Exception + { + test("CREATE TABLE " + qualifiedAccordTableName + " (k int, c int, v int, primary key (k, c)) WITH transactional_mode='" + transactionalMode + "'", fn); + } + + protected static ConsensusMigrationState getMigrationStateSnapshot(IInvokableInstance instance) throws IOException + { + byte[] serializedBytes = instance.callOnInstance(() -> { + DataOutputBuffer output = new DataOutputBuffer(); + try + { + ConsensusMigrationState.serializer.serialize( + ClusterMetadata.current().consensusMigrationState, + output, Version.V0); + } + catch (IOException e) + { + throw new RuntimeException(e); + } + return output.toByteArray(); + }); + DataInputPlus input = new DataInputBuffer(serializedBytes); + return ConsensusMigrationState.serializer.deserialize(input, Version.V0); + } + + protected static int getAccordCoordinateCount() + { + return getAccordWriteCount() + getAccordReadCount(); + } + + protected static int getCasWriteCount(int coordinatorIndex) + { + return Ints.checkedCast(getMetrics(coordinatorIndex).getCounter("org.apache.cassandra.metrics.ClientRequest.Latency.CASWrite")); + } + + protected static int getReadRetryOnDifferentSystemCount(IInstance instance) + { + return Ints.checkedCast(instance.metrics().getCounter("org.apache.cassandra.metrics.ClientRequest.RetryDifferentSystem.Read")); + } + + protected static int getWriteRetryOnDifferentSystemCount(int coordinatorIndex) + { + return Ints.checkedCast(getMetrics(coordinatorIndex).getCounter("org.apache.cassandra.metrics.ClientRequest.RetryDifferentSystem.Write")); + } + + protected int getMutationsRejectedOnWrongSystemCount() + { + long sum = 0; + for (IInvokableInstance instance : SHARED_CLUSTER) + sum += instance.metrics().getCounter("org.apache.cassandra.metrics.Table.MutationsRejectedOnWrongSystem." + qualifiedAccordTableName); + return Ints.checkedCast(sum); + } + + protected int getReadsRejectedOnWrongSystemCount() + { + long sum = 0; + for (IInvokableInstance instance : SHARED_CLUSTER) + sum += instance.metrics().getCounter("org.apache.cassandra.metrics.Table.ReadsRejectedOnWrongSystem." + qualifiedAccordTableName); + return Ints.checkedCast(sum); + } + + protected static int getCasPrepareCount(int coordinatorIndex) + { + return Ints.checkedCast(getMetrics(coordinatorIndex).getCounter("org.apache.cassandra.metrics.keyspace.CasPrepareLatency.distributed_test_keyspace")); + } + + protected static int getAccordWriteCount() + { + return getAccordWriteCount(1); + } + + protected static int getAccordWriteCount(int coordinatorIndex) + { + return Ints.checkedCast(getMetrics(coordinatorIndex).getCounter("org.apache.cassandra.metrics.ClientRequest.Latency.AccordWrite")); + } + + protected static int getAccordReadCount() + { + return getAccordReadCount(1); + } + + protected static int getAccordReadCount(int coordinatorIndex) + { + return Ints.checkedCast(getMetrics(coordinatorIndex).getCounter("org.apache.cassandra.metrics.ClientRequest.Latency.AccordRead")); + } + + protected static int getAccordReadMigrationRejects(int coordinatorIndex) + { + return Ints.checkedCast(getMetrics(coordinatorIndex).getCounter("org.apache.cassandra.metrics.ClientRequest.AccordMigrationRejects.AccordRead")); + } + + protected static int getAccordWriteMigrationRejects(int coordinatorIndex) + { + return Ints.checkedCast(getMetrics(coordinatorIndex).getCounter("org.apache.cassandra.metrics.ClientRequest.AccordMigrationRejects.AccordWrite")); + } + + protected static int getAccordMigrationSkippedReads() + { + // Skipped reads can occur at any node so sum them + long sum = 0; + for (IInvokableInstance instance : SHARED_CLUSTER) + sum += instance.metrics().getCounter("org.apache.cassandra.metrics.ClientRequest.MigrationSkippedReads.AccordWrite"); + return Ints.checkedCast(sum); + } + + protected static int getKeyMigrationCount(int coordinatorIndex) + { + return Ints.checkedCast(getMetrics(coordinatorIndex).getCounter("org.apache.cassandra.metrics.Table.KeyMigrationLatency.all")); + } + + protected static int getCasWriteBeginRejects(int coordinatorIndex) + { + return Ints.checkedCast(getMetrics(coordinatorIndex).getCounter("org.apache.cassandra.metrics.ClientRequest.PaxosBeginMigrationRejects.CASWrite")); + } + + protected static int getCasReadBeginRejects(int coordinatorIndex) + { + return Ints.checkedCast(getMetrics(coordinatorIndex).getCounter("org.apache.cassandra.metrics.ClientRequest.PaxosBeginMigrationRejects.CASRead")); + } + + protected static int getCasWriteAcceptRejects(int coordinatorIndex) + { + return Ints.checkedCast(getMetrics(coordinatorIndex).getCounter("org.apache.cassandra.metrics.ClientRequest.PaxosAcceptMigrationRejects.CASWrite")); + } + + protected static int getCasReadAcceptRejects(int coordinatorIndex) + { + return Ints.checkedCast(getMetrics(coordinatorIndex).getCounter("org.apache.cassandra.metrics.ClientRequest.PaxosAcceptMigrationRejects.CASRead")); + } + + protected static Metrics getMetrics(int coordinatorIndex) + { + return SHARED_CLUSTER.get(coordinatorIndex).metrics(); + } + + protected static void forEach(SerializableRunnable runnable) + { + for (IInvokableInstance instance : SHARED_CLUSTER) + instance.runOnInstance(runnable); + } + + private static Cluster createCluster(int nodes, Function options) throws IOException + { + // need to up the timeout else tests get flaky + // disable vnode for now, but should enable before trunk + Cluster.Builder builder = Cluster.build(nodes) + .withoutVNodes() + .withConfig(c -> c.with(Feature.GOSSIP) + .set("sasi_indexes_enabled", "true") + .set("write_request_timeout", "10s") + .set("native_transport_timeout", "30s") + .set("cms_await_timeout", "1s") + .set("cms_default_max_retries", 10_000) + .set("accord.ephemeral_read_enabled", "false") + .set("accord.shard_durability_target_splits", "4") + .set("accord.retry_syncpoint", "1s*attempts") + .set("accord.retry_durability", "1s*attempts") + .set("accord.command_store_shard_count", "2") + .set("accord.queue_shard_count", "2")) + .withInstanceInitializer(EnforceUpdateDoesNotPerformRead::install); + builder = options.apply(builder); + return init(builder.start()); + } + + protected static SimpleQueryResult executeAsTxn(Cluster cluster, String check, Object... boundValues) + { + String normalized = wrapInTxn(check); + logger.info("Executing transaction statement:\n{}", normalized); + return cluster.coordinator(1).executeWithResult(normalized, ConsistencyLevel.ANY, boundValues); + } + + protected static SimpleQueryResult execute(Cluster cluster, String check, Object... boundValues) + { + logger.info("Executing statement:\n{}", check); + return cluster.coordinator(1).executeWithResult(check, ConsistencyLevel.ANY, boundValues); + } + + private static SimpleQueryResult execute(Cluster cluster, String check, ConsistencyLevel cl, Object... boundValues) + { + return cluster.coordinator(1).executeWithResult(check, cl, boundValues); + } + + protected static SimpleQueryResult assertRowEquals(Cluster cluster, SimpleQueryResult expected, String check, ConsistencyLevel cl, Object... boundValues) + { + SimpleQueryResult result = execute(cluster, check, cl, boundValues); + QueryResultUtil.assertThat(result).isEqualTo(expected); + return result; + } + + protected static SimpleQueryResult assertRowEquals(Cluster cluster, SimpleQueryResult expected, String check, Object... boundValues) + { + SimpleQueryResult result = execute(cluster, check, boundValues); + QueryResultUtil.assertThat(result).isEqualTo(expected); + return result; + } + + protected static SimpleQueryResult assertRowEquals(Cluster cluster, Object[] row, String check, ConsistencyLevel cl, Object... boundValues) + { + return assertRowEquals(cluster, QueryResults.builder().row(row).build(), check, cl, boundValues); + } + + protected static SimpleQueryResult assertRowEquals(Cluster cluster, Object[] row, String check, Object... boundValues) + { + return assertRowEquals(cluster, QueryResults.builder().row(row).build(), check, boundValues); + } + + // TODO: Retry on preemption may become unnecessary after the Unified Log is integrated. + protected static SimpleQueryResult assertRowEqualsWithPreemptedRetry(Cluster cluster, Object[] row, String check, Object... boundValues) + { + return assertRowWithPreemptedRetry(cluster, QueryResults.builder().row(row).build(), check, boundValues); + } + + protected SimpleQueryResult assertEmptyWithPreemptedRetry(Cluster cluster, String check, Object... boundValues) + { + return assertRowWithPreemptedRetry(cluster, QueryResults.builder().build(), check, boundValues); + } + + private static SimpleQueryResult assertRowWithPreemptedRetry(Cluster cluster, SimpleQueryResult expected, String check, Object... boundValues) + { + SimpleQueryResult result = executeWithRetry(cluster, check, boundValues); + QueryResultUtil.assertThat(result).isEqualTo(expected); + return result; + } + + private static boolean hasRootCause(Throwable ex, Class klass) + { + return AssertionUtils.rootCauseIs(klass).matches(ex); + + } + + private static SimpleQueryResult executeWithRetry0(int count, Cluster cluster, IInvokableInstance inst, String check, Object... boundValues) + { + try + { + logger.info("Executing statement:\n{}", check); + return inst.coordinator().executeWithResult(check, ConsistencyLevel.ANY, boundValues); + } + catch (RuntimeException ex) + { + if (count <= MAX_RETRIES && (hasRootCause(ex, AccordReadPreemptedException.class) || hasRootCause(ex, AccordWritePreemptedException.class) || hasRootCause(ex, Invalidated.class))) + { + logger.warn("[Retry attempt={}] Preempted failure for\n{}", count, check); + return executeWithRetry0(count + 1, cluster, inst, check, boundValues); + } + TxnId txnId = maybeExtractId(ex); + if (txnId != null) + { + // query the cluster to find its status... + StringBuilder sb = new StringBuilder(); + sb.append("Txn ").append(txnId).append(" timed out...\n"); + ClusterUtils.queryTxnStateAsString(sb, cluster, txnId); + throw new AssertionError(sb.toString(), ex.getCause()); + } + throw ex; + } + } + + private static TxnId maybeExtractId(Throwable ex) + { + if (hasRootCause(ex, AccordReadPreemptedException.class) + || hasRootCause(ex, AccordWritePreemptedException.class) + || hasRootCause(ex, ReadTimeoutException.class) + || hasRootCause(ex, WriteTimeoutException.class)) + { + return TxnId.tryParse(ex.getMessage()); + } + return null; + } + + public static SimpleQueryResult executeWithRetry(Cluster cluster, String check, Object... boundValues) + { + return executeWithRetry(cluster, cluster.get(1), check, boundValues); + } + + public static SimpleQueryResult executeWithRetry(Cluster cluster, IInvokableInstance inst, String check, Object... boundValues) + { + // is this method safe? + + if (!isIdempotent(inst, check)) + throw new AssertionError("Unable to retry txn that is not idempotent: cql=\n" + check); + + return executeWithRetry0(0, cluster, inst, check, boundValues); + } + + public static Boolean isIdempotent(IInvokableInstance inst, String cql) + { + return inst.callOnInstance(() -> { + CQLStatement.Raw parsed = QueryProcessor.parseStatement(cql); + if (parsed instanceof TransactionStatement.Parsed) + { + TransactionStatement stmt = (TransactionStatement) parsed.prepare(ClientState.forInternalCalls()); + return isIdempotent(stmt); + } + else if (parsed instanceof ModificationStatement.Parsed) + { + ModificationStatement stmt = (ModificationStatement) parsed.prepare(ClientState.forInternalCalls()); + return isIdempotent(stmt); + } + else + { + throw new IllegalArgumentException("Unexpected type: " + parsed.getClass()); + } + }); + } + + protected static String wrapInTxn(String statement) + { + if (!statement.trim().toUpperCase().startsWith("BEGIN TRANSACTION")) + { + statement = statement.trim(); + statement = Arrays.stream(statement.split("\\n")) + .map(line -> line.trim().endsWith(";") ? line : line + ';') + .collect(Collectors.joining("\n ", "BEGIN TRANSACTION\n ", "\nCOMMIT TRANSACTION")); + } + return statement; + } + + public static boolean isIdempotent(TransactionStatement statement) + { + for (ModificationStatement update : statement.getUpdates()) + { + if (!isIdempotent(update)) + return false; + } + return true; + } + + private static boolean isIdempotent(ModificationStatement update) + { + // ReferenceValue.Constant is used during migration, which means a case like "a += 1" + // ReferenceValue.Substitution uses a LET reference, so rerunning would always just see the new state + long numConstants = update.getSubstitutions().stream() + .filter(f -> f.getValue() instanceof ReferenceValue.Constant) + .filter(f -> !f.getKind().name().contains("Setter")) + .count(); + return numConstants == 0; + } + + static List tokens() + { + return SHARED_CLUSTER.stream() + .flatMap(i -> StreamSupport.stream(Splitter.on(",").split(i.config().getString("initial_token")).spliterator(), false)) + .collect(Collectors.toList()); + } + + static List tokensToKeys(List tokens) + { + return tokens.stream() + .map(t -> (Murmur3Partitioner.LongToken) Murmur3Partitioner.instance.getTokenFactory().fromString(t)) + .map(Murmur3Partitioner.LongToken::keyForToken) + .collect(Collectors.toList()); + } + + public static class EnforceUpdateDoesNotPerformRead + { + public static void install(ClassLoader classLoader, Integer num) + { + checkState(Arrays.asList(ModificationStatement.class.getDeclaredMethods()).stream().map(Method::getName).anyMatch(m -> m.equals("readRequiredLists"))); + new ByteBuddy().rebase(ModificationStatement.class) + .method(named("readRequiredLists")) + .intercept(MethodDelegation.to(EnforceUpdateDoesNotPerformRead.class)) + .make() + .load(classLoader, ClassLoadingStrategy.Default.INJECTION); + } + + @SuppressWarnings("unused") + public static Map readRequiredLists(@This ModificationStatement stmt, @SuperCall Callable> fn) throws Exception + { + Map map = fn.call(); + if (map != null) + { + // if the call tree has a TransactionStatement, then fail as this violates the query + for (StackTraceElement e : Thread.currentThread().getStackTrace()) + if (TransactionStatement.class.getCanonicalName().equals(e.getClassName())) + throw new IllegalStateException("Attempted to load required partition!"); + } + return map; + } + } + + protected abstract Logger logger(); + + protected void alterTableTransactionalMode(TransactionalMode mode) + { + alterTableTransactionalMode(mode, null); + } + + protected void alterTableTransactionalMode(TransactionalMode mode, @Nullable TransactionalMigrationFromMode from) + { + SHARED_CLUSTER.schemaChange(format("ALTER TABLE %s WITH %s" + (from == null ? "" : " AND %s"), qualifiedAccordTableName, mode.asCqlParam(), from == null ? null : from.asCqlParam())); + } + + protected static void pauseHints() + { + forEach(() -> HintsService.instance.pauseDispatch()); + } + + protected static void deleteAllHints() + { + forEach(() -> HintsService.instance.deleteAllHintsUnsafe()); + } + + protected static void pauseBatchlog() + { + forEach(() -> BatchlogManager.instance.pauseReplay()); + } + + protected static void unpauseHints() + { + forEach(() -> HintsService.instance.resumeDispatch()); + } + + protected static void unpauseBatchlog() + { + forEach(() -> BatchlogManager.instance.resumeReplay()); + } + + protected static void blockMutationAndPreAccept(Cluster cluster) + { + cluster.filters().outbound().messagesMatching((from, to, message) -> { + if (message.verb() == Verb.MUTATION_REQ.id) + { + String keyspace = cluster.get(to).callsOnInstance(() -> ((Message) Instance.deserializeMessage(message)).payload.getKeyspaceName()).call(); + if (keyspace.equals(KEYSPACE)) + return true; + } + if (message.verb() == Verb.ACCORD_PRE_ACCEPT_REQ.id) + { + boolean drop = cluster.get(to).callsOnInstance(() -> { + PreAccept preAccept = (PreAccept)Instance.deserializeMessage(message).payload; + Route route = preAccept.scope; + if (route.domain() == Domain.Key) + for (RoutingKey key : (KeyRoute)route) + { + TokenKey routingKey = (TokenKey)key; + ColumnFamilyStore cfs = ColumnFamilyStore.getIfExists(routingKey.table()); + if (cfs.getKeyspaceName().equals(KEYSPACE)) + return true; + } + return false; + }).call(); + if (drop) + return true; + } + return false; + }).drop(); + } + + protected static void truncateSystemTables() + { + SHARED_CLUSTER.coordinator(1).execute("TRUNCATE " + SYSTEM_KEYSPACE_NAME + "." + SystemKeyspace.BATCHES, ALL); + SHARED_CLUSTER.coordinator(1).execute(format("TRUNCATE TABLE %s.%s", SYSTEM_KEYSPACE_NAME, CONSENSUS_MIGRATION_STATE), ALL); + SHARED_CLUSTER.coordinator(1).execute(format("TRUNCATE TABLE %s.%s", SYSTEM_KEYSPACE_NAME, PAXOS), ALL); + } + + protected static Stream hostIds() + { + return Stream.concat(ClusterMetadata.current().directory.peerIds() + .stream() + .map(ClusterMetadata.current().directory::hostId), + Stream.of(HintsService.RETRY_ON_DIFFERENT_SYSTEM_UUID)); + } + + protected static boolean hasPendingHints() + { + return hostIds().map(HintsService.instance::getTotalHintsSize) + .anyMatch(size -> size > 0); + } +} diff --git a/test/distributed/org/apache/cassandra/distributed/test/accord/AccordTimestampPreservationTest.java b/test/distributed/org/apache/cassandra/distributed/test/accord/AccordTimestampPreservationTest.java new file mode 100644 index 000000000000..58b08251370b --- /dev/null +++ b/test/distributed/org/apache/cassandra/distributed/test/accord/AccordTimestampPreservationTest.java @@ -0,0 +1,208 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.distributed.test.accord; + +import com.google.common.collect.ImmutableList; +import org.apache.cassandra.ServerTestUtils; +import org.apache.cassandra.config.CassandraRelevantProperties; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.db.ConsistencyLevel; +import org.apache.cassandra.db.DecoratedKey; +import org.apache.cassandra.db.Keyspace; +import org.apache.cassandra.db.Mutation; +import org.apache.cassandra.db.SimpleBuilders.MutationBuilder; +import org.apache.cassandra.db.marshal.Int32Type; +import org.apache.cassandra.distributed.api.ICoordinator; +import org.apache.cassandra.distributed.api.IInvokableInstance; +import org.apache.cassandra.exceptions.WriteTimeoutException; +import org.apache.cassandra.locator.Replica; +import org.apache.cassandra.locator.ReplicaPlan; +import org.apache.cassandra.locator.ReplicaPlans; +import org.apache.cassandra.service.StorageProxy; +import org.apache.cassandra.service.consensus.migration.ConsensusKeyMigrationState; +import org.apache.cassandra.service.consensus.migration.ConsensusRequestRouter; +import org.junit.After; +import org.junit.BeforeClass; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.nio.ByteBuffer; + +import static java.lang.String.format; +import static org.apache.cassandra.Util.spinAssertEquals; +import static org.apache.cassandra.config.CassandraRelevantProperties.HINT_DISPATCH_INTERVAL_MS; +import static org.apache.cassandra.distributed.api.ConsistencyLevel.ALL; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.fail; + +/* + * Test that non-transactional updates have their timestamps preserved when written through Accord so that + * `USING TIMESTAMP` continues to work and so that hints and batch log retry attempts are inserted with their + * original timestamp and not a later Accord timestamp which could cause data resurrection. + */ +public class AccordTimestampPreservationTest extends AccordTestBase +{ + private static final Logger logger = LoggerFactory.getLogger(AccordTimestampPreservationTest.class); + + private static final int CLUSTERING_VALUE = 1; + + private static final String NORMAL_TABLE_FMT = "CREATE TABLE %s (id int, c int, v int, PRIMARY KEY ((id), c))"; + + private static final String ACCORD_TABLE_FMT = NORMAL_TABLE_FMT + " WITH transactional_mode='full'"; + + private static ICoordinator coordinator; + + private static final String expectedResult = "[[42]]"; + + private static final int PKEY1 = 77; + private static final int PKEY2 = 78; + private static final int VALUE = 66; + + private static final long TIMESTAMP = 42; + + @Override + protected Logger logger() + { + return logger; + } + + @BeforeClass + public static void setupClass() throws IOException + { + HINT_DISPATCH_INTERVAL_MS.setLong(100); + ServerTestUtils.daemonInitialization(); + // Otherwise repair complains if you don't specify a keyspace + CassandraRelevantProperties.SYSTEM_TRACES_DEFAULT_RF.setInt(3); + AccordTestBase.setupCluster(builder -> builder.appendConfig(config -> config.set("write_request_timeout", "2s")), 3); + ServerTestUtils.prepareServerNoRegister(); + coordinator = SHARED_CLUSTER.coordinator(1); + } + + @After + public void tearDown() throws Exception + { + unpauseBatchlog(); + deleteAllHints(); + unpauseHints(); + super.tearDown(); + // Reset migration state + forEach(() -> { + ConsensusRequestRouter.resetInstance(); + ConsensusKeyMigrationState.reset(); + }); + truncateSystemTables(); + } + + @Test + public void testMutationPreservesTimestamp() throws Exception + { + test(createTables(ACCORD_TABLE_FMT, qualifiedAccordTableName), cluster -> { + long startCount = getAccordCoordinateCount(); + coordinator.executeWithResult(insertCQL(qualifiedAccordTableName, PKEY1, VALUE), ALL); + assertEquals(startCount + 1, getAccordWriteCount()); + int id = 1; + for (IInvokableInstance instance : cluster) + { + logger.info("Checking instance " + id); + id++; + spinAssertEquals(expectedResult, () -> instance.executeInternalWithResult(checkCQL()).toString(), 20); + } + }); + } + + @Test + public void testBatchlogPreservesTimestamp() throws Exception + { + test(ImmutableList.of(format(NORMAL_TABLE_FMT, qualifiedRegularTableName), format(ACCORD_TABLE_FMT, qualifiedAccordTableName)), cluster -> { + pauseHints(); + blockMutationAndPreAccept(cluster); + try + { + // Insert must span both Accord and non-Accord ranges or tables otherwise it bypasses the batchlog entirely + coordinator.executeWithResult(batchInsert(true, PKEY1, PKEY2, VALUE), ALL); + fail("Should have thrown WTE"); + } + catch (Throwable t) + { + assertEquals(t.getClass().getName(), WriteTimeoutException.class.getName()); + } + cluster.filters().reset(); + + int id = 1; + for (IInvokableInstance instance : cluster) + { + logger.info("Checking instance " + id); + id++; + spinAssertEquals(expectedResult, () -> instance.executeInternalWithResult(checkCQL()).toString(), 20); + } + }); + } + + @Test + public void testHintsPreservesTimestamp() throws Exception + { + test(createTables(ACCORD_TABLE_FMT, qualifiedAccordTableName), cluster -> { + String keyspace = KEYSPACE; + int pkey1 = PKEY1; + long timestamp = TIMESTAMP; + int clustering = CLUSTERING_VALUE; + String tableName = accordTableName; + cluster.get(1).runOnInstance(() -> { + ByteBuffer keyBuf = Int32Type.instance.fromString(Integer.toString(pkey1)); + DecoratedKey dk = DatabaseDescriptor.getPartitioner().decorateKey(keyBuf); + MutationBuilder mutationBuilder = new MutationBuilder(KEYSPACE, dk); + mutationBuilder.timestamp(timestamp); + mutationBuilder.update(tableName).row(clustering).add("v", VALUE); + Mutation m = mutationBuilder.build(); + ReplicaPlan.ForWrite plan = ReplicaPlans.forWrite(Keyspace.open(keyspace), ConsistencyLevel.ALL, dk.getToken(), ReplicaPlans.writeAll); + for (Replica replica : plan.live().withoutSelf()) + StorageProxy.submitHint(m, replica, null); + }); + for (int i = 2; i <= 3; i++) + { + int instance = i; + spinAssertEquals(expectedResult, () -> cluster.get(instance).executeInternalWithResult(checkCQL()).toString(), 20); + } + }); + } + + private String batchInsert(boolean logged, int pkey1, int pkey2, int value) + { + return batch(logged, + insertCQL(qualifiedAccordTableName, pkey1, value), + insertCQL(qualifiedRegularTableName, pkey2, value)); + } + + private String insertCQL(String table, int pkey, int value) + { + return insertCQL(table, pkey, value, false); + } + + private String insertCQL(String table, int pkey, int value, boolean cas) + { + return format("INSERT INTO %s ( id, c, v ) VALUES ( %d, %d, %d )%s USING TIMESTAMP %d", table, pkey, CLUSTERING_VALUE, value, cas ? " IF NOT EXISTS" : "", TIMESTAMP); + } + + private String checkCQL() + { + return format("SELECT WRITETIME(v) from %s WHERE id = %d", qualifiedAccordTableName, PKEY1); + } +} diff --git a/test/distributed/org/apache/cassandra/distributed/test/accord/AccordTopologyTest.java b/test/distributed/org/apache/cassandra/distributed/test/accord/AccordTopologyTest.java new file mode 100644 index 000000000000..1a4bc2b7e46e --- /dev/null +++ b/test/distributed/org/apache/cassandra/distributed/test/accord/AccordTopologyTest.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.distributed.test.accord; + +import org.junit.Test; + +import org.apache.cassandra.distributed.Cluster; +import org.apache.cassandra.distributed.test.TestBaseImpl; + +import static org.apache.cassandra.distributed.api.Feature.GOSSIP; +import static org.apache.cassandra.distributed.api.Feature.NETWORK; + +public class AccordTopologyTest extends TestBaseImpl +{ + @Test + public void name() throws Throwable + { + try (Cluster cluster = builder().withNodes(3) + .withConfig(config -> config.with(GOSSIP).with(NETWORK)) + .createWithoutStarting()) + { + cluster.get(1).startup(); + + } + } +} diff --git a/test/distributed/org/apache/cassandra/distributed/test/accord/AccordWriteInteroperabilityTest.java b/test/distributed/org/apache/cassandra/distributed/test/accord/AccordWriteInteroperabilityTest.java new file mode 100644 index 000000000000..944a02facef6 --- /dev/null +++ b/test/distributed/org/apache/cassandra/distributed/test/accord/AccordWriteInteroperabilityTest.java @@ -0,0 +1,225 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.distributed.test.accord; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; +import javax.annotation.Nonnull; + +import org.junit.After; +import org.junit.BeforeClass; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.cassandra.db.ColumnFamilyStore; +import org.apache.cassandra.db.DataRange; +import org.apache.cassandra.db.filter.ColumnFilter; +import org.apache.cassandra.db.memtable.Memtable; +import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator; +import org.apache.cassandra.db.rows.Row; +import org.apache.cassandra.db.rows.UnfilteredRowIterator; +import org.apache.cassandra.distributed.api.ConsistencyLevel; +import org.apache.cassandra.distributed.shared.InstanceClassLoader; +import org.apache.cassandra.io.sstable.SSTableReadsListener; +import org.apache.cassandra.net.Verb; +import org.apache.cassandra.service.consensus.TransactionalMode; + +import static com.google.common.base.Throwables.getStackTraceAsString; +import static org.apache.cassandra.Util.dk; +import static org.apache.cassandra.Util.spinUntilTrue; +import static org.apache.commons.collections.ListUtils.synchronizedList; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +@RunWith(Parameterized.class) +public class AccordWriteInteroperabilityTest extends AccordTestBase +{ + private static final Logger logger = LoggerFactory.getLogger(AccordInteroperabilityTest.class); + + @Nonnull + private final TransactionalMode mode; + + private final boolean migrated; + + public AccordWriteInteroperabilityTest(@Nonnull TransactionalMode mode, boolean migrated) + { + this.mode = mode; + this.migrated = migrated; + } + + @Parameterized.Parameters(name = "transactionalMode={0}, migrated={1}") + public static Collection data() { + List tests = new ArrayList<>(TransactionalMode.values().length * 2); + for (TransactionalMode mode : TransactionalMode.values()) + { + if (mode.accordIsEnabled) + { + tests.add(new Object[]{ mode, true }); + tests.add(new Object[]{ mode, false }); + } + } + return tests; + } + + @Override + protected Logger logger() + { + return logger; + } + + @BeforeClass + public static void setupClass() throws IOException + { + AccordTestBase.setupCluster(builder -> builder.withConfig(config -> config.set("accord.range_migration", "auto") + .set("paxos_variant", "v2")), + 3); + } + + @After + public void tearDown() + { + SHARED_CLUSTER.setMessageSink(null); + } + + + private String testTransactionInsert() + { + return "BEGIN TRANSACTION\n" + + " INSERT INTO " + qualifiedAccordTableName + " (k, c, v) VALUES (42, 2, 3);\n" + + "COMMIT TRANSACTION"; + } + + private String testInsert() + { + return "INSERT INTO " + qualifiedAccordTableName + " (k, c, v) VALUES (42, 2, 3)"; + } + + private String testBatchInsert() + { + return "BEGIN BATCH\n" + + "INSERT INTO " + qualifiedAccordTableName + " (k, c, v) VALUES (1, 2, 3);\n" + + "INSERT INTO " + qualifiedAccordTableName + " (k, c, v) VALUES (42, 43, 44);\n" + + "APPLY BATCH"; + } + + @Test + public void testTransactionStatementApplyIsInteropApply() throws Throwable + { + testApplyIsInteropApply(testTransactionInsert()); + } + + @Test + public void testNonSerialApplyIsInteropApply() throws Throwable + { + testApplyIsInteropApply(testInsert()); + } + + @Test + public void testBatchInsertApplyIsInteropApply() throws Throwable + { + testApplyIsInteropApply(testBatchInsert()); + } + + private void testApplyIsInteropApply(String query) throws Throwable + { + test("CREATE TABLE " + qualifiedAccordTableName + " (k int, c int, v int, PRIMARY KEY(k, c))" + (migrated ? " WITH " + transactionalMode.asCqlParam() : ""), + cluster -> { + MessageCountingSink messageCountingSink = new MessageCountingSink(SHARED_CLUSTER); + List failures = synchronizedList(new ArrayList<>()); + // Verify that the apply response is only sent after the row has been inserted + // TODO (required): Need to delay mutation stage/mutation to ensure this has time to catch it + SHARED_CLUSTER.setMessageSink((to, message) -> { + try + { + if (message.verb() == Verb.ACCORD_APPLY_RSP.id) + { + // It can be async if it's migrated + if (migrated) + return; + int nodeIndex = ((InstanceClassLoader)ClassLoader.getSystemClassLoader()).getInstanceId(); + try + { + String keyspace = KEYSPACE; + String tableName = accordTableName; + SHARED_CLUSTER.get(nodeIndex).runOnInstance(() -> { + ColumnFamilyStore cfs = ColumnFamilyStore.getIfExists(keyspace, tableName); + Memtable memtable = cfs.getCurrentMemtable(); + int expectedPartitions = query.startsWith("BEGIN BATCH") ? 2 : 1; + assertEquals(expectedPartitions, memtable.partitionCount()); + UnfilteredPartitionIterator partitions = memtable.partitionIterator(ColumnFilter.all(cfs.metadata()), DataRange.allData(cfs.getPartitioner()), SSTableReadsListener.NOOP_LISTENER); + assertTrue(partitions.hasNext()); + for (int i = 0; i < expectedPartitions; i++) + { + UnfilteredRowIterator rows = partitions.next(); + assertTrue(rows.partitionKey().equals(dk(42)) || rows.partitionKey().equals(dk(1))); + assertTrue(rows.hasNext()); + Row row = (Row)rows.next(); + assertFalse(rows.hasNext()); + } + assertFalse(partitions.hasNext()); + }); + } + catch (Throwable t) + { + failures.add(getStackTraceAsString(t)); + } + } + } + finally + { + messageCountingSink.accept(to, message); + } + }); + + if (!migrated) + { + cluster.coordinator(1).execute("ALTER TABLE " + qualifiedAccordTableName + " WITH " + transactionalMode.asCqlParam(), ConsistencyLevel.ALL); + nodetool(cluster.coordinator(1), "repair", "-skip-paxos", "-skip-accord", KEYSPACE, accordTableName); + } + + String finalQuery = query; + org.apache.cassandra.distributed.api.ConsistencyLevel consistencyLevel = org.apache.cassandra.distributed.api.ConsistencyLevel.QUORUM; + // Need to switch to CAS for it to run through Accord at all + if (!transactionalMode.nonSerialWritesThroughAccord && !query.startsWith("BEGIN TRANSACTION")) + { + finalQuery = query + " IF NOT EXISTS"; + consistencyLevel = org.apache.cassandra.distributed.api.ConsistencyLevel.SERIAL; + } + long startingRegularApplyCount = messageCount(Verb.ACCORD_APPLY_REQ); + cluster.coordinator(1).execute(finalQuery, consistencyLevel); + if (transactionalMode.ignoresSuppliedCommitCL() && migrated) + { + // Apply is async and there can be a lot of sources of regular APPLY + spinUntilTrue(() -> messageCount(Verb.ACCORD_APPLY_REQ) > startingRegularApplyCount); + assertEquals(0, messageCount(Verb.ACCORD_INTEROP_APPLY_REQ)); + } + else + { + assertEquals(3, messageCount(Verb.ACCORD_INTEROP_APPLY_REQ)); + } + assertTrue(failures.toString(), failures.isEmpty()); + }); + } +} diff --git a/src/java/org/apache/cassandra/exceptions/ChecksumMismatchException.java b/test/distributed/org/apache/cassandra/distributed/test/accord/FullAccordCQLTest.java similarity index 75% rename from src/java/org/apache/cassandra/exceptions/ChecksumMismatchException.java rename to test/distributed/org/apache/cassandra/distributed/test/accord/FullAccordCQLTest.java index a76c46c782bc..3f8259cff3e3 100644 --- a/src/java/org/apache/cassandra/exceptions/ChecksumMismatchException.java +++ b/test/distributed/org/apache/cassandra/distributed/test/accord/FullAccordCQLTest.java @@ -16,19 +16,14 @@ * limitations under the License. */ -package org.apache.cassandra.exceptions; +package org.apache.cassandra.distributed.test.accord; -import java.io.IOException; +import org.apache.cassandra.service.consensus.TransactionalMode; -public class ChecksumMismatchException extends IOException +public class FullAccordCQLTest extends AccordCQLTestBase { - public ChecksumMismatchException() + public FullAccordCQLTest() { - super(); - } - - public ChecksumMismatchException(String s) - { - super(s); + super(TransactionalMode.full); } } diff --git a/test/distributed/org/apache/cassandra/distributed/test/accord/InteropAccordCQLTest.java b/test/distributed/org/apache/cassandra/distributed/test/accord/InteropAccordCQLTest.java new file mode 100644 index 000000000000..72673b3a42b9 --- /dev/null +++ b/test/distributed/org/apache/cassandra/distributed/test/accord/InteropAccordCQLTest.java @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.distributed.test.accord; + +import org.junit.Ignore; +import org.junit.Test; + +import org.apache.cassandra.service.consensus.TransactionalMode; + +public class InteropAccordCQLTest extends AccordCQLTestBase +{ + public InteropAccordCQLTest() + { + super(TransactionalMode.test_interop_read); + } + + @Ignore + @Override + @Test + public void testCASSimulatorLite() throws Exception + { + super.testCASSimulatorLite(); + } +} diff --git a/test/distributed/org/apache/cassandra/distributed/test/accord/InteropTokenRangeTest.java b/test/distributed/org/apache/cassandra/distributed/test/accord/InteropTokenRangeTest.java new file mode 100644 index 000000000000..ee89e982dead --- /dev/null +++ b/test/distributed/org/apache/cassandra/distributed/test/accord/InteropTokenRangeTest.java @@ -0,0 +1,224 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.distributed.test.accord; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.List; +import java.util.NavigableSet; +import java.util.TreeSet; +import javax.annotation.Nullable; + +import com.google.common.collect.Sets; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +import org.apache.cassandra.dht.Murmur3Partitioner; +import org.apache.cassandra.dht.Murmur3Partitioner.LongToken; +import org.apache.cassandra.dht.Range; +import org.apache.cassandra.dht.Token; +import org.apache.cassandra.distributed.Cluster; +import org.apache.cassandra.distributed.api.ICoordinator; +import org.apache.cassandra.distributed.api.QueryResult; +import org.apache.cassandra.distributed.test.TestBaseImpl; +import org.apache.cassandra.service.consensus.TransactionalMode; +import org.assertj.core.api.Assertions; + +import static org.apache.cassandra.dht.Murmur3Partitioner.LongToken.keyForToken; +import static org.apache.cassandra.distributed.api.ConsistencyLevel.QUORUM; + +@RunWith(Parameterized.class) +public class InteropTokenRangeTest extends TestBaseImpl +{ + @Nullable + private final TransactionalMode mode; + + public InteropTokenRangeTest(@Nullable TransactionalMode mode) + { + this.mode = mode; + } + + @Parameterized.Parameters(name = "{0}") + public static Collection data() { + List tests = new ArrayList<>(TransactionalMode.values().length + 1); + tests.add(new Object[] {null}); + for (TransactionalMode mode : TransactionalMode.values()) + tests.add(new Object[] {mode}); + return tests; + } + + @Test + public void test() throws IOException + { + try (Cluster cluster = Cluster.build(1) + .start()) + { + init(cluster); + String createTable = withKeyspace("CREATE TABLE %s.tbl (pk blob primary key)"); + if (mode != null) + createTable += " WITH " + mode.asCqlParam(); + cluster.schemaChange(createTable); + + ICoordinator node = cluster.coordinator(1); + NavigableSet tokens = tokens(); + for (long token : tokens) + node.executeWithResult(withKeyspace("INSERT INTO %s.tbl (pk) VALUES (?)"), QUORUM, keyForToken(token)); + + for (long token : tokens) + { + ByteBuffer pk = keyForToken(token); + for (TokenOperator op : TokenOperator.values()) + { + Assertions.assertThat(tokens(node.executeWithResult(withKeyspace("SELECT * FROM %s.tbl WHERE " + op.condition), QUORUM, pk))) + .describedAs("Token %d with operator %s", token, op.condition) + .isEqualTo(op.expected(token, tokens)); + + Assertions.assertThat(tokens(node.executeWithPagingWithResult(withKeyspace("SELECT * FROM %s.tbl WHERE " + op.condition), QUORUM, 1, pk))) + .describedAs("Token %d with operator %s", token, op.condition) + .isEqualTo(op.expected(token, tokens)); + } + + for (TokenOperator lt : Arrays.asList(TokenOperator.lt, TokenOperator.lte)) + { + for (TokenOperator gt : Arrays.asList(TokenOperator.gt, TokenOperator.gte)) + { + Assertions.assertThat(tokens(node.executeWithResult(withKeyspace("SELECT * FROM %s.tbl WHERE " + lt.condition + " AND " + gt.condition), QUORUM, pk, pk))) + .describedAs("Token %d with operators %s / %s", token, lt.condition, gt.condition) + .isEqualTo(Sets.intersection(lt.expected(token, tokens), gt.expected(token, tokens))); + } + } + + Assertions.assertThat(tokens(node.executeWithResult(withKeyspace("SELECT * FROM %s.tbl WHERE token(pk) BETWEEN token(?) AND token(?)"), QUORUM, pk, pk))) + .describedAs("Token %d with operator token(pk) BETWEEN token(?) AND token(?)", token) + .isEqualTo(TokenOperator.eq.expected(token, tokens)); + } + } + } + + public static NavigableSet tokens(QueryResult result) + { + NavigableSet set = new TreeSet<>(); + while (result.hasNext()) + set.add(Murmur3Partitioner.instance.getToken(result.next().get("pk")).token); + return set; + } + + public enum TokenOperator + { + eq("token(pk) = token(?)") { + @Override + public NavigableSet expected(long token, NavigableSet tokens) + { + if (tokens.contains(token)) + return new TreeSet<>(Collections.singleton(token)); + return Collections.emptyNavigableSet(); + } + + @Override + public boolean intersects(long token, Range range) + { + return range.contains(new LongToken(token)); + } + }, + lt("token(pk) < token(?)") + { + @Override + public NavigableSet expected(long token, NavigableSet tokens) + { + return tokens.headSet(token, false); + } + + @Override + public boolean intersects(long token, Range range) + { + // <= is implemented as a min key bound which still intersects the range even though it returns no results + return token >= range.left.getLongValue() + 1; + } + }, + lte("token(pk) <= token(?)") + { + @Override + public NavigableSet expected(long token, NavigableSet tokens) + { + return tokens.headSet(token, true); + } + @Override + public boolean intersects(long token, Range range) + { + return token >= range.left.getLongValue() + 1; + } + }, + gt("token(pk) > token(?)") + { + @Override + public NavigableSet expected(long token, NavigableSet tokens) + { + return tokens.tailSet(token, false); + } + + @Override + public boolean intersects(long token, Range range) + { + return token < range.right.getLongValue(); + } + }, + gte("token(pk) >= token(?)") + { + @Override + public NavigableSet expected(long token, NavigableSet tokens) + { + return tokens.tailSet(token, true); + } + + @Override + public boolean intersects(long token, Range range) + { + return token <= range.right.getLongValue(); + } + }; + ; + + public final String condition; + + TokenOperator(String s) + { + this.condition = s; + } + + public abstract NavigableSet expected(long token, NavigableSet tokens); + + // Intersects for the purpose of executing the query not necessarily the results that are returned + public abstract boolean intersects(long token, Range range); + } + + private NavigableSet tokens() + { + NavigableSet set = new TreeSet<>(); +// set.add(Long.MIN_VALUE); // Murmur3Partitioner.LongToken.keyForToken does not support MIN_VALUE, but doesn't reject it... it actually produces MAX_VALUE... + set.add(Long.MIN_VALUE + 1); + set.add(0L); + set.add(Long.MAX_VALUE); + return set; + } +} diff --git a/test/distributed/org/apache/cassandra/distributed/test/accord/MigrationFromAccordReadRaceTest.java b/test/distributed/org/apache/cassandra/distributed/test/accord/MigrationFromAccordReadRaceTest.java new file mode 100644 index 000000000000..db5b026cc80f --- /dev/null +++ b/test/distributed/org/apache/cassandra/distributed/test/accord/MigrationFromAccordReadRaceTest.java @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.distributed.test.accord; + +public class MigrationFromAccordReadRaceTest extends AccordMigrationReadRaceTestBase +{ + @Override + protected boolean migratingAwayFromAccord() + { + return true; + } +} diff --git a/test/distributed/org/apache/cassandra/distributed/test/accord/MigrationFromAccordWriteRaceTest.java b/test/distributed/org/apache/cassandra/distributed/test/accord/MigrationFromAccordWriteRaceTest.java new file mode 100644 index 000000000000..1a8f30f76597 --- /dev/null +++ b/test/distributed/org/apache/cassandra/distributed/test/accord/MigrationFromAccordWriteRaceTest.java @@ -0,0 +1,27 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.distributed.test.accord; + +public class MigrationFromAccordWriteRaceTest extends AccordMigrationWriteRaceTestBase +{ + protected boolean migratingAwayFromAccord() + { + return true; + } +} diff --git a/test/distributed/org/apache/cassandra/distributed/test/accord/MigrationToAccordReadRaceTest.java b/test/distributed/org/apache/cassandra/distributed/test/accord/MigrationToAccordReadRaceTest.java new file mode 100644 index 000000000000..4faa91ebceb6 --- /dev/null +++ b/test/distributed/org/apache/cassandra/distributed/test/accord/MigrationToAccordReadRaceTest.java @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.distributed.test.accord; + +import org.junit.Ignore; + +@Ignore("Flakey") +public class MigrationToAccordReadRaceTest extends AccordMigrationReadRaceTestBase +{ + @Override + protected boolean migratingAwayFromAccord() + { + return false; + } +} diff --git a/test/distributed/org/apache/cassandra/distributed/test/accord/MigrationToAccordWriteRaceTest.java b/test/distributed/org/apache/cassandra/distributed/test/accord/MigrationToAccordWriteRaceTest.java new file mode 100644 index 000000000000..c93b491b9762 --- /dev/null +++ b/test/distributed/org/apache/cassandra/distributed/test/accord/MigrationToAccordWriteRaceTest.java @@ -0,0 +1,27 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.distributed.test.accord; + +public class MigrationToAccordWriteRaceTest extends AccordMigrationWriteRaceTestBase +{ + protected boolean migratingAwayFromAccord() + { + return false; + } +} diff --git a/test/distributed/org/apache/cassandra/distributed/test/accord/MixedReadAccordCQLTest.java b/test/distributed/org/apache/cassandra/distributed/test/accord/MixedReadAccordCQLTest.java new file mode 100644 index 000000000000..b9faecfac2fa --- /dev/null +++ b/test/distributed/org/apache/cassandra/distributed/test/accord/MixedReadAccordCQLTest.java @@ -0,0 +1,29 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.distributed.test.accord; + +import org.apache.cassandra.service.consensus.TransactionalMode; + +public class MixedReadAccordCQLTest extends AccordCQLTestBase +{ + public MixedReadAccordCQLTest() + { + super(TransactionalMode.mixed_reads); + } +} diff --git a/test/distributed/org/apache/cassandra/distributed/test/accord/NewSchemaTest.java b/test/distributed/org/apache/cassandra/distributed/test/accord/NewSchemaTest.java new file mode 100644 index 000000000000..4a6fab41fe39 --- /dev/null +++ b/test/distributed/org/apache/cassandra/distributed/test/accord/NewSchemaTest.java @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.distributed.test.accord; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Arrays; +import java.util.List; + +import org.junit.BeforeClass; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.cassandra.cql3.ast.Select; +import org.apache.cassandra.cql3.ast.Txn; +import org.apache.cassandra.distributed.api.SimpleQueryResult; +import org.apache.cassandra.service.accord.AccordService; +import org.assertj.core.api.Assertions; + +import static java.util.function.UnaryOperator.identity; + +public class NewSchemaTest extends AccordTestBase +{ + private static final Logger logger = LoggerFactory.getLogger(NewSchemaTest.class); + + @Override + protected Logger logger() + { + return logger; + } + + @BeforeClass + public static void setupClass() throws IOException + { + AccordTestBase.setupCluster(identity(), 2); + } + + @Test + public void test() + { + for (int i = 0; i < 20; i++) + { + String ks = "ks" + i; + String tableName = "tbl" + i; + String table = ks + "." + tableName; + SHARED_CLUSTER.schemaChange("CREATE KEYSPACE " + ks + " WITH REPLICATION={'class':'SimpleStrategy', 'replication_factor': 1}"); + SHARED_CLUSTER.schemaChange(String.format("CREATE TABLE %s (pk blob primary key) WITH transactional_mode='full'", table)); + SHARED_CLUSTER.forEach(node -> node.runOnInstance(() -> AccordService.instance().setCacheSize(0))); + + List keys = tokensToKeys(tokens()); + + read(ks, tableName, keys).exec(); + } + } + + private static Query read(String ks, String table, List keys) + { + assert !keys.isEmpty(); + Txn.Builder builder = new Txn.Builder(); + for (int i = 0; i < keys.size(); i++) + builder.addLet("row" + i, new Select.Builder().wildcard().table(ks, table).value("pk", keys.get(i))); + builder.addReturnReferences("row0.pk"); + Txn txn = builder.build(); + ByteBuffer[] binds = txn.bindsEncoded(); + Assertions.assertThat(Arrays.asList(binds)).isEqualTo(keys); + return new Query(txn.toCQL(), binds); + } + + private static class Query + { + final String cql; + final Object[] binds; + + private Query(String cql, Object[] binds) + { + this.cql = cql; + this.binds = binds; + } + + SimpleQueryResult exec() + { + return executeWithRetry(SHARED_CLUSTER, cql, binds); + } + } +} diff --git a/test/distributed/org/apache/cassandra/distributed/test/cql3/AccordInteropMultiNodeTableWalkBase.java b/test/distributed/org/apache/cassandra/distributed/test/cql3/AccordInteropMultiNodeTableWalkBase.java new file mode 100644 index 000000000000..6e4c1ad1bda6 --- /dev/null +++ b/test/distributed/org/apache/cassandra/distributed/test/cql3/AccordInteropMultiNodeTableWalkBase.java @@ -0,0 +1,103 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.distributed.test.cql3; + +import accord.utils.Property; +import accord.utils.RandomSource; +import org.apache.cassandra.cql3.KnownIssue; +import org.apache.cassandra.distributed.Cluster; +import org.apache.cassandra.distributed.api.ConsistencyLevel; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.service.consensus.TransactionalMode; +import org.apache.cassandra.service.reads.repair.ReadRepairStrategy; + + +public abstract class AccordInteropMultiNodeTableWalkBase extends MultiNodeTableWalkBase +{ + private final TransactionalMode transactionalMode; + + protected AccordInteropMultiNodeTableWalkBase(TransactionalMode transactionalMode) + { + super(ReadRepairStrategy.NONE); + this.transactionalMode = transactionalMode; + } + + @Override + protected void preCheck(Cluster cluster, Property.StatefulBuilder builder) + { + addUncaughtExceptionsFilter(cluster); + } + + static void addUncaughtExceptionsFilter(Cluster cluster) + { + if (IGNORED_ISSUES.contains(KnownIssue.ACCORD_JOURNAL_SUPPORT_DROP_TABLE)) + { + cluster.setUncaughtExceptionsFilter(t -> { + // There is a known issue with drop table and journal snapshots where the journal can't find the given table... + // To make these tests stable need to ignore this error as it's unrelated to the test and is target to be fixed + // directly. + /* +Suppressed: java.lang.AssertionError: Unknown keyspace ks12 + at org.apache.cassandra.db.Keyspace.open(Keyspace.java:149) + at org.apache.cassandra.db.Keyspace.openAndGetStoreIfExists(Keyspace.java:172) + at org.apache.cassandra.service.accord.AccordDataStore.lambda$snapshot$2(AccordDataStore.java:104) + */ + + if (t instanceof AssertionError + && t.getMessage() != null + && t.getMessage().startsWith("Unknown keyspace ks")) + return true; + return false; + }); + } + } + + @Override + protected TableMetadata defineTable(RandomSource rs, String ks) + { + TableMetadata metadata = super.defineTable(rs, ks); + return metadata.withSwapped(metadata.params.unbuild().transactionalMode(transactionalMode).build()); + } + + @Override + protected State createState(RandomSource rs, Cluster cluster) + { + return new AccordInteropMultiNodeState(rs, cluster); + } + + public class AccordInteropMultiNodeState extends MultiNodeState + { + public AccordInteropMultiNodeState(RandomSource rs, Cluster cluster) + { + super(rs, cluster); + } + + @Override + protected ConsistencyLevel selectCl() + { + return ConsistencyLevel.QUORUM; + } + + @Override + protected ConsistencyLevel mutationCl() + { + return ConsistencyLevel.QUORUM; + } + } +} diff --git a/test/distributed/org/apache/cassandra/distributed/test/cql3/AccordInteropMultiNodeTokenConflictBase.java b/test/distributed/org/apache/cassandra/distributed/test/cql3/AccordInteropMultiNodeTokenConflictBase.java new file mode 100644 index 000000000000..e96675689e86 --- /dev/null +++ b/test/distributed/org/apache/cassandra/distributed/test/cql3/AccordInteropMultiNodeTokenConflictBase.java @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.distributed.test.cql3; + +import accord.utils.Property; +import accord.utils.RandomSource; +import org.apache.cassandra.distributed.Cluster; +import org.apache.cassandra.distributed.api.ConsistencyLevel; +import org.apache.cassandra.service.consensus.TransactionalMode; + +public abstract class AccordInteropMultiNodeTokenConflictBase extends MultiNodeTokenConflictTest +{ + protected AccordInteropMultiNodeTokenConflictBase(TransactionalMode transactionalMode) + { + super(transactionalMode); + } + + @Override + protected void preCheck(Cluster cluster, Property.StatefulBuilder builder) + { + AccordInteropMultiNodeTableWalkBase.addUncaughtExceptionsFilter(cluster); + } + + @Override + protected State createState(RandomSource rs, Cluster cluster) + { + return new AccordInteropState(rs, cluster); + } + + private class AccordInteropState extends State + { + AccordInteropState(RandomSource rs, Cluster cluster) + { + super(rs, cluster); + } + + @Override + protected ConsistencyLevel selectCl() + { + return ConsistencyLevel.QUORUM; + } + + @Override + protected ConsistencyLevel mutationCl() + { + return ConsistencyLevel.QUORUM; + } + } +} diff --git a/test/distributed/org/apache/cassandra/distributed/test/cql3/CasMultiNodeTableWalkBase.java b/test/distributed/org/apache/cassandra/distributed/test/cql3/CasMultiNodeTableWalkBase.java index bf8a44dcb946..31d1aab31f15 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/cql3/CasMultiNodeTableWalkBase.java +++ b/test/distributed/org/apache/cassandra/distributed/test/cql3/CasMultiNodeTableWalkBase.java @@ -78,7 +78,8 @@ private State(RandomSource rs, Cluster cluster) @Override protected Gen toMutationGen(ASTGenerators.MutationGenBuilder mutationGenBuilder) { - mutationGenBuilder.withCasGen(i -> true); + mutationGenBuilder.withCasGen(i -> true) + .withAllowUpdateMultipleClusteringKeys(false); // paxos supports but the model doesn't yet // generator might not always generate a cas statement... should fix generator! Gen gen = toGen(mutationGenBuilder.build()).filter(Mutation::isCas); if (metadata.regularAndStaticColumns().stream().anyMatch(c -> c.type.isUDT()) diff --git a/test/distributed/org/apache/cassandra/distributed/test/cql3/FullAccordInteropMultiNodeTableWalkTest.java b/test/distributed/org/apache/cassandra/distributed/test/cql3/FullAccordInteropMultiNodeTableWalkTest.java new file mode 100644 index 000000000000..3e6468a0a586 --- /dev/null +++ b/test/distributed/org/apache/cassandra/distributed/test/cql3/FullAccordInteropMultiNodeTableWalkTest.java @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.distributed.test.cql3; + +import accord.utils.Property; +import org.apache.cassandra.distributed.Cluster; +import org.apache.cassandra.service.consensus.TransactionalMode; + +public class FullAccordInteropMultiNodeTableWalkTest extends AccordInteropMultiNodeTableWalkBase +{ + public FullAccordInteropMultiNodeTableWalkTest() + { + super(TransactionalMode.full); + } + + @Override + protected void preCheck(Cluster cluster, Property.StatefulBuilder builder) + { + super.preCheck(cluster, builder); + // if a failing seed is detected, populate here + // Example: builder.withSeed(42L); + // CQL operations may have opertors such as +, -, and / (example 4 + 4), to "apply" them to get a constant value + // CQL_DEBUG_APPLY_OPERATOR = true; + } +} diff --git a/test/distributed/org/apache/cassandra/distributed/test/cql3/FullAccordInteropMultiNodeTokenConflictTest.java b/test/distributed/org/apache/cassandra/distributed/test/cql3/FullAccordInteropMultiNodeTokenConflictTest.java new file mode 100644 index 000000000000..7858d184436d --- /dev/null +++ b/test/distributed/org/apache/cassandra/distributed/test/cql3/FullAccordInteropMultiNodeTokenConflictTest.java @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.distributed.test.cql3; + +import accord.utils.Property; +import org.apache.cassandra.distributed.Cluster; +import org.apache.cassandra.service.consensus.TransactionalMode; + +public class FullAccordInteropMultiNodeTokenConflictTest extends AccordInteropMultiNodeTokenConflictBase +{ + public FullAccordInteropMultiNodeTokenConflictTest() + { + super(TransactionalMode.full); + } + + @Override + protected void preCheck(Cluster cluster, Property.StatefulBuilder builder) + { + super.preCheck(cluster, builder); + // if a failing seed is detected, populate here + // Example: builder.withSeed(42L); + // CQL operations may have opertors such as +, -, and / (example 4 + 4), to "apply" them to get a constant value + // CQL_DEBUG_APPLY_OPERATOR = true; + } +} diff --git a/test/distributed/org/apache/cassandra/distributed/test/cql3/MixedReadsAccordInteropMultiNodeTableWalkTest.java b/test/distributed/org/apache/cassandra/distributed/test/cql3/MixedReadsAccordInteropMultiNodeTableWalkTest.java new file mode 100644 index 000000000000..c4e2a3bcd1ee --- /dev/null +++ b/test/distributed/org/apache/cassandra/distributed/test/cql3/MixedReadsAccordInteropMultiNodeTableWalkTest.java @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.distributed.test.cql3; + +import accord.utils.Property; +import org.apache.cassandra.distributed.Cluster; +import org.apache.cassandra.service.consensus.TransactionalMode; + +public class MixedReadsAccordInteropMultiNodeTableWalkTest extends AccordInteropMultiNodeTableWalkBase +{ + public MixedReadsAccordInteropMultiNodeTableWalkTest() + { + super(TransactionalMode.mixed_reads); + } + + @Override + protected void preCheck(Cluster cluster, Property.StatefulBuilder builder) + { + super.preCheck(cluster, builder); + // if a failing seed is detected, populate here + // Example: builder.withSeed(42L); + // CQL operations may have opertors such as +, -, and / (example 4 + 4), to "apply" them to get a constant value + // CQL_DEBUG_APPLY_OPERATOR = true; + } +} diff --git a/test/distributed/org/apache/cassandra/distributed/test/cql3/MixedReadsAccordInteropMultiNodeTokenConflictTest.java b/test/distributed/org/apache/cassandra/distributed/test/cql3/MixedReadsAccordInteropMultiNodeTokenConflictTest.java new file mode 100644 index 000000000000..6d2110b54f3a --- /dev/null +++ b/test/distributed/org/apache/cassandra/distributed/test/cql3/MixedReadsAccordInteropMultiNodeTokenConflictTest.java @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.distributed.test.cql3; + +import accord.utils.Property; +import org.apache.cassandra.distributed.Cluster; +import org.apache.cassandra.service.consensus.TransactionalMode; + +public class MixedReadsAccordInteropMultiNodeTokenConflictTest extends AccordInteropMultiNodeTokenConflictBase +{ + public MixedReadsAccordInteropMultiNodeTokenConflictTest() + { + super(TransactionalMode.mixed_reads); + } + + @Override + protected void preCheck(Cluster cluster, Property.StatefulBuilder builder) + { + super.preCheck(cluster, builder); + // if a failing seed is detected, populate here + // Example: builder.withSeed(42L); + // CQL operations may have opertors such as +, -, and / (example 4 + 4), to "apply" them to get a constant value + // CQL_DEBUG_APPLY_OPERATOR = true; + } +} diff --git a/test/distributed/org/apache/cassandra/distributed/test/cql3/MultiNodeTableWalkBase.java b/test/distributed/org/apache/cassandra/distributed/test/cql3/MultiNodeTableWalkBase.java index 3e9c1195f6a1..126f9ec90839 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/cql3/MultiNodeTableWalkBase.java +++ b/test/distributed/org/apache/cassandra/distributed/test/cql3/MultiNodeTableWalkBase.java @@ -21,12 +21,19 @@ import java.io.IOException; import accord.utils.RandomSource; +import net.bytebuddy.ByteBuddy; +import net.bytebuddy.dynamic.loading.ClassLoadingStrategy; +import net.bytebuddy.implementation.MethodDelegation; import org.apache.cassandra.distributed.Cluster; import org.apache.cassandra.distributed.api.ConsistencyLevel; import org.apache.cassandra.distributed.api.IInstanceConfig; import org.apache.cassandra.distributed.api.IInvokableInstance; import org.apache.cassandra.schema.TableMetadata; import org.apache.cassandra.service.reads.repair.ReadRepairStrategy; +import org.apache.cassandra.utils.Shared; +import org.apache.cassandra.utils.TimeUUID; + +import static net.bytebuddy.matcher.ElementMatchers.named; public abstract class MultiNodeTableWalkBase extends SingleNodeTableWalkTest { @@ -53,12 +60,13 @@ protected TableMetadata defineTable(RandomSource rs, String ks) @Override protected Cluster createCluster() throws IOException { - return createCluster(mockMultiNode ? 1 : 3, this::clusterConfig); + return createCluster(mockMultiNode ? 1 : 3); } @Override protected void clusterConfig(IInstanceConfig c) { + super.clusterConfig(c); c.set("range_request_timeout", "180s") .set("read_request_timeout", "180s") .set("write_request_timeout", "180s") @@ -66,6 +74,12 @@ protected void clusterConfig(IInstanceConfig c) .set("slow_query_log_timeout", "180s"); } + @Override + protected void clusterInitializer(ClassLoader cl, int node) + { + BBHelper.install(cl, node); + } + @Override protected State createState(RandomSource rs, Cluster cluster) { @@ -79,27 +93,6 @@ public MultiNodeState(RandomSource rs, Cluster cluster) super(rs, cluster); } - @Override - public boolean allowNonPartitionQuery() - { - // This is disabled to make CI stable. There are known issues that are being fixed so have to exclude for now - return false; - } - - @Override - public boolean allowNonPartitionMultiColumnQuery() - { - // This is disabled to make CI stable. There are known issues that are being fixed so have to exclude for now - return false; - } - - @Override - public boolean allowPartitionQuery() - { - // This is disabled to make CI stable. There are known issues that are being fixed so have to exclude for now - return false; - } - @Override protected boolean isMultiNode() { @@ -108,6 +101,12 @@ protected boolean isMultiNode() return true; } + @Override + protected boolean allowRepair() + { + return hasEnoughMemtableForRepair() || hasEnoughSSTablesForRepair(); + } + @Override protected IInvokableInstance selectInstance(RandomSource rs) { @@ -131,4 +130,42 @@ protected ConsistencyLevel mutationCl() return ConsistencyLevel.NODE_LOCAL; } } + + /** + * This is not a deterministic clock for TimeUUID, but it's a monotonic clock, which means that any instance that gets + * a TimeUUID from this clock has the propery that its happens-after all other ones cross all instances. + * + * This class came around because TimeUUID.Generator.nextUnixMicros works with milliseconds, and when time doesn't + * move forward (goes back or test is "too fast") then it becomes an instance local bump-counter; this counter allows + * a logically later timeuuid to happens-before a logically earlier one! + */ + @Shared + public static class GlobalClock + { + private static long lastMicros = 0; + public synchronized static long nextUnixMicros() + { + return ++lastMicros; + } + + public synchronized static void reset() + { + // this method isn't actually needed for the property of this class, but it does help isolate any non-deterministic issues + lastMicros = 0; + } + } + + public static class BBHelper + { + static void install(ClassLoader cl, int nodeNumber) + { + new ByteBuddy().rebase(TimeUUID.Generator.class) + .method(named("nextUnixMicros")) + .intercept(MethodDelegation.to(GlobalClock.class)) + .make() + .load(cl, ClassLoadingStrategy.Default.INJECTION); + + GlobalClock.reset(); + } + } } diff --git a/test/distributed/org/apache/cassandra/distributed/test/cql3/MultiNodeTableWalkWithReadRepairTest.java b/test/distributed/org/apache/cassandra/distributed/test/cql3/MultiNodeTableWalkWithReadRepairTest.java index 7727e3a76ab3..a8647668715b 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/cql3/MultiNodeTableWalkWithReadRepairTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/cql3/MultiNodeTableWalkWithReadRepairTest.java @@ -18,13 +18,10 @@ package org.apache.cassandra.distributed.test.cql3; -import org.junit.Ignore; - import accord.utils.Property; import org.apache.cassandra.distributed.Cluster; import org.apache.cassandra.service.reads.repair.ReadRepairStrategy; -@Ignore("In order to stay stable RR tests are ignored for now. Once Single node and multi node w/o RR are stable, then this test should be enabled to include RR testing") public class MultiNodeTableWalkWithReadRepairTest extends MultiNodeTableWalkBase { public MultiNodeTableWalkWithReadRepairTest() diff --git a/test/distributed/org/apache/cassandra/distributed/test/cql3/MultiNodeTokenConflictTest.java b/test/distributed/org/apache/cassandra/distributed/test/cql3/MultiNodeTokenConflictTest.java index 2d296277cfd4..081a2006183b 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/cql3/MultiNodeTokenConflictTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/cql3/MultiNodeTokenConflictTest.java @@ -20,17 +20,31 @@ import java.io.IOException; +import javax.annotation.Nullable; + import accord.utils.Property; import accord.utils.RandomSource; import org.apache.cassandra.distributed.Cluster; import org.apache.cassandra.distributed.api.ConsistencyLevel; +import org.apache.cassandra.distributed.api.IInstanceConfig; import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.service.consensus.TransactionalMode; import org.apache.cassandra.service.reads.repair.ReadRepairStrategy; public class MultiNodeTokenConflictTest extends SingleNodeTokenConflictTest { + protected MultiNodeTokenConflictTest(@Nullable TransactionalMode transactionalMode) + { + super(transactionalMode); + } + + public MultiNodeTokenConflictTest() + { + super(); + } + @Override - protected void preCheck(Property.StatefulBuilder builder) + protected void preCheck(Cluster cluster, Property.StatefulBuilder builder) { // if a failing seed is detected, populate here // Example: builder.withSeed(42L); @@ -46,16 +60,21 @@ protected TableMetadata defineTable(RandomSource rs, String ks) return tbl.unbuild().params(tbl.params.unbuild().readRepair(ReadRepairStrategy.NONE).build()).build(); } + @Override + protected void clusterConfig(IInstanceConfig c) + { + super.clusterConfig(c); + c.set("range_request_timeout", "180s") + .set("read_request_timeout", "180s") + .set("write_request_timeout", "180s") + .set("native_transport_timeout", "180s") + .set("slow_query_log_timeout", "180s"); + } + @Override protected Cluster createCluster() throws IOException { - return createCluster(3, c -> { - c.set("range_request_timeout", "180s") - .set("read_request_timeout", "180s") - .set("write_request_timeout", "180s") - .set("native_transport_timeout", "180s") - .set("slow_query_log_timeout", "180s"); - }); + return createCluster(3); } @Override diff --git a/test/distributed/org/apache/cassandra/distributed/test/cql3/SingleNodeTableWalkTest.java b/test/distributed/org/apache/cassandra/distributed/test/cql3/SingleNodeTableWalkTest.java index 1feb9c5f8693..171ccf140cca 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/cql3/SingleNodeTableWalkTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/cql3/SingleNodeTableWalkTest.java @@ -44,7 +44,7 @@ import accord.utils.RandomSource; import org.apache.cassandra.cql3.KnownIssue; import org.apache.cassandra.cql3.ast.Bind; -import org.apache.cassandra.cql3.ast.Conditional; +import org.apache.cassandra.cql3.ast.Conditional.Where.Inequality; import org.apache.cassandra.cql3.ast.CreateIndexDDL; import org.apache.cassandra.cql3.ast.FunctionCall; import org.apache.cassandra.cql3.ast.Mutation; @@ -59,8 +59,8 @@ import org.apache.cassandra.db.marshal.UTF8Type; import org.apache.cassandra.dht.Murmur3Partitioner; import org.apache.cassandra.distributed.Cluster; -import org.apache.cassandra.distributed.api.IInstanceConfig; import org.apache.cassandra.distributed.test.sai.SAIUtil; +import org.apache.cassandra.utils.LoggingCommand; import org.apache.cassandra.harry.model.BytesPartitionState; import org.apache.cassandra.schema.ColumnMetadata; import org.apache.cassandra.schema.TableMetadata; @@ -95,6 +95,10 @@ public class SingleNodeTableWalkTest extends StatefulASTBase protected static boolean READ_AFTER_WRITE = false; + public SingleNodeTableWalkTest() + { + } + protected void preCheck(Cluster cluster, Property.StatefulBuilder builder) { // if a failing seed is detected, populate here @@ -167,7 +171,7 @@ protected List supportedIndexers() Select.Builder builder = Select.builder().table(state.metadata); builder.where(FunctionCall.tokenByColumns(state.model.factory.partitionColumns), - Conditional.Where.Inequality.EQUAL, + Inequality.EQUAL, token(state, ref)); Select select = builder.build(); @@ -210,10 +214,10 @@ protected List supportedIndexers() else { builder.where(pkToken, - startInclusive ? Conditional.Where.Inequality.GREATER_THAN_EQ : Conditional.Where.Inequality.GREATER_THAN, + startInclusive ? Inequality.GREATER_THAN_EQ : Inequality.GREATER_THAN, token(state, start)); builder.where(pkToken, - endInclusive ? Conditional.Where.Inequality.LESS_THAN_EQ : Conditional.Where.Inequality.LESS_THAN, + endInclusive ? Inequality.LESS_THAN_EQ : Inequality.LESS_THAN, token(state, end)); } Select select = builder.build(); @@ -319,7 +323,7 @@ protected List supportedIndexers() Select select = builder.build(); String annotate = cols.stream().map(symbol -> { var indexed = state.indexes.get(symbol); - return symbol.detailedName() + (indexed == null ? "" : " (indexed with " + indexed.indexDDL.indexer.name() + ")"); + return symbol.detailedName() + (indexed == null ? "" : " (indexed with " + indexed.indexDDL.indexer.name() + ')'); }).collect(Collectors.joining(", ")); return state.command(rs, select, annotate); } @@ -327,7 +331,7 @@ protected List supportedIndexers() private Property.Command simpleRangeSearch(RandomSource rs, State state, Symbol symbol, ByteBuffer value, Select.Builder builder) { // do a simple search, like > or < - Conditional.Where.Inequality kind = state.rangeInequalityGen.next(rs); + Inequality kind = state.rangeInequalityGen.next(rs); builder.where(symbol, kind, value); Select select = builder.build(); var indexed = state.indexes.get(symbol); @@ -350,12 +354,7 @@ protected State createState(RandomSource rs, Cluster cluster) protected Cluster createCluster() throws IOException { - return createCluster(1, this::clusterConfig); - } - - protected void clusterConfig(IInstanceConfig config) - { - + return createCluster(1); } @Test @@ -369,14 +368,20 @@ public void test() throws IOException .add(StatefulASTBase::insert) .add(StatefulASTBase::fullTableScan) .addIf(State::hasPartitions, this::selectExisting) - .addAllIf(State::supportTokens, b -> b.add(this::selectToken) - .add(this::selectTokenRange)) + .addAllIf(State::supportTokens, + this::selectToken, + this::selectTokenRange, + StatefulASTBase::selectMinTokenRange) .addIf(State::hasEnoughMemtable, StatefulASTBase::flushTable) .addIf(State::hasEnoughSSTables, StatefulASTBase::compactTable) + .addAllIf(BaseState::allowRepair, + StatefulASTBase::incrementalRepair, + StatefulASTBase::previewRepair) .addIf(State::allowNonPartitionQuery, this::nonPartitionQuery) .addIf(State::allowNonPartitionMultiColumnQuery, this::multiColumnQuery) .addIf(State::allowPartitionQuery, this::partitionRestrictedQuery) .destroyState(State::close) + .commandsTransformer(LoggingCommand.factory()) .onSuccess(onSuccess(logger)) .build()); } @@ -458,7 +463,8 @@ public State(RandomSource rs, Cluster cluster) .withoutTtl() .withoutTimestamp() .withPartitions(Generators.fromGen(Gens.mixedDistribution(uniquePartitions).next(rs))) - .withColumnExpressions(e -> e.withOperators(Generators.fromGen(BOOLEAN_DISTRIBUTION.next(rs)))); + .withColumnExpressions(e -> e.withOperators(Generators.fromGen(BOOLEAN_DISTRIBUTION.next(rs)))) + .withIgnoreIssues(IGNORED_ISSUES); if (IGNORED_ISSUES.contains(KnownIssue.SAI_EMPTY_TYPE)) { model.factory.regularAndStaticColumns.stream() @@ -558,11 +564,6 @@ private LinkedHashMap createIndexes(RandomSource rs, Tabl return indexed; } - public boolean hasPartitions() - { - return !model.isEmpty(); - } - public boolean supportTokens() { return hasPartitions(); diff --git a/test/distributed/org/apache/cassandra/distributed/test/cql3/SingleNodeTokenConflictTest.java b/test/distributed/org/apache/cassandra/distributed/test/cql3/SingleNodeTokenConflictTest.java index 179b107eae77..090843c4b311 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/cql3/SingleNodeTokenConflictTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/cql3/SingleNodeTokenConflictTest.java @@ -33,6 +33,8 @@ import java.util.TreeSet; import java.util.stream.Collectors; +import javax.annotation.Nullable; + import org.junit.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -55,6 +57,7 @@ import org.apache.cassandra.dht.Murmur3Partitioner.LongToken; import org.apache.cassandra.distributed.Cluster; import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.service.consensus.TransactionalMode; import org.apache.cassandra.tools.nodetool.formatter.TableBuilder; import org.apache.cassandra.utils.ASTGenerators; import org.apache.cassandra.utils.AbstractTypeGenerators; @@ -86,7 +89,20 @@ public class SingleNodeTokenConflictTest extends StatefulASTBase AbstractTypeGenerators.withoutUnsafeEquality(AbstractTypeGenerators.builder() .withTypeKinds(TypeKind.PRIMITIVE)); - protected void preCheck(Property.StatefulBuilder builder) + @Nullable + private final TransactionalMode transactionalMode; + + protected SingleNodeTokenConflictTest(@Nullable TransactionalMode transactionalMode) + { + this.transactionalMode = transactionalMode; + } + + public SingleNodeTokenConflictTest() + { + this(null); + } + + protected void preCheck(Cluster cluster, Property.StatefulBuilder builder) { // if a failing seed is detected, populate here // Example: builder.withSeed(42L); @@ -240,7 +256,7 @@ protected void preCheck(Property.StatefulBuilder builder) protected Cluster createCluster() throws IOException { - return createCluster(1, i -> {}); + return createCluster(1); } @Test @@ -249,7 +265,7 @@ public void test() throws IOException try (Cluster cluster = createCluster()) { Property.StatefulBuilder statefulBuilder = stateful().withExamples(10); - preCheck(statefulBuilder); + preCheck(cluster, statefulBuilder); statefulBuilder.check(commands(() -> rs -> createState(rs, cluster)) .add(StatefulASTBase::insert) //TODO (now, coverage): this is flakey and non-deterministic. When this fails (gives bad response) rerunning the seed yields a passing test! @@ -264,6 +280,7 @@ public void test() throws IOException .add(SingleNodeTokenConflictTest::tokenBetween) .add(SingleNodeTokenConflictTest::tokenRange) .add(SingleNodeTokenConflictTest::tokenBoundRange) + .addIf(State::hasPartitions, StatefulASTBase::selectMinTokenRange) .addIf(State::hasEnoughMemtable, StatefulASTBase::flushTable) .addIf(State::hasEnoughSSTables, StatefulASTBase::compactTable) .destroyState(State::close) @@ -279,23 +296,26 @@ protected State createState(RandomSource rs, Cluster cluster) protected TableMetadata defineTable(RandomSource rs, String ks) { - return toGen(new TableMetadataBuilder() - .withTableKinds(TableMetadata.Kind.REGULAR) - .withKnownMemtables() - .withKeyspaceName(ks).withTableName("tbl") - .withSimpleColumnNames() - .withDefaultTypeGen(SUPPORTED_TYPES) - .withPartitioner(Murmur3Partitioner.instance) - .withPartitionColumnsCount(1) - // this should produce vector... should make this easier... - .withPartitionColumnTypeGen(new TypeGenBuilder() - .withMaxDepth(0) - .withTypeKinds(TypeKind.VECTOR) - .withPrimitives(LongType.instance) - .withVectorSizeGen(i -> 2) - .withDefaultSizeGen(1)) - .build()) - .next(rs); + TableMetadata metadata = toGen(new TableMetadataBuilder() + .withTableKinds(TableMetadata.Kind.REGULAR) + .withKnownMemtables() + .withKeyspaceName(ks).withTableName("tbl") + .withSimpleColumnNames() + .withDefaultTypeGen(SUPPORTED_TYPES) + .withPartitioner(Murmur3Partitioner.instance) + .withPartitionColumnsCount(1) + // this should produce vector... should make this easier... + .withPartitionColumnTypeGen(new TypeGenBuilder() + .withMaxDepth(0) + .withTypeKinds(TypeKind.VECTOR) + .withPrimitives(LongType.instance) + .withVectorSizeGen(i -> 2) + .withDefaultSizeGen(1)) + .build()) + .next(rs); + if (transactionalMode != null) + metadata = metadata.withSwapped(metadata.params.unbuild().transactionalMode(transactionalMode).build()); + return metadata; } class State extends CommonState @@ -335,9 +355,9 @@ class State extends CommonState this.neighbors = rs.nextBoolean() ? Collections.emptyList() : extractNeighbors(pkValues); // in case neighbors conflicts with pkValues or tokenValues, use ImmutableUniqueList which will ignore rather than fail this.pkValues = ImmutableUniqueList.builder() - .mayAddAll(pkValues) - .mayAddAll(tokenValues) - .mayAddAll(neighbors) + .addAll(pkValues) + .addAll(tokenValues) + .addAll(neighbors) .build(); this.pkGen = Gens.pick(pkValues); this.order = new TreeSet<>(PK_TYPE); @@ -367,6 +387,7 @@ class State extends CommonState .withoutTtl() .withoutTimestamp() .withPartitions(SourceDSL.arbitrary().pick(uniquePartitions)) + .withIgnoreIssues(IGNORED_ISSUES) .build()); } diff --git a/test/distributed/org/apache/cassandra/distributed/test/cql3/StatefulASTBase.java b/test/distributed/org/apache/cassandra/distributed/test/cql3/StatefulASTBase.java index a90b4679797c..527a7ea65864 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/cql3/StatefulASTBase.java +++ b/test/distributed/org/apache/cassandra/distributed/test/cql3/StatefulASTBase.java @@ -22,17 +22,20 @@ import java.net.InetSocketAddress; import java.nio.ByteBuffer; import java.util.ArrayList; +import java.util.Arrays; import java.util.EnumSet; import java.util.List; -import java.util.Objects; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; +import java.util.function.BiConsumer; import java.util.function.Consumer; import java.util.stream.Collectors; import java.util.stream.Stream; import javax.annotation.Nullable; import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.Maps; import org.slf4j.Logger; import accord.utils.Gen; @@ -45,12 +48,15 @@ import com.datastax.driver.core.Session; import com.datastax.driver.core.SimpleStatement; import com.datastax.driver.core.SocketOptions; +import com.datastax.driver.core.exceptions.ReadFailureException; +import com.datastax.driver.core.exceptions.WriteFailureException; import org.apache.cassandra.config.CassandraRelevantProperties; import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.cql3.KnownIssue; import org.apache.cassandra.cql3.ast.Bind; import org.apache.cassandra.cql3.ast.CQLFormatter; import org.apache.cassandra.cql3.ast.Conditional; +import org.apache.cassandra.cql3.ast.FunctionCall; import org.apache.cassandra.cql3.ast.Literal; import org.apache.cassandra.cql3.ast.Mutation; import org.apache.cassandra.cql3.ast.Select; @@ -64,6 +70,7 @@ import org.apache.cassandra.db.marshal.AsciiType; import org.apache.cassandra.db.marshal.BytesType; import org.apache.cassandra.db.marshal.UTF8Type; +import org.apache.cassandra.dht.Token; import org.apache.cassandra.distributed.Cluster; import org.apache.cassandra.distributed.api.ConsistencyLevel; import org.apache.cassandra.distributed.api.Feature; @@ -72,8 +79,12 @@ import org.apache.cassandra.distributed.api.IInvokableInstance; import org.apache.cassandra.distributed.test.JavaDriverUtils; import org.apache.cassandra.distributed.test.TestBaseImpl; +import org.apache.cassandra.exceptions.RequestFailureReason; import org.apache.cassandra.harry.model.ASTSingleTableModel; import org.apache.cassandra.harry.util.StringUtils; +import org.apache.cassandra.repair.RepairGenerators; +import org.apache.cassandra.repair.RepairGenerators.PreviewType; +import org.apache.cassandra.repair.RepairGenerators.RepairType; import org.apache.cassandra.schema.TableMetadata; import org.apache.cassandra.utils.AbstractTypeGenerators; import org.apache.cassandra.utils.CassandraGenerators; @@ -112,6 +123,8 @@ public class StatefulASTBase extends TestBaseImpl .collect(Collectors.toList())); protected static final Gen FETCH_SIZE_DISTRO = Gens.mixedDistribution(new int[] {1, 10, 100, 1000, 5000}); protected static final Gen LIMIT_DISTRO = Gens.mixedDistribution(1, 1001); + protected static final Gen REPAIR_TYPE_EMPTY_MODEL_DISTRO = Gens.mixedDistribution(0, 2); + protected static final Gen REPAIR_TYPE_DISTRO = Gens.mixedDistribution(0, 3); static { @@ -137,9 +150,25 @@ protected static String nextKeyspace() return "ks" + COUNTER.incrementAndGet(); } - protected static Cluster createCluster(int nodeCount, Consumer config) throws IOException + protected void clusterConfig(IInstanceConfig config) + { + config.set("repair.retries.max_attempts", Integer.MAX_VALUE); + } + + protected void clusterInitializer(ClassLoader cl, int node) + { + + } + + protected Cluster createCluster(int nodeCount) throws IOException + { + return createCluster(nodeCount, this::clusterConfig, this::clusterInitializer); + } + + protected static Cluster createCluster(int nodeCount, Consumer config, BiConsumer instanceInitializer) throws IOException { Cluster cluster = Cluster.build(nodeCount) + .withInstanceInitializer(instanceInitializer) .withConfig(c -> { c.with(Feature.NATIVE_PROTOCOL, Feature.NETWORK, Feature.GOSSIP) // When drop tables or truncate are performed, we attempt to take snapshots. This can be costly and isn't needed by these tests @@ -163,7 +192,7 @@ protected Property.StatefulSuccess onSuccess(Logg protected static Property.Command flushTable(RandomSource rs, S state) { - return new Property.SimpleCommand<>("nodetool flush " + state.metadata.keyspace + " " + state.metadata.name, s2 -> { + return new Property.SimpleCommand<>("nodetool flush " + state.metadata.keyspace + ' ' + state.metadata.name, s2 -> { s2.cluster.forEach(i -> i.nodetoolResult("flush", s2.metadata.keyspace, s2.metadata.name).asserts().success()); s2.flush(); }); @@ -171,7 +200,7 @@ protected Property.StatefulSuccess onSuccess(Logg protected static Property.Command compactTable(RandomSource rs, S state) { - return new Property.SimpleCommand<>("nodetool compact " + state.metadata.keyspace + " " + state.metadata.name, s2 -> { + return new Property.SimpleCommand<>("nodetool compact " + state.metadata.keyspace + ' ' + state.metadata.name, s2 -> { state.cluster.forEach(i -> i.nodetoolResult("compact", s2.metadata.keyspace, s2.metadata.name).asserts().success()); s2.compact(); }); @@ -189,6 +218,59 @@ protected Property.StatefulSuccess onSuccess(Logg state.commandSafeRandomHistory(selectForMutation(state, mutation), "Select for Mutation Validation")); } + protected static Property.Command incrementalRepair(RandomSource rs, S state) + { + return repair(rs, state, state.repairArgsBuilder().withType(i -> RepairType.IR).withPreviewType(i -> PreviewType.NONE), null); + } + + protected static Property.Command previewRepair(RandomSource rs, S state) + { + return repair(rs, state, state.repairArgsBuilder().withType(i -> RepairType.FULL).withPreviewType(i -> PreviewType.REPAIRED), null); + } + + protected static Property.Command repair(RandomSource rs, S state, RepairGenerators.Builder argsBuilder, @Nullable String annotate) + { + IInvokableInstance inst = state.selectInstance(rs); + Gen> argsGen = argsBuilder.build(); + List args = ImmutableList.builder() + .add("repair") + .addAll(argsGen.next(rs)) + .build(); + boolean preview = RepairGenerators.isPreview(args); + // mimic org.apache.cassandra.repair.state.CoordinatorState.getType + String type; + if (preview) + { + // mimic org.apache.cassandra.tools.nodetool.Repair.getPreviewKind + PreviewType previewType = RepairGenerators.previewType(args); + switch (previewType) + { + case REPAIRED: + type = "preview repaired"; + break; + case UNREPAIRED: + type = RepairGenerators.isFull(args) ? "preview full" : "preview unrepaired"; + break; + default: + throw new UnsupportedOperationException(previewType.name()); + } + } + else + { + type = RepairGenerators.isFull(args) ? "full" : "incremental"; + } + + String postfix = "type " + type + ", on " + inst; + if (annotate == null) annotate = postfix; + else annotate += ", " + postfix; + + return new Property.SimpleCommand<>("nodetool " + String.join(" ", args) + " -- " + annotate, s2 -> { + inst.nodetoolResult(args.toArray(String[]::new)).asserts().success(); + if (!preview) + s2.repair(); + }); + } + private static Select selectForMutation(S state, Mutation mutation) { var select = Select.builder(state.metadata).allowFiltering(); @@ -217,6 +299,42 @@ private static Select selectForMutation(S state, Mutatio return state.command(rs, select, "full table scan"); } + protected static Property.Command selectMinTokenRange(RandomSource rs, S state) + { + var key = rs.pickOrderedSet(state.model.partitionKeys()); + FunctionCall tokenCall = FunctionCall.tokenByColumns(state.model.factory.partitionColumns); + Literal min = Literal.of(key.token.getLongValue()); + Literal max = Literal.of(Long.MIN_VALUE); + if (rs.nextBoolean()) + { + Literal tmp = min; + min = max; + max = tmp; + } + Select select; + if (rs.nextBoolean()) + { + select = Select.builder(state.metadata) + .where(tokenCall, state.greaterThanGen.next(rs), min) + .where(tokenCall, state.lessThanGen.next(rs), max) + .build(); + } + else + { + // it's possible that the range was flipped, which is known bug with BETWEEN, so + // make sure the range is not flipped until that bug is fixed + if (IGNORED_ISSUES.contains(KnownIssue.BETWEEN_START_LARGER_THAN_END)) + { + min = Literal.of(key.token.getLongValue()); + max = Literal.of(Long.MIN_VALUE); + } + select = Select.builder(state.metadata) + .between(tokenCall, min, max) + .build(); + } + return state.command(rs, select, "min token range"); + } + protected static abstract class BaseState implements AutoCloseable { protected final RandomSource rs; @@ -230,16 +348,19 @@ protected static abstract class BaseState implements AutoCloseable protected final Gen lessThanGen; protected final Gen greaterThanGen; protected final Gen rangeInequalityGen; + protected final Gen.IntGen repairTypeEmptyModelGen, repairTypeGen; protected final Gen.IntGen fetchSizeGen; protected final TableMetadata metadata; protected final TableReference tableRef; protected final ASTSingleTableModel model; + private final String sstableFormatName; private final Visitor debug; - private final int enoughMemtables; - private final int enoughSSTables; + private final int enoughMemtables, enoughMemtablesForRepair; + private final int enoughSSTables, enoughSSTablesForRepair; protected int numMutations, mutationsSinceLastFlush; - protected int numFlushes, flushesSinceLastCompaction; + protected int numFlushes, flushesSinceLastCompaction, flushesSinceLastRepair; protected int numCompact; + protected int numRepairs; protected int operations; protected BaseState(RandomSource rs, Cluster cluster, TableMetadata metadata) @@ -264,13 +385,26 @@ protected BaseState(RandomSource rs, Cluster cluster, TableMetadata metadata) this.perPartitionLimitGen = LIMIT_DISTRO.next(rs); this.limitGen = LIMIT_DISTRO.next(rs); - this.enoughMemtables = rs.pickInt(3, 10, 50); + this.repairTypeEmptyModelGen = REPAIR_TYPE_EMPTY_MODEL_DISTRO.next(rs); + this.repairTypeGen = REPAIR_TYPE_DISTRO.next(rs); + + this.enoughMemtables = rs.pickInt(1, 3, 10, 50); + this.enoughMemtablesForRepair = rs.pickInt(1, 3, 10, 50); this.enoughSSTables = rs.pickInt(3, 10, 50); + this.enoughSSTablesForRepair = rs.pickInt(1, 3, 10, 50); this.metadata = metadata; this.tableRef = TableReference.from(metadata); - this.model = new ASTSingleTableModel(metadata); + this.model = new ASTSingleTableModel(metadata, IGNORED_ISSUES); createTable(metadata); + + String sstableFormatName = this.sstableFormatName = Generators.toGen(CassandraGenerators.sstableFormatNames()).next(rs); + cluster.forEach(i -> i.runOnInstance(() -> DatabaseDescriptor.setSelectedSSTableFormat(sstableFormatName))); + } + + public boolean hasPartitions() + { + return !model.isEmpty(); } protected boolean readAfterWrite() @@ -301,6 +435,35 @@ private String createKeyspaceCQL(String ks) return command(rs, select, null); } + protected boolean allowRepair() + { + return false; + } + + protected RepairGenerators.Builder repairArgsBuilder() + { + return new RepairGenerators.Builder(i -> Arrays.asList(metadata.keyspace, metadata.name)) + // paxos cleanup's finish prepare is delayed based off CAS/Write timeout, but these tests make that 3 minutes (so CI is stable) + // which means this step is delayed 3 minutes, making repairs suppppper slow... + // see org.apache.cassandra.service.paxos.cleanup.PaxosCleanup#finishPrepare + .withSkipPaxosGen(i -> true) + .withRanges(rs -> { + switch (model.isEmpty() ? repairTypeEmptyModelGen.next(rs) : repairTypeGen.next(rs)) + { + case 0: return RepairGenerators.LOCAL_RANGE; + case 1: return RepairGenerators.PRIMARY_RANGE; + case 2: + { + Token a = rs.pickOrderedSet(model.partitionKeys()).token; + return List.of("--start-token", Long.toString(a.getLongValue() - 1), + "--end-token", a.toString()); + } + default: throw new UnsupportedOperationException(); + } + }) + ; + } + protected boolean allowLimit(Select select) { //TODO (coverage): allow this in the model! @@ -388,8 +551,8 @@ protected ConsistencyLevel mutationCl() else annotate += ", " + postfix; Mutation finalMutation = mutation; return new Property.SimpleCommand<>(humanReadable(mutation, annotate), s -> { - s.executeQuery(inst, Integer.MAX_VALUE, s.mutationCl(), finalMutation); - s.model.update(finalMutation); + var result = s.executeQuery(inst, Integer.MAX_VALUE, s.mutationCl(), finalMutation); + s.model.updateAndValidate(result, finalMutation); s.mutation(); }); } @@ -404,11 +567,23 @@ protected boolean hasEnoughMemtable() return mutationsSinceLastFlush > enoughMemtables; } + protected boolean hasEnoughMemtableForRepair() + { + // use last flush rather than last repair as this method cares about data in the memtable + // and not amount of mutations since repair + return mutationsSinceLastFlush > enoughMemtablesForRepair; + } + protected boolean hasEnoughSSTables() { return flushesSinceLastCompaction > enoughSSTables; } + protected boolean hasEnoughSSTablesForRepair() + { + return flushesSinceLastRepair > enoughSSTablesForRepair; + } + protected void mutation() { numMutations++; @@ -420,6 +595,7 @@ protected void flush() mutationsSinceLastFlush = 0; numFlushes++; flushesSinceLastCompaction++; + flushesSinceLastRepair++; } protected void compact() @@ -428,6 +604,15 @@ protected void compact() numCompact++; } + protected void repair() + { + if (mutationsSinceLastFlush > 0) + flush(); + + numRepairs++; + flushesSinceLastRepair = 0; + } + protected Value value(RandomSource rs, ByteBuffer bb, AbstractType type) { return bindOrLiteralGen.next(rs) ? new Bind(bb, type) : new Literal(bb, type); @@ -451,7 +636,7 @@ protected ByteBuffer[][] executeQuery(IInstance instance, int fetchSize, Consist SimpleStatement ss = new SimpleStatement(stmt.toCQL(), (Object[]) stmt.bindsEncoded()); if (fetchSize != Integer.MAX_VALUE) ss.setFetchSize(fetchSize); - if (stmt instanceof Mutation) + if (stmt.kind() == Statement.Kind.MUTATION) { switch (cl) { @@ -479,11 +664,35 @@ protected ByteBuffer[][] executeQuery(IInstance instance, int fetchSize, Consist .findAny() .get(); ss.setHost(host); - ResultSet result = session.execute(ss); + ResultSet result; + try + { + result = session.execute(ss); + } + catch (ReadFailureException t) + { + throw new AssertionError("failed from=" + Maps.transformValues(t.getFailuresMap(), BaseState::safeErrorCode), t); + } + catch (WriteFailureException t) + { + throw new AssertionError("failed from=" + Maps.transformValues(t.getFailuresMap(), BaseState::safeErrorCode), t); + } return getRowsAsByteBuffer(result); } } + private static String safeErrorCode(Integer code) + { + try + { + return RequestFailureReason.fromCode(code).name(); + } + catch (IllegalArgumentException e) + { + return "Unexpected code " + code + ": " + e.getMessage(); + } + } + @VisibleForTesting static ByteBuffer[][] getRowsAsByteBuffer(ResultSet result) { @@ -512,7 +721,8 @@ private String humanReadable(Statement stmt, @Nullable String annotate) protected void toString(StringBuilder sb) { - sb.append(createKeyspaceCQL(metadata.keyspace)); + sb.append("Config:\nsstable:\n\tselected_format: ").append(sstableFormatName); + sb.append('\n').append(createKeyspaceCQL(metadata.keyspace)); CassandraGenerators.visitUDTs(metadata, udt -> sb.append('\n').append(udt.toCqlString(false, false, true)).append(';')); sb.append('\n').append(metadata.toCqlString(false, false, false)); } @@ -533,39 +743,6 @@ public String toString() toString(sb); return sb.toString(); } - - private static final class ValueWithType - { - final ByteBuffer value; - final AbstractType type; - - private ValueWithType(ByteBuffer value, AbstractType type) - { - this.value = value; - this.type = type; - } - - @Override - public boolean equals(Object o) - { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - ValueWithType value1 = (ValueWithType) o; - return value.equals(value1.value) && type.equals(value1.type); - } - - @Override - public int hashCode() - { - return Objects.hash(value, type); - } - - @Override - public String toString() - { - return type.toCQLString(value); - } - } } protected static abstract class CommonState extends BaseState diff --git a/test/distributed/org/apache/cassandra/distributed/test/guardrails/GuardrailCollectionSizeOnSSTableWriteTest.java b/test/distributed/org/apache/cassandra/distributed/test/guardrails/GuardrailCollectionSizeOnSSTableWriteTest.java index 6b7daef26feb..3405185d451d 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/guardrails/GuardrailCollectionSizeOnSSTableWriteTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/guardrails/GuardrailCollectionSizeOnSSTableWriteTest.java @@ -26,9 +26,12 @@ import com.datastax.driver.core.Session; import com.datastax.driver.core.SimpleStatement; +import org.apache.cassandra.db.ColumnFamilyStore; +import org.apache.cassandra.db.Keyspace; import org.apache.cassandra.db.guardrails.Guardrails; import org.apache.cassandra.distributed.Cluster; import org.apache.cassandra.distributed.api.Feature; +import org.apache.cassandra.distributed.api.IInvokableInstance; import static java.nio.ByteBuffer.allocate; @@ -57,7 +60,6 @@ public static void setupCluster() throws IOException .set("collection_size_warn_threshold", WARN_THRESHOLD + "B") .set("collection_size_fail_threshold", FAIL_THRESHOLD + "B")) .start()); - cluster.disableAutoCompaction(KEYSPACE); driverCluster = com.datastax.driver.core.Cluster.builder().addContactPoint("127.0.0.1").build(); driverSession = driverCluster.connect(); } @@ -84,7 +86,7 @@ protected Cluster getCluster() @Test public void testSetSize() throws Throwable { - schemaChange("CREATE TABLE %s (k int PRIMARY KEY, v set)"); + createTable("CREATE TABLE %s (k int PRIMARY KEY, v set)"); execute("INSERT INTO %s (k, v) VALUES (0, null)"); execute("INSERT INTO %s (k, v) VALUES (1, ?)", set()); @@ -108,7 +110,7 @@ public void testSetSize() throws Throwable @Test public void testSetSizeFrozen() { - schemaChange("CREATE TABLE %s (k int PRIMARY KEY, v frozen>)"); + createTable("CREATE TABLE %s (k int PRIMARY KEY, v frozen>)"); execute("INSERT INTO %s (k, v) VALUES (0, null)"); execute("INSERT INTO %s (k, v) VALUES (1, ?)", set()); @@ -123,7 +125,7 @@ public void testSetSizeFrozen() @Test public void testSetSizeWithUpdates() { - schemaChange("CREATE TABLE %s (k int PRIMARY KEY, v set)"); + createTable("CREATE TABLE %s (k int PRIMARY KEY, v set)"); execute("INSERT INTO %s (k, v) VALUES (0, ?)", set(allocate(1))); execute("UPDATE %s SET v = v + ? WHERE k = 0", set(allocate(1))); @@ -145,7 +147,7 @@ public void testSetSizeWithUpdates() @Test public void testSetSizeAfterCompaction() throws Throwable { - schemaChange("CREATE TABLE %s (k int PRIMARY KEY, v set)"); + createTable("CREATE TABLE %s (k int PRIMARY KEY, v set)"); execute("INSERT INTO %s (k, v) VALUES (0, ?)", set(allocate(1))); assertNotWarnedOnFlush(); @@ -175,7 +177,7 @@ public void testSetSizeAfterCompaction() throws Throwable @Test public void testListSize() throws Throwable { - schemaChange("CREATE TABLE %s (k int PRIMARY KEY, v list)"); + createTable("CREATE TABLE %s (k int PRIMARY KEY, v list)"); execute("INSERT INTO %s (k, v) VALUES (0, null)"); execute("INSERT INTO %s (k, v) VALUES (1, ?)", list()); @@ -199,7 +201,7 @@ public void testListSize() throws Throwable @Test public void testListSizeFrozen() { - schemaChange("CREATE TABLE %s (k int PRIMARY KEY, v frozen>)"); + createTable("CREATE TABLE %s (k int PRIMARY KEY, v frozen>)"); execute("INSERT INTO %s (k, v) VALUES (0, null)"); execute("INSERT INTO %s (k, v) VALUES (1, ?)", list()); @@ -214,7 +216,7 @@ public void testListSizeFrozen() @Test public void testListSizeWithUpdates() { - schemaChange("CREATE TABLE %s (k int PRIMARY KEY, v list)"); + createTable("CREATE TABLE %s (k int PRIMARY KEY, v list)"); execute("INSERT INTO %s (k, v) VALUES (0, ?)", list(allocate(1))); execute("UPDATE %s SET v = v + ? WHERE k = 0", list(allocate(1))); @@ -236,7 +238,7 @@ public void testListSizeWithUpdates() @Test public void testListSizeAfterCompaction() throws Throwable { - schemaChange("CREATE TABLE %s (k int PRIMARY KEY, v list)"); + createTable("CREATE TABLE %s (k int PRIMARY KEY, v list)"); execute("INSERT INTO %s (k, v) VALUES (0, ?)", list(allocate(1))); assertNotWarnedOnFlush(); @@ -266,7 +268,7 @@ public void testListSizeAfterCompaction() throws Throwable @Test public void testMapSize() throws Throwable { - schemaChange("CREATE TABLE %s (k int PRIMARY KEY, v map)"); + createTable("CREATE TABLE %s (k int PRIMARY KEY, v map)"); execute("INSERT INTO %s (k, v) VALUES (0, null)"); execute("INSERT INTO %s (k, v) VALUES (1, ?)", map()); @@ -297,7 +299,7 @@ public void testMapSize() throws Throwable @Test public void testMapSizeFrozen() { - schemaChange("CREATE TABLE %s (k int PRIMARY KEY, v frozen>)"); + createTable("CREATE TABLE %s (k int PRIMARY KEY, v frozen>)"); execute("INSERT INTO %s (k, v) VALUES (0, null)"); execute("INSERT INTO %s (k, v) VALUES (1, ?)", map()); @@ -316,7 +318,7 @@ public void testMapSizeFrozen() @Test public void testMapSizeWithUpdates() { - schemaChange("CREATE TABLE %s (k int PRIMARY KEY, v map)"); + createTable("CREATE TABLE %s (k int PRIMARY KEY, v map)"); execute("INSERT INTO %s (k, v) VALUES (0, ?)", map(allocate(1), allocate(1))); execute("UPDATE %s SET v = v + ? WHERE k = 0", map(allocate(1), allocate(1))); @@ -350,7 +352,7 @@ public void testMapSizeWithUpdates() @Test public void testMapSizeAfterCompaction() { - schemaChange("CREATE TABLE %s (k int PRIMARY KEY, v map)"); + createTable("CREATE TABLE %s (k int PRIMARY KEY, v map)"); execute("INSERT INTO %s (k, v) VALUES (0, ?)", map(allocate(1), allocate(1))); execute("UPDATE %s SET v = v + ? WHERE k = 0", map(allocate(1), allocate(1))); @@ -397,7 +399,7 @@ public void testMapSizeAfterCompaction() @Test public void testCompositePartitionKey() { - schemaChange("CREATE TABLE %s (k1 int, k2 text, v set, PRIMARY KEY((k1, k2)))"); + createTable("CREATE TABLE %s (k1 int, k2 text, v set, PRIMARY KEY((k1, k2)))"); execute("INSERT INTO %s (k1, k2, v) VALUES (0, 'a', ?)", set(allocate(WARN_THRESHOLD))); assertWarnedOnFlush(warnMessage("(0, 'a')")); @@ -409,7 +411,7 @@ public void testCompositePartitionKey() @Test public void testCompositeClusteringKey() { - schemaChange("CREATE TABLE %s (k int, c1 int, c2 text, v set, PRIMARY KEY(k, c1, c2))"); + createTable("CREATE TABLE %s (k int, c1 int, c2 text, v set, PRIMARY KEY(k, c1, c2))"); execute("INSERT INTO %s (k, c1, c2, v) VALUES (1, 10, 'a', ?)", set(allocate(WARN_THRESHOLD))); assertWarnedOnFlush(warnMessage("(1, 10, 'a')")); @@ -434,4 +436,16 @@ private String failMessage(String key) { return String.format("Detected collection v in row %s in table %s of size", key, qualifiedTableName); } + + private void createTable(String cql) + { + schemaChange(cql); + for (IInvokableInstance instance : cluster) + { + instance.runOnInstance(() -> { + for (ColumnFamilyStore cs : Keyspace.open(KEYSPACE).getColumnFamilyStores()) + cs.disableAutoCompaction(); + }); + } + } } diff --git a/test/distributed/org/apache/cassandra/distributed/test/jmx/JMXTestsUtil.java b/test/distributed/org/apache/cassandra/distributed/test/jmx/JMXTestsUtil.java index 5726c242d9e7..ce6c301b3993 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/jmx/JMXTestsUtil.java +++ b/test/distributed/org/apache/cassandra/distributed/test/jmx/JMXTestsUtil.java @@ -20,6 +20,7 @@ import java.util.ArrayList; import java.util.Collections; +import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; @@ -64,7 +65,17 @@ public class JMXTestsUtil "org.apache.cassandra.db:type=StorageService:resumeMove", // throws since there is no move in progress "org.apache.cassandra.db:type=StorageService:abortMove", // throws since there is no move in progress "org.apache.cassandra.db:type=CIDRGroupsMappingManager:loadCidrGroupsCache", // AllowAllCIDRAuthorizer doesn't support this operation, as feature is disabled by default - "org.apache.cassandra.db:type=StorageService:forceRemoveCompletion" // deprecated (TCM) + "org.apache.cassandra.db:type=StorageService:forceRemoveCompletion", // deprecated (TCM) + "org.apache.cassandra.db:type=StorageService:createEpochUnsafe" // for Accord testing, but will likely be removed + ); + // This set of mbeans are registered early enough during the startup of a + // Cassandra instance for in-jvm dtests to avoid missing registration of mbeans. + // We ignore both "org.apache.cassandra.diag:type=DiagnosticEventService" and + // "org.apache.cassandra.diag:type=LastEventIdBroadcaster" because they are being intialized + // outside the scope of the in-jvm Instance initialization. + private static final Set EXPECTED_MBEANS_TO_BE_REGISTERED = Set.of( + "org.apache.cassandra.db:type=EndpointSnitchInfo", + "org.apache.cassandra.db:type=LocationInfo" ); /** @@ -75,6 +86,7 @@ public class JMXTestsUtil */ public static void testAllValidGetters(Cluster cluster, Map jmxEnv) throws Exception { + Set missingExpectedMbeans = new HashSet<>(EXPECTED_MBEANS_TO_BE_REGISTERED); for (IInvokableInstance instance : cluster) { if (instance.isShutdown()) @@ -91,6 +103,7 @@ public static void testAllValidGetters(Cluster cluster, Map jmxEnv) t { if (!name.getDomain().startsWith("org.apache.cassandra")) continue; + missingExpectedMbeans.remove(name.getCanonicalName()); MBeanInfo info = mbsc.getMBeanInfo(name); for (MBeanAttributeInfo a : info.getAttributes()) { @@ -123,7 +136,7 @@ public static void testAllValidGetters(Cluster cluster, Map jmxEnv) t } } } - if (!errors.isEmpty()) + if (!errors.isEmpty() || !missingExpectedMbeans.isEmpty()) { AssertionError root = new AssertionError(); for (Named error : errors) @@ -132,6 +145,13 @@ public static void testAllValidGetters(Cluster cluster, Map jmxEnv) t logger.error("Error {}", error.getMessage()); root.addSuppressed(error); } + for (String missingMbean : missingExpectedMbeans) + { + // The Named object's message has the cause also so this only logs the message + String errorMessage = String.format("Expected mbean %s was not found", missingMbean); + logger.error(errorMessage); + root.addSuppressed(new RuntimeException(errorMessage)); + } throw root; } } diff --git a/test/distributed/org/apache/cassandra/distributed/test/log/AlterTopologyTest.java b/test/distributed/org/apache/cassandra/distributed/test/log/AlterTopologyTest.java new file mode 100644 index 000000000000..85c7040a58a7 --- /dev/null +++ b/test/distributed/org/apache/cassandra/distributed/test/log/AlterTopologyTest.java @@ -0,0 +1,267 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.distributed.test.log; + +import java.util.HashMap; +import java.util.Map; + +import org.junit.Test; + +import org.apache.cassandra.cql3.QueryProcessor; +import org.apache.cassandra.cql3.UntypedResultSet; +import org.apache.cassandra.distributed.Cluster; +import org.apache.cassandra.distributed.api.IInvokableInstance; +import org.apache.cassandra.distributed.api.TokenSupplier; +import org.apache.cassandra.distributed.shared.ClusterUtils; +import org.apache.cassandra.exceptions.ExceptionCode; +import org.apache.cassandra.harry.SchemaSpec; +import org.apache.cassandra.harry.dsl.HistoryBuilder; +import org.apache.cassandra.harry.dsl.ReplayingHistoryBuilder; +import org.apache.cassandra.harry.execution.InJvmDTestVisitExecutor; +import org.apache.cassandra.harry.gen.Generator; +import org.apache.cassandra.harry.gen.Generators; +import org.apache.cassandra.harry.gen.SchemaGenerators; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.tcm.ClusterMetadataService; +import org.apache.cassandra.tcm.membership.Directory; +import org.apache.cassandra.tcm.membership.Location; +import org.apache.cassandra.tcm.membership.NodeId; +import org.apache.cassandra.tcm.ownership.PlacementProvider; +import org.apache.cassandra.tcm.sequences.SequencesUtils.ClearLockedRanges; +import org.apache.cassandra.tcm.sequences.SequencesUtils.LockRanges; +import org.apache.cassandra.tcm.transformations.AlterTopology; +import org.apache.cassandra.tcm.transformations.CustomTransformation; +import org.awaitility.Awaitility; +import org.awaitility.core.ConditionFactory; + +import static java.time.Duration.ofSeconds; +import static org.apache.cassandra.distributed.api.Feature.GOSSIP; +import static org.apache.cassandra.distributed.shared.ClusterUtils.waitForCMSToQuiesce; +import static org.apache.cassandra.harry.checker.TestHelper.withRandom; +import static org.junit.Assert.assertEquals; + +public class AlterTopologyTest extends FuzzTestBase +{ + @Test + public void testTopologyChanges() throws Exception + { + Generator schemaGen = SchemaGenerators.schemaSpecGen(KEYSPACE, "change_topology_test", 1000); + try (Cluster cluster = builder().withTokenSupplier(TokenSupplier.evenlyDistributedTokens(4)) + .withRack("dc1", "rack1", 1) + .withRack("dc1", "rack2", 1) + .withRack("dc1", "rack3", 1) + .withRack("dc1", "rack4", 1) + .withConfig(config -> config.with(GOSSIP)) + .withNodes(4) + .start()) + { + IInvokableInstance cmsInstance = cluster.get(1); + + withRandom(rng -> { + SchemaSpec schema = schemaGen.generate(rng); + Generators.TrackingGenerator pkGen = Generators.tracking(Generators.int32(0, Math.min(schema.valueGenerators.pkPopulation(), 1000))); + Generator ckGen = Generators.int32(0, Math.min(schema.valueGenerators.ckPopulation(), 1000)); + + HistoryBuilder history = new ReplayingHistoryBuilder(schema.valueGenerators, + (hb) -> InJvmDTestVisitExecutor.builder() + .nodeSelector(i -> 1) + .build(schema, hb, cluster)); + history.custom(() -> { + cluster.schemaChange("CREATE KEYSPACE " + KEYSPACE + + " WITH replication = {'class': 'NetworkTopologyStrategy', 'dc1' : 3 };"); + cluster.schemaChange(schema.compile()); + waitForCMSToQuiesce(cluster, cmsInstance); + }, "Setup"); + + + Runnable writeAndValidate = () -> { + for (int i = 0; i < 2000; i++) + history.insert(pkGen.generate(rng), ckGen.generate(rng)); + + for (int pk : pkGen.generated()) + history.selectPartition(pk); + }; + writeAndValidate.run(); + + cluster.forEach(i -> i.runOnInstance(() -> { + CustomTransformation.registerExtension(LockRanges.NAME, LockRanges.serializer); + CustomTransformation.registerExtension(ClearLockedRanges.NAME, ClearLockedRanges.serializer); + })); + + // a dc change which affects placements is not allowed, so expect a rejection + history.custom(() -> { + cmsInstance.runOnInstance(() -> { + PlacementProvider pp = ClusterMetadataService.instance().placementProvider(); + NodeId id = ClusterMetadata.current().myNodeId(); + Map updates = new HashMap<>(); + updates.put(id, new Location("dcX", "rack1")); + assertAlterTopologyRejection(pp, updates, "Proposed updates modify data placements"); + }); + }, "DC change affecting placements"); + + // a rack change which affects placements is also not allowed + history.custom(() -> { + cmsInstance.runOnInstance(() -> { + PlacementProvider pp = ClusterMetadataService.instance().placementProvider(); + NodeId id = ClusterMetadata.current().myNodeId(); + Map updates = new HashMap<>(); + updates.put(id, new Location("dc1", "rack2")); + assertAlterTopologyRejection(pp, updates, "Proposed updates modify data placements"); + }); + },"Rack change affecting placements "); + + // submit an update which would not modify placements so would normally be accepted + history.custom(() -> { + cmsInstance.runOnInstance(() -> { + PlacementProvider pp = ClusterMetadataService.instance().placementProvider(); + NodeId id = ClusterMetadata.current().myNodeId(); + Map updates = new HashMap<>(); + updates.put(id, new Location("dc1", "rack99")); + // if there are locked ranges, implying in-progress range movements, any update is rejected + ClusterMetadataService.instance().commit(new CustomTransformation(LockRanges.NAME, new LockRanges())); + assertAlterTopologyRejection(pp, updates, "The requested topology changes cannot be executed while there are ongoing range movements"); + + // but if no movements are in flight, the update is allowed + ClusterMetadataService.instance().commit(new CustomTransformation(ClearLockedRanges.NAME, new ClearLockedRanges())); + ClusterMetadataService.instance().commit(new AlterTopology(updates, pp)); + if (!ClusterMetadata.current().directory.location(id).rack.equals("rack99")) + throw new AssertionError("Expected rack to have changed"); + }); + }, "Rack change not affecting placements"); + + // changing multiple/all racks atomically + history.custom(() -> { + cmsInstance.runOnInstance(() -> { + PlacementProvider pp = ClusterMetadataService.instance().placementProvider(); + Map updates = new HashMap<>(); + Directory dir = ClusterMetadata.current().directory; + for (NodeId nodeId : dir.peerIds()) + updates.put(nodeId, new Location("dc1", "rack" + (nodeId.id() + 100))); + + ClusterMetadataService.instance().commit(new AlterTopology(updates, pp)); + dir = ClusterMetadata.current().directory; + for (NodeId nodeId : dir.peerIds()) + if (!ClusterMetadata.current().directory.location(nodeId).rack.equals("rack" + (nodeId.id() + 100))) + throw new AssertionError("Expected rack to have changed"); + }); + }, "Modify all racks not affecting placements"); + + // renaming a datacenter is supported, as long as it is not referenced in any replication params as that + // would impact placements + history.custom(() -> { + cmsInstance.runOnInstance(() -> { + PlacementProvider pp = ClusterMetadataService.instance().placementProvider(); + Map updates = new HashMap<>(); + Directory dir = ClusterMetadata.current().directory; + for (NodeId nodeId : dir.peerIds()) + updates.put(nodeId, new Location("renamed_dc", dir.location(nodeId).rack)); + assertAlterTopologyRejection(pp, updates, "Proposed updates modify data placements"); + }); + }, "Renaming DC referenced in replication params"); + + // after modifying replication for the test keyspace, this should be allowed + history.custom(() -> { + cmsInstance.runOnInstance(() -> { + PlacementProvider pp = ClusterMetadataService.instance().placementProvider(); + Map updates = new HashMap<>(); + QueryProcessor.executeInternal("ALTER KEYSPACE " + KEYSPACE + + " WITH replication = {'class': 'SimpleStrategy', 'replication_factor' : 3 };"); + Directory dir = ClusterMetadata.current().directory; + for (NodeId nodeId : dir.peerIds()) + updates.put(nodeId, new Location("renamed_dc", dir.location(nodeId).rack)); + + ClusterMetadataService.instance().commit(new AlterTopology(updates, pp)); + + for (NodeId nodeId : dir.peerIds()) + if (!ClusterMetadata.current().directory.location(nodeId).datacenter.equals("renamed_dc")) + throw new AssertionError("Expected dc to have changed"); + + // modify both datacenter and racks + dir = ClusterMetadata.current().directory; + for (NodeId nodeId : dir.peerIds()) + updates.put(nodeId, new Location("renamed_dc_again", "rack" + (nodeId.id() + 200))); + + ClusterMetadataService.instance().commit(new AlterTopology(updates, pp)); + dir = ClusterMetadata.current().directory; + for (NodeId nodeId : dir.peerIds()) + if (!ClusterMetadata.current().directory.location(nodeId).equals(new Location("renamed_dc_again", "rack" + (nodeId.id() + 200)))) + throw new AssertionError("Expected dc to have changed"); + }); + waitForCMSToQuiesce(cluster, cmsInstance); + },"Renaming DC not referenced in replication params"); + + // updates to system tables run asynchronously so spin until they're done + history.custom(() -> { + cluster.forEach(i -> await(60).until(() -> i.callOnInstance(() -> { + ClusterMetadata metadata = ClusterMetadata.current(); + NodeId myId = metadata.myNodeId(); + Directory dir = metadata.directory; + for (NodeId nodeId : dir.peerIds()) + { + String query = nodeId.equals(myId) + ? "select data_center, rack from system.local" + : String.format("select data_center, rack from system.peers_v2 where peer = '%s'", + dir.endpoint(nodeId).getHostAddress(false)); + UntypedResultSet res = QueryProcessor.executeInternal(query); + if (!res.one().getString("data_center").equals("renamed_dc_again")) + return false; + if (!res.one().getString("rack").equals("rack" + (nodeId.id() + 200))) + return false; + } + return true; + }))); + }, "Verify local system table updates"); + + // check gossip is also updated + history.custom(() -> { + Map> gossipInfo = ClusterUtils.gossipInfo(cmsInstance); + gossipInfo.forEach((ep, states) -> { + String nodeId = states.get("HOST_ID").split(":")[1]; + String dc = states.get("DC").split(":")[1]; + assertEquals("renamed_dc_again", dc); + String rack = states.get("RACK").split(":")[1]; + String expected = "rack" + (NodeId.fromString(nodeId).id() + 200); + assertEquals(expected, rack); + }); + }, "Verify gossip state"); + + writeAndValidate.run(); + }); + } + } + + private static void assertAlterTopologyRejection(PlacementProvider pp, Map updates, String error) + { + ClusterMetadataService.instance() + .commit(new AlterTopology(updates, pp), + m -> { throw new AssertionError("Expected rejection");}, + (c, r) -> { + if (!(c == ExceptionCode.INVALID && r.startsWith(error))) + throw new AssertionError("Unexpected failure response: " + r); + return ClusterMetadata.current(); + }); + + } + + private static ConditionFactory await(int seconds) + { + return Awaitility.await().atMost(ofSeconds(seconds)).pollDelay(ofSeconds(1)); + } + +} diff --git a/test/distributed/org/apache/cassandra/distributed/test/log/BootWithMetadataTest.java b/test/distributed/org/apache/cassandra/distributed/test/log/BootWithMetadataTest.java index fc6dad5adf0f..bcc02b89a62d 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/log/BootWithMetadataTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/log/BootWithMetadataTest.java @@ -52,6 +52,8 @@ public class BootWithMetadataTest extends TestBaseImpl public void resetTest() throws IOException, ExecutionException, InterruptedException { try (Cluster cluster = init(builder().withNodes(3) + // Accord tracks epochs, and if the expose no longer exist it is not able to process anything, causing it to crash... + .withConfig(c -> c.set("accord.enabled", false)) .start())) { long epoch = 0; @@ -83,9 +85,10 @@ public void resetTest() throws IOException, ExecutionException, InterruptedExcep assertEquals(1, ClusterMetadata.current().fullCMSMembers().size()); assertTrue(ClusterMetadata.current().fullCMSMembers().contains(InetAddressAndPort.getByNameUnchecked("127.0.0.1"))); Keyspace ks = Keyspace.open(KEYSPACE); - assertEquals(6, ks.getColumnFamilyStores().size()); + assertEquals(7, ks.getColumnFamilyStores().size()); for (int i = 0; i < 6; i++) assertTrue(ks.getColumnFamilyStore("x"+i) != null); // getColumnFamilyStore throws + assertTrue(ks.getColumnFamilyStore("yy") != null); }); } } @@ -94,6 +97,8 @@ public void resetTest() throws IOException, ExecutionException, InterruptedExcep public void newCMSTest() throws IOException, ExecutionException, InterruptedException { try (Cluster cluster = init(builder().withNodes(4) + // Accord tracks epochs, and if the expose no longer exist it is not able to process anything, causing it to crash... + .withConfig(c -> c.set("accord.enabled", false)) .start())) { for (int i = 0; i < 10; i++) diff --git a/test/distributed/org/apache/cassandra/distributed/test/log/BounceIndexRebuildTest.java b/test/distributed/org/apache/cassandra/distributed/test/log/BounceIndexRebuildTest.java index acf19d8fbff4..43bb78f44b7f 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/log/BounceIndexRebuildTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/log/BounceIndexRebuildTest.java @@ -18,6 +18,7 @@ package org.apache.cassandra.distributed.test.log; +import org.apache.cassandra.distributed.test.sai.SAIUtil; import org.junit.Assert; import org.junit.Test; @@ -25,6 +26,8 @@ import org.apache.cassandra.distributed.api.ConsistencyLevel; import org.apache.cassandra.distributed.test.TestBaseImpl; +import static org.apache.cassandra.distributed.api.Feature.GOSSIP; +import static org.apache.cassandra.distributed.api.Feature.NETWORK; import static org.junit.Assert.assertEquals; public class BounceIndexRebuildTest extends TestBaseImpl @@ -32,14 +35,15 @@ public class BounceIndexRebuildTest extends TestBaseImpl @Test public void bounceTest() throws Exception { - try (Cluster cluster = init(builder().withNodes(1) - .start())) + try (Cluster cluster = init(builder().withNodes(1).withConfig(config -> config.with(NETWORK, GOSSIP)).start())) { cluster.schemaChange(withKeyspace("create table %s.tbl (id int primary key, x int)")); for (int i = 0; i < 10; i++) cluster.coordinator(1).execute(withKeyspace("insert into %s.tbl (id, x) values (?, ?)"), ConsistencyLevel.ALL, i, i); - cluster.schemaChange(withKeyspace("create index idx on %s.tbl (x)")); + cluster.schemaChange(withKeyspace("create index idx on %s.tbl (x) using 'sai'")); + SAIUtil.waitForIndexQueryable(cluster, KEYSPACE); + Object[][] res = cluster.coordinator(1).execute(withKeyspace("select * from %s.tbl where x=5"), ConsistencyLevel.ALL); assert res.length > 0; String patternLegacyBuild = "Index build of idx complete"; diff --git a/test/distributed/org/apache/cassandra/distributed/test/log/ClusterMetadataDumpTest.java b/test/distributed/org/apache/cassandra/distributed/test/log/ClusterMetadataDumpTest.java new file mode 100644 index 000000000000..b0f66ff885a8 --- /dev/null +++ b/test/distributed/org/apache/cassandra/distributed/test/log/ClusterMetadataDumpTest.java @@ -0,0 +1,116 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.distributed.test.log; + +import java.io.IOException; + +import org.junit.Test; + +import org.apache.cassandra.distributed.Cluster; +import org.apache.cassandra.distributed.api.NodeToolResult; +import org.apache.cassandra.distributed.test.TestBaseImpl; +import org.apache.cassandra.tcm.ClusterMetadataService; +import org.apache.cassandra.tcm.transformations.CustomTransformation; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +public class ClusterMetadataDumpTest extends TestBaseImpl +{ + @Test + public void dumpLogTest() throws IOException + { + try (Cluster cluster = init(builder().withNodes(3) + .start())) + { + cluster.get(1).runOnInstance(() -> { + for (int i = 0; i < 10; i++) + ClusterMetadataService.instance().commit(new CustomTransformation(CustomTransformation.PokeInt.NAME, new CustomTransformation.PokeInt(i))); + }); + + NodeToolResult res = cluster.get(1).nodetoolResult("cms", "dumplog"); + res.asserts().success(); + int unsafeJoinSeen = 0; + int registerSeen = 0; + int epochsSeen = 0; + for (String l : res.getStdout().split("\n")) + { + if (l.contains("kind")) + { + if (l.contains("REGISTER")) + registerSeen++; + else if (l.contains("UNSAFE_JOIN")) + unsafeJoinSeen++; + } + if (l.startsWith("Epoch:")) + epochsSeen++; + } + assertEquals(3, unsafeJoinSeen); + assertEquals(3, registerSeen); + assertTrue(epochsSeen > 15); + + res = cluster.get(1).nodetoolResult("cms", "dumplog", "--start", "10", "--end", "15"); + epochsSeen = 0; + for (String l : res.getStdout().split("\n")) + { + if (l.startsWith("Epoch: ")) + { + epochsSeen++; + long epoch = Long.parseLong(l.split(": ")[1]); + assertTrue(epoch >= 10 && epoch <= 15); + } + } + assertEquals(6, epochsSeen); + } + } + + @Test + public void dumpDirectoryTest() throws IOException + { + try (Cluster cluster = init(builder().withNodes(3) + .start())) + { + NodeToolResult res = cluster.get(1).nodetoolResult("cms", "dumpdirectory"); + res.asserts().success(); + int nodesFound = 0; + for (String l : res.getStdout().split("\n")) + { + if (l.startsWith("NodeId")) + nodesFound++; + assertFalse(l.contains("tokens")); + } + assertEquals(3, nodesFound); + res = cluster.get(1).nodetoolResult("cms", "dumpdirectory", "--tokens"); + res.asserts().success(); + nodesFound = 0; + int tokensFound = 0; + for (String l : res.getStdout().split("\n")) + { + if (l.startsWith("NodeId")) + nodesFound++; + + if (l.contains("tokens")) + tokensFound++; + } + assertEquals(3, nodesFound); + assertEquals(3, tokensFound); + } + } +} diff --git a/test/distributed/org/apache/cassandra/distributed/test/log/ClusterMetadataTestHelper.java b/test/distributed/org/apache/cassandra/distributed/test/log/ClusterMetadataTestHelper.java index 3c822864c570..b606a94fc4a0 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/log/ClusterMetadataTestHelper.java +++ b/test/distributed/org/apache/cassandra/distributed/test/log/ClusterMetadataTestHelper.java @@ -33,12 +33,13 @@ import com.google.common.util.concurrent.ListenableFuture; import com.google.common.util.concurrent.SettableFuture; - import org.apache.cassandra.ServerTestUtils.ResettableClusterMetadataService; import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.cql3.QueryProcessor; import org.apache.cassandra.cql3.statements.schema.CreateKeyspaceStatement; import org.apache.cassandra.cql3.statements.schema.KeyspaceAttributes; +import org.apache.cassandra.db.ColumnFamilyStore; +import org.apache.cassandra.db.Keyspace; import org.apache.cassandra.dht.ByteOrderedPartitioner; import org.apache.cassandra.dht.IPartitioner; import org.apache.cassandra.dht.Murmur3Partitioner; @@ -52,8 +53,13 @@ import org.apache.cassandra.schema.KeyspaceParams; import org.apache.cassandra.schema.Keyspaces; import org.apache.cassandra.schema.ReplicationParams; +import org.apache.cassandra.schema.MemtableParams; +import org.apache.cassandra.schema.Schema; +import org.apache.cassandra.schema.SchemaConstants; import org.apache.cassandra.schema.SchemaTransformation; import org.apache.cassandra.service.ClientState; +import org.apache.cassandra.service.accord.AccordStaleReplicas; +import org.apache.cassandra.service.consensus.migration.ConsensusMigrationState; import org.apache.cassandra.tcm.AtomicLongBackedProcessor; import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.tcm.ClusterMetadataService; @@ -64,6 +70,7 @@ import org.apache.cassandra.tcm.Transformation; import org.apache.cassandra.tcm.log.LocalLog; import org.apache.cassandra.tcm.membership.Directory; +import org.apache.cassandra.service.accord.AccordFastPath; import org.apache.cassandra.tcm.membership.Location; import org.apache.cassandra.tcm.membership.NodeAddresses; import org.apache.cassandra.tcm.membership.NodeId; @@ -149,35 +156,25 @@ public static ClusterMetadata minimalForTesting(Epoch epoch, IPartitioner partit Directory.EMPTY, new TokenMap(partitioner), DataPlacements.empty(), + AccordFastPath.EMPTY, LockedRanges.EMPTY, InProgressSequences.EMPTY, - ImmutableMap.of()); + ConsensusMigrationState.EMPTY, + ImmutableMap.of(), + AccordStaleReplicas.EMPTY); } public static ClusterMetadata minimalForTesting(IPartitioner partitioner) { - return new ClusterMetadata(Epoch.EMPTY, - partitioner, - null, - null, - null, - DataPlacements.empty(), - null, - null, - ImmutableMap.of()); + return minimalForTesting(Epoch.EMPTY, partitioner); } public static ClusterMetadata minimalForTesting(Keyspaces keyspaces) { - return new ClusterMetadata(Epoch.EMPTY, - Murmur3Partitioner.instance, - new DistributedSchema(keyspaces), - null, - null, - DataPlacements.empty(), - null, - null, - ImmutableMap.of()); + return minimalForTesting(Murmur3Partitioner.instance).transformer() + .with(new DistributedSchema(keyspaces)) + .build() + .metadata.forceEpoch(Epoch.EMPTY); } public static ClusterMetadataService syncInstanceForTest() @@ -224,6 +221,30 @@ public static void createKeyspace(String statement) } } + public static void setMemtable(String ks, String table, String memtable) + { + setMemtable(ks, table, MemtableParams.get(memtable)); + } + + public static void setMemtable(String ks, String table, MemtableParams memtable) + { + if (SchemaConstants.isLocalSystemKeyspace(ks)) + { + ColumnFamilyStore store = Keyspace.open(ks).getColumnFamilyStore(table); + store.reload(store.metadata().unbuild().memtable(memtable).build()); + } + else + { + Schema.instance.submit(cms -> { + var km = cms.schema.getKeyspaceMetadata(ks); + var update = km.withSwapped(km.tables.withSwapped(km.tables.getNullable(table).unbuild() + .memtable(memtable) + .build())); + return cms.schema.getKeyspaces().withAddedOrUpdated(update); + }); + } + } + private static Set leaving(ClusterMetadata metadata) { return metadata.directory.states.entrySet().stream() diff --git a/test/distributed/org/apache/cassandra/distributed/test/log/CoordinatorPathTestBase.java b/test/distributed/org/apache/cassandra/distributed/test/log/CoordinatorPathTestBase.java index cfa0ec4be7ca..45a62d3a97f2 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/log/CoordinatorPathTestBase.java +++ b/test/distributed/org/apache/cassandra/distributed/test/log/CoordinatorPathTestBase.java @@ -127,7 +127,9 @@ public void coordinatorPathTest(TokenPlacementModel.ReplicationFactor rf, TestRu try (Cluster cluster = builder().withNodes(1) .withConfig(cfg -> cfg.set("seed_provider", new ParameterizedClass(SimpleSeedProvider.class.getName(), - Collections.singletonMap("seeds", fakeCmsNode.id() + ":7012")))) + Collections.singletonMap("seeds", fakeCmsNode.id() + ":7012"))) + // Accord depends on Processor.reconstruct, but those verbs are not simulated, causing the tests to fail + .set("accord.enabled", false)) .withTokenSupplier(factory) .withNodeIdTopology(NetworkTopology.singleDcNetworkTopology(10, "dc0", "rack0")) .createWithoutStarting(); @@ -732,7 +734,8 @@ public void init() log, new Processor() { - public Commit.Result commit(Entry.Id entryId, Transformation event, Epoch lastKnown, Retry.Deadline retryPolicy) + @Override + public Commit.Result commit(Entry.Id entryId, Transformation event, Epoch lastKnown, Retry retryPolicy) { if (lastKnown == null) lastKnown = log.waitForHighestConsecutive().epoch; @@ -745,13 +748,26 @@ public Commit.Result commit(Entry.Id entryId, Transformation event, Epoch lastKn return result; } - public ClusterMetadata fetchLogAndWait(Epoch waitFor, Retry.Deadline retryPolicy) + @Override + public ClusterMetadata fetchLogAndWait(Epoch waitFor, Retry retryPolicy) { Epoch since = log.waitForHighestConsecutive().epoch; LogState logState = driver.requestResponse(new FetchCMSLog(since, true)); log.append(logState); return log.waitForHighestConsecutive(); } + + @Override + public LogState getLocalState(Epoch start, Epoch end, boolean includeSnapshot) + { + return log.getLocalEntries(start, end, includeSnapshot); + } + + @Override + public LogState getLogState(Epoch start, Epoch end, boolean includeSnapshot, Retry retryPolicy) + { + return getLocalState(start, end, includeSnapshot); + } }, (a,b) -> {}, false); diff --git a/test/distributed/org/apache/cassandra/distributed/test/log/FetchLogFromPeers2Test.java b/test/distributed/org/apache/cassandra/distributed/test/log/FetchLogFromPeers2Test.java index d3b549d20f94..78c67c9e157b 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/log/FetchLogFromPeers2Test.java +++ b/test/distributed/org/apache/cassandra/distributed/test/log/FetchLogFromPeers2Test.java @@ -19,48 +19,50 @@ package org.apache.cassandra.distributed.test.log; import java.util.UUID; -import java.util.concurrent.ExecutionException; import org.junit.Test; import org.apache.cassandra.distributed.Cluster; import org.apache.cassandra.distributed.api.ConsistencyLevel; import org.apache.cassandra.distributed.test.TestBaseImpl; -import org.apache.cassandra.distributed.test.log.FetchLogFromPeersTest.ClusterState; import org.apache.cassandra.metrics.TCMMetrics; import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.tcm.ClusterMetadataService; import org.apache.cassandra.tcm.Epoch; -import static org.apache.cassandra.distributed.test.log.FetchLogFromPeersTest.*; +import static org.apache.cassandra.distributed.test.log.FetchLogFromPeersTest.ClusterState; +import static org.apache.cassandra.distributed.test.log.FetchLogFromPeersTest.Operation; +import static org.apache.cassandra.distributed.test.log.FetchLogFromPeersTest.coordinator; import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; public class FetchLogFromPeers2Test extends TestBaseImpl { @Test - public void testSchema() throws Exception + public void testSchema() throws Throwable { - try (Cluster cluster = init(builder().withNodes(3) - .start())) + try (Cluster cluster = init(builder().withNodes(3).start())) { - cluster.schemaChange(withKeyspace("alter keyspace %s with replication = {'class':'SimpleStrategy', 'replication_factor':3}")); - cluster.schemaChange(withKeyspace("create table %s.tbl (id int primary key)")); - cluster.schemaChange(withKeyspace("create table %s.tbl2 (id int primary key)")); + cluster.schemaChange(withKeyspace("alter keyspace %s with replication = {'class':'SimpleStrategy', 'replication_factor':3} ")); + cluster.schemaChange(withKeyspace("create table %s.tbl (id int primary key) WITH speculative_retry = 'ALWAYS';")); for (ClusterState clusterState : ClusterState.values()) + { for (Operation operation : Operation.values()) { + cluster.filters().inbound().from(1, 2).to(1, 2).drop(); setupSchemaBehind(cluster); + cluster.filters().inbound().to(1).to(1).drop(); runQuery(cluster, clusterState, operation); + cluster.filters().reset(); } + } + } } - public void runQuery(Cluster cluster, ClusterState clusterState, Operation operation) throws ExecutionException, InterruptedException + public void runQuery(Cluster cluster, ClusterState clusterState, Operation operation) throws Throwable { - cluster.get(1).shutdown().get(); - // node2 is behind String query; switch (operation) @@ -79,7 +81,7 @@ public void runQuery(Cluster cluster, ClusterState clusterState, Operation opera long metricsBefore = cluster.get(2).callOnInstance(() -> TCMMetrics.instance.fetchedPeerLogEntries.getCount()); if (clusterState == ClusterState.COORDINATOR_BEHIND) { - long [] coordinatorBehindMetricsBefore = new long[cluster.size()]; + long[] coordinatorBehindMetricsBefore = new long[cluster.size()]; try { for (int i = 1; i <= cluster.size(); i++) @@ -88,7 +90,9 @@ public void runQuery(Cluster cluster, ClusterState clusterState, Operation opera cluster.coordinator(coordinator).execute(withKeyspace(query), ConsistencyLevel.QUORUM); fail("should fail"); } - catch (Exception ignored) {} + catch (Exception ignored) + { + } boolean metricBumped = false; for (int i = 1; i <= cluster.size(); i++) @@ -103,20 +107,15 @@ public void runQuery(Cluster cluster, ClusterState clusterState, Operation opera } } assertTrue("Metric CoordinatorBehindSchema should have been bumped for at least one replica", metricBumped); - } cluster.coordinator(coordinator).execute(withKeyspace(query), ConsistencyLevel.QUORUM); assertTrue(cluster.get(2).logs().grep(mark, "Fetching log from /127.0.0.3:7012").getResult().size() > 0); long metricsAfter = cluster.get(2).callOnInstance(() -> TCMMetrics.instance.fetchedPeerLogEntries.getCount()); assertTrue(metricsAfter > metricsBefore); - - cluster.get(1).startup(); } public void setupSchemaBehind(Cluster cluster) { - cluster.filters().reset(); - cluster.filters().inbound().from(1).to(2).drop(); long epochBefore = cluster.get(3).callOnInstance(() -> ClusterMetadata.current().epoch.getEpoch()); cluster.coordinator(1).execute(withKeyspace("alter table %s.tbl with comment='test " + UUID.randomUUID() + "'"), ConsistencyLevel.ONE); cluster.get(3).runOnInstance(() -> { @@ -129,6 +128,5 @@ public void setupSchemaBehind(Cluster cluster) throw new RuntimeException(e); } }); - cluster.filters().reset(); } } diff --git a/test/distributed/org/apache/cassandra/distributed/test/log/FetchLogFromPeersDCTest.java b/test/distributed/org/apache/cassandra/distributed/test/log/FetchLogFromPeersDCTest.java new file mode 100644 index 000000000000..b47108d1d0e7 --- /dev/null +++ b/test/distributed/org/apache/cassandra/distributed/test/log/FetchLogFromPeersDCTest.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.distributed.test.log; + +import org.junit.Test; + +import org.apache.cassandra.distributed.Cluster; +import org.apache.cassandra.distributed.api.ConsistencyLevel; +import org.apache.cassandra.distributed.api.Feature; +import org.apache.cassandra.distributed.api.TokenSupplier; +import org.apache.cassandra.distributed.shared.NetworkTopology; +import org.apache.cassandra.distributed.test.TestBaseImpl; +import org.apache.cassandra.tcm.ClusterMetadata; + +import static org.apache.cassandra.net.Verb.TCM_FETCH_PEER_LOG_REQ; +import static org.apache.cassandra.net.Verb.TCM_REPLICATION; +import static org.junit.Assert.assertEquals; + +public class FetchLogFromPeersDCTest extends TestBaseImpl +{ + + @Test + public void catchupCoordinatorBehindTestPlacements() throws Exception + { + try (Cluster cluster = init(builder().withNodes(4).withConfig(c -> c.with(Feature.NETWORK, Feature.GOSSIP)) + .withoutVNodes() + .withTokenSupplier(TokenSupplier.evenlyDistributedTokens(4)) + .withNodeIdTopology(NetworkTopology.networkTopology(4, (i) -> NetworkTopology.dcAndRack("dc" + (i <= 2 ? 0 : 1), "rack" + i))) + .start())) + { + cluster.schemaChange(withKeyspace("alter keyspace %s with replication = {'class':'NetworkTopologyStrategy', 'dc0':2, 'dc1':2}")); + cluster.schemaChange(withKeyspace("create table %s.tbl (id int primary key)")); + cluster.filters().inbound().verbs(TCM_REPLICATION.id).from(1).to(3, 4).drop(); + // don't allow the dc1 nodes to catch up from eachother - we should catch up from the actual originator of the message: + cluster.filters().inbound().verbs(TCM_FETCH_PEER_LOG_REQ.id).from(3, 4).to(3,4).drop(); + cluster.get(1).schemaChangeInternal(withKeyspace("alter table %s.tbl with comment='abc'")); + cluster.coordinator(1).execute(withKeyspace("insert into %s.tbl (id) values (1)"), ConsistencyLevel.ALL); + long epoch = cluster.get(1).callOnInstance(() -> ClusterMetadata.current().epoch.getEpoch()); + cluster.forEach(i -> i.runOnInstance(() -> { + assertEquals(epoch, ClusterMetadata.current().epoch.getEpoch()); + })); + } + } +} diff --git a/test/distributed/org/apache/cassandra/distributed/test/log/ForceSnapshotTest.java b/test/distributed/org/apache/cassandra/distributed/test/log/ForceSnapshotTest.java index 9f5cf1e30743..3eb195853531 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/log/ForceSnapshotTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/log/ForceSnapshotTest.java @@ -28,6 +28,7 @@ import org.apache.cassandra.db.Keyspace; import org.apache.cassandra.distributed.Cluster; import org.apache.cassandra.distributed.api.ConsistencyLevel; +import org.apache.cassandra.distributed.shared.ClusterUtils; import org.apache.cassandra.distributed.test.TestBaseImpl; import org.apache.cassandra.locator.InetAddressAndPort; import org.apache.cassandra.tcm.ClusterMetadata; @@ -147,6 +148,7 @@ public void testDumpLoadMetadata() throws IOException throw new RuntimeException(e); } }); + ClusterUtils.waitForCMSToQuiesce(cluster, 1); cluster.forEach(() -> assertEquals(10, Keyspace.open(KEYSPACE).getColumnFamilyStores().size())); // make sure we execute more transformations; diff --git a/test/distributed/org/apache/cassandra/distributed/test/log/MetadataChangeSimulationTest.java b/test/distributed/org/apache/cassandra/distributed/test/log/MetadataChangeSimulationTest.java index 414f74bbef47..44fdc7d70153 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/log/MetadataChangeSimulationTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/log/MetadataChangeSimulationTest.java @@ -935,10 +935,10 @@ public void testPlacementsAllSettled() throws Throwable while (!state.inFlightOperations.isEmpty()) { state = state.inFlightOperations.get(random.nextInt(state.inFlightOperations.size())).advance(state); - Assert.assertEquals(allSettled, sut.service.metadata().writePlacementAllSettled(ksm)); + Assert.assertTrue(allSettled.equivalentTo(sut.service.metadata().writePlacementAllSettled(ksm))); validatePlacements(sut, state); } - Assert.assertEquals(allSettled, sut.service.metadata().placements.get(ksm.params.replication)); + Assert.assertTrue(allSettled.equivalentTo(sut.service.metadata().placements.get(ksm.params.replication))); } } } diff --git a/test/distributed/org/apache/cassandra/distributed/test/log/ModelState.java b/test/distributed/org/apache/cassandra/distributed/test/log/ModelState.java index 0eb73b86940e..9dea76040de5 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/log/ModelState.java +++ b/test/distributed/org/apache/cassandra/distributed/test/log/ModelState.java @@ -343,6 +343,20 @@ public Transformer withReplaced(Node oldNode, Node newNode) return this; } + public Transformer withUpdatedRacks(Map updates) + { + assert currentNodes.containsAll(updates.keySet()); + List newNodes = new ArrayList<>(); + currentNodes.forEach(node -> { + if (updates.containsKey(node)) + newNodes.add(node.withNewRack(updates.get(node))); + else + newNodes.add(node); + }); + currentNodes = newNodes; + return this; + } + public Transformer updateSimulation(PlacementSimulator.SimulatedPlacements simulatedPlacements) { this.simulatedPlacements = simulatedPlacements; diff --git a/test/distributed/org/apache/cassandra/distributed/test/log/ReconfigureCMSTest.java b/test/distributed/org/apache/cassandra/distributed/test/log/ReconfigureCMSTest.java index 2869fe913ac8..0f95735e5a4e 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/log/ReconfigureCMSTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/log/ReconfigureCMSTest.java @@ -134,7 +134,7 @@ public void cancelCMSReconfigurationTest() throws Throwable assertEquals(2, metadata.fullCMSMembers().size()); ReplicationParams params = ReplicationParams.meta(metadata); DataPlacement placements = metadata.placements.get(params); - assertEquals(placements.reads, placements.writes); + assertTrue(placements.reads.equivalentTo(placements.writes)); assertEquals(metadata.fullCMSMembers().size(), Integer.parseInt(params.asMap().get("dc0"))); }); @@ -159,7 +159,7 @@ public void cancelCMSReconfigurationTest() throws Throwable Assert.assertTrue(metadata.fullCMSMembers().contains(FBUtilities.getBroadcastAddressAndPort())); assertEquals(3, metadata.fullCMSMembers().size()); DataPlacement placements = metadata.placements.get(ReplicationParams.meta(metadata)); - assertEquals(placements.reads, placements.writes); + Assert.assertTrue(placements.reads.equivalentTo(placements.writes)); }); } } diff --git a/test/distributed/org/apache/cassandra/distributed/test/log/ReconstructEpochTest.java b/test/distributed/org/apache/cassandra/distributed/test/log/ReconstructEpochTest.java new file mode 100644 index 000000000000..b1b1411c5803 --- /dev/null +++ b/test/distributed/org/apache/cassandra/distributed/test/log/ReconstructEpochTest.java @@ -0,0 +1,94 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.distributed.test.log; + +import java.util.Iterator; +import java.util.concurrent.TimeUnit; + +import org.junit.Assert; +import org.junit.Test; + +import org.apache.cassandra.distributed.Cluster; +import org.apache.cassandra.distributed.test.TestBaseImpl; +import org.apache.cassandra.metrics.TCMMetrics; +import org.apache.cassandra.schema.DistributedMetadataLogKeyspace; +import org.apache.cassandra.tcm.ClusterMetadataService; +import org.apache.cassandra.tcm.Epoch; +import org.apache.cassandra.tcm.Retry; +import org.apache.cassandra.tcm.log.Entry; +import org.apache.cassandra.tcm.log.LogState; + +import static org.apache.cassandra.config.DatabaseDescriptor.getCmsAwaitTimeout; + +public class ReconstructEpochTest extends TestBaseImpl +{ + @Test + public void logReaderTest() throws Exception + { + try (Cluster cluster = init(builder().withNodes(2).start())) + { + cluster.schemaChange(withKeyspace("CREATE TABLE %s.tbl (id int primary key)")); + for (int i = 0; i < 30; i++) + { + if (i > 0 && i % 5 == 0) + cluster.get(1).runOnInstance(() -> ClusterMetadataService.instance().triggerSnapshot()); + cluster.schemaChange(withKeyspace("ALTER TABLE %s.tbl WITH comment = '" + i + "'")); + } + + cluster.get(1).runOnInstance(() -> { + for (int[] cfg : new int[][]{ new int[]{ 6, 9 }, + new int[]{ 2, 20 }, + new int[]{ 5, 5 }, + new int[]{ 15, 20 } }) + { + int start = cfg[0]; + int end = cfg[1]; + LogState logState = DistributedMetadataLogKeyspace.getLogState(Epoch.create(start), Epoch.create(end), true); + Assert.assertEquals(start, logState.baseState.epoch.getEpoch()); + Iterator iter = logState.entries.iterator(); + for (int i = start + 1; i <= end; i++) + Assert.assertEquals(i, iter.next().epoch.getEpoch()); + } + }); + + + cluster.get(2).runOnInstance(() -> { + for (int[] cfg : new int[][]{ new int[]{ 6, 9 }, + new int[]{ 2, 20 }, + new int[]{ 5, 5 }, + new int[]{ 15, 20 } }) + { + int start = cfg[0]; + int end = cfg[1]; + LogState logState = ClusterMetadataService.instance() + .processor() + .getLogState(Epoch.create(start), + Epoch.create(end), + true, + Retry.untilElapsed(getCmsAwaitTimeout().to(TimeUnit.NANOSECONDS), TCMMetrics.instance.commitRetries)); + + Assert.assertEquals(start, logState.baseState.epoch.getEpoch()); + Iterator iter = logState.entries.iterator(); + for (int i = start + 1; i <= end; i++) + Assert.assertEquals(i, iter.next().epoch.getEpoch()); + } + }); + } + } +} diff --git a/test/distributed/org/apache/cassandra/distributed/test/log/SimulatedOperation.java b/test/distributed/org/apache/cassandra/distributed/test/log/SimulatedOperation.java index 77638319331d..c6fa2da54b47 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/log/SimulatedOperation.java +++ b/test/distributed/org/apache/cassandra/distributed/test/log/SimulatedOperation.java @@ -20,6 +20,7 @@ import java.util.Arrays; import java.util.Collections; +import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; @@ -40,10 +41,13 @@ import org.apache.cassandra.tcm.MultiStepOperation; import org.apache.cassandra.tcm.Transformation; import org.apache.cassandra.tcm.ownership.VersionedEndpoints; +import org.apache.cassandra.tcm.membership.Location; +import org.apache.cassandra.tcm.membership.NodeId; import org.apache.cassandra.tcm.sequences.BootstrapAndJoin; import org.apache.cassandra.tcm.sequences.BootstrapAndReplace; import org.apache.cassandra.tcm.sequences.LeaveStreams; import org.apache.cassandra.tcm.sequences.UnbootstrapAndLeave; +import org.apache.cassandra.tcm.transformations.AlterTopology; import org.apache.cassandra.tcm.transformations.CancelInProgressSequence; import org.apache.cassandra.tcm.transformations.PrepareJoin; import org.apache.cassandra.tcm.transformations.PrepareLeave; @@ -101,6 +105,23 @@ public static ModelState joinWithoutBootstrap(ModelState state, } + public static ModelState changeRacks(CMSSut sut, ModelState state, Map updates) + { + ModelState.Transformer transformer = state.transformer() + .withUpdatedRacks(updates) + .updateSimulation(state.simulatedPlacements); + + Map serviceUpdates = new HashMap<>(); + for (Map.Entry entry : updates.entrySet()) + { + Node n = entry.getKey(); + String rack = entry.getValue(); + serviceUpdates.put(n.nodeId(), new Location(n.dc(), rack)); + } + sut.service.commit(new AlterTopology(serviceUpdates, sut.service.placementProvider())); + return transformer.transform(); + } + public static ModelState leave(CMSSut sut, ModelState state, Node node) { ModelState.Transformer transformer = state.transformer(); diff --git a/test/distributed/org/apache/cassandra/distributed/test/log/SnapshotTest.java b/test/distributed/org/apache/cassandra/distributed/test/log/SnapshotTest.java index 628df75ca463..11f8f0b1f1e1 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/log/SnapshotTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/log/SnapshotTest.java @@ -51,8 +51,8 @@ public void testSimpleSnapshot() throws Throwable .start())) { cluster.schemaChange(withKeyspace("create table %s.tbl (id int primary key, x int)")); - cluster.schemaChange(withKeyspace("create table %s.tblconstraints (id int primary key, x int check x > 100 and x < 200, v text check LENGTH(v) > 10)")); - cluster.schemaChange(withKeyspace("create table %s.tblconstraints2 (id int primary key, x int check NOT_NULL(x), v text check LENGTH(v) > 10)")); + cluster.schemaChange(withKeyspace("create table %s.tblconstraints (id int primary key, x int check x > 100 and x < 200, v text check LENGTH() > 10)")); + cluster.schemaChange(withKeyspace("create table %s.tblconstraints2 (id int primary key, x int check NOT NULL, v text check LENGTH() > 10)")); cluster.schemaChange(withKeyspace("CREATE OR REPLACE FUNCTION %s.fLog (input double) CALLED ON NULL INPUT RETURNS double LANGUAGE java AS 'return Double.valueOf(Math.log(input.doubleValue()));';")); cluster.schemaChange(withKeyspace("CREATE OR REPLACE FUNCTION %s.avgState ( state tuple, val int ) CALLED ON NULL INPUT RETURNS tuple LANGUAGE java AS \n" + " 'if (val !=null) { state.setInt(0, state.getInt(0)+1); state.setLong(1, state.getLong(1)+val.intValue()); } return state;'; ")); diff --git a/test/distributed/org/apache/cassandra/distributed/test/log/SystemKeyspaceStorageTest.java b/test/distributed/org/apache/cassandra/distributed/test/log/SystemKeyspaceStorageTest.java index 2e131bcf497b..28ef23bf6271 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/log/SystemKeyspaceStorageTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/log/SystemKeyspaceStorageTest.java @@ -98,14 +98,13 @@ public void testLogStateQuery() throws Throwable cluster.get(1).runOnInstance(() -> deleteSnapshot(toRemoveSnapshot.getEpoch())); } } - Epoch latestSnapshot = remainingSnapshots.get(remainingSnapshots.size() - 1); Epoch lastEpoch = allEpochs.stream().max(Comparator.naturalOrder()).get(); repeat(10, () -> { repeat(100, () -> { Epoch since = allEpochs.get(rng.nextInt(allEpochs.size())); - for (boolean consistentReplay : new boolean[]{ true, false }) + for (boolean consistentFetch : new boolean[]{ true, false }) { - LogState logState = simulatedCluster.node(2).requestResponse(new FetchCMSLog(since, consistentReplay)); + LogState logState = simulatedCluster.node(2).requestResponse(new FetchCMSLog(since, consistentFetch)); // if we return a snapshot it is always the most recent one // we don't return a snapshot if there is only 1 snapshot after `since` Epoch start = since; @@ -119,12 +118,16 @@ public void testLogStateQuery() throws Throwable } else { - assertEquals(latestSnapshot, logState.baseState.epoch); + assertEquals(since, logState.baseState.epoch); start = logState.baseState.epoch; if (logState.entries.isEmpty()) // no entries, snapshot should have the same epoch as since assertEquals(since, start); else // first epoch in entries should be snapshot epoch + 1 + { + if (!start.nextEpoch().equals(logState.entries.get(0).epoch)) + System.out.println(1); assertEquals(start.nextEpoch(), logState.entries.get(0).epoch); + } } for (Entry entry : logState.entries) @@ -174,7 +177,7 @@ public static void repeat(int num, ExecUtil.ThrowingSerializableRunnable r) } catch (Throwable throwable) { - throw new AssertionError(throwable); + throw new AssertionError(String.format("Failed on %dth/%d repetition", i, num), throwable); } } } diff --git a/test/distributed/org/apache/cassandra/distributed/test/log/TestProcessor.java b/test/distributed/org/apache/cassandra/distributed/test/log/TestProcessor.java index f5fabfb4acd0..27ddbd18be45 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/log/TestProcessor.java +++ b/test/distributed/org/apache/cassandra/distributed/test/log/TestProcessor.java @@ -32,6 +32,7 @@ import org.apache.cassandra.tcm.Retry; import org.apache.cassandra.tcm.Transformation; import org.apache.cassandra.tcm.log.Entry; +import org.apache.cassandra.tcm.log.LogState; import org.apache.cassandra.utils.concurrent.WaitQueue; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -54,7 +55,7 @@ public TestProcessor(Processor delegate) } @Override - public Commit.Result commit(Entry.Id entryId, Transformation transform, Epoch lastKnown, Retry.Deadline retryPolicy) + public Commit.Result commit(Entry.Id entryId, Transformation transform, Epoch lastKnown, Retry retryPolicy) { maybePause(transform); waitIfPaused(); @@ -64,11 +65,23 @@ public Commit.Result commit(Entry.Id entryId, Transformation transform, Epoch la } @Override - public ClusterMetadata fetchLogAndWait(Epoch waitFor, Retry.Deadline retryPolicy) + public ClusterMetadata fetchLogAndWait(Epoch waitFor, Retry retryPolicy) { return delegate.fetchLogAndWait(waitFor, retryPolicy); } + @Override + public LogState getLocalState(Epoch start, Epoch end, boolean includeSnapshot) + { + return delegate.getLocalState(start, end, includeSnapshot); + } + + @Override + public LogState getLogState(Epoch start, Epoch end, boolean includeSnapshot, Retry retryPolicy) + { + return delegate.getLogState(start, end, includeSnapshot, retryPolicy); + } + protected void waitIfPaused() { if (isPaused()) diff --git a/test/distributed/org/apache/cassandra/distributed/test/metrics/CoordinatorReadLatencyMetricTest.java b/test/distributed/org/apache/cassandra/distributed/test/metrics/CoordinatorReadLatencyMetricTest.java index ab3de57cb66b..c82dfb5597f3 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/metrics/CoordinatorReadLatencyMetricTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/metrics/CoordinatorReadLatencyMetricTest.java @@ -18,23 +18,69 @@ package org.apache.cassandra.distributed.test.metrics; +import java.io.IOException; import java.util.concurrent.TimeUnit; +import java.util.function.LongSupplier; import java.util.stream.Collectors; import java.util.stream.IntStream; +import org.assertj.core.api.Assertions; import org.junit.Test; import org.apache.cassandra.config.Config; +import org.apache.cassandra.cql3.ast.Select; +import org.apache.cassandra.cql3.ast.Txn; +import org.apache.cassandra.db.Keyspace; import org.apache.cassandra.distributed.Cluster; import org.apache.cassandra.distributed.api.ConsistencyLevel; +import org.apache.cassandra.distributed.api.IInvokableInstance; import org.apache.cassandra.distributed.test.TestBaseImpl; import org.apache.cassandra.metrics.ClientRequestsMetricsHolder; +import org.apache.cassandra.service.consensus.TransactionalMode; +import org.apache.cassandra.service.consensus.migration.TransactionalMigrationFromMode; import org.apache.cassandra.service.paxos.Paxos; +import static org.apache.cassandra.cql3.ast.Conditional.Where.Inequality.LESS_THAN; import static org.junit.Assert.assertTrue; public class CoordinatorReadLatencyMetricTest extends TestBaseImpl { + @Test + public void singleRowTest() throws IOException + { + try (Cluster cluster = init(builder().withNodes(1).start())) + { + cluster.schemaChange(withKeyspace("CREATE TABLE %s.tbl (pk int, ck int, v int, PRIMARY KEY (pk, ck))")); + for (int i = 0; i < 100; i++) + cluster.coordinator(1).execute(withKeyspace("insert into %s.tbl (pk, ck ,v) values (0, ?, 1)"), ConsistencyLevel.ALL, i); + + var select = Select.builder() + //TODO (now, correctness, coverage): count(v) breaks accord as we get mutliple rows rather than the count of rows... +// .withSelection(FunctionCall.count("v")) + .table(KEYSPACE, "tbl") + .value("pk", 0) + .where("ck", LESS_THAN, 42) + .limit(1) + .build(); + + verifyTableLatency(cluster, 1, () -> verifyLatencyMetrics(cluster, select.toCQL(), ConsistencyLevel.QUORUM, select.binds())); + cluster.get(1).runOnInstance(() -> Paxos.setPaxosVariant(Config.PaxosVariant.v1)); + verifyTableLatency(cluster, 1, () -> verifyLatencyMetrics(cluster, select.toCQL(), ConsistencyLevel.SERIAL, select.binds())); + cluster.get(1).runOnInstance(() -> Paxos.setPaxosVariant(Config.PaxosVariant.v2)); + verifyTableLatency(cluster, 1, () -> verifyLatencyMetrics(cluster, select.toCQL(), ConsistencyLevel.SERIAL, select.binds())); + + cluster.schemaChange(withKeyspace("ALTER TABLE %s.tbl WITH " + TransactionalMode.full.asCqlParam() + " AND " + TransactionalMigrationFromMode.none.asCqlParam())); + var txn = Txn.wrap(select); + verifyTableLatency(cluster, 1, () -> verifyLatencyMetrics(cluster, txn.toCQL(), ConsistencyLevel.QUORUM, txn.binds())); + + var let = Txn.builder() + .addLet("a", select) + .addReturnReferences("a.v") + .build(); + verifyTableLatency(cluster, 1, () -> verifyLatencyMetrics(cluster, let.toCQL(), ConsistencyLevel.QUORUM, let.binds())); + } + } + @Test public void internalPagingWithAggregateTest() throws Throwable { @@ -91,16 +137,26 @@ public void multiplePartitionKeyInClauseTest() throws Throwable } } - private void verifyLatencyMetricsWhenPaging(Cluster cluster, - int pagesize, - int expectedQueries, - String query, - ConsistencyLevel consistencyLevel) + private static void verifyLatencyMetricsWhenPaging(Cluster cluster, + int pagesize, + int expectedQueries, + String query, + ConsistencyLevel consistencyLevel) + { + verifyLatencyMetrics(cluster, expectedQueries, () -> cluster.coordinator(1).executeWithPaging(query, consistencyLevel, pagesize)); + } + + private static void verifyLatencyMetrics(Cluster cluster, String query, ConsistencyLevel consistencyLevel, Object[] bindings) + { + verifyLatencyMetrics(cluster, 1, () -> cluster.coordinator(1).execute(query, consistencyLevel, bindings)); + } + + private static void verifyLatencyMetrics(Cluster cluster, int expectedQueries, Runnable query) { long countBefore = cluster.get(1).callOnInstance(() -> ClientRequestsMetricsHolder.readMetrics.latency.getCount()); long totalLatencyBefore = cluster.get(1).callOnInstance(() -> ClientRequestsMetricsHolder.readMetrics.totalLatency.getCount()); long startTime = System.nanoTime(); - cluster.coordinator(1).executeWithPaging(query, consistencyLevel, pagesize); + query.run(); long elapsedTime = System.nanoTime() - startTime; long countAfter = cluster.get(1).callOnInstance(() -> ClientRequestsMetricsHolder.readMetrics.latency.getCount()); long totalLatencyAfter = cluster.get(1).callOnInstance(() -> ClientRequestsMetricsHolder.readMetrics.totalLatency.getCount()); @@ -113,4 +169,16 @@ private void verifyLatencyMetricsWhenPaging(Cluster cluster, totalLatencyRecorded <= elapsedTime); } + private static void verifyTableLatency(Cluster cluster, int expectedQueries, Runnable query) + { + IInvokableInstance inst = cluster.get(1); + LongSupplier tableMetric = () -> inst.callOnInstance(() -> Keyspace.open("distributed_test_keyspace").getColumnFamilyStore("tbl").getMetrics().readLatency.latency.getCount()); + + long tableBefore = tableMetric.getAsLong(); + query.run(); + long tableAfter = tableMetric.getAsLong(); + + Assertions.assertThat(tableAfter - tableBefore).isEqualTo(expectedQueries); + } + } diff --git a/test/distributed/org/apache/cassandra/distributed/test/metrics/HintsServiceMetricsTest.java b/test/distributed/org/apache/cassandra/distributed/test/metrics/HintsServiceMetricsTest.java index 0fc9bff174b8..42b274e628a8 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/metrics/HintsServiceMetricsTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/metrics/HintsServiceMetricsTest.java @@ -108,6 +108,8 @@ public void testHintsServiceMetrics() throws Exception dropWritesForNode2.set(true); for (int i = 0; i < NUM_ROWS / 2; i++) coordinator.execute(withKeyspace("INSERT INTO %s.t (k, v) VALUES (?, ?)"), QUORUM, i, i); + // some hints have created for node1, so file size must be greater than 0 + waitUntilAsserted(() -> assertThat(countHintsFileSize(node1)).isGreaterThan(0)); dropWritesForNode2.set(false); // write the second half of the rows with the third node dropping mutations requests, @@ -115,8 +117,15 @@ public void testHintsServiceMetrics() throws Exception dropWritesForNode3.set(true); for (int i = NUM_ROWS / 2; i < NUM_ROWS; i++) coordinator.execute(withKeyspace("INSERT INTO %s.t (k, v) VALUES (?, ?)"), QUORUM, i, i); + // another hints have created for node1, so file size must be greater than 0 + waitUntilAsserted(() -> assertThat(countHintsFileSize(node1)).isGreaterThan(0)); dropWritesForNode3.set(false); + // Hints Throttle happens in the delivery process, so must be greater than 0 + waitUntilAsserted(() -> assertThat(countHintsThrottle(node1)).isGreaterThan(0)); + waitUntilAsserted(() -> assertThat(countHintsApplySucceeded(node1)).isEqualTo(0)); + waitUntilAsserted(() -> assertThat(countHintsApplyFailed(node1)).isEqualTo(0)); + // wait until all the hints have been successfully applied to the nodes that have been dropping mutations waitUntilAsserted(() -> assertThat(countRows(node2)).isEqualTo(countRows(node3)).isEqualTo(NUM_ROWS)); @@ -142,6 +151,14 @@ public void testHintsServiceMetrics() throws Exception assertThat(countHintsSucceeded(node)).isEqualTo(0); assertThat(countHintsFailed(node)).isEqualTo(0); assertThat(countHintsTimedOut(node)).isEqualTo(0); + assertThat(countHintsRetryDifferentSystem(node)).isEqualTo(0); + + assertThat(countHintsFileSize(node)).isEqualTo(0); + assertThat(countHintsThrottle(node)).isEqualTo(0); + // node two and three must apply these hints which belongs to them, so must be greater than 0 + assertThat(countHintsApplySucceeded(node)).isGreaterThan(0); + assertThat(countHintsApplyFailed(node)).isEqualTo(0); + assertThat(countGlobalDelays(node)).isEqualTo(0); cluster.forEach(target -> assertThat(countEndpointDelays(node, target)).isEqualTo(0)); } @@ -180,6 +197,35 @@ private static Long countHintsTimedOut(IInvokableInstance node) return node.callOnInstance(() -> HintsServiceMetrics.hintsTimedOut.getCount()); } + @SuppressWarnings("Convert2MethodRef") + private static Long countHintsRetryDifferentSystem(IInvokableInstance node) + { + return node.callOnInstance(() -> HintsServiceMetrics.hintsRetryDifferentSystem.getCount()); + } + + private static Long countHintsFileSize(IInvokableInstance node) + { + return node.callOnInstance(HintsServiceMetrics.hintsFileSize::getValue); + } + + @SuppressWarnings("Convert2MethodRef") + private static Long countHintsApplySucceeded(IInvokableInstance node) + { + return node.callOnInstance(() -> HintsServiceMetrics.hintsApplySucceeded.getCount()); + } + + @SuppressWarnings("Convert2MethodRef") + private static Long countHintsApplyFailed(IInvokableInstance node) + { + return node.callOnInstance(() -> HintsServiceMetrics.hintsApplyFailed.getCount()); + } + + @SuppressWarnings("Convert2MethodRef") + private static Long countHintsThrottle(IInvokableInstance node) + { + return node.callOnInstance(() -> HintsServiceMetrics.hintsThrottle.getCount()); + } + private static Long countGlobalDelays(IInvokableInstance node) { return getHistogramCount(node, "org.apache.cassandra.metrics.HintsService.Hint_delays"); diff --git a/test/distributed/org/apache/cassandra/distributed/test/repair/AutoRepairSchedulerDisallowParallelReplicaRepairAcrossSchedulesTest.java b/test/distributed/org/apache/cassandra/distributed/test/repair/AutoRepairSchedulerDisallowParallelReplicaRepairAcrossSchedulesTest.java new file mode 100644 index 000000000000..e1ccbcb8da1b --- /dev/null +++ b/test/distributed/org/apache/cassandra/distributed/test/repair/AutoRepairSchedulerDisallowParallelReplicaRepairAcrossSchedulesTest.java @@ -0,0 +1,131 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.distributed.test.repair; + +import java.io.IOException; +import java.util.concurrent.TimeUnit; + +import com.google.common.collect.ImmutableMap; + +import org.apache.cassandra.Util; +import org.apache.cassandra.metrics.AutoRepairMetrics; +import org.apache.cassandra.metrics.AutoRepairMetricsManager; + +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Test; + +import org.apache.cassandra.distributed.Cluster; +import org.apache.cassandra.distributed.test.TestBaseImpl; +import org.apache.cassandra.repair.autorepair.AutoRepair; +import org.apache.cassandra.repair.autorepair.AutoRepairConfig; +import org.apache.cassandra.service.AutoRepairService; +import org.apache.cassandra.utils.FBUtilities; + +import static org.hamcrest.Matchers.greaterThan; +import static org.junit.Assert.assertEquals; + +/** + * Distributed tests for {@link org.apache.cassandra.repair.autorepair.AutoRepair} scheduler's + * allow_parallel_replica_repair_across_schedules feature. + */ +public class AutoRepairSchedulerDisallowParallelReplicaRepairAcrossSchedulesTest extends TestBaseImpl +{ + private static Cluster cluster; + + @BeforeClass + public static void init() throws IOException + { + // Configure a cluster with preview and incremental repair enabled in a way that preview repair can be + // run on all three nodes concurrently, but incremental repair can only be run when there are no parallel + // repairs. We should detect contention in the incremental repair scheduler but not preview repaired + // scheduler as a result. + cluster = Cluster.build(3) + .withConfig(config -> config + .set("auto_repair", + ImmutableMap.of( + "repair_type_overrides", + ImmutableMap.of(AutoRepairConfig.RepairType.PREVIEW_REPAIRED.getConfigName(), + ImmutableMap.of( + // Configure preview repair to run frequently to + // provoke contention with incremental scheduler. + "initial_scheduler_delay", "5s", + "enabled", "true", + "parallel_repair_count", "3", + "allow_parallel_replica_repair", "true", + "min_repair_interval", "5s"), + AutoRepairConfig.RepairType.INCREMENTAL.getConfigName(), + ImmutableMap.of( + "initial_scheduler_delay", "5s", + "enabled", "true", + "parallel_repair_count", "3", + // Don't allow parallel replica repair across + // schedules + "allow_parallel_replica_repair", "false", + "allow_parallel_replica_repair_across_schedules", "false", + "min_repair_interval", "5s")))) + .set("auto_repair.enabled", "true") + .set("auto_repair.global_settings.repair_retry_backoff", "5s") + .set("auto_repair.repair_task_min_duration", "0s") + .set("auto_repair.repair_check_interval", "5s")) + .start(); + + cluster.schemaChange("CREATE KEYSPACE IF NOT EXISTS " + KEYSPACE + " WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 3};"); + cluster.schemaChange(withKeyspace("CREATE TABLE %s.tbl (pk int, ck text, v1 int, v2 int, PRIMARY KEY (pk, ck)) WITH read_repair='NONE'")); + } + + @AfterClass + public static void tearDown() + { + cluster.close(); + } + + @Test + public void testScheduler() + { + cluster.forEach(i -> i.runOnInstance(() -> { + try + { + AutoRepairService.setup(); + AutoRepair.instance.setup(); + } + catch (Exception e) + { + throw new RuntimeException(e); + } + })); + + // validate that the repair ran on all nodes + cluster.forEach(i -> i.runOnInstance(() -> { + // Expect contention on incremental repair across schedules + AutoRepairMetrics incrementalMetrics = AutoRepairMetricsManager.getMetrics(AutoRepairConfig.RepairType.INCREMENTAL); + Util.spinAssert(String.format("%s: AutoRepair has not observed any replica contention in INCREMENTAL repair", + FBUtilities.getJustBroadcastAddress().toString()), + greaterThan(0L), + incrementalMetrics.repairDelayedBySchedule::getCount, + 5, + TimeUnit.MINUTES); + + // No repair contention should be observed for preview repaired since allow_parallel_replica_repair was true + AutoRepairMetrics previewMetrics = AutoRepairMetricsManager.getMetrics(AutoRepairConfig.RepairType.PREVIEW_REPAIRED); + assertEquals(0L, previewMetrics.repairDelayedByReplica.getCount()); + assertEquals(0L, previewMetrics.repairDelayedBySchedule.getCount()); + })); + } +} diff --git a/test/distributed/org/apache/cassandra/distributed/test/repair/AutoRepairSchedulerTest.java b/test/distributed/org/apache/cassandra/distributed/test/repair/AutoRepairSchedulerTest.java new file mode 100644 index 000000000000..adca5070828e --- /dev/null +++ b/test/distributed/org/apache/cassandra/distributed/test/repair/AutoRepairSchedulerTest.java @@ -0,0 +1,203 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.distributed.test.repair; + +import java.io.IOException; +import java.text.ParseException; +import java.text.SimpleDateFormat; +import java.util.UUID; +import java.util.concurrent.TimeUnit; + +import com.google.common.collect.ImmutableMap; + +import org.apache.cassandra.Util; +import org.apache.cassandra.config.DurationSpec; +import org.apache.cassandra.distributed.api.TokenSupplier; +import org.apache.cassandra.metrics.AutoRepairMetrics; +import org.apache.cassandra.metrics.AutoRepairMetricsManager; +import org.apache.cassandra.schema.SystemDistributedKeyspace; + +import org.junit.AfterClass; +import org.junit.Assert; +import org.junit.BeforeClass; +import org.junit.Test; + +import org.apache.cassandra.distributed.Cluster; +import org.apache.cassandra.distributed.api.ConsistencyLevel; +import org.apache.cassandra.distributed.test.TestBaseImpl; +import org.apache.cassandra.repair.autorepair.AutoRepair; +import org.apache.cassandra.repair.autorepair.AutoRepairConfig; +import org.apache.cassandra.service.AutoRepairService; +import org.apache.cassandra.utils.FBUtilities; + +import static org.apache.cassandra.schema.SchemaConstants.DISTRIBUTED_KEYSPACE_NAME; +import static org.hamcrest.Matchers.greaterThan; +import static org.hamcrest.Matchers.greaterThanOrEqualTo; +import static org.junit.Assert.assertEquals; + +/** + * Distributed tests for {@link org.apache.cassandra.repair.autorepair.AutoRepair} scheduler + */ +public class AutoRepairSchedulerTest extends TestBaseImpl +{ + private static Cluster cluster; + static SimpleDateFormat sdf; + + @BeforeClass + public static void init() throws IOException + { + // Define the expected date format pattern + String pattern = "EEE MMM dd HH:mm:ss z yyyy"; + // Create SimpleDateFormat object with the given pattern + sdf = new SimpleDateFormat(pattern); + sdf.setLenient(false); + // Configure a 3-node cluster with num_tokens: 4 and auto_repair enabled + cluster = Cluster.build(3) + .withTokenCount(4) + .withTokenSupplier(TokenSupplier.evenlyDistributedTokens(3, 4)) + .withConfig(config -> config + .set("num_tokens", 4) + .set("auto_repair", + ImmutableMap.of( + "repair_type_overrides", + ImmutableMap.of(AutoRepairConfig.RepairType.FULL.getConfigName(), + ImmutableMap.of( + "initial_scheduler_delay", "5s", + "enabled", "true", + "parallel_repair_count", "3", + // Allow parallel replica repair to allow replicas + // to execute full repair at same time. + "allow_parallel_replica_repair", "true", + "min_repair_interval", "5s"), + AutoRepairConfig.RepairType.INCREMENTAL.getConfigName(), + ImmutableMap.of( + "initial_scheduler_delay", "5s", + "enabled", "true", + // Set parallel repair count to 3 to provoke + // contention between replicas when scheduling. + "parallel_repair_count", "3", + // Disallow parallel replica repair to prevent + // replicas from issuing incremental repair at + // same time. + "allow_parallel_replica_repair", "false", + // Run more aggressively since full repair is + // less restrictive about when it can run repair, + // so need to check more frequently to allow + // incremental to get an attempt in. + "min_repair_interval", "5s")))) + .set("auto_repair.enabled", "true") + .set("auto_repair.global_settings.repair_by_keyspace", "true") + .set("auto_repair.global_settings.repair_retry_backoff", "5s") + .set("auto_repair.repair_task_min_duration", "0s") + .set("auto_repair.repair_check_interval", "5s")) + .start(); + + cluster.schemaChange("CREATE KEYSPACE IF NOT EXISTS " + KEYSPACE + " WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 3};"); + cluster.schemaChange(withKeyspace("CREATE TABLE %s.tbl (pk int, ck text, v1 int, v2 int, PRIMARY KEY (pk, ck)) WITH read_repair='NONE'")); + } + + @AfterClass + public static void tearDown() + { + cluster.close(); + } + + @Test + public void testScheduler() throws ParseException + { + // ensure there was no history of previous repair runs through the scheduler + Object[][] rows = cluster.coordinator(1).execute(String.format("SELECT repair_type, host_id, repair_start_ts, repair_finish_ts, repair_turn FROM %s.%s", DISTRIBUTED_KEYSPACE_NAME, SystemDistributedKeyspace.AUTO_REPAIR_HISTORY), ConsistencyLevel.QUORUM); + assertEquals(0, rows.length); + + cluster.forEach(i -> i.runOnInstance(() -> { + try + { + AutoRepairService.setup(); + AutoRepair.instance.setup(); + } + catch (Exception e) + { + throw new RuntimeException(e); + } + })); + + // validate that the repair ran on all nodes + cluster.forEach(i -> i.runOnInstance(() -> { + String broadcastAddress = FBUtilities.getJustBroadcastAddress().toString(); + + // Reduce sleeping if repair finishes quickly to speed up test but make it non-zero to provoke some + // contention. + AutoRepair.SLEEP_IF_REPAIR_FINISHES_QUICKLY = new DurationSpec.IntSecondsBound("2s"); + + AutoRepairMetrics incrementalMetrics = AutoRepairMetricsManager.getMetrics(AutoRepairConfig.RepairType.INCREMENTAL); + // Since the AutoRepair sleeps up to SLEEP_IF_REPAIR_FINISHES_QUICKLY if the repair finishes quickly, + // so the "nodeRepairTimeInSec" metric should at least be greater than or equal to + // SLEEP_IF_REPAIR_FINISHES_QUICKLY + Util.spinAssert(String.format("%s: AutoRepair has not yet completed one INCREMENTAL repair cycle", broadcastAddress), + greaterThanOrEqualTo(2L), + () -> incrementalMetrics.nodeRepairTimeInSec.getValue().longValue(), + 5, + TimeUnit.MINUTES); + + // Expect some contention on incremental repair. + Util.spinAssert(String.format("%s: AutoRepair has not observed any replica contention in INCREMENTAL repair", broadcastAddress), + greaterThan(0L), + incrementalMetrics.repairDelayedByReplica::getCount, + 5, + TimeUnit.MINUTES); + // Do not expect any contention across schedules since allow_parallel_replica_repairs across schedules + // was not configured. + assertEquals(0L, incrementalMetrics.repairDelayedBySchedule.getCount()); + + AutoRepairMetrics fullMetrics = AutoRepairMetricsManager.getMetrics(AutoRepairConfig.RepairType.FULL); + Util.spinAssert(String.format("%s: AutoRepair has not yet completed one FULL repair cycle", broadcastAddress), + greaterThanOrEqualTo(2L), + () -> fullMetrics.nodeRepairTimeInSec.getValue().longValue(), + 5, + TimeUnit.MINUTES); + + // No repair contention should be observed for full repair since allow_parallel_replica_repair was true + assertEquals(0L, fullMetrics.repairDelayedByReplica.getCount()); + assertEquals(0L, fullMetrics.repairDelayedBySchedule.getCount()); + })); + + validate(AutoRepairConfig.RepairType.FULL.toString()); + validate(AutoRepairConfig.RepairType.INCREMENTAL.toString()); + } + + private void validate(String repairType) throws ParseException + { + Object[][] rows = cluster.coordinator(1).execute(String.format("SELECT repair_type, host_id, repair_start_ts, repair_finish_ts, repair_turn FROM %s.%s where repair_type='%s'", DISTRIBUTED_KEYSPACE_NAME, SystemDistributedKeyspace.AUTO_REPAIR_HISTORY, repairType), ConsistencyLevel.QUORUM); + assertEquals(3, rows.length); + for (int node = 0; node < rows.length; node++) + { + Object[] row = rows[node]; + // repair_type + Assert.assertEquals(repairType, row[0].toString()); + // host_id + Assert.assertNotNull(UUID.fromString(row[1].toString())); + // ensure there is a legit repair_start_ts and repair_finish_ts + sdf.parse(row[2].toString()); + sdf.parse(row[3].toString()); + // the reason why the repair was scheduled + Assert.assertNotNull(row[4]); + Assert.assertEquals("MY_TURN", row[4].toString()); + } + } +} diff --git a/test/distributed/org/apache/cassandra/distributed/test/sai/IndexAvailabilityTest.java b/test/distributed/org/apache/cassandra/distributed/test/sai/IndexAvailabilityTest.java index 53cca2614dc3..66a63fa11b91 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/sai/IndexAvailabilityTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/sai/IndexAvailabilityTest.java @@ -33,6 +33,7 @@ import net.bytebuddy.dynamic.loading.ClassLoadingStrategy; import net.bytebuddy.implementation.MethodDelegation; import org.apache.cassandra.distributed.Cluster; +import org.apache.cassandra.distributed.api.ConsistencyLevel; import org.apache.cassandra.distributed.api.IInvokableInstance; import org.apache.cassandra.distributed.test.TestBaseImpl; import org.apache.cassandra.index.Index; @@ -48,6 +49,7 @@ import static org.apache.cassandra.distributed.api.Feature.GOSSIP; import static org.apache.cassandra.distributed.api.Feature.NETWORK; import static org.apache.cassandra.distributed.test.sai.SAIUtil.waitForIndexQueryable; +import static org.assertj.core.api.Assertions.assertThatThrownBy; import static org.awaitility.Awaitility.await; import static org.junit.Assert.assertEquals; @@ -57,7 +59,7 @@ public class IndexAvailabilityTest extends TestBaseImpl private static final String CREATE_TABLE = "CREATE TABLE %s.%s (pk text primary key, v1 int, v2 text) " + "WITH compaction = {'class' : 'SizeTieredCompactionStrategy', 'enabled' : false }"; private static final String CREATE_INDEX = "CREATE CUSTOM INDEX %s ON %s.%s(%s) USING 'StorageAttachedIndex'"; - + private static final Map expectedNodeIndexQueryability = new ConcurrentHashMap<>(); private List keyspaces; private List indexesPerKs; @@ -188,6 +190,82 @@ private void markIndexNonQueryable(IInvokableInstance node, String keyspace, Str }); } + @Test + public void testIndexExceptionsTwoIndexesOn3NodeCluster() throws Exception + { + try (Cluster cluster = init(Cluster.build(3) + .withConfig(config -> config.with(GOSSIP) + .with(NETWORK)) + .start())) + { + String ks2 = "ks2"; + String cf1 = "cf1"; + String index1 = "cf1_idx1"; + String index2 = "cf1_idx2"; + + // Create keyspace, table with correct column types + cluster.schemaChange(String.format(CREATE_KEYSPACE, ks2, 2)); + cluster.schemaChange("CREATE TABLE " + ks2 + '.' + cf1 + " (pk int PRIMARY KEY, v1 int, v2 int)"); + executeOnAllCoordinators(cluster, + "SELECT pk FROM " + ks2 + '.' + cf1 + " WHERE v1=0 AND v2=0 ALLOW FILTERING"); + executeOnAllCoordinators(cluster, + "SELECT pk FROM " + ks2 + '.' + cf1 + " WHERE v2=0 ALLOW FILTERING"); + executeOnAllCoordinators(cluster, + "SELECT pk FROM " + ks2 + '.' + cf1 + " WHERE v1=0 ALLOW FILTERING"); + + cluster.schemaChange(String.format(CREATE_INDEX, index1, ks2, cf1, "v1")); + cluster.schemaChange(String.format(CREATE_INDEX, index2, ks2, cf1, "v2")); + cluster.forEach(node -> expectedNodeIndexQueryability.put(NodeIndex.create(ks2, index1, node), Index.Status.BUILD_SUCCEEDED)); + for (IInvokableInstance node : cluster.get(2, 1, 3)) + for (IInvokableInstance replica : cluster.get(1, 2, 3)) + waitForIndexingStatus(node, ks2, index1, replica, Index.Status.BUILD_SUCCEEDED); + + // Mark only index2 as building on node3, leave index1 in BUILD_SUCCEEDED state + markIndexBuilding(cluster.get(3), ks2, cf1, index2); + cluster.forEach(node -> expectedNodeIndexQueryability.put(NodeIndex.create(ks2, index2, node), Index.Status.FULL_REBUILD_STARTED)); + for (IInvokableInstance node : cluster.get(1, 2, 3)) + waitForIndexingStatus(node, ks2, index2, cluster.get(3), Index.Status.FULL_REBUILD_STARTED); + + assertThatThrownBy(() -> + executeOnAllCoordinators(cluster, + "SELECT pk FROM " + ks2 + '.' + cf1 + " WHERE v1=0 AND v2=0")) + .hasMessageContaining("Operation failed - received 1 responses and 1 failures: INDEX_BUILD_IN_PROGRESS"); + + // Mark only index2 as failing on node2, leave index1 in BUILD_SUCCEEDED state + markIndexBuilding(cluster.get(2), ks2, cf1, index2); + cluster.forEach(node -> expectedNodeIndexQueryability.put(NodeIndex.create(ks2, index2, node), Index.Status.FULL_REBUILD_STARTED)); + for (IInvokableInstance node : cluster.get(1, 2, 3)) + waitForIndexingStatus(node, ks2, index2, cluster.get(2), Index.Status.FULL_REBUILD_STARTED); + + + assertThatThrownBy(() -> + executeOnAllCoordinators(cluster, + "SELECT pk FROM " + ks2 + '.' + cf1 + " WHERE v1=0 AND v2=0")) + .hasMessageContaining("Operation failed - received 1 responses and 1 failures: INDEX_BUILD_IN_PROGRESS"); + + // Mark only index2 as failing on node1, leave index1 in BUILD_SUCCEEDED state + markIndexNonQueryable(cluster.get(1), ks2, cf1, index2); + cluster.forEach(node -> expectedNodeIndexQueryability.put(NodeIndex.create(ks2, index2, node), Index.Status.BUILD_FAILED)); + for (IInvokableInstance node : cluster.get(1, 2, 3)) { + waitForIndexingStatus(node, ks2, index2, cluster.get(1), Index.Status.BUILD_FAILED); + } + + assertThatThrownBy(() -> + executeOnAllCoordinators(cluster, + "SELECT pk FROM " + ks2 + '.' + cf1 + " WHERE v1=0 AND v2=0")) + .hasMessageMatching("^Operation failed - received 0 responses and 2 failures: INDEX_NOT_AVAILABLE from .+, INDEX_BUILD_IN_PROGRESS from .+$"); + } + } + + private void executeOnAllCoordinators(Cluster cluster, String query) + { + // test different coordinator + for (int nodeId = 1; nodeId <= cluster.size(); nodeId++) + { + assertEquals(0, cluster.coordinator(nodeId).execute(query, ConsistencyLevel.LOCAL_QUORUM).length); + } + } + @SuppressWarnings("DataFlowIssue") private void markIndexQueryable(IInvokableInstance node, String keyspace, String table, String indexName) { diff --git a/test/distributed/org/apache/cassandra/distributed/test/sai/PartialWritesWithRepairTest.java b/test/distributed/org/apache/cassandra/distributed/test/sai/PartialWritesWithRepairTest.java new file mode 100644 index 000000000000..703137c5c15a --- /dev/null +++ b/test/distributed/org/apache/cassandra/distributed/test/sai/PartialWritesWithRepairTest.java @@ -0,0 +1,96 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.distributed.test.sai; + +import org.apache.cassandra.dht.Murmur3Partitioner; +import org.apache.cassandra.distributed.Cluster; +import org.apache.cassandra.distributed.api.Feature; +import org.apache.cassandra.distributed.api.IInvokableInstance; +import org.apache.cassandra.distributed.shared.ClusterUtils; +import org.apache.cassandra.distributed.shared.ClusterUtils.Range; +import org.apache.cassandra.distributed.test.TestBaseImpl; +import org.junit.Test; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.List; + +public class PartialWritesWithRepairTest extends TestBaseImpl +{ + @Test + public void test() throws IOException + { + try (Cluster cluster = Cluster.build(2) + .withConfig(c -> c.with(Feature.values())) + .start()) + { + init(cluster); + cluster.schemaChange(withKeyspace("CREATE TABLE %s.tbl (pk vector, ck int, s1 int static, v1 int, v2 int, PRIMARY KEY (pk, ck))")); + cluster.schemaChange(withKeyspace("CREATE INDEX ON %s.tbl(s1) USING 'sai'")); + cluster.schemaChange(withKeyspace("CREATE INDEX ON %s.tbl(v1) USING 'sai'")); + cluster.schemaChange(withKeyspace("CREATE INDEX ON %s.tbl(v2) USING 'sai'")); + IInvokableInstance node1 = cluster.get(1); + IInvokableInstance node2 = cluster.get(2); + // see org.apache.cassandra.service.StorageService.repair + List partialRanges = ClusterUtils.getPrimaryRanges(node1, KEYSPACE); + var completeRanges = completeRanges(partialRanges); + + // write to each column for the complete set + // avoid writing to one of the columns for the partial set + for (var range : completeRanges) + { + ByteBuffer pk = key(range); + node2.executeInternal(withKeyspace("INSERT INTO %s.tbl(pk, ck, s1, v1, v2) VALUES (?, ?, ?, ?, ?)"), pk, 0, 0, 0, 0); + node2.executeInternal(withKeyspace("INSERT INTO %s.tbl(pk, ck, s1, v1, v2) VALUES (?, ?, ?, ?, ?)"), pk, 1, 0, 1, 1); + } + for (var range : partialRanges) + { + ByteBuffer pk = key(range); + node2.executeInternal(withKeyspace("INSERT INTO %s.tbl(pk, ck, v1) VALUES (?, ?, ?)"), pk, 0, 0); + node2.executeInternal(withKeyspace("INSERT INTO %s.tbl(pk, ck, v1) VALUES (?, ?, ?)"), pk, 1, 1); + } + + node1.nodetoolResult("repair", KEYSPACE, "-pr").asserts().success(); + } + } + + private static ByteBuffer key(Range range) + { + return Murmur3Partitioner.LongToken.keyForToken(range.right()); + } + + private static List completeRanges(List ranges) + { + ranges.sort(Comparator.comparingLong(Range::left)); + List list = new ArrayList<>(); + Range previous = ranges.get(0); + if (previous.left() != Long.MIN_VALUE) + list.add(new Range(Long.MIN_VALUE, ranges.get(0).left())); + for (int i = 1; i < ranges.size(); i++) + { + Range next = ranges.get(i); + if (!previous.right.equals(next.left)) + list.add(new Range(previous.right, next.left)); + previous = next; + } + return list; + } +} diff --git a/test/distributed/org/apache/cassandra/distributed/test/sai/ReplicaFilteringWithStaticsTest.java b/test/distributed/org/apache/cassandra/distributed/test/sai/ReplicaFilteringWithStaticsTest.java index 59cc1cc6a51f..61f221c1520f 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/sai/ReplicaFilteringWithStaticsTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/sai/ReplicaFilteringWithStaticsTest.java @@ -27,6 +27,7 @@ import org.apache.cassandra.distributed.Cluster; import org.apache.cassandra.distributed.api.ConsistencyLevel; import org.apache.cassandra.distributed.test.TestBaseImpl; +import org.apache.cassandra.distributed.test.cql3.MultiNodeTableWalkWithoutReadRepairTest; import static org.apache.cassandra.distributed.api.ConsistencyLevel.ALL; import static org.apache.cassandra.distributed.api.Feature.GOSSIP; @@ -44,6 +45,45 @@ public static void setUpCluster() throws IOException CLUSTER = init(Cluster.build(3).withConfig(config -> config.set("hinted_handoff_enabled", false).with(GOSSIP).with(NETWORK)).start()); } + @Test + public void testRowFilterDeletePurging() + { + testRowFilterDeletePurging(false); + } + + @Test + public void testRowFilterDeletePurgingSAI() + { + testRowFilterDeletePurging(true); + } + + /** + * Originally discovered by {@link MultiNodeTableWalkWithoutReadRepairTest} with seed 6640281155419111674 + */ + public void testRowFilterDeletePurging(boolean sai) + { + String table = "row_filtering_delete_purging" + (sai ? "_sai" : ""); + + CLUSTER.schemaChange(withKeyspace("CREATE TABLE %s." + table + " (pk0 double, ck0 boolean, s0 ascii static, v0 ascii, " + + "PRIMARY KEY (pk0, ck0)) WITH CLUSTERING ORDER BY (ck0 DESC) AND read_repair = 'NONE'")); + disableCompaction(CLUSTER, KEYSPACE, table); + + if (sai) + { + CLUSTER.schemaChange(withKeyspace("CREATE INDEX ON %s." + table + "(s0) USING 'sai'")); + SAIUtil.waitForIndexQueryable(CLUSTER, KEYSPACE); + } + + CLUSTER.get(3).executeInternal(withKeyspace("UPDATE %s." + table + " USING TIMESTAMP 1 SET s0='foo', v0='c' WHERE pk0 = 2.9 AND ck0 IN (false, true)")); + + // This delete must be resolved by RFP to eliminate the row with ck0 = true from node 3: + CLUSTER.get(1).executeInternal(withKeyspace("DELETE FROM %s." + table + " USING TIMESTAMP 2 WHERE pk0 = 2.9 AND ck0 = true")); + CLUSTER.get(1).executeInternal(withKeyspace("INSERT INTO %s." + table + " (pk0, ck0, s0, v0) VALUES (2.9, false, 'bar', 'xyz') USING TIMESTAMP 3")); + + String select = withKeyspace("SELECT ck0 FROM %s." + table + " WHERE s0 = 'bar' ALLOW FILTERING"); + assertRows(CLUSTER.coordinator(1).executeWithPaging(select, ALL, 100), row(false)); + } + @Test public void testStaticMatchWithPartitionDelete() { diff --git a/test/distributed/org/apache/cassandra/distributed/test/sai/SAIUtil.java b/test/distributed/org/apache/cassandra/distributed/test/sai/SAIUtil.java index 36c6e8445d7a..c00e5699ba8c 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/sai/SAIUtil.java +++ b/test/distributed/org/apache/cassandra/distributed/test/sai/SAIUtil.java @@ -29,7 +29,6 @@ import org.apache.cassandra.distributed.Cluster; import org.apache.cassandra.distributed.api.Feature; import org.apache.cassandra.distributed.api.IInstance; -import org.apache.cassandra.distributed.api.IInvokableInstance; import org.apache.cassandra.distributed.api.SimpleQueryResult; import org.apache.cassandra.index.Index; import org.apache.cassandra.index.IndexStatusManager; @@ -86,23 +85,26 @@ public static void assertIndexQueryable(Cluster cluster, String keyspace, String */ private static void assertIndexesQueryable(Cluster cluster, String keyspace, final Iterable indexes) { - IInvokableInstance localNode = cluster.get(1); final List nodes = cluster.stream() .map(node -> nodeAddress(node.broadcastAddress())) .collect(Collectors.toList()); - localNode.runOnInstance(() -> { - for (String index : indexes) - { - for (InetAddressAndPort node : nodes) + for (var localNode : cluster) + { + if (localNode.isShutdown()) continue; + localNode.runOnInstance(() -> { + for (String index : indexes) { - Index.Status status = IndexStatusManager.instance.getIndexStatus(node, keyspace, index); - assert status == Index.Status.BUILD_SUCCEEDED + for (InetAddressAndPort node : nodes) + { + Index.Status status = IndexStatusManager.instance.getIndexStatus(node, keyspace, index); + assert status == Index.Status.BUILD_SUCCEEDED : "Index " + index + " not queryable on node " + node + " (status = " + status + ')'; + } } - } - }); + }); + } } private static InetAddressAndPort nodeAddress(InetSocketAddress address) diff --git a/test/distributed/org/apache/cassandra/distributed/test/sai/StrictFilteringTest.java b/test/distributed/org/apache/cassandra/distributed/test/sai/StrictFilteringTest.java index 480a93abe37f..b23e3f45875a 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/sai/StrictFilteringTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/sai/StrictFilteringTest.java @@ -25,10 +25,13 @@ import org.junit.BeforeClass; import org.junit.Test; +import org.apache.cassandra.db.Keyspace; import org.apache.cassandra.distributed.Cluster; import org.apache.cassandra.distributed.api.ConsistencyLevel; import org.apache.cassandra.distributed.test.TestBaseImpl; +import static org.junit.Assert.assertEquals; + import static org.apache.cassandra.distributed.api.Feature.GOSSIP; import static org.apache.cassandra.distributed.api.Feature.NETWORK; import static org.apache.cassandra.distributed.shared.AssertUtils.assertRows; @@ -216,6 +219,51 @@ public void testShortReadWithRegularColumns() assertRows(initialRows, row(0, 1, 2)); } + @Test + public void testNoShortReadAtLimit() + { + CLUSTER.schemaChange(withKeyspace("CREATE TABLE %s.no_srp_at_limit (k int, c int, a int, PRIMARY KEY (k, c)) WITH read_repair = 'NONE'")); + CLUSTER.schemaChange(withKeyspace("CREATE INDEX ON %s.no_srp_at_limit(a) USING 'sai'")); + SAIUtil.waitForIndexQueryable(CLUSTER, KEYSPACE); + + CLUSTER.get(1).executeInternal(withKeyspace("INSERT INTO %s.no_srp_at_limit(k, c, a) VALUES (0, 2, 1) USING TIMESTAMP 5")); + CLUSTER.get(2).executeInternal(withKeyspace("INSERT INTO %s.no_srp_at_limit(k, c, a) VALUES (0, 3, 1) USING TIMESTAMP 6")); + + Long srpRequestsBefore = CLUSTER.get(1).callOnInstance(() -> Keyspace.open(KEYSPACE).getColumnFamilyStore("no_srp_at_limit").metric.shortReadProtectionRequests.getCount()); + + String select = withKeyspace("SELECT * FROM %s.no_srp_at_limit WHERE k = 0 AND a = 1 LIMIT 1"); + Object[][] initialRows = CLUSTER.coordinator(1).execute(select, ConsistencyLevel.ALL, 2); + assertRows(initialRows, row(0, 2, 1)); + + Long srpRequestsAfter = CLUSTER.get(1).callOnInstance(() -> Keyspace.open(KEYSPACE).getColumnFamilyStore("no_srp_at_limit").metric.shortReadProtectionRequests.getCount()); + assertEquals(srpRequestsBefore, srpRequestsAfter); + } + + @Test + public void testNecessaryShortRead() + { + CLUSTER.schemaChange(withKeyspace("CREATE TABLE %s.necessary_short_read (k int, c int, a int, PRIMARY KEY (k, c)) WITH read_repair = 'NONE'")); + CLUSTER.schemaChange(withKeyspace("CREATE INDEX ON %s.necessary_short_read(a) USING 'sai'")); + SAIUtil.waitForIndexQueryable(CLUSTER, KEYSPACE); + + CLUSTER.get(1).executeInternal(withKeyspace("INSERT INTO %s.necessary_short_read(k, c, a) VALUES (0, 2, 1) USING TIMESTAMP 5")); + CLUSTER.get(2).executeInternal(withKeyspace("INSERT INTO %s.necessary_short_read(k, c, a) VALUES (0, 2, 2) USING TIMESTAMP 6")); + + CLUSTER.get(2).executeInternal(withKeyspace("INSERT INTO %s.necessary_short_read(k, c, a) VALUES (0, 3, 1) USING TIMESTAMP 7")); + CLUSTER.get(1).executeInternal(withKeyspace("INSERT INTO %s.necessary_short_read(k, c, a) VALUES (0, 3, 2) USING TIMESTAMP 8")); + + CLUSTER.get(1).executeInternal(withKeyspace("INSERT INTO %s.necessary_short_read(k, c, a) VALUES (0, 4, 1) USING TIMESTAMP 9")); + + Long srpRequestsBefore = CLUSTER.get(1).callOnInstance(() -> Keyspace.open(KEYSPACE).getColumnFamilyStore("necessary_short_read").metric.shortReadProtectionRequests.getCount()); + + String select = withKeyspace("SELECT * FROM %s.necessary_short_read WHERE k = 0 AND a = 1 LIMIT 1"); + Object[][] initialRows = CLUSTER.coordinator(1).execute(select, ConsistencyLevel.ALL); + assertRows(initialRows, row(0, 4, 1)); + + Long srpRequestsAfter = CLUSTER.get(1).callOnInstance(() -> Keyspace.open(KEYSPACE).getColumnFamilyStore("necessary_short_read").metric.shortReadProtectionRequests.getCount()); + assertEquals(srpRequestsBefore + 2L, srpRequestsAfter.longValue()); + } + @Test public void testShortReadWithStaticColumn() { diff --git a/test/distributed/org/apache/cassandra/distributed/test/tcm/FailureDetectorRecomputeTest.java b/test/distributed/org/apache/cassandra/distributed/test/tcm/FailureDetectorRecomputeTest.java new file mode 100644 index 000000000000..7abfd7386cf2 --- /dev/null +++ b/test/distributed/org/apache/cassandra/distributed/test/tcm/FailureDetectorRecomputeTest.java @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.distributed.test.tcm; + +import java.io.IOException; + +import org.junit.Test; + +import java.util.concurrent.Callable; +import java.util.concurrent.atomic.AtomicBoolean; + +import net.bytebuddy.ByteBuddy; +import net.bytebuddy.dynamic.loading.ClassLoadingStrategy; +import net.bytebuddy.implementation.MethodDelegation; +import net.bytebuddy.implementation.bind.annotation.SuperCall; +import org.apache.cassandra.distributed.Cluster; +import org.apache.cassandra.distributed.api.ConsistencyLevel; +import org.apache.cassandra.distributed.test.TestBaseImpl; +import org.apache.cassandra.gms.FailureDetector; +import org.apache.cassandra.locator.InetAddressAndPort; +import org.apache.cassandra.locator.ReplicaPlan; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.tcm.ClusterMetadataService; +import org.apache.cassandra.tcm.transformations.CustomTransformation; + +import static net.bytebuddy.matcher.ElementMatchers.named; +import static net.bytebuddy.matcher.ElementMatchers.takesArguments; + +public class FailureDetectorRecomputeTest extends TestBaseImpl +{ + @Test + public void readTest() throws IOException + { + try (Cluster cluster = init(Cluster.build(3) + .withInstanceInitializer(BB::install) + .start())) + { + cluster.schemaChange(withKeyspace("create table %s.tbl (id int primary key)")); + cluster.get(1).runOnInstance(() -> BB.enabled.set(true)); + for (int i = 0; i < 10; i++) + cluster.coordinator(1).execute(withKeyspace("select * from %s.tbl where id=?"), ConsistencyLevel.QUORUM, i); + } + } + + @Test + public void writeTest() throws IOException + { + try (Cluster cluster = init(Cluster.build(3) + .withInstanceInitializer(BB::install) + .start())) + { + cluster.schemaChange(withKeyspace("create table %s.tbl (id int primary key)")); + cluster.get(1).runOnInstance(() -> BB.enabled.set(true)); + for (int i = 0; i < 10; i++) + cluster.coordinator(1).execute(withKeyspace("insert into %s.tbl (id) values (?)"), ConsistencyLevel.QUORUM, i); + } + } + + public static class BB + { + public static AtomicBoolean enabled = new AtomicBoolean(); + + public static void install(ClassLoader cl, int i) + { + new ByteBuddy().rebase(FailureDetector.class) + .method(named("isAlive").and(takesArguments(1))) + .intercept(MethodDelegation.to(FailureDetectorRecomputeTest.BB.class)) + .make() + .load(cl, ClassLoadingStrategy.Default.INJECTION); + + new ByteBuddy().rebase(ReplicaPlan.AbstractForRead.class) + .method(named("stillAppliesTo").and(takesArguments(1))) + .intercept(MethodDelegation.to(FailureDetectorRecomputeTest.BB.class)) + .make() + .load(cl, ClassLoadingStrategy.Default.INJECTION); + } + + static int downNode = 1; + public static boolean isAlive(InetAddressAndPort ep) + { + if (!enabled.get()) + return true; + enabled.set(false); + ClusterMetadataService.instance().commit(CustomTransformation.make("hello")); + enabled.set(true); + return !ep.equals(InetAddressAndPort.getByNameUnchecked("127.0.0." + ((downNode % 3) + 1))); + } + + public static boolean stillAppliesTo(ClusterMetadata metadata, @SuperCall Callable zuper) throws Exception + { + if (!enabled.get()) + return true; + downNode++; + return zuper.call(); + } + } +} diff --git a/test/distributed/org/apache/cassandra/distributed/test/tcm/RepairMetadataKeyspaceTest.java b/test/distributed/org/apache/cassandra/distributed/test/tcm/RepairMetadataKeyspaceTest.java index 074a64913f48..70d1acfddd14 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/tcm/RepairMetadataKeyspaceTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/tcm/RepairMetadataKeyspaceTest.java @@ -28,6 +28,7 @@ import org.junit.Test; +import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.distributed.Cluster; import org.apache.cassandra.distributed.api.IInvokableInstance; import org.apache.cassandra.distributed.shared.ClusterUtils; @@ -63,6 +64,7 @@ public void testRepairMetadataKeyspace() throws Throwable IInvokableInstance toRepair = cluster.get(3); stopUnchecked(toRepair); + DatabaseDescriptor.clientInitialization(); String targetDir = DistributedMetadataLogKeyspace.TABLE_NAME + '-' + DistributedMetadataLogKeyspace.LOG_TABLE_ID.toHexString(); for (File datadir : getDataDirectories(toRepair)) { diff --git a/test/distributed/org/apache/cassandra/distributed/test/thresholds/TombstoneCountWarningTest.java b/test/distributed/org/apache/cassandra/distributed/test/thresholds/TombstoneCountWarningTest.java index 5e409cbc31e0..054256a08f1e 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/thresholds/TombstoneCountWarningTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/thresholds/TombstoneCountWarningTest.java @@ -57,6 +57,7 @@ import org.apache.cassandra.distributed.test.JavaDriverUtils; import org.apache.cassandra.distributed.test.TestBaseImpl; import org.apache.cassandra.exceptions.ReadFailureException; +import org.apache.cassandra.exceptions.RequestFailure; import org.apache.cassandra.exceptions.RequestFailureReason; import org.apache.cassandra.exceptions.TombstoneAbortException; import org.apache.cassandra.locator.InetAddressAndPort; @@ -425,7 +426,7 @@ public static void awaitResults(@SuperCall Runnable zuper) } @SuppressWarnings("unused") - public static void onFailure(InetAddressAndPort from, RequestFailureReason failureReason, @SuperCall Runnable zuper) throws Exception + public static void onFailure(InetAddressAndPort from, RequestFailure failure, @SuperCall Runnable zuper) throws Exception { State.onFailure(new InetSocketAddress(from.getAddress(), from.getPort())); zuper.run(); diff --git a/test/distributed/org/apache/cassandra/distributed/test/topology/DecommissionAvoidTimeouts.java b/test/distributed/org/apache/cassandra/distributed/test/topology/DecommissionAvoidTimeouts.java index 69ccafdd458e..ddf59ae84058 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/topology/DecommissionAvoidTimeouts.java +++ b/test/distributed/org/apache/cassandra/distributed/test/topology/DecommissionAvoidTimeouts.java @@ -74,6 +74,7 @@ public abstract class DecommissionAvoidTimeouts extends TestBaseImpl @Test public void test() throws Exception { + IInvokableInstance paused = null; try (Cluster cluster = Cluster.build(8) .withRacks(2, 4) .withInstanceInitializer(new BB()) @@ -103,11 +104,13 @@ public void test() throws Exception toDecom.coordinator().execute("INSERT INTO " + table + "(pk) VALUES (?)", ConsistencyLevel.EACH_QUORUM, key); } - Callable pending = pauseBeforeCommit(cluster.get(1), (e) -> e instanceof PrepareLeave.StartLeave); + paused = cluster.get(1); + Callable pending = pauseBeforeCommit(paused, (e) -> e instanceof PrepareLeave.StartLeave); CompletableFuture nodetool = CompletableFuture.runAsync(() -> toDecom.nodetoolResult("decommission").asserts().success()); ClusterUtils.awaitGossipStateMatch(cluster, cluster.get(DECOM_NODE), ApplicationState.SEVERITY); pending.call(); - unpauseCommits(cluster.get(1)); + unpauseCommits(paused); + paused = null; cluster.forEach(i -> i.runOnInstance(() -> ((DynamicEndpointSnitch) DatabaseDescriptor.getNodeProximity()).updateScores())); cluster.filters().verbs(Verb.GOSSIP_DIGEST_SYN.id).drop(); @@ -169,6 +172,12 @@ public void test() throws Exception // ignore } } + finally + { + if (paused != null) + unpauseCommits(paused); + + } } protected abstract String getQuery(String table); diff --git a/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModeBatchTestBase.java b/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModeBatchTestBase.java index d7696f73d517..9b868af17b83 100644 --- a/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModeBatchTestBase.java +++ b/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModeBatchTestBase.java @@ -62,6 +62,8 @@ protected void testSimpleStrategy(Semver from, Semver to, boolean isLogged) thro .setup(cluster -> { cluster.schemaChange("CREATE KEYSPACE test_simple WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 2};"); cluster.schemaChange("CREATE TABLE test_simple.names (key int PRIMARY KEY, name text)"); + if (isLogged) + cluster.setUncaughtExceptionsFilter(t -> t.getMessage() != null && t.getMessage().startsWith("Operation timed out")); }) .runAfterNodeUpgrade((cluster, upgraded) -> { if (isLogged) diff --git a/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModePaxosTTLTest.java b/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModePaxosTTLTest.java new file mode 100644 index 000000000000..63d2497ad802 --- /dev/null +++ b/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModePaxosTTLTest.java @@ -0,0 +1,94 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.distributed.upgrade; + +import java.util.concurrent.TimeUnit; + +import org.junit.Test; + +import org.apache.cassandra.distributed.api.ConsistencyLevel; +import org.apache.cassandra.distributed.api.Feature; +import org.apache.cassandra.distributed.upgrade.MixedModePaxosTestBase.FakePaxosHelper; + +import static java.lang.String.format; + +public class MixedModePaxosTTLTest extends UpgradeTestBase +{ + /** + * Tests the mixed mode paxos loop bug in CASSANDRA-20514 + * + * CEP-14 changed the ttl behavior of legacy paxos state to expire based off the ballot time of the operation being + * persisted, not the time a commit is persisted. This eliminated the race addressed by CASSANDRA-12043, and so the + * check it added to the most recent commit prepare logic was removed. + * + * When operating in mixed mode though, this can still be a problem. If a 4.1 or higher node is coordinating a paxos + * operation with 2 or more replicas on 4.0 or lower, this race becomes a problem again. You need 3 things to make + * this an infinite loop + * 1. a 4.1 node coordinating a paxos operation with 2x 4.0 replicas + * 2. replica A) a 4.0 node returns a most recent commit for a ballot that's could have been ttld + * 3. replica B) a 4.0 node has ttl'd that mrc AND converted the ttld cells into tombstones + * + * The 4.1 coordinator receives the mrc from replica A, but since it no longer disregards missing most recent commits + * past the ttl window, it sends the "missing" commit to replica B. Since replica B now has a tombstone for that mrc, + * and tombstones win when reconciled with live cells, even ones with ttls, the commit is a noop and it continues + * to report nothing for its mrc value when the coordinator restarts the prepare phase. This loops until the query + * times out. + */ + @Test + public void legacyExpiredStateTest() throws Throwable + { + String keyspace = "ks"; + String table = "tbl"; + int gcGrace = 60*60*24; // 1 day + int key = 100; // hashes to nodes 2 & 3 w/ murmur @ RF=2 + new TestCase() + .withConfig(c -> c.with(Feature.GOSSIP, Feature.NETWORK).set("cas_contention_timeout", "500ms")) + .nodes(3) + .nodesToUpgrade(1) + .upgradesToCurrentFrom(v40) + .setup(cluster -> { + cluster.schemaChange(format("CREATE KEYSPACE %s WITH REPLICATION={'class': 'SimpleStrategy', 'replication_factor': '2'}", keyspace)); + cluster.schemaChange(format("CREATE TABLE %s.%s (k int primary key, v int) " + + "WITH gc_grace_seconds=%s", keyspace, table, gcGrace)); + }) + .runAfterClusterUpgrade(cluster -> { + // disable compaction to prevent paxos state from being purged + cluster.forEach(instance -> instance.nodetool("disableautocompaction")); + + long ballotMicros = TimeUnit.MILLISECONDS.toMicros(System.currentTimeMillis()); + ballotMicros -= TimeUnit.SECONDS.toMicros(gcGrace + 10); + FakePaxosHelper helper = FakePaxosHelper.create(cluster.coordinator(1), keyspace, table, key, gcGrace, ballotMicros); + + // confirm none of the nodes have paxos state + for (int i = 1; i <= cluster.size(); i++) + helper.assertNoPaxosData(cluster.coordinator(i)); + + // save a tombstoned commit to one node to simulate expired cells being converted to tombstones + helper.tombstoneCommit(cluster.coordinator(2)); + + // insert paxos state and confirm it hasn't ttl'd yet + helper.saveCommit(cluster.coordinator(3)); + helper.assertPaxosData(cluster.coordinator(3)); + + // paxos operation should not timeout + cluster.coordinator(1).execute(format("SELECT * FROM %s.%s WHERE k=%s", keyspace, table, key), ConsistencyLevel.SERIAL); + }) + .run(); + } +} diff --git a/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModePaxosTestBase.java b/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModePaxosTestBase.java index 9aec8461d982..e238e8c00c2e 100644 --- a/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModePaxosTestBase.java +++ b/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModePaxosTestBase.java @@ -76,6 +76,7 @@ private void ttldPaxosStateTest(boolean legacyAware, boolean upgradeAware) throw String keyspace = KEYSPACE; String table = "tbl"; int gcGrace = 10; + int key = 1; new TestCase() .withConfig(c -> c.with(Feature.GOSSIP, Feature.NETWORK)) .nodes(2) @@ -91,11 +92,11 @@ private void ttldPaxosStateTest(boolean legacyAware, boolean upgradeAware) throw // insert a ttl'd committed paxos state long ballotMicros = TimeUnit.NANOSECONDS.toMicros(System.currentTimeMillis()); - FakePaxosHelper helper = FakePaxosHelper.create(cluster.coordinator(1), keyspace, table, gcGrace, ballotMicros); + FakePaxosHelper helper = FakePaxosHelper.create(cluster.coordinator(1), keyspace, table, key, gcGrace, ballotMicros); // confirm none of the nodes have paxos state for (int i = 1; i <= cluster.size(); i++) - Assert.assertEquals(0, cluster.coordinator(i).execute("SELECT * FROM system.paxos", ConsistencyLevel.ONE).length); + helper.assertNoPaxosData(cluster.coordinator(i)); // save commit to both nodes @@ -109,11 +110,11 @@ private void ttldPaxosStateTest(boolean legacyAware, boolean upgradeAware) throw Thread.sleep(TimeUnit.SECONDS.toMillis(gcGrace * 2)); // confirm paxos state has ttld - Assert.assertEquals(0, cluster.coordinator(1).execute("SELECT * FROM system.paxos", ConsistencyLevel.ONE).length); - Assert.assertEquals(0, cluster.coordinator(2).execute("SELECT * FROM system.paxos", ConsistencyLevel.ONE).length); + helper.assertNoPaxosData(cluster.coordinator(1)); + helper.assertNoPaxosData(cluster.coordinator(2)); // paxos operation should not timeout - cluster.coordinator(upgradedCoordinator() ? 1 : 2).execute(format("SELECT * FROM %s.%s WHERE k=1", keyspace, table), ConsistencyLevel.SERIAL); + cluster.coordinator(upgradedCoordinator() ? 1 : 2).execute(format("SELECT * FROM %s.%s WHERE k=%s", keyspace, table, key), ConsistencyLevel.SERIAL); }) .run(); } @@ -133,14 +134,14 @@ public void legacyAwareTTldPaxosStateTest() throws Throwable @Test public void bothAwareTTldPaxosStateTest() throws Throwable { - ttldPaxosStateTest(true, false); + ttldPaxosStateTest(true, true); } /** * This is an upgrade test, and paxos internally limits ttls to 3 hours, so we have to manually save commits in * the paxos table to get entries ttl'd in a reasonable amount of time */ - private static class FakePaxosHelper + static class FakePaxosHelper { static final int current_version = MessagingService.current_version; static final int version_40a = MessagingService.VERSION_40; @@ -181,6 +182,21 @@ ByteBuffer updateBytes(int version) return PartitionUpdate.toBytes(update, version); } + private Object[][] paxosData(ICoordinator coordinator) + { + return coordinator.execute("SELECT * FROM system.paxos WHERE row_key = ? AND cf_id = ?", ConsistencyLevel.ONE, key, cfId); + } + + void assertNoPaxosData(ICoordinator coordinator) + { + Assert.assertEquals(0, paxosData(coordinator).length); + } + + void assertPaxosData(ICoordinator coordinator) + { + Assert.assertEquals(1, paxosData(coordinator).length); + } + void saveCommit(ICoordinator coordinator) { String cql = "UPDATE system.paxos USING TIMESTAMP ? AND TTL ? SET proposal_ballot = null, proposal = null, most_recent_commit_at = ?, most_recent_commit = ?, most_recent_commit_version = ? WHERE row_key = ? AND cf_id = ?"; @@ -194,10 +210,31 @@ void saveCommit(ICoordinator coordinator) cfId); } - public static FakePaxosHelper create(ICoordinator coordinator, String keyspace, String table, int ttl, long ballotMicros) + void tombstoneCommit(ICoordinator coordinator) + { + String cql = "DELETE proposal_ballot, proposal, most_recent_commit_at, most_recent_commit, most_recent_commit_version FROM system.paxos USING TIMESTAMP ? WHERE row_key = ? AND cf_id = ?"; + coordinator.execute(cql, ConsistencyLevel.ONE, + ballotMicros, + key, + cfId); + } + + void saveCommitNoTTL(ICoordinator coordinator) + { + String cql = "UPDATE system.paxos USING TIMESTAMP ? SET proposal_ballot = null, proposal = null, most_recent_commit_at = ?, most_recent_commit = ?, most_recent_commit_version = ? WHERE row_key = ? AND cf_id = ?"; + coordinator.execute(cql, ConsistencyLevel.ONE, + ballotMicros, + ballot, + updateBytes(version_40a), + version_40a, + key, + cfId); + } + + public static FakePaxosHelper create(ICoordinator coordinator, String keyspace, String table, int key, int ttl, long ballotMicros) { UUID cfId = (UUID) coordinator.execute("SELECT id FROM system_schema.tables WHERE keyspace_name=? AND table_name=?", ConsistencyLevel.ONE, keyspace, table)[0][0]; - return new FakePaxosHelper(keyspace, table, cfId, 1, ttl, ballotMicros); + return new FakePaxosHelper(keyspace, table, cfId, key, ttl, ballotMicros); } } } diff --git a/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModeTTLOverflowUpgradeTestBase.java b/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModeTTLOverflowUpgradeTestBase.java index c0a4b5b747a4..95db3bebeda0 100644 --- a/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModeTTLOverflowUpgradeTestBase.java +++ b/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModeTTLOverflowUpgradeTestBase.java @@ -22,6 +22,7 @@ import java.util.function.BiConsumer; import java.util.stream.Stream; +import org.agrona.collections.IntHashSet; import org.apache.cassandra.cql3.Attributes; import org.apache.cassandra.distributed.UpgradeableCluster; import org.apache.cassandra.distributed.api.Feature; @@ -77,8 +78,35 @@ enum Step static volatile long clusterStatupTime = 0; + private static class AllowedErrors + { + private final IntHashSet upgraded = new IntHashSet(); + + private void clear() + { + upgraded.clear(); + } + + private void upgraded(int node) + { + upgraded.add(node); + } + + private boolean uncaughtExceptionsFilter(int node, Throwable t) + { + String message = t.getMessage(); + if (message != null && message.endsWith("In order to avoid this use a lower TTL, change the expiration date overflow policy or upgrade to a version where this limitation is fixed. See CASSANDRA-14092 for more details.")) + { + // upgraded nodes should not produce these errors + return !upgraded.contains(node); + } + return false; + } + } + static void testTTLOverflow(RunOnClusterAndNode runAfterNodeUpgrade) throws Throwable { + AllowedErrors allowedErrors = new AllowedErrors(); new TestCase() .nodes(2) .nodesToUpgradeOrdered(1, 2) @@ -87,6 +115,9 @@ static void testTTLOverflow(RunOnClusterAndNode runAfterNodeUpgrade) throws Thro .singleUpgradeToCurrentFrom(v41) .withConfig(c -> c.with(Feature.GOSSIP).set("storage_compatibility_mode", "CASSANDRA_4")) .setup(cluster -> { + allowedErrors.clear(); + cluster.setUncaughtExceptionsFilter(allowedErrors::uncaughtExceptionsFilter); + cluster.schemaChange(String.format("CREATE TABLE %s.%s (k int PRIMARY KEY, v1 int, v2 int)", KEYSPACE, T_REGULAR)); cluster.schemaChange(String.format("CREATE TABLE %s.%s (k int, c int, v1 int, v2 int, PRIMARY KEY (k, c))", KEYSPACE, T_CLUST)); cluster.schemaChange(String.format("CREATE TABLE %s.%s (k int, c int, v1 int static, v2 int, PRIMARY KEY (k, c))", KEYSPACE, T_STATIC)); @@ -100,7 +131,10 @@ static void testTTLOverflow(RunOnClusterAndNode runAfterNodeUpgrade) throws Thro clusterStatupTime = Clock.Global.currentTimeMillis(); verify(Step.NODE1_PREV_NODE2_PREV, cluster, true); }) - .runAfterNodeUpgrade(runAfterNodeUpgrade) + .runAfterNodeUpgrade((c, n) -> { + allowedErrors.upgraded(n); + runAfterNodeUpgrade.run(c, n); + }) .run(); } diff --git a/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModeWritetimeOrTTLTest.java b/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModeWritetimeOrTTLTest.java index 5979fae79bad..ad7143b1582a 100644 --- a/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModeWritetimeOrTTLTest.java +++ b/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModeWritetimeOrTTLTest.java @@ -20,9 +20,11 @@ import java.util.Arrays; import java.util.List; +import java.util.Set; import org.junit.Test; +import org.agrona.collections.IntHashSet; import org.apache.cassandra.distributed.api.Feature; import org.apache.cassandra.distributed.api.ICoordinator; import org.assertj.core.api.Assertions; @@ -40,9 +42,14 @@ */ public class MixedModeWritetimeOrTTLTest extends UpgradeTestBase { + private static final String CANNOT_USE_SELECTION_FUNCTION_WRITE_TIME_ON_NON_FROZEN_COLLECTION_S = "Cannot use selection function writeTime on non-frozen collection s"; + private static final String CANNOT_USE_SELECTION_FUNCTION_TTL_ON_NON_FROZEN_COLLECTION_S = "Cannot use selection function ttl on non-frozen collection s"; + private static final String MAXWRITETIME_UNKNOWN = "Unknown function 'maxwritetime'"; + @Test public void testWritetimeOrTTLDuringUpgrade() throws Throwable { + AllowedErrors allowedErrors = new AllowedErrors(); new TestCase() .nodes(2) .nodesToUpgradeOrdered(1, 2) @@ -51,6 +58,8 @@ public void testWritetimeOrTTLDuringUpgrade() throws Throwable .singleUpgradeToCurrentFrom(v41) .withConfig(c -> c.with(Feature.GOSSIP)) .setup(cluster -> { + allowedErrors.clear(); + cluster.setUncaughtExceptionsFilter(allowedErrors::uncaughtExceptionsFilter); ICoordinator coordinator = cluster.coordinator(1); cluster.schemaChange(withKeyspace("CREATE TABLE %s.t (k int PRIMARY KEY, v int, s set, fs frozen>)")); @@ -61,6 +70,7 @@ public void testWritetimeOrTTLDuringUpgrade() throws Throwable assertPre42Behaviour(cluster.coordinator(2)); }) .runAfterNodeUpgrade((cluster, node) -> { + allowedErrors.upgraded(node); if (node == 1) // only node1 is upgraded, and the cluster is in mixed mode { assertPost42Behaviour(cluster.coordinator(1)); @@ -80,24 +90,24 @@ private void assertPre42Behaviour(ICoordinator coordinator) // regular column, supported except for maxwritetime assertRows(coordinator.execute(withKeyspace("SELECT writetime(v) FROM %s.t"), ALL), row(2L)); Assertions.assertThatThrownBy(() -> coordinator.execute(withKeyspace("SELECT maxwritetime(v) FROM %s.t"), ALL)) - .hasMessageContaining("Unknown function 'maxwritetime'"); + .hasMessageContaining(MAXWRITETIME_UNKNOWN); Assertions.assertThat((Integer) coordinator.execute(withKeyspace("SELECT ttl(v) FROM %s.t"), ALL)[0][0]) .isLessThanOrEqualTo(2000).isGreaterThan(2000 - 300); // margin of error of 5 minutes since TTLs decrease // frozen collection, supported except for maxwritetime assertRows(coordinator.execute(withKeyspace("SELECT writetime(fs) FROM %s.t"), ALL), row(1L)); Assertions.assertThatThrownBy(() -> coordinator.execute(withKeyspace("SELECT maxwritetime(fs) FROM %s.t"), ALL)) - .hasMessageContaining("Unknown function 'maxwritetime'"); + .hasMessageContaining(MAXWRITETIME_UNKNOWN); Assertions.assertThat((Integer) coordinator.execute(withKeyspace("SELECT ttl(fs) FROM %s.t"), ALL)[0][0]) .isLessThanOrEqualTo(1000).isGreaterThan(1000 - 300); // margin of error of 5 minutes since TTLs decrease // not-frozen collection, not supported Assertions.assertThatThrownBy(() -> coordinator.execute(withKeyspace("SELECT writetime(s) FROM %s.t"), ALL)) - .hasMessageContaining("Cannot use selection function writeTime on non-frozen collection s"); + .hasMessageContaining(CANNOT_USE_SELECTION_FUNCTION_WRITE_TIME_ON_NON_FROZEN_COLLECTION_S); Assertions.assertThatThrownBy(() -> coordinator.execute(withKeyspace("SELECT maxwritetime(s) FROM %s.t"), ALL)) - .hasMessageContaining("Unknown function 'maxwritetime'"); + .hasMessageContaining(MAXWRITETIME_UNKNOWN); Assertions.assertThatThrownBy(() -> coordinator.execute(withKeyspace("SELECT ttl(s) FROM %s.t"), ALL)) - .hasMessageContaining("Cannot use selection function ttl on non-frozen collection s"); + .hasMessageContaining(CANNOT_USE_SELECTION_FUNCTION_TTL_ON_NON_FROZEN_COLLECTION_S); } private void assertPost42Behaviour(ICoordinator coordinator) @@ -120,4 +130,34 @@ private void assertPost42Behaviour(ICoordinator coordinator) Assertions.assertThat(coordinator.execute(withKeyspace("SELECT ttl(s) FROM %s.t"), ALL)[0][0]) .matches(l -> l instanceof List && ((List) l).size() == 4); } + + private static class AllowedErrors + { + private static final Set EXPECTED_ERRORS = Set.of(MAXWRITETIME_UNKNOWN, + CANNOT_USE_SELECTION_FUNCTION_WRITE_TIME_ON_NON_FROZEN_COLLECTION_S, + CANNOT_USE_SELECTION_FUNCTION_TTL_ON_NON_FROZEN_COLLECTION_S); + + private final IntHashSet upgraded = new IntHashSet(); + + private void clear() + { + upgraded.clear(); + } + + private void upgraded(int node) + { + upgraded.add(node); + } + + private boolean uncaughtExceptionsFilter(int node, Throwable t) + { + String message = t.getMessage(); + if (message != null && EXPECTED_ERRORS.contains(message)) + { + // upgraded nodes should not produce these errors + return !upgraded.contains(node); + } + return false; + } + } } diff --git a/test/distributed/org/apache/cassandra/distributed/upgrade/UpgradeTestBase.java b/test/distributed/org/apache/cassandra/distributed/upgrade/UpgradeTestBase.java index 6e6aaaef7669..33c4139878be 100644 --- a/test/distributed/org/apache/cassandra/distributed/upgrade/UpgradeTestBase.java +++ b/test/distributed/org/apache/cassandra/distributed/upgrade/UpgradeTestBase.java @@ -53,6 +53,7 @@ import org.apache.cassandra.utils.ByteBufferUtil; import org.apache.cassandra.utils.SimpleGraph; +import static org.apache.cassandra.config.CassandraRelevantProperties.DTEST_ACCORD_ENABLED; import static org.apache.cassandra.config.CassandraRelevantProperties.SKIP_GC_INSPECTOR; import static org.apache.cassandra.distributed.shared.Versions.Version; import static org.apache.cassandra.distributed.shared.Versions.find; @@ -74,6 +75,7 @@ public static void beforeClass() throws Throwable { ICluster.setup(); SKIP_GC_INSPECTOR.setBoolean(true); + DTEST_ACCORD_ENABLED.setBoolean(false); } diff --git a/test/distributed/org/apache/cassandra/distributed/util/QueryResultUtil.java b/test/distributed/org/apache/cassandra/distributed/util/QueryResultUtil.java index 6ac7c60fb073..bb6b975ca2b4 100644 --- a/test/distributed/org/apache/cassandra/distributed/util/QueryResultUtil.java +++ b/test/distributed/org/apache/cassandra/distributed/util/QueryResultUtil.java @@ -20,13 +20,16 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.List; +import java.util.Map; import java.util.Objects; +import java.util.function.Function; import java.util.function.Predicate; import com.google.monitoring.runtime.instrumentation.common.collect.Iterators; import org.apache.cassandra.distributed.api.QueryResults; import org.apache.cassandra.distributed.api.Row; import org.apache.cassandra.distributed.api.SimpleQueryResult; +import org.apache.cassandra.distributed.shared.AssertUtils; import org.apache.cassandra.tools.nodetool.formatter.TableBuilder; import org.assertj.core.api.Assertions; import org.assertj.core.data.Index; @@ -69,6 +72,33 @@ public static void orderBy(SimpleQueryResult qr, String... columns) }); } + @SuppressWarnings("unchecked") + public static SimpleQueryResult map(SimpleQueryResult input, Map> mapper) + { + if (input.toObjectArrays().length == 0 || mapper == null || mapper.isEmpty()) + return input; + for (String name : mapper.keySet()) + { + if (!input.names().contains(name)) + throw new IllegalArgumentException("Unable to find column " + name); + } + Object[][] rows = input.toObjectArrays().clone(); + List names = new ArrayList<>(mapper.keySet()); + int[] idxes = names.stream().mapToInt(input.names()::indexOf).toArray(); + for (int i = 0; i < rows.length; i++) + { + Object[] row = rows[i].clone(); + for (int j = 0; j < idxes.length; j++) + { + @SuppressWarnings("rawtypes") Function map = mapper.get(names.get(j)); + int idx = idxes[j]; + row[idx] = map.apply(row[idx]); + } + rows[i] = row; + } + return new SimpleQueryResult(input.names().toArray(new String[0]), rows, input.warnings()); + } + public static boolean contains(SimpleQueryResult qr, Object... values) { return contains(qr, a -> equals(a, values)); @@ -121,6 +151,7 @@ public static String expand(SimpleQueryResult qr) { StringBuilder sb = new StringBuilder(); int rowNum = 1; + qr.mark(); while (qr.hasNext()) { sb.append("@ Row ").append(rowNum).append('\n'); @@ -133,6 +164,7 @@ public static String expand(SimpleQueryResult qr) } sb.append(table); } + qr.reset(); return sb.toString(); } @@ -199,6 +231,41 @@ public SimpleQueryResultAssertHelper contains(Predicate fn) return this; } + public SimpleQueryResultAssertHelper isEqualTo(SimpleQueryResult expectedResult) + { + qr.mark(); + expectedResult.mark(); + try + { + // org.apache.cassandra.distributed.shared.AssertUtils.assertRows has some issues with the error msg + // so rewrite to make sure to have a nicer msg + List otherNames = qr.names().isEmpty() ? expectedResult.names() : qr.names(); + Assertions.assertThat(otherNames).describedAs("Column names do not match").isEqualTo(qr.names()); + int rowId = 0; + while (qr.hasNext()) + { + if (!expectedResult.hasNext()) + throw new AssertionError("Unexpected row at index " + rowId + "; found " + Arrays.toString(qr.next().toObjectArray())); + Row next = qr.next(); + Row expected = expectedResult.next(); + if (!Arrays.equals(next.toObjectArray(), expected.toObjectArray())) + throw new AssertionError("Expected row " + rowId + " to be " + Arrays.toString(expected.toObjectArray()) + " but was " + Arrays.toString(next.toObjectArray())); + + rowId++; + } + if (expectedResult.hasNext()) + throw new AssertionError("Expected row " + rowId + " to be " + Arrays.toString(expectedResult.next().toObjectArray()) + " but was missing"); + + AssertUtils.assertRows(qr, expectedResult); + } + finally + { + qr.reset(); + expectedResult.reset(); + } + return this; + } + public SimpleQueryResultAssertHelper isEqualTo(Object... values) { Assertions.assertThat(qr.toObjectArrays()) @@ -207,6 +274,15 @@ public SimpleQueryResultAssertHelper isEqualTo(Object... values) return this; } + public SimpleQueryResultAssertHelper isDeepEqualTo(Object[][] values) + { + Object[][] results = qr.toObjectArrays(); + Assertions.assertThat(results) + .hasNumberOfRows(values.length) + .isDeepEqualTo(values); + return this; + } + public SimpleQueryResultAssertHelper hasSize(int size) { Assertions.assertThat(qr.toObjectArrays()).hasNumberOfRows(size); diff --git a/test/distributed/org/apache/cassandra/fuzz/ring/ConsistentBootstrapTest.java b/test/distributed/org/apache/cassandra/fuzz/ring/ConsistentBootstrapTest.java index 57c12b768471..08af4f4d4b9e 100644 --- a/test/distributed/org/apache/cassandra/fuzz/ring/ConsistentBootstrapTest.java +++ b/test/distributed/org/apache/cassandra/fuzz/ring/ConsistentBootstrapTest.java @@ -19,6 +19,8 @@ package org.apache.cassandra.fuzz.ring; import java.util.concurrent.Callable; +import java.util.concurrent.atomic.AtomicReference; +import java.util.function.Consumer; import org.junit.Assert; import org.junit.Test; @@ -32,6 +34,7 @@ import org.apache.cassandra.distributed.api.IInstanceConfig; import org.apache.cassandra.distributed.api.IInvokableInstance; import org.apache.cassandra.distributed.api.TokenSupplier; +import org.apache.cassandra.distributed.shared.ClusterUtils; import org.apache.cassandra.distributed.shared.NetworkTopology; import org.apache.cassandra.distributed.test.log.FuzzTestBase; import org.apache.cassandra.harry.SchemaSpec; @@ -66,17 +69,16 @@ public class ConsistentBootstrapTest extends FuzzTestBase public void bootstrapFuzzTest() throws Throwable { Generator schemaGen = SchemaGenerators.schemaSpecGen(KEYSPACE, "bootstrap_fuzz", 1000); - IInvokableInstance forShutdown = null; try (Cluster cluster = builder().withNodes(3) .withTokenSupplier(TokenSupplier.evenlyDistributedTokens(4)) .withNodeIdTopology(NetworkTopology.singleDcNetworkTopology(4, "dc0", "rack0")) .withConfig((config) -> config.with(Feature.NETWORK, Feature.GOSSIP) .set("write_request_timeout", "10s") .set("metadata_snapshot_frequency", 5)) - .start()) + .start(); + CloseableRef forShutdown = new CloseableRef<>(ClusterUtils::unpauseCommits)) { IInvokableInstance cmsInstance = cluster.get(1); - forShutdown = cmsInstance; waitForCMSToQuiesce(cluster, cmsInstance); withRandom(rng -> { @@ -107,6 +109,7 @@ public void bootstrapFuzzTest() throws Throwable .set(Constants.KEY_DTEST_FULL_STARTUP, true); IInvokableInstance newInstance = cluster.bootstrap(config); + forShutdown.set(cmsInstance); // Prime the CMS node to pause before the finish join event is committed Callable pending = pauseBeforeCommit(cmsInstance, (e) -> e instanceof PrepareJoin.FinishJoin); new Thread(() -> newInstance.startup()).start(); @@ -123,6 +126,7 @@ public void bootstrapFuzzTest() throws Throwable // wait for the cluster to all witness the finish join event unpauseCommits(cmsInstance); + forShutdown.set(null); waitForCMSToQuiesce(cluster, bootstrapVisible.call()); }, "Finish bootstrap"); writeAndValidate.run(); @@ -134,30 +138,42 @@ public void bootstrapFuzzTest() throws Throwable history); }); } - catch (Throwable t) + } + + public static class CloseableRef extends AtomicReference implements AutoCloseable + { + private final Consumer onClose; + + public CloseableRef(Consumer onClose) { - if (forShutdown != null) - unpauseCommits(forShutdown); - throw t; + this.onClose = onClose; } - } + @Override + public void close() throws Exception + { + T v = getAndSet(null); + if (v != null) + onClose.accept(v); + } + } @Test public void coordinatorIsBehindTest() throws Throwable { Generator schemaGen = SchemaGenerators.schemaSpecGen(KEYSPACE, "coordinator_is_behind", 1000); - IInvokableInstance forShutdown = null; try (Cluster cluster = builder().withNodes(3) .withTokenSupplier(TokenSupplier.evenlyDistributedTokens(4)) .withNodeIdTopology(NetworkTopology.singleDcNetworkTopology(4, "dc0", "rack0")) .withConfig((config) -> config.with(Feature.NETWORK, Feature.GOSSIP) .set("write_request_timeout", "10s") + .set("accord.enabled", false) + .set("cms_await_timeout", "60s") .set("metadata_snapshot_frequency", 5)) - .start()) + .start(); + CloseableRef forShutdown = new CloseableRef<>(ClusterUtils::unpauseCommits)) { IInvokableInstance cmsInstance = cluster.get(1); - forShutdown = cmsInstance; waitForCMSToQuiesce(cluster, cmsInstance); withRandom(rng -> { @@ -195,6 +211,7 @@ public void coordinatorIsBehindTest() throws Throwable // Prime the CMS node to pause before the finish join event is committed Callable pending = pauseBeforeCommit(cmsInstance, (e) -> e instanceof PrepareJoin.MidJoin); + forShutdown.set(cmsInstance); IInstanceConfig config = cluster.newInstanceConfig() .set("auto_bootstrap", true) .set(Constants.KEY_DTEST_FULL_STARTUP, true) @@ -265,14 +282,9 @@ public void coordinatorIsBehindTest() throws Throwable cluster.filters().reset(); unpauseCommits(cmsInstance); + forShutdown.set(null); startup.join(); }); } - catch (Throwable t) - { - if (forShutdown != null) - unpauseCommits(forShutdown); - throw t; - } } } \ No newline at end of file diff --git a/test/distributed/org/apache/cassandra/fuzz/sai/AccordFullMultiNodeSAITest.java b/test/distributed/org/apache/cassandra/fuzz/sai/AccordFullMultiNodeSAITest.java new file mode 100644 index 000000000000..291ef7dcaa13 --- /dev/null +++ b/test/distributed/org/apache/cassandra/fuzz/sai/AccordFullMultiNodeSAITest.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.fuzz.sai; + +import org.junit.Ignore; + +import org.apache.cassandra.harry.SchemaSpec; +import org.apache.cassandra.harry.gen.Generator; +import org.apache.cassandra.harry.gen.SchemaGenerators; +import org.apache.cassandra.service.consensus.TransactionalMode; + +@Ignore("It was believed that these tests were failing due to CASSANDRA-20567, but in fixing that issue it was found that the tests are still failing! Harry is detecting an incorrect response...") +public class AccordFullMultiNodeSAITest extends MultiNodeSAITestBase +{ + public AccordFullMultiNodeSAITest() + { + super(TransactionalMode.full); + } + + @Override + protected Generator schemaGenerator(boolean disableReadRepair) + { + return SchemaGenerators.schemaSpecGen(KEYSPACE, "basic_sai", MAX_PARTITION_SIZE, + SchemaSpec.optionsBuilder().disableReadRepair(disableReadRepair).withTransactionalMode(TransactionalMode.full)); + } +} diff --git a/test/distributed/org/apache/cassandra/fuzz/sai/AccordFullSingleNodeSAITest.java b/test/distributed/org/apache/cassandra/fuzz/sai/AccordFullSingleNodeSAITest.java new file mode 100644 index 000000000000..d09b352d777c --- /dev/null +++ b/test/distributed/org/apache/cassandra/fuzz/sai/AccordFullSingleNodeSAITest.java @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.fuzz.sai; + +import org.apache.cassandra.harry.SchemaSpec; +import org.apache.cassandra.harry.gen.Generator; +import org.apache.cassandra.harry.gen.SchemaGenerators; +import org.apache.cassandra.service.consensus.TransactionalMode; + +public class AccordFullSingleNodeSAITest extends SingleNodeSAITestBase +{ + public AccordFullSingleNodeSAITest() + { + super(TransactionalMode.full); + } + + @Override + protected Generator schemaGenerator(boolean disableReadRepair) + { + return SchemaGenerators.schemaSpecGen(KEYSPACE, "basic_sai", MAX_PARTITION_SIZE, SchemaSpec.optionsBuilder().withTransactionalMode(TransactionalMode.full)); + } +} diff --git a/test/distributed/org/apache/cassandra/fuzz/sai/AccordInteropMultiNodeSAITest.java b/test/distributed/org/apache/cassandra/fuzz/sai/AccordInteropMultiNodeSAITest.java new file mode 100644 index 000000000000..2d665eb47d2c --- /dev/null +++ b/test/distributed/org/apache/cassandra/fuzz/sai/AccordInteropMultiNodeSAITest.java @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.fuzz.sai; + +import org.junit.Ignore; + +import org.apache.cassandra.harry.SchemaSpec; +import org.apache.cassandra.harry.gen.Generator; +import org.apache.cassandra.harry.gen.SchemaGenerators; +import org.apache.cassandra.service.consensus.TransactionalMode; + +@Ignore("It was believed that these tests were failing due to CASSANDRA-20567, but in fixing that issue it was found that the tests are still failing! Harry is detecting an incorrect response...") +public class AccordInteropMultiNodeSAITest extends MultiNodeSAITestBase +{ + public AccordInteropMultiNodeSAITest() + { + super(TransactionalMode.test_interop_read); + } + + @Override + protected Generator schemaGenerator(boolean disableReadRepair) + { + return SchemaGenerators.schemaSpecGen(KEYSPACE, "basic_sai", MAX_PARTITION_SIZE, + SchemaSpec.optionsBuilder().disableReadRepair(disableReadRepair) + .withTransactionalMode(TransactionalMode.test_interop_read)); + } +} diff --git a/test/distributed/org/apache/cassandra/fuzz/sai/AccordInteropSingleNodeSAITest.java b/test/distributed/org/apache/cassandra/fuzz/sai/AccordInteropSingleNodeSAITest.java new file mode 100644 index 000000000000..383e2c647014 --- /dev/null +++ b/test/distributed/org/apache/cassandra/fuzz/sai/AccordInteropSingleNodeSAITest.java @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.fuzz.sai; + +import org.apache.cassandra.harry.SchemaSpec; +import org.apache.cassandra.harry.gen.Generator; +import org.apache.cassandra.harry.gen.SchemaGenerators; +import org.apache.cassandra.service.consensus.TransactionalMode; + +public class AccordInteropSingleNodeSAITest extends SingleNodeSAITestBase +{ + public AccordInteropSingleNodeSAITest() + { + super(TransactionalMode.test_interop_read); + } + + @Override + protected Generator schemaGenerator(boolean disableReadRepair) + { + return SchemaGenerators.schemaSpecGen(KEYSPACE, "basic_sai", MAX_PARTITION_SIZE, + SchemaSpec.optionsBuilder().withTransactionalMode(TransactionalMode.test_interop_read)); + } +} diff --git a/test/distributed/org/apache/cassandra/fuzz/sai/MultiNodeSAITest.java b/test/distributed/org/apache/cassandra/fuzz/sai/MultiNodeSAITest.java index 9ca536921d92..bb94a2c30d52 100644 --- a/test/distributed/org/apache/cassandra/fuzz/sai/MultiNodeSAITest.java +++ b/test/distributed/org/apache/cassandra/fuzz/sai/MultiNodeSAITest.java @@ -22,6 +22,6 @@ public class MultiNodeSAITest extends MultiNodeSAITestBase { public MultiNodeSAITest() { - super(); + super(null); } } \ No newline at end of file diff --git a/test/distributed/org/apache/cassandra/fuzz/sai/MultiNodeSAITestBase.java b/test/distributed/org/apache/cassandra/fuzz/sai/MultiNodeSAITestBase.java index 860e779970f0..df94bf248705 100644 --- a/test/distributed/org/apache/cassandra/fuzz/sai/MultiNodeSAITestBase.java +++ b/test/distributed/org/apache/cassandra/fuzz/sai/MultiNodeSAITestBase.java @@ -23,15 +23,16 @@ import org.apache.cassandra.distributed.Cluster; import org.apache.cassandra.distributed.test.sai.SAIUtil; import org.apache.cassandra.harry.SchemaSpec; +import org.apache.cassandra.service.consensus.TransactionalMode; import static org.apache.cassandra.distributed.api.Feature.GOSSIP; import static org.apache.cassandra.distributed.api.Feature.NETWORK; public abstract class MultiNodeSAITestBase extends SingleNodeSAITestBase { - public MultiNodeSAITestBase() + public MultiNodeSAITestBase(TransactionalMode transactionalMode) { - super(); + super(transactionalMode); } @BeforeClass diff --git a/test/distributed/org/apache/cassandra/fuzz/sai/SingleNodeSAITest.java b/test/distributed/org/apache/cassandra/fuzz/sai/SingleNodeSAITest.java index 37a9b1c184b9..44209ae20b32 100644 --- a/test/distributed/org/apache/cassandra/fuzz/sai/SingleNodeSAITest.java +++ b/test/distributed/org/apache/cassandra/fuzz/sai/SingleNodeSAITest.java @@ -22,6 +22,6 @@ public class SingleNodeSAITest extends SingleNodeSAITestBase { public SingleNodeSAITest() { - super(); + super(null); } } diff --git a/test/distributed/org/apache/cassandra/fuzz/sai/SingleNodeSAITestBase.java b/test/distributed/org/apache/cassandra/fuzz/sai/SingleNodeSAITestBase.java index b915d1748e17..0f0f6f96f2b4 100644 --- a/test/distributed/org/apache/cassandra/fuzz/sai/SingleNodeSAITestBase.java +++ b/test/distributed/org/apache/cassandra/fuzz/sai/SingleNodeSAITestBase.java @@ -49,6 +49,7 @@ import org.apache.cassandra.harry.gen.Generators; import org.apache.cassandra.harry.gen.SchemaGenerators; import org.apache.cassandra.index.sai.utils.IndexTermType; +import org.apache.cassandra.service.consensus.TransactionalMode; import static org.apache.cassandra.distributed.api.Feature.GOSSIP; import static org.apache.cassandra.distributed.api.Feature.NETWORK; @@ -71,14 +72,19 @@ public abstract class SingleNodeSAITestBase extends TestBaseImpl private static final int NUM_PARTITIONS = 64; private static final int NUM_VISITED_PARTITIONS = 16; - private static final int MAX_PARTITION_SIZE = 2000; + protected static final int MAX_PARTITION_SIZE = 2000; private static final int UNIQUE_CELL_VALUES = 5; protected static final Logger logger = LoggerFactory.getLogger(SingleNodeSAITest.class); protected static Cluster cluster; - protected SingleNodeSAITestBase() {} + protected TransactionalMode transactionalMode; + + protected SingleNodeSAITestBase(TransactionalMode transactionalMode) + { + this.transactionalMode = transactionalMode; + } @BeforeClass public static void before() throws Throwable @@ -169,121 +175,130 @@ private void saiTest(EntropySource rng, SchemaSpec schema, Supplier cre cluster.schemaChange(schema.compile()); cluster.schemaChange(schema.compile().replace(schema.keyspace + '.' + schema.table, schema.keyspace + ".debug_table")); - AtomicInteger indexCount = new AtomicInteger(); - - Streams.concat(schema.clusteringKeys.stream(), schema.regularColumns.stream(), schema.staticColumns.stream()) - .forEach(column -> { - if (createIndex.get()) - { - logger.info("Adding index to column {}...", column.name); - cluster.schemaChange(String.format("CREATE INDEX %s_sai_idx ON %s.%s (%s) USING 'sai' ", - column.name, schema.keyspace, schema.table, column.name)); - indexCount.incrementAndGet(); - } - else - { - logger.info("Leaving column {} unindexed...", column.name); - } - }); - - CassandraRelevantProperties.SAI_INTERSECTION_CLAUSE_LIMIT.setInt(indexCount.get()); - waitForIndexesQueryable(schema); - - HistoryBuilder history = new ReplayingHistoryBuilder(schema.valueGenerators, - (hb) -> InJvmDTestVisitExecutor.builder() - .pageSizeSelector(pageSizeSelector(rng)) - .consistencyLevel(consistencyLevelSelector()) - .doubleWriting(schema, hb, cluster, "debug_table")); - Set partitions = new HashSet<>(); - int attempts = 0; - while (partitions.size() < NUM_VISITED_PARTITIONS && attempts < NUM_VISITED_PARTITIONS * 10) + try { - partitions.add(globalPkGen.generate(rng)); - attempts++; - } - - if (partitions.size() < NUM_VISITED_PARTITIONS) - logger.warn("Unable to generate {} partitions to visit. Continuing with {}...", NUM_VISITED_PARTITIONS, partitions.size()); + AtomicInteger indexCount = new AtomicInteger(); + + Streams.concat(schema.clusteringKeys.stream(), schema.regularColumns.stream(), schema.staticColumns.stream()) + .forEach(column -> { + if (createIndex.get()) + { + logger.info("Adding index to column {}...", column.name); + cluster.schemaChange(String.format("CREATE INDEX %s_sai_idx ON %s.%s (%s) USING 'sai' ", + column.name, schema.keyspace, schema.table, column.name)); + indexCount.incrementAndGet(); + } + else + { + logger.info("Leaving column {} unindexed...", column.name); + } + }); + + CassandraRelevantProperties.SAI_INTERSECTION_CLAUSE_LIMIT.setInt(indexCount.get()); + waitForIndexesQueryable(schema); + + HistoryBuilder history = new ReplayingHistoryBuilder(schema.valueGenerators, + (hb) -> InJvmDTestVisitExecutor.builder() + .pageSizeSelector(pageSizeSelector(rng)) + .consistencyLevel(consistencyLevelSelector()) + .doubleWriting(schema, hb, cluster, "debug_table")); + Set partitions = new HashSet<>(); + int attempts = 0; + while (partitions.size() < NUM_VISITED_PARTITIONS && attempts < NUM_VISITED_PARTITIONS * 10) + { + partitions.add(globalPkGen.generate(rng)); + attempts++; + } - Generator pkGen = Generators.pick(List.copyOf(partitions)); + if (partitions.size() < NUM_VISITED_PARTITIONS) + logger.warn("Unable to generate {} partitions to visit. Continuing with {}...", NUM_VISITED_PARTITIONS, partitions.size()); - // Ensure that we don't attempt to use range queries against SAI indexes that don't support them: - Set eqOnlyRegularColumns = new HashSet<>(); - for (int i = 0; i < schema.regularColumns.size(); i++) - if (IndexTermType.isEqOnlyType(schema.regularColumns.get(i).type.asServerType())) - eqOnlyRegularColumns.add(i); + Generator pkGen = Generators.pick(List.copyOf(partitions)); - Set eqOnlyStaticColumns = new HashSet<>(); - for (int i = 0; i < schema.staticColumns.size(); i++) - if (IndexTermType.isEqOnlyType(schema.staticColumns.get(i).type.asServerType())) - eqOnlyStaticColumns.add(i); + // Ensure that we don't attempt to use range queries against SAI indexes that don't support them: + Set eqOnlyRegularColumns = new HashSet<>(); + for (int i = 0; i < schema.regularColumns.size(); i++) + if (IndexTermType.isEqOnlyType(schema.regularColumns.get(i).type.asServerType())) + eqOnlyRegularColumns.add(i); - Set eqOnlyClusteringColumns = new HashSet<>(); - for (int i = 0; i < schema.clusteringKeys.size(); i++) - if (IndexTermType.isEqOnlyType(schema.clusteringKeys.get(i).type.asServerType())) - eqOnlyClusteringColumns.add(i); + Set eqOnlyStaticColumns = new HashSet<>(); + for (int i = 0; i < schema.staticColumns.size(); i++) + if (IndexTermType.isEqOnlyType(schema.staticColumns.get(i).type.asServerType())) + eqOnlyStaticColumns.add(i); - for (int i = 0; i < OPERATIONS_PER_RUN; i++) - { - int partitionIndex = pkGen.generate(rng); - HistoryBuilderHelper.insertRandomData(schema, partitionIndex, ckGen.generate(rng), rng, 0.5d, history); + Set eqOnlyClusteringColumns = new HashSet<>(); + for (int i = 0; i < schema.clusteringKeys.size(); i++) + if (IndexTermType.isEqOnlyType(schema.clusteringKeys.get(i).type.asServerType())) + eqOnlyClusteringColumns.add(i); - if (rng.nextFloat() > 0.99f) + for (int i = 0; i < OPERATIONS_PER_RUN; i++) { - int row1 = ckGen.generate(rng); - int row2 = ckGen.generate(rng); - history.deleteRowRange(partitionIndex, - Math.min(row1, row2), - Math.max(row1, row2), - rng.nextInt(schema.clusteringKeys.size()), - rng.nextBoolean(), - rng.nextBoolean()); - } + int partitionIndex = pkGen.generate(rng); + HistoryBuilderHelper.insertRandomData(schema, partitionIndex, ckGen.generate(rng), rng, 0.5d, history); - if (rng.nextFloat() > 0.995f) - HistoryBuilderHelper.deleteRandomColumns(schema, partitionIndex, ckGen.generate(rng), rng, history); + if (rng.nextFloat() > 0.99f) + { + int row1 = ckGen.generate(rng); + int row2 = ckGen.generate(rng); + history.deleteRowRange(partitionIndex, + Math.min(row1, row2), + Math.max(row1, row2), + rng.nextInt(schema.clusteringKeys.size()), + rng.nextBoolean(), + rng.nextBoolean()); + } - if (rng.nextFloat() > 0.9995f) - history.deletePartition(partitionIndex); + if (rng.nextFloat() > 0.995f) + HistoryBuilderHelper.deleteRandomColumns(schema, partitionIndex, ckGen.generate(rng), rng, history); - if (i % FLUSH_SKIP == 0) - history.custom(() -> flush(schema), "Flush"); - else if (i % COMPACTION_SKIP == 0) - history.custom(() -> compact(schema), "Compact"); - else if (i % repairSkip == 0) - history.custom(() -> repair(schema), "Repair"); + if (rng.nextFloat() > 0.9995f) + history.deletePartition(partitionIndex); - if (i > 0 && i % VALIDATION_SKIP == 0) - { - for (int j = 0; j < QUERIES_PER_VALIDATION; j++) + if (i % FLUSH_SKIP == 0) + history.custom(() -> flush(schema), "Flush"); + else if (i % COMPACTION_SKIP == 0) + history.custom(() -> compact(schema), "Compact"); + else if (i % repairSkip == 0) + history.custom(() -> repair(schema), "Repair"); + + if (i > 0 && i % VALIDATION_SKIP == 0) { - List regularRelations = - HistoryBuilderHelper.generateValueRelations(rng, - schema.regularColumns.size(), - column -> Math.min(schema.valueGenerators.regularPopulation(column), UNIQUE_CELL_VALUES), - eqOnlyRegularColumns::contains); - - List staticRelations = - HistoryBuilderHelper.generateValueRelations(rng, - schema.staticColumns.size(), - column -> Math.min(schema.valueGenerators.staticPopulation(column), UNIQUE_CELL_VALUES), - eqOnlyStaticColumns::contains); - - Integer pk = pkGen.generate(rng); - - IdxRelation[] ckRelations = - HistoryBuilderHelper.generateClusteringRelations(rng, - schema.clusteringKeys.size(), - ckGen, - eqOnlyClusteringColumns).toArray(new IdxRelation[0]); - - IdxRelation[] regularRelationsArray = regularRelations.toArray(new IdxRelation[regularRelations.size()]); - IdxRelation[] staticRelationsArray = staticRelations.toArray(new IdxRelation[staticRelations.size()]); - - history.select(pk, ckRelations, regularRelationsArray, staticRelationsArray); + for (int j = 0; j < QUERIES_PER_VALIDATION; j++) + { + List regularRelations = + HistoryBuilderHelper.generateValueRelations(rng, + schema.regularColumns.size(), + column -> Math.min(schema.valueGenerators.regularPopulation(column), UNIQUE_CELL_VALUES), + eqOnlyRegularColumns::contains); + + List staticRelations = + HistoryBuilderHelper.generateValueRelations(rng, + schema.staticColumns.size(), + column -> Math.min(schema.valueGenerators.staticPopulation(column), UNIQUE_CELL_VALUES), + eqOnlyStaticColumns::contains); + + Integer pk = pkGen.generate(rng); + + IdxRelation[] ckRelations = + HistoryBuilderHelper.generateClusteringRelations(rng, + schema.clusteringKeys.size(), + ckGen, + eqOnlyClusteringColumns).toArray(new IdxRelation[0]); + + IdxRelation[] regularRelationsArray = regularRelations.toArray(new IdxRelation[regularRelations.size()]); + IdxRelation[] staticRelationsArray = staticRelations.toArray(new IdxRelation[staticRelations.size()]); + + history.select(pk, ckRelations, regularRelationsArray, staticRelationsArray); + } } } } + finally + { + // Any Accord tables must be dropped before the keyspace is dropped: + cluster.schemaChange("DROP TABLE IF EXISTS " + KEYSPACE + '.' + schema.table); + cluster.schemaChange("DROP TABLE IF EXISTS " + KEYSPACE + '.' + "debug_table"); + } } protected Generator schemaGenerator(boolean disableReadRepair) @@ -338,8 +353,13 @@ protected InJvmDTestVisitExecutor.ConsistencyLevelSelector consistencyLevelSelec return ConsistencyLevel.ALL; // The goal here is to make replicas as out of date as possible, modulo the efforts of repair - // and read-repair in the test itself. - return ConsistencyLevel.NODE_LOCAL; + // and read-repair in the test itself. node_local bypasses Accord which breaks any attempt at testing Accord + // so if we are running with Accord use QUORUM (which Accord will ignore since it runs with transactional + // mode full). + if (transactionalMode != null && transactionalMode.nonSerialWritesThroughAccord) + return ConsistencyLevel.QUORUM; + else + return ConsistencyLevel.NODE_LOCAL; }; } diff --git a/test/distributed/org/apache/cassandra/fuzz/topology/AccordBootstrapTest.java b/test/distributed/org/apache/cassandra/fuzz/topology/AccordBootstrapTest.java new file mode 100644 index 000000000000..77aaebfab7c4 --- /dev/null +++ b/test/distributed/org/apache/cassandra/fuzz/topology/AccordBootstrapTest.java @@ -0,0 +1,162 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.fuzz.topology; + +import java.util.HashSet; + +import org.junit.Test; + +import org.apache.cassandra.config.CassandraRelevantProperties; +import org.apache.cassandra.distributed.Cluster; +import org.apache.cassandra.distributed.Constants; +import org.apache.cassandra.distributed.api.ConsistencyLevel; +import org.apache.cassandra.distributed.api.Feature; +import org.apache.cassandra.distributed.api.IInstanceConfig; +import org.apache.cassandra.distributed.api.IInvokableInstance; +import org.apache.cassandra.distributed.api.TokenSupplier; +import org.apache.cassandra.distributed.shared.ClusterUtils; +import org.apache.cassandra.distributed.shared.NetworkTopology; +import org.apache.cassandra.distributed.test.log.FuzzTestBase; +import org.apache.cassandra.harry.SchemaSpec; +import org.apache.cassandra.harry.dsl.HistoryBuilder; +import org.apache.cassandra.harry.dsl.HistoryBuilderHelper; +import org.apache.cassandra.harry.dsl.ReplayingHistoryBuilder; +import org.apache.cassandra.harry.execution.InJvmDTestVisitExecutor; +import org.apache.cassandra.harry.execution.QueryBuildingVisitExecutor; +import org.apache.cassandra.harry.gen.Generator; +import org.apache.cassandra.harry.gen.Generators; +import org.apache.cassandra.harry.gen.Generators.TrackingGenerator; +import org.apache.cassandra.harry.gen.SchemaGenerators; +import org.apache.cassandra.service.consensus.TransactionalMode; + +import static org.apache.cassandra.distributed.shared.ClusterUtils.unpauseCommits; +import static org.apache.cassandra.distributed.shared.ClusterUtils.waitForCMSToQuiesce; +import static org.apache.cassandra.harry.checker.TestHelper.withRandom; + +public class AccordBootstrapTest extends FuzzTestBase +{ + private static final int WRITES = 10; + private static final int POPULATION = 1000; + + @Test + public void bootstrapFuzzTest() throws Throwable + { + CassandraRelevantProperties.SYSTEM_TRACES_DEFAULT_RF.setInt(3); + IInvokableInstance forShutdown = null; + try (Cluster cluster = builder().withNodes(3) + .withTokenSupplier(TokenSupplier.evenlyDistributedTokens(100)) + .withNodeIdTopology(NetworkTopology.singleDcNetworkTopology(100, "dc0", "rack0")) + .withConfig((config) -> config.with(Feature.NETWORK, Feature.GOSSIP) + .set("write_request_timeout", "2s") + .set("request_timeout", "5s") + .set("concurrent_accord_operations", 2) + .set("progress_barrier_min_consistency_level", "QUORUM") + .set("progress_barrier_default_consistency_level", "QUORUM") + .set("metadata_snapshot_frequency", 5)) + .start()) + { + IInvokableInstance cmsInstance = cluster.get(1); + forShutdown = cmsInstance; + waitForCMSToQuiesce(cluster, cmsInstance); + + HashSet downInstances = new HashSet<>(); + withRandom(rng -> { + Generator schemaGen = SchemaGenerators.trivialSchema(KEYSPACE, "bootstrap_fuzz", POPULATION, + SchemaSpec.optionsBuilder() + .addWriteTimestamps(false) + .withTransactionalMode(TransactionalMode.full) + ); + + SchemaSpec schema = schemaGen.generate(rng); + TrackingGenerator pkGen = Generators.tracking(Generators.int32(0, Math.min(schema.valueGenerators.pkPopulation(), POPULATION))); + Generator ckGen = Generators.int32(0, Math.min(schema.valueGenerators.ckPopulation(), POPULATION)); + HistoryBuilder history = new ReplayingHistoryBuilder(schema.valueGenerators, + hb -> InJvmDTestVisitExecutor.builder() + .consistencyLevel(ConsistencyLevel.QUORUM) + .wrapQueries(QueryBuildingVisitExecutor.WrapQueries.TRANSACTION) + .pageSizeSelector(p -> InJvmDTestVisitExecutor.PageSizeSelector.NO_PAGING) + .nodeSelector(lts -> { + while (true) + { + int pick = rng.nextInt(1, cluster.size() + 1); + if (!downInstances.contains(pick)) + return pick; + + } + }) + .build(schema, hb, cluster)); + + Runnable writeAndValidate = () -> { + for (int i = 0; i < WRITES; i++) + HistoryBuilderHelper.insertRandomData(schema, pkGen, ckGen, rng, history); + + for (int pk : pkGen.generated()) + history.selectPartition(pk); + }; + + history.customThrowing(() -> { + cluster.schemaChange(String.format("CREATE KEYSPACE %s WITH replication = {'class': 'SimpleStrategy', 'replication_factor' : 3};", KEYSPACE)); + cluster.schemaChange(schema.compile()); + waitForCMSToQuiesce(cluster, cmsInstance); + }, "Setup"); + Thread.sleep(1000); + writeAndValidate.run(); + + history.customThrowing(() -> { + IInstanceConfig config = cluster.newInstanceConfig() + .set("auto_bootstrap", true) + .set(Constants.KEY_DTEST_FULL_STARTUP, true); + cluster.bootstrap(config).startup(); + waitForCMSToQuiesce(cluster, cmsInstance); + }, "Start boostrap"); + + writeAndValidate.run(); + + history.customThrowing(() -> { + downInstances.add(2); + ClusterUtils.stopUnchecked(cluster.get(2)); + cluster.get(1).logs().watchFor("/127.0.0.2:.* is now DOWN"); + }, "Shut down node 2"); + + history.customThrowing(() -> { + IInstanceConfig config = cluster.newInstanceConfig() + .set("auto_bootstrap", true) + .set(Constants.KEY_DTEST_FULL_STARTUP, true); + cluster.bootstrap(config).startup(); + waitForCMSToQuiesce(cluster, cmsInstance); + }, "Bootstrap one more"); + + writeAndValidate.run(); + + history.customThrowing(() -> { + cluster.get(2).startup(); + cluster.get(1).logs().watchFor("/127.0.0.2:.* is now UP"); + downInstances.remove(2); + }, "Start up node 2"); + + }); + } + catch (Throwable t) + { + if (forShutdown != null) + unpauseCommits(forShutdown); + throw t; + } + } +} diff --git a/test/distributed/org/apache/cassandra/fuzz/topology/AccordTopologyMixupTest.java b/test/distributed/org/apache/cassandra/fuzz/topology/AccordTopologyMixupTest.java new file mode 100644 index 000000000000..8d6bc0277331 --- /dev/null +++ b/test/distributed/org/apache/cassandra/fuzz/topology/AccordTopologyMixupTest.java @@ -0,0 +1,396 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.fuzz.topology; + +import java.nio.ByteBuffer; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.TreeMap; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.CopyOnWriteArrayList; +import java.util.stream.Collectors; +import java.util.stream.Stream; +import javax.annotation.Nullable; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import accord.coordinate.Exhausted; +import accord.coordinate.Preempted; +import accord.coordinate.Timeout; +import accord.local.Node; +import accord.primitives.Ranges; +import accord.primitives.TxnId; +import accord.utils.Gen; +import accord.utils.Gens; +import accord.utils.Invariants; +import accord.utils.Property; +import accord.utils.Property.Command; +import accord.utils.RandomSource; +import org.apache.cassandra.config.CassandraRelevantProperties; +import org.apache.cassandra.config.Config; +import org.apache.cassandra.cql3.ast.CQLFormatter; +import org.apache.cassandra.cql3.ast.Mutation; +import org.apache.cassandra.cql3.ast.Select; +import org.apache.cassandra.cql3.ast.StandardVisitors; +import org.apache.cassandra.cql3.ast.Statement; +import org.apache.cassandra.cql3.ast.Txn; +import org.apache.cassandra.db.marshal.AsciiType; +import org.apache.cassandra.db.marshal.BytesType; +import org.apache.cassandra.db.marshal.UTF8Type; +import org.apache.cassandra.db.virtual.AccordVirtualTables; +import org.apache.cassandra.distributed.Cluster; +import org.apache.cassandra.distributed.api.ConsistencyLevel; +import org.apache.cassandra.distributed.api.IInstanceConfig; +import org.apache.cassandra.distributed.api.IInvokableInstance; +import org.apache.cassandra.distributed.api.QueryResults; +import org.apache.cassandra.distributed.api.SimpleQueryResult; +import org.apache.cassandra.distributed.shared.ClusterUtils; +import org.apache.cassandra.distributed.test.accord.AccordTestBase; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.service.accord.api.AccordAgent; +import org.apache.cassandra.service.consensus.TransactionalMode; +import org.apache.cassandra.tools.nodetool.formatter.TableBuilder; +import org.apache.cassandra.utils.ASTGenerators; +import org.apache.cassandra.utils.AbstractTypeGenerators; +import org.apache.cassandra.utils.CassandraGenerators; +import org.apache.cassandra.utils.FastByteOperations; +import org.apache.cassandra.utils.Generators; +import org.apache.cassandra.utils.Isolated; +import org.apache.cassandra.utils.Retry; +import org.apache.cassandra.utils.Shared; +import org.quicktheories.generators.SourceDSL; + +import static org.apache.cassandra.schema.SchemaConstants.VIRTUAL_VIEWS; +import static org.apache.cassandra.utils.AbstractTypeGenerators.overridePrimitiveTypeSupport; +import static org.apache.cassandra.utils.AbstractTypeGenerators.stringComparator; +import static org.apache.cassandra.utils.AccordGenerators.fromQT; + +public class AccordTopologyMixupTest extends TopologyMixupTestBase +{ + private static final Logger logger = LoggerFactory.getLogger(AccordTopologyMixupTest.class); + + /** + * Should the history show the CQL? By default, this is off as its very verbose, but when debugging this can be helpful. + */ + private static boolean HISTORY_SHOWS_CQL = false; + + static + { + CassandraRelevantProperties.ACCORD_AGENT_CLASS.setString(InterceptAgent.class.getName()); + // enable most expensive debugging checks + CassandraRelevantProperties.ACCORD_KEY_PARANOIA_CPU.setString(Invariants.Paranoia.QUADRATIC.name()); + CassandraRelevantProperties.ACCORD_KEY_PARANOIA_MEMORY.setString(Invariants.Paranoia.QUADRATIC.name()); + CassandraRelevantProperties.ACCORD_KEY_PARANOIA_COSTFACTOR.setString(Invariants.ParanoiaCostFactor.HIGH.name()); + + overridePrimitiveTypeSupport(AsciiType.instance, AbstractTypeGenerators.TypeSupport.of(AsciiType.instance, SourceDSL.strings().ascii().ofLengthBetween(1, 10), stringComparator(AsciiType.instance))); + overridePrimitiveTypeSupport(UTF8Type.instance, AbstractTypeGenerators.TypeSupport.of(UTF8Type.instance, Generators.utf8(1, 10), stringComparator(UTF8Type.instance))); + overridePrimitiveTypeSupport(BytesType.instance, AbstractTypeGenerators.TypeSupport.of(BytesType.instance, Generators.bytes(1, 10), FastByteOperations::compareUnsigned)); + } + + private static final List TRANSACTIONAL_MODES = Stream.of(TransactionalMode.values()).filter(t -> t.accordIsEnabled).collect(Collectors.toList()); + + @Override + protected Gen> stateGen() + { + return AccordState::new; + } + + @Override + protected void preCheck(Property.StatefulBuilder builder) + { + // if a failing seed is detected, populate here + // Example: builder.withSeed(42L); + // HISTORY_SHOWS_CQL = true; // uncomment if the CQL done should be included in the history + } + + private static Spec createSchemaSpec(RandomSource rs, Cluster cluster) + { + TransactionalMode mode = rs.pick(TRANSACTIONAL_MODES); + // This test puts a focus on topology / cluster operations, so schema "shouldn't matter"... limit the domain of the test to improve the ability to debug + AbstractTypeGenerators.TypeGenBuilder supportedTypes = AbstractTypeGenerators.withoutUnsafeEquality(AbstractTypeGenerators.builder() + .withTypeKinds(AbstractTypeGenerators.TypeKind.PRIMITIVE)); + TableMetadata metadata = fromQT(new CassandraGenerators.TableMetadataBuilder() + .withKeyspaceName(KEYSPACE) + .withTableName("tbl") + .withTableKinds(TableMetadata.Kind.REGULAR) + .withKnownMemtables() + .withSimpleColumnNames() + //TODO (coverage): include "fast_path = 'keyspace'" override + .withTransactionalMode(mode) + .withDefaultTypeGen(supportedTypes) + .build()) + .next(rs); + maybeCreateUDTs(cluster, metadata); + String schemaCQL = metadata.toCqlString(false, false, false); + logger.info("Creating test table:\n{}", schemaCQL); + cluster.schemaChange(schemaCQL); + return new Spec(mode, metadata); + } + + private static CommandGen cqlOperations(Spec spec) + { + Gen select = (Gen) (Gen) fromQT(new ASTGenerators.SelectGenBuilder(spec.metadata).withLimit1().build()); + Gen mutation = (Gen) (Gen) fromQT(new ASTGenerators.MutationGenBuilder(spec.metadata).withoutTimestamp().withoutTtl().withAllowUpdateMultipleClusteringKeys(false).build()); + Gen txn = (Gen) (Gen) fromQT(new ASTGenerators.TxnGenBuilder(spec.metadata).build()); + Map, Integer> operations = new LinkedHashMap<>(); + operations.put(select, 1); + operations.put(mutation, 1); + operations.put(txn, 1); + Gen statementGen = Gens.oneOf(operations); + return (rs, state) -> cqlOperation(rs, state, statementGen); + } + + private static Command, Void, ?> cqlOperation(RandomSource rs, State state, Gen statementGen) + { + Statement stmt = statementGen.map(s -> { + if (s.kind() == Statement.Kind.TXN || s.kind() == Statement.Kind.MUTATION && ((Mutation) s).isCas()) + return s; + return s instanceof Select ? Txn.wrap((Select) s) : Txn.wrap((Mutation) s); + }).next(rs); + IInvokableInstance node = state.cluster.get(rs.pickInt(state.topologyHistory.up())); + String msg = HISTORY_SHOWS_CQL ? + "\n" + stmt.visit(StandardVisitors.DEBUG).toCQL(new CQLFormatter.PrettyPrint()) + "\n" + : stmt.kind() == Statement.Kind.MUTATION ? ((Mutation) stmt).mutationKind().name() : stmt.kind().name(); + return new Property.SimpleCommand<>(node + ":" + msg + "; epoch=" + state.currentEpoch.get(), s2 -> executeTxn(s2.cluster, node, stmt.toCQL(), stmt.bindsEncoded())); + } + + private static SimpleQueryResult executeTxn(Cluster cluster, IInvokableInstance node, String stmt, ByteBuffer[] binds) + { + if (!AccordTestBase.isIdempotent(node, stmt)) + { + // won't be able to retry... + return node.coordinator().executeWithResult(stmt, ConsistencyLevel.ANY, (Object[]) binds); + } + return AccordTestBase.executeWithRetry(cluster, node, stmt, (Object[]) binds); + } + + private static void maybeCreateUDTs(Cluster cluster, TableMetadata metadata) + { + CassandraGenerators.visitUDTs(metadata, next -> { + String cql = next.toCqlString(false, false, false); + logger.warn("Creating UDT {}", cql); + cluster.schemaChange(cql); + }); + } + + public static class Spec implements Schema + { + private final TransactionalMode mode; + private final TableMetadata metadata; + + public Spec(TransactionalMode mode, TableMetadata metadata) + { + this.mode = mode; + this.metadata = metadata; + } + + @Override + public String table() + { + return metadata.name; + } + + @Override + public String keyspace() + { + return metadata.keyspace; + } + + @Override + public String createSchema() + { + return metadata.toCqlString(false, false, false); + } + } + + private static class AccordState extends State + { + private final Map instanceEpochReadyState = new TreeMap<>(); + private final Map instanceEpochSyncState = new TreeMap<>(); + private final ListenerHolder listener; + + public AccordState(RandomSource rs) + { + super(rs, AccordTopologyMixupTest::createSchemaSpec, AccordTopologyMixupTest::cqlOperations); + + this.listener = new ListenerHolder(this); + this.preActions.add(this::populateEpochState); + } + + private void populateEpochState() + { + updateMap(instanceEpochReadyState, "SELECT * FROM " + VIRTUAL_VIEWS + "." + AccordVirtualTables.EPOCHS); + updateMap(instanceEpochSyncState, "SELECT * FROM " + VIRTUAL_VIEWS + "." + AccordVirtualTables.TABLE_EPOCHS); + } + + private void updateMap(Map map, String cql) + { + for (var inst : cluster) + { + int num = inst.config().num(); + if (inst.isShutdown()) + { + map.put(num, "unknown"); + continue; + } + try + { + SimpleQueryResult qr = Retry.retryWithBackoffBlocking(5, () -> cluster.get(num).executeInternalWithResult(cql)); + map.put(num, TableBuilder.toStringPiped(qr.names(), QueryResults.stringify(qr))); + } + catch (Throwable t) + { + // Throwable.toString shows the type + msg but not the stack trace + map.put(num, "unknown due to failure: " + t); + } + } + } + + @Override + protected void onConfigure(IInstanceConfig c) + { + c.set("accord.command_store_shard_count", 1) + .set("accord.queue_shard_count", 1) + .set("accord.shard_durability_target_splits", 4) + .set("concurrent_accord_operations", 1) + .set("paxos_variant", Config.PaxosVariant.v2.name()); + } + + @Override + protected void onStartupComplete(long tcmEpoch) + { + ClusterUtils.awaitAccordEpochReady(cluster, tcmEpoch); + } + + @Override + public String toString() + { + StringBuilder sb = new StringBuilder(super.toString()); + sb.append("\nAccord Epoch State:"); + for (var e : instanceEpochReadyState.entrySet()) + sb.append("\nnode").append(e.getKey()).append(":\n").append(e.getValue()); + sb.append("\nAccord Epoch Ranges:"); + for (var e : instanceEpochSyncState.entrySet()) + sb.append("\nnode").append(e.getKey()).append(":\n").append(e.getValue()); + return sb.toString(); + } + + @Override + public void close() throws Exception + { + listener.close(); + super.close(); + } + } + + public static class ListenerHolder implements AccordTopologyMixupTest.SharedState.Listener, AutoCloseable + { + private final Map debug = new ConcurrentHashMap<>(); + private final State state; + + public ListenerHolder(State state) + { + this.state = state; + AccordTopologyMixupTest.SharedState.listeners.add(this); + } + + @Override + public void debugTxn(Node.Id node, String type, TxnId txnId) + { + debug.putIfAbsent(txnId, () -> { + // this runs in the main thread, so is actually thread safe + int[] up = state.topologyHistory.up(); + logger.error("{} failed with txn id {}; global debug summary:\n{}", type, txnId, ClusterUtils.queryTxnStateAsString(state.cluster, txnId, up)); + debug.remove(txnId); + }); + } + + public void runTasks() + { + for (Runnable r : debug.values()) + { + try + { + r.run(); + } + catch (Throwable t) + { + // TODO (correctness): how to handle? + logger.error("Unhandled error in onError listeners", t); + } + } + } + + @Override + public void close() + { + runTasks(); + AccordTopologyMixupTest.SharedState.listeners.remove(this); + debug.clear(); + } + } + + @Shared + public static class SharedState + { + public interface Listener + { + void debugTxn(Node.Id node, String type, TxnId txnId); + } + + public static final CopyOnWriteArrayList listeners = new CopyOnWriteArrayList<>(); + + public static void debugTxn(@Nullable Integer node, String type, String id) + { + Node.Id nodeId = node == null ? null : new Node.Id(node); + TxnId txnId = TxnId.parse(id); + listeners.forEach(l -> l.debugTxn(nodeId, type, txnId)); + } + } + + @Isolated + public static class InterceptAgent extends AccordAgent + { + public InterceptAgent() + { + super(); + } + + { + AccordAgent.setOnFailedBarrier((id, cause) -> { + if (cause instanceof Timeout || cause instanceof Preempted) + { + SharedState.debugTxn(null, "Repair Barrier", id.toString()); + } + }); + } + @Override + public void onFailedBootstrap(int attempts, String phase, Ranges ranges, Runnable retry, Throwable failure) + { + if (failure instanceof Exhausted) + { + Exhausted e = (Exhausted) failure; + SharedState.debugTxn(self.id, "Bootstrap#" + phase, e.txnId().toString()); + } + super.onFailedBootstrap(attempts, phase, ranges, retry, failure); + } + } +} diff --git a/test/distributed/org/apache/cassandra/fuzz/topology/HarryOnAccordTopologyMixupTest.java b/test/distributed/org/apache/cassandra/fuzz/topology/HarryOnAccordTopologyMixupTest.java new file mode 100644 index 000000000000..f133540146e2 --- /dev/null +++ b/test/distributed/org/apache/cassandra/fuzz/topology/HarryOnAccordTopologyMixupTest.java @@ -0,0 +1,95 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.fuzz.topology; + + +import accord.utils.Gen; +import accord.utils.Invariants; +import accord.utils.Property; +import accord.utils.RandomSource; +import org.apache.cassandra.config.CassandraRelevantProperties; +import org.apache.cassandra.distributed.api.IInstanceConfig; +import org.apache.cassandra.distributed.shared.ClusterUtils; +import org.apache.cassandra.fuzz.topology.AccordTopologyMixupTest.ListenerHolder; +import org.apache.cassandra.service.consensus.TransactionalMode; + +public class HarryOnAccordTopologyMixupTest extends HarryTopologyMixupTest +{ + static + { + CassandraRelevantProperties.ACCORD_AGENT_CLASS.setString(AccordTopologyMixupTest.InterceptAgent.class.getName()); + // enable most expensive debugging checks + CassandraRelevantProperties.ACCORD_KEY_PARANOIA_CPU.setString(Invariants.Paranoia.QUADRATIC.name()); + CassandraRelevantProperties.ACCORD_KEY_PARANOIA_MEMORY.setString(Invariants.Paranoia.QUADRATIC.name()); + CassandraRelevantProperties.ACCORD_KEY_PARANOIA_COSTFACTOR.setString(Invariants.ParanoiaCostFactor.HIGH.name()); + } + + @Override + protected void preCheck(Property.StatefulBuilder builder) + { + // if a failing seed is detected, populate here + // Example: builder.withSeed(42L); + } + + @Override + protected Gen> stateGen() + { + return HarryOnAccordState::new; + } + + public HarryOnAccordTopologyMixupTest() + { + super(new AccordMode(AccordMode.Kind.Direct, TransactionalMode.full)); + } + + public class HarryOnAccordState extends HarryState + { + private final ListenerHolder listener; + + public HarryOnAccordState(RandomSource rs) + { + super(rs); + + this.listener = new ListenerHolder(this); + } + + @Override + protected void onConfigure(IInstanceConfig config) + { + super.onConfigure(config); + config.set("accord.command_store_shard_count", 1) + .set("accord.queue_shard_count", 1) + .set("accord.shard_durability_target_splits", 4) + .set("concurrent_accord_operations", 1); + } + + @Override + protected void onStartupComplete(long tcmEpoch) + { + ClusterUtils.awaitAccordEpochReady(cluster, tcmEpoch); + } + + @Override + public void close() throws Exception + { + listener.close(); + super.close(); + } + } +} diff --git a/test/distributed/org/apache/cassandra/fuzz/topology/HarryTopologyMixupTest.java b/test/distributed/org/apache/cassandra/fuzz/topology/HarryTopologyMixupTest.java index 6f83ae00136c..816c38f11a13 100644 --- a/test/distributed/org/apache/cassandra/fuzz/topology/HarryTopologyMixupTest.java +++ b/test/distributed/org/apache/cassandra/fuzz/topology/HarryTopologyMixupTest.java @@ -30,6 +30,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import accord.primitives.TxnId; import accord.utils.Gen; import accord.utils.Property; import accord.utils.Property.Command; @@ -38,16 +39,20 @@ import accord.utils.RandomSource; import org.apache.cassandra.distributed.Cluster; import org.apache.cassandra.distributed.api.IInstanceConfig; +import org.apache.cassandra.distributed.shared.ClusterUtils; import org.apache.cassandra.exceptions.RequestTimeoutException; import org.apache.cassandra.harry.SchemaSpec; import org.apache.cassandra.harry.dsl.HistoryBuilder; import org.apache.cassandra.harry.dsl.ReplayingHistoryBuilder; import org.apache.cassandra.harry.execution.InJvmDTestVisitExecutor; +import org.apache.cassandra.harry.execution.QueryBuildingVisitExecutor; import org.apache.cassandra.harry.gen.EntropySource; import org.apache.cassandra.harry.gen.Generator; import org.apache.cassandra.harry.gen.Generators; import org.apache.cassandra.harry.gen.SchemaGenerators; import org.apache.cassandra.harry.gen.rng.JdkRandomEntropySource; +import org.apache.cassandra.harry.op.Operations; +import org.apache.cassandra.service.consensus.TransactionalMode; import org.apache.cassandra.utils.AssertionUtils; import org.assertj.core.api.Condition; @@ -58,8 +63,32 @@ public class HarryTopologyMixupTest extends TopologyMixupTestBase TIMEOUT_CHECKER = AssertionUtils.isInstanceof(RequestTimeoutException.class); private static final Logger logger = LoggerFactory.getLogger(HarryTopologyMixupTest.class); + public static class AccordMode + { + public AccordMode(Kind kind, @Nullable TransactionalMode transactionalMode) + { + this.kind = kind; + this.transactionalMode = transactionalMode; + } + + public enum Kind + {None, Direct, Passthrough} + + public final Kind kind; + @Nullable + public final TransactionalMode transactionalMode; + } + + private final AccordMode mode; + public HarryTopologyMixupTest() { + this(new AccordMode(AccordMode.Kind.None, null)); + } + + protected HarryTopologyMixupTest(AccordMode mode) + { + this.mode = mode; } @Override @@ -86,16 +115,29 @@ protected void destroyState(State state, @Nullable Throwable cause) } } - private static BiFunction createSchemaSpec() + private static BiFunction createSchemaSpec(AccordMode mode) { return (rs, cluster) -> { EntropySource rng = new JdkRandomEntropySource(rs.nextLong()); - Generator schemaGen = SchemaGenerators.schemaSpecGen("harry", "table", 1000);; - SchemaSpec schema = schemaGen.generate(rng); + Generator schemaGen; + SchemaSpec schema; + if (mode.kind != AccordMode.Kind.None) + { + schemaGen = SchemaGenerators.schemaSpecGen("harry", "table", 1000, + SchemaSpec.optionsBuilder() + .withTransactionalMode(mode.transactionalMode) + .addWriteTimestamps(!isWriteTimeFromAccord(mode.transactionalMode))); + } + else + schemaGen = SchemaGenerators.schemaSpecGen("harry", "table", 1000); + + schema = schemaGen.generate(rng); HistoryBuilder harry = new ReplayingHistoryBuilder(schema.valueGenerators, hb -> { InJvmDTestVisitExecutor.Builder builder = InJvmDTestVisitExecutor.builder(); + if (mode.kind == AccordMode.Kind.Direct) + builder = builder.wrapQueries(QueryBuildingVisitExecutor.WrapQueries.TRANSACTION); return builder.nodeSelector(new InJvmDTestVisitExecutor.NodeSelector() { private final AtomicLong cnt = new AtomicLong(); @@ -116,6 +158,25 @@ public int select(long lts) t = Throwables.getRootCause(t); if (!TIMEOUT_CHECKER.matches(t)) return false; + + TxnId id; + try + { + id = TxnId.parse(t.getMessage()); + } + catch (Throwable t2) + { + return true; + } + try + { + int[] nodes = cluster.stream().filter(i -> !i.isShutdown()).mapToInt(i -> i.config().num()).toArray(); + logger.warn("Timeout for txn {}; debug info\n{}", id, ClusterUtils.queryTxnStateAsString(cluster, id, nodes)); + } + catch (Throwable t3) + { + t.addSuppressed(t3); + } return false; }) .build(schema, hb, cluster); @@ -172,6 +233,10 @@ private static CommandGen cqlOperations(Spec spec) { long pd = spec.harry.valueGenerators().pkGen().descriptorAt(pkIdx); reads.add(new HarryCommand(s -> String.format("Harry Validate pd=%d%s", pd, state.commandNamePostfix()), s -> spec.harry.selectPartition(pkIdx))); + + TransactionalMode transationalMode = spec.schema.options.transactionalMode(); + if (TransactionalMode.full == transationalMode) + reads.add(new HarryCommand(s -> String.format("Harry Reverse Validate pd=%d%s", pd, state.commandNamePostfix()), s -> spec.harry.selectPartition(pkIdx, Operations.ClusteringOrderBy.DESC))); } reads.add(new HarryCommand(s -> "Reset Harry Write State" + state.commandNamePostfix(), s -> ((HarryState) s).numInserts = 0)); return Property.multistep(reads); @@ -216,7 +281,7 @@ public class HarryState extends State public HarryState(RandomSource rs) { - super(rs, createSchemaSpec(), HarryTopologyMixupTest::cqlOperations); + super(rs, createSchemaSpec(mode), HarryTopologyMixupTest::cqlOperations); } @Override @@ -225,4 +290,9 @@ protected void onConfigure(IInstanceConfig config) config.set("metadata_snapshot_frequency", 5); } } + + private static boolean isWriteTimeFromAccord(TransactionalMode transactionalMode) + { + return transactionalMode != null && transactionalMode.nonSerialWritesThroughAccord; + } } \ No newline at end of file diff --git a/test/distributed/org/apache/cassandra/fuzz/topology/TopologyMixupTestBase.java b/test/distributed/org/apache/cassandra/fuzz/topology/TopologyMixupTestBase.java index 192713d361c3..a4341dddeb97 100644 --- a/test/distributed/org/apache/cassandra/fuzz/topology/TopologyMixupTestBase.java +++ b/test/distributed/org/apache/cassandra/fuzz/topology/TopologyMixupTestBase.java @@ -51,7 +51,7 @@ import org.agrona.collections.IntHashSet; import org.apache.cassandra.distributed.Constants; import org.apache.cassandra.distributed.api.ICoordinator; -import org.apache.cassandra.distributed.api.Row; +import org.apache.cassandra.distributed.api.QueryResults; import org.apache.cassandra.distributed.api.SimpleQueryResult; import org.junit.Test; @@ -84,19 +84,15 @@ import org.apache.cassandra.tcm.ClusterMetadataService; import org.apache.cassandra.tcm.Epoch; import org.apache.cassandra.tools.nodetool.formatter.TableBuilder; -import org.apache.cassandra.utils.Clock; import org.apache.cassandra.utils.ConfigGenBuilder; +import org.apache.cassandra.utils.LoggingCommand; import org.apache.cassandra.utils.Retry; import static accord.utils.Property.commands; import static accord.utils.Property.ignoreCommand; import static accord.utils.Property.multistep; import static accord.utils.Property.stateful; -import static org.apache.cassandra.harry.model.TokenPlacementModel.Range; -import static org.apache.cassandra.harry.model.TokenPlacementModel.Replica; -import static org.apache.cassandra.harry.model.TokenPlacementModel.ReplicatedRanges; -import static org.apache.cassandra.harry.model.TokenPlacementModel.ReplicationFactor; -import static org.apache.cassandra.harry.model.TokenPlacementModel.SimpleReplicationFactor; +import static org.apache.cassandra.harry.model.TokenPlacementModel.*; /** * These tests can create many instances, so mac users may need to run the following to avoid address bind failures @@ -144,8 +140,9 @@ private enum RemoveType state -> state.cluster.get(toCoordinate).nodetoolResult("repair", state.schema.keyspace(), state.schema.table(), "--force").asserts().success()); } - private static Command, Void, ?> repairCommand(int toCoordinate, String ks, String... tables) { - return new SimpleCommand<>(state -> "nodetool repair " + ks + (tables.length == 0 ? "" : " " + Arrays.asList(tables)) + " from node" + toCoordinate + state.commandNamePostfix(), + private static Command, Void, ?> repairCommand(@Nullable String reason, int toCoordinate, String ks, String... tables) + { + return new SimpleCommand<>(state -> "nodetool repair " + ks + (tables.length == 0 ? "" : " " + Arrays.asList(tables)) + (reason == null ? "" : " for " + reason) + " from node" + toCoordinate + state.commandNamePostfix(), state -> { if (tables.length == 0) { state.cluster.get(toCoordinate).nodetoolResult("repair", ks, "--force").asserts().success(); @@ -160,6 +157,19 @@ private enum RemoveType }); } + protected static Command, Void, ?> repairFor(State state, String reason) + { + List, Void, ?>> commands = new ArrayList<>(); + //TODO (efficiency): rather than run on every instance, run on 1 per section of the ring? + for (int inst : state.topologyHistory.up()) + { + commands.add(repairCommand(reason, inst, "system_auth")); + commands.add(repairCommand(reason, inst, "system_traces")); + commands.add(repairCommand(reason, inst, state.schema.keyspace(), state.schema.table())); + } + return multistep(commands); + } + private Command, Void, ?> waitForCMSToQuiesce() { return new Property.StateOnlyCommand<>() @@ -378,30 +388,30 @@ public void test() Property.StatefulBuilder statefulBuilder = stateful().withSteps(20).withStepTimeout(Duration.ofMinutes(3)).withExamples(1); preCheck(statefulBuilder); statefulBuilder.check(commands(this::stateGen) - .preCommands(state -> state.preActions.forEach(Runnable::run)) - .add(2, (rs, state) -> { - EnumSet possibleTopologyChanges = possibleTopologyChanges(state); - if (possibleTopologyChanges.isEmpty()) return ignoreCommand(); - return topologyCommand(state, possibleTopologyChanges).next(rs); - }) - .add(1, (rs, state) -> repairCommand(rs.pickInt(state.topologyHistory.up()))) - .add(7, (rs, state) -> state.statementGen.apply(rs, state)) - .destroyState((state, cause) -> { - try (state) - { - TopologyMixupTestBase.this.destroyState(state, cause); - } - }) - .commandsTransformer((state, gen) -> { - for (BiFunction, Gen, Void, ?>>, Gen, Void, ?>>> fn : state.commandsTransformers) - gen = fn.apply(state, gen); - return gen; - }) - .onSuccess((state, sut, history) -> logger.info("Successful for the following:\nState {}\nHistory:\n{}", state, Property.formatList("\t\t", history))) - .build()); + .preCommands(state -> state.preActions.forEach(Runnable::run)) + .addIf(State::allowTopologyChanges, 2, (rs, state) -> { + EnumSet possibleTopologyChanges = possibleTopologyChanges(state); + if (possibleTopologyChanges.isEmpty()) return ignoreCommand(); + return topologyCommand(state, possibleTopologyChanges).next(rs); + }) + .add(1, (rs, state) -> repairCommand(rs.pickInt(state.topologyHistory.up()))) + .add(7, (rs, state) -> state.statementGen.apply(rs, state)) + .destroyState((state, cause) -> { + try (state) + { + TopologyMixupTestBase.this.destroyState(state, cause); + } + }) + .commandsTransformer((state, gen) -> { + for (BiFunction, Gen, Void, ?>>, Gen, Void, ?>>> fn : state.commandsTransformers) + gen = fn.apply(state, gen); + return gen; + }) + .onSuccess((state, sut, history) -> logger.info("Successful for the following:\nState {}\nHistory:\n{}", state, Property.formatList("\t\t", history))) + .build()); } - private EnumSet possibleTopologyChanges(State state) + private static EnumSet possibleTopologyChanges(State state) { EnumSet possibleTopologyChanges = EnumSet.noneOf(TopologyChange.class); // up or down is logically more correct, but since this runs sequentially and after the topology changes are complete, we don't have downed nodes at this point @@ -440,13 +450,13 @@ private EnumSet possibleTopologyChanges(State state) switch (task) { case AddNode: - possible.put(ignore -> multistep(addNode(), awaitClusterStable()), 1); + possible.put(ignore -> multistep(repairFor(state, "add node"), addNode(), awaitClusterStable()), 1); break; case RemoveNode: possible.put(rs -> multistep(removeNodeRandomizedDispatch(rs, state), awaitClusterStable()), 1); break; case HostReplace: - possible.put(rs -> multistep(hostReplace(rs, state), awaitClusterStable()), 1); + possible.put(rs -> multistep(repairFor(state, "host replace"), hostReplace(rs, state), awaitClusterStable()), 1); break; case StartNode: possible.put(rs -> startInstance(rs, state), 1); @@ -481,35 +491,6 @@ protected interface CommandGen Command, Void, ?> apply(RandomSource rs, State state); } - private static class LoggingCommand extends Property.ForwardingCommand - { - private static final Logger logger = LoggerFactory.getLogger(LoggingCommand.class); - - private LoggingCommand(Command delegate) - { - super(delegate); - } - - @Override - public Result apply(State s) throws Throwable - { - String name = detailed(s); - long startNanos = Clock.Global.nanoTime(); - try - { - logger.info("Starting command: {}", name); - Result o = super.apply(s); - logger.info("Command {} was success after {}", name, Duration.ofNanos(Clock.Global.nanoTime() - startNanos)); - return o; - } - catch (Throwable t) - { - logger.warn("Command {} failed after {}: {}", name, Duration.ofNanos(Clock.Global.nanoTime() - startNanos), t.toString()); // don't want stack trace, just type/msg - throw t; - } - } - } - protected static class State implements AutoCloseable { final TopologyHistory topologyHistory; @@ -534,34 +515,34 @@ public State(RandomSource rs, BiFunction schemaSpecGen this.yamlConfigOverrides = CONF_GEN.next(rs); cluster = Cluster.build(topologyHistory.minNodes) - .withTokenSupplier(topologyHistory) - .withConfig(c -> { - c.with(Feature.values()) - .set("write_request_timeout", "10s") - .set("read_request_timeout", "10s") - .set("range_request_timeout", "20s") - .set("request_timeout", "20s") - .set("native_transport_timeout", "30s") - // bound startup to some value larger than the task timeout, this is to allow the - // tests to stop blocking when a startup issue is detected. The main reason for - // this is that startup blocks forever, waiting for accord and streaming to - // complete... but if there are bugs at these layers then the startup will never - // exit, blocking the JVM from giving the needed information (logs/seed) to debug. - .set(Constants.KEY_DTEST_STARTUP_TIMEOUT, "4m") - .set(Constants.KEY_DTEST_API_STARTUP_FAILURE_AS_SHUTDOWN, false); - //TODO (maintenance): where to put this? Anything touching ConfigGenBuilder with jvm-dtest needs this... - ((InstanceConfig) c).remove("commitlog_sync_period_in_ms"); - for (Map.Entry e : yamlConfigOverrides.entrySet()) - c.set(e.getKey(), e.getValue()); - onConfigure(c); - }) - //TODO (maintenance): should TopologyHistory also be a INodeProvisionStrategy.Factory so address information is stored in the Node? - //TODO (maintenance): AbstractCluster's Map nodeIdTopology makes playing with dc/rack annoying, if this becomes an interface then TopologyHistory could own - .withNodeProvisionStrategy((subnet, portMap) -> new INodeProvisionStrategy.AbstractNodeProvisionStrategy(portMap) - { - { - Invariants.checkArgument(subnet == 0, "Unexpected subnet detected: %d", subnet); - } + .withTokenSupplier(topologyHistory) + .withConfig(c -> { + c.with(Feature.values()) + .set("write_request_timeout", "10s") + .set("read_request_timeout", "10s") + .set("range_request_timeout", "20s") + .set("request_timeout", "20s") + .set("native_transport_timeout", "30s") + // bound startup to some value larger than the task timeout, this is to allow the + // tests to stop blocking when a startup issue is detected. The main reason for + // this is that startup blocks forever, waiting for accord and streaming to + // complete... but if there are bugs at these layers then the startup will never + // exit, blocking the JVM from giving the needed information (logs/seed) to debug. + .set(Constants.KEY_DTEST_STARTUP_TIMEOUT, "4m") + .set(Constants.KEY_DTEST_API_STARTUP_FAILURE_AS_SHUTDOWN, false); + //TODO (maintenance): where to put this? Anything touching ConfigGenBuilder with jvm-dtest needs this... + ((InstanceConfig) c).remove("commitlog_sync_period_in_ms"); + for (Map.Entry e : yamlConfigOverrides.entrySet()) + c.set(e.getKey(), e.getValue()); + onConfigure(c); + }) + //TODO (maintenance): should TopologyHistory also be a INodeProvisionStrategy.Factory so address information is stored in the Node? + //TODO (maintenance): AbstractCluster's Map nodeIdTopology makes playing with dc/rack annoying, if this becomes an interface then TopologyHistory could own + .withNodeProvisionStrategy((subnet, portMap) -> new INodeProvisionStrategy.AbstractNodeProvisionStrategy(portMap) + { + { + Invariants.requireArgument(subnet == 0, "Unexpected subnet detected: %d", subnet); + } private final String ipPrefix = "127.0." + subnet + '.'; @@ -636,35 +617,17 @@ public String ipAddress(int nodeNum) if (next.checkPreconditions(state) == Property.PreCheckResult.Ignore) return next; commandsTransformers.remove(self); - int[] up = state.topologyHistory.up(); List, Void, ?>> commands = new ArrayList<>(); commands.add(fixDistributedSchemas); - for (String ks : Arrays.asList("system_auth", "system_traces")) - { - int coordinator = rs.pickInt(up); - commands.add(repairCommand(coordinator, ks)); - } commands.add(fixTestKeyspace); - { - int coordinator = rs.pickInt(up); - commands.add(repairCommand(coordinator, KEYSPACE)); - } + commands.add(repairFor(state, "set RF=" + TARGET_RF)); commands.add(reconfig); commands.add(next); return multistep(commands); }; } }); - commandsTransformers.add((state, commandGen) -> rs2 -> { - Command, Void, ?> c = commandGen.next(rs2); - if (!(c instanceof Property.MultistepCommand)) - return new LoggingCommand<>(c); - Property.MultistepCommand, Void> multistep = (Property.MultistepCommand, Void>) c; - List, Void, ?>> subcommands = new ArrayList<>(); - for (var sub : multistep) - subcommands.add(new LoggingCommand<>(sub)); - return multistep(subcommands); - }); + commandsTransformers.add(LoggingCommand.factory()); preActions.add(() -> { int[] up = topologyHistory.up(); // use the most recent node just in case the cluster isn't in-sync @@ -686,6 +649,11 @@ public String ipAddress(int nodeNum) onStartupComplete(waitForEpoch); } + protected boolean allowTopologyChanges() + { + return !possibleTopologyChanges(this).isEmpty(); + } + protected void onStartupComplete(long tcmEpoch) { @@ -707,11 +675,15 @@ public int[] upAndSafe() int quorum = topologyHistory.quorum(); // find what ranges are able to handle 1 node loss Set safeRanges = new HashSet<>(); + Set cms = new HashSet<>(); + for (Integer node : cmsGroup) + cms.add(node); + ring.rangesToReplicas((range, replicas) -> { IntHashSet alive = new IntHashSet(); for (int peer : replicas) { - if (up.contains(peer)) + if (up.contains(peer) && !cms.contains(peer)) alive.add(peer); } if (quorum < alive.size()) @@ -756,17 +728,8 @@ public void close() throws Exception try { SimpleQueryResult qr = Retry.retryWithBackoffBlocking(5, () -> cluster.get(cmsNode).executeInternalWithResult("SELECT epoch, kind, transformation FROM system_views.cluster_metadata_log")); - TableBuilder builder = new TableBuilder(" | "); - builder.add(qr.names()); - while (qr.hasNext()) - { - Row next = qr.next(); - builder.add(Stream.of(next.toObjectArray()) - .map(Objects::toString) - .map(s -> s.length() > 100 ? s.substring(0, 100) + "..." : s) - .collect(Collectors.toList())); - } - epochHistory = "Epochs:\n" + builder; + String table = TableBuilder.toStringPiped(qr.names(), QueryResults.stringify(qr, 100)); + epochHistory = "Epochs:\n" + table; } catch (Throwable t) { @@ -985,7 +948,7 @@ private static int addressToNodeId(InetAddressAndPort addressAndPort) { String address = addressAndPort.getAddress().getHostAddress(); String[] parts = address.split("\\."); - Invariants.checkState(parts.length == 4, "Unable to parse address %s", address); + Invariants.require(parts.length == 4, "Unable to parse address %s", address); return Integer.parseInt(parts[3]); } } diff --git a/test/distributed/org/apache/cassandra/io/sstable/format/ForwardingSSTableReader.java b/test/distributed/org/apache/cassandra/io/sstable/format/ForwardingSSTableReader.java index 60168f65196c..952663c50e14 100644 --- a/test/distributed/org/apache/cassandra/io/sstable/format/ForwardingSSTableReader.java +++ b/test/distributed/org/apache/cassandra/io/sstable/format/ForwardingSSTableReader.java @@ -254,6 +254,12 @@ public KeyIterator keyIterator() throws IOException return delegate.keyIterator(); } + @Override + public KeyIterator keyIterator(AbstractBounds range) throws IOException + { + return delegate.keyIterator(range); + } + @Override public DecoratedKey firstKeyBeyond(PartitionPosition token) { diff --git a/test/distributed/org/apache/cassandra/service/accord/AccordJournalBurnTest.java b/test/distributed/org/apache/cassandra/service/accord/AccordJournalBurnTest.java new file mode 100644 index 000000000000..eea399dd609a --- /dev/null +++ b/test/distributed/org/apache/cassandra/service/accord/AccordJournalBurnTest.java @@ -0,0 +1,358 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import java.io.IOException; +import java.nio.file.Files; +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.stream.Collectors; +import javax.annotation.Nullable; + +import org.junit.Before; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import accord.burn.BurnTestBase; +import accord.burn.SimulationException; +import accord.impl.TopologyFactory; +import accord.impl.basic.Cluster; +import accord.impl.basic.RandomDelayQueue; +import accord.local.CommandStores; +import accord.local.DurableBefore; +import accord.local.Node; +import accord.local.RedundantBefore; +import accord.primitives.EpochSupplier; +import accord.utils.DefaultRandom; +import accord.utils.Invariants; +import accord.utils.PersistentField; +import accord.utils.RandomSource; +import org.apache.cassandra.ServerTestUtils; +import org.apache.cassandra.db.ColumnFamilyStore; +import org.apache.cassandra.db.Directories; +import org.apache.cassandra.db.Keyspace; +import org.apache.cassandra.db.compaction.ActiveCompactionsTracker; +import org.apache.cassandra.db.compaction.CompactionController; +import org.apache.cassandra.db.compaction.CompactionIterator; +import org.apache.cassandra.db.compaction.OperationType; +import org.apache.cassandra.db.compaction.writers.CompactionAwareWriter; +import org.apache.cassandra.db.compaction.writers.DefaultCompactionWriter; +import org.apache.cassandra.db.lifecycle.LifecycleTransaction; +import org.apache.cassandra.dht.Murmur3Partitioner; +import org.apache.cassandra.io.sstable.ISSTableScanner; +import org.apache.cassandra.io.sstable.format.SSTableReader; +import org.apache.cassandra.io.util.File; +import org.apache.cassandra.journal.SegmentCompactor; +import org.apache.cassandra.journal.StaticSegment; +import org.apache.cassandra.journal.TestParams; +import org.apache.cassandra.schema.Schema; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.schema.Tables; +import org.apache.cassandra.service.StorageService; +import org.apache.cassandra.service.accord.serializers.CommandSerializers; +import org.apache.cassandra.service.accord.serializers.DepsSerializers; +import org.apache.cassandra.service.accord.serializers.KeySerializers; +import org.apache.cassandra.service.accord.serializers.ResultSerializers; +import org.apache.cassandra.service.accord.serializers.TopologySerializers; +import org.apache.cassandra.service.accord.serializers.Version; +import org.apache.cassandra.tools.FieldUtil; + +import static accord.impl.PrefixedIntHashKey.ranges; +import static org.apache.cassandra.utils.TimeUUID.Generator.nextTimeUUID; + +public class AccordJournalBurnTest extends BurnTestBase +{ + private static final Logger logger = LoggerFactory.getLogger(AccordJournalBurnTest.class); + + public static void setUp() throws Throwable + { + StorageService.instance.registerMBeans(); + StorageService.instance.setPartitionerUnsafe(Murmur3Partitioner.instance); + ServerTestUtils.prepareServerNoRegister(); + + Keyspace.setInitialized(); + FieldUtil.transferFields(new KeySerializers.Impl(BurnTestKeySerializers.key, + BurnTestKeySerializers.routingKey, + BurnTestKeySerializers.range), + KeySerializers.class); + + FieldUtil.transferFields(new CommandSerializers.QuerySerializers(BurnTestKeySerializers.read, + BurnTestKeySerializers.query, + BurnTestKeySerializers.update, + BurnTestKeySerializers.write, + BurnTestKeySerializers.tablesAndKeys), + CommandSerializers.class); + + FieldUtil.transferFields(new DepsSerializers.Impl(BurnTestKeySerializers.range), + DepsSerializers.class); + + FieldUtil.setInstanceUnsafe(ResultSerializers.class, + BurnTestKeySerializers.result, + "result"); + + FieldUtil.setInstanceUnsafe(TopologySerializers.class, + new TopologySerializers.ShardSerializer(BurnTestKeySerializers.range), + "shard"); + } + + private static final AtomicInteger counter = new AtomicInteger(); + + @Before + public void beforeTest() throws Throwable + { + + } + + @Test + public void testOne() + { + long seed = System.nanoTime(); + int operations = 1000; + + logger.info("Seed: {}", seed); + Cluster.trace.trace("Seed: {}", seed); + RandomSource random = new DefaultRandom(seed); + try + { + List clients = generateIds(true, 1 + random.nextInt(4)); + int rf; + float chance = random.nextFloat(); + if (chance < 0.2f) rf = random.nextInt(2, 9); + else if (chance < 0.4f) rf = 3; + else if (chance < 0.7f) rf = 5; + else if (chance < 0.8f) rf = 7; + else rf = 9; + + List nodes = generateIds(false, random.nextInt(rf, rf * 3)); + + { + ServerTestUtils.daemonInitialization(); + + TableMetadata[] metadatas = new TableMetadata[1 + nodes.size()]; + metadatas[0] = AccordKeyspace.CommandsForKeys; + for (int i = 0; i < nodes.size(); i++) + metadatas[1 + i] = AccordKeyspace.journalMetadata("journal_" + nodes.get(i), false); + + AccordKeyspace.TABLES = Tables.of(metadatas); + setUp(); + } + + Keyspace ks = Schema.instance.getKeyspaceInstance("system_accord"); + + burn(random, new TopologyFactory(rf, ranges(0, HASH_RANGE_START, HASH_RANGE_END, random.nextInt(Math.max(nodes.size() + 1, rf), nodes.size() * 3))), + clients, + nodes, + 5 + random.nextInt(15), + 5 + random.nextInt(15), + operations, + 10 + random.nextInt(30), + new RandomDelayQueue.Factory(random).get(), + (nodeId, randomSource) -> { + try + { + File directory = new File(Files.createTempDirectory(Integer.toString(counter.incrementAndGet()))); + directory.deleteRecursiveOnExit(); + ColumnFamilyStore cfs = ks.getColumnFamilyStore("journal_" + nodeId); + cfs.disableAutoCompaction(); + AccordJournal journal = new AccordJournal(new TestParams() + { + @Override + public int segmentSize() + { + return 1 * 1024 * 1024; + } + }, directory, cfs) + { + @Override + public void start(Node node) + { + super.start(node); + unsafeSetStarted(); + } + + @Override + public void saveCommand(int store, CommandUpdate update, @Nullable Runnable onFlush) + { + // For the purpose of this test, we do not have to wait for flush, since we do not test durability and are using mmap + super.saveCommand(store, update, () -> {}); + if (onFlush != null) + onFlush.run(); + } + + @Override + public void saveStoreState(int store, FieldUpdates fieldUpdates, @Nullable Runnable onFlush) + { + super.saveStoreState(store, fieldUpdates, () -> {}); + if (onFlush != null) + onFlush.run(); + } + + @Override + public void saveTopology(TopologyUpdate topologyUpdate, Runnable onFlush) + { + super.saveTopology(topologyUpdate, () -> {}); + if (onFlush != null) + onFlush.run(); + } + + @Override + protected SegmentCompactor compactor(ColumnFamilyStore cfs, Version userVersion) + { + return new NemesisAccordSegmentCompactor<>(userVersion, cfs, randomSource.fork()) + { + @Nullable + @Override + public Collection> compact(Collection> staticSegments) + { + if (journalTable == null) + throw new IllegalStateException("Unsafe access to AccordJournal during ; journalTable was touched before it was published"); + Collection> result = super.compact(staticSegments); + journalTable.safeNotify(index -> index.remove(staticSegments)); + return result; + } + }; + } + + private CompactionAwareWriter getCompactionAwareWriter(ColumnFamilyStore cfs, + Directories directories, + LifecycleTransaction transaction, + Set nonExpiredSSTables) + { + return new DefaultCompactionWriter(cfs, directories, transaction, nonExpiredSSTables, false, 0); + } + + @Override + public void purge(CommandStores commandStores, EpochSupplier minEpoch) + { + this.journal.closeCurrentSegmentForTestingIfNonEmpty(); + this.journal.runCompactorForTesting(); + + Set orig = cfs.getLiveSSTables(); + List all = new ArrayList<>(orig); + if (all.size() <= 1) + return; + + Set selected = new HashSet<>(); + int count = all.size(); + int removeCount = random.nextInt(1, count); + while (removeCount-- > 0) + { + int removeIndex = random.nextInt(count); + SSTableReader reader = all.get(removeIndex); + if (reader == null) + continue; + all.set(removeIndex, null); + selected.add(reader); + --count; + } + + if (selected.isEmpty()) + return; + List scanners = selected.stream().map(SSTableReader::getScanner).collect(Collectors.toList()); + + Collection newSStables; + + try (LifecycleTransaction txn = cfs.getTracker().tryModify(selected, OperationType.COMPACTION); + CompactionController controller = new CompactionController(cfs, selected, 0); + CompactionIterator ci = new CompactionIterator(OperationType.COMPACTION, + scanners, + controller, + 0, + nextTimeUUID(), + ActiveCompactionsTracker.NOOP, null, + () -> getCompactionInfo(node, cfs.getTableId()), + () -> Version.V1)) + { + try (CompactionAwareWriter writer = getCompactionAwareWriter(cfs, cfs.getDirectories(), txn, selected)) + { + while (ci.hasNext()) + writer.append(ci.next()); + + ci.setTargetDirectory(writer.getSStableDirectory().path()); + // point of no return + newSStables = writer.finish(); + } + catch (IOException e) + { + throw new RuntimeException(e); + } + } + + Invariants.require(!orig.equals(cfs.getLiveSSTables())); + } + + + @Override + public void replay(CommandStores commandStores) + { + // Make sure to replay _only_ static segments + this.closeCurrentSegmentForTestingIfNonEmpty(); + super.replay(commandStores); + } + + @Override + public PersistentField.Persister durableBeforePersister() + { + // TODO (required): we should be persisting in the journal, but this currently causes the burn test to take far too long + return DurableBefore.NOOP_PERSISTER; + } + }; + + return journal; + } + catch (Throwable t) + { + throw new RuntimeException(t); + } + } + ); + } + catch (Throwable t) + { + logger.error("Exception running burn test for seed {}:", seed, t); + throw SimulationException.wrap(seed, t); + } + } + + public static IAccordService.AccordCompactionInfos getCompactionInfo(Node node, TableId tableId) + { + IAccordService.AccordCompactionInfos compactionInfos = new IAccordService.AccordCompactionInfos(node.durableBefore(), node.topology().minEpoch()); + node.commandStores().forEachCommandStore(commandStore -> { + RedundantBefore redundantBefore = commandStore.unsafeGetRedundantBefore(); + if (redundantBefore == null) + redundantBefore = RedundantBefore.EMPTY; + CommandStores.RangesForEpoch rangesForEpoch = commandStore.unsafeGetRangesForEpoch(); + if (rangesForEpoch == null) + rangesForEpoch = CommandStores.RangesForEpoch.EMPTY; + compactionInfos.put(commandStore.id(), new IAccordService.AccordCompactionInfo(commandStore.id(), + redundantBefore, + rangesForEpoch, + tableId)); + }); + return compactionInfos; + } + + +} diff --git a/test/distributed/org/apache/cassandra/service/accord/AccordJournalCompactionTest.java b/test/distributed/org/apache/cassandra/service/accord/AccordJournalCompactionTest.java new file mode 100644 index 000000000000..8471fdd82982 --- /dev/null +++ b/test/distributed/org/apache/cassandra/service/accord/AccordJournalCompactionTest.java @@ -0,0 +1,167 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import java.nio.file.Files; +import java.util.NavigableMap; +import java.util.concurrent.atomic.AtomicInteger; + +import com.google.common.collect.ImmutableSortedMap; +import org.junit.Assert; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Test; + +import accord.local.DurableBefore; +import accord.local.RedundantBefore; +import accord.primitives.Deps; +import accord.primitives.KeyDeps; +import accord.primitives.Ranges; +import accord.primitives.Timestamp; +import accord.primitives.TxnId; +import accord.utils.AccordGens; +import accord.utils.DefaultRandom; +import accord.utils.Gen; +import accord.utils.RandomSource; +import org.apache.cassandra.ServerTestUtils; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.db.ColumnFamilyStore; +import org.apache.cassandra.db.Keyspace; +import org.apache.cassandra.dht.Murmur3Partitioner; +import org.apache.cassandra.io.util.File; +import org.apache.cassandra.journal.TestParams; +import org.apache.cassandra.schema.SchemaConstants; +import org.apache.cassandra.service.StorageService; +import org.apache.cassandra.utils.AccordGenerators; + +import static accord.api.Journal.FieldUpdates; +import static accord.local.CommandStores.RangesForEpoch; +import static org.apache.cassandra.service.accord.AccordJournalValueSerializers.DurableBeforeAccumulator; + + +public class AccordJournalCompactionTest +{ + @BeforeClass + public static void setUp() throws Throwable + { + ServerTestUtils.daemonInitialization(); + StorageService.instance.registerMBeans(); + StorageService.instance.setPartitionerUnsafe(Murmur3Partitioner.instance); + ServerTestUtils.prepareServerNoRegister(); + + StorageService.instance.initServer(); + Keyspace.setInitialized(); + } + + private AtomicInteger counter = new AtomicInteger(); + @Before + public void beforeTest() throws Throwable + { + File directory = new File(Files.createTempDirectory(Integer.toString(counter.incrementAndGet()))); + directory.deleteRecursiveOnExit(); + DatabaseDescriptor.setAccordJournalDirectory(directory.path()); + } + + @Test + public void segmentMergeTest() throws InterruptedException + { + ColumnFamilyStore cfs = Keyspace.open(SchemaConstants.ACCORD_KEYSPACE_NAME).getColumnFamilyStore(AccordKeyspace.JOURNAL); + cfs.disableAutoCompaction(); + + RedundantBefore redundantBeforeAccumulator = RedundantBefore.EMPTY; + DurableBeforeAccumulator durableBeforeAccumulator = new DurableBeforeAccumulator(); + NavigableMap safeToReadAtAccumulator = ImmutableSortedMap.of(Timestamp.NONE, Ranges.EMPTY); + NavigableMap bootstrapBeganAtAccumulator = ImmutableSortedMap.of(TxnId.NONE, Ranges.EMPTY); + RangesForEpoch rangesForEpochAccumulator = null; + + Gen durableBeforeGen = AccordGenerators.durableBeforeGen(DatabaseDescriptor.getPartitioner()); + Gen> safeToReadGen = AccordGenerators.safeToReadGen(DatabaseDescriptor.getPartitioner()); + Gen rangesForEpochGen = AccordGenerators.rangesForEpoch(DatabaseDescriptor.getPartitioner()); + + AccordJournal journal = new AccordJournal(new TestParams() + { + @Override + public int segmentSize() + { + return 1024 * 1024; + } + + @Override + public boolean enableCompaction() + { + return false; + } + }); + try + { + journal.start(null); + journal.unsafeSetStarted(); + Timestamp timestamp = Timestamp.NONE; + + RandomSource rs = new DefaultRandom(1); + + int count = 1_000; +// RedundantBefore redundantBefore = RedundantBefore.EMPTY; + for (int i = 0; i <= count; i++) + { + timestamp = timestamp.next(); + FieldUpdates updates = new FieldUpdates(); + DurableBefore addDurableBefore = durableBeforeGen.next(rs); + // TODO: improve redundant before generator and re-enable +// updates.addRedundantBefore = redundantBeforeGen.next(rs); +// updates.newRedundantBefore = redundantBefore = RedundantBefore.merge(redundantBefore, updates.addRedundantBefore); + updates.newSafeToRead = safeToReadGen.next(rs); + updates.newRangesForEpoch = rangesForEpochGen.next(rs); + + journal.durableBeforePersister().persist(addDurableBefore, null); + journal.saveStoreState(1, updates, null); + + redundantBeforeAccumulator = updates.newRedundantBefore; + durableBeforeAccumulator.update(addDurableBefore); + if (updates.newBootstrapBeganAt != null) + bootstrapBeganAtAccumulator = updates.newBootstrapBeganAt; + if (updates.newSafeToRead != null) + safeToReadAtAccumulator = updates.newSafeToRead; + if (updates.newRangesForEpoch != null) + rangesForEpochAccumulator = updates.newRangesForEpoch; + + if (i % 100 == 0) + journal.closeCurrentSegmentForTestingIfNonEmpty(); + if (i % 200 == 0) + journal.runCompactorForTesting(); + } + +// Assert.assertEquals(redundantBeforeAccumulator.get(), journal.loadRedundantBefore(1)); + Assert.assertEquals(durableBeforeAccumulator.get(), journal.durableBeforePersister().load()); + Assert.assertEquals(bootstrapBeganAtAccumulator, journal.loadBootstrapBeganAt(1)); + Assert.assertEquals(safeToReadAtAccumulator, journal.loadSafeToRead(1)); + Assert.assertEquals(rangesForEpochAccumulator, journal.loadRangesForEpoch(1)); + } + finally + { + journal.shutdown(); + } + } + + public static Gen depsGen() + { + Gen keyDepsGen = AccordGenerators.keyDepsGen(DatabaseDescriptor.getPartitioner()); + return AccordGens.deps(keyDepsGen::next, (rs) -> Deps.NONE.rangeDeps); + } +} diff --git a/test/distributed/org/apache/cassandra/service/accord/BurnTestKeySerializers.java b/test/distributed/org/apache/cassandra/service/accord/BurnTestKeySerializers.java new file mode 100644 index 000000000000..b11191219f40 --- /dev/null +++ b/test/distributed/org/apache/cassandra/service/accord/BurnTestKeySerializers.java @@ -0,0 +1,448 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import java.io.IOException; +import java.util.Map; +import java.util.function.Function; + +import accord.api.Key; +import accord.api.Query; +import accord.api.Read; +import accord.api.Result; +import accord.api.RoutingKey; +import accord.api.Update; +import accord.api.Write; +import accord.impl.PrefixedIntHashKey; +import accord.impl.list.ListQuery; +import accord.impl.list.ListRead; +import accord.impl.list.ListResult; +import accord.impl.list.ListUpdate; +import accord.impl.list.ListWrite; +import accord.local.Node; +import accord.primitives.Keys; +import accord.primitives.Range; +import accord.primitives.Seekables; +import accord.primitives.TxnId; +import org.apache.cassandra.io.ParameterisedVersionedSerializer; +import org.apache.cassandra.io.UnversionedSerializer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.service.accord.api.AccordRoutableKey; +import org.apache.cassandra.service.accord.api.AccordRoutableKey.AccordSearchableKeySerializer; +import org.apache.cassandra.service.accord.serializers.CommandSerializers; +import org.apache.cassandra.service.accord.serializers.KeySerializers; +import org.apache.cassandra.service.accord.serializers.TableMetadatasAndKeys; +import org.apache.cassandra.service.accord.serializers.TopologySerializers; +import org.apache.cassandra.service.accord.serializers.Version; +import org.apache.cassandra.utils.CastingSerializer; + +@SuppressWarnings("unchecked") +public class BurnTestKeySerializers +{ + private BurnTestKeySerializers() {} + + public static final AccordRoutableKey.AccordKeySerializer key = + (AccordRoutableKey.AccordKeySerializer) + (AccordSearchableKeySerializer) + new AccordSearchableKeySerializer() + { + @Override + public void serialize(PrefixedIntHashKey t, DataOutputPlus out) throws IOException + { + assert t instanceof PrefixedIntHashKey.Key; + out.writeInt(t.prefix); + out.writeInt(t.key); + out.writeInt(t.hash); + } + + @Override + public PrefixedIntHashKey deserialize(DataInputPlus in) throws IOException + { + int prefix = in.readInt(); + int key = in.readInt(); + int hash = in.readInt(); + return PrefixedIntHashKey.key(prefix, key, hash); + } + + @Override + public long serializedSize(PrefixedIntHashKey t) + { + return 3 * Integer.BYTES; + } + + @Override + public void skip(DataInputPlus in) throws IOException + { + in.skipBytesFully(3 * Integer.BYTES); + } + + @Override + public int fixedKeyLengthForPrefix(Object prefix) + { + return 8; + } + + @Override + public int serializedSizeOfPrefix(Object prefix) + { + return 4; + } + + @Override + public int serializedSizeWithoutPrefix(PrefixedIntHashKey key) + { + return 8; + } + + @Override + public void serializePrefix(Object prefix, DataOutputPlus out) throws IOException + { + out.writeInt((Integer) prefix); + } + + @Override + public void serializeWithoutPrefixOrLength(PrefixedIntHashKey key, DataOutputPlus out) throws IOException + { + out.writeInt(key.hash); + out.writeInt(key.key); + } + + @Override + public Object deserializePrefix(DataInputPlus in) throws IOException + { + return in.readInt(); + } + + @Override + public PrefixedIntHashKey deserializeWithPrefix(Object prefix, int length, DataInputPlus in) throws IOException + { + int key = in.readInt(); + int hash = in.readInt(); + return PrefixedIntHashKey.key((Integer)prefix, key, hash); + } + }; + + public static final AccordSearchableKeySerializer routingKey = + (AccordSearchableKeySerializer) + (AccordSearchableKeySerializer) + new AccordSearchableKeySerializer() + { + public void serialize(PrefixedIntHashKey.Hash t, DataOutputPlus out) throws IOException + { + out.writeInt(t.prefix); + out.writeInt(t.hash); + } + + public PrefixedIntHashKey.Hash deserialize(DataInputPlus in) throws IOException + { + int prefix = in.readInt(); + int hash = in.readInt(); + return new PrefixedIntHashKey.Hash(prefix, hash); + } + + public long serializedSize(PrefixedIntHashKey.Hash t) + { + return 2 * Integer.BYTES; + } + + public void skip(DataInputPlus in) throws IOException + { + in.skipBytesFully(2 * Integer.BYTES); + } + + @Override + public int fixedKeyLengthForPrefix(Object prefix) + { + return 4; + } + + @Override + public int serializedSizeOfPrefix(Object prefix) + { + return 4; + } + + @Override + public int serializedSizeWithoutPrefix(PrefixedIntHashKey.Hash key) + { + return 4; + } + + @Override + public void serializePrefix(Object prefix, DataOutputPlus out) throws IOException + { + out.writeInt((Integer) prefix); + } + + @Override + public void serializeWithoutPrefixOrLength(PrefixedIntHashKey.Hash key, DataOutputPlus out) throws IOException + { + out.writeInt(key.hash); + } + + @Override + public Object deserializePrefix(DataInputPlus in) throws IOException + { + return in.readInt(); + } + + @Override + public PrefixedIntHashKey.Hash deserializeWithPrefix(Object prefix, int length, DataInputPlus in) throws IOException + { + int hash = in.readInt(); + return PrefixedIntHashKey.forHash((Integer)prefix, hash); + } + }; + + public static final UnversionedSerializer range = + (UnversionedSerializer) + (UnversionedSerializer) + new UnversionedSerializer() + { + @Override + public void serialize(PrefixedIntHashKey.Range t, DataOutputPlus out) throws IOException + { + routingKey.serialize(t.start(), out); + routingKey.serialize(t.end(), out); + } + + @Override + public PrefixedIntHashKey.Range deserialize(DataInputPlus in) throws IOException + { + RoutingKey start = routingKey.deserialize(in); + RoutingKey end = routingKey.deserialize(in); + return PrefixedIntHashKey.range((PrefixedIntHashKey.PrefixedIntRoutingKey) start, (PrefixedIntHashKey.PrefixedIntRoutingKey) end); + } + + @Override + public long serializedSize(PrefixedIntHashKey.Range t) + { + throw new RuntimeException("not implemented"); + } + }; + + public static final ParameterisedVersionedSerializer read = (ParameterisedVersionedSerializer) new ParameterisedVersionedSerializer() + { + @Override + public void serialize(ListRead t, TableMetadatasAndKeys seekables, DataOutputPlus out, Version version) throws IOException + { + out.writeBoolean(t.isEphemeralRead); + KeySerializers.seekables.serialize(t.userReadKeys, out); + KeySerializers.seekables.serialize(t.keys, out); + } + + @Override + public ListRead deserialize(TableMetadatasAndKeys seekables, DataInputPlus in, Version version) throws IOException + { + boolean isEphemeralRead = in.readBoolean(); + Seekables userReadKeys = KeySerializers.seekables.deserialize(in); + Seekables keys = KeySerializers.seekables.deserialize(in); + return new ListRead(Function.identity(), isEphemeralRead, userReadKeys, keys); + } + + @Override + public long serializedSize(ListRead t, TableMetadatasAndKeys seekables, Version version) + { + throw new RuntimeException("not implemented"); + } + }; + + public static final UnversionedSerializer query = CastingSerializer.create(ListQuery.class, new UnversionedSerializer<>() + { + public void serialize(ListQuery t, DataOutputPlus out) throws IOException + { + if (t == null) + { + out.writeByte(0); + return; + } + out.writeByte(1); + TopologySerializers.nodeId.serialize(t.client, out); + out.writeLong(t.requestId); + out.writeBoolean(t.isEphemeralRead); + } + + public ListQuery deserialize(DataInputPlus in) throws IOException + { + switch (in.readByte()) + { + case 0: + return null; + case 1: + break; + default: + throw new AssertionError(); + } + + Node.Id client = TopologySerializers.nodeId.deserialize(in); + long requestId = in.readLong(); + boolean isEphemeralRead = in.readBoolean(); + return new ListQuery(client, requestId, isEphemeralRead); + } + + public long serializedSize(ListQuery t) + { + throw new RuntimeException("not implemented"); + } + }); + + public static final ParameterisedVersionedSerializer update = (ParameterisedVersionedSerializer) new ParameterisedVersionedSerializer() + { + public void serialize(ListUpdate t, TableMetadatasAndKeys seekables, DataOutputPlus out, Version version) throws IOException + { + out.writeInt(t.size()); + for (Map.Entry e : t.entrySet()) + { + KeySerializers.key.serialize(e.getKey(), out); + out.writeInt(e.getValue()); + } + } + + public ListUpdate deserialize(TableMetadatasAndKeys seekables, DataInputPlus in, Version version) throws IOException + { + int size = in.readInt(); + ListUpdate listUpdate = new ListUpdate(Function.identity()); + for (int i = 0; i < size; i++) + { + Key k = KeySerializers.key.deserialize(in); + int v = in.readInt(); + listUpdate.put(k, v); + } + return listUpdate; + } + + public long serializedSize(ListUpdate t, TableMetadatasAndKeys seekables, Version version) + { + throw new RuntimeException("not implemented"); + } + }; + + public static final ParameterisedVersionedSerializer write = (ParameterisedVersionedSerializer) new ParameterisedVersionedSerializer() + { + public void serialize(ListWrite t, Seekables seekables, DataOutputPlus out, Version version) throws IOException + { + out.writeInt(t.size()); + for (Map.Entry e : t.entrySet()) + { + KeySerializers.key.serialize(e.getKey(), out); + out.writeInt(e.getValue().length); + for (int v : e.getValue()) + out.writeInt(v); + } + } + + public ListWrite deserialize(Seekables seekables, DataInputPlus in, Version version) throws IOException + { + int size = in.readInt(); + ListWrite write = new ListWrite(Function.identity()); + for (int i = 0; i < size; i++) + { + Key k = KeySerializers.key.deserialize(in); + int len = in.readInt(); + int[] vals = new int[len]; + for (int j = 0; j < len; j++) + vals[j] = in.readInt(); + write.put(k, vals); + } + return write; + } + + public long serializedSize(ListWrite t, Seekables seekables, Version version) + { + throw new RuntimeException("not implemented"); + } + }; + + public static final UnversionedSerializer tablesAndKeys = new UnversionedSerializer<>() + { + @Override + public void serialize(TableMetadatasAndKeys t, DataOutputPlus out) throws IOException + { + } + + @Override + public TableMetadatasAndKeys deserialize(DataInputPlus in) throws IOException + { + return null; + } + + @Override + public long serializedSize(TableMetadatasAndKeys t) + { + return 0; + } + }; + + public static final UnversionedSerializer result = CastingSerializer.create(ListResult.class, new UnversionedSerializer<>() + { + public void serialize(ListResult t, DataOutputPlus out) throws IOException + { + TopologySerializers.nodeId.serialize(t.client, out); + out.writeLong(t.requestId); + CommandSerializers.txnId.serialize(t.txnId, out); + + KeySerializers.seekables.serialize(t.readKeys, out); + KeySerializers.keys.serialize(t.responseKeys, out); + + out.writeInt(t.read.length); + for (int[] ints : t.read) + { + out.writeInt(ints.length); + for (int i : ints) + out.writeInt(i); + } + + out.writeInt(t.update == null ? 0 : 1); + if (t.update != null) + update.serialize(t.update, null, out, Version.LATEST); + + out.writeInt(t.status.ordinal()); + } + + public ListResult deserialize(DataInputPlus in) throws IOException + { + Node.Id client = TopologySerializers.nodeId.deserialize(in); + long requestId = in.readLong(); + TxnId txnId = CommandSerializers.txnId.deserialize(in); + Seekables readKeys = KeySerializers.seekables.deserialize(in); + Keys responseKeys = KeySerializers.keys.deserialize(in); + int[][] read = new int[in.readInt()][]; + for (int i = 0; i < read.length; i++) + { + int[] v = new int[in.readInt()]; + for (int j = 0; j < v.length; j++) + { + v[j] = in.readInt(); + } + read[i] = v; + } + ListUpdate upd = null; + if (in.readInt() != 0) + upd = (ListUpdate) update.deserialize(null, in, Version.LATEST); + ListResult.Status status = ListResult.Status.values()[in.readInt()]; + return new ListResult(status, client, requestId, txnId, readKeys, responseKeys, read, upd); + } + + public long serializedSize(ListResult t) + { + throw new RuntimeException("not implemented"); + } + }); +} \ No newline at end of file diff --git a/test/distributed/org/apache/cassandra/service/accord/NemesisAccordSegmentCompactor.java b/test/distributed/org/apache/cassandra/service/accord/NemesisAccordSegmentCompactor.java new file mode 100644 index 000000000000..b1e2407991ab --- /dev/null +++ b/test/distributed/org/apache/cassandra/service/accord/NemesisAccordSegmentCompactor.java @@ -0,0 +1,106 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import java.util.Arrays; +import java.util.HashSet; +import java.util.Set; + +import accord.utils.RandomSource; +import org.apache.cassandra.db.ColumnFamilyStore; +import org.apache.cassandra.db.SerializationHeader; +import org.apache.cassandra.db.rows.EncodingStats; +import org.apache.cassandra.io.sstable.Descriptor; +import org.apache.cassandra.io.sstable.SSTableTxnWriter; +import org.apache.cassandra.service.accord.serializers.Version; + +/** + * Nemesis compactor: a compactor that will distribute your keys over a large(r) number of SSTables. + * + * For testing purposes only. + */ +public class NemesisAccordSegmentCompactor extends AbstractAccordSegmentCompactor +{ + private final RandomSource randomSource; + private final SSTableTxnWriter[] writers; + private final Set written = new HashSet<>(); + + public NemesisAccordSegmentCompactor(Version userVersion, ColumnFamilyStore cfs, RandomSource randomSource) + { + super(userVersion, cfs); + this.randomSource = randomSource; + this.writers = new SSTableTxnWriter[randomSource.nextInt(2, 10)]; + } + + @Override + boolean considerWritingKey() + { + if (written.size() == writers.length - 1) + return false; + return randomSource.nextBoolean(); + } + + @Override + void switchPartitions() + { + written.clear(); + } + + @Override + void initializeWriter() + { + for (int i = 0; i < writers.length; i++) + { + Descriptor descriptor = cfs.newSSTableDescriptor(cfs.getDirectories().getDirectoryForNewSSTables()); + SerializationHeader header = new SerializationHeader(true, cfs.metadata(), cfs.metadata().regularAndStaticColumns(), EncodingStats.NO_STATS); + writers[i] = SSTableTxnWriter.create(cfs, descriptor, 0, 0, null, false, header); + } + } + + @Override + SSTableTxnWriter writer() + { + for (int i = 0; i < 10_000; i++) + { + SSTableTxnWriter writer = writers[randomSource.nextInt(writers.length)]; + if (written.add(writer)) + return writer; + } + throw new IllegalStateException(String.format("Could not pick an sstable from %s. Written: %s", Arrays.asList(writers), written)); + } + + @Override + void finishAndAddWriter() + { + for (SSTableTxnWriter writer : writers) + { + cfs.addSSTables(writer.finish(true)); + writer.close(); + } + Arrays.fill(writers, null); + } + + @Override + Throwable cleanupWriter(Throwable t) + { + for (SSTableTxnWriter writer : writers) + t = writer.abort(t); + return t; + } +} diff --git a/test/harry/main/org/apache/cassandra/harry/SchemaSpec.java b/test/harry/main/org/apache/cassandra/harry/SchemaSpec.java index dbfa6e5f53b6..b103d82e37b4 100644 --- a/test/harry/main/org/apache/cassandra/harry/SchemaSpec.java +++ b/test/harry/main/org/apache/cassandra/harry/SchemaSpec.java @@ -30,6 +30,7 @@ import org.apache.cassandra.harry.gen.Generators; import org.apache.cassandra.harry.gen.ValueGenerators; import org.apache.cassandra.harry.util.IteratorsUtil; +import org.apache.cassandra.service.consensus.TransactionalMode; import org.apache.cassandra.utils.ByteArrayUtil; import static org.apache.cassandra.harry.gen.InvertibleGenerator.MAX_ENTROPY; @@ -177,6 +178,15 @@ public String compile() shouldAppendAnd = true; } + if (options.transactionalMode() != null) + { + appendWith.run(); + if (shouldAppendAnd) + sb.append(" AND"); + sb.append(" ").append(options.transactionalMode().asCqlParam()); + shouldAppendAnd = true; + } + if (options.disableReadRepair()) { appendWith.run(); @@ -339,6 +349,7 @@ public int hashCode() public interface Options { + TransactionalMode transactionalMode(); boolean addWriteTimestamps(); boolean disableReadRepair(); String compactionStrategy(); @@ -354,6 +365,7 @@ public static OptionsBuilder optionsBuilder() public static class OptionsBuilder implements Options { + private TransactionalMode transactionalMode = null; private boolean addWriteTimestamps = true; private boolean disableReadRepair = false; private String compactionStrategy = null; @@ -365,6 +377,23 @@ private OptionsBuilder() { } + public Options build() + { + return this; + } + + public OptionsBuilder withTransactionalMode(TransactionalMode mode) + { + this.transactionalMode = mode; + return this; + } + + @Override + public TransactionalMode transactionalMode() + { + return transactionalMode; + } + public OptionsBuilder addWriteTimestamps(boolean newValue) { this.addWriteTimestamps = newValue; diff --git a/test/harry/main/org/apache/cassandra/harry/cql/SelectHelper.java b/test/harry/main/org/apache/cassandra/harry/cql/SelectHelper.java index 148711173c69..599a28be17ee 100644 --- a/test/harry/main/org/apache/cassandra/harry/cql/SelectHelper.java +++ b/test/harry/main/org/apache/cassandra/harry/cql/SelectHelper.java @@ -61,8 +61,8 @@ public static CompiledStatement select(Operations.SelectRow select, SchemaSpec s { ColumnSpec column = schema.clusteringKeys.get(i); builder.where(new Symbol(column.name, column.type.asServerType()), - toInequality(Relations.RelationKind.EQ), - new Bind(ck[i], column.type.asServerType())); + toInequality(Relations.RelationKind.EQ), + new Bind(ck[i], column.type.asServerType())); } return toCompiled(builder.build()); @@ -81,15 +81,15 @@ public static CompiledStatement select(Operations.SelectRange select, SchemaSpec if (select.lowerBoundRelation()[i] != null) { builder.where(new Symbol(column.name, column.type.asServerType()), - toInequality(select.lowerBoundRelation()[i]), - new Bind(lowBound[i], column.type.asServerType())); + toInequality(select.lowerBoundRelation()[i]), + new Bind(lowBound[i], column.type.asServerType())); } if (select.upperBoundRelation()[i] != null) { builder.where(new Symbol(column.name, column.type.asServerType()), - toInequality(select.upperBoundRelation()[i]), - new Bind(highBound[i], column.type.asServerType())); + toInequality(select.upperBoundRelation()[i]), + new Bind(highBound[i], column.type.asServerType())); } } @@ -115,8 +115,8 @@ public static CompiledStatement select(Operations.SelectCustom select, SchemaSpe Object[] query = cache.computeIfAbsent(relation.descriptor, schema.valueGenerators.ckGen()::inflate); ColumnSpec column = schema.clusteringKeys.get(relation.column); builder.where(new Symbol(column.name, column.type.asServerType()), - toInequality(relation.kind), - new Bind(query[relation.column], column.type.asServerType())); + toInequality(relation.kind), + new Bind(query[relation.column], column.type.asServerType())); } for (Relations.Relation relation : select.regularRelations()) @@ -133,8 +133,8 @@ public static CompiledStatement select(Operations.SelectCustom select, SchemaSpe Object query = schema.valueGenerators.staticColumnGen(relation.column).inflate(relation.descriptor); ColumnSpec column = schema.staticColumns.get(relation.column); builder.where(new Symbol(column.name, column.type.asServerType()), - toInequality(relation.kind), - new Bind(query, column.type.asServerType())); + toInequality(relation.kind), + new Bind(query, column.type.asServerType())); } if (select.orderBy() == Operations.ClusteringOrderBy.DESC) @@ -197,8 +197,8 @@ public static Select.Builder commmonPart(Operations.SelectStatement select, Sche ColumnSpec column = schema.partitionKeys.get(i); Object value = pk[i]; builder.where(new Symbol(column.name, column.type.asServerType()), - Where.Inequality.EQUAL, - new Bind(value, column.type.asServerType())); + Where.Inequality.EQUAL, + new Bind(value, column.type.asServerType())); } return builder; diff --git a/test/harry/main/org/apache/cassandra/harry/dsl/SingleOperationVisitBuilder.java b/test/harry/main/org/apache/cassandra/harry/dsl/SingleOperationVisitBuilder.java index b4f2ff908bf2..263f15648c62 100644 --- a/test/harry/main/org/apache/cassandra/harry/dsl/SingleOperationVisitBuilder.java +++ b/test/harry/main/org/apache/cassandra/harry/dsl/SingleOperationVisitBuilder.java @@ -461,7 +461,7 @@ public SingleOperationBuilder select(int pdIdx, IdxRelation[] ckIdxRelations, Id Relations.Relation[] ckRelations = new Relations.Relation[ckIdxRelations.length]; for (int i = 0; i < ckRelations.length; i++) { - Invariants.checkState(ckIdxRelations[i].column < valueGenerators.ckColumnCount()); + Invariants.require(ckIdxRelations[i].column < valueGenerators.ckColumnCount()); ckRelations[i] = new Relations.Relation(ckIdxRelations[i].kind, valueGenerators.ckGen().descriptorAt(ckIdxRelations[i].idx), ckIdxRelations[i].column); @@ -470,7 +470,7 @@ public SingleOperationBuilder select(int pdIdx, IdxRelation[] ckIdxRelations, Id Relations.Relation[] regularRelations = new Relations.Relation[regularIdxRelations.length]; for (int i = 0; i < regularRelations.length; i++) { - Invariants.checkState(regularIdxRelations[i].column < valueGenerators.regularColumnCount()); + Invariants.require(regularIdxRelations[i].column < valueGenerators.regularColumnCount()); regularRelations[i] = new Relations.Relation(regularIdxRelations[i].kind, valueGenerators.regularColumnGen(regularIdxRelations[i].column).descriptorAt(regularIdxRelations[i].idx), regularIdxRelations[i].column); @@ -479,7 +479,7 @@ public SingleOperationBuilder select(int pdIdx, IdxRelation[] ckIdxRelations, Id Relations.Relation[] staticRelations = new Relations.Relation[staticIdxRelations.length]; for (int i = 0; i < staticRelations.length; i++) { - Invariants.checkState(staticIdxRelations[i].column < valueGenerators.staticColumnCount()); + Invariants.require(staticIdxRelations[i].column < valueGenerators.staticColumnCount()); staticRelations[i] = new Relations.Relation(staticIdxRelations[i].kind, valueGenerators.staticColumnGen(staticIdxRelations[i].column).descriptorAt(staticIdxRelations[i].idx), staticIdxRelations[i].column); diff --git a/test/harry/main/org/apache/cassandra/harry/execution/CQLTesterVisitExecutor.java b/test/harry/main/org/apache/cassandra/harry/execution/CQLTesterVisitExecutor.java index 0f301d8cb8f0..752ea373793e 100644 --- a/test/harry/main/org/apache/cassandra/harry/execution/CQLTesterVisitExecutor.java +++ b/test/harry/main/org/apache/cassandra/harry/execution/CQLTesterVisitExecutor.java @@ -54,7 +54,7 @@ public List executeWithResult(Visit visit, CompiledStatement state { List actual = new ArrayList<>(); // TODO: Have never tested with multiple - Invariants.checkState(visit.operations.length == 1); + Invariants.require(visit.operations.length == 1); for (UntypedResultSet.Row row : execute.apply(statement)) actual.add(resultSetToRow(schema, (Operations.SelectStatement) visit.operations[0], row)); return actual; @@ -99,7 +99,7 @@ public static ResultSetRow resultSetToRow(SchemaSpec schema, Operations.SelectSt { for (int j = 0; j < schema.clusteringKeys.size(); j++) { - Invariants.checkState(!row.has(schema.clusteringKeys.get(j).name), + Invariants.require(!row.has(schema.clusteringKeys.get(j).name), "All elements of clustering key should have been null"); } clusteringKey = NIL_KEY; diff --git a/test/harry/main/org/apache/cassandra/harry/execution/CQLVisitExecutor.java b/test/harry/main/org/apache/cassandra/harry/execution/CQLVisitExecutor.java index 1922f0011a7e..dc4eb227c422 100644 --- a/test/harry/main/org/apache/cassandra/harry/execution/CQLVisitExecutor.java +++ b/test/harry/main/org/apache/cassandra/harry/execution/CQLVisitExecutor.java @@ -90,9 +90,12 @@ public enum ResultDumpMode public static void replayAfterFailure(Visit visit, CQLVisitExecutor executor, Model.Replay replay) { QueryBuildingVisitExecutor queryBuilder = executor.queryBuilder; - logger.error("Caught an exception at {} while replaying {}\ncluster.schemaChange(\"{}\");\nOperations _for this partition_ up to this visit:", - visit, queryBuilder.compile(visit), - queryBuilder.schema.compile()); + if (!visit.hasCustom) + { + logger.error("Caught an exception at {} while replaying {}\ncluster.schemaChange(\"{}\");\nOperations _for this partition_ up to this visit:", + visit, queryBuilder.compile(visit), + queryBuilder.schema.compile()); + } // Configurable yet hardcoded for a person who is trying to generate repro ResultDumpMode mode = ResultDumpMode.PARTITION; @@ -136,7 +139,7 @@ public void execute(Visit visit) // All operations are not touching any data if (compiledStatement == null) { - Invariants.checkArgument(Arrays.stream(visit.operations).allMatch(op -> op.kind() == Operations.Kind.CUSTOM)); + Invariants.requireArgument(Arrays.stream(visit.operations).allMatch(op -> op.kind() == Operations.Kind.CUSTOM)); return; } @@ -147,7 +150,7 @@ public void execute(Visit visit) } else { - Invariants.checkState(selects.size() == 1); + Invariants.require(selects.size() == 1); executeValidatingVisit(visit, selects, compiledStatement); } dataTracker.end(visit); diff --git a/test/harry/main/org/apache/cassandra/harry/execution/DataTracker.java b/test/harry/main/org/apache/cassandra/harry/execution/DataTracker.java index f2a8c9332fdc..c27d52385e6b 100644 --- a/test/harry/main/org/apache/cassandra/harry/execution/DataTracker.java +++ b/test/harry/main/org/apache/cassandra/harry/execution/DataTracker.java @@ -69,7 +69,7 @@ class SequentialDataTracker implements DataTracker public void begin(Visit visit) { long prev = started.get(); - Invariants.checkState(prev == 0 || visit.lts == (prev + 1)); + Invariants.require(prev == 0 || visit.lts == (prev + 1)); started.set(visit.lts); for (int i = 0; i < visit.operations.length; i++) { @@ -88,8 +88,7 @@ public void begin(Visit visit) public void end(Visit visit) { long current = started.get(); - Invariants.checkState(current == visit.lts, - "Current stated %d, current visit: %d", current, visit.lts); + Invariants.require(current == visit.lts, "Current stated %d, current visit: %d", current, visit.lts); finished.set(visit.lts); } @@ -168,4 +167,4 @@ public boolean allFinished() return started.size() == finished.size(); } } -} \ No newline at end of file +} diff --git a/test/harry/main/org/apache/cassandra/harry/execution/InJvmDTestVisitExecutor.java b/test/harry/main/org/apache/cassandra/harry/execution/InJvmDTestVisitExecutor.java index dca2b7001f6d..282e02505cdf 100644 --- a/test/harry/main/org/apache/cassandra/harry/execution/InJvmDTestVisitExecutor.java +++ b/test/harry/main/org/apache/cassandra/harry/execution/InJvmDTestVisitExecutor.java @@ -103,7 +103,7 @@ protected List executeWithResult(Visit visit, CompiledStatement st protected List executeWithResult(Visit visit, int node, int pageSize, CompiledStatement statement, ConsistencyLevel consistencyLevel) { - Invariants.checkState(visit.operations.length == 1); + Invariants.require(visit.operations.length == 1); Object[][] rows; if (consistencyLevel == ConsistencyLevel.NODE_LOCAL) rows = cluster.get(node).executeInternal(statement.cql(), statement.bindings()); @@ -199,7 +199,7 @@ public static ResultSetRow rowToResultSet(SchemaSpec schema, Operations.SelectSt { for (int j = 0; j < schema.clusteringKeys.size(); j++) { - Invariants.checkState(result[selection.indexOf(schema.clusteringKeys.get(j))] == null, + Invariants.require(result[selection.indexOf(schema.clusteringKeys.get(j))] == null, "All elements of clustering key should have been null"); } clusteringKey = NIL_KEY; @@ -292,9 +292,10 @@ public interface ConsistencyLevelSelector public interface RetryPolicy { RetryPolicy RETRY_ON_TIMEOUT = (t) -> { - return t.getMessage().contains("timed out") || - AssertionUtils.isInstanceof(RequestTimeoutException.class) - .matches(Throwables.getRootCause(t)); + return t.getMessage() != null && + (t.getMessage().contains("timed out") || + AssertionUtils.isInstanceof(RequestTimeoutException.class) + .matches(Throwables.getRootCause(t))); }; RetryPolicy NO_RETRY = (t) -> false; boolean retry(Throwable t); diff --git a/test/harry/main/org/apache/cassandra/harry/execution/QueryBuildingVisitExecutor.java b/test/harry/main/org/apache/cassandra/harry/execution/QueryBuildingVisitExecutor.java index a330007a2a4d..e791f6f12ceb 100644 --- a/test/harry/main/org/apache/cassandra/harry/execution/QueryBuildingVisitExecutor.java +++ b/test/harry/main/org/apache/cassandra/harry/execution/QueryBuildingVisitExecutor.java @@ -72,7 +72,7 @@ public BuiltQuery compile(Visit visit) return query; } - Invariants.checkState(bindings.isEmpty() && visitedPds.isEmpty() && selects.isEmpty()); + Invariants.require(bindings.isEmpty() && visitedPds.isEmpty() && selects.isEmpty()); return null; } @@ -107,7 +107,7 @@ protected void endLts(long lts) { if (statements.isEmpty()) { - Invariants.checkState(bindings.isEmpty() && visitedPds.isEmpty() && selects.isEmpty()); + Invariants.require(bindings.isEmpty() && visitedPds.isEmpty() && selects.isEmpty()); return; } diff --git a/test/harry/main/org/apache/cassandra/harry/execution/RingAwareInJvmDTestVisitExecutor.java b/test/harry/main/org/apache/cassandra/harry/execution/RingAwareInJvmDTestVisitExecutor.java index b9a6e0565d56..5489707bade3 100644 --- a/test/harry/main/org/apache/cassandra/harry/execution/RingAwareInJvmDTestVisitExecutor.java +++ b/test/harry/main/org/apache/cassandra/harry/execution/RingAwareInJvmDTestVisitExecutor.java @@ -119,7 +119,7 @@ protected void executeWithoutResult(Visit visit, CompiledStatement statement) { try { - Invariants.checkState(visit.visitedPartitions.size() == 1, + Invariants.require(visit.visitedPartitions.size() == 1, "Ring aware executor can only read and write one partition at a time"); for (TokenPlacementModel.Replica replica : getReplicasFor(visit.visitedPartitions.iterator().next().longValue())) { diff --git a/test/harry/main/org/apache/cassandra/harry/gen/BijectionCache.java b/test/harry/main/org/apache/cassandra/harry/gen/BijectionCache.java index a388f195e956..d1f9e0db7b52 100644 --- a/test/harry/main/org/apache/cassandra/harry/gen/BijectionCache.java +++ b/test/harry/main/org/apache/cassandra/harry/gen/BijectionCache.java @@ -41,6 +41,8 @@ public BijectionCache(Comparator comparator) @Override public T inflate(long descriptor) { + if (MagicConstants.NIL_DESCR == descriptor) + throw new IllegalArgumentException("Asked for NIL_DESCR"); T value = valueToDescriptor.inverse().get(descriptor); if (value == null) throw new IllegalArgumentException(String.format("Attempted to inflate %d, but it is undefined", descriptor)); @@ -124,6 +126,8 @@ public int byteSize() @Override public int compare(long l, long r) { - throw new UnsupportedOperationException(); + T lhs = inflate(l); + T rhs = inflate(r); + return comparator.compare(lhs, rhs); } } diff --git a/test/unit/accord/utils/WrappedRandomSource.java b/test/harry/main/org/apache/cassandra/harry/gen/EntropyRandomSource.java similarity index 60% rename from test/unit/accord/utils/WrappedRandomSource.java rename to test/harry/main/org/apache/cassandra/harry/gen/EntropyRandomSource.java index 3d02c101fbb8..63f62ab7e22e 100644 --- a/test/unit/accord/utils/WrappedRandomSource.java +++ b/test/harry/main/org/apache/cassandra/harry/gen/EntropyRandomSource.java @@ -16,82 +16,74 @@ * limitations under the License. */ -package accord.utils; +package org.apache.cassandra.harry.gen; -import java.util.Random; +import accord.utils.RandomSource; -class WrappedRandomSource implements RandomSource +public class EntropyRandomSource implements RandomSource { - private final Random random; + private final EntropySource delegate; - WrappedRandomSource(Random random) + public EntropyRandomSource(EntropySource delegate) { - this.random = random; - } - - @Override - public Random asJdkRandom() - { - return random; + this.delegate = delegate; } @Override public void nextBytes(byte[] bytes) { - random.nextBytes(bytes); + for (int i = 0, len = bytes.length; i < len; ) + for (int rnd = nextInt(), + n = Math.min(len - i, Integer.SIZE/Byte.SIZE); + n-- > 0; rnd >>= Byte.SIZE) + bytes[i++] = (byte)rnd; } @Override public boolean nextBoolean() { - return random.nextBoolean(); + return delegate.nextBoolean(); } @Override public int nextInt() { - return random.nextInt(); - } - - @Override - public int nextInt(int maxExclusive) - { - return random.nextInt(maxExclusive); + return delegate.nextInt(); } @Override public long nextLong() { - return random.nextLong(); + return ((long) nextInt() << 32) + nextInt(); } @Override public float nextFloat() { - return random.nextFloat(); + return delegate.nextFloat(); } @Override public double nextDouble() { - return random.nextDouble(); + throw new UnsupportedOperationException("TODO: Implement"); } @Override public double nextGaussian() { - return random.nextGaussian(); + throw new UnsupportedOperationException("TODO: Implement"); } @Override public void setSeed(long seed) { - random.setSeed(seed); + delegate.seed(seed); } @Override public RandomSource fork() { - return new WrappedRandomSource(new Random(nextLong())); + return new EntropyRandomSource(delegate.derive()); } } diff --git a/test/harry/main/org/apache/cassandra/harry/gen/EntropySource.java b/test/harry/main/org/apache/cassandra/harry/gen/EntropySource.java index 0f9308516f68..f72b6119371b 100644 --- a/test/harry/main/org/apache/cassandra/harry/gen/EntropySource.java +++ b/test/harry/main/org/apache/cassandra/harry/gen/EntropySource.java @@ -39,13 +39,23 @@ public interface EntropySource EntropySource derive(); int nextInt(); + + /** + * Generates a long in range [0, max). + */ int nextInt(int max); + + /** + * Generates a long in range [min, max). + */ int nextInt(int min, int max); float nextFloat(); double nextDouble(); /** * Code is adopted from a similar method in JDK 17, and has to be removed as soon as we migrate to JDK 17. + * + * Generates a long in range [min, max). */ default long nextLong(long min, long max) { long ret = next(); diff --git a/test/harry/main/org/apache/cassandra/harry/gen/Generators.java b/test/harry/main/org/apache/cassandra/harry/gen/Generators.java index ca2e5f274982..94825d020b4c 100644 --- a/test/harry/main/org/apache/cassandra/harry/gen/Generators.java +++ b/test/harry/main/org/apache/cassandra/harry/gen/Generators.java @@ -1,20 +1,20 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.apache.cassandra.harry.gen; @@ -135,6 +135,11 @@ public static Generator inetAddr(Generator delegate) } public static Generator bytes(int minSize, int maxSize) + { + return byteArrays(minSize, maxSize).map(ByteBuffer::wrap); + } + + public static Generator byteArrays(int minSize, int maxSize) { return rng -> { int size = rng.nextInt(minSize, maxSize); @@ -144,7 +149,7 @@ public static Generator bytes(int minSize, int maxSize) n = Math.min(size - i, Long.SIZE / Byte.SIZE); n-- > 0; v >>= Byte.SIZE) bytes[i++] = (byte) v; - return ByteBuffer.wrap(bytes); + return bytes; }; } @@ -282,7 +287,7 @@ public T generate(EntropySource rng) { T v = delegate.generate(rng); int hashCode = v.hashCode(); - Invariants.checkState(hashCode != System.identityHashCode(v), "hashCode was not overridden for type %s", v.getClass()); + Invariants.require(hashCode != System.identityHashCode(v), "hashCode was not overridden for type %s", v.getClass()); if (hashCodes.contains(hashCode)) continue; hashCodes.add(hashCode); @@ -460,4 +465,4 @@ public static Generator constant(Supplier constant) { return (random) -> constant.get(); } -} \ No newline at end of file +} diff --git a/test/harry/main/org/apache/cassandra/harry/gen/InvertibleGenerator.java b/test/harry/main/org/apache/cassandra/harry/gen/InvertibleGenerator.java index 8c023e5c8cbd..f16185da375b 100644 --- a/test/harry/main/org/apache/cassandra/harry/gen/InvertibleGenerator.java +++ b/test/harry/main/org/apache/cassandra/harry/gen/InvertibleGenerator.java @@ -74,9 +74,9 @@ public InvertibleGenerator(EntropySource rng, Generator gen, Comparator comparator) { - Invariants.checkState(population > 0, + Invariants.require(population > 0, "Population should be strictly positive %d", population); - Invariants.checkState(Long.compareUnsigned(typeEntropy, 0) > 0, + Invariants.require(Long.compareUnsigned(typeEntropy, 0) > 0, "Type entropy should be strictly positive, but was %d: %s", typeEntropy, gen); // We can / will generate at most that many values @@ -101,7 +101,7 @@ public InvertibleGenerator(EntropySource rng, Object inflated = inflate(candidate); int hash = ArrayUtils.hashCode(inflated); - Invariants.checkState(hash != System.identityHashCode(inflated), "hashCode was not overridden for type %s", inflated.getClass()); + Invariants.require(hash != System.identityHashCode(inflated), "hashCode was not overridden for type %s", inflated.getClass()); if (hashes.add(hash)) allocatedDescriptors.add(candidate); @@ -117,7 +117,7 @@ public InvertibleGenerator(EntropySource rng, for (int i = 1; i < allocatedDescriptors.size(); i++) { T current = inflate(allocatedDescriptors.get(i)); - Invariants.checkState( comparator.compare(current, prev) > 0, + Invariants.require( comparator.compare(current, prev) > 0, () -> String.format("%s should be strictly after %s", prev, current)); } } @@ -138,7 +138,7 @@ public long descriptorAt(int idx) @Override public T inflate(long descriptor) { - Invariants.checkState(!MagicConstants.MAGIC_DESCRIPTOR_VALS.contains(descriptor), + Invariants.require(!MagicConstants.MAGIC_DESCRIPTOR_VALS.contains(descriptor), String.format("Should not be able to inflate %d, as it's magic value", descriptor)); return SeedableEntropySource.computeWithSeed(descriptor, gen::generate); } @@ -158,13 +158,13 @@ public long deflate(T value) { Object[] valueArr = (Object[]) value; Object[] expectedArr = (Object[]) expected; - Invariants.checkState(comparator.compare((T) expected, value) != 0, + Invariants.require(comparator.compare((T) expected, value) != 0, "%s was found: %s", Arrays.toString(expectedArr), Arrays.toString(valueArr)); } else { - Invariants.checkState(comparator.compare((T) expected, value) != 0, + Invariants.require(comparator.compare((T) expected, value) != 0, "%s was found: %s", expected, value); } @@ -179,13 +179,13 @@ public long deflate(T value) Object[] valueArr = (Object[]) value; Object[] expectedArr = (Object[]) expected; - Invariants.checkState(comparator.compare((T) expected, value) == 0, + Invariants.require(comparator.compare((T) expected, value) == 0, "%s != %s", Arrays.toString(expectedArr), Arrays.toString(valueArr)); } else { - Invariants.checkState(comparator.compare((T) expected, value) == 0, + Invariants.require(comparator.compare((T) expected, value) == 0, "%s != %s", expected, value); } @@ -258,4 +258,4 @@ public Comparator descriptorsComparator() descriptorToIdx.put(allocatedDescriptors.get(i), i); return Comparator.comparingInt(descriptorToIdx::get); } -} \ No newline at end of file +} diff --git a/test/unit/accord/utils/DefaultRandom.java b/test/harry/main/org/apache/cassandra/harry/gen/RandomSourceEntropySource.java similarity index 68% rename from test/unit/accord/utils/DefaultRandom.java rename to test/harry/main/org/apache/cassandra/harry/gen/RandomSourceEntropySource.java index b16f1f8bbf34..b1aed196db0a 100644 --- a/test/unit/accord/utils/DefaultRandom.java +++ b/test/harry/main/org/apache/cassandra/harry/gen/RandomSourceEntropySource.java @@ -16,33 +16,35 @@ * limitations under the License. */ -package accord.utils; +package org.apache.cassandra.harry.gen; -import java.util.Random; +import accord.utils.RandomSource; -public class DefaultRandom implements RandomSource +public class RandomSourceEntropySource implements EntropySource { - private final Random delegate; - public DefaultRandom() + private final RandomSource delegate; + + public RandomSourceEntropySource(RandomSource delegate) { - this.delegate = new Random(); + this.delegate = delegate; } - public DefaultRandom(long seed) + @Override + public long next() { - this.delegate = new Random(seed); + return delegate.nextLong(); } @Override - public void nextBytes(byte[] bytes) + public void seed(long seed) { - delegate.nextBytes(bytes); + delegate.setSeed(seed); } @Override - public boolean nextBoolean() + public EntropySource derive() { - return delegate.nextBoolean(); + return new RandomSourceEntropySource(delegate.fork()); } @Override @@ -52,43 +54,32 @@ public int nextInt() } @Override - public long nextLong() + public int nextInt(int max) { - return delegate.nextLong(); + return delegate.nextInt(max); } @Override - public float nextFloat() + public int nextInt(int min, int max) { - return delegate.nextFloat(); + return delegate.nextInt(min, max); } @Override - public double nextDouble() - { - return delegate.nextDouble(); - } - - @Override - public double nextGaussian() + public float nextFloat() { - return delegate.nextGaussian(); + return delegate.nextFloat(); } @Override - public void setSeed(long seed) + public double nextDouble() { - delegate.setSeed(seed); - } - - @Override - public DefaultRandom fork() { - return new DefaultRandom(nextLong()); + return delegate.nextDouble(); } @Override - public Random asJdkRandom() + public boolean nextBoolean() { - return delegate; + return delegate.nextBoolean(); } } diff --git a/test/harry/main/org/apache/cassandra/harry/gen/SchemaGenerators.java b/test/harry/main/org/apache/cassandra/harry/gen/SchemaGenerators.java index c0e0c6e5696b..a3433c5b56d1 100644 --- a/test/harry/main/org/apache/cassandra/harry/gen/SchemaGenerators.java +++ b/test/harry/main/org/apache/cassandra/harry/gen/SchemaGenerators.java @@ -97,6 +97,11 @@ public ColumnSpec generate(EntropySource rng) } public static Generator trivialSchema(String ks, String table, int population) + { + return trivialSchema(ks, table, population, SchemaSpec.optionsBuilder().build()); + } + + public static Generator trivialSchema(String ks, String table, int population, SchemaSpec.Options options) { return (rng) -> { return new SchemaSpec(rng.next(), @@ -105,7 +110,8 @@ public static Generator trivialSchema(String ks, String table, int p Arrays.asList(ColumnSpec.pk("pk1", ColumnSpec.int64Type, Generators.int64())), Arrays.asList(ColumnSpec.ck("ck1", ColumnSpec.int64Type, Generators.int64(), false)), Arrays.asList(ColumnSpec.regularColumn("v1", ColumnSpec.int64Type)), - List.of(ColumnSpec.staticColumn("s1", ColumnSpec.int64Type))); + Arrays.asList(ColumnSpec.staticColumn("s1", ColumnSpec.int64Type)), + options); }; } } diff --git a/test/harry/main/org/apache/cassandra/harry/model/ASTSingleTableModel.java b/test/harry/main/org/apache/cassandra/harry/model/ASTSingleTableModel.java index f6a03bdd1784..a781e10e21d2 100644 --- a/test/harry/main/org/apache/cassandra/harry/model/ASTSingleTableModel.java +++ b/test/harry/main/org/apache/cassandra/harry/model/ASTSingleTableModel.java @@ -24,8 +24,10 @@ import java.util.BitSet; import java.util.Collection; import java.util.Collections; +import java.util.EnumSet; import java.util.HashMap; import java.util.HashSet; +import java.util.LinkedHashSet; import java.util.List; import java.util.Map; import java.util.NavigableSet; @@ -33,21 +35,23 @@ import java.util.Optional; import java.util.Set; import java.util.TreeMap; +import java.util.function.Function; import java.util.function.IntFunction; import java.util.stream.Collectors; import java.util.stream.Stream; import javax.annotation.Nullable; import com.google.common.base.Preconditions; -import com.google.common.collect.Iterables; import com.google.common.collect.Maps; import com.google.common.collect.Sets; + import accord.utils.Invariants; +import org.apache.cassandra.cql3.KnownIssue; import org.apache.cassandra.cql3.ast.AssignmentOperator; import org.apache.cassandra.cql3.ast.CasCondition; -import org.apache.cassandra.cql3.ast.Conditional; import org.apache.cassandra.cql3.ast.Conditional.Where.Inequality; +import org.apache.cassandra.cql3.ast.Conditional; import org.apache.cassandra.cql3.ast.Element; import org.apache.cassandra.cql3.ast.Expression; import org.apache.cassandra.cql3.ast.ExpressionEvaluator; @@ -65,7 +69,10 @@ import org.apache.cassandra.db.BufferClustering; import org.apache.cassandra.db.Clustering; import org.apache.cassandra.db.marshal.AbstractType; +import org.apache.cassandra.db.marshal.BooleanType; import org.apache.cassandra.db.marshal.Int32Type; +import org.apache.cassandra.db.marshal.LongType; +import org.apache.cassandra.dht.Murmur3Partitioner; import org.apache.cassandra.dht.Token; import org.apache.cassandra.harry.model.BytesPartitionState.PrimaryKey; import org.apache.cassandra.harry.util.StringUtils; @@ -75,18 +82,32 @@ import org.apache.cassandra.utils.ImmutableUniqueList; import org.apache.cassandra.utils.Pair; +import static org.apache.cassandra.cql3.ast.Elements.symbols; import static org.apache.cassandra.harry.model.BytesPartitionState.asCQL; public class ASTSingleTableModel { private static final ByteBuffer[][] NO_ROWS = new ByteBuffer[0][]; + private static final Symbol CAS_APPLIED = new Symbol.UnquotedSymbol("[applied]", BooleanType.instance); + private static final ImmutableUniqueList CAS_APPLIED_COLUMNS = ImmutableUniqueList.builder().add(CAS_APPLIED).build(); + private static final ByteBuffer[][] CAS_SUCCESS_RESULT = new ByteBuffer[][] { new ByteBuffer[] {BooleanType.instance.decompose(true)} }; + private static final ByteBuffer FALSE = BooleanType.instance.decompose(false); + private static final ByteBuffer[][] CAS_REJECTION_RESULT = new ByteBuffer[][] { new ByteBuffer[] {FALSE} }; public final BytesPartitionState.Factory factory; + private final EnumSet ignoredIssues; private final TreeMap partitions = new TreeMap<>(); + private long numMutations = 0; public ASTSingleTableModel(TableMetadata metadata) + { + this(metadata, EnumSet.noneOf(KnownIssue.class)); + } + + public ASTSingleTableModel(TableMetadata metadata, EnumSet ignoredIssues) { this.factory = new BytesPartitionState.Factory(metadata); + this.ignoredIssues = Objects.requireNonNull(ignoredIssues); } public NavigableSet partitionKeys() @@ -191,6 +212,212 @@ private void indexRowColumn(TreeMap> index, boolean public void update(Mutation mutation) { if (!shouldApply(mutation)) return; + updateInternal(mutation); + } + + public void updateAndValidate(ByteBuffer[][] actual, Mutation mutation) + { + if (!shouldApply(mutation)) + { + if (mutation.isCas()) + validateCasNotApplied(actual, mutation); + return; + } + if (mutation.isCas()) + validate(CAS_APPLIED_COLUMNS, actual, CAS_SUCCESS_RESULT); + updateInternal(mutation); + } + + private void validateCasNotApplied(ByteBuffer[][] actual, Mutation mutation) + { + // see org.apache.cassandra.cql3.statements.ModificationStatement.buildCasFailureResultSet + var condition = mutation.casCondition().get(); + var partition = partitions.get(referencePartition(mutation)); + var cd = cdOrNull(mutation); + BytesPartitionState.Row row = partition == null ? null : partition.get(cd); + boolean touchesStaticColumns = !factory.staticColumns.isEmpty() + && symbols(mutation).anyMatch(factory.staticColumns::contains); + ImmutableUniqueList columns; + ByteBuffer[][] expected; + if (partition == null) + { + columns = CAS_APPLIED_COLUMNS; + expected = CAS_REJECTION_RESULT; + } + else if (condition instanceof CasCondition.IfCondition) + { + if (touchesStaticColumns + && cd == null + && ignoredIssues.contains(KnownIssue.CAS_ON_STATIC_ROW)) + { + if (casOnStaticRowCouldReturnData(partition)) + { + // if the static row exists, we can match the col condition + // if the static row doesn't exist, and there are rows, then we can return null + List conditionReferencedColumns = conditionReferencedColumns(mutation); + columns = ImmutableUniqueList.builder(conditionReferencedColumns.size() + 1) + .add(CAS_APPLIED) + .addAll(conditionReferencedColumns) + .build(); + ByteBuffer[] result = getRowAsByteBuffer(columns, partition, row); + result[0] = FALSE; + + expected = new ByteBuffer[][]{ result }; + } + else + { + // static/row don't exist, so can't return a current state + columns = CAS_APPLIED_COLUMNS; + expected = CAS_REJECTION_RESULT; + } + } + else if (partition.staticRow().isEmpty() + && (cd == null || row == null)) + { + // static/row don't exist, so can't return a current state + columns = CAS_APPLIED_COLUMNS; + expected = CAS_REJECTION_RESULT; + } + else + { + List conditionReferencedColumns = conditionReferencedColumns(mutation); + columns = ImmutableUniqueList.builder(conditionReferencedColumns.size() + 1) + .add(CAS_APPLIED) + .addAll(conditionReferencedColumns) + .build(); + ByteBuffer[] result = getRowAsByteBuffer(columns, partition, row); + result[0] = FALSE; + + expected = new ByteBuffer[][]{ result }; + } + } + else if (condition == CasCondition.Simple.Exists) + { + if (touchesStaticColumns + && cd == null + && ignoredIssues.contains(KnownIssue.CAS_ON_STATIC_ROW)) + { + if (casOnStaticRowCouldReturnData(partition)) + { + if (!partition.rows().isEmpty()) + row = partition.rows().get(0); + // Partition level IF EXISTS checks if the static row exists (which is defined as notEmpty), so its known that the static row is empty! + // One would expect that the DELETE just returns [[applied]] but it actually returns a row... but we are not working with rows, we are working with partitions... + // This is a leaky implementation detail! Checking for the partition to exist is the following ReadCommand: + // SELECT s0, s1 WHERE pk = ? LIMIT 1 + // this doesn't include the row columns, only the static columns... but the LIMIT returned a row and not + // the static row (because the static row is empty)! + columns = ImmutableUniqueList.builder(factory.selectionOrder.size() + 1) + .add(CAS_APPLIED) + .addAll(factory.selectionOrder) + .build(); + ByteBuffer[] result = getRowAsByteBuffer(columns, partition, row); + result[0] = FALSE; + if (row != null) + { + for (var c : factory.regularColumns) + // null out the row columns.... + result[columns.indexOf(c)] = null; + } + + expected = new ByteBuffer[][]{ result }; + } + else + { + // static/row don't exist, so can't return a current state + columns = CAS_APPLIED_COLUMNS; + expected = CAS_REJECTION_RESULT; + } + } + else if (!touchesStaticColumns || partition.staticRow().isEmpty()) + { + columns = CAS_APPLIED_COLUMNS; + expected = CAS_REJECTION_RESULT; + } + else + { + columns = ImmutableUniqueList.builder(factory.selectionOrder.size() + 1) + .add(CAS_APPLIED) + .addAll(factory.selectionOrder) + .build(); + ByteBuffer[] result = getRowAsByteBuffer(columns, partition, row); + result[0] = FALSE; + + expected = new ByteBuffer[][]{ result }; + } + } + else if (condition == CasCondition.Simple.NotExists) + { + if (touchesStaticColumns + && cd == null + && ignoredIssues.contains(KnownIssue.CAS_ON_STATIC_ROW) + && !partition.rows().isEmpty()) + row = partition.rows().get(0); + columns = ImmutableUniqueList.builder(factory.selectionOrder.size() + 1) + .add(CAS_APPLIED) + .addAll(factory.selectionOrder) + .build(); + ByteBuffer[] result = getRowAsByteBuffer(columns, partition, row); + result[0] = FALSE; + if (!touchesStaticColumns) + { + for (var s : factory.staticColumns) + result[columns.indexOf(s)] = null; + } + + if (cd == null + && ignoredIssues.contains(KnownIssue.CAS_ON_STATIC_ROW) + && row != null) + { + for (var c : factory.regularColumns) + // null out the row columns.... + result[columns.indexOf(c)] = null; + } + + expected = new ByteBuffer[][]{ result }; + } + else + { + throw new AssertionError(); + } + validate(columns, actual, expected); + } + + private static boolean casOnStaticRowCouldReturnData(BytesPartitionState partition) + { + return !partition.staticRow().isEmpty() + || !partition.rows().isEmpty(); + } + private List conditionReferencedColumns(Mutation mutation) + { + //TODO (correctness): does ast.AND support the correct "order" as seen from CAS? + LinkedHashSet regularCols = null, staticCols = null; + for (var c : (Iterable) () -> symbols(mutation.casCondition().get()).distinct().iterator()) + { + if (factory.staticColumns.contains(c)) + { + if (staticCols == null) + staticCols = new LinkedHashSet<>(); + staticCols.add(c); + } + else + { + if (regularCols == null) + regularCols = new LinkedHashSet<>(); + regularCols.add(c); + } + } + List ordered = new ArrayList<>(); + if (regularCols != null) + ordered.addAll(regularCols); + if (staticCols != null) + ordered.addAll(staticCols); + return ordered; + } + + private void updateInternal(Mutation mutation) + { + numMutations++; switch (mutation.kind) { case INSERT: @@ -209,6 +436,7 @@ public void update(Mutation mutation) private void update(Mutation.Insert insert) { + long nowTs = insert.timestampOrDefault(numMutations); Clustering pd = pd(insert); BytesPartitionState partition = partitions.get(factory.createRef(pd)); if (partition == null) @@ -219,25 +447,25 @@ private void update(Mutation.Insert insert) Map values = insert.values; if (!factory.staticColumns.isEmpty() && !Sets.intersection(factory.staticColumns.asSet(), values.keySet()).isEmpty()) { - // static columns to add in. If we are doing something like += to a row that doesn't exist, we still update statics... - Map write = new HashMap<>(); - for (Symbol col : Sets.intersection(factory.staticColumns.asSet(), values.keySet())) - write.put(col, eval(values.get(col))); - partition.setStaticColumns(write); + maybeUpdateColumns(Sets.intersection(factory.staticColumns.asSet(), values.keySet()), + partition.staticRow(), + nowTs, values, + partition::setStaticColumns); } // table has clustering but non are in the write, so only pk/static can be updated if (!factory.clusteringColumns.isEmpty() && Sets.intersection(factory.clusteringColumns.asSet(), values.keySet()).isEmpty()) return; - Map write = new HashMap<>(); - for (Symbol col : Sets.intersection(factory.regularColumns.asSet(), values.keySet())) - write.put(col, eval(values.get(col))); - partition.setColumns(key(insert.values, factory.clusteringColumns), - write, - true); + BytesPartitionState finalPartition = partition; + var cd = key(insert.values, factory.clusteringColumns); + maybeUpdateColumns(Sets.intersection(factory.regularColumns.asSet(), values.keySet()), + partition.get(cd), + nowTs, values, + (ts, write) -> finalPartition.setColumns(cd, ts, write, true)); } private void update(Mutation.Update update) { + long nowTs = update.timestampOrDefault(numMutations); var split = splitOnPartition(update.where.simplify()); List> pks = split.left; List remaining = split.right; @@ -252,43 +480,30 @@ private void update(Mutation.Update update) Map set = update.set; if (!factory.staticColumns.isEmpty() && !Sets.intersection(factory.staticColumns.asSet(), set.keySet()).isEmpty()) { - // static columns to add in. If we are doing something like += to a row that doesn't exist, we still update statics... - Map write = new HashMap<>(); - for (Symbol col : Sets.intersection(factory.staticColumns.asSet(), set.keySet())) - { - ByteBuffer current = partition.staticRow().get(col); - EvalResult result = eval(col, current, set.get(col)); - if (result.kind == EvalResult.Kind.SKIP) continue; - write.put(col, result.value); - } - if (!write.isEmpty()) - partition.setStaticColumns(write); + maybeUpdateColumns(Sets.intersection(factory.staticColumns.asSet(), set.keySet()), + partition.staticRow(), + nowTs, set, + partition::setStaticColumns); } // table has clustering but non are in the write, so only pk/static can be updated if (!factory.clusteringColumns.isEmpty() && remaining.isEmpty()) return; + BytesPartitionState finalPartition = partition; for (Clustering cd : clustering(remaining)) { - Map write = new HashMap<>(); - for (Symbol col : Sets.intersection(factory.regularColumns.asSet(), set.keySet())) - { - ByteBuffer current = partition.get(cd, col); - EvalResult result = eval(col, current, set.get(col)); - if (result.kind == EvalResult.Kind.SKIP) continue; - write.put(col, result.value); - } - - if (!write.isEmpty()) - partition.setColumns(cd, write, false); + maybeUpdateColumns(Sets.intersection(factory.regularColumns.asSet(), set.keySet()), + partition.get(cd), + nowTs, set, + (ts, write) -> finalPartition.setColumns(cd, ts, write, false)); } } } private enum DeleteKind {PARTITION, ROW, COLUMN} - private void update(Mutation.Delete delete) { + long nowTs = delete.timestampOrDefault(numMutations); //TODO (coverage): range deletes var split = splitOnPartition(delete.where.simplify()); List> pks = split.left; @@ -313,7 +528,7 @@ else if (!clusterings.isEmpty()) case ROW: for (Clustering cd : clusterings) { - partition.deleteRow(cd); + partition.deleteRow(cd, nowTs); if (partition.shouldDelete()) partitions.remove(partition.ref()); } @@ -321,7 +536,7 @@ else if (!clusterings.isEmpty()) case COLUMN: if (clusterings.isEmpty()) { - partition.deleteStaticColumns(columns); + partition.deleteStaticColumns(nowTs, columns); if (partition.shouldDelete()) partitions.remove(partition.ref()); } @@ -329,7 +544,7 @@ else if (!clusterings.isEmpty()) { for (Clustering cd : clusterings) { - partition.deleteColumns(cd, columns); + partition.deleteColumns(cd, nowTs, columns); if (partition.shouldDelete()) partitions.remove(partition.ref()); } @@ -341,54 +556,68 @@ else if (!clusterings.isEmpty()) } } + private static void maybeUpdateColumns(Set columns, + @Nullable BytesPartitionState.Row row, + long nowTs, Map set, + ColumnUpdate update) + { + if (columns.isEmpty()) + { + update.update(nowTs, Collections.emptyMap()); + return; + } + // static columns to add in. If we are doing something like += to a row that doesn't exist, we still update statics... + Map write = new HashMap<>(); + for (Symbol col : columns) + { + ByteBuffer current = row == null ? null : row.get(col); + EvalResult result = eval(col, current, set.get(col)); + if (result.kind == EvalResult.Kind.SKIP) continue; + write.put(col, result.value); + } + if (!write.isEmpty()) + update.update(nowTs, write); + } + public boolean shouldApply(Mutation mutation) { if (!mutation.isCas()) return true; return shouldApply(mutation, selectPartitionForCAS(mutation)); } - private SelectResult selectPartitionForCAS(Mutation mutation) + private CasContext selectPartitionForCAS(Mutation mutation) { - var partition = partitions.get(factory.createRef(pd(mutation))); - if (partition == null) return SelectResult.ordered(factory.selectionOrder, NO_ROWS); - - var cd = cdOrNull(mutation); - var row = cd == null ? null : partition.get(cd); - ImmutableUniqueList columns = cd != null ? factory.selectionOrder : factory.partitionAndStaticColumns; - return SelectResult.ordered(columns, new ByteBuffer[][] { getRowAsByteBuffer(columns, partition, row)}); + BytesPartitionState.Ref ref = referencePartition(mutation); + Clustering cd = cdOrNull(mutation); + BytesPartitionState partition = partitions.get(ref); + return new CasContext(ref, cd, partition); } - private boolean shouldApply(Mutation mutation, SelectResult current) + private boolean shouldApply(Mutation mutation, CasContext ctx) { Preconditions.checkArgument(mutation.isCas()); // process condition - CasCondition condition; - switch (mutation.kind) - { - case INSERT: - condition = CasCondition.Simple.NotExists; - break; - case UPDATE: - condition = ((Mutation.Update) mutation).casCondition.get(); - break; - case DELETE: - condition = ((Mutation.Delete) mutation).casCondition.get(); - break; - default: - throw new UnsupportedOperationException(mutation.kind.name()); - } + CasCondition condition = mutation.casCondition().get(); + boolean partitionOrRow = ctx.clustering == null; + boolean partitionKnown = ctx.partition != null; + BytesPartitionState.Row row = partitionKnown && !partitionOrRow + ? ctx.partition.get(ctx.clustering) + : null; if (condition instanceof CasCondition.Simple) { - boolean hasPartition = current.rows.length > 0; - boolean partitionOrRow = current.columns.equals(factory.partitionAndStaticColumns); - boolean hasRow = partitionOrRow ? hasPartition : current.isAllDefined(factory.clusteringColumns); + if (partitionOrRow && factory.staticColumns.isEmpty()) + throw new AssertionError("Attempted to create a EXISTS condition on partition without static columns; " + mutation.toCQL()); + // CAS's definition of partition EXISTS isn't based off the partition existing, its based off the static row + // existing (aka at least 1 static column exists and is not null). + boolean hasPartition = partitionKnown && !ctx.partition.staticRow().isEmpty(); + boolean hasRow = row != null; // don't do !isEmpty here as liveness dictates the existence of a row. If you INSERT a row then delete all its columns, it still exists! var simple = (CasCondition.Simple) condition; switch (simple) { case Exists: - return hasRow; + return partitionOrRow ? hasPartition : hasRow; case NotExists: - return !hasRow; + return partitionOrRow ? !hasPartition : !hasRow; default: throw new UnsupportedOperationException(simple.name()); } @@ -396,6 +625,11 @@ private boolean shouldApply(Mutation mutation, SelectResult current) var ifCondition = (CasCondition.IfCondition) condition; String letRow = "row"; Symbol rowSymbol = Symbol.unknownType(letRow); + ImmutableUniqueList columns = partitionOrRow ? factory.partitionAndStaticColumns : factory.selectionOrder; + SelectResult current = SelectResult.ordered(columns, + partitionKnown + ? new ByteBuffer[][] { getRowAsByteBuffer(columns, ctx.partition, row)} + : NO_ROWS); Map lets = Map.of(letRow, current); // point the columns to be row.column that way it matches LET clause in BEGIN TRANSACTION, allowing better reuse var updatedCondition = ifCondition.conditional.visit(new Visitor() @@ -410,6 +644,11 @@ public ReferenceExpression visit(ReferenceExpression r) return process(updatedCondition, lets); } + public BytesPartitionState.Ref referencePartition(Mutation mutation) + { + return factory.createRef(pd(mutation)); + } + private boolean process(Conditional condition, Map lets) { if (condition.getClass() == Conditional.Is.class) @@ -531,7 +770,7 @@ private Pair>, List> splitOnClustering( private Pair>, List> splitOn(ImmutableUniqueList.AsSet columns, List conditionals) { // pk requires equality - Map> pks = new HashMap<>(); + Map> pks = new HashMap<>(); List other = new ArrayList<>(); for (Conditional c : conditionals) { @@ -544,7 +783,7 @@ private Pair>, List> splitOn(ImmutableU ByteBuffer bb = eval(w.rhs); if (pks.containsKey(col)) throw new IllegalArgumentException("Partition column " + col + " was defined multiple times in the WHERE clause"); - pks.put(col, Collections.singleton(bb)); + pks.put(col, Collections.singletonList(bb)); } else { @@ -559,8 +798,8 @@ else if (c instanceof Conditional.In) Symbol col = (Symbol) i.ref; if (pks.containsKey(col)) throw new IllegalArgumentException("Partition column " + col + " was defined multiple times in the WHERE clause"); - var set = i.expressions.stream().map(ASTSingleTableModel::eval).collect(Collectors.toSet()); - pks.put(col, set); + var list = i.expressions.stream().map(ASTSingleTableModel::eval).collect(Collectors.toList()); + pks.put(col, list); } else { @@ -582,19 +821,51 @@ else if (c instanceof Conditional.In) return Pair.create(partitionKeys, other); } - private List> keys(Collection columns, Map> pks) + private static ImmutableUniqueList> keys(Collection columns, Map> columnValues) { - //TODO (coverage): handle IN - ByteBuffer[] bbs = new ByteBuffer[columns.size()]; + return keys(columns, columnValues, Function.identity()); + } + + private static ImmutableUniqueList> keys(Map> values, Collection columns) + { + return keys(columns, values, ASTSingleTableModel::eval); + } + + private static ImmutableUniqueList> keys(Collection columns, + Map> columnValues, + Function eval) + { + if (columns.isEmpty()) return ImmutableUniqueList.empty(); + List current = new ArrayList<>(); + current.add(new ByteBuffer[columns.size()]); int idx = 0; - for (Symbol s : columns) + for (Symbol symbol : columns) { - Set values = pks.get(s); - if (values.size() > 1) - throw new UnsupportedOperationException("IN clause is currently unsupported... its on the backlog!"); - bbs[idx++] = Iterables.getFirst(values, null); + int position = idx++; + List expressions = columnValues.get(symbol); + ByteBuffer firstBB = eval.apply(expressions.get(0)); + current.forEach(bbs -> bbs[position] = firstBB); + if (expressions.size() > 1) + { + // this has a multiplying effect... if there is 1 row and there are 2 expressions, then we have 2 rows + // if there are 2 rows and 2 expressions, we have 4 rows... and so on... + List copy = new ArrayList<>(current); + for (int i = 1; i < expressions.size(); i++) + { + ByteBuffer bb = eval.apply(expressions.get(i)); + for (ByteBuffer[] bbs : copy) + { + bbs = bbs.clone(); + bbs[position] = bb; + current.add(bbs); + } + } + } } - return Collections.singletonList(BufferClustering.make(bbs)); + var builder = ImmutableUniqueList.>builder(); + for (var row : current) + builder.add(new BufferClustering(row)); + return builder.build(); } private Clustering pd(Mutation mutation) @@ -683,6 +954,18 @@ public List getByToken(Token token) public void validate(ByteBuffer[][] actual, Select select) { + if (select.source.isEmpty()) + throw new AssertionError("SELECT without a FROM only allowed in a BEGIN TRANSACTION"); + { + var ref = select.source.get(); + if (ref.keyspace.isPresent()) + { + if (!factory.metadata.keyspace.equals(ref.keyspace.get())) + throw new AssertionError("Incorrect keyspace: expected " + factory.metadata.keyspace + " but given " + ref.keyspace.get()); + } + if (!factory.metadata.name.equals(ref.name)) + throw new AssertionError("Incorrect table: expected " + factory.metadata.name + " but given " + ref.name); + } SelectResult results = getRowsAsByteBuffer(select); try { @@ -692,7 +975,7 @@ public void validate(ByteBuffer[][] actual, Select select) } else { - validate(actual, results.rows); + validate(results.columns, actual, results.rows); } } catch (AssertionError e) @@ -704,13 +987,19 @@ public void validate(ByteBuffer[][] actual, Select select) } } - public void validate(ByteBuffer[][] actual, ByteBuffer[][] expected) - { - validate(factory.selectionOrder, actual, expected); - } - private static void validate(ImmutableUniqueList columns, ByteBuffer[][] actual, ByteBuffer[][] expected) { + int expectedLength = columns.size(); + for (var a : actual) + { + if (a.length != expectedLength) + throw new AssertionError("actual rows do not match the schema " + columns + "; found " + Arrays.toString(a)); + } + for (var e : expected) + { + if (e.length != expectedLength) + throw new AssertionError("expected rows do not match the schema " + columns + "; found " + Arrays.toString(e)); + } // check any order validateAnyOrder(columns, toRow(columns, actual), toRow(columns, expected)); // all rows match, but are they in the right order? @@ -722,27 +1011,9 @@ private static void validateAnyOrder(ImmutableUniqueList columns, Set toRow(ImmutableUniqueList columns, ByteBuffer[][ return set; } + private static class CasContext + { + private final BytesPartitionState.Ref ref; + @Nullable + private final Clustering clustering; + @Nullable + private final BytesPartitionState partition; + + private CasContext(BytesPartitionState.Ref ref, @Nullable Clustering clustering, @Nullable BytesPartitionState partition) + { + this.ref = ref; + this.clustering = clustering; + this.partition = partition; + } + } + private static class SelectResult { private final ImmutableUniqueList columns; @@ -885,17 +1201,47 @@ public boolean isAllDefined(ImmutableUniqueList selectColumns) } } - public ImmutableUniqueList columns(Select select) + private ImmutableUniqueList columns(Select select) { if (select.selections.isEmpty()) return factory.selectionOrder; - throw new UnsupportedOperationException("Getting columns from select other than SELECT * is currently not supported"); + var builder = ImmutableUniqueList.builder(); + for (var e : select.selections) + { + if (!(e instanceof Symbol)) + throw new UnsupportedOperationException("Only column selection is currently supported"); + builder.add((Symbol) e); + } + return builder.build(); + } + + private static ByteBuffer[][] filter(ByteBuffer[][] rows, ImmutableUniqueList actualOrder, ImmutableUniqueList targetOrder) + { + if (actualOrder.equals(targetOrder)) return rows; + if (rows.length == 0) return rows; + if (!actualOrder.containsAll(targetOrder)) + throw new UnsupportedOperationException("Only column selection is currently supported"); + ByteBuffer[][] result = new ByteBuffer[rows.length][]; + for (int i = 0; i < rows.length; i++) + { + ByteBuffer[] actual = rows[i]; + ByteBuffer[] target = new ByteBuffer[targetOrder.size()]; + for (int j = 0; j < targetOrder.size(); j++) + { + Symbol col = targetOrder.get(j); + int actualIndex = actualOrder.indexOf(col); + target[j] = actual[actualIndex]; + } + result[i] = target; + } + return result; } private SelectResult getRowsAsByteBuffer(Select select) { - ImmutableUniqueList columns = columns(select); + ImmutableUniqueList selectOrder = factory.selectionOrder; + ImmutableUniqueList targetOrder = columns(select); if (select.where.isEmpty()) - return SelectResult.ordered(columns, getRowsAsByteBuffer(applyLimits(all(), select.perPartitionLimit, select.limit))); + return SelectResult.ordered(targetOrder, filter(getRowsAsByteBuffer(applyLimits(all(), select.perPartitionLimit, select.limit)), selectOrder, targetOrder)); LookupContext ctx = context(select); List primaryKeys; if (ctx.unmatchable) @@ -923,7 +1269,7 @@ else if (ctx.tokenLowerBound != null || ctx.tokenUpperBound != null) } primaryKeys = applyLimits(primaryKeys, select.perPartitionLimit, select.limit); //TODO (correctness): now that we have the rows we need to handle the selections/aggregation/limit/group-by/etc. - return new SelectResult(columns, getRowsAsByteBuffer(primaryKeys), ctx.unordered); + return new SelectResult(targetOrder, filter(getRowsAsByteBuffer(primaryKeys), selectOrder, targetOrder), ctx.unordered); } private List applyLimits(List primaryKeys, Optional perPartitionLimitOpt, Optional limitOpt) @@ -1092,7 +1438,7 @@ private List getByTokenSearch(@Nullable TokenCondition toke NavigableSet keys = partitions.navigableKeySet(); // To support the case where 2+ keys share the same token, need to create a token ref before and after the token, to make sure // the head/tail sets find the matches correctly - if (tokenLowerBound != null) + if (tokenLowerBound != null && !tokenLowerBound.token.isMinimum()) { boolean inclusive; switch (tokenLowerBound.inequality) @@ -1110,7 +1456,7 @@ private List getByTokenSearch(@Nullable TokenCondition toke // when inclusive=false the ref should be after the token, that way they are excluded keys = keys.tailSet(factory.createRef(tokenLowerBound.token, !inclusive), inclusive); } - if (tokenUpperBound != null) + if (tokenUpperBound != null && !tokenUpperBound.token.isMinimum()) { boolean inclusive; switch (tokenUpperBound.inequality) @@ -1199,37 +1545,6 @@ private Clustering key(Map values, ImmutableUniq return keys.get(0); } - private List> keys(Map> values, ImmutableUniqueList columns) - { - if (columns.isEmpty()) return Collections.singletonList(Clustering.EMPTY); - List current = new ArrayList<>(); - current.add(new ByteBuffer[columns.size()]); - for (Symbol symbol : columns) - { - int position = columns.indexOf(symbol); - List expressions = values.get(symbol); - ByteBuffer firstBB = eval(expressions.get(0)); - current.forEach(bbs -> bbs[position] = firstBB); - if (expressions.size() > 1) - { - // this has a multiplying effect... if there is 1 row and there are 2 expressions, then we have 2 rows - // if there are 2 rows and 2 expressions, we have 4 rows... and so on... - List copy = new ArrayList<>(current); - for (int i = 1; i < expressions.size(); i++) - { - ByteBuffer bb = eval(expressions.get(i)); - for (ByteBuffer[] bbs : copy) - { - bbs = bbs.clone(); - bbs[position] = bb; - current.add(bbs); - } - } - } - } - return current.stream().map(BufferClustering::new).collect(Collectors.toList()); - } - private static class EvalResult { private static final EvalResult SKIP = new EvalResult(Kind.SKIP, null); @@ -1287,6 +1602,30 @@ private static ByteBuffer eval(Expression e) return ExpressionEvaluator.evalEncoded(e); } + private BytesPartitionState.Ref processToken(Expression e) + { + BytesPartitionState.Ref ref; + if (e instanceof FunctionCall) + { + FunctionCall rhs = (FunctionCall) e; + List pkValues = rhs.arguments.stream().map(ASTSingleTableModel::eval).collect(Collectors.toList()); + ref = factory.createRef(new BufferClustering(pkValues.toArray(ByteBuffer[]::new))); + } + else if (e instanceof Value) + { + var value = (Value) e; + if (value.type() != LongType.instance) + throw new AssertionError("Token values only expected to be bigint but given " + value.type().asCQL3Type()); + var token = new Murmur3Partitioner.LongToken(LongType.instance.compose(value.valueEncoded())); + ref = factory.createRef(token, true); // should this be false? + } + else + { + throw new UnsupportedOperationException(e.getClass().toString()); + } + return ref; + } + private static class Row { private static final Row EMPTY = new Row(ImmutableUniqueList.empty(), ByteBufferUtil.EMPTY_ARRAY); @@ -1393,12 +1732,28 @@ private LookupContext(Select select) maybeNormalizeTokenBounds(); } + private LookupContext(Mutation mutation) + { + if (mutation.kind == Mutation.Kind.INSERT) + { + var insert = mutation.asInsert(); + for (var e : insert.values.entrySet()) + eq.put(e.getKey(), Collections.singletonList(e.getValue())); + } + else + { + addConditional(mutation.kind == Mutation.Kind.UPDATE + ? mutation.asUpdate().where + : mutation.asDelete().where); + } + } + private void maybeNormalizeTokenBounds() { if (tokenLowerBound != null && tokenUpperBound != null) { int rc = tokenLowerBound.token.compareTo(tokenUpperBound.token); - if (rc > 0) + if (rc > 0 && !tokenUpperBound.token.isMinimum()) { // where token > 10 and < 0.... nothing matches that! unmatchable = true; @@ -1454,9 +1809,7 @@ else if (w.lhs instanceof FunctionCall) switch (fn.name()) { case "token": - FunctionCall rhs = (FunctionCall) w.rhs; - List pkValues = rhs.arguments.stream().map(ASTSingleTableModel::eval).collect(Collectors.toList()); - BytesPartitionState.Ref ref = factory.createRef(new BufferClustering(pkValues.toArray(ByteBuffer[]::new))); + BytesPartitionState.Ref ref = processToken(w.rhs); switch (w.kind) { case EQUAL: @@ -1552,17 +1905,14 @@ else if (between.ref instanceof FunctionCall) { case "token": // if the ref is a token, the only valid start/end are also token - List start = ((FunctionCall) between.start).arguments.stream().map(ASTSingleTableModel::eval).collect(Collectors.toList()); - Token startToken = factory.createRef(new BufferClustering(start.toArray(ByteBuffer[]::new))).token; - - List end = ((FunctionCall) between.end).arguments.stream().map(ASTSingleTableModel::eval).collect(Collectors.toList()); - Token endToken = factory.createRef(new BufferClustering(end.toArray(ByteBuffer[]::new))).token; + Token startToken = processToken(between.start).token; + Token endToken = processToken(between.end).token; if (startToken.equals(endToken)) { token = startToken; } - else if (startToken.compareTo(endToken) > 0) + else if (startToken.compareTo(endToken) > 0 && !endToken.isMinimum()) { // start is larger than end... no matches unmatchable = true; @@ -1685,4 +2035,9 @@ private TokenCondition(Inequality inequality, Token token) this.token = token; } } + + private interface ColumnUpdate + { + void update(long nowTs, Map write); + } } diff --git a/test/harry/main/org/apache/cassandra/harry/model/ASTSingleTableModelTest.java b/test/harry/main/org/apache/cassandra/harry/model/ASTSingleTableModelTest.java index a04425f82722..3bdf742ce84d 100644 --- a/test/harry/main/org/apache/cassandra/harry/model/ASTSingleTableModelTest.java +++ b/test/harry/main/org/apache/cassandra/harry/model/ASTSingleTableModelTest.java @@ -651,17 +651,102 @@ public void assignmentOperatorMultiCellCollections() model.validate(rows(row(metadata, 0, List.of(42, 42), Set.of(0, 42), Map.of(42, 0), List.of(42, 42), Set.of(0, 42), Map.of(42, 0))), Select.builder(metadata).build()); } + @Test + public void insertEmptyRow() + { + TableMetadata metadata = defaultTable() + .addPartitionKeyColumn("pk", Int32Type.instance) + .addStaticColumn("s", Int32Type.instance) + .addClusteringColumn("ck", Int32Type.instance) + .addRegularColumn("r", Int32Type.instance) + .build(); + ASTSingleTableModel model = new ASTSingleTableModel(metadata); + + model.update(Mutation.insert(metadata) + .value("pk", 0) + .value("s", 0) + .value("ck", 0) + .build()); + model.validate(rows(row(metadata, 0, 0, 0, null)), Select.builder(metadata).build()); + } + + @Test + public void updateEmptyRow() + { + TableMetadata metadata = defaultTable() + .addPartitionKeyColumn("pk", Int32Type.instance) + .addStaticColumn("s", Int32Type.instance) + .addClusteringColumn("ck", Int32Type.instance) + .addRegularColumn("r", Int32Type.instance) + .build(); + ASTSingleTableModel model = new ASTSingleTableModel(metadata); + + model.update(Mutation.update(metadata) + .set("s", 0) + .value("pk", 0) + .value("ck", 0) + .build()); + model.validate(rows(row(metadata, 0, null, 0, null)), Select.builder(metadata).build()); + } + + @Test + public void deleteColumnUpdateDoesntHavePartitionState() + { + TableMetadata metadata = defaultTable() + .addPartitionKeyColumn("pk", Int32Type.instance) + .addStaticColumn("s", Int32Type.instance) + .addClusteringColumn("ck", Int32Type.instance) + .addRegularColumn("r", ListType.getInstance(Int32Type.instance, true)) + .build(); + ASTSingleTableModel model = new ASTSingleTableModel(metadata); + + model.update(Mutation.update(metadata) + .set("r", List.of(0)) + .set("s", 0) + .value("pk", 0) + .value("ck", 0) + .build()); + model.update(Mutation.update(metadata) + .set("r", List.of(1)) + .value("pk", 0) + .value("ck", 1) + .build()); + model.validate(rows(row(metadata, 0, 0, 0, List.of(0)), + row(metadata, 0, 1, 0, List.of(1))), Select.builder(metadata).build()); + + model.update(Mutation.delete(metadata) + .columns("r", "s") + .value("pk", 0) + .value("ck", 0) + .build()); + model.validate(rows(row(metadata, 0, 1, null, List.of(1))), Select.builder(metadata).build()); + } + + private interface SimpleWrite + { + void write(String name, T value, long ts); + } + private static ByteBuffer[][] rows(ByteBuffer[]... rows) { return rows; } + private static ByteBuffer[] row(ByteBuffer... values) + { + return values; + } + private static ByteBuffer[] row(TableMetadata metadata, Object... values) { ByteBuffer[] row = new ByteBuffer[values.length]; var it = metadata.allColumnsInSelectOrder(); for (int i = 0; i < values.length && it.hasNext(); i++) - row[i] = it.next().type.decomposeUntyped(values[i]); + { + ColumnMetadata column = it.next(); + Object value = values[i]; + row[i] = value == null ? null : column.type.decomposeUntyped(value); + } return row; } diff --git a/test/harry/main/org/apache/cassandra/harry/model/BytesPartitionState.java b/test/harry/main/org/apache/cassandra/harry/model/BytesPartitionState.java index c2d18e573d81..70988e7801d4 100644 --- a/test/harry/main/org/apache/cassandra/harry/model/BytesPartitionState.java +++ b/test/harry/main/org/apache/cassandra/harry/model/BytesPartitionState.java @@ -20,8 +20,8 @@ import java.nio.ByteBuffer; import java.util.ArrayList; -import java.util.Collection; import java.util.Comparator; +import java.util.LinkedHashSet; import java.util.List; import java.util.Map; import java.util.NavigableSet; @@ -32,6 +32,8 @@ import java.util.stream.Stream; import javax.annotation.Nullable; +import com.google.common.collect.Sets; + import org.apache.cassandra.cql3.ast.Symbol; import org.apache.cassandra.db.Clustering; import org.apache.cassandra.db.ClusteringComparator; @@ -44,6 +46,7 @@ import org.apache.cassandra.harry.util.BitSet; import org.apache.cassandra.schema.ColumnMetadata; import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.utils.ByteBufferUtil; import org.apache.cassandra.utils.FastByteOperations; import org.apache.cassandra.utils.ImmutableUniqueList; @@ -62,31 +65,36 @@ private BytesPartitionState(Factory factory, Clustering key) this.state = factory.partitionState(key); } - public void deleteRow(Clustering clustering) + public void deleteRow(Clustering clustering, long ts) { long cd = factory.clusteringCache.deflateOrUndefined(clustering); if (MagicConstants.UNSET_DESCR == cd) return; - state.delete(cd, MagicConstants.NO_TIMESTAMP); + deleteRow(cd, ts); } - public void deleteColumns(Clustering clustering, Set columns) + private void deleteRow(long cd, long ts) + { + state.delete(cd, ts); + } + + public void deleteColumns(Clustering clustering, long ts, Set columns) { long cd = factory.clusteringCache.deflateOrUndefined(clustering); if (cd != MagicConstants.UNSET_DESCR) { BitSet regularColumns = bitset(columns, true); if (!regularColumns.allUnset()) - state.deleteRegularColumns(MagicConstants.NO_TIMESTAMP, cd, regularColumns); + state.deleteRegularColumns(ts, cd, regularColumns); } - deleteStaticColumns(columns); + deleteStaticColumns(ts, columns); } - public void deleteStaticColumns(Set columns) + public void deleteStaticColumns(long ts, Set columns) { BitSet staticColumns = bitset(columns, false); if (!staticColumns.allUnset()) - state.deleteStaticColumns(MagicConstants.NO_TIMESTAMP, staticColumns); + state.deleteStaticColumns(ts, staticColumns); } private BitSet bitset(Set columns, boolean regular) @@ -109,28 +117,27 @@ public Ref ref() public PrimaryKey partitionRowRef() { - return new PrimaryKey(ref(), null); + return new PrimaryKey(factory, ref(), null); } - public void setStaticColumns(Map values) + public void setStaticColumns(long ts, Map values) { if (factory.staticColumns.isEmpty() || values.isEmpty()) throw new IllegalStateException("Attempt to write to static columns; but they do not exist"); - long[] sds = toDescriptor(factory.staticColumns, values); - state.writeStatic(sds, MagicConstants.NO_TIMESTAMP); + + state.writeStatic(toDescriptor(factory.staticColumns, values), ts); } - public void setColumns(Clustering clustering, Map values, boolean writePrimaryKeyLiveness) + public void setColumns(Clustering clustering, long ts, Map values, boolean writePrimaryKeyLiveness) { long cd = factory.clusteringCache.deflate(clustering); - long[] vds = toDescriptor(factory.regularColumns, values); - state.writeRegular(cd, vds, MagicConstants.NO_TIMESTAMP, writePrimaryKeyLiveness); + state.writeRegular(cd, toDescriptor(factory.regularColumns, values), ts, writePrimaryKeyLiveness); // UDT's have the ability to "update" that triggers a delete; this allows creating an "empty" row. // When an empty row exists without liveness info, then purge the row var row = state.rows.get(cd); if (row.isEmpty() && !row.hasPrimaryKeyLivenessInfo) - state.delete(cd, MagicConstants.NO_TIMESTAMP); + deleteRow(cd, ts); } private long[] toDescriptor(ImmutableUniqueList positions, Map values) @@ -200,6 +207,8 @@ public boolean staticOnly() @Nullable public Row get(Clustering clustering) { + if (clustering == Clustering.STATIC_CLUSTERING) + return staticRow(); long cd = factory.clusteringCache.deflateOrUndefined(clustering); if (cd == MagicConstants.UNSET_DESCR) return null; @@ -216,6 +225,12 @@ public ByteBuffer get(Clustering clustering, Symbol column) return row == null ? null : row.get(column); } + public long timestamp(Clustering clustering, Symbol column) + { + Row row = get(clustering); + return row == null ? MagicConstants.NO_TIMESTAMP : row.timestamp(column); + } + private Row toRow(PartitionState.RowState rowState) { Clustering clustering; @@ -230,10 +245,10 @@ private Row toRow(PartitionState.RowState rowState) clustering = factory.clusteringCache.inflate(rowState.cd); values = fromDescriptor(factory.regularColumns, rowState.vds); } - return new Row(clustering, values); + return new Row(clustering, values, rowState.lts); } - public Collection rows() + public List rows() { return state.rows().values().stream().map(this::toRow).collect(Collectors.toList()); } @@ -281,18 +296,26 @@ private static void appendValues(StringBuilder sb, List columns, Cluster sb.append(')'); } - public class PrimaryKey implements Comparable + public static class PrimaryKey implements Comparable { + private final Factory factory; public final BytesPartitionState.Ref partition; @Nullable public final Clustering clustering; - public PrimaryKey(BytesPartitionState.Ref partition, @Nullable Clustering clustering) + private PrimaryKey(Factory factory, BytesPartitionState.Ref partition, @Nullable Clustering clustering) { + this.factory = factory; this.partition = partition; this.clustering = clustering; } + public boolean isPartitionLevel() + { + return clustering == null // has clustering, but only referencing partition + || Clustering.EMPTY.equals(clustering); // doesn't have clustering + } + @Override public int compareTo(PrimaryKey o) { @@ -324,7 +347,8 @@ public String toString() StringBuilder sb = new StringBuilder("(partition="); sb.append(partition); sb.append(", clustering="); - appendValues(sb, factory.clusteringColumns, clustering); + if (clustering == null) sb.append("null"); + else appendValues(sb, factory.clusteringColumns, clustering); sb.append(')'); return sb.toString(); } @@ -415,12 +439,22 @@ public class Row public final Clustering clustering; private final ImmutableUniqueList columnNames; private final ByteBuffer[] columns; + private final long[] lts; - private Row(Clustering clustering, ByteBuffer[] columns) + private Row(Clustering clustering, ByteBuffer[] columns, long[] lts) { this.clustering = clustering; this.columnNames = clustering == Clustering.STATIC_CLUSTERING ? factory.staticColumns : factory.regularColumns; this.columns = columns; + this.lts = lts; + } + + private Row(Clustering clustering, ImmutableUniqueList columnNames, ByteBuffer[] columns, long[] lts) + { + this.clustering = clustering; + this.columnNames = columnNames; + this.columns = columns; + this.lts = lts; } public ByteBuffer get(Symbol col) @@ -433,15 +467,50 @@ public ByteBuffer get(int offset) return columns[offset]; } + public long timestamp(Symbol col) + { + return lts[columnNames.indexOf(col)]; + } + + public long timestamp(int offset) + { + return lts[offset]; + } + public PrimaryKey ref() { - return new PrimaryKey(BytesPartitionState.this.ref(), clustering); + return new PrimaryKey(factory, BytesPartitionState.this.ref(), clustering); } public boolean isEmpty() { return Stream.of(columns).allMatch(b -> b == null ); } + + public Row select(List selection) + { + if (columnNames.equals(selection)) return this; + selection = validateSelect(selection); + ByteBuffer[] selected = new ByteBuffer[selection.size()]; + ImmutableUniqueList.Builder names = ImmutableUniqueList.builder(selected.length); + for (int i = 0; i < selection.size(); i++) + { + Symbol col = selection.get(i); + selected[i] = columns[columnNames.indexOf(col)]; + names.add(col); + } + + return new Row(clustering, names.build(), selected, lts); + } + + private List validateSelect(List selection) + { + LinkedHashSet uniqueSelection = new LinkedHashSet<>(selection); + var unknown = Sets.difference(uniqueSelection, columnNames.asSet()); + if (!unknown.isEmpty()) + throw new AssertionError("Unable to select columns " + selection + "; has unknown columns " + unknown); + return uniqueSelection.size() == selection.size() ? selection : new ArrayList<>(uniqueSelection); + } } public static class Factory @@ -452,14 +521,19 @@ public static class Factory public final ImmutableUniqueList primaryColumns; public final ImmutableUniqueList staticColumns; public final ImmutableUniqueList regularColumns; - public final ImmutableUniqueList selectionOrder, partitionAndStaticColumns, regularAndStaticColumns; + public final ImmutableUniqueList selectionOrder, partitionAndStaticColumns, clusteringAndRegularColumns, regularAndStaticColumns; public final ClusteringComparator clusteringComparator; // translation layer for harry interop private final BijectionCache> partitionCache = new BijectionCache<>(Reject.instance.as()); private final BijectionCache> clusteringCache; - private final BijectionCache valueCache = new BijectionCache<>(Reject.instance.as()); + private final BijectionCache valueCache = new BijectionCache<>((l, r) -> { + if (!l.type.equals(r.type)) + throw new IllegalArgumentException("Unable to compare different types: " + l.type.asCQL3Type() + " != " + r.type.asCQL3Type()); + // Cells resolve based off unsigned byte order and not type order + return ByteBufferUtil.compareUnsigned(l.value, r.value); + }); private final ValueGenerators, Clustering> valueGenerators; public Factory(TableMetadata metadata) @@ -475,27 +549,27 @@ public Factory(TableMetadata metadata) if (clusteringColumns.isEmpty()) primaryColumns = partitionColumns; else { - symbolListBuilder.addAll(partitionColumns); - symbolListBuilder.addAll(clusteringColumns); - primaryColumns = symbolListBuilder.buildAndClear(); + primaryColumns = symbolListBuilder.addAll(partitionColumns) + .addAll(clusteringColumns) + .buildAndClear(); } - for (ColumnMetadata pk : metadata.staticColumns()) - symbolListBuilder.add(Symbol.from(pk)); + metadata.staticColumns().selectOrderIterator().forEachRemaining(cm -> symbolListBuilder.add(Symbol.from(cm))); staticColumns = symbolListBuilder.buildAndClear(); if (staticColumns.isEmpty()) partitionAndStaticColumns = partitionColumns; else { - symbolListBuilder.addAll(partitionColumns); - symbolListBuilder.addAll(staticColumns); - partitionAndStaticColumns = symbolListBuilder.buildAndClear(); + partitionAndStaticColumns = symbolListBuilder.addAll(partitionColumns) + .addAll(staticColumns) + .buildAndClear(); } - for (ColumnMetadata pk : metadata.regularColumns()) - symbolListBuilder.add(Symbol.from(pk)); + metadata.regularColumns().selectOrderIterator().forEachRemaining(cm -> symbolListBuilder.add(Symbol.from(cm))); regularColumns = symbolListBuilder.buildAndClear(); + clusteringAndRegularColumns = symbolListBuilder.addAll(clusteringColumns) + .addAll(regularColumns) + .buildAndClear(); metadata.allColumnsInSelectOrder().forEachRemaining(cm -> symbolListBuilder.add(Symbol.from(cm))); selectionOrder = symbolListBuilder.buildAndClear(); - metadata.regularAndStaticColumns().forEach(cm -> symbolListBuilder.add(Symbol.from(cm))); - regularAndStaticColumns = symbolListBuilder.buildAndClear(); + regularAndStaticColumns = symbolListBuilder.addAll(staticColumns).addAll(regularColumns).buildAndClear(); clusteringComparator = new ClusteringComparator(clusteringColumns.stream().map(Symbol::rawType).collect(Collectors.toList())); @@ -569,6 +643,11 @@ public BytesPartitionState.Ref createRef(Token token, boolean nullKeyGtMatchingT return new BytesPartitionState.Ref(this, token, nullKeyGtMatchingToken); } + public PrimaryKey createPrimaryKey(Ref pk, @Nullable Clustering cd) + { + return new BytesPartitionState.PrimaryKey(this, pk, cd); + } + private PartitionState partitionState(Clustering key) { return new PartitionState(partitionCache.deflate(key), valueGenerators); diff --git a/test/harry/main/org/apache/cassandra/harry/model/PartitionState.java b/test/harry/main/org/apache/cassandra/harry/model/PartitionState.java index cf4d70bd958f..57ec2ad4af61 100644 --- a/test/harry/main/org/apache/cassandra/harry/model/PartitionState.java +++ b/test/harry/main/org/apache/cassandra/harry/model/PartitionState.java @@ -236,8 +236,12 @@ private RowState updateRowState(RowState currentState, IntFunction column = columns.apply(i); - if (column.compare(vds[i], currentState.vds[i]) > 0) + if (vds[i] == MagicConstants.NIL_DESCR // writing a null is the same as a tombstone, which has higher priority + || (currentState.vds[i] != MagicConstants.NIL_DESCR + && column.compare(vds[i], currentState.vds[i]) > 0)) + { currentState.vds[i] = vds[i]; + } } else { diff --git a/test/harry/main/org/apache/cassandra/harry/model/QuiescentChecker.java b/test/harry/main/org/apache/cassandra/harry/model/QuiescentChecker.java index a77904202c68..1b2efb14e90d 100644 --- a/test/harry/main/org/apache/cassandra/harry/model/QuiescentChecker.java +++ b/test/harry/main/org/apache/cassandra/harry/model/QuiescentChecker.java @@ -148,7 +148,7 @@ public static void validate(ValueGenerators valueGenerators, PartitionState part "\nExpected: %s" + "\nActual: %s", expectedRowState.toString(valueGenerators), - actualRowState); + actualRowState.toString(valueGenerators)); } if (!vdsEqual(expectedRowState.vds, actualRowState.vds)) @@ -191,7 +191,7 @@ public static void validate(ValueGenerators valueGenerators, PartitionState part public static boolean vdsEqual(long[] expected, long[] actual) { - Invariants.checkState(expected.length == actual.length); + Invariants.require(expected.length == actual.length); for (int i = 0; i < actual.length; i++) { long expectedD = expected[i]; diff --git a/test/harry/main/org/apache/cassandra/harry/model/TokenPlacementModel.java b/test/harry/main/org/apache/cassandra/harry/model/TokenPlacementModel.java index 2c24ec257c79..c9dcbcb137c1 100644 --- a/test/harry/main/org/apache/cassandra/harry/model/TokenPlacementModel.java +++ b/test/harry/main/org/apache/cassandra/harry/model/TokenPlacementModel.java @@ -725,6 +725,7 @@ public interface Lookup long token(int tokenIdx); Lookup forceToken(int tokenIdx, long token); void reset(); + int rackIdx(String rack); default NodeId nodeId(int nodeIdx) { @@ -785,6 +786,11 @@ public InetAddressAndPort addr(int idx) return null; } + public int rackIdx(String rack) + { + throw new UnsupportedOperationException(); + } + public void reset() { throw new UnsupportedOperationException(); @@ -843,6 +849,11 @@ public String rack(int rackIdx) { return String.format("rack%d", rackIdx); } + + public int rackIdx(String rack) + { + throw new UnsupportedOperationException(); + } } public static class HumanReadableTokensLookup extends DefaultLookup { @@ -1022,6 +1033,12 @@ public Node overrideToken(long override) { return new Node(tokenIdx, nodeIdx, dcIdx, rackIdx, lookup.forceToken(tokenIdx, override)); } + + public Node withNewRack(String newRack) + { + return new Node(tokenIdx, nodeIdx, dcIdx, lookup.rackIdx(newRack), lookup); + } + public Murmur3Partitioner.LongToken longToken() { return new Murmur3Partitioner.LongToken(token()); diff --git a/test/harry/main/org/apache/cassandra/harry/op/Operations.java b/test/harry/main/org/apache/cassandra/harry/op/Operations.java index 47ffa93c1c66..0a3eee1ffd39 100644 --- a/test/harry/main/org/apache/cassandra/harry/op/Operations.java +++ b/test/harry/main/org/apache/cassandra/harry/op/Operations.java @@ -471,7 +471,7 @@ static Selection fromBitSet(BitSet bitSet, SchemaSpec schema) } else { - Invariants.checkState(schema.allColumnInSelectOrder.size() == bitSet.size()); + Invariants.require(schema.allColumnInSelectOrder.size() == bitSet.size()); Map, Integer> columns = new HashMap<>(); for (int i = 0; i < schema.allColumnInSelectOrder.size(); i++) { diff --git a/test/harry/main/org/apache/cassandra/harry/op/Visit.java b/test/harry/main/org/apache/cassandra/harry/op/Visit.java index 6c8c31390c89..7d77b7287159 100644 --- a/test/harry/main/org/apache/cassandra/harry/op/Visit.java +++ b/test/harry/main/org/apache/cassandra/harry/op/Visit.java @@ -32,6 +32,8 @@ public class Visit public final Set visitedPartitions; public final boolean selectOnly; + public final boolean hasCustom; + public Visit(long lts, Operation[] operations) { Assert.assertTrue(operations.length > 0); @@ -39,8 +41,11 @@ public Visit(long lts, Operation[] operations) this.operations = operations; this.visitedPartitions = new HashSet<>(); boolean selectOnly = true; + boolean hasCustom = false; for (Operation operation : operations) { + if (operation.kind() == Operations.Kind.CUSTOM) + hasCustom = true; if (selectOnly && !(operation instanceof Operations.SelectStatement)) selectOnly = false; @@ -49,6 +54,7 @@ public Visit(long lts, Operation[] operations) } this.selectOnly = selectOnly; + this.hasCustom = hasCustom; } public String toString() diff --git a/test/harry/main/org/apache/cassandra/harry/test/SimpleBijectionTest.java b/test/harry/main/org/apache/cassandra/harry/test/SimpleBijectionTest.java index fde20aa363f3..9e090895da0e 100644 --- a/test/harry/main/org/apache/cassandra/harry/test/SimpleBijectionTest.java +++ b/test/harry/main/org/apache/cassandra/harry/test/SimpleBijectionTest.java @@ -52,7 +52,7 @@ public void testOrder() Object next = generator.inflate(generator.descriptorAt(i)); if (previous != null) { - Invariants.checkState(column.type.comparator().compare(next, previous) > 0, + Invariants.require(column.type.comparator().compare(next, previous) > 0, "%s should be > %s", next, previous); } previous = next; diff --git a/test/harry/main/org/apache/cassandra/harry/util/StringUtils.java b/test/harry/main/org/apache/cassandra/harry/util/StringUtils.java index 0b3b94b73fea..b33a407e5288 100644 --- a/test/harry/main/org/apache/cassandra/harry/util/StringUtils.java +++ b/test/harry/main/org/apache/cassandra/harry/util/StringUtils.java @@ -35,7 +35,7 @@ public static String escapeControlChars(String input) for (int i = 0; i < input.length(); i++) { char c = input.charAt(i); - if (Character.isISOControl(c)) + if (Character.isISOControl(c) && c != '\n') result.append(String.format("\\u%04X", (int) c)); else result.append(c); diff --git a/test/long/org/apache/cassandra/dht/tokenallocator/Murmur3ReplicationAwareTokenAllocatorTest.java b/test/long/org/apache/cassandra/dht/tokenallocator/Murmur3ReplicationAwareTokenAllocatorTest.java index e28ecfa45347..83665f1c9b22 100644 --- a/test/long/org/apache/cassandra/dht/tokenallocator/Murmur3ReplicationAwareTokenAllocatorTest.java +++ b/test/long/org/apache/cassandra/dht/tokenallocator/Murmur3ReplicationAwareTokenAllocatorTest.java @@ -30,7 +30,7 @@ public class Murmur3ReplicationAwareTokenAllocatorTest extends AbstractReplicati @Test public void testExistingCluster() { - super.testExistingCluster(new Murmur3Partitioner(), MAX_VNODE_COUNT); + super.testExistingCluster(Murmur3Partitioner.instance, MAX_VNODE_COUNT); } @Test @@ -43,6 +43,6 @@ public void testNewCluster() private void flakyTestNewCluster() { - testNewCluster(new Murmur3Partitioner(), MAX_VNODE_COUNT); + testNewCluster(Murmur3Partitioner.instance, MAX_VNODE_COUNT); } } diff --git a/test/long/org/apache/cassandra/dht/tokenallocator/NoReplicationTokenAllocatorTest.java b/test/long/org/apache/cassandra/dht/tokenallocator/NoReplicationTokenAllocatorTest.java index 5e13519fcd42..6835f2c5b013 100644 --- a/test/long/org/apache/cassandra/dht/tokenallocator/NoReplicationTokenAllocatorTest.java +++ b/test/long/org/apache/cassandra/dht/tokenallocator/NoReplicationTokenAllocatorTest.java @@ -26,9 +26,9 @@ import com.google.common.collect.Maps; import org.apache.commons.math3.stat.descriptive.SummaryStatistics; +import org.junit.Assert; import org.junit.Test; -import org.junit.Assert; import org.apache.cassandra.dht.IPartitioner; import org.apache.cassandra.dht.Murmur3Partitioner; import org.apache.cassandra.dht.RandomPartitioner; @@ -42,13 +42,13 @@ public class NoReplicationTokenAllocatorTest extends TokenAllocatorTestBase @Test public void testNewClusterWithMurmur3Partitioner() { - testNewCluster(new Murmur3Partitioner()); + testNewCluster(Murmur3Partitioner.instance); } @Test public void testNewClusterWithRandomPartitioner() { - testNewCluster(new RandomPartitioner()); + testNewCluster(RandomPartitioner.instance); } private void testNewCluster(IPartitioner partitioner) @@ -75,13 +75,13 @@ public void testNewCluster(int perUnitCount, TokenCount tc, NoReplicationStrateg @Test public void testExistingClusterWithMurmur3Partitioner() { - testExistingCluster(new Murmur3Partitioner()); + testExistingCluster(Murmur3Partitioner.instance); } @Test public void testExistingClusterWithRandomPartitioner() { - testExistingCluster(new RandomPartitioner()); + testExistingCluster(RandomPartitioner.instance); } private void testExistingCluster(IPartitioner partitioner) diff --git a/test/long/org/apache/cassandra/dht/tokenallocator/RandomReplicationAwareTokenAllocatorTest.java b/test/long/org/apache/cassandra/dht/tokenallocator/RandomReplicationAwareTokenAllocatorTest.java index bb1a2c8f3c57..4e7982e0c7a5 100644 --- a/test/long/org/apache/cassandra/dht/tokenallocator/RandomReplicationAwareTokenAllocatorTest.java +++ b/test/long/org/apache/cassandra/dht/tokenallocator/RandomReplicationAwareTokenAllocatorTest.java @@ -34,13 +34,13 @@ public class RandomReplicationAwareTokenAllocatorTest extends AbstractReplicatio @Test public void testExistingCluster() { - testExistingCluster(new RandomPartitioner(), MAX_VNODE_COUNT); + testExistingCluster(RandomPartitioner.instance, MAX_VNODE_COUNT); } @Test public void testNewClusterr() { - testNewCluster(new RandomPartitioner(), MAX_VNODE_COUNT); + testNewCluster(RandomPartitioner.instance, MAX_VNODE_COUNT); } } diff --git a/test/microbench/org/apache/cassandra/test/microbench/CacheLoaderBench.java b/test/microbench/org/apache/cassandra/test/microbench/CacheLoaderBench.java index afafad67ed47..1aa55298dad0 100644 --- a/test/microbench/org/apache/cassandra/test/microbench/CacheLoaderBench.java +++ b/test/microbench/org/apache/cassandra/test/microbench/CacheLoaderBench.java @@ -115,7 +115,7 @@ public void setup() throws Throwable cfs.truncateBlocking(); for (int i = 0; i < numSSTables ; i++) { - ColumnMetadata colDef = ColumnMetadata.regularColumn(cfs.metadata(), ByteBufferUtil.bytes("val"), AsciiType.instance); + ColumnMetadata colDef = ColumnMetadata.regularColumn(cfs.metadata(), ByteBufferUtil.bytes("val"), AsciiType.instance, ColumnMetadata.NO_UNIQUE_ID); for (int k = 0; k < numKeysPerTable; k++) { RowUpdateBuilder rowBuilder = new RowUpdateBuilder(cfs.metadata(), System.currentTimeMillis() + random.nextInt(), "key" + k); diff --git a/test/microbench/org/apache/cassandra/test/microbench/ZeroCopyStreamingBench.java b/test/microbench/org/apache/cassandra/test/microbench/ZeroCopyStreamingBench.java index e10b91d9b2e5..07ff3edce90b 100644 --- a/test/microbench/org/apache/cassandra/test/microbench/ZeroCopyStreamingBench.java +++ b/test/microbench/org/apache/cassandra/test/microbench/ZeroCopyStreamingBench.java @@ -78,6 +78,7 @@ import org.openjdk.jmh.annotations.Threads; import org.openjdk.jmh.annotations.Warmup; +import static java.util.Collections.emptyList; import static org.apache.cassandra.utils.TimeUUID.Generator.nextTimeUUID; /** @@ -130,14 +131,14 @@ public void setupBenchmark() throws IOException serializedBlockStream = blockStreamCaptureChannel.getSerializedStream(); out.close(); - session.prepareReceiving(new StreamSummary(sstable.metadata().id, 1, serializedBlockStream.readableBytes())); + session.prepareReceiving(new StreamSummary(sstable.metadata().id, emptyList(), 1, serializedBlockStream.readableBytes())); CassandraStreamHeader entireSSTableStreamHeader = CassandraStreamHeader.builder() .withSSTableVersion(sstable.descriptor.version) .withSSTableLevel(0) .withEstimatedKeys(sstable.estimatedKeys()) - .withSections(Collections.emptyList()) + .withSections(emptyList()) .withSerializationHeader(sstable.header.toComponent()) .withComponentManifest(context.manifest()) .isEntireSSTable(true) @@ -219,7 +220,7 @@ private StreamSession setupStreamingSessionForTest() StreamResultFuture future = StreamResultFuture.createInitiator(nextTimeUUID(), StreamOperation.BOOTSTRAP, Collections.emptyList(), streamCoordinator); InetAddressAndPort peer = FBUtilities.getBroadcastAddressAndPort(); - streamCoordinator.addSessionInfo(new SessionInfo(peer, 0, peer, Collections.emptyList(), Collections.emptyList(), StreamSession.State.INITIALIZED, null)); + streamCoordinator.addSessionInfo(new SessionInfo(peer, 0, peer, emptyList(), emptyList(), StreamSession.State.INITIALIZED, null)); StreamSession session = streamCoordinator.getOrCreateOutboundSession(peer); session.init(future); diff --git a/test/microbench/org/apache/cassandra/test/microbench/btree/AtomicBTreePartitionUpdateBench.java b/test/microbench/org/apache/cassandra/test/microbench/btree/AtomicBTreePartitionUpdateBench.java index 7083832c012a..df9d919f5218 100644 --- a/test/microbench/org/apache/cassandra/test/microbench/btree/AtomicBTreePartitionUpdateBench.java +++ b/test/microbench/org/apache/cassandra/test/microbench/btree/AtomicBTreePartitionUpdateBench.java @@ -68,11 +68,11 @@ import org.apache.cassandra.schema.TableId; import org.apache.cassandra.schema.TableMetadata; import org.apache.cassandra.schema.TableMetadataRef; +import org.apache.cassandra.utils.BulkIterator; import org.apache.cassandra.utils.btree.BTree; import org.apache.cassandra.utils.btree.UpdateFunction; import org.apache.cassandra.utils.concurrent.ImmediateFuture; import org.apache.cassandra.utils.concurrent.OpOrder; -import org.apache.cassandra.utils.BulkIterator; import org.apache.cassandra.utils.memory.ByteBufferCloner; import org.apache.cassandra.utils.memory.Cloner; import org.apache.cassandra.utils.memory.HeapPool; @@ -107,7 +107,7 @@ public class AtomicBTreePartitionUpdateBench private static final MutableDeletionInfo NO_DELETION_INFO = new MutableDeletionInfo(DeletionTime.LIVE); private static final HeapPool POOL = new HeapPool(Long.MAX_VALUE, 1.0f, () -> ImmediateFuture.success(Boolean.TRUE)); private static final ByteBuffer zero = Int32Type.instance.decompose(0); - private static final DecoratedKey decoratedKey = new BufferDecoratedKey(new ByteOrderedPartitioner().getToken(zero), zero); + private static final DecoratedKey decoratedKey = new BufferDecoratedKey(ByteOrderedPartitioner.instance.getToken(zero), zero); static { @@ -565,6 +565,7 @@ private static ColumnMetadata[] columns(AbstractType type, ColumnMetadata.Kin "", new ColumnIdentifier(prefix + i, true), type, + ColumnMetadata.NO_UNIQUE_ID, kind != ColumnMetadata.Kind.REGULAR ? i : ColumnMetadata.NO_POSITION, kind, null)) diff --git a/test/simulator/asm/org/apache/cassandra/simulator/asm/ClassTransformer.java b/test/simulator/asm/org/apache/cassandra/simulator/asm/ClassTransformer.java index 778e44c80894..70fa3a6f0493 100644 --- a/test/simulator/asm/org/apache/cassandra/simulator/asm/ClassTransformer.java +++ b/test/simulator/asm/org/apache/cassandra/simulator/asm/ClassTransformer.java @@ -44,8 +44,8 @@ import static org.apache.cassandra.simulator.asm.TransformationKind.HASHCODE; import static org.apache.cassandra.simulator.asm.TransformationKind.SYNCHRONIZED; import static org.apache.cassandra.simulator.asm.Utils.deterministicToString; -import static org.apache.cassandra.simulator.asm.Utils.visitEachRefType; import static org.apache.cassandra.simulator.asm.Utils.generateTryFinallyProxyCall; +import static org.apache.cassandra.simulator.asm.Utils.visitEachRefType; import static org.objectweb.asm.Opcodes.ACC_PRIVATE; import static org.objectweb.asm.Opcodes.ACC_STATIC; import static org.objectweb.asm.Opcodes.ACC_SYNTHETIC; @@ -182,7 +182,6 @@ private static boolean contains(int value, int mask) public void visit(int version, int access, String name, String signature, String superName, String[] interfaces) { super.visit(version, makePublic(access), name, signature, superName, interfaces); - } @Override @@ -190,6 +189,10 @@ public FieldVisitor visitField(int access, String name, String descriptor, Strin { if (dependentTypes != null) Utils.visitIfRefType(descriptor, dependentTypes); + // org.apache.cassandra.simulator.systems.SimulatedTime.InstanceTime.nanoTime does not change between invokes which causes AbstractQueuedSynchronizer to loop forever, + // so need to make the threshold negative to avoid the spin loop. + if (className.equals("java/util/concurrent/locks/AbstractQueuedSynchronizer") && name.equals("SPIN_FOR_TIMEOUT_THRESHOLD")) + return super.visitField(makePublic(access), name, descriptor, signature, Long.MIN_VALUE); return super.visitField(makePublic(access), name, descriptor, signature, value); } @@ -301,6 +304,7 @@ void witness(TransformationKind kind) { case FIELD_NEMESIS: case SIGNAL_NEMESIS: + // TODO: this isn't correct: we will share any class we choose not to insert nemesis points into on first transformation isCacheablyTransformed = false; } methodLogger.witness(kind); diff --git a/test/simulator/asm/org/apache/cassandra/simulator/asm/DeterministicChanceSupplier.java b/test/simulator/asm/org/apache/cassandra/simulator/asm/DeterministicChanceSupplier.java new file mode 100644 index 000000000000..dae021a128b8 --- /dev/null +++ b/test/simulator/asm/org/apache/cassandra/simulator/asm/DeterministicChanceSupplier.java @@ -0,0 +1,25 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.simulator.asm; + +import java.util.function.IntFunction; + +public interface DeterministicChanceSupplier extends IntFunction +{ +} diff --git a/test/simulator/asm/org/apache/cassandra/simulator/asm/GlobalMethodTransformer.java b/test/simulator/asm/org/apache/cassandra/simulator/asm/GlobalMethodTransformer.java index 883b7a66a3cc..064d2056468c 100644 --- a/test/simulator/asm/org/apache/cassandra/simulator/asm/GlobalMethodTransformer.java +++ b/test/simulator/asm/org/apache/cassandra/simulator/asm/GlobalMethodTransformer.java @@ -85,12 +85,17 @@ else if (globalMethods && ((opcode == Opcodes.INVOKESTATIC && ( || !deterministic && owner.equals("java/lang/System") && name.equals("identityHashCode") || owner.equals("java/util/UUID") && name.equals("randomUUID") || owner.equals("com/google/common/util/concurrent/Uninterruptibles") && name.equals("sleepUninterruptibly") - || owner.equals("sun/misc/Unsafe") && name.equals("getUnsafe"))) - )) + || owner.equals("sun/misc/Unsafe") && name.equals("getUnsafe")))) + ) { transformer.witness(GLOBAL_METHOD); super.visitMethodInsn(Opcodes.INVOKESTATIC, "org/apache/cassandra/simulator/systems/InterceptorOfSystemMethods$Global", name, descriptor, false); } + else if (owner.equals("java/util/concurrent/TimeUnit") && name.equals("sleep")) + { + transformer.witness(GLOBAL_METHOD); + super.visitMethodInsn(Opcodes.INVOKESTATIC, "org/apache/cassandra/simulator/systems/InterceptorOfSystemMethods$Global", "sleep", "(Ljava/util/concurrent/TimeUnit;J)V", false); + } else if ((globalMethods || deterministic) && opcode == Opcodes.INVOKESTATIC && ((owner.equals("java/util/concurrent/ThreadLocalRandom") && (name.equals("getProbe") || name.equals("advanceProbe") || name.equals("localInit"))) || (owner.equals("java/util/concurrent/atomic/Striped64") && (name.equals("getProbe") || name.equals("advanceProbe")))) diff --git a/test/simulator/asm/org/apache/cassandra/simulator/asm/InterceptAgent.java b/test/simulator/asm/org/apache/cassandra/simulator/asm/InterceptAgent.java index 4cf1546ca826..8774d867d169 100644 --- a/test/simulator/asm/org/apache/cassandra/simulator/asm/InterceptAgent.java +++ b/test/simulator/asm/org/apache/cassandra/simulator/asm/InterceptAgent.java @@ -30,6 +30,7 @@ import java.util.Arrays; import java.util.EnumSet; import java.util.List; +import java.util.Objects; import java.util.function.BiFunction; import java.util.regex.Pattern; @@ -93,6 +94,9 @@ public byte[] transform(ClassLoader loader, String className, Class classBein if (className.equals("java/lang/Object")) return transformObject(bytecode); + if (className.equals("java/lang/Class")) + return transformClass(bytecode); + if (className.equals("java/lang/Enum")) return transformEnum(bytecode); @@ -103,10 +107,14 @@ public byte[] transform(ClassLoader loader, String className, Class classBein return transformThreadLocalRandom(bytecode); if (className.startsWith("java/util/concurrent/ConcurrentHashMap")) - return transformConcurrent(className, bytecode, DETERMINISTIC, NO_PROXY_METHODS); + return InterceptAgent.transform(className, bytecode, DETERMINISTIC, NO_PROXY_METHODS); if (className.startsWith("java/util/concurrent/locks")) - return transformConcurrent(className, bytecode, SYSTEM_CLOCK, LOCK_SUPPORT, NO_PROXY_METHODS); + { + if (className.equals("java/util/concurrent/locks/AbstractQueuedSynchronizer")) + return InterceptAgent.transformAbstractQueuedSynchronizer(className, bytecode, SYSTEM_CLOCK, LOCK_SUPPORT, NO_PROXY_METHODS); + return InterceptAgent.transform(className, bytecode, SYSTEM_CLOCK, LOCK_SUPPORT, NO_PROXY_METHODS); + } return null; } @@ -172,6 +180,29 @@ public MethodVisitor visitMethod(int access, String name, String descriptor, Str return transform(bytes, ObjectVisitor::new); } + /** + * We don't want Object.toString() to invoke our overridden identityHashCode by virtue of invoking some overridden hashCode() + * So we overwrite Object.toString() to replace calls to Object.hashCode() with direct calls to System.identityHashCode() + */ + private static byte[] transformClass(byte[] bytes) + { + class ClazzVisitor extends ClassVisitor + { + public ClazzVisitor(int api, ClassVisitor classVisitor) + { + super(api, classVisitor); + } + + @Override + public void visitEnd() + { + new StringHashcode(api).accept(this); + super.visitEnd(); + } + } + return transform(bytes, ClazzVisitor::new); + } + /** * We want Enum to have a deterministic hashCode() so we simply forward calls to ordinal() */ @@ -314,7 +345,7 @@ public MethodVisitor visitMethod(int access, String name, String descriptor, Str else { MethodVisitor mv = super.visitMethod(access, name, descriptor, signature, exceptions); - if (determinismCheck && (name.equals("nextSeed") || name.equals("nextSecondarySeed"))) + if (determinismCheck && (name.equals("nextSeed") || name.equals("nextSecondarySeed") || name.equals("advanceProbe"))) mv = new ThreadLocalRandomCheckTransformer(api, mv); return mv; } @@ -323,7 +354,61 @@ public MethodVisitor visitMethod(int access, String name, String descriptor, Str return transform(bytes, ThreadLocalRandomVisitor::new); } - private static byte[] transform(byte[] bytes, BiFunction constructor) + /** + * We require ThreadLocalRandom to be deterministic, so we modify its initialisation method to invoke a + * global deterministic random value generator + */ + private static byte[] transformAbstractQueuedSynchronizer(String className, byte[] bytes, Flag flag, Flag ... flags) + { + class AbstractQueuedSynchronizerVisitor extends ClassVisitor + { + private long defaultSpinForTimeoutThreshold = 1000L; + + public AbstractQueuedSynchronizerVisitor(int api, ClassVisitor classVisitor) + { + super(api, classVisitor); + } + + @Override + public FieldVisitor visitField(int access, String name, String descriptor, String signature, Object value) + { + if (name.equals("SPIN_FOR_TIMEOUT_THRESHOLD")) + { + defaultSpinForTimeoutThreshold = (Long)value; + return super.visitField(access, name, descriptor, signature, 0L); + } + + return super.visitField(access, name, descriptor, signature, value); + } + + @Override + public MethodVisitor visitMethod(int access, String name, String descriptor, String signature, String[] exceptions) + { + /// !!!!! WARNING !!!!! + /// THIS IS SUPER BRITTLE BECAUSE rt.jar INLINES GETSTATIC AS LDC + // TODO (desired): visit constructor to fetch actual value of constant in case changes in future release - + // but this is brittle enough changes upstream will likely need revisiting anyway + MethodVisitor mv = super.visitMethod(access, name, descriptor, signature, exceptions); + if (!name.equals("doAcquireNanos") && !name.equals("doAcquireSharedNanos")) + return mv; + + return new MethodVisitor(api, mv) + { + @Override + public void visitLdcInsn(Object value) + { + if (Objects.equals(defaultSpinForTimeoutThreshold, value)) + super.visitLdcInsn(0L); + else + super.visitLdcInsn(value); + } + }; + } + } + return transform(className, bytes, AbstractQueuedSynchronizerVisitor::new, flag, flags); + } + + private static byte[] transform(byte[] bytes, BiFunction constructor) { ClassWriter out = new ClassWriter(0); ClassReader in = new ClassReader(bytes); @@ -332,7 +417,7 @@ private static byte[] transform(byte[] bytes, BiFunction constructor, Flag flag, Flag ... flags) + { + ClassReader in = new ClassReader(bytes); + ClassTransformer transformer = new ClassTransformer(BYTECODE_VERSION, className, EnumSet.of(flag, flags), null); + ClassVisitor extraTransformer = constructor.apply(BYTECODE_VERSION, transformer); + in.accept(extraTransformer, 0); + return transformer.toBytes(); + } } diff --git a/test/simulator/asm/org/apache/cassandra/simulator/asm/InterceptClasses.java b/test/simulator/asm/org/apache/cassandra/simulator/asm/InterceptClasses.java index 473cc27032b8..81b4a288e988 100644 --- a/test/simulator/asm/org/apache/cassandra/simulator/asm/InterceptClasses.java +++ b/test/simulator/asm/org/apache/cassandra/simulator/asm/InterceptClasses.java @@ -61,12 +61,18 @@ public class InterceptClasses implements BiFunction "|org[/.]apache[/.]cassandra[/.]db.streaming[/.].*" + "|org[/.]apache[/.]cassandra[/.]distributed[/.]impl[/.]DirectStreamingConnectionFactory.*" + "|org[/.]apache[/.]cassandra[/.]db[/.]commitlog[/.].*" + - "|org[/.]apache[/.]cassandra[/.]service[/.]paxos[/.].*"); + "|org[/.]apache[/.]cassandra[/.]service[/.]paxos[/.].*" + + "|org[/.]apache[/.]cassandra[/.]service[/.]accord[/.].*" + + "|org[/.]apache[/.]cassandra[/.]journal[/.].*" + + "|accord[/.].*" + ); private static final Pattern GLOBAL_METHODS = Pattern.compile("org[/.]apache[/.]cassandra[/.](?!simulator[/.]).*" + "|org[/.]apache[/.]cassandra[/.]simulator[/.]test[/.].*" + "|org[/.]apache[/.]cassandra[/.]simulator[/.]cluster[/.].*" + - "|io[/.]netty[/.]util[/.]concurrent[/.]FastThreadLocal"); // intercept IdentityHashMap for execution consistency + "|io[/.]netty[/.]util[/.]concurrent[/.]FastThreadLocal" + + "|accord[/.].*" + ); // intercept IdentityHashMap for execution consistency private static final Pattern NEMESIS = GLOBAL_METHODS; private static final Set WARNED = Collections.newSetFromMap(new ConcurrentHashMap<>()); @@ -94,19 +100,24 @@ static class PeerGroup class SubTransformer implements BiFunction { private final Map isolatedCache = new ConcurrentHashMap<>(); + private final int id; + SubTransformer(int id) + { + this.id = id; + } @Override public byte[] apply(String name, byte[] bytes) { - return transformTransitiveClosure(name, bytes, isolatedCache); + return transformTransitiveClosure(name, bytes, isolatedCache, id); } } private final Map cache = new ConcurrentHashMap<>(); private final int api; - private final ChanceSupplier nemesisChance; - private final ChanceSupplier monitorDelayChance; + private final DeterministicChanceSupplier nemesisChance; + private final DeterministicChanceSupplier monitorDelayChance; private final Hashcode insertHashcode; private final NemesisFieldKind.Selector nemesisFieldSelector; private final ClassLoader prewarmClassLoader; @@ -114,12 +125,12 @@ public byte[] apply(String name, byte[] bytes) private final byte[] bufIn = new byte[4096]; private final ByteArrayOutputStream bufOut = new ByteArrayOutputStream(); - public InterceptClasses(ChanceSupplier monitorDelayChance, ChanceSupplier nemesisChance, NemesisFieldKind.Selector nemesisFieldSelector, ClassLoader prewarmClassLoader, Predicate prewarm) + public InterceptClasses(DeterministicChanceSupplier monitorDelayChance, DeterministicChanceSupplier nemesisChance, NemesisFieldKind.Selector nemesisFieldSelector, ClassLoader prewarmClassLoader, Predicate prewarm) { this(BYTECODE_VERSION, monitorDelayChance, nemesisChance, nemesisFieldSelector, prewarmClassLoader, prewarm); } - public InterceptClasses(int api, ChanceSupplier monitorDelayChance, ChanceSupplier nemesisChance, NemesisFieldKind.Selector nemesisFieldSelector, ClassLoader prewarmClassLoader, Predicate prewarm) + public InterceptClasses(int api, DeterministicChanceSupplier monitorDelayChance, DeterministicChanceSupplier nemesisChance, NemesisFieldKind.Selector nemesisFieldSelector, ClassLoader prewarmClassLoader, Predicate prewarm) { this.api = api; this.nemesisChance = nemesisChance; @@ -133,10 +144,10 @@ public InterceptClasses(int api, ChanceSupplier monitorDelayChance, ChanceSuppli @Override public byte[] apply(String name, byte[] bytes) { - return transformTransitiveClosure(name, bytes, null); + return transformTransitiveClosure(name, bytes, null, 0); } - private synchronized byte[] transformTransitiveClosure(String externalName, byte[] input, Map isolatedCache) + private synchronized byte[] transformTransitiveClosure(String externalName, byte[] input, Map isolatedCache, int id) { if (input == null) return maybeSynthetic(externalName); @@ -162,12 +173,12 @@ private synchronized byte[] transformTransitiveClosure(String externalName, byte case UNMODIFIED: return input; case UNSHAREABLE: - return transform(internalName, externalName, null, input, null, null); + return transform(internalName, externalName, null, input, null, id, null); } } for (String peer : cached.uncacheablePeers) - transform(peer, slashesToDots(peer), null, cache.get(peer).bytes, isolatedCache, null); + transform(peer, slashesToDots(peer), null, cache.get(peer).bytes, isolatedCache, id, null); switch (cached.kind) { @@ -190,13 +201,13 @@ private synchronized byte[] transformTransitiveClosure(String externalName, byte }; final PeerGroup peerGroup = new PeerGroup(); - byte[] result = transform(internalName, externalName, peerGroup, input, isolatedCache, dependentTypeConsumer); + byte[] result = transform(internalName, externalName, peerGroup, input, isolatedCache, id, dependentTypeConsumer); for (String next = load.pollFirst(); next != null; next = load.pollFirst()) { // TODO (now): otherwise merge peer groups Cached existing = cache.get(next); if (existing == null) - transform(next, slashesToDots(next), peerGroup, read(next), isolatedCache, dependentTypeConsumer); + transform(next, slashesToDots(next), peerGroup, read(next), isolatedCache, id, dependentTypeConsumer); } return result; @@ -220,7 +231,7 @@ private byte[] read(String name) } } - private byte[] transform(String internalName, String externalName, PeerGroup peerGroup, byte[] input, Map isolatedCache, Consumer dependentTypes) + private byte[] transform(String internalName, String externalName, PeerGroup peerGroup, byte[] input, Map isolatedCache, int id, Consumer dependentTypes) { Hashcode hashcode = insertHashCode(externalName); @@ -245,7 +256,8 @@ private byte[] transform(String internalName, String externalName, PeerGroup pee return input; } - ClassTransformer transformer = new ClassTransformer(api, internalName, flags, monitorDelayChance, new NemesisGenerator(api, internalName, nemesisChance), nemesisFieldSelector, hashcode, dependentTypes); + int chanceSeed = internalName.hashCode() * 31 + id; + ClassTransformer transformer = new ClassTransformer(api, internalName, flags, monitorDelayChance.apply(chanceSeed), new NemesisGenerator(api, internalName, nemesisChance.apply(chanceSeed + 1)), nemesisFieldSelector, hashcode, dependentTypes); transformer.setUpdateVisibility(true); transformer.readAndTransform(input); @@ -369,13 +381,15 @@ protected byte[] maybeSynthetic(String externalName) EnumSet flags = EnumSet.of(Flag.GLOBAL_METHODS, Flag.MONITORS, Flag.LOCK_SUPPORT); if (NEMESIS.matcher(externalName).matches()) flags.add(Flag.NEMESIS); - NemesisGenerator nemesis = new NemesisGenerator(api, externalName, nemesisChance); + + int hashCode = externalName.hashCode(); + NemesisGenerator nemesis = new NemesisGenerator(api, externalName, nemesisChance.apply(hashCode)); ShadowingTransformer transformer; transformer = new ShadowingTransformer(InterceptClasses.BYTECODE_VERSION, originalType, shadowType, originalRootType, shadowRootType, originalOuterTypePrefix, shadowOuterTypePrefix, - flags, monitorDelayChance, nemesis, nemesisFieldSelector, null); + flags, monitorDelayChance.apply(hashCode), nemesis, nemesisFieldSelector, null); transformer.readAndTransform(Utils.readDefinition(originalType + ".class")); return transformer.toBytes(); } diff --git a/test/simulator/asm/org/apache/cassandra/simulator/asm/MonitorMethodTransformer.java b/test/simulator/asm/org/apache/cassandra/simulator/asm/MonitorMethodTransformer.java index d9c9c7ad9492..a7c21bbba744 100644 --- a/test/simulator/asm/org/apache/cassandra/simulator/asm/MonitorMethodTransformer.java +++ b/test/simulator/asm/org/apache/cassandra/simulator/asm/MonitorMethodTransformer.java @@ -122,8 +122,7 @@ int loadParamsAndReturnInvokeCode() } int invokeCode; - if (isInstanceMethod && (access & Opcodes.ACC_PRIVATE) != 0) invokeCode = Opcodes.INVOKESPECIAL; - else if (isInstanceMethod) invokeCode = Opcodes.INVOKEVIRTUAL; + if (isInstanceMethod) invokeCode = Opcodes.INVOKESPECIAL; else invokeCode = Opcodes.INVOKESTATIC; return invokeCode; } diff --git a/test/simulator/asm/org/apache/cassandra/simulator/asm/StringHashcode.java b/test/simulator/asm/org/apache/cassandra/simulator/asm/StringHashcode.java new file mode 100644 index 000000000000..fc3c57f8b524 --- /dev/null +++ b/test/simulator/asm/org/apache/cassandra/simulator/asm/StringHashcode.java @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.simulator.asm; + +import org.objectweb.asm.Opcodes; +import org.objectweb.asm.tree.InsnNode; +import org.objectweb.asm.tree.LabelNode; +import org.objectweb.asm.tree.MethodInsnNode; +import org.objectweb.asm.tree.MethodNode; + +/** + * Generate a new hashCode method in the class that invokes a deterministic hashCode generator + */ +class StringHashcode extends MethodNode +{ + StringHashcode(int api) + { + super(api, Opcodes.ACC_PUBLIC, "hashCode", "()I", null, null); + maxLocals = 1; + maxStack = 1; + instructions.add(new LabelNode()); + instructions.add(new MethodInsnNode(Opcodes.INVOKEVIRTUAL, "java/lang/Object", "toString", "()Ljava/lang/String;", false)); + instructions.add(new LabelNode()); + instructions.add(new MethodInsnNode(Opcodes.INVOKEVIRTUAL, "java/lang/Object", "hashCode", "(Ljava/lang/Object;)I", false)); + instructions.add(new InsnNode(Opcodes.IRETURN)); + } +} diff --git a/test/simulator/main/org/apache/cassandra/simulator/ActionList.java b/test/simulator/main/org/apache/cassandra/simulator/ActionList.java index a6178c187078..dde9bbfcfd24 100644 --- a/test/simulator/main/org/apache/cassandra/simulator/ActionList.java +++ b/test/simulator/main/org/apache/cassandra/simulator/ActionList.java @@ -41,6 +41,10 @@ public class ActionList extends AbstractCollection public static ActionList empty() { return EMPTY; } public static ActionList of(Action action) { return new ActionList(new Action[] { action }); } public static ActionList of(Stream action) { return new ActionList(action.toArray(Action[]::new)); } + public static ActionList of(Stream action, Stream... actions) + { + return new ActionList(Stream.concat(action, Stream.of(actions).flatMap(a -> a)).toArray(Action[]::new)); + } public static ActionList of(Collection actions) { return actions.isEmpty() ? EMPTY : new ActionList(actions.toArray(new Action[0])); } public static ActionList of(Action ... actions) { return new ActionList(actions); } diff --git a/test/simulator/main/org/apache/cassandra/simulator/ActionSchedule.java b/test/simulator/main/org/apache/cassandra/simulator/ActionSchedule.java index 6119e4706f53..39666077ea78 100644 --- a/test/simulator/main/org/apache/cassandra/simulator/ActionSchedule.java +++ b/test/simulator/main/org/apache/cassandra/simulator/ActionSchedule.java @@ -322,6 +322,9 @@ public boolean hasNext() return false; } + // NOTE: this is only here for debugging, its a quick way to see if pre (0), interleave (1), or post (2) is active + private int step = -1; + private boolean moreWork() { if (!moreWork.hasNext()) @@ -347,6 +350,8 @@ else if (oldMode == UNLIMITED) work.actors.forEach(runnableScheduler::attachTo); work.actors.forEach(a -> a.forEach(Action::setConsequence)); work.actors.forEach(this::add); + + step++; return true; } diff --git a/test/simulator/main/org/apache/cassandra/simulator/ClusterSimulation.java b/test/simulator/main/org/apache/cassandra/simulator/ClusterSimulation.java index ef092709320d..ed21d7079d50 100644 --- a/test/simulator/main/org/apache/cassandra/simulator/ClusterSimulation.java +++ b/test/simulator/main/org/apache/cassandra/simulator/ClusterSimulation.java @@ -38,6 +38,7 @@ import com.google.common.util.concurrent.AsyncFunction; import com.google.common.util.concurrent.FutureCallback; +import org.apache.cassandra.auth.PasswordSaltSupplier; import org.apache.cassandra.concurrent.ExecutorFactory; import org.apache.cassandra.config.ParameterizedClass; import org.apache.cassandra.distributed.Cluster; @@ -51,6 +52,7 @@ import org.apache.cassandra.distributed.api.IIsolatedExecutor.SerializableBiConsumer; import org.apache.cassandra.distributed.api.IIsolatedExecutor.SerializableConsumer; import org.apache.cassandra.distributed.api.IIsolatedExecutor.SerializableRunnable; +import org.apache.cassandra.distributed.impl.ClusterIDDefiner; import org.apache.cassandra.distributed.impl.DirectStreamingConnectionFactory; import org.apache.cassandra.distributed.impl.InstanceConfig; import org.apache.cassandra.distributed.impl.InstanceIDDefiner; @@ -59,12 +61,15 @@ import org.apache.cassandra.io.filesystem.ListenableFileSystem; import org.apache.cassandra.io.util.FileSystems; import org.apache.cassandra.net.Verb; +import org.apache.cassandra.service.consensus.TransactionalMode; import org.apache.cassandra.service.paxos.BallotGenerator; import org.apache.cassandra.service.paxos.PaxosPrepare; import org.apache.cassandra.simulator.RandomSource.Choices; +import org.apache.cassandra.simulator.asm.DeterministicChanceSupplier; import org.apache.cassandra.simulator.asm.InterceptAsClassTransformer; import org.apache.cassandra.simulator.asm.NemesisFieldSelectors; import org.apache.cassandra.simulator.cluster.ClusterActions; +import org.apache.cassandra.simulator.cluster.ClusterActions.ConsensusChange; import org.apache.cassandra.simulator.cluster.ClusterActions.TopologyChange; import org.apache.cassandra.simulator.systems.Failures; import org.apache.cassandra.simulator.systems.InterceptedWait.CaptureSites.Capture; @@ -91,7 +96,6 @@ import org.apache.cassandra.simulator.utils.LongRange; import org.apache.cassandra.utils.Clock; import org.apache.cassandra.utils.Closeable; -import org.apache.cassandra.utils.FBUtilities; import org.apache.cassandra.utils.Throwables; import org.apache.cassandra.utils.concurrent.Ref; import org.apache.cassandra.utils.memory.BufferPool; @@ -150,6 +154,9 @@ public static abstract class Builder protected TopologyChange[] topologyChanges = TopologyChange.values(); protected int topologyChangeLimit = -1; + protected ConsensusChange[] consensusChanges = ConsensusChange.values(); + protected int consensusChangeLimit = -1; + protected int primaryKeyCount; protected int secondsToSimulate; @@ -175,7 +182,8 @@ public static abstract class Builder schedulerLongDelayNanos = new LongRange(50, 5000, MICROSECONDS, NANOSECONDS), clockDriftNanos = new LongRange(1, 5000, MILLISECONDS, NANOSECONDS), clockDiscontinuitIntervalNanos = new LongRange(10, 60, SECONDS, NANOSECONDS), - topologyChangeIntervalNanos = new LongRange(5, 15, SECONDS, NANOSECONDS); + topologyChangeIntervalNanos = new LongRange(5, 15, SECONDS, NANOSECONDS), + consensusChangeIntervalNanos = new LongRange(1, 5, SECONDS, NANOSECONDS); @@ -192,6 +200,7 @@ public static abstract class Builder protected HeapPool.Logged.Listener memoryListener; protected SimulatedTime.Listener timeListener = (i1, i2) -> {}; protected LongConsumer onThreadLocalRandomCheck; + protected String transactionalMode = "full"; public Builder failures(Failures failures) { @@ -311,6 +320,24 @@ public Builder topologyChangeLimit(int topologyChangeLimit) return this; } + public Builder consensusChanges(ConsensusChange[] consensusChanges) + { + this.consensusChanges = consensusChanges; + return this; + } + + public Builder consensusChangeIntervalNanos(LongRange consensusChangeIntervalNanos) + { + this.consensusChangeIntervalNanos = consensusChangeIntervalNanos; + return this; + } + + public Builder consensusChangeLimit(int consensusChangeLimit) + { + this.consensusChangeLimit = consensusChangeLimit; + return this; + } + public int primaryKeyCount() { return primaryKeyCount; @@ -550,6 +577,17 @@ public Builder onThreadLocalRandomCheck(LongConsumer runnable) return this; } + public Builder transactionalMode(String mode) + { + this.transactionalMode = mode; + return this; + } + + public TransactionalMode transactionalMode() + { + return TransactionalMode.fromString(transactionalMode); + } + public abstract ClusterSimulation create(long seed) throws IOException; } @@ -580,7 +618,7 @@ IInstanceConfig update(IInstanceConfig config) .set("concurrent_counter_writes", take(1, 4)) .set("concurrent_materialized_view_writes", take(1, 4)) .set("concurrent_reads", take(1, 4)) - .forceSet("available_processors", take(3, 4)); + .set("available_processors", take(3, 4)); } // begin allocating for a new node @@ -627,7 +665,7 @@ int take(int times, int min, int max) if (remaining * min <= allocationPool) return min; if (times == remaining) - return allocationPool / remaining; + return Math.max(allocationPool / remaining, min); if (times + 1 == remaining) return random.uniform(Math.max(min, (allocationPool - max) / times), Math.min(max, (allocationPool - min) / times)); @@ -638,7 +676,6 @@ int take(int times, int min, int max) } } - public final RandomSource random; public final SimulatedSystems simulated; public final Cluster cluster; @@ -687,6 +724,9 @@ public ClusterSimulation(RandomSource random, long seed, int uniqueNum, nodeToDc[n++] = i; } } + if (builder.topologyChangeLimit < 0) + initialRf = maxRf; + snitch = new SimulatedSnitch(nodeToDc, numInDcs); execution = new SimulatedExecution(); @@ -702,7 +742,24 @@ public ClusterSimulation(RandomSource random, long seed, int uniqueNum, }); Predicate sharedClassPredicate = getSharedClassPredicate(ISOLATE, SHARE, ANY, SIMULATION); - InterceptAsClassTransformer interceptClasses = new InterceptAsClassTransformer(builder.monitorDelayChance.asSupplier(random), builder.nemesisChance.asSupplier(random), NemesisFieldSelectors.get(), ClassLoader.getSystemClassLoader(), sharedClassPredicate.negate()); + DeterministicChanceSupplier monitorDelayChance; { + long monitorDelayChanceSeed = random.uniform(0, Long.MAX_VALUE); + monitorDelayChance = hash -> { + RandomSource subRandom = new RandomSource.Default(); + subRandom.reset(monitorDelayChanceSeed * 31 + hash); + return builder.monitorDelayChance.asSupplier(subRandom); + }; + } + DeterministicChanceSupplier nemesisChance; { + long nemesisChanceSeed = random.uniform(0, Long.MAX_VALUE); + nemesisChance = hash -> { + RandomSource subRandom = new RandomSource.Default(); + subRandom.reset(nemesisChanceSeed * 31 + hash); + return builder.nemesisChance.asSupplier(subRandom); + }; + } + + InterceptAsClassTransformer interceptClasses = new InterceptAsClassTransformer(monitorDelayChance, nemesisChance, NemesisFieldSelectors.get(), ClassLoader.getSystemClassLoader(), sharedClassPredicate.negate()); threadLocalRandomCheck = new ThreadLocalRandomCheck(builder.onThreadLocalRandomCheck); Failures failures = builder.failures; @@ -722,11 +779,13 @@ public ClusterSimulation(RandomSource random, long seed, int uniqueNum, .set("memtable_allocation_type", builder.memoryListener != null ? "unslabbed_heap_buffers_logged" : "heap_buffers") .set("file_cache_size", "16MiB") .set("use_deterministic_table_id", true) + .set("accord.queue_submission_model", "ASYNC") .set("disk_access_mode", "standard") .set("failure_detector", SimulatedFailureDetector.Instance.class.getName()) .set("commitlog_compression", new ParameterizedClass(LZ4Compressor.class.getName(), emptyMap())) - .set("commitlog_sync", "batch"); - + .set("commitlog_sync", "batch") + .set("accord.journal.flush_mode", "BATCH") + .set("accord.command_store_shard_count", "4"); // TODO: Add remove() to IInstanceConfig if (config instanceof InstanceConfig) { @@ -742,6 +801,11 @@ public ClusterSimulation(RandomSource random, long seed, int uniqueNum, @Override public void initialise(ClassLoader classLoader, ThreadGroup threadGroup, int num, int generation) { + IsolatedExecutor.transferAdhoc((IIsolatedExecutor.SerializableConsumer) ClusterIDDefiner::setId, classLoader) + .accept(threadGroup.getParent().getName()); + IsolatedExecutor.transferAdhoc((IIsolatedExecutor.SerializableConsumer) InstanceIDDefiner::setInstanceId, classLoader) + .accept(num); + List onShutdown = new ArrayList<>(); IsolatedExecutor.transferAdhoc((SerializableConsumer) InstanceIDDefiner::setInstanceId, classLoader) .accept(num); @@ -769,9 +833,10 @@ public void initialise(ClassLoader classLoader, ThreadGroup threadGroup, int num @Override public void beforeStartup(IInstance i) { - ((IInvokableInstance) i).unsafeAcceptOnThisThread(FBUtilities::setAvailableProcessors, i.config().getInt("available_processors")); + ((IInvokableInstance) i).unsafeAcceptOnThisThread(PasswordSaltSupplier::unsafeSet, () -> "$2a$05$rT01y27MnvpE7NgzwvYNFe"); ((IInvokableInstance) i).unsafeAcceptOnThisThread(IfInterceptibleThread::setThreadLocalRandomCheck, (LongConsumer) threadLocalRandomCheck); + int num = i.config().num(); if (builder.memoryListener != null) { @@ -818,8 +883,11 @@ public void afterStartup(IInstance i) simulated.register((SimulatedFutureActionScheduler) futureActionScheduler); scheduler = builder.schedulerFactory.create(random); + // TODO (required): we aren't passing paxos variant change parameter anymore options = new ClusterActions.Options(builder.topologyChangeLimit, Choices.uniform(KindOfSequence.values()).choose(random).period(builder.topologyChangeIntervalNanos, random), Choices.random(random, builder.topologyChanges), + builder.consensusChangeLimit, Choices.uniform(KindOfSequence.values()).choose(random).period(builder.consensusChangeIntervalNanos, random), + Choices.random(random, builder.consensusChanges), minRf, initialRf, maxRf, null); this.factory = factory; } diff --git a/test/simulator/main/org/apache/cassandra/simulator/Debug.java b/test/simulator/main/org/apache/cassandra/simulator/Debug.java index bcf0947fe406..cf0be6709255 100644 --- a/test/simulator/main/org/apache/cassandra/simulator/Debug.java +++ b/test/simulator/main/org/apache/cassandra/simulator/Debug.java @@ -67,7 +67,15 @@ // TODO (feature): log only deltas for schema/cluster data public class Debug { - private static final Logger logger = LoggerFactory.getLogger(Debug.class); + private static class LoggerHandle + { + private static final Logger logger = LoggerFactory.getLogger(Debug.class); + } + + private static Logger logger() + { + return LoggerHandle.logger; + } public enum EventType { PARTITION, CLUSTER } public enum Level @@ -219,15 +227,15 @@ private LogOne(SimulatedTime time, boolean logConsequences) @Override public void before(Action action, Before before) { - if (logger.isWarnEnabled()) // invoke toString() eagerly to ensure we have the task's descriptin - logger.warn(String.format("%6ds %s %s", TimeUnit.NANOSECONDS.toSeconds(time.nanoTime()), before, action)); + if (logger().isWarnEnabled()) // invoke toString() eagerly to ensure we have the task's descriptin + logger().warn(String.format("%6ds %s %s", TimeUnit.NANOSECONDS.toSeconds(time.nanoTime()), before, action)); } @Override public void consequences(ActionList consequences) { - if (logConsequences && !consequences.isEmpty() && logger.isWarnEnabled()) - logger.warn(String.format("%6ds Next: %s", TimeUnit.NANOSECONDS.toSeconds(time.nanoTime()), consequences)); + if (logConsequences && !consequences.isEmpty() && logger().isWarnEnabled()) + logger().warn(String.format("%6ds Next: %s", TimeUnit.NANOSECONDS.toSeconds(time.nanoTime()), consequences)); } } @@ -241,7 +249,7 @@ public LogTermination(ActionListener wrap) @Override public void transitivelyAfter(Action finished) { - logger.warn("Terminated {}", finished); + logger().warn("Terminated {}", finished); } } @@ -268,7 +276,7 @@ private Consumer debugGossip(Cluster cluster) for (InetAddressAndPort ep : Gossiper.instance.getLiveMembers()) { EndpointState epState = Gossiper.instance.getEndpointStateForEndpoint(ep); - logger.warn("Gossip {}: {} {}", ep, epState.isAlive(), epState.states().stream() + logger().warn("Gossip {}: {} {}", ep, epState.isAlive(), epState.states().stream() .map(e -> e.getKey().toString() + "=(" + e.getValue().value + ',' + e.getValue().version + ')') .collect(Collectors.joining(", ", "[", "]"))); } @@ -305,11 +313,11 @@ public static Consumer debugPaxos(Cluster cluster, String keyspace, int TableMetadata metadata = Keyspace.open(keyspace).getColumnFamilyStore("tbl").metadata.get(); ByteBuffer pkbb = Int32Type.instance.decompose(pkint); DecoratedKey key = new BufferDecoratedKey(DatabaseDescriptor.getPartitioner().getToken(pkbb), pkbb); - logger.warn("node{}({}): {}", num, primaryKey, paxosDebugInfo(key, metadata, FBUtilities.nowInSeconds())); + logger().warn("node{}({}): {}", num, primaryKey, paxosDebugInfo(key, metadata, FBUtilities.nowInSeconds())); } catch (Throwable t) { - logger.warn("node{}({})", num, primaryKey, t); + logger().warn("node{}({})", num, primaryKey, t); } }, node, primaryKey); } @@ -320,7 +328,7 @@ public static Consumer debugRf(Cluster cluster, String keyspace) { return ignore -> { cluster.forEach(i -> i.unsafeRunOnThisThread(() -> { - logger.warn("{} {}", + logger().warn("{} {}", Schema.instance.getKeyspaceMetadata(keyspace) == null ? "" : Schema.instance.getKeyspaceMetadata(keyspace).params.replication.toString(), Schema.instance.getKeyspaceMetadata(keyspace) == null ? "" : Keyspace.open(keyspace).getReplicationStrategy().configOptions.toString()); })); @@ -332,7 +340,7 @@ public static Consumer debugOwnership(Cluster cluster, String keyspace, return ignore -> { for (int node = 1 ; node <= cluster.size() ; ++node) { - logger.warn("node{}({}): {}", node, primaryKey, cluster.get(node).unsafeApplyOnThisThread(v -> { + logger().warn("node{}({}): {}", node, primaryKey, cluster.get(node).unsafeApplyOnThisThread(v -> { try { return ReplicaLayout.forTokenWriteLiveAndDown(Keyspace.open(keyspace), Murmur3Partitioner.instance.getToken(Int32Type.instance.decompose(v))).all().endpointList().toString(); @@ -350,7 +358,7 @@ public static Consumer debugRing(Cluster cluster, String keyspace) { return ignore -> cluster.forEach(i -> i.unsafeRunOnThisThread(() -> { if (Schema.instance.getKeyspaceMetadata(keyspace) != null) - logger.warn("{}", ClusterMetadata.current()); + logger().warn("{}", ClusterMetadata.current()); })); } diff --git a/test/simulator/main/org/apache/cassandra/simulator/RandomSource.java b/test/simulator/main/org/apache/cassandra/simulator/RandomSource.java index 14d7ad9b1d90..4e429e418fc6 100644 --- a/test/simulator/main/org/apache/cassandra/simulator/RandomSource.java +++ b/test/simulator/main/org/apache/cassandra/simulator/RandomSource.java @@ -20,13 +20,17 @@ import java.lang.reflect.Array; import java.util.Arrays; +import java.util.List; import java.util.Map; import java.util.Random; +import java.util.Set; import java.util.function.IntSupplier; import java.util.function.LongSupplier; import java.util.stream.IntStream; import java.util.stream.LongStream; +import com.google.common.collect.Iterators; + import org.apache.cassandra.utils.Shared; import static org.apache.cassandra.utils.Shared.Scope.SIMULATION; @@ -46,11 +50,20 @@ private Choices(float[] cumulativeProbabilities, T[] options) } public T choose(RandomSource random) + { + return choose(random.uniformFloat()); + } + + public T choose(accord.utils.RandomSource random) + { + return choose(random.nextFloat()); + } + + private T choose(float choose) { if (options.length == 0) return null; - float choose = random.uniformFloat(); int i = Arrays.binarySearch(cumulativeProbabilities, choose); if (i < 0) i = -1 - i; @@ -131,6 +144,41 @@ public static Choices uniform(T ... options) Arrays.fill(nonCumulativeProbabilities, 1f / options.length); return new Choices<>(cumulativeProbabilities(nonCumulativeProbabilities), options); } + + public static T choose(RandomSource rs, Set set) + { + return choose(rs.uniform(0, set.size()), set); + } + + public static T choose(accord.utils.RandomSource rs, Set set) + { + return choose(rs.nextInt(set.size()), set); + } + + private static T choose(int i, Set set) + { + return Iterators.get(set.iterator(), i); + } + + public static T choose(RandomSource rs, List list) + { + return list.get(rs.uniform(0, list.size())); + } + + public static T choose(accord.utils.RandomSource rs, List list) + { + return list.get(rs.nextInt(list.size())); + } + + public static T choose(RandomSource rs, T ... array) + { + return array[rs.uniform(0, array.length)]; + } + + public static T choose(accord.utils.RandomSource rs, T ... array) + { + return array[rs.nextInt(array.length)]; + } } public static abstract class Abstract implements RandomSource diff --git a/test/simulator/main/org/apache/cassandra/simulator/SimulationRunner.java b/test/simulator/main/org/apache/cassandra/simulator/SimulationRunner.java index 97c2db35a551..b04fb0a4cb5a 100644 --- a/test/simulator/main/org/apache/cassandra/simulator/SimulationRunner.java +++ b/test/simulator/main/org/apache/cassandra/simulator/SimulationRunner.java @@ -25,6 +25,7 @@ import java.util.Optional; import java.util.Random; import java.util.concurrent.ThreadLocalRandom; +import java.util.concurrent.TimeUnit; import java.util.function.ToDoubleFunction; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -41,14 +42,17 @@ import org.apache.cassandra.config.CassandraRelevantProperties; import org.apache.cassandra.simulator.Debug.Info; import org.apache.cassandra.simulator.Debug.Levels; +import org.apache.cassandra.simulator.cluster.ClusterActions.ConsensusChange; import org.apache.cassandra.simulator.cluster.ClusterActions.TopologyChange; import org.apache.cassandra.simulator.debug.SelfReconcile; +import org.apache.cassandra.simulator.logging.SeedDefiner; import org.apache.cassandra.simulator.systems.InterceptedWait; import org.apache.cassandra.simulator.systems.InterceptedWait.CaptureSites.Capture; import org.apache.cassandra.simulator.systems.InterceptibleThread; import org.apache.cassandra.simulator.systems.InterceptorOfGlobalMethods; import org.apache.cassandra.simulator.utils.ChanceRange; import org.apache.cassandra.utils.Clock; +import org.apache.cassandra.utils.FBUtilities; import org.apache.cassandra.utils.Hex; import org.apache.cassandra.utils.concurrent.UncheckedInterruptedException; @@ -60,20 +64,23 @@ import static org.apache.cassandra.config.CassandraRelevantProperties.CLOCK_MONOTONIC_APPROX; import static org.apache.cassandra.config.CassandraRelevantProperties.CLOCK_MONOTONIC_PRECISE; import static org.apache.cassandra.config.CassandraRelevantProperties.CONSISTENT_DIRECTORY_LISTINGS; +import static org.apache.cassandra.config.CassandraRelevantProperties.DETERMINISM_SSTABLE_COMPRESSION_DEFAULT; import static org.apache.cassandra.config.CassandraRelevantProperties.DETERMINISM_UNSAFE_UUID_NODE; +import static org.apache.cassandra.config.CassandraRelevantProperties.DISABLE_GOSSIP_ENDPOINT_REMOVAL; import static org.apache.cassandra.config.CassandraRelevantProperties.DISABLE_SSTABLE_ACTIVITY_TRACKING; -import static org.apache.cassandra.config.CassandraRelevantProperties.DETERMINISM_SSTABLE_COMPRESSION_DEFAULT; import static org.apache.cassandra.config.CassandraRelevantProperties.DTEST_API_LOG_TOPOLOGY; import static org.apache.cassandra.config.CassandraRelevantProperties.GOSSIPER_SKIP_WAITING_TO_SETTLE; import static org.apache.cassandra.config.CassandraRelevantProperties.IGNORE_MISSING_NATIVE_FILE_HINTS; -import static org.apache.cassandra.config.CassandraRelevantProperties.ORG_APACHE_CASSANDRA_DISABLE_MBEAN_REGISTRATION; import static org.apache.cassandra.config.CassandraRelevantProperties.LIBJEMALLOC; import static org.apache.cassandra.config.CassandraRelevantProperties.MEMTABLE_OVERHEAD_SIZE; +import static org.apache.cassandra.config.CassandraRelevantProperties.ORG_APACHE_CASSANDRA_DISABLE_MBEAN_REGISTRATION; import static org.apache.cassandra.config.CassandraRelevantProperties.PAXOS_REPAIR_RETRY_TIMEOUT_IN_MS; import static org.apache.cassandra.config.CassandraRelevantProperties.RING_DELAY; import static org.apache.cassandra.config.CassandraRelevantProperties.SHUTDOWN_ANNOUNCE_DELAY_IN_MS; +import static org.apache.cassandra.config.CassandraRelevantProperties.SIMULATOR_STARTED; import static org.apache.cassandra.config.CassandraRelevantProperties.SYSTEM_AUTH_DEFAULT_RF; -import static org.apache.cassandra.config.CassandraRelevantProperties.DISABLE_GOSSIP_ENDPOINT_REMOVAL; +import static org.apache.cassandra.config.CassandraRelevantProperties.TEST_CASSANDRA_SUITENAME; +import static org.apache.cassandra.config.CassandraRelevantProperties.TEST_CASSANDRA_TESTTAG; import static org.apache.cassandra.config.CassandraRelevantProperties.TEST_JVM_DTEST_DISABLE_SSL; import static org.apache.cassandra.simulator.debug.Reconcile.reconcileWith; import static org.apache.cassandra.simulator.debug.Record.record; @@ -85,7 +92,15 @@ @SuppressWarnings({ "ZeroLengthArrayAllocation", "CodeBlock2Expr", "SameParameterValue", "DynamicRegexReplaceableByCompiledPattern", "CallToSystemGC" }) public class SimulationRunner { - private static final Logger logger = LoggerFactory.getLogger(SimulationRunner.class); + private static class LoggerHandle + { + private static final Logger logger = LoggerFactory.getLogger(SimulationRunner.class); + } + + private static Logger logger() + { + return LoggerHandle.logger; + } public enum RecordOption { NONE, VALUE, WITH_CALLSITES } @@ -125,6 +140,7 @@ public static void beforeAll() IGNORE_MISSING_NATIVE_FILE_HINTS.setBoolean(true); ORG_APACHE_CASSANDRA_DISABLE_MBEAN_REGISTRATION.setBoolean(true); TEST_JVM_DTEST_DISABLE_SSL.setBoolean(true); // to support easily running without netty from dtest-jar + SIMULATOR_STARTED.setString(Long.toString(TimeUnit.MILLISECONDS.toSeconds(System.currentTimeMillis()))); if (Thread.currentThread() instanceof InterceptibleThread); // load InterceptibleThread class to avoid infinite loop in InterceptorOfGlobalMethods new InterceptedWait.CaptureSites(Thread.currentThread()) @@ -169,9 +185,16 @@ protected abstract static class BasicCommand { - builder.scheduler(stream(kinds.split(",")) - .filter(v -> !v.isEmpty()) - .map(v -> RunnableActionScheduler.Kind.valueOf(toUpperCaseLocalized(v))) - .toArray(RunnableActionScheduler.Kind[]::new)); + Optional.ofNullable(consensusChanges).ifPresent(consensusChanges -> { + builder.consensusChanges(stream(consensusChanges.split(",")) + .filter(v -> !v.isEmpty()) + .map(v -> ConsensusChange.valueOf(toUpperCaseLocalized(v))) + .toArray(ConsensusChange[]::new)); }); - + parseNanosRange(Optional.ofNullable(consensusChangeInterval)).ifPresent(builder::consensusChangeIntervalNanos); + builder.consensusChangeLimit(Integer.parseInt(consensusChangeLimit)); Optional.ofNullable(this.capture) .map(s -> s.split(",")) .map(s -> new Capture( @@ -315,11 +342,19 @@ protected void propagate(B builder) .orElse(new int[0]); builder.debug(debugLevels, debugPrimaryKeys); } + + Optional.ofNullable(transactionalMode).ifPresent(builder::transactionalMode); } public void run(B builder) throws IOException { + long seed = parseHex(Optional.ofNullable(this.seed)).orElse(new Random(System.nanoTime()).nextLong()); + SeedDefiner.setSeed(seed); beforeAll(); + // TODO (expected): this doesn't work properly for multiple seeds in a single JVM + TEST_CASSANDRA_TESTTAG.setString("simulator"); + TEST_CASSANDRA_SUITENAME.setString(SIMULATOR_STARTED.getString() + '-' + CassandraRelevantProperties.SIMULATOR_SEED.getString()); + logger(); Thread.setDefaultUncaughtExceptionHandler((th, e) -> { boolean isInterrupt = false; Throwable t = e; @@ -329,14 +364,12 @@ public void run(B builder) throws IOException t = t.getCause(); } if (!isInterrupt) - logger.error("Uncaught exception on {}", th, e); + logger().error("Uncaught exception on {}", th, e); if (e instanceof Error) throw (Error) e; }); propagate(builder); - - long seed = parseHex(Optional.ofNullable(this.seed)).orElse(new Random(System.nanoTime()).nextLong()); for (int i = 0 ; i < simulationCount ; ++i) { cleanup(); @@ -353,7 +386,8 @@ protected static class Run> extends Basic { protected void run(long seed, B builder) throws IOException { - logger.error("Seed 0x{}", Long.toHexString(seed)); + logger().error("Seed 0x{}", Long.toHexString(seed)); + logger().info("Cassandra {} / {}", FBUtilities.getReleaseVersionString(), FBUtilities.getGitSHA()); try (ClusterSimulation cluster = builder.create(seed)) { @@ -363,6 +397,7 @@ protected void run(long seed, B builder) throws IOException } catch (Throwable t) { + logger().error("Failed on seed 0x{}", Long.toHexString(seed), t); throw new SimulationException(seed, t); } } @@ -431,6 +466,16 @@ public void run(B builder) throws IOException } } + @Command(name = "version", description = "Display version information") + protected static class VersionCommand> implements ICommand + { + @Override + public void run(B builder) throws IOException + { + System.out.println(FBUtilities.getReleaseVersionString()); + System.out.println(FBUtilities.getGitSHA()); + } + } public static Optional parseHex(Optional value) { diff --git a/test/simulator/main/org/apache/cassandra/simulator/SimulatorUtils.java b/test/simulator/main/org/apache/cassandra/simulator/SimulatorUtils.java index 5be3384eebd1..622e35ab5d85 100644 --- a/test/simulator/main/org/apache/cassandra/simulator/SimulatorUtils.java +++ b/test/simulator/main/org/apache/cassandra/simulator/SimulatorUtils.java @@ -18,9 +18,13 @@ package org.apache.cassandra.simulator; +import java.lang.management.ManagementFactory; +import java.lang.management.RuntimeMXBean; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import java.util.Map; +import java.util.stream.Collectors; import org.slf4j.Logger; @@ -40,9 +44,43 @@ public static RuntimeException failWithOOM() public static void dumpStackTraces(Logger logger) { Map threadMap = Thread.getAllStackTraces(); - threadMap.forEach((thread, ste) -> { - logger.error("{}:\n {}", thread, Threads.prettyPrint(ste, false, " ", "\n", "")); - }); + String prefix = " "; + String delimiter = "\n" + prefix; + threadMap.forEach((thread, ste) -> + logger.error("{}:\n{}", thread, Threads.prettyPrint(ste, false, prefix, delimiter, ""))); FastThreadLocal.destroy(); } + + public static void verifyAndlogSimulatorArgs(Logger logger, String[] args) + { + RuntimeMXBean runtimeMxBean = ManagementFactory.getRuntimeMXBean(); + final List jvmArgs = runtimeMxBean.getInputArguments(); + logger.error("JVM Args: {}", jvmArgs.stream().collect(Collectors.joining("\" \"", "\"", "\""))); + logger.error("Command Args: {}", Arrays.stream(args).collect(Collectors.joining("\" \"", "\"", "\""))); + + assert jvmArgs.stream().anyMatch(arg -> arg.startsWith("-Xbootclasspath/a") && arg.endsWith("simulator-bootstrap.jar")) : + "must launch JVM with -Xbootclasspath/a:simulator-bootstrap.jar"; + assert jvmArgs.stream().anyMatch(arg -> arg.startsWith("-javaagent:") && arg.endsWith("simulator-asm.jar")) : + "must launch JVM with -javaagent:simulator-asm.jar"; + if (!jvmArgs.stream().anyMatch(arg -> arg.equals("-XX:-BackgroundCompilation"))) + logger.warn("JVM Argument -XX:-BackgroundCompilation not set, non-determinism possible"); + if (!jvmArgs.stream().anyMatch(arg -> arg.equals("-XX:-TieredCompilation"))) + logger.warn("JVM Argument -XX:-TieredCompilation not set, non-determinism possible"); + if (!jvmArgs.stream().anyMatch(arg -> arg.equals("-XX:CICompilerCount=1"))) + logger.warn("JVM Argument -XX:CICompilerCount=1 not set, non-determinism possible"); + if (!jvmArgs.stream().anyMatch(arg -> arg.startsWith("-XX:Tier4CompileThreshold="))) + logger.warn("JVM Argument -XX:Tier4CompileThreshold not set, non-determinism possible. Typically set -XX:Tier4CompileThreshold=1000"); + if (!jvmArgs.stream().anyMatch(arg -> arg.equals("-Dcassandra.disable_tcactive_openssl=true"))) + logger.warn("JVM Argument -Dcassandra.disable_tcactive_openssl=true not set, non-determinism possible. Typically set -XX:Tier4CompileThreshold=1000"); + + // log4j support + if (!jvmArgs.stream().anyMatch(arg -> arg.equals("-Dlog4j2.disableJmx=true"))) + logger.warn("JVM Argument -Dlog4j2.disableJmx=true not set, non-determinism possible"); + if (!jvmArgs.stream().anyMatch(arg -> arg.equals("-Dlog4j2.disable.jmx=true"))) + logger.warn("JVM Argument -Dlog4j2.disable.jmx=true not set, non-determinism possible"); + if (!jvmArgs.stream().anyMatch(arg -> arg.equals("-Dlog4j.shutdownHookEnabled=false"))) + logger.warn("JVM Argument -Dlog4j.shutdownHookEnabled=false not set, non-determinism possible"); + if (!jvmArgs.stream().anyMatch(arg -> arg.equals("-Dcassandra.simulator.skiplog4jreload=true"))) + logger.warn("JVM Argument -Dcassandra.simulator.skiplog4jreload=true not set, non-determinism possible"); + } } diff --git a/test/simulator/main/org/apache/cassandra/simulator/asm/InterceptAsClassTransformer.java b/test/simulator/main/org/apache/cassandra/simulator/asm/InterceptAsClassTransformer.java index 200f98409438..b3f04c157f32 100644 --- a/test/simulator/main/org/apache/cassandra/simulator/asm/InterceptAsClassTransformer.java +++ b/test/simulator/main/org/apache/cassandra/simulator/asm/InterceptAsClassTransformer.java @@ -25,12 +25,13 @@ // an adapter to IClassTransformer that is loaded by the system classloader public class InterceptAsClassTransformer extends InterceptClasses implements IClassTransformer { - public InterceptAsClassTransformer(ChanceSupplier monitorDelayChance, ChanceSupplier nemesisChance, NemesisFieldKind.Selector nemesisFieldSelector, ClassLoader prewarmClassLoader, Predicate prewarm) + private int subTransformerCount = 0; + public InterceptAsClassTransformer(DeterministicChanceSupplier monitorDelayChance, DeterministicChanceSupplier nemesisChance, NemesisFieldKind.Selector nemesisFieldSelector, ClassLoader prewarmClassLoader, Predicate prewarm) { super(monitorDelayChance, nemesisChance, nemesisFieldSelector, prewarmClassLoader, prewarm); } - public InterceptAsClassTransformer(int api, ChanceSupplier monitorDelayChance, ChanceSupplier nemesisChance, NemesisFieldKind.Selector nemesisFieldSelector, ClassLoader prewarmClassLoader, Predicate prewarm) + public InterceptAsClassTransformer(int api, DeterministicChanceSupplier monitorDelayChance, DeterministicChanceSupplier nemesisChance, NemesisFieldKind.Selector nemesisFieldSelector, ClassLoader prewarmClassLoader, Predicate prewarm) { super(api, monitorDelayChance, nemesisChance, nemesisFieldSelector, prewarmClassLoader, prewarm); } @@ -44,6 +45,6 @@ public byte[] transform(String name, byte[] bytecode) @Override public IClassTransformer initialise() { - return new SubTransformer()::apply; + return new SubTransformer(++subTransformerCount)::apply; } } diff --git a/test/simulator/main/org/apache/cassandra/simulator/cluster/ClusterActions.java b/test/simulator/main/org/apache/cassandra/simulator/cluster/ClusterActions.java index 60b6a62e5726..c8bc94d1456b 100644 --- a/test/simulator/main/org/apache/cassandra/simulator/cluster/ClusterActions.java +++ b/test/simulator/main/org/apache/cassandra/simulator/cluster/ClusterActions.java @@ -61,6 +61,7 @@ import org.apache.cassandra.tcm.transformations.UnsafeJoin; import static org.apache.cassandra.simulator.Action.Modifiers.NO_TIMEOUTS; +import static org.apache.cassandra.simulator.Action.Modifiers.RELIABLE; import static org.apache.cassandra.simulator.Debug.EventType.CLUSTER; import static org.apache.cassandra.simulator.cluster.ClusterActions.TopologyChange.JOIN; import static org.apache.cassandra.simulator.cluster.ClusterActions.TopologyChange.LEAVE; @@ -85,6 +86,11 @@ public enum TopologyChange JOIN, LEAVE, REPLACE, CHANGE_RF } + public enum ConsensusChange + { + ACCORD_MIGRATE + } + public static class Options { public final int topologyChangeLimit; @@ -92,6 +98,9 @@ public static class Options public final Choices allChoices; public final Choices choicesNoLeave; public final Choices choicesNoJoin; + public final int consensusChangeLimit; + public final KindOfSequence.Period consensusChangeInterval; + public final Choices consensusChoices; public final int[] minRf, initialRf, maxRf; public final PaxosVariant changePaxosVariantTo; @@ -108,32 +117,45 @@ public Options(Options copy, PaxosVariant changePaxosVariantTo) this.allChoices = copy.allChoices; this.choicesNoLeave = copy.choicesNoLeave; this.choicesNoJoin = copy.choicesNoJoin; + this.consensusChangeLimit = copy.consensusChangeLimit; + this.consensusChangeInterval = copy.consensusChangeInterval; + this.consensusChoices = copy.consensusChoices; this.minRf = copy.minRf; this.initialRf = copy.initialRf; this.maxRf = copy.maxRf; this.changePaxosVariantTo = changePaxosVariantTo; } - public Options(int topologyChangeLimit, KindOfSequence.Period topologyChangeInterval, Choices choices, int[] minRf, int[] initialRf, int[] maxRf, PaxosVariant changePaxosVariantTo) + public Options(int topologyChangeLimit, + KindOfSequence.Period topologyChangeInterval, + Choices topologyChangeChoices, + int consensusChangeLimit, + KindOfSequence.Period consensusChangeInterval, + Choices consensusChangeChoices, + int[] minRf, int[] initialRf, int[] maxRf, + PaxosVariant changePaxosVariantTo) { if (Arrays.equals(minRf, maxRf)) - choices = choices.without(TopologyChange.CHANGE_RF); + topologyChangeChoices = topologyChangeChoices.without(TopologyChange.CHANGE_RF); this.topologyChangeInterval = topologyChangeInterval; this.topologyChangeLimit = topologyChangeLimit; + this.consensusChangeInterval = consensusChangeInterval; + this.consensusChangeLimit = consensusChangeLimit; this.minRf = minRf; this.initialRf = initialRf; this.maxRf = maxRf; - this.allChoices = choices; + this.allChoices = topologyChangeChoices; this.choicesNoJoin = allChoices.without(JOIN).without(REPLACE); this.choicesNoLeave = allChoices.without(LEAVE); + this.consensusChoices = consensusChangeChoices; this.changePaxosVariantTo = changePaxosVariantTo; } public static Options noActions(int clusterSize) { int[] rf = new int[]{clusterSize}; - return new Options(0, UNIFORM.period(null, null), Choices.uniform(), rf, rf, rf, null); + return new Options(0, UNIFORM.period(null, null), Choices.uniform(), 0, UNIFORM.period(null, null), Choices.uniform(), rf, rf, rf, null); } public Options changePaxosVariantTo(PaxosVariant newVariant) @@ -195,7 +217,7 @@ public Action initializeCluster(int[] joined, int[] prejoin) return StrictAction.of("Initialise Cluster", () -> { List actions = new ArrayList<>(); - cluster.stream().forEach(i -> actions.add(invoke("Startup " + i.broadcastAddress(), NO_TIMEOUTS, NO_TIMEOUTS, + cluster.stream().forEach(i -> actions.add(invoke("Startup " + i.broadcastAddress(), RELIABLE, RELIABLE, new InterceptedRunnableExecution((InterceptingExecutor) i.executor(), i::startup)))); List endpoints = cluster.stream().map(IInstance::broadcastAddress).collect(Collectors.toList()); diff --git a/test/simulator/main/org/apache/cassandra/simulator/cluster/KeyspaceActions.java b/test/simulator/main/org/apache/cassandra/simulator/cluster/KeyspaceActions.java index 5d32ab6eaa91..6ddf61109d34 100644 --- a/test/simulator/main/org/apache/cassandra/simulator/cluster/KeyspaceActions.java +++ b/test/simulator/main/org/apache/cassandra/simulator/cluster/KeyspaceActions.java @@ -44,6 +44,7 @@ import org.apache.cassandra.simulator.systems.InterceptedExecution; import org.apache.cassandra.simulator.systems.InterceptingExecutor; import org.apache.cassandra.simulator.systems.SimulatedSystems; +import org.apache.cassandra.simulator.utils.KindOfSequence; import org.apache.cassandra.tcm.ClusterMetadataService; import static java.util.Collections.singletonList; @@ -62,7 +63,8 @@ public class KeyspaceActions extends ClusterActions final ConsistencyLevel serialConsistency; final int[] primaryKeys; - final EnumSet ops = EnumSet.noneOf(TopologyChange.class); + final EnumSet topologyOps = EnumSet.noneOf(TopologyChange.class); + final EnumSet consensusOps = EnumSet.noneOf(ConsensusChange.class); final NodeLookup nodeLookup; final TokenPlacementModel.NodeFactory factory; final int[] minRf, initialRf, maxRf; @@ -77,7 +79,9 @@ public class KeyspaceActions extends ClusterActions final int[] currentRf; Topology topology; boolean haveChangedVariant; + boolean haveConsensusMigrated; int topologyChangeCount = 0; + int consensusChangeCount = 0; public KeyspaceActions(SimulatedSystems simulated, String keyspace, String table, String createTableCql, @@ -118,13 +122,13 @@ public KeyspaceActions(SimulatedSystems simulated, maxRf = options.maxRf; currentRf = initialRf.clone(); membersOfQuorumDcs = serialConsistency == LOCAL_SERIAL ? all.dcs[0] : all.toArray(); - ops.addAll(Arrays.asList(options.allChoices.options)); - + topologyOps.addAll(Arrays.asList(options.allChoices.options)); + consensusOps.addAll(Arrays.asList(options.consensusChoices.options)); } - public ActionPlan plan() + public ActionPlan plan(boolean joinAll) { - ActionList pre = ActionList.of(pre(createKeyspaceCql(keyspace), createTableCql)); + ActionList pre = ActionList.of(pre(createKeyspaceCql(keyspace), createTableCql, joinAll)); ActionList interleave = stream(); ActionList post = ActionList.empty(); return new ActionPlan(pre, singletonList(interleave), post); @@ -140,12 +144,13 @@ private String createKeyspaceCql(String keyspace) return createKeyspaceCql; } - private Action pre(String createKeyspaceCql, String createTableCql) + private Action pre(String createKeyspaceCql, String createTableCql, boolean joinAll) { + int[] joinPerDC = joinAll ? options.maxRf : options.initialRf; // randomise initial cluster, and return action to initialise it - for (int dc = 0 ; dc < options.initialRf.length ; ++dc) + for (int dc = 0 ; dc < joinPerDC.length ; ++dc) { - for (int i = 0 ; i < options.initialRf[dc] ; ++i) + for (int i = 0 ; i < joinPerDC[dc] ; ++i) { int join = registered.removeRandom(random, dc); joined.add(join); @@ -214,7 +219,7 @@ private Topology recomputeTopology(TokenPlacementModel.ReplicatedRanges readPlac for (int i = 0 ; i < primaryKeys.length ; ++i) { int primaryKey = primaryKeys[i]; - LongToken token = new Murmur3Partitioner().getToken(Int32Type.instance.decompose(primaryKey)); + LongToken token = Murmur3Partitioner.instance.getToken(Int32Type.instance.decompose(primaryKey)); List readReplicas = readPlacements.replicasFor(token.token); List writeReplicas = writePlacements.replicasFor(token.token); @@ -234,15 +239,53 @@ private Topology recomputeTopology(TokenPlacementModel.ReplicatedRanges readPlac private Action next() { - if (options.topologyChangeLimit >= 0 && topologyChangeCount++ > options.topologyChangeLimit) + Action nextTopologyChangeAction = nextTopologyChangeAction(); + if (nextTopologyChangeAction != null) + return nextTopologyChangeAction; + + Action nextConsensusChangeAction = nextConsensusChangeAction(); + if (nextConsensusChangeAction != null) + return nextConsensusChangeAction; + + if (options.changePaxosVariantTo != null && !haveChangedVariant) + { + haveChangedVariant = true; + return schedule(new OnClusterSetPaxosVariant(KeyspaceActions.this, options.changePaxosVariantTo), options.topologyChangeInterval); + } + + return null; + } + + private Action nextConsensusChangeAction() + { + if (options.consensusChangeLimit >= 0 && ++consensusChangeCount > options.consensusChangeLimit) + return null; + + while (!consensusOps.isEmpty() && !haveConsensusMigrated) + { + ConsensusChange nextChange = options.consensusChoices.choose(random); + switch (nextChange) + { + case ACCORD_MIGRATE: + haveConsensusMigrated = true; + return schedule(new OnClusterMigrateConsensus(this), options.topologyChangeInterval); + } + } + + return null; + } + + private Action nextTopologyChangeAction() + { + if (options.topologyChangeLimit >= 0 && ++topologyChangeCount > options.topologyChangeLimit) return null; - while (!ops.isEmpty() && (!registered.isEmpty() || joined.size() > sum(minRf))) + while (!topologyOps.isEmpty() && (!registered.isEmpty() || joined.size() > sum(minRf))) { if (options.changePaxosVariantTo != null && !haveChangedVariant && random.decide(1f / (1 + registered.size()))) { haveChangedVariant = true; - return schedule(new OnClusterSetPaxosVariant(KeyspaceActions.this, options.changePaxosVariantTo)); + return schedule(new OnClusterSetPaxosVariant(KeyspaceActions.this, options.changePaxosVariantTo), options.topologyChangeInterval); } // pick a dc @@ -251,8 +294,8 @@ private Action next() // try to pick an action (and simply loop again if we cannot for this dc) TopologyChange next; if (registered.size(dc) > 0 && joined.size(dc) > currentRf[dc]) next = options.allChoices.choose(random); - else if (registered.size(dc) > 0 && ops.contains(JOIN)) next = options.choicesNoLeave.choose(random); - else if (joined.size(dc) > currentRf[dc] && ops.contains(LEAVE)) next = options.choicesNoJoin.choose(random); + else if (registered.size(dc) > 0 && topologyOps.contains(JOIN)) next = options.choicesNoLeave.choose(random); + else if (joined.size(dc) > currentRf[dc] && topologyOps.contains(LEAVE)) next = options.choicesNoJoin.choose(random); else if (joined.size(dc) > minRf[dc]) next = CHANGE_RF; else continue; @@ -330,19 +373,12 @@ else if (random.decide(0.5f)) // can do either } } } - - if (options.changePaxosVariantTo != null && !haveChangedVariant) - { - haveChangedVariant = true; - return schedule(new OnClusterSetPaxosVariant(KeyspaceActions.this, options.changePaxosVariantTo)); - } - return null; } - private Action schedule(Action action) + private Action schedule(Action action, KindOfSequence.Period period) { - action.setDeadline(time, time.nanoTime() + options.topologyChangeInterval.get(random)); + action.setDeadline(time, time.nanoTime() + period.get(random)); return action; } @@ -364,7 +400,7 @@ public void transitivelyAfter(Action finished) time.permitDiscontinuities(); } }); - return schedule(action); + return schedule(action, options.topologyChangeInterval); } void updateTopology(Topology newTopology) diff --git a/test/simulator/main/org/apache/cassandra/simulator/cluster/OnClusterMigrateConsensus.java b/test/simulator/main/org/apache/cassandra/simulator/cluster/OnClusterMigrateConsensus.java new file mode 100644 index 000000000000..d4c4cb87d570 --- /dev/null +++ b/test/simulator/main/org/apache/cassandra/simulator/cluster/OnClusterMigrateConsensus.java @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.simulator.cluster; + +import java.util.AbstractMap.SimpleEntry; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; + +import org.apache.cassandra.dht.IPartitioner; +import org.apache.cassandra.dht.Range; +import org.apache.cassandra.dht.Token; +import org.apache.cassandra.harry.model.TokenPlacementModel; +import org.apache.cassandra.simulator.Action; +import org.apache.cassandra.simulator.ActionList; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.tcm.membership.NodeId; +import org.apache.cassandra.tcm.ownership.TokenMap; +import org.apache.cassandra.utils.Pair; + +import static com.google.common.base.Preconditions.checkState; +import static org.apache.cassandra.simulator.Action.Modifiers.NONE; + +class OnClusterMigrateConsensus extends Action +{ + private final KeyspaceActions actions; + + OnClusterMigrateConsensus(KeyspaceActions actions) + { + super("Performing consensus migration", NONE, NONE); + this.actions = actions; + } + + public ActionList performSimple() + { + List result = new ArrayList<>(); + List>> ranges = new ArrayList<>(); + ClusterMetadata cm = ClusterMetadata.current(); + TokenMap tm = cm.tokenMap; + IPartitioner partitioner = tm.partitioner(); + TokenPlacementModel.Lookup lookup = actions.factory.lookup(); + Map idToNodeId = new HashMap<>(); + for (int id : actions.all.toArray()) + idToNodeId.put(id, lookup.nodeId(id)); + + for (int ii = 0; ii < actions.all.size(); ii++) + { + int nodeIdx = ii + 1; + List tokens = tm.tokens(idToNodeId.get(nodeIdx)); + checkState(tokens.size() == 1, "Expect only 1, not handling vnodes tokenRanges " + tokens); + Token token = tokens.get(0); + Range tokenRange = new Range(tm.getPredecessor(token), token); + Range firstRange = new Range<>(tokenRange.left, partitioner.split(tokenRange.left, tokenRange.right, 0.33)); + Range secondRange = new Range<>(firstRange.right, partitioner.split(tokenRange.left, tokenRange.right, 0.66)); + Range thirdRange = new Range<>(secondRange.right, tokenRange.right); + ranges.add(Pair.create(nodeIdx, new SimpleEntry<>(firstRange.left.toString(), firstRange.right.toString()))); + ranges.add(Pair.create(nodeIdx, new SimpleEntry<>(secondRange.left.toString(), secondRange.right.toString()))); + ranges.add(Pair.create(nodeIdx, new SimpleEntry<>(thirdRange.left.toString(), thirdRange.right.toString()))); + } + + Collections.shuffle(ranges); + + System.out.println("Ranges to migrate " + ranges); + + ranges.stream().forEach(p -> result.add(new OnClusterMigrateConsensusOneRange(actions, p.left(), p.right()))); + return ActionList.of(result); + } +} diff --git a/test/simulator/main/org/apache/cassandra/simulator/cluster/OnClusterMigrateConsensusOneRange.java b/test/simulator/main/org/apache/cassandra/simulator/cluster/OnClusterMigrateConsensusOneRange.java new file mode 100644 index 000000000000..84a1d43c5091 --- /dev/null +++ b/test/simulator/main/org/apache/cassandra/simulator/cluster/OnClusterMigrateConsensusOneRange.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.simulator.cluster; + +import java.util.Map; + +import com.google.common.collect.ImmutableList; + +import org.apache.cassandra.simulator.Action; +import org.apache.cassandra.simulator.ActionList; + +import static org.apache.cassandra.simulator.Action.Modifiers.NONE; +import static org.apache.cassandra.simulator.Action.Modifiers.STRICT; + +class OnClusterMigrateConsensusOneRange extends Action +{ + private final KeyspaceActions actions; + private final int repairOn; + Map.Entry startMigrationRange; + + OnClusterMigrateConsensusOneRange(KeyspaceActions actions, int repairOn, Map.Entry startMigrationRange) + { + super("Performing consensus migration one range " + startMigrationRange, STRICT, NONE); + this.actions = actions; + this.repairOn = repairOn; + this.startMigrationRange = startMigrationRange; + } + + public ActionList performSimple() + { + return ActionList.of(new OnInstanceStartConsensusMigration(actions, 1, startMigrationRange ), + new OnClusterRepairRanges(actions, new int[] { repairOn }, true, false, ImmutableList.of(startMigrationRange))); + } +} diff --git a/test/simulator/main/org/apache/cassandra/simulator/cluster/OnInstanceRepair.java b/test/simulator/main/org/apache/cassandra/simulator/cluster/OnInstanceRepair.java index 46edfb392649..20283718a15b 100644 --- a/test/simulator/main/org/apache/cassandra/simulator/cluster/OnInstanceRepair.java +++ b/test/simulator/main/org/apache/cassandra/simulator/cluster/OnInstanceRepair.java @@ -97,7 +97,7 @@ private static void invokeRepair(String keyspaceName, boolean repairPaxos, boole { Collection> ranges = rangesSupplier.call(); // no need to wait for completion, as we track all task submissions and message exchanges, and ensure they finish before continuing to next action - StorageService.instance.repair(keyspaceName, new RepairOption(RepairParallelism.SEQUENTIAL, isPrimaryRangeOnly, false, false, 1, ranges, false, false, force, PreviewKind.NONE, false, true, repairPaxos, repairOnlyPaxos, false), singletonList((tag, event) -> { + StorageService.instance.repair(keyspaceName, new RepairOption(RepairParallelism.SEQUENTIAL, isPrimaryRangeOnly, false, false, 1, ranges, false, force, PreviewKind.NONE, false, true, !repairOnlyPaxos, repairPaxos, false, false), singletonList((tag, event) -> { if (event.getType() == ProgressEventType.COMPLETE) listener.run(); })); diff --git a/test/simulator/main/org/apache/cassandra/simulator/cluster/OnInstanceStartConsensusMigration.java b/test/simulator/main/org/apache/cassandra/simulator/cluster/OnInstanceStartConsensusMigration.java new file mode 100644 index 000000000000..e566529a46aa --- /dev/null +++ b/test/simulator/main/org/apache/cassandra/simulator/cluster/OnInstanceStartConsensusMigration.java @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.simulator.cluster; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; + +import org.apache.cassandra.distributed.api.IIsolatedExecutor; +import org.apache.cassandra.service.StorageService; + +import static org.apache.cassandra.simulator.Action.Modifiers.RELIABLE_NO_TIMEOUTS; + +class OnInstanceStartConsensusMigration extends ClusterAction +{ + + public OnInstanceStartConsensusMigration(KeyspaceActions actions, int on, Map.Entry startMigrationRange) + { + this(actions, on, RELIABLE_NO_TIMEOUTS, RELIABLE_NO_TIMEOUTS, startMigrationRange); + } + + public OnInstanceStartConsensusMigration(KeyspaceActions actions, int on, Modifiers self, Modifiers transitive, Map.Entry startMigrationRange) + { + super("Start consensus migration on " + on, self, transitive, actions, on, invokableBlockingStartConsensusMigration(actions.keyspace, actions.table, startMigrationRange)); + } + + private static IIsolatedExecutor.SerializableRunnable invokableBlockingStartConsensusMigration(String keyspaceName, String cfName, Map.Entry range) + { + return () -> { + List keyspaces = new ArrayList<>(); + keyspaces.add(keyspaceName); + List tables = new ArrayList<>(); + tables.add(cfName); + StorageService.instance.migrateConsensusProtocol(keyspaces, tables, range.getKey() + ":" + range.getValue()); + }; + } +} diff --git a/test/simulator/main/org/apache/cassandra/simulator/debug/Reconcile.java b/test/simulator/main/org/apache/cassandra/simulator/debug/Reconcile.java index 5face389acd8..cbbf85fc32a0 100644 --- a/test/simulator/main/org/apache/cassandra/simulator/debug/Reconcile.java +++ b/test/simulator/main/org/apache/cassandra/simulator/debug/Reconcile.java @@ -61,7 +61,7 @@ public class Reconcile private static final Pattern STRIP_TRACES = Pattern.compile("(Wakeup|Continue|Timeout|Waiting)\\[(((([a-zA-Z]\\.)*[a-zA-Z0-9_$]+\\.[a-zA-Z0-9_<>$]+:[\\-0-9]+; )*(([a-zA-Z]\\.)*[a-zA-Z0-9_$]+\\.[a-zA-Z0-9_<>$]+:[\\-0-9]+))( #\\[.*?]#)?) ?(by\\[.*?])?]"); private static final Pattern STRIP_NOW_TRACES = Pattern.compile("( #\\[.*?]#)"); private static final Pattern NORMALISE_THREAD_RECORDING_IN = Pattern.compile("(Thread\\[[^]]+:[0-9]+),?[0-9]+(,node[0-9]+)]"); - static final Pattern NORMALISE_LAMBDA = Pattern.compile("((\\$\\$Lambda\\$[0-9]+/[0-9]+)?(@[0-9a-f]+)?)"); + static final Pattern NORMALISE_LAMBDA = Pattern.compile("((\\$\\$Lambda\\$[0-9]+/(0x)?[a-f0-9]+)?(@[0-9a-f]+)?)"); static final Pattern NORMALISE_THREAD = Pattern.compile("(Thread\\[[^]]+:[0-9]+),[0-9](,node[0-9]+)(_[0-9]+)?]"); public static class AbstractReconciler diff --git a/test/simulator/main/org/apache/cassandra/simulator/debug/Record.java b/test/simulator/main/org/apache/cassandra/simulator/debug/Record.java index 8f449f18caff..54b335175359 100644 --- a/test/simulator/main/org/apache/cassandra/simulator/debug/Record.java +++ b/test/simulator/main/org/apache/cassandra/simulator/debug/Record.java @@ -46,6 +46,7 @@ import org.apache.cassandra.simulator.systems.SimulatedTime; import org.apache.cassandra.utils.Closeable; import org.apache.cassandra.utils.CloseableIterator; +import org.apache.cassandra.utils.FBUtilities; import org.apache.cassandra.utils.concurrent.Threads; import static org.apache.cassandra.io.util.File.WriteMode.OVERWRITE; @@ -58,7 +59,7 @@ public class Record { private static final Logger logger = LoggerFactory.getLogger(Record.class); private static final Pattern NORMALISE_THREAD_RECORDING_OUT = Pattern.compile("(Thread\\[[^]]+:[0-9]+),[0-9](,node[0-9]+)_[0-9]+]"); - private static final Pattern NORMALISE_LAMBDA = Pattern.compile("((\\$\\$Lambda\\$[0-9]+/[0-9]+)?(@[0-9a-f]+)?)"); + private static final Pattern NORMALISE_LAMBDA = Pattern.compile("((\\$\\$Lambda\\$[0-9]+/(0x)?[a-f0-9]+)?(@[0-9a-f]+)?)"); public static void record(String saveToDir, long seed, RecordOption withRng, RecordOption withTime, ClusterSimulation.Builder builder) { @@ -81,6 +82,7 @@ else if (withTime == VALUE) if (builder.capture().wakeSites) modifiers.add("WakeSites"); logger.error("Seed 0x{} ({}) (With: {})", Long.toHexString(seed), eventFile, modifiers); + logger.info("Cassandra {} / {}", FBUtilities.getReleaseVersionString(), FBUtilities.getGitSHA()); } try (PrintWriter eventOut = new PrintWriter(new GZIPOutputStream(eventFile.newOutputStream(OVERWRITE), 1 << 16)); diff --git a/test/simulator/main/org/apache/cassandra/simulator/debug/SelfReconcile.java b/test/simulator/main/org/apache/cassandra/simulator/debug/SelfReconcile.java index 390bb82a9a8c..e00924cd2783 100644 --- a/test/simulator/main/org/apache/cassandra/simulator/debug/SelfReconcile.java +++ b/test/simulator/main/org/apache/cassandra/simulator/debug/SelfReconcile.java @@ -43,6 +43,7 @@ import org.apache.cassandra.simulator.systems.InterceptorOfConsequences; import org.apache.cassandra.simulator.systems.SimulatedTime; import org.apache.cassandra.utils.CloseableIterator; +import org.apache.cassandra.utils.FBUtilities; import org.apache.cassandra.utils.Pair; import org.apache.cassandra.utils.concurrent.UncheckedInterruptedException; import org.apache.cassandra.utils.memory.HeapPool; @@ -124,18 +125,27 @@ synchronized void verify(Object event) if (events.size() == 1) { - int cur = counter; - while (cur == counter) + boolean restoreInterrupt = Thread.interrupted(); + try { - try - { - wait(); - } - catch (InterruptedException e) + int cur = counter; + while (cur == counter) { - throw new UncheckedInterruptedException(e); + try + { + wait(); + } + catch (InterruptedException e) + { + throw new UncheckedInterruptedException(e); + } } } + finally + { + if (restoreInterrupt) + Thread.currentThread().interrupt(); + } } else { @@ -239,6 +249,7 @@ public void accept(String kind, long value) public static void reconcileWithSelf(long seed, RecordOption withRng, RecordOption withTime, boolean withAllocations, ClusterSimulation.Builder builder) { logger.error("Seed 0x{}", Long.toHexString(seed)); + logger.info("Cassandra {} / {}", FBUtilities.getReleaseVersionString(), FBUtilities.getGitSHA()); InterceptReconciler reconciler = new InterceptReconciler(withRng == WITH_CALLSITES); if (withRng != NONE) builder.random(reconciler); @@ -311,5 +322,4 @@ private static String normalise(String input) ).replaceAll("$1$2]") ).replaceAll("$1]"); } - } diff --git a/test/simulator/main/org/apache/cassandra/simulator/logging/RunStartDefiner.java b/test/simulator/main/org/apache/cassandra/simulator/logging/RunStartDefiner.java new file mode 100644 index 000000000000..6d541eb7c345 --- /dev/null +++ b/test/simulator/main/org/apache/cassandra/simulator/logging/RunStartDefiner.java @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.simulator.logging; + +import accord.utils.Invariants; +import ch.qos.logback.core.PropertyDefinerBase; +import org.apache.cassandra.config.CassandraRelevantProperties; + +public class RunStartDefiner extends PropertyDefinerBase +{ + static + { + Invariants.require(CassandraRelevantProperties.SIMULATOR_STARTED.getString() != null); + } + + @Override + public String getPropertyValue() + { + return CassandraRelevantProperties.SIMULATOR_STARTED.getString(); + } +} diff --git a/test/simulator/main/org/apache/cassandra/simulator/logging/SeedDefiner.java b/test/simulator/main/org/apache/cassandra/simulator/logging/SeedDefiner.java new file mode 100644 index 000000000000..12d1ca8d2d8a --- /dev/null +++ b/test/simulator/main/org/apache/cassandra/simulator/logging/SeedDefiner.java @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.simulator.logging; + +import ch.qos.logback.core.PropertyDefinerBase; +import org.apache.cassandra.config.CassandraRelevantProperties; + +public class SeedDefiner extends PropertyDefinerBase +{ + public static void setSeed(long seed) + { + CassandraRelevantProperties.SIMULATOR_SEED.setString("0x" + Long.toHexString(seed)); + } + + @Override + public String getPropertyValue() + { + if (CassandraRelevantProperties.SIMULATOR_SEED.getString() == null) + { + System.err.println("SeedDefiner is being called before the seed has been set, check static init order"); + CassandraRelevantProperties.SIMULATOR_SEED.setString(""); + } + return CassandraRelevantProperties.SIMULATOR_SEED.getString(); + } +} diff --git a/test/simulator/main/org/apache/cassandra/simulator/paxos/AbstractPairOfSequencesPaxosSimulation.java b/test/simulator/main/org/apache/cassandra/simulator/paxos/AbstractPairOfSequencesPaxosSimulation.java new file mode 100644 index 000000000000..c0fec09c4342 --- /dev/null +++ b/test/simulator/main/org/apache/cassandra/simulator/paxos/AbstractPairOfSequencesPaxosSimulation.java @@ -0,0 +1,305 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.simulator.paxos; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.function.BiFunction; +import java.util.function.LongSupplier; +import java.util.function.Supplier; +import java.util.stream.Collectors; +import java.util.stream.IntStream; +import java.util.stream.Stream; + +import com.google.common.annotations.VisibleForTesting; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.db.marshal.Int32Type; +import org.apache.cassandra.db.marshal.ListType; +import org.apache.cassandra.distributed.Cluster; +import org.apache.cassandra.distributed.api.ConsistencyLevel; +import org.apache.cassandra.distributed.api.IInvokableInstance; +import org.apache.cassandra.distributed.api.IIsolatedExecutor; +import org.apache.cassandra.distributed.api.LogAction; +import org.apache.cassandra.distributed.api.LogResult; +import org.apache.cassandra.distributed.impl.Instance; +import org.apache.cassandra.distributed.shared.Metrics; +import org.apache.cassandra.simulator.Action; +import org.apache.cassandra.simulator.ActionList; +import org.apache.cassandra.simulator.ActionPlan; +import org.apache.cassandra.simulator.Actions; +import org.apache.cassandra.simulator.Debug; +import org.apache.cassandra.simulator.RandomSource; +import org.apache.cassandra.simulator.RunnableActionScheduler; +import org.apache.cassandra.simulator.cluster.ClusterActions; +import org.apache.cassandra.simulator.cluster.KeyspaceActions; +import org.apache.cassandra.simulator.systems.SimulatedActionTask; +import org.apache.cassandra.simulator.systems.SimulatedSystems; +import org.apache.cassandra.simulator.utils.IntRange; +import org.apache.cassandra.utils.Pair; + +import static java.util.Collections.singletonList; +import static java.util.concurrent.TimeUnit.SECONDS; +import static org.apache.cassandra.simulator.Action.Modifiers.RELIABLE; +import static org.apache.cassandra.simulator.Action.Modifiers.RELIABLE_NO_TIMEOUTS; +import static org.apache.cassandra.simulator.ActionSchedule.Mode.STREAM_LIMITED; +import static org.apache.cassandra.simulator.ActionSchedule.Mode.TIME_AND_STREAM_LIMITED; +import static org.apache.cassandra.simulator.ActionSchedule.Mode.TIME_LIMITED; + +@SuppressWarnings("unused") +abstract class AbstractPairOfSequencesPaxosSimulation extends PaxosSimulation +{ + private static final Logger logger = LoggerFactory.getLogger(AbstractPairOfSequencesPaxosSimulation.class); + + static final String KEYSPACE = "simple_paxos_simulation"; + static final String TABLE = "tbl"; + static final ListType LIST_TYPE = ListType.getInstance(Int32Type.instance, true); + + final ClusterActions.Options clusterOptions; + final float readRatio; + final IntRange withinKeyConcurrency; + final int concurrency; + final IntRange simulateKeyForSeconds; + final ConsistencyLevel serialConsistency; + final Debug debug; + final AtomicInteger successfulReads = new AtomicInteger(); + final AtomicInteger successfulWrites = new AtomicInteger(); + final AtomicInteger failedReads = new AtomicInteger(); + final AtomicInteger failedWrites = new AtomicInteger(); + final long seed; + final int[] primaryKeys; + + public AbstractPairOfSequencesPaxosSimulation(SimulatedSystems simulated, + Cluster cluster, + ClusterActions.Options clusterOptions, + float readRatio, + int concurrency, IntRange simulateKeyForSeconds, IntRange withinKeyConcurrency, + ConsistencyLevel serialConsistency, RunnableActionScheduler scheduler, Debug debug, + long seed, int[] primaryKeys, + long runForNanos, LongSupplier jitter) + { + super(runForNanos < 0 ? STREAM_LIMITED : (clusterOptions.topologyChangeLimit <= 0 && clusterOptions.consensusChangeLimit <= 0) ? TIME_LIMITED : TIME_AND_STREAM_LIMITED, + simulated, cluster, scheduler, runForNanos, jitter); + this.readRatio = readRatio; + this.concurrency = concurrency; + this.simulateKeyForSeconds = simulateKeyForSeconds; + this.withinKeyConcurrency = withinKeyConcurrency; + this.serialConsistency = serialConsistency; + this.clusterOptions = clusterOptions; + this.debug = debug; + this.seed = seed; + this.primaryKeys = primaryKeys.clone(); + Arrays.sort(this.primaryKeys); + } + + protected abstract String createTableStmt(); + + protected abstract String preInsertStmt(); + + abstract boolean joinAll(); + boolean allowMultiplePartitions() { return false; } + + abstract BiFunction> actionFactory(); + + protected Action checkErrorLogs(IInvokableInstance inst) + { + DatabaseDescriptor.clientInitialization(); + return new Action("Error logs for node" + inst.config().num(), Action.Modifiers.NONE) + { + @Override + protected ActionList performSimple() + { + LogAction logs = inst.logs(); + + checkErrorLogs(inst.config().num(), logs.grepForErrors()); + return ActionList.empty(); + } + }; + } + + @VisibleForTesting + protected void checkErrorLogs(int node, LogResult> errors) + { + if (!errors.getResult().isEmpty()) + { + List> errorsSeen = new ArrayList<>(); + for (String error : errors.getResult()) + { + for (String line : error.split("\\n")) + { + line = line.trim(); + if (line.startsWith("ERROR")) continue; + if (line.startsWith("WARN")) continue; + if (line.startsWith("at ")) continue; + errorsSeen.add(Pair.create(line.split(":")[0], error)); + break; + } + } + Class[] expected = expectedExceptions(); + StringBuilder sb = new StringBuilder(); + for (Pair pair : errorsSeen) + { + String name = pair.left; + String exception = pair.right; + Class klass; + try + { + klass = Class.forName(name); + } + catch (ClassNotFoundException e) + { + sb.append("Unexpected exception (could not parse line):\n").append(exception).append('\n'); + continue; + } + + if (!Stream.of(expected).anyMatch(e -> e.isAssignableFrom(klass))) + sb.append("Unexpected exception:\n").append(exception).append('\n'); + } + if (sb.length() > 0) + { + AssertionError error = new AssertionError("Saw errors in node" + node + ": " + sb); + // this stacktrace isn't helpful, can be more confusing + error.setStackTrace(new StackTraceElement[0]); + throw error; + } + } + } + + protected Metrics getMetrics(int coordinatorIndex) + { + return cluster.get(coordinatorIndex).metrics(); + } + + public ActionPlan plan() + { + ActionPlan plan = new KeyspaceActions(simulated, KEYSPACE, TABLE, createTableStmt(), cluster, + clusterOptions, serialConsistency, this, primaryKeys, debug).plan(joinAll()); + + plan = plan.encapsulate(ActionPlan.setUpTearDown( + ActionList.of( + cluster.stream().map(i -> simulated.run("Insert Partitions", i, executeForPrimaryKeys(preInsertStmt(), primaryKeys))) + ), + ActionList.of( + cluster.stream().map(i -> checkErrorLogs(i)), + cluster.stream().map(i -> SimulatedActionTask.unsafeTask("Shutdown " + i.broadcastAddress(), RELIABLE, RELIABLE_NO_TIMEOUTS, simulated, i, i::shutdown)) + ) + )); + + BiFunction> factory = actionFactory(); + + List available = IntStream.range(0, primaryKeys.length).boxed().collect(Collectors.toList()); + Action stream = Actions.infiniteStream(concurrency, new Supplier() { + @Override + public Action get() + { + int[] primaryKeyIndex = consume(simulated.random, available); + if (primaryKeyIndex == null) + return Actions.empty("All primary keys are taken, try again later"); + long untilNanos = simulated.time.nanoTime() + SECONDS.toNanos(simulateKeyForSeconds.select(simulated.random)); + int concurrency = withinKeyConcurrency.select(simulated.random); + Supplier supplier = factory.apply(simulated, primaryKeyIndex); + // while this stream is finite, it participates in an infinite stream via its parent, so we want to permit termination while it's running + return Actions.infiniteStream(concurrency, new Supplier() + { + @Override + public Action get() + { + if (simulated.time.nanoTime() >= untilNanos) + { + IntStream.of(primaryKeyIndex).boxed().forEach(available::add); + return null; + } + return supplier.get(); + } + + @Override + public String toString() + { + return supplier.toString(); + } + }); + } + + @Override + public String toString() + { + return "Primary Key Actions"; + } + }); + + return simulated.execution.plan() + .encapsulate(plan) + .encapsulate(ActionPlan.interleave(singletonList(ActionList.of(stream)))); + } + + private int[] consume(RandomSource random, List available) + { + if (available.isEmpty()) + return null; + int numPartitions = available.size() == 1 || !allowMultiplePartitions() ? 1 : random.uniform(1, available.size()); + int[] partitions = new int[numPartitions]; + for (int counter = 0; counter < numPartitions; counter++) + { + int idx = random.uniform(0, available.size()); + int next = available.get(idx); + int last = available.get(available.size() - 1); + if (available.set(idx, last) != next) + throw new IllegalStateException("Expected to set " + last + " index " + idx + " but did not return " + next); + int removed = available.remove(available.size() - 1); + if (last != removed) + throw new IllegalStateException("Expected to remove " + last + " but removed " + removed); + + partitions[counter] = next; + } + Arrays.sort(partitions); + return partitions; + } + + IIsolatedExecutor.SerializableRunnable executeForPrimaryKeys(String cql, int[] primaryKeys) + { + return () -> { + for (int primaryKey : primaryKeys) + Instance.unsafeExecuteInternalWithResult(cql, primaryKey); + }; + } + + @Override + public TopologyChangeValidator newTopologyChangeValidator(Object id) + { + return new PaxosTopologyChangeVerifier(cluster, KEYSPACE, TABLE, id); + } + + @Override + public RepairValidator newRepairValidator(Object id) + { + return new PaxosRepairValidator(cluster, KEYSPACE, TABLE, id); + } + + @Override + public void run() + { + super.run(); + logger.warn("Writes: {} successful, {} failed", successfulWrites, failedWrites); + logger.warn("Reads: {} successful {} failed", successfulReads, failedReads); + } +} diff --git a/test/simulator/main/org/apache/cassandra/simulator/paxos/AccordClusterSimulation.java b/test/simulator/main/org/apache/cassandra/simulator/paxos/AccordClusterSimulation.java new file mode 100644 index 000000000000..24e038b3b3ff --- /dev/null +++ b/test/simulator/main/org/apache/cassandra/simulator/paxos/AccordClusterSimulation.java @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.simulator.paxos; + +import java.io.IOException; + +import org.apache.cassandra.simulator.ClusterSimulation; +import org.apache.cassandra.simulator.RandomSource; +import org.apache.cassandra.simulator.utils.KindOfSequence; + +import static java.util.concurrent.TimeUnit.SECONDS; +import static org.apache.cassandra.distributed.api.ConsistencyLevel.SERIAL; + +class AccordClusterSimulation extends ClusterSimulation implements AutoCloseable +{ + @SuppressWarnings("UnusedReturnValue") + static class Builder extends ClusterSimulation.Builder + { + public AccordClusterSimulation create(long seed) throws IOException + { + RandomSource random = randomSupplier.get(); + random.reset(seed); + return new AccordClusterSimulation(random, seed, uniqueNum, this); + } + } + + AccordClusterSimulation(RandomSource random, long seed, int uniqueNum, Builder builder) throws IOException + { + super(random, seed, uniqueNum, builder, + config -> config.set("storage_compatibility_mode", "NONE"), + (simulated, schedulers, cluster, options) -> { + int[] primaryKeys = primaryKeys(seed, builder.primaryKeyCount()); + KindOfSequence.Period jitter = RandomSource.Choices.uniform(KindOfSequence.values()).choose(random) + .period(builder.schedulerJitterNanos(), random); + return new PairOfSequencesAccordSimulation(simulated, cluster, options, builder.transactionalMode(), + builder.readChance().select(random), builder.concurrency(), builder.primaryKeySeconds(), builder.withinKeyConcurrency(), + SERIAL, schedulers, builder.debug(), seed, + primaryKeys, builder.secondsToSimulate() >= 0 ? SECONDS.toNanos(builder.secondsToSimulate()) : -1, + () -> jitter.get(random)); + }); + } + + private static int[] primaryKeys(long seed, int count) + { + int primaryKey = (int) (seed); + int[] primaryKeys = new int[count]; + for (int i = 0 ; i < primaryKeys.length ; ++i) + primaryKeys[i] = primaryKey += 1 << 20; + return primaryKeys; + } +} diff --git a/test/simulator/main/org/apache/cassandra/simulator/paxos/AccordSimulationRunner.java b/test/simulator/main/org/apache/cassandra/simulator/paxos/AccordSimulationRunner.java new file mode 100644 index 000000000000..e782848ea494 --- /dev/null +++ b/test/simulator/main/org/apache/cassandra/simulator/paxos/AccordSimulationRunner.java @@ -0,0 +1,109 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.simulator.paxos; + +import java.io.IOException; +import java.util.concurrent.atomic.AtomicInteger; + +import org.junit.BeforeClass; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import io.airlift.airline.Cli; +import io.airlift.airline.Command; +import org.apache.cassandra.config.CassandraRelevantProperties; +import org.apache.cassandra.simulator.SimulationRunner; +import org.apache.cassandra.simulator.SimulatorUtils; +import org.apache.cassandra.utils.StorageCompatibilityMode; + +public class AccordSimulationRunner extends SimulationRunner +{ + private static Logger logger = LoggerFactory.getLogger(AccordSimulationRunner.class); + + @BeforeClass + public static void beforeAll() + { + CassandraRelevantProperties.JUNIT_STORAGE_COMPATIBILITY_MODE.setString(StorageCompatibilityMode.NONE.toString()); + } + + @Command(name = "run") + public static class Run extends SimulationRunner.Run + { + public Run() {} + + @Override + protected void run(long seed, AccordClusterSimulation.Builder builder) throws IOException + { + beforeAll(); + super.run(seed, builder); + } + } + + @Command(name = "record") + public static class Record extends SimulationRunner.Record + { + public Record() {} + + @Override + protected void run(long seed, AccordClusterSimulation.Builder builder) throws IOException + { + beforeAll(); + super.run(seed, builder); + } + } + + @Command(name = "reconcile") + public static class Reconcile extends SimulationRunner.Reconcile + { + public Reconcile() {} + + @Override + protected void run(long seed, AccordClusterSimulation.Builder builder) throws IOException + { + beforeAll(); + super.run(seed, builder); + } + } + + public static class Help extends HelpCommand {} + + // for simple unit tests so we can simply invoke main() + private static final AtomicInteger uniqueNum = new AtomicInteger(); + + /** + * See {@link org.apache.cassandra.simulator} package info for execution tips + */ + public static void main(String[] args) throws IOException + { + SimulatorUtils.verifyAndlogSimulatorArgs(logger, args); + AccordClusterSimulation.Builder builder = new AccordClusterSimulation.Builder(); + builder.unique(uniqueNum.getAndIncrement()); + + Cli.>builder("accord") + .withCommand(Run.class) + .withCommand(Reconcile.class) + .withCommand(Record.class) + .withCommand(Help.class) + .withDefaultCommand(Help.class) + .build() + .parse(args) + .run(builder); + } +} diff --git a/test/simulator/main/org/apache/cassandra/simulator/paxos/Ballots.java b/test/simulator/main/org/apache/cassandra/simulator/paxos/Ballots.java index c57f49341fbc..90cc76a475fa 100644 --- a/test/simulator/main/org/apache/cassandra/simulator/paxos/Ballots.java +++ b/test/simulator/main/org/apache/cassandra/simulator/paxos/Ballots.java @@ -27,12 +27,16 @@ import org.apache.cassandra.db.ColumnFamilyStore; import org.apache.cassandra.db.DecoratedKey; import org.apache.cassandra.db.Keyspace; +import org.apache.cassandra.db.ReadCommand.PotentialTxnConflicts; import org.apache.cassandra.db.ReadExecutionController; import org.apache.cassandra.db.SinglePartitionReadCommand; import org.apache.cassandra.db.Slice; import org.apache.cassandra.db.Slices; import org.apache.cassandra.db.SystemKeyspace; +import org.apache.cassandra.db.filter.ClusteringIndexSliceFilter; import org.apache.cassandra.db.filter.ColumnFilter; +import org.apache.cassandra.db.filter.DataLimits; +import org.apache.cassandra.db.filter.RowFilter; import org.apache.cassandra.db.marshal.Int32Type; import org.apache.cassandra.db.marshal.TimeUUIDType; import org.apache.cassandra.db.memtable.Memtable; @@ -159,7 +163,7 @@ public static String paxosDebugInfo(DecoratedKey key, TableMetadata metadata, lo private static ColumnMetadata paxosUUIDColumn(String name) { - return ColumnMetadata.regularColumn(SchemaConstants.SYSTEM_KEYSPACE_NAME, SystemKeyspace.PAXOS, name, TimeUUIDType.instance); + return ColumnMetadata.regularColumn(SchemaConstants.SYSTEM_KEYSPACE_NAME, SystemKeyspace.PAXOS, name, TimeUUIDType.instance, ColumnMetadata.NO_UNIQUE_ID); } /** @@ -200,7 +204,8 @@ private static Row getRow(DecoratedKey key, TableMetadata metadata, ColumnFamily public static long latestBallotFromBaseTable(DecoratedKey key, TableMetadata metadata) { - SinglePartitionReadCommand cmd = SinglePartitionReadCommand.create(metadata, 0, key, Slice.ALL); + ClusteringIndexSliceFilter filter = new ClusteringIndexSliceFilter(Slices.with(metadata.comparator, Slice.ALL), false); + SinglePartitionReadCommand cmd = SinglePartitionReadCommand.create(metadata, 0, ColumnFilter.all(metadata), RowFilter.none(), DataLimits.NONE, key, filter, PotentialTxnConflicts.ALLOW); try (ReadExecutionController controller = cmd.executionController(); UnfilteredPartitionIterator partitions = cmd.executeLocally(controller)) { if (!partitions.hasNext()) diff --git a/test/simulator/main/org/apache/cassandra/simulator/paxos/HistoryChecker.java b/test/simulator/main/org/apache/cassandra/simulator/paxos/HistoryChecker.java index d1e0771b1ecc..2465bf62cf99 100644 --- a/test/simulator/main/org/apache/cassandra/simulator/paxos/HistoryChecker.java +++ b/test/simulator/main/org/apache/cassandra/simulator/paxos/HistoryChecker.java @@ -127,13 +127,23 @@ Event setById(int id, Event event) return byId[id] = event; } - void witness(Observation witness, int[] witnessSequence, int start, int end) + private static int eventId(int[] witnessSequence, int eventPosition) + { + return eventPosition == 0 ? -1 : witnessSequence[eventPosition - 1]; + } + + void witness(Observation witness, int[] witnessSequence) + { + witness(witness.id, witnessSequence, witness.start, witness.end); + } + + void witness(int id, int[] witnessSequence, int start, int end) { int eventPosition = witnessSequence.length; - int eventId = eventPosition == 0 ? -1 : witnessSequence[eventPosition - 1]; - setById(witness.id, new Event(witness.id)).log.add(new VerboseWitness(witness.id, start, end, witnessSequence)); + int eventId = eventId(witnessSequence, eventPosition); + setById(id, new Event(id)).log.add(new VerboseWitness(id, start, end, witnessSequence)); Event event = get(eventPosition, eventId); - recordWitness(event, witness, witnessSequence); + recordWitness(event, id, start, end, witnessSequence); recordVisibleBy(event, end); recordVisibleUntil(event, start); @@ -154,7 +164,7 @@ void witness(Observation witness, int[] witnessSequence, int start, int end) } else if (e.result) { - throw fail(primaryKey, "%d witnessed as absent by %d", e.eventId, witness.id); + throw fail(primaryKey, "%d witnessed as absent by %d", e.eventId, id); } } } @@ -181,16 +191,16 @@ void applied(int eventId, int start, int end, boolean success) } } - void recordWitness(Event event, Observation witness, int[] witnessSequence) + void recordWitness(Event event, int id, int start, int end, int[] witnessSequence) { - recordWitness(event, witness, witnessSequence.length, witnessSequence); + recordWitness(event, id, start, end, witnessSequence.length, witnessSequence); } - void recordWitness(Event event, Observation witness, int eventPosition, int[] witnessSequence) + void recordWitness(Event event, int id, int start, int end, int eventPosition, int[] witnessSequence) { while (true) { - event.log.add(new Witness(READ, witness.id, witness.start, witness.end)); + event.log.add(new Witness(READ, id, start, end)); if (event.witnessSequence != null) { if (!Arrays.equals(event.witnessSequence, witnessSequence)) @@ -238,7 +248,7 @@ void recordVisibleUntil(Event event, int visibleUntil) event.visibleUntil = visibleUntil; Event next = next(event); if (next != null && visibleUntil >= next.visibleBy) - throw fail(primaryKey, "%s %d not witnessed >= %d, but also witnessed <= %d", next.witnessSequence, next.eventId, event.visibleUntil, next.visibleBy); + throw fail(primaryKey, "%s+%d not witnessed >= %d, but also witnessed <= %d", next.witnessSequence, next.eventId, event.visibleUntil, next.visibleBy); } } @@ -295,7 +305,7 @@ Event prev(Event event) return null; // initialise the event, if necessary importing information from byId - return get(eventPosition, eventPosition == 0 ? -1 : event.witnessSequence[eventPosition - 1]); + return get(eventPosition, eventId(event.witnessSequence, eventPosition)); } Event next(Event event) diff --git a/test/simulator/main/org/apache/cassandra/simulator/paxos/HistoryValidator.java b/test/simulator/main/org/apache/cassandra/simulator/paxos/HistoryValidator.java new file mode 100644 index 000000000000..282b16d3b1af --- /dev/null +++ b/test/simulator/main/org/apache/cassandra/simulator/paxos/HistoryValidator.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.simulator.paxos; + +import javax.annotation.Nullable; + +public interface HistoryValidator +{ + Checker witness(int start, int end); + + void print(@Nullable Integer pk); + + interface Checker extends AutoCloseable + { + void read(int pk, int id, int count, int[] seq); + void write(int pk, int id, boolean success); + + default void writeSuccess(int pk, int id) + { + write(pk, id, true); + } + + default void writeUnknownFailure(int pk, int id) + { + write(pk, id, false); + } + + @Override + default void close() {} + } + + interface Factory + { + HistoryValidator create(int[] partitions); + } +} diff --git a/test/simulator/main/org/apache/cassandra/simulator/paxos/LinearizabilityValidator.java b/test/simulator/main/org/apache/cassandra/simulator/paxos/LinearizabilityValidator.java new file mode 100644 index 000000000000..67c95a7378fe --- /dev/null +++ b/test/simulator/main/org/apache/cassandra/simulator/paxos/LinearizabilityValidator.java @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.simulator.paxos; + +import java.util.function.Consumer; +import javax.annotation.Nullable; + +import com.carrotsearch.hppc.IntObjectHashMap; +import com.carrotsearch.hppc.IntObjectMap; +import com.carrotsearch.hppc.cursors.ObjectCursor; + +public class LinearizabilityValidator implements HistoryValidator +{ + private final IntObjectMap historyCheckers; + + public LinearizabilityValidator(int[] primaryKeys) + { + historyCheckers = new IntObjectHashMap<>(primaryKeys.length); + for (int primaryKey : primaryKeys) + historyCheckers.put(primaryKey, new HistoryChecker(primaryKey)); + } + + @Override + public Checker witness(int start, int end) + { + return new Checker() + { + @Override + public void read(int pk, int id, int count, int[] seq) + { + get(pk).witness(id, seq, start, end); + } + + @Override + public void write(int pk, int id, boolean success) + { + get(pk).applied(id, start, end, success); + } + }; + } + + @Override + public void print(@Nullable Integer pk) + { + if (pk == null) historyCheckers.values().forEach((Consumer>) c -> c.value.print()); + else historyCheckers.get(pk).print(); + } + + private HistoryChecker get(int pk) + { + HistoryChecker checker = historyCheckers.get(pk); + if (checker == null) + throw new NullPointerException("Unable to find checker for pk=" + pk); + return checker; + } + + public static class Factory implements HistoryValidator.Factory + { + public static final Factory instance = new Factory(); + + @Override + public HistoryValidator create(int[] partitions) + { + return new LinearizabilityValidator(partitions); + } + } +} diff --git a/test/simulator/main/org/apache/cassandra/simulator/paxos/LoggingHistoryValidator.java b/test/simulator/main/org/apache/cassandra/simulator/paxos/LoggingHistoryValidator.java new file mode 100644 index 000000000000..b39c3111ea66 --- /dev/null +++ b/test/simulator/main/org/apache/cassandra/simulator/paxos/LoggingHistoryValidator.java @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.simulator.paxos; + +import java.util.Arrays; +import javax.annotation.Nullable; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class LoggingHistoryValidator implements HistoryValidator +{ + private static final Logger logger = LoggerFactory.getLogger(LoggingHistoryValidator.class); + private final HistoryValidator delegate; + + public LoggingHistoryValidator(HistoryValidator delegate) + { + this.delegate = delegate; + } + + @Override + public Checker witness(int start, int end) + { + StringBuilder sb = new StringBuilder(); + sb.append("Witness(start=").append(start).append(", end=").append(end).append(")\n"); + Checker sub = delegate.witness(start, end); + return new Checker() + { + @Override + public void read(int pk, int id, int count, int[] seq) + { + sb.append("\tread(pk=").append(pk).append(", id=").append(id).append(", count=").append(count).append(", seq=").append(Arrays.toString(seq)).append(")\n"); + sub.read(pk, id, count, seq); + } + + @Override + public void write(int pk, int id, boolean success) + { + sb.append("\twrite(pk=").append(pk).append(", id=").append(id).append(", success=").append(success).append(")\n"); + sub.write(pk, id, success); + } + + @Override + public void close() + { + logger.info(sb.toString()); + sub.close(); + } + }; + } + + @Override + public void print(@Nullable Integer pk) + { + delegate.print(pk); + } +} diff --git a/test/simulator/main/org/apache/cassandra/simulator/paxos/Observation.java b/test/simulator/main/org/apache/cassandra/simulator/paxos/Observation.java index 546fd3179fc3..41eb2c348dcc 100644 --- a/test/simulator/main/org/apache/cassandra/simulator/paxos/Observation.java +++ b/test/simulator/main/org/apache/cassandra/simulator/paxos/Observation.java @@ -18,14 +18,16 @@ package org.apache.cassandra.simulator.paxos; +import org.apache.cassandra.distributed.api.SimpleQueryResult; + class Observation implements Comparable { final int id; - final Object[][] result; + final SimpleQueryResult result; final int start; final int end; - Observation(int id, Object[][] result, int start, int end) + Observation(int id, SimpleQueryResult result, int start, int end) { this.id = id; this.result = result; @@ -33,6 +35,16 @@ class Observation implements Comparable this.end = end; } + boolean isSuccess() + { + return result != null; + } + + boolean isUnknownFailure() + { + return result == null; + } + // computes a PARTIAL ORDER on when the outcome occurred, i.e. for many pair-wise comparisons the answer is 0 public int compareTo(Observation that) { diff --git a/test/simulator/main/org/apache/cassandra/simulator/paxos/PairOfSequencesAccordSimulation.java b/test/simulator/main/org/apache/cassandra/simulator/paxos/PairOfSequencesAccordSimulation.java new file mode 100644 index 000000000000..3bb911b41d70 --- /dev/null +++ b/test/simulator/main/org/apache/cassandra/simulator/paxos/PairOfSequencesAccordSimulation.java @@ -0,0 +1,318 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.simulator.paxos; + +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.function.BiFunction; +import java.util.function.LongSupplier; +import java.util.function.Supplier; +import java.util.stream.Collectors; +import java.util.stream.IntStream; +import javax.annotation.Nullable; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.carrotsearch.hppc.IntArrayList; +import com.carrotsearch.hppc.IntHashSet; +import com.carrotsearch.hppc.cursors.IntCursor; +import org.apache.cassandra.config.CassandraRelevantProperties; +import org.apache.cassandra.cql3.ColumnIdentifier; +import org.apache.cassandra.db.Clustering; +import org.apache.cassandra.db.rows.Cell; +import org.apache.cassandra.db.rows.ComplexColumnData; +import org.apache.cassandra.db.rows.Row; +import org.apache.cassandra.distributed.Cluster; +import org.apache.cassandra.distributed.api.ConsistencyLevel; +import org.apache.cassandra.distributed.api.IInvokableInstance; +import org.apache.cassandra.distributed.api.QueryResults; +import org.apache.cassandra.distributed.api.SimpleQueryResult; +import org.apache.cassandra.distributed.impl.Query; +import org.apache.cassandra.schema.ColumnMetadata; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.service.consensus.TransactionalMode; +import org.apache.cassandra.simulator.Action; +import org.apache.cassandra.simulator.Debug; +import org.apache.cassandra.simulator.RunnableActionScheduler; +import org.apache.cassandra.simulator.cluster.ClusterActions; +import org.apache.cassandra.simulator.systems.SimulatedSystems; +import org.apache.cassandra.simulator.utils.IntRange; + +import static org.apache.cassandra.simulator.paxos.HistoryChecker.fail; + +// TODO: the class hierarchy is a bit broken, but hard to untangle. Need to go Paxos->Consensus, probably. +@SuppressWarnings("unused") +public class PairOfSequencesAccordSimulation extends AbstractPairOfSequencesPaxosSimulation +{ + private static final Logger logger = LoggerFactory.getLogger(PairOfSequencesAccordSimulation.class); + private static final String SELECT = "SELECT pk, count, seq FROM " + KEYSPACE + ".tbl WHERE pk IN (%s);"; + private static final String UPDATE = "UPDATE " + KEYSPACE + ".tbl SET count += 1, seq = seq + ? WHERE pk = ?;"; + + private static void append(TableMetadata metadata, ByteBuffer[] keyComponents, Row row, QueryResults.Builder builder, String[] columnNames) + { + Object[] buffer = new Object[columnNames.length]; + Clustering clustering = row.clustering(); + int idx = 0; + for (String columnName : columnNames) + { + ColumnMetadata column = metadata.getColumn(new ColumnIdentifier(columnName, true)); + switch (column.kind) + { + case PARTITION_KEY: + buffer[idx++] = column.type.compose(keyComponents[column.position()]); + break; + case CLUSTERING: + buffer[idx++] = column.type.compose(clustering.bufferAt(column.position())); + break; + case REGULAR: + { + if (column.isComplex()) + { + ComplexColumnData data = row.getComplexColumnData(column); + if (data == null) + { + buffer[idx++] = new ArrayList<>(); + } + else + { + List result = new ArrayList<>(data.cellsCount()); + for (Cell cell : data) + result.add(column.cellValueType().compose(cell.buffer())); + buffer[idx++] = result; + } + } + else + { + //TODO deletes + buffer[idx++] = column.type.compose(row.getCell(column).buffer()); + } + } + break; +// case STATIC: + default: + throw new IllegalArgumentException("Unsupported kind: " + column.kind); + } + } + builder.row(buffer); + } + + @Override + void log(@Nullable Integer pk) + { + validator.print(pk); + } + + private final float writeRatio; + private final HistoryValidator validator; + private final TransactionalMode transactionalMode; + + public PairOfSequencesAccordSimulation(SimulatedSystems simulated, + Cluster cluster, + ClusterActions.Options clusterOptions, + TransactionalMode transactionalMode, + float readRatio, + int concurrency, IntRange simulateKeyForSeconds, IntRange withinKeyConcurrency, + ConsistencyLevel serialConsistency, RunnableActionScheduler scheduler, Debug debug, + long seed, int[] primaryKeys, + long runForNanos, LongSupplier jitter) + { + super(simulated, cluster, clusterOptions, + readRatio, concurrency, simulateKeyForSeconds, withinKeyConcurrency, + serialConsistency, + scheduler, debug, + seed, primaryKeys, + runForNanos, jitter); + this.transactionalMode = transactionalMode; + this.writeRatio = 1F - readRatio; + HistoryValidator validator = new StrictSerializabilityValidator(primaryKeys); + if (CassandraRelevantProperties.TEST_HISTORY_VALIDATOR_LOGGING_ENABLED.getBoolean()) + validator = new LoggingHistoryValidator(validator); + this.validator = validator; + } + + @Override + protected Class[] expectedExceptions() + { + return expectedExceptionsAccord(); + } + + @Override + protected String createTableStmt() + { + return String.format("CREATE TABLE " + KEYSPACE + ".tbl (pk int, count int, seq text, PRIMARY KEY (pk)) WITH transactional_mode = '%s'", transactionalMode); + } + + @Override + protected String preInsertStmt() + { + return "INSERT INTO " + KEYSPACE + ".tbl (pk, count, seq) VALUES (?, 0, '') USING TIMESTAMP 0"; + } + + @Override + boolean allowMultiplePartitions() { return true; } + + @Override + BiFunction> actionFactory() + { + AtomicInteger id = new AtomicInteger(0); + + return (simulated, primaryKeyIndex) -> { + int[] primaryKeys = IntStream.of(primaryKeyIndex).map(i -> this.primaryKeys[i]).toArray(); + return () -> accordAction(id.getAndIncrement(), simulated, primaryKeys); + }; + } + + public class ReadWriteOperation extends Operation + { + private final IntHashSet reads, writes; + + public ReadWriteOperation(int id, int[] primaryKeys, IntHashSet reads, IntHashSet writes, IInvokableInstance instance) + { + super(primaryKeys, id, instance, "Accord ReadWrite Txn", createQuery(id, reads, writes)); + this.reads = reads; + this.writes = writes; + } + + @Override + void verify(Observation outcome) + { + SimpleQueryResult result = outcome.result; + (result != null ? successfulWrites : failedWrites).incrementAndGet(); + if (result != null) + { + IntHashSet seen = new IntHashSet(); + //TODO if there isn't a value then we get empty read, which then doesn't make it into the QueryResult + // given the fact that we always run with the partitions defined this should be fine + try (HistoryValidator.Checker checker = validator.witness(outcome.start, outcome.end)) + { + while (result.hasNext()) + { + org.apache.cassandra.distributed.api.Row row = result.next(); + + int pk = row.getInteger("pk"); + int count = row.getInteger("count", 0); + int[] seq = Arrays.stream(row.getString("seq", "").split(",")) + .filter(s -> !s.isEmpty()) + .mapToInt(Integer::parseInt) + .toArray(); + + if (!seen.add(pk)) + throw new IllegalStateException("Duplicate partition key " + pk); + // every partition was read, but not all were written to... need to verify each partition + if (seq.length != count) + throw fail(pk, "%d != #%s", count, seq); + + checker.read(pk, outcome.id, count, seq); + } + if (!seen.equals(reads)) + throw fail(0, "#result had %s partitions, but should have had %s", seen, reads); + // handle writes + for (IntCursor c : writes) + checker.write(c.value, outcome.id, outcome.isSuccess()); + } + } + } + } + + private Action accordAction(int id, SimulatedSystems simulated, int[] partitions) + { + IntArrayList reads = new IntArrayList(); + IntArrayList writes = new IntArrayList(); + for (int partition : partitions) + { + boolean added = false; + if (simulated.random.decide(readRatio)) + { + reads.add(partition); + added = true; + } + if (simulated.random.decide(writeRatio)) + { + writes.add(partition); + added = true; + } + if (!added) + { + // when read ratio fails that implies write + // when write ratio fails that implies read + // so make that case a read/write + // Its possible that both cases were true leading to a read/write; which is fine + // this just makes sure every partition is consumed. + reads.add(partition); + writes.add(partition); + } + } + + int node = simulated.random.uniform(1, cluster.size() + 1); + IInvokableInstance instance = cluster.get(node); + return new ReadWriteOperation(id, partitions, new IntHashSet(reads), new IntHashSet(writes), instance); + } + + private int[] genReadOnly(SimulatedSystems simulated, int[] partitions) + { + IntArrayList readOnly = new IntArrayList(); + for (int partition : partitions) + { + if (simulated.random.decide(readRatio)) + readOnly.add(partition); + } + return readOnly.toArray(); + } + + private static Query createQuery(int id, IntHashSet reads, IntHashSet writes) + { + if (reads.isEmpty() && writes.isEmpty()) + throw new IllegalArgumentException("Partitions are empty"); + List binds = new ArrayList<>(); + StringBuilder sb = new StringBuilder(); + sb.append("BEGIN TRANSACTION\n"); + if (!reads.isEmpty()) + { + + sb.append("\t") + .append(String.format(SELECT, String.join(", ", IntStream.of(reads.toArray()) + .mapToObj(i -> { + binds.add(i); + return "?"; + }) + .collect(Collectors.joining(", "))))) + .append('\n'); + } + + for (IntCursor c : writes) + { + sb.append('\t').append(UPDATE).append("\n"); + binds.add(id + ","); + binds.add(c.value); + } + + sb.append("COMMIT TRANSACTION"); + return new Query(sb.toString(), 0, ConsistencyLevel.ANY, ConsistencyLevel.ANY, binds.toArray(new Object[0])); + } + + @Override + boolean joinAll() + { + return true; + } +} diff --git a/test/simulator/main/org/apache/cassandra/simulator/paxos/PairOfSequencesPaxosSimulation.java b/test/simulator/main/org/apache/cassandra/simulator/paxos/PairOfSequencesPaxosSimulation.java index 77eefb337bda..fbda89645102 100644 --- a/test/simulator/main/org/apache/cassandra/simulator/paxos/PairOfSequencesPaxosSimulation.java +++ b/test/simulator/main/org/apache/cassandra/simulator/paxos/PairOfSequencesPaxosSimulation.java @@ -21,34 +21,27 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.List; -import java.util.concurrent.atomic.AtomicInteger; +import java.util.function.BiFunction; import java.util.function.LongSupplier; import java.util.function.Supplier; -import java.util.stream.Collectors; -import java.util.stream.IntStream; - import javax.annotation.Nullable; +import org.apache.commons.lang3.ArrayUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.cassandra.db.marshal.Int32Type; -import org.apache.cassandra.db.marshal.ListType; import org.apache.cassandra.distributed.Cluster; import org.apache.cassandra.distributed.api.ConsistencyLevel; import org.apache.cassandra.distributed.api.IInvokableInstance; -import org.apache.cassandra.distributed.api.IIsolatedExecutor; -import org.apache.cassandra.distributed.impl.Instance; +import org.apache.cassandra.distributed.api.Row; +import org.apache.cassandra.distributed.api.SimpleQueryResult; +import org.apache.cassandra.distributed.impl.Query; +import org.apache.cassandra.service.consensus.TransactionalMode; import org.apache.cassandra.simulator.Action; -import org.apache.cassandra.simulator.ActionList; import org.apache.cassandra.simulator.ActionListener; -import org.apache.cassandra.simulator.ActionPlan; +import org.apache.cassandra.simulator.Debug; import org.apache.cassandra.simulator.RunnableActionScheduler; -import org.apache.cassandra.simulator.Actions; import org.apache.cassandra.simulator.cluster.ClusterActions; -import org.apache.cassandra.simulator.Debug; -import org.apache.cassandra.simulator.cluster.KeyspaceActions; -import org.apache.cassandra.simulator.systems.SimulatedActionTask; import org.apache.cassandra.simulator.systems.SimulatedSystems; import org.apache.cassandra.simulator.utils.IntRange; import org.apache.cassandra.utils.ByteBufferUtil; @@ -56,31 +49,18 @@ import static java.lang.Boolean.TRUE; import static java.util.Collections.emptyList; import static java.util.Collections.singletonList; -import static java.util.concurrent.TimeUnit.SECONDS; import static org.apache.cassandra.distributed.api.ConsistencyLevel.ANY; -import static org.apache.cassandra.simulator.Action.Modifiers.RELIABLE; -import static org.apache.cassandra.simulator.Action.Modifiers.RELIABLE_NO_TIMEOUTS; -import static org.apache.cassandra.simulator.ActionSchedule.Mode.STREAM_LIMITED; -import static org.apache.cassandra.simulator.ActionSchedule.Mode.TIME_AND_STREAM_LIMITED; -import static org.apache.cassandra.simulator.ActionSchedule.Mode.TIME_LIMITED; import static org.apache.cassandra.simulator.Debug.EventType.PARTITION; import static org.apache.cassandra.simulator.paxos.HistoryChecker.fail; @SuppressWarnings("unused") -public class PairOfSequencesPaxosSimulation extends PaxosSimulation +public class PairOfSequencesPaxosSimulation extends AbstractPairOfSequencesPaxosSimulation { private static final Logger logger = LoggerFactory.getLogger(PairOfSequencesPaxosSimulation.class); - - private static final String KEYSPACE = "simple_paxos_simulation"; - private static final String TABLE = "tbl"; - private static final String CREATE_TABLE = "CREATE TABLE " + KEYSPACE + ".tbl (pk int, count int, seq1 text, seq2 list, PRIMARY KEY (pk))"; - private static final String INSERT = "INSERT INTO " + KEYSPACE + ".tbl (pk, count, seq1, seq2) VALUES (?, 0, '', []) IF NOT EXISTS"; - private static final String INSERT1 = "INSERT INTO " + KEYSPACE + ".tbl (pk, count, seq1, seq2) VALUES (?, 0, '', []) USING TIMESTAMP 0"; private static final String UPDATE = "UPDATE " + KEYSPACE + ".tbl SET count = count + 1, seq1 = seq1 + ?, seq2 = seq2 + ? WHERE pk = ? IF EXISTS"; private static final String SELECT = "SELECT pk, count, seq1, seq2 FROM " + KEYSPACE + ".tbl WHERE pk = ?"; - private static final ListType LIST_TYPE = ListType.getInstance(Int32Type.instance, true); - class VerifyingOperation extends Operation + class VerifyingOperation extends PaxosOperation { final HistoryChecker historyChecker; public VerifyingOperation(int id, IInvokableInstance instance, ConsistencyLevel consistencyLevel, int primaryKey, HistoryChecker historyChecker) @@ -91,23 +71,26 @@ public VerifyingOperation(int id, IInvokableInstance instance, ConsistencyLevel void verify(Observation outcome) { - (outcome.result != null ? successfulReads : failedReads).incrementAndGet(); + SimpleQueryResult result = outcome.result; + (result != null ? successfulReads : failedReads).incrementAndGet(); - if (outcome.result == null) + if (result == null) return; - if (outcome.result.length != 1) - throw fail(primaryKey, "#result (%s) != 1", Arrays.toString(outcome.result)); + if (!result.hasNext()) + throw fail(primaryKey, "#result: ([]) != 1"); + + // pk, count, seq1, seq2 + Row row = result.next(); - Object[] row = outcome.result[0]; // first verify internally consistent - int count = row[1] == null ? 0 : (Integer) row[1]; - int[] seq1 = Arrays.stream((row[2] == null ? "" : (String) row[2]).split(",")) + int count = row.getInteger("count", 0); + int[] seq1 = Arrays.stream(row.getString("seq1", "").split(",")) .filter(s -> !s.isEmpty()) .mapToInt(Integer::parseInt) .toArray(); - int[] seq2 = ((List) (row[3] == null ? emptyList() : row[3])) - .stream().mapToInt(x -> x).toArray(); + + int[] seq2 = row.getList("seq2", emptyList()).stream().mapToInt(x -> x).toArray(); if (!Arrays.equals(seq1, seq2)) throw fail(primaryKey, "%s != %s", seq1, seq2); @@ -115,11 +98,24 @@ void verify(Observation outcome) if (seq1.length != count) throw fail(primaryKey, "%d != #%s", count, seq1); - historyChecker.witness(outcome, seq1, outcome.start, outcome.end); + if (result.hasNext()) + throw fail(primaryKey, "#result (%s) != 1", ArrayUtils.toString(result.toObjectArrays())); + + historyChecker.witness(outcome, seq1); } } - class NonVerifyingOperation extends Operation + private abstract class PaxosOperation extends Operation + { + final int primaryKey; + PaxosOperation(int primaryKey, int id, IInvokableInstance instance, String idString, String query, ConsistencyLevel commitConsistency, ConsistencyLevel serialConsistency, Object... params) + { + super(new int[] {primaryKey}, id, instance, idString, new Query(query, -1, commitConsistency, serialConsistency, params)); + this.primaryKey = primaryKey; + } + } + + class NonVerifyingOperation extends PaxosOperation { public NonVerifyingOperation(int id, IInvokableInstance instance, ConsistencyLevel consistencyLevel, int primaryKey, HistoryChecker historyChecker) { @@ -131,7 +127,7 @@ void verify(Observation outcome) } } - public class ModifyingOperation extends Operation + public class ModifyingOperation extends PaxosOperation { final HistoryChecker historyChecker; public ModifyingOperation(int id, IInvokableInstance instance, ConsistencyLevel commitConsistency, ConsistencyLevel serialConsistency, int primaryKey, HistoryChecker historyChecker) @@ -142,70 +138,46 @@ public ModifyingOperation(int id, IInvokableInstance instance, ConsistencyLevel void verify(Observation outcome) { - (outcome.result != null ? successfulWrites : failedWrites).incrementAndGet(); - if (outcome.result != null) + SimpleQueryResult result = outcome.result; + (result != null ? successfulWrites : failedWrites).incrementAndGet(); + if (result != null) { - if (outcome.result.length != 1) - throw fail(primaryKey, "Result: 1 != #%s", Arrays.toString(outcome.result)); - if (outcome.result[0][0] != TRUE) + if (!result.hasNext()) + throw fail(primaryKey, "Paxos Result: 1 != #[]"); + if (result.next().getBoolean(0) != TRUE) throw fail(primaryKey, "Result != TRUE"); + if (result.hasNext()) + throw fail(primaryKey, "Paxos Result: 1 != #%s", ArrayUtils.toString(result.toObjectArrays())); } - historyChecker.applied(outcome.id, outcome.start, outcome.end, outcome.result != null); + historyChecker.applied(outcome.id, outcome.start, outcome.end, outcome.isSuccess()); } } - final ClusterActions.Options clusterOptions; - final float readRatio; - final IntRange withinKeyConcurrency; - final int concurrency; - final IntRange simulateKeyForSeconds; - final ConsistencyLevel serialConsistency; - final Debug debug; final List historyCheckers = new ArrayList<>(); - final AtomicInteger successfulReads = new AtomicInteger(); - final AtomicInteger successfulWrites = new AtomicInteger(); - final AtomicInteger failedReads = new AtomicInteger(); - final AtomicInteger failedWrites = new AtomicInteger(); - final long seed; - final int[] primaryKeys; + private final TransactionalMode transactionalMode; public PairOfSequencesPaxosSimulation(SimulatedSystems simulated, Cluster cluster, ClusterActions.Options clusterOptions, + TransactionalMode transactionalMode, float readRatio, int concurrency, IntRange simulateKeyForSeconds, IntRange withinKeyConcurrency, ConsistencyLevel serialConsistency, RunnableActionScheduler scheduler, Debug debug, long seed, int[] primaryKeys, long runForNanos, LongSupplier jitter) { - super(runForNanos < 0 ? STREAM_LIMITED : clusterOptions.topologyChangeLimit < 0 ? TIME_LIMITED : TIME_AND_STREAM_LIMITED, - simulated, cluster, scheduler, runForNanos, jitter); - this.readRatio = readRatio; - this.concurrency = concurrency; - this.simulateKeyForSeconds = simulateKeyForSeconds; - this.withinKeyConcurrency = withinKeyConcurrency; - this.serialConsistency = serialConsistency; - this.clusterOptions = clusterOptions; - this.debug = debug; - this.seed = seed; - this.primaryKeys = primaryKeys.clone(); - Arrays.sort(this.primaryKeys); + super(simulated, cluster, clusterOptions, + readRatio, concurrency, simulateKeyForSeconds, withinKeyConcurrency, + serialConsistency, + scheduler, debug, + seed, primaryKeys, + runForNanos, jitter); + this.transactionalMode = transactionalMode; } - public ActionPlan plan() + @Override + BiFunction> actionFactory() { - ActionPlan plan = new KeyspaceActions(simulated, KEYSPACE, TABLE, CREATE_TABLE, cluster, - clusterOptions, serialConsistency, this, primaryKeys, debug).plan(); - - plan = plan.encapsulate(ActionPlan.setUpTearDown( - ActionList.of( - cluster.stream().map(i -> simulated.run("Insert Partitions", i, executeForPrimaryKeys(INSERT1, primaryKeys))) - ), - ActionList.of( - cluster.stream().map(i -> SimulatedActionTask.unsafeTask("Shutdown " + i.broadcastAddress(), RELIABLE, RELIABLE_NO_TIMEOUTS, simulated, i, i::shutdown)) - ) - )); - final int nodes = cluster.size(); for (int primaryKey : primaryKeys) historyCheckers.add(new HistoryChecker(primaryKey)); @@ -231,12 +203,12 @@ public Action get() if (simulated.snitch.dcOf(node) > 0) { // perform some queries against these nodes but don't expect them to be linearizable - return new NonVerifyingOperation(i++, instance, serialConsistency, primaryKey, historyChecker); + return nonVerifying(i++, instance, primaryKey, historyChecker); } case SERIAL: return simulated.random.decide(readRatio) - ? new VerifyingOperation(i++, instance, serialConsistency, primaryKey, historyChecker) - : new ModifyingOperation(i++, instance, ANY, serialConsistency, primaryKey, historyChecker); + ? verifying(i++, instance, primaryKey, historyChecker) + : modifying(i++, instance, primaryKey, historyChecker); } } @@ -271,71 +243,55 @@ public String toString() primaryKeyActions.add(supplier); } + return (ignore, primaryKeyIndex) -> primaryKeyActions.get(only(primaryKeyIndex)); + } - List available = IntStream.range(0, primaryKeys.length).boxed().collect(Collectors.toList()); - Action stream = Actions.infiniteStream(concurrency, new Supplier() { - @Override - public Action get() - { - int i = simulated.random.uniform(0, available.size()); - int next = available.get(i); - available.set(i, available.get(available.size() - 1)); - available.remove(available.size() - 1); - long untilNanos = simulated.time.nanoTime() + SECONDS.toNanos(simulateKeyForSeconds.select(simulated.random)); - int concurrency = withinKeyConcurrency.select(simulated.random); - Supplier supplier = primaryKeyActions.get(next); - // while this stream is finite, it participates in an infinite stream via its parent, so we want to permit termination while it's running - return Actions.infiniteStream(concurrency, new Supplier() - { - @Override - public Action get() - { - if (simulated.time.nanoTime() >= untilNanos) - { - available.add(next); - return null; - } - return supplier.get(); - } + private static int only(int[] array) + { + if (array.length != 1) + throw new AssertionError("Require only 1 element but found array " + Arrays.toString(array)); + return array[0]; + } - @Override - public String toString() - { - return supplier.toString(); - } - }); - } + @Override + protected String createTableStmt() + { + return "CREATE TABLE " + KEYSPACE + ".tbl (pk int, count int, seq1 text, seq2 list, PRIMARY KEY (pk)) WITH " + transactionalMode.asCqlParam(); + } - @Override - public String toString() - { - return "Primary Key Actions"; - } - }); + @Override + protected String preInsertStmt() + { + return "INSERT INTO " + KEYSPACE + ".tbl (pk, count, seq1, seq2) VALUES (?, 0, '', []) USING TIMESTAMP 0"; + } - return simulated.execution.plan() - .encapsulate(plan) - .encapsulate(ActionPlan.interleave(singletonList(ActionList.of(stream)))); + private Operation verifying(int operationId, IInvokableInstance instance, int primaryKey, HistoryChecker historyChecker) + { + return new VerifyingOperation(operationId, instance, serialConsistency, primaryKey, historyChecker); } - private IIsolatedExecutor.SerializableRunnable executeForPrimaryKeys(String cql, int[] primaryKeys) + private Operation nonVerifying(int operationId, IInvokableInstance instance, int primaryKey, HistoryChecker historyChecker) { - return () -> { - for (int primaryKey : primaryKeys) - Instance.unsafeExecuteInternalWithResult(cql, primaryKey); - }; + return new NonVerifyingOperation(operationId, instance, serialConsistency, primaryKey, historyChecker); } - @Override - public TopologyChangeValidator newTopologyChangeValidator(Object id) + private Operation modifying(int operationId, IInvokableInstance instance, int primaryKey, HistoryChecker historyChecker) { - return new PaxosTopologyChangeVerifier(cluster, KEYSPACE, TABLE, id); + return new ModifyingOperation(operationId, instance, ANY, serialConsistency, primaryKey, historyChecker); } @Override - public RepairValidator newRepairValidator(Object id) + protected Class[] expectedExceptions() + { + return maybeAccord() ? expectedExceptionsAccord() : expectedExceptionsPaxos(); + } + + private boolean maybeAccord() { - return new PaxosRepairValidator(cluster, KEYSPACE, TABLE, id); + // at startup are we accord? + if (transactionalMode != null && transactionalMode.accordIsEnabled) return true; + // do we migrate to accord? + return clusterOptions.consensusChangeLimit > 0; } @Override @@ -346,10 +302,12 @@ void log(@Nullable Integer primaryKey) } @Override - public void run() + boolean joinAll() { - super.run(); - logger.warn("Writes: {} successful, {} failed", successfulWrites, failedWrites); - logger.warn("Reads: {} successful {} failed", successfulReads, failedReads); + // Consensus migration means Accord is running and Accord doesn't yet support joining nodes + if ((clusterOptions.consensusChangeLimit == -1 || clusterOptions.consensusChangeLimit > 0) + && clusterOptions.consensusChoices.options.length > 0) + return true; + return false; } } diff --git a/test/simulator/main/org/apache/cassandra/simulator/paxos/PaxosClusterSimulation.java b/test/simulator/main/org/apache/cassandra/simulator/paxos/PaxosClusterSimulation.java index 03d7e61e7ff2..dc8a538b6ca3 100644 --- a/test/simulator/main/org/apache/cassandra/simulator/paxos/PaxosClusterSimulation.java +++ b/test/simulator/main/org/apache/cassandra/simulator/paxos/PaxosClusterSimulation.java @@ -22,8 +22,8 @@ import org.apache.cassandra.config.Config.PaxosVariant; import org.apache.cassandra.distributed.api.ConsistencyLevel; -import org.apache.cassandra.simulator.RandomSource; import org.apache.cassandra.simulator.ClusterSimulation; +import org.apache.cassandra.simulator.RandomSource; import org.apache.cassandra.simulator.utils.KindOfSequence; import static java.util.concurrent.TimeUnit.SECONDS; @@ -79,12 +79,13 @@ public PaxosClusterSimulation create(long seed) throws IOException .set("paxos_cache_size", (builder.stateCache != null ? builder.stateCache : random.uniformFloat() < 0.5) ? null : "0MiB") .set("paxos_state_purging", "repaired") .set("paxos_on_linearizability_violations", "log") + .set("storage_compatibility_mode", "NONE") , (simulated, schedulers, cluster, options) -> { int[] primaryKeys = primaryKeys(seed, builder.primaryKeyCount()); KindOfSequence.Period jitter = RandomSource.Choices.uniform(KindOfSequence.values()).choose(random) .period(builder.schedulerJitterNanos(), random); - return new PairOfSequencesPaxosSimulation(simulated, cluster, options.changePaxosVariantTo(builder.finalPaxosVariant), + return new PairOfSequencesPaxosSimulation(simulated, cluster, options.changePaxosVariantTo(builder.finalPaxosVariant), builder.transactionalMode(), builder.readChance().select(random), builder.concurrency(), builder.primaryKeySeconds(), builder.withinKeyConcurrency(), builder.serialConsistency, schedulers, builder.debug(), seed, primaryKeys, builder.secondsToSimulate() >= 0 ? SECONDS.toNanos(builder.secondsToSimulate()) : -1, diff --git a/test/simulator/main/org/apache/cassandra/simulator/paxos/PaxosSimulation.java b/test/simulator/main/org/apache/cassandra/simulator/paxos/PaxosSimulation.java index a6fbc444651b..e58e691962ef 100644 --- a/test/simulator/main/org/apache/cassandra/simulator/paxos/PaxosSimulation.java +++ b/test/simulator/main/org/apache/cassandra/simulator/paxos/PaxosSimulation.java @@ -18,7 +18,12 @@ package org.apache.cassandra.simulator.paxos; +import java.nio.channels.ClosedChannelException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; import java.util.Map; +import java.util.concurrent.CancellationException; import java.util.concurrent.ScheduledFuture; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; @@ -26,17 +31,23 @@ import java.util.concurrent.atomic.AtomicReference; import java.util.function.BiConsumer; import java.util.function.LongSupplier; +import java.util.stream.Stream; import javax.annotation.Nullable; import com.google.common.base.Throwables; +import org.apache.cassandra.gms.FailureDetector; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import accord.coordinate.CoordinationFailed; +import accord.coordinate.Invalidated; +import accord.coordinate.Preempted; import org.apache.cassandra.concurrent.ExecutorFactory; import org.apache.cassandra.concurrent.ScheduledExecutorPlus; import org.apache.cassandra.distributed.Cluster; -import org.apache.cassandra.distributed.api.ConsistencyLevel; import org.apache.cassandra.distributed.api.IInvokableInstance; +import org.apache.cassandra.distributed.api.IIsolatedExecutor; +import org.apache.cassandra.distributed.api.SimpleQueryResult; import org.apache.cassandra.exceptions.RequestExecutionException; import org.apache.cassandra.service.paxos.BallotGenerator; import org.apache.cassandra.simulator.ActionList; @@ -46,8 +57,10 @@ import org.apache.cassandra.simulator.Simulation; import org.apache.cassandra.simulator.cluster.ClusterActionListener; import org.apache.cassandra.simulator.systems.InterceptorOfGlobalMethods; -import org.apache.cassandra.simulator.systems.SimulatedQuery; +import org.apache.cassandra.simulator.systems.SimulatedActionCallable; import org.apache.cassandra.simulator.systems.SimulatedSystems; +import org.apache.cassandra.streaming.StreamReceivedOutOfTokenRangeException; +import org.apache.cassandra.utils.AssertionUtils; import org.apache.cassandra.utils.CloseableIterator; import org.apache.cassandra.utils.concurrent.Threads; import org.apache.cassandra.utils.concurrent.UncheckedInterruptedException; @@ -57,22 +70,53 @@ import static org.apache.cassandra.simulator.Action.Modifiers.NONE; import static org.apache.cassandra.simulator.SimulatorUtils.failWithOOM; import static org.apache.cassandra.simulator.paxos.HistoryChecker.causedBy; +import static org.apache.cassandra.utils.AssertionUtils.anyOf; +import static org.apache.cassandra.utils.AssertionUtils.hasCause; public abstract class PaxosSimulation implements Simulation, ClusterActionListener { private static final Logger logger = LoggerFactory.getLogger(PaxosSimulation.class); - abstract class Operation extends SimulatedQuery implements BiConsumer + private static String createDescription(int[] primaryKeys, int id, String idString) { - final int primaryKey; + return (primaryKeys.length == 1 ? Integer.toString(primaryKeys[0]) : Arrays.toString(primaryKeys)) + '/' + id + ": " + idString; + } + + @SuppressWarnings("unchecked") + protected Class[] expectedExceptionsPaxos() + { + return (Class[]) new Class[] { RequestExecutionException.class, + CancellationException.class, + FailureDetector.UnknownEndpointException.class}; + } + + @SuppressWarnings("unchecked") + protected Class[] expectedExceptionsAccord() + { + return (Class[]) new Class[] { RequestExecutionException.class, + Invalidated.class, + Preempted.class, + CancellationException.class, + CoordinationFailed.class, + ClosedChannelException.class, + FailureDetector.UnknownEndpointException.class, + StreamReceivedOutOfTokenRangeException.class // should always come in combination with closed channel exception + }; + } + + protected abstract Class[] expectedExceptions(); + + abstract class Operation extends SimulatedActionCallable implements BiConsumer + { + final int[] primaryKeys; final int id; int start; - public Operation(int primaryKey, int id, IInvokableInstance instance, - String idString, String query, ConsistencyLevel commitConsistency, ConsistencyLevel serialConistency, Object... params) + public Operation(int[] primaryKeys, int id, IInvokableInstance instance, + String idString, IIsolatedExecutor.SerializableCallable query) { - super(primaryKey + "/" + id + ": " + idString, DISPLAY_ORIGIN, NONE, PaxosSimulation.this.simulated, instance, query, commitConsistency, serialConistency, params); - this.primaryKey = primaryKey; + super(createDescription(primaryKeys, id, idString), DISPLAY_ORIGIN, NONE, PaxosSimulation.this.simulated, instance, query); + this.primaryKeys = primaryKeys; this.id = id; } @@ -82,12 +126,30 @@ public ActionList performAndRegister() return super.performAndRegister(); } + private boolean wasInterrupted(Throwable failure) + { + if (failure instanceof UncheckedInterruptedException) + return true; + + if (failure instanceof InterruptedException) + return true; + + Throwable cause = failure.getCause(); + while (cause != null && cause != failure) + { + if (cause instanceof InterruptedException) + return true; + cause = cause.getCause(); + } + return false; + } + @Override - public void accept(Object[][] success, Throwable failure) + public void accept(SimpleQueryResult success, Throwable failure) { - if (failure != null && !(failure instanceof RequestExecutionException)) + if (failure != null && !expectedException(failure)) { - if (!simulated.failures.hasFailure() || !(failure instanceof UncheckedInterruptedException)) + if (!simulated.failures.hasFailure() || !wasInterrupted(failure)) logger.error("Unexpected exception", failure); simulated.failures.accept(failure); return; @@ -96,10 +158,14 @@ else if (failure != null) { logger.trace("{}", failure.getMessage()); } - verify(new Observation(id, success, start, logicalClock.incrementAndGet())); } + protected boolean expectedException(Throwable failure) + { + // due to class loaders can't use instanceOf directly + return hasCause(anyOf(Stream.of(expectedExceptions()).map(AssertionUtils::isThrowableInstanceof))).matches(failure); + } abstract void verify(Observation outcome); } @@ -248,19 +314,39 @@ RuntimeException failWith(Throwable t) private RuntimeException logAndThrow() { - Integer causedByPrimaryKey = null; - Throwable causedByThrowable = null; + class Violation + { + final int primaryKey; + final Throwable cause; + + Violation(int primaryKey, Throwable cause) + { + this.primaryKey = primaryKey; + this.cause = cause; + } + } + List violations = new ArrayList<>(); for (Throwable t : simulated.failures.get()) { + Integer causedByPrimaryKey; if (null != (causedByPrimaryKey = causedBy(t))) { - causedByThrowable = t; + violations.add(new Violation(causedByPrimaryKey, t)); break; } } - log(causedByPrimaryKey); - Throwable t = (causedByPrimaryKey != null) ? causedByThrowable : simulated.failures.get().get(0); + if (!violations.isEmpty()) + { + AssertionError error = new AssertionError("History violations detected"); + violations.forEach(v -> { + log(v.primaryKey); + error.addSuppressed(v.cause); + }); + throw error; + } + + Throwable t = simulated.failures.get().get(0); Throwables.throwIfUnchecked(t); throw new RuntimeException(t); } diff --git a/test/simulator/main/org/apache/cassandra/simulator/paxos/PaxosSimulationRunner.java b/test/simulator/main/org/apache/cassandra/simulator/paxos/PaxosSimulationRunner.java index 50a0ee5b51c5..7c9062d3d52f 100644 --- a/test/simulator/main/org/apache/cassandra/simulator/paxos/PaxosSimulationRunner.java +++ b/test/simulator/main/org/apache/cassandra/simulator/paxos/PaxosSimulationRunner.java @@ -19,18 +19,26 @@ package org.apache.cassandra.simulator.paxos; import java.io.IOException; +import java.util.Objects; import java.util.Optional; import java.util.concurrent.atomic.AtomicInteger; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import io.airlift.airline.Cli; import io.airlift.airline.Command; import io.airlift.airline.Option; import org.apache.cassandra.config.Config; import org.apache.cassandra.distributed.api.ConsistencyLevel; import org.apache.cassandra.simulator.SimulationRunner; +import org.apache.cassandra.simulator.SimulatorUtils; +import org.apache.cassandra.simulator.utils.IntRange; public class PaxosSimulationRunner extends SimulationRunner { + private static Logger logger = LoggerFactory.getLogger(PaxosSimulationRunner.class); + @Command(name = "run") public static class Run extends SimulationRunner.Run { @@ -57,6 +65,18 @@ protected void propagate(PaxosClusterSimulation.Builder builder) super.propagate(builder); propagateTo(consistency, withStateCache, withoutStateCache, variant, toVariant, builder); } + + @Override + protected void run( long seed, PaxosClusterSimulation.Builder builder) throws IOException + { + if (!Objects.equals(builder.transactionalMode(), "off")) + { + // Apply handicaps + builder.dcs(new IntRange(1, 1)); + builder.nodes(new IntRange(3, 3)); + } + super.run(seed, builder); + } } @Command(name = "record") @@ -88,7 +108,8 @@ protected void propagate(PaxosClusterSimulation.Builder builder) } @Command(name = "reconcile") - public static class Reconcile extends SimulationRunner.Reconcile + public static class + Reconcile extends SimulationRunner.Reconcile { @Option(name = "--consistency") String consistency; @@ -116,6 +137,7 @@ protected void propagate(PaxosClusterSimulation.Builder builder) } public static class Help extends HelpCommand {} + public static class Version extends VersionCommand {} static void propagateTo(String consistency, boolean withStateCache, boolean withoutStateCache, String variant, String toVariant, PaxosClusterSimulation.Builder builder) { @@ -134,6 +156,7 @@ static void propagateTo(String consistency, boolean withStateCache, boolean with */ public static void main(String[] args) throws IOException { + SimulatorUtils.verifyAndlogSimulatorArgs(logger, args); PaxosClusterSimulation.Builder builder = new PaxosClusterSimulation.Builder(); builder.unique(uniqueNum.getAndIncrement()); @@ -141,6 +164,7 @@ public static void main(String[] args) throws IOException .withCommand(Run.class) .withCommand(Reconcile.class) .withCommand(Record.class) + .withCommand(Version.class) .withCommand(Help.class) .withDefaultCommand(Help.class) .build() diff --git a/test/simulator/main/org/apache/cassandra/simulator/paxos/PaxosTopologyChangeVerifier.java b/test/simulator/main/org/apache/cassandra/simulator/paxos/PaxosTopologyChangeVerifier.java index 46c4c9ecf6c3..42a80b9121d0 100644 --- a/test/simulator/main/org/apache/cassandra/simulator/paxos/PaxosTopologyChangeVerifier.java +++ b/test/simulator/main/org/apache/cassandra/simulator/paxos/PaxosTopologyChangeVerifier.java @@ -58,7 +58,7 @@ public void before(Topology before, int[] participatingKeys) @Override public void after(Topology topologyAfter) { - afterInternal(topologyAfter.select(topologyBefore.primaryKeys)); + afterInternal(topologyBefore == null ? topologyAfter : topologyAfter.select(topologyBefore.primaryKeys)); } public void afterInternal(Topology topologyAfter) diff --git a/test/simulator/main/org/apache/cassandra/simulator/paxos/StrictSerializabilityValidator.java b/test/simulator/main/org/apache/cassandra/simulator/paxos/StrictSerializabilityValidator.java new file mode 100644 index 000000000000..332bb3d660ad --- /dev/null +++ b/test/simulator/main/org/apache/cassandra/simulator/paxos/StrictSerializabilityValidator.java @@ -0,0 +1,111 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.simulator.paxos; + +import javax.annotation.Nullable; + +import accord.verify.StrictSerializabilityVerifier; +import com.carrotsearch.hppc.IntIntHashMap; +import com.carrotsearch.hppc.IntIntMap; + +public class StrictSerializabilityValidator implements HistoryValidator +{ + private final StrictSerializabilityVerifier verifier; + private final IntIntMap pkToIndex; + private final int[] indexToPk; + + public StrictSerializabilityValidator(int[] primaryKeys) + { + this.verifier = new StrictSerializabilityVerifier("", primaryKeys.length); + pkToIndex = new IntIntHashMap(primaryKeys.length); + indexToPk = new int[primaryKeys.length]; + for (int i = 0; i < primaryKeys.length; i++) + { + pkToIndex.put(primaryKeys[i], i); + indexToPk[i] = primaryKeys[i]; + } + } + + @Override + public Checker witness(int start, int end) + { + verifier.begin(); + return new Checker() + { + @Override + public void read(int pk, int id, int count, int[] seq) + { + verifier.witnessRead(get(pk), seq); + } + + @Override + public void write(int pk, int id, boolean success) + { + verifier.witnessWrite(get(pk), id); + } + + @Override + public void close() + { + convertHistoryViolation(() -> verifier.apply("", start, end)); + } + }; + } + + @Override + public void print(@Nullable Integer pk) + { + if (pk == null) verifier.print(); + else verifier.print(get(pk)); + } + + private int get(int pk) + { + if (pkToIndex.containsKey(pk)) + return pkToIndex.get(pk); + throw new IllegalArgumentException("Unknown pk=" + pk); + } + + private void convertHistoryViolation(Runnable fn) + { + try + { + fn.run(); + } + catch (accord.verify.HistoryViolation e) + { + if (!(e.primaryKey() >= 0 && e.primaryKey() < indexToPk.length)) throw new IllegalArgumentException("Unable to find primary key by index " + e.primaryKey()); + int pk = indexToPk[e.primaryKey()]; + HistoryViolation v = new HistoryViolation(pk, e.getMessage()); + v.setStackTrace(e.getStackTrace()); + throw v; + } + } + + public static class Factory implements HistoryValidator.Factory + { + public static final Factory instance = new Factory(); + + @Override + public HistoryValidator create(int[] partitions) + { + return new StrictSerializabilityValidator(partitions); + } + } +} diff --git a/test/simulator/main/org/apache/cassandra/simulator/systems/InterceptibleThread.java b/test/simulator/main/org/apache/cassandra/simulator/systems/InterceptibleThread.java index 0cb26bf0e5a5..1300a0a47d30 100644 --- a/test/simulator/main/org/apache/cassandra/simulator/systems/InterceptibleThread.java +++ b/test/simulator/main/org/apache/cassandra/simulator/systems/InterceptibleThread.java @@ -159,8 +159,7 @@ synchronized void await() } catch (InterruptedException e) { - if (!isTriggered()) throw new UncheckedInterruptedException(e); - else doInterrupt(); + doInterrupt(); } } diff --git a/test/simulator/main/org/apache/cassandra/simulator/systems/InterceptingExecutor.java b/test/simulator/main/org/apache/cassandra/simulator/systems/InterceptingExecutor.java index a9c0fb2e00ef..1c38cd813d72 100644 --- a/test/simulator/main/org/apache/cassandra/simulator/systems/InterceptingExecutor.java +++ b/test/simulator/main/org/apache/cassandra/simulator/systems/InterceptingExecutor.java @@ -91,43 +91,9 @@ interface InterceptingTaskFactory extends TaskFactory OrderOn orderAppliesAfterScheduling(); - static class InterceptedScheduledFutureTask extends SyncFutureTask implements ScheduledFuture + interface InterceptableScheduledFuture extends ScheduledFuture, RunnableFuture { - final long delayNanos; - Runnable onCancel; - public InterceptedScheduledFutureTask(long delayNanos, Callable call) - { - super(call); - this.delayNanos = delayNanos; - } - - @Override - public long getDelay(TimeUnit unit) - { - return unit.convert(delayNanos, NANOSECONDS); - } - - @Override - public int compareTo(Delayed that) - { - return Long.compare(delayNanos, that.getDelay(NANOSECONDS)); - } - - void onCancel(Runnable onCancel) - { - this.onCancel = onCancel; - } - - @Override - public boolean cancel(boolean b) - { - if (onCancel != null) - { - onCancel.run(); - onCancel = null; - } - return super.cancel(b); - } + void onCancel(Runnable onCancel); } @PerClassLoader @@ -715,6 +681,45 @@ public String toString() @PerClassLoader class InterceptingSequentialExecutor extends AbstractSingleThreadedExecutorPlus implements InterceptingExecutor, ScheduledExecutorPlus, OrderOn { + static class InterceptableScheduledFutureTask extends SyncFutureTask implements InterceptableScheduledFuture + { + final long delayNanos; + Runnable onCancel; + public InterceptableScheduledFutureTask(long delayNanos, Callable call) + { + super(call); + this.delayNanos = delayNanos; + } + + @Override + public long getDelay(TimeUnit unit) + { + return unit.convert(delayNanos, NANOSECONDS); + } + + @Override + public int compareTo(Delayed that) + { + return Long.compare(delayNanos, that.getDelay(NANOSECONDS)); + } + + public void onCancel(Runnable onCancel) + { + this.onCancel = onCancel; + } + + @Override + public boolean cancel(boolean b) + { + if (onCancel != null) + { + onCancel.run(); + onCancel = null; + } + return super.cancel(b); + } + } + InterceptingSequentialExecutor(InterceptorOfExecution interceptorOfExecution, ThreadFactory threadFactory, InterceptingTaskFactory taskFactory) { super(interceptorOfExecution, threadFactory, taskFactory); @@ -765,7 +770,7 @@ public ScheduledFuture schedule(Runnable run, long delay, TimeUnit unit) throw new RejectedExecutionException(); long delayNanos = unit.toNanos(delay); - return interceptorOfExecution.intercept().schedule(SCHEDULED_TASK, delayNanos, relativeToGlobalNanos(delayNanos), callable(run, null), this); + return schedule(SCHEDULED_TASK, delayNanos, relativeToGlobalNanos(delayNanos), callable(run, null)); } public ScheduledFuture schedule(Callable callable, long delay, TimeUnit unit) @@ -774,7 +779,7 @@ public ScheduledFuture schedule(Callable callable, long delay, TimeUni throw new RejectedExecutionException(); long delayNanos = unit.toNanos(delay); - return interceptorOfExecution.intercept().schedule(SCHEDULED_TASK, delayNanos, relativeToGlobalNanos(delayNanos), callable, this); + return schedule(SCHEDULED_TASK, delayNanos, relativeToGlobalNanos(delayNanos), callable); } public ScheduledFuture scheduleTimeoutWithDelay(Runnable run, long delay, TimeUnit unit) @@ -787,7 +792,7 @@ public ScheduledFuture scheduleAt(Runnable run, long deadlineNanos) if (isShutdown) throw new RejectedExecutionException(); - return interceptorOfExecution.intercept().schedule(SCHEDULED_TASK, localToRelativeNanos(deadlineNanos), localToGlobalNanos(deadlineNanos), callable(run, null), this); + return schedule(SCHEDULED_TASK, localToRelativeNanos(deadlineNanos), localToGlobalNanos(deadlineNanos), callable(run, null)); } public ScheduledFuture scheduleTimeoutAt(Runnable run, long deadlineNanos) @@ -795,7 +800,7 @@ public ScheduledFuture scheduleTimeoutAt(Runnable run, long deadlineNanos) if (isShutdown) throw new RejectedExecutionException(); - return interceptorOfExecution.intercept().schedule(SCHEDULED_TIMEOUT, localToRelativeNanos(deadlineNanos), localToGlobalNanos(deadlineNanos), callable(run, null), this); + return schedule(SCHEDULED_TIMEOUT, localToRelativeNanos(deadlineNanos), localToGlobalNanos(deadlineNanos), callable(run, null)); } public ScheduledFuture scheduleSelfRecurring(Runnable run, long delay, TimeUnit unit) @@ -804,7 +809,7 @@ public ScheduledFuture scheduleSelfRecurring(Runnable run, long delay, TimeUn throw new RejectedExecutionException(); long delayNanos = unit.toNanos(delay); - return interceptorOfExecution.intercept().schedule(SCHEDULED_DAEMON, delayNanos, relativeToGlobalNanos(delayNanos), callable(run, null), this); + return schedule(SCHEDULED_DAEMON, delayNanos, relativeToGlobalNanos(delayNanos), callable(run, null)); } public ScheduledFuture scheduleAtFixedRate(Runnable run, long initialDelay, long period, TimeUnit unit) @@ -813,7 +818,7 @@ public ScheduledFuture scheduleAtFixedRate(Runnable run, long initialDelay, l throw new RejectedExecutionException(); long delayNanos = unit.toNanos(initialDelay); - return interceptorOfExecution.intercept().schedule(SCHEDULED_DAEMON, delayNanos, relativeToGlobalNanos(delayNanos), new Callable() + return schedule(SCHEDULED_DAEMON, delayNanos, relativeToGlobalNanos(delayNanos), new Callable() { @Override public Object call() @@ -829,7 +834,12 @@ public String toString() { return run.toString(); } - }, this); + }); + } + + ScheduledFuture schedule(SimulatedAction.Kind kind, long delayNanos, long deadlineNanos, Callable task) + { + return interceptorOfExecution.intercept().schedule(kind, delayNanos, deadlineNanos, new InterceptableScheduledFutureTask<>(delayNanos, task), this); } public ScheduledFuture scheduleWithFixedDelay(Runnable run, long initialDelay, long delay, TimeUnit unit) @@ -843,6 +853,8 @@ public int concurrency() } } + + @PerClassLoader class InterceptingPooledLocalAwareExecutor extends InterceptingPooledExecutor implements LocalAwareExecutorPlus { diff --git a/test/simulator/main/org/apache/cassandra/simulator/systems/InterceptingExecutorFactory.java b/test/simulator/main/org/apache/cassandra/simulator/systems/InterceptingExecutorFactory.java index c7f4dce8b85a..aa22e3ef8683 100644 --- a/test/simulator/main/org/apache/cassandra/simulator/systems/InterceptingExecutorFactory.java +++ b/test/simulator/main/org/apache/cassandra/simulator/systems/InterceptingExecutorFactory.java @@ -30,13 +30,13 @@ import com.google.common.annotations.VisibleForTesting; +import accord.utils.UnhandledEnum; import io.netty.util.concurrent.FastThreadLocal; import org.apache.cassandra.concurrent.ExecutorBuilder; import org.apache.cassandra.concurrent.ExecutorBuilderFactory; import org.apache.cassandra.concurrent.ExecutorFactory; import org.apache.cassandra.concurrent.ExecutorPlus; import org.apache.cassandra.concurrent.InfiniteLoopExecutor; -import org.apache.cassandra.concurrent.InfiniteLoopExecutor.Daemon; import org.apache.cassandra.concurrent.InfiniteLoopExecutor.Interrupts; import org.apache.cassandra.concurrent.InfiniteLoopExecutor.SimulatorSafe; import org.apache.cassandra.concurrent.Interruptible.Task; @@ -69,6 +69,8 @@ import org.apache.cassandra.utils.concurrent.RunnableFuture; import static org.apache.cassandra.simulator.systems.SimulatedAction.Kind.INFINITE_LOOP; +import static org.apache.cassandra.simulator.systems.SimulatedAction.Kind.SCHEDULED_DAEMON; +import static org.apache.cassandra.simulator.systems.SimulatedAction.Kind.THREAD; public class InterceptingExecutorFactory implements ExecutorFactory, Closeable { @@ -327,9 +329,18 @@ public ExecutorPlus pooled(String name, int threads) return configurePooled(name, threads).build(); } - public Thread startThread(String name, Runnable runnable, Daemon daemon) + public Thread startThread(String name, Runnable runnable, SystemThreadTag systemTag, SimulatorThreadTag simulatorTag) { - return simulatedExecution.intercept().start(SimulatedAction.Kind.THREAD, factory(name)::newThread, runnable); + SimulatedAction.Kind kind; + switch (simulatorTag) + { + default: throw UnhandledEnum.unknown(simulatorTag); + case INFINITE_LOOP: kind = INFINITE_LOOP; break; + case JOB: kind = THREAD; break; + case DAEMON: kind = SCHEDULED_DAEMON; break; + } + + return simulatedExecution.intercept().start(kind, factory(name)::newThread, runnable); } @VisibleForTesting @@ -341,7 +352,7 @@ public InterceptedExecution.InterceptedThreadStart startParked(String name, Runn } @Override - public Interruptible infiniteLoop(String name, Task task, SimulatorSafe simulatorSafe, Daemon daemon, Interrupts interrupts) + public Interruptible infiniteLoop(String name, Task task, SimulatorSafe simulatorSafe, SystemThreadTag systemTag, Interrupts interrupts) { if (simulatorSafe != SimulatorSafe.SAFE) { diff --git a/test/simulator/main/org/apache/cassandra/simulator/systems/InterceptingGlobalMethods.java b/test/simulator/main/org/apache/cassandra/simulator/systems/InterceptingGlobalMethods.java index 34c0f6bacc26..75f525e640a4 100644 --- a/test/simulator/main/org/apache/cassandra/simulator/systems/InterceptingGlobalMethods.java +++ b/test/simulator/main/org/apache/cassandra/simulator/systems/InterceptingGlobalMethods.java @@ -116,7 +116,7 @@ public InterceptibleThread ifIntercepted() if (!disabled) { - logger.error("Caught a non-intercepted thread! " + thread, new RuntimeException()); + logger().error("Caught a non-intercepted thread! " + thread, new RuntimeException()); throw failWithOOM(); } diff --git a/test/simulator/main/org/apache/cassandra/simulator/systems/InterceptingMonitors.java b/test/simulator/main/org/apache/cassandra/simulator/systems/InterceptingMonitors.java index 3aabd18e5e29..1fcc4a8354c5 100644 --- a/test/simulator/main/org/apache/cassandra/simulator/systems/InterceptingMonitors.java +++ b/test/simulator/main/org/apache/cassandra/simulator/systems/InterceptingMonitors.java @@ -55,9 +55,21 @@ @SuppressWarnings("SynchronizationOnLocalVariableOrMethodParameter") public abstract class InterceptingMonitors implements InterceptorOfGlobalMethods, Closeable { - private static final Logger logger = LoggerFactory.getLogger(InterceptingMonitors.class); private static final boolean DEBUG_MONITOR_STATE = TEST_SIMULATOR_DEBUG.getBoolean(); + // eagerly initializing the logger prevents the dtest instance variables + // from being setup correctly, which causes all nodes to log as
    + private static class LoggerHandle + { + private static final Logger logger = LoggerFactory.getLogger(InterceptingMonitors.class); + } + + protected static Logger logger() + { + return LoggerHandle.logger; + } + + static class MonitorState { InterceptedMonitorWait waitingOnNotify; @@ -650,8 +662,9 @@ public void preMonitorEnter(Object monitor, float preMonitorDelayChance) if (!(anyThread instanceof InterceptibleThread)) return; - boolean restoreInterrupt = false; + // save any interrupt before testing random.decide, in case we are trapping these for verification InterceptibleThread thread = (InterceptibleThread) anyThread; + boolean restoreInterrupt = Thread.interrupted(); try { if ( !thread.isEvaluationDeterministic() @@ -662,8 +675,6 @@ public void preMonitorEnter(Object monitor, float preMonitorDelayChance) InterceptedConditionWait signal = new InterceptedConditionWait(NEMESIS, 0L, thread, captureWaitSite(thread), null); thread.interceptWait(signal); - // save interrupt state to restore afterwards - new ones only arrive if terminating simulation - restoreInterrupt = Thread.interrupted(); while (true) { try @@ -687,10 +698,7 @@ public void preMonitorEnter(Object monitor, float preMonitorDelayChance) { if (!thread.isIntercepting() && disabled) return; else if (!thread.isIntercepting()) - { throw new AssertionError("Thread " + thread + " is running but is not simulated"); - } - checkForDeadlock(thread, state.heldBy); InterceptedMonitorWait wait = new InterceptedMonitorWait(UNBOUNDED_WAIT, 0L, state, thread, captureWaitSite(thread)); @@ -825,7 +833,7 @@ private void checkForDeadlock(Thread waiting, Thread blockedBy) return; // not really waiting, just hasn't woken up yet if (next == waiting) { - logger.error("Deadlock between {}{} and {}{}", waiting, Threads.prettyPrintStackTrace(waiting, true, ";"), cur, Threads.prettyPrintStackTrace(cur, true, ";")); + logger().error("Deadlock between {}{} and {}{}", waiting, Threads.prettyPrintStackTrace(waiting, true, ";"), cur, Threads.prettyPrintStackTrace(cur, true, ";")); throw failWithOOM(); } cur = next; diff --git a/test/simulator/main/org/apache/cassandra/simulator/systems/InterceptorOfExecution.java b/test/simulator/main/org/apache/cassandra/simulator/systems/InterceptorOfExecution.java index ac8255d506cc..6633e27408d5 100644 --- a/test/simulator/main/org/apache/cassandra/simulator/systems/InterceptorOfExecution.java +++ b/test/simulator/main/org/apache/cassandra/simulator/systems/InterceptorOfExecution.java @@ -18,10 +18,10 @@ package org.apache.cassandra.simulator.systems; -import java.util.concurrent.Callable; import java.util.concurrent.ScheduledFuture; import java.util.function.Function; +import org.apache.cassandra.simulator.systems.InterceptingExecutor.InterceptableScheduledFuture; import org.apache.cassandra.simulator.systems.SimulatedAction.Kind; import org.apache.cassandra.utils.Shared; import org.apache.cassandra.utils.concurrent.RunnableFuture; @@ -38,7 +38,7 @@ public interface InterceptorOfExecution interface InterceptExecution { > T addTask(T task, InterceptingExecutor executor); - ScheduledFuture schedule(Kind kind, long delayNanos, long deadlineNanos, Callable runnable, InterceptingExecutor executor); + ScheduledFuture schedule(Kind kind, long delayNanos, long deadlineNanos, InterceptableScheduledFuture task, InterceptingExecutor executor); Thread start(Kind kind, Function factory, Runnable run); } } diff --git a/test/simulator/main/org/apache/cassandra/simulator/systems/InterceptorOfGlobalMethods.java b/test/simulator/main/org/apache/cassandra/simulator/systems/InterceptorOfGlobalMethods.java index adb8183bffc6..eca4ebb2b436 100644 --- a/test/simulator/main/org/apache/cassandra/simulator/systems/InterceptorOfGlobalMethods.java +++ b/test/simulator/main/org/apache/cassandra/simulator/systems/InterceptorOfGlobalMethods.java @@ -25,6 +25,9 @@ import java.util.function.LongConsumer; import java.util.function.ToIntFunction; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import net.openhft.chronicle.core.util.WeakIdentityHashMap; import org.apache.cassandra.simulator.systems.InterceptedWait.CaptureSites; import org.apache.cassandra.utils.Clock; @@ -377,6 +380,16 @@ public void close() @SuppressWarnings("unused") public static class Global { + private static class LoggerHandle + { + private static final Logger logger = LoggerFactory.getLogger(Global.class); + } + + private static Logger logger() + { + return LoggerHandle.logger; + } + private static InterceptorOfGlobalMethods methods; public static WaitQueue newWaitQueue() @@ -426,8 +439,7 @@ public static InterceptibleThread ifIntercepted() public static void uncaughtException(Thread thread, Throwable throwable) { - System.err.println(thread); - throwable.printStackTrace(System.err); + logger().error("Exception in thread {}", thread, throwable); methods.uncaughtException(thread, throwable); } diff --git a/test/simulator/main/org/apache/cassandra/simulator/systems/NotInterceptedSyncCondition.java b/test/simulator/main/org/apache/cassandra/simulator/systems/NotInterceptedSyncCondition.java index 6c53a4e981f6..04a72b695c58 100644 --- a/test/simulator/main/org/apache/cassandra/simulator/systems/NotInterceptedSyncCondition.java +++ b/test/simulator/main/org/apache/cassandra/simulator/systems/NotInterceptedSyncCondition.java @@ -64,7 +64,10 @@ private static boolean notInterceptedWaitUntil(Object monitor, long deadlineNano if (wait <= 0) return false; - monitor.wait((wait + 999999) / 1000000); + wait = (wait + 999999) / 1000000; + if (wait < 0) + wait = Long.MAX_VALUE; + monitor.wait(wait); return true; } } diff --git a/test/simulator/main/org/apache/cassandra/simulator/systems/SimulatedAction.java b/test/simulator/main/org/apache/cassandra/simulator/systems/SimulatedAction.java index 0cf60dfb5e61..b30626136b29 100644 --- a/test/simulator/main/org/apache/cassandra/simulator/systems/SimulatedAction.java +++ b/test/simulator/main/org/apache/cassandra/simulator/systems/SimulatedAction.java @@ -32,7 +32,7 @@ import org.apache.cassandra.concurrent.ImmediateExecutor; import org.apache.cassandra.distributed.api.IInvokableInstance; import org.apache.cassandra.distributed.api.IMessage; -import org.apache.cassandra.exceptions.RequestFailureReason; +import org.apache.cassandra.exceptions.RequestFailure; import org.apache.cassandra.locator.InetAddressAndPort; import org.apache.cassandra.net.RequestCallback; import org.apache.cassandra.net.RequestCallbacks; @@ -242,7 +242,8 @@ private ActionList simulate(Runnable simulate) } catch (Throwable t) { - consequences.forEach(Action::invalidate); + if (consequences != null) + consequences.forEach(Action::invalidate); throw t; } @@ -384,18 +385,18 @@ List applyToMessage(IInvokableInstance from, IInvokableInstance to, IMes notify = from; } boolean isTimeout = deliver != FAILURE; - Executor callbackExecutor = notify.executorFor(verb.id); - if (callbackExecutor instanceof ImmediateExecutor) - callbackExecutor = to.executor(); + Executor notifierExecutor = notify.executorFor(verb.id); + if (notifierExecutor instanceof ImmediateExecutor) + notifierExecutor = notify.executor(); InterceptedExecution.InterceptedTaskExecution failTask = new InterceptedRunnableExecution( - (InterceptingExecutor) callbackExecutor, + (InterceptingExecutor) notifierExecutor, () -> notify.unsafeApplyOnThisThread((socketAddress, id, innerIsTimeout) -> { InetAddressAndPort address = InetAddressAndPort.getByAddress(socketAddress); RequestCallbacks.CallbackInfo callback = instance().callbacks.remove(id, address); if (callback != null) { RequestCallback invokeOn = (RequestCallback) callback.callback; - RequestFailureReason reason = innerIsTimeout ? RequestFailureReason.TIMEOUT : RequestFailureReason.UNKNOWN; + RequestFailure reason = innerIsTimeout ? RequestFailure.TIMEOUT : RequestFailure.UNKNOWN; invokeOn.onFailure(address, reason); } return null; diff --git a/test/simulator/main/org/apache/cassandra/simulator/systems/SimulatedExecution.java b/test/simulator/main/org/apache/cassandra/simulator/systems/SimulatedExecution.java index bb6387908e3f..a56aa84ea6ec 100644 --- a/test/simulator/main/org/apache/cassandra/simulator/systems/SimulatedExecution.java +++ b/test/simulator/main/org/apache/cassandra/simulator/systems/SimulatedExecution.java @@ -29,7 +29,7 @@ import org.apache.cassandra.simulator.OrderOn; import org.apache.cassandra.simulator.systems.InterceptedExecution.InterceptedFutureTaskExecution; import org.apache.cassandra.simulator.systems.InterceptedExecution.InterceptedThreadStart; -import org.apache.cassandra.simulator.systems.InterceptingExecutor.InterceptedScheduledFutureTask; +import org.apache.cassandra.simulator.systems.InterceptingExecutor.InterceptableScheduledFuture; import org.apache.cassandra.utils.concurrent.Condition; import org.apache.cassandra.utils.concurrent.NotScheduledFuture; import org.apache.cassandra.utils.concurrent.RunnableFuture; @@ -93,7 +93,7 @@ public void run() return task; } - public ScheduledFuture schedule(SimulatedAction.Kind kind, long delayNanos, long deadlineNanos, Callable runnable, InterceptingExecutor executor) + public ScheduledFuture schedule(SimulatedAction.Kind kind, long delayNanos, long deadlineNanos, InterceptableScheduledFuture task, InterceptingExecutor executor) { return new NotScheduledFuture<>(); } @@ -135,10 +135,9 @@ public > T addTask(T task, InterceptingExecutor e return task; } - public ScheduledFuture schedule(SimulatedAction.Kind kind, long delayNanos, long deadlineNanos, Callable call, InterceptingExecutor executor) + public ScheduledFuture schedule(SimulatedAction.Kind kind, long delayNanos, long deadlineNanos, InterceptableScheduledFuture task, InterceptingExecutor executor) { assert kind == SCHEDULED_TASK || kind == SCHEDULED_TIMEOUT || kind == SCHEDULED_DAEMON; - InterceptedScheduledFutureTask task = new InterceptedScheduledFutureTask<>(delayNanos, call); InterceptedFutureTaskExecution intercepted = new InterceptedFutureTaskExecution<>(kind, executor, task, deadlineNanos); task.onCancel(intercepted::cancel); intercept.interceptExecution(intercepted, executor.orderAppliesAfterScheduling()); diff --git a/test/simulator/main/org/apache/cassandra/simulator/systems/SimulatedFailureDetector.java b/test/simulator/main/org/apache/cassandra/simulator/systems/SimulatedFailureDetector.java index 6bdc74c1d651..52bd7b6cf586 100644 --- a/test/simulator/main/org/apache/cassandra/simulator/systems/SimulatedFailureDetector.java +++ b/test/simulator/main/org/apache/cassandra/simulator/systems/SimulatedFailureDetector.java @@ -19,12 +19,12 @@ package org.apache.cassandra.simulator.systems; import java.net.InetSocketAddress; -import java.util.Collections; -import java.util.IdentityHashMap; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.CopyOnWriteArrayList; +import java.util.concurrent.CopyOnWriteArraySet; import java.util.function.Consumer; import java.util.function.Function; @@ -41,7 +41,7 @@ public static class Instance implements IFailureDetector private static volatile FailureDetector wrapped; private static volatile Function OVERRIDE; - private static final Map LISTENERS = Collections.synchronizedMap(new IdentityHashMap<>()); + private static final Set LISTENERS = new CopyOnWriteArraySet<>(); private static FailureDetector wrapped() { @@ -92,7 +92,7 @@ public void forceConviction(InetAddressAndPort ep) public void registerFailureDetectionEventListener(IFailureDetectionEventListener listener) { - LISTENERS.put(listener, Boolean.TRUE); + LISTENERS.add(listener); } public void unregisterFailureDetectionEventListener(IFailureDetectionEventListener listener) @@ -103,7 +103,7 @@ public void unregisterFailureDetectionEventListener(IFailureDetectionEventListen synchronized static void setup(Function override, Consumer> register) { OVERRIDE = override; - register.accept(ep -> LISTENERS.keySet().forEach(c -> c.convict(InetAddressAndPort.getByAddress(ep), Double.MAX_VALUE))); + register.accept(ep -> LISTENERS.forEach(c -> c.convict(InetAddressAndPort.getByAddress(ep), Double.MAX_VALUE))); } } diff --git a/test/simulator/main/org/apache/cassandra/simulator/systems/SimulatedQuery.java b/test/simulator/main/org/apache/cassandra/simulator/systems/SimulatedQuery.java index d190fd7a1530..106cd8c027cf 100644 --- a/test/simulator/main/org/apache/cassandra/simulator/systems/SimulatedQuery.java +++ b/test/simulator/main/org/apache/cassandra/simulator/systems/SimulatedQuery.java @@ -20,9 +20,10 @@ import org.apache.cassandra.distributed.api.ConsistencyLevel; import org.apache.cassandra.distributed.api.IInvokableInstance; +import org.apache.cassandra.distributed.api.SimpleQueryResult; import org.apache.cassandra.distributed.impl.Query; -public class SimulatedQuery extends SimulatedActionCallable +public class SimulatedQuery extends SimulatedActionCallable { public SimulatedQuery(Object description, SimulatedSystems simulated, IInvokableInstance instance, String query, ConsistencyLevel commitConsistency, ConsistencyLevel serialConsistency, Object... params) { @@ -45,7 +46,7 @@ public SimulatedQuery(Object description, Modifiers self, Modifiers transitive, } @Override - public void accept(Object[][] success, Throwable failure) + public void accept(SimpleQueryResult success, Throwable failure) { if (failure != null) simulated.failures.accept(failure); diff --git a/test/simulator/main/org/apache/cassandra/simulator/systems/SimulatedSnitch.java b/test/simulator/main/org/apache/cassandra/simulator/systems/SimulatedSnitch.java index 55fe73c301f4..55495562982e 100644 --- a/test/simulator/main/org/apache/cassandra/simulator/systems/SimulatedSnitch.java +++ b/test/simulator/main/org/apache/cassandra/simulator/systems/SimulatedSnitch.java @@ -28,11 +28,13 @@ import org.apache.cassandra.distributed.Cluster; import org.apache.cassandra.distributed.api.IInstanceConfig; +import org.apache.cassandra.locator.Endpoint; import org.apache.cassandra.locator.IEndpointSnitch; import org.apache.cassandra.locator.InetAddressAndPort; import org.apache.cassandra.locator.Replica; import org.apache.cassandra.locator.ReplicaCollection; import org.apache.cassandra.simulator.cluster.NodeLookup; +import org.apache.cassandra.utils.Sortable; public class SimulatedSnitch extends NodeLookup { @@ -73,6 +75,18 @@ public static void setup(Function lookupDc) { LOOKUP_DC = lookupDc; } + + @Override + public boolean supportCompareByEndpoint() + { + return true; + } + + @Override + public > Comparator endpointComparator(InetAddressAndPort address, C addresses) + { + return Comparator.comparingInt(SimulatedSnitch::asInt); + } } final int[] numInDcs; @@ -127,7 +141,7 @@ public List dcs() return Arrays.asList(nameOfDcs); } - private static int asInt(Replica address) + private static int asInt(Endpoint address) { byte[] bytes = address.endpoint().addressBytes; return bytes[0] | (bytes[1] << 8) | (bytes[2] << 16) | (bytes[3] << 24); diff --git a/test/simulator/test/org/apache/cassandra/simulator/paxos/AbstractPairOfSequencesPaxosSimulationTest.java b/test/simulator/test/org/apache/cassandra/simulator/paxos/AbstractPairOfSequencesPaxosSimulationTest.java new file mode 100644 index 000000000000..32af5717484a --- /dev/null +++ b/test/simulator/test/org/apache/cassandra/simulator/paxos/AbstractPairOfSequencesPaxosSimulationTest.java @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.simulator.paxos; + +import java.util.Collections; +import java.util.List; + +import org.junit.Test; + +import org.apache.cassandra.distributed.api.LogAction; +import org.apache.cassandra.distributed.api.LogResult; +import org.assertj.core.api.Assertions; +import org.mockito.Mockito; + +public class AbstractPairOfSequencesPaxosSimulationTest +{ + @Test + public void parseSuccess() + { + String log = "WARN [AccordExecutor[1,0]:1] node1 2024-12-03 17:45:24,574 [10,1577851211987004,9(RX),1]: Exception coordinating ExclusiveSyncPoint for [1b255f4d-ef25-40a6-0000-000000000009:[(-2978380553567688022,-2930342157542402732]]] durability. Increased numberOfSplits to 256\n" + + "accord.coordinate.Invalidated: null\n" + + "\tat accord.coordinate.Propose$Invalidate.lambda$proposeAndCommitInvalidate$2(Propose.java:193)\n" + + "\tat accord.local.Node.withEpoch(Node.java:391)\n" + + "\tat accord.coordinate.Propose$Invalidate.lambda$proposeAndCommitInvalidate$3(Propose.java:186)\n" + + "\tat accord.coordinate.Propose$Invalidate.onSuccess(Propose.java:217)\n" + + "\tat accord.coordinate.Propose$Invalidate.onSuccess(Propose.java:146)\n" + + "\tat accord.impl.RequestCallbacks$CallbackStripe$RegisteredCallback.unsafeOnSuccess(RequestCallbacks.java:119)\n" + + "\tat accord.impl.RequestCallbacks$CallbackStripe.lambda$onSuccess$0(RequestCallbacks.java:189)\n" + + "\tat accord.impl.RequestCallbacks$CallbackStripe$RegisteredCallback.lambda$safeInvoke$0(RequestCallbacks.java:140)\n" + + "\tat java.base/java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:515)\n" + + "\tat accord.utils.async.AsyncChains.lambda$encapsulate$0(AsyncChains.java:498)\n" + + "\tat org.apache.cassandra.service.accord.AccordExecutor$PlainRunnable.run(AccordExecutor.java:989)\n" + + "\tat org.apache.cassandra.service.accord.AccordExecutor$CommandStoreQueue.run(AccordExecutor.java:729)\n" + + "\tat org.apache.cassandra.service.accord.AccordExecutorSimple.run(AccordExecutorSimple.java:95)\n" + + "\tat org.apache.cassandra.concurrent.FutureTask$2.call(FutureTask.java:124)\n" + + "\tat org.apache.cassandra.concurrent.SyncFutureTask.run(SyncFutureTask.java:68)\n" + + "\tat org.apache.cassandra.simulator.systems.InterceptingExecutor$AbstractSingleThreadedExecutorPlus.lambda$new$0(InterceptingExecutor.java:585)\n" + + "\tat io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30)\n" + + "\tat java.base/java.lang.Thread.run(Thread.java:829)"; + LogResult> errors = new LogAction.BasicLogResult<>(42, Collections.singletonList(log)); + + AbstractPairOfSequencesPaxosSimulation simulation = Mockito.mock(AbstractPairOfSequencesPaxosSimulation.class); + Mockito.doCallRealMethod().when(simulation).checkErrorLogs(Mockito.eq(0), Mockito.eq(errors)); + Mockito.when(simulation.expectedExceptions()).thenReturn((Class[]) new Class[] { accord.coordinate.Invalidated.class }); + + simulation.checkErrorLogs(0, errors); + } + + @Test + public void parseFailure() + { + String log = "FAKE [AccordExecutor[1,0]:1] node1 2024-12-03 17:45:24,574 [10,1577851211987004,9(RX),1]: Exception coordinating ExclusiveSyncPoint for [1b255f4d-ef25-40a6-0000-000000000009:[(-2978380553567688022,-2930342157542402732]]] durability. Increased numberOfSplits to 256\n" + + "accord.coordinate.Invalidated: null\n" + + "\tat accord.coordinate.Propose$Invalidate.lambda$proposeAndCommitInvalidate$2(Propose.java:193)\n" + + "\tat accord.local.Node.withEpoch(Node.java:391)\n" + + "\tat accord.coordinate.Propose$Invalidate.lambda$proposeAndCommitInvalidate$3(Propose.java:186)\n" + + "\tat accord.coordinate.Propose$Invalidate.onSuccess(Propose.java:217)\n" + + "\tat accord.coordinate.Propose$Invalidate.onSuccess(Propose.java:146)\n" + + "\tat accord.impl.RequestCallbacks$CallbackStripe$RegisteredCallback.unsafeOnSuccess(RequestCallbacks.java:119)\n" + + "\tat accord.impl.RequestCallbacks$CallbackStripe.lambda$onSuccess$0(RequestCallbacks.java:189)\n" + + "\tat accord.impl.RequestCallbacks$CallbackStripe$RegisteredCallback.lambda$safeInvoke$0(RequestCallbacks.java:140)\n" + + "\tat java.base/java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:515)\n" + + "\tat accord.utils.async.AsyncChains.lambda$encapsulate$0(AsyncChains.java:498)\n" + + "\tat org.apache.cassandra.service.accord.AccordExecutor$PlainRunnable.run(AccordExecutor.java:989)\n" + + "\tat org.apache.cassandra.service.accord.AccordExecutor$CommandStoreQueue.run(AccordExecutor.java:729)\n" + + "\tat org.apache.cassandra.service.accord.AccordExecutorSimple.run(AccordExecutorSimple.java:95)\n" + + "\tat org.apache.cassandra.concurrent.FutureTask$2.call(FutureTask.java:124)\n" + + "\tat org.apache.cassandra.concurrent.SyncFutureTask.run(SyncFutureTask.java:68)\n" + + "\tat org.apache.cassandra.simulator.systems.InterceptingExecutor$AbstractSingleThreadedExecutorPlus.lambda$new$0(InterceptingExecutor.java:585)\n" + + "\tat io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30)\n" + + "\tat java.base/java.lang.Thread.run(Thread.java:829)"; + LogResult> errors = new LogAction.BasicLogResult<>(42, Collections.singletonList(log)); + + AbstractPairOfSequencesPaxosSimulation simulation = Mockito.mock(AbstractPairOfSequencesPaxosSimulation.class); + Mockito.doCallRealMethod().when(simulation).checkErrorLogs(Mockito.eq(0), Mockito.eq(errors)); + Mockito.when(simulation.expectedExceptions()).thenReturn((Class[]) new Class[] { accord.coordinate.Invalidated.class }); + + Assertions.assertThatThrownBy(() -> simulation.checkErrorLogs(0, errors)) + .isInstanceOf(AssertionError.class) + .hasMessageStartingWith("Saw errors in node0: Unexpected exception (could not parse line):"); + } +} \ No newline at end of file diff --git a/test/simulator/test/org/apache/cassandra/simulator/paxos/HistoryValidatorTest.java b/test/simulator/test/org/apache/cassandra/simulator/paxos/HistoryValidatorTest.java new file mode 100644 index 000000000000..a52345d0807d --- /dev/null +++ b/test/simulator/test/org/apache/cassandra/simulator/paxos/HistoryValidatorTest.java @@ -0,0 +1,704 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.simulator.paxos; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.EnumSet; +import java.util.List; +import java.util.Random; +import java.util.function.Consumer; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.util.stream.IntStream; +import java.util.stream.Stream; + +import org.apache.cassandra.config.CassandraRelevantProperties; +import org.junit.Assume; +import org.junit.FixMethodOrder; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.MethodSorters; +import org.junit.runners.Parameterized; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.carrotsearch.hppc.IntHashSet; +import com.carrotsearch.hppc.IntIntHashMap; +import com.carrotsearch.hppc.IntIntMap; +import com.carrotsearch.hppc.IntSet; +import com.carrotsearch.hppc.cursors.IntCursor; +import org.apache.cassandra.distributed.api.QueryResults; +import org.apache.cassandra.utils.Clock; +import org.assertj.core.api.AbstractThrowableAssert; +import org.assertj.core.api.Assertions; + +import static org.apache.commons.lang3.ArrayUtils.add; +import static org.apache.commons.lang3.ArrayUtils.swap; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +/** + * Notes: + * * anomalyDirtyRead was left out as Accord doesn't reject requests, so without a way to reject or abort + * requests then client doesn't have any way to abserve a REJECT, so all issues are UNKNOWN. + * + */ +@RunWith(Parameterized.class) +@FixMethodOrder(MethodSorters.NAME_ASCENDING) // since Random is used, make sure tests run in a determanistic order +public class HistoryValidatorTest +{ + private static final Logger logger = LoggerFactory.getLogger(HistoryValidatorTest.class); + private static final Random RANDOM = random(); + private static final int[] PARTITIONS = IntStream.range(0, 10).toArray(); + private static final int x = 1; + private static final int y = 2; + + @Parameterized.Parameters(name = "{0}") + public static Collection data() + { + List tests = new ArrayList<>(); + tests.add(test(LinearizabilityValidator.Factory.instance)); + tests.add(test(StrictSerializabilityValidator.Factory.instance)); + return tests; + } + + private static Object[] test(HistoryValidator.Factory factory) + { + return new Object[]{ factory }; + } + + private final HistoryValidator.Factory factory; + + public HistoryValidatorTest(HistoryValidator.Factory factory) + { + this.factory = factory; + } + + @Test + public void orderingWithWriteTimeout() + { + IntSet timeoutEvents = set(4, 17, 83); + for (boolean reject : Arrays.asList(true, false)) + { + HistoryValidator validator = create(); + + int logicalClock = 1; + int[] seq = seq(); + for (int eventId = 0; eventId < 100; eventId++) + { + if (timeoutEvents.contains(eventId)) + { + if (!reject) + seq = add(seq, eventId); // wastn't observed, but was applied + continue; + } + single(validator, ob(eventId, ++logicalClock, ++logicalClock), 1, seq, true); + seq = add(seq, eventId); //TODO forgot to add this and LinearizabilityValidator was success... should reject! + } + } + } + + /** + * This test differs from {@link #orderingWithWriteTimeout} as it defines event orders assuming + * requests were concurrent, so may happen in different orderings. + *

    + * This means that we may see the results out of order, but the sequence/count ordering will remain + */ + @Test + public void orderingWithWriteTimeoutWithConcurrency() + { + IntSet timeoutEvents = set(4, 17, 83); + for (boolean reject : Arrays.asList(true, false)) + { + HistoryValidator validator = create(); + // Since the requests are "concurrent" the window in which the operations happened are between start=1 and + // end=responseOrder. + int start = 1; + int logicalClock = start; + + // 'ordering' is the order in which the txns are applied + // 'indexOrder' is the order in which the events are seen; since requests are "concurrent" the order we + // validate may differ from the ordering they were applied. + int[] ordering = IntStream.range(0, 100).toArray(); + if (reject) + ordering = IntStream.of(ordering).filter(i -> !timeoutEvents.contains(i)).toArray(); + shuffle(ordering); + int[] indexOrder = IntStream.range(0, ordering.length - 1).toArray(); + shuffle(indexOrder); + for (int i = 0; i < indexOrder.length; i++) + { + int idx = indexOrder[i]; + int eventId = ordering[idx]; + if (timeoutEvents.contains(eventId)) + continue; + int[] seq = Arrays.copyOf(ordering, idx); + single(validator, ob(eventId, start, ++logicalClock), 1, seq, true); + } + } + } + + @Test + public void anomalyNonMonotonicRead() + { + // Session1: w[x=10] -> Session2: r[x=10] -> r[x=0] + test(dsl -> { + dsl.txn(writeOnly(x)); + dsl.txn(readOnly(x, seq(0))); + dsl.failingTxn(readOnly(x, seq())).isInstanceOf(HistoryViolation.class); + }); + } + + @Test + public void anomalyNonMonotonicWrite() + { + requiresMultiKeySupport(); + // Session1: w[x=10] -> w[y=10] -> Session2: r[y=10] -> r[x=0] + test(dsl -> { + dsl.txn(writeOnly(x)); + dsl.txn(writeOnly(y)); + dsl.txn(readOnly(y, seq(1))); + dsl.failingTxn(readOnly(x, seq())).isInstanceOf(HistoryViolation.class); + }); + } + + @Test + public void anomalyNonMonotonicTransaction() + { + // Session1: r[x=5] -> w[y=10] -> Session2: r[y=10] -> r[x=0] + requiresMultiKeySupport(); + test(dsl -> { + dsl.txn(writeOnly(x), writeOnly(y)); + + dsl.txn(readOnly(x, seq(0))); + dsl.txn(writeOnly(y)); + dsl.txn(readOnly(y, seq(0, 2))); + + dsl.failingTxn(readOnly(x, seq())).isInstanceOf(HistoryViolation.class); + }); + } + + @Test + public void anomalyReadYourOwnWrites() + { + // This test is kinda a duplicate; here just for completness + // w[x=12] -> r[x=8] + test(dsl -> { + dsl.txn(writeOnly(x)); + dsl.failingTxn(readOnly(x, seq())).isInstanceOf(HistoryViolation.class); + }); + } + + //TODO write skew + @Test + public void anomalyReadSkew() + { + requiresMultiKeySupport(); + // two different txn are involved to make this happen + // x=0, y=0 + // U1: starts + // U2: starts + // U1: r[x=0] + // U2: w[x=5], w[y=5] + // U2: commit + // U1: r[y=5] + // U1: commit + HistoryValidator validator = create(); + + // init + txn(validator, ob(0, 1, 2), writeOnly(x), writeOnly(y)); + int u1 = 1, u1_start = 3, u1_end = 6; + int u2 = 2, u2_start = 4, u2_end = 5; + txn(validator, ob(u2, u2_start, u2_end), readWrite(x, seq(0)), readWrite(y, seq(0))); + Assertions.assertThatThrownBy(() -> txn(validator, ob(u1, u1_start, u1_end), readWrite(x, seq(0)), readWrite(y, seq(0, u2)))) + .isInstanceOf(HistoryViolation.class); + } + + @Test + public void anomalyWriteSkew() + { + // two different txn are involved to make this happen + // x=0, y=0 + // U1: starts + // U2: starts + // U1: r[x=0] + } + + @Test + public void seenBehavior() + { + fromLog("Witness(start=4, end=7)\n" + + "\tread(pk=121901541, id=2, count=0, seq=[])\n" + + "\twrite(pk=121901541, id=2, success=true)\n" + + + "Witness(start=3, end=8)\n" + + "\tread(pk=122950117, id=0, count=0, seq=[])\n" + + "\twrite(pk=122950117, id=0, success=true)\n" + + "\twrite(pk=119804389, id=0, success=true)\n" + + + "Witness(start=5, end=9)\n" + + "\tread(pk=121901541, id=3, count=1, seq=[2])\n" + + "\twrite(pk=121901541, id=3, success=true)\n" + + + "Witness(start=2, end=10)\n" + + "\twrite(pk=122950117, id=1, success=true)\n" + + "\twrite(pk=119804389, id=1, success=true)\n" + + + "Witness(start=6, end=11)\n" + + "\tread(pk=121901541, id=4, count=2, seq=[2, 3])\n" + + "\twrite(pk=121901541, id=4, success=true)\n" + + + "Witness(start=12, end=14)\n" + + "\twrite(pk=121901541, id=5, success=true)\n" + + + "Witness(start=13, end=16)\n" + + "\tread(pk=119804389, id=6, count=2, seq=[0, 1])\n" + + "\twrite(pk=119804389, id=6, success=true)\n" + + "\twrite(pk=122950117, id=6, success=true)\n" + + + "Witness(start=15, end=18)\n" + + "\tread(pk=121901541, id=7, count=4, seq=[2, 3, 4, 5])\n" + + "\twrite(pk=121901541, id=7, success=true)\n" + + + "Witness(start=17, end=20)\n" + + "\tread(pk=119804389, id=8, count=3, seq=[0, 1, 6])\n" + + "\twrite(pk=119804389, id=8, success=true)\n" + + "\twrite(pk=122950117, id=8, success=true)\n" // this partition is what triggers + ); + } + + private static String trim(String log, int... keys) + { + // this is deaad code, but exists to help when new validation errors are detected + // the logic will shrink the history to only contain transactions that contain the set of keys + IntSet set = new IntHashSet(); + IntStream.of(keys).forEach(set::add); + Parsed parsed = parse(log); + StringBuilder sb = new StringBuilder(); + for (Witness w : parsed.witnesses) + { + boolean match = false; + for (IntCursor pk : w.pks()) + { + if (set.contains(pk.value)) + { + match = true; + break; + } + } + if (!match) continue; + sb.append(w).append("\n"); + } + return sb.toString(); + } + + + private void requiresMultiKeySupport() + { + Assume.assumeTrue("Validator " + factory.getClass() + " does not support multi-key", factory instanceof StrictSerializabilityValidator.Factory); + } + + private int[] shuffle(int[] ordering) + { + // shuffle array + for (int i = ordering.length; i > 1; i--) + swap(ordering, i - 1, RANDOM.nextInt(i)); + return ordering; + } + + private static void txn(HistoryValidator validator, Observation ob, Event... events) + { + String type = events.length == 1 ? "single" : "multiple"; + logger.info("[Validator={}, Observation=({}, {}, {})] Validating {} {}}", validator.getClass().getSimpleName(), ob.id, ob.start, ob.end, type, events); + try (HistoryValidator.Checker check = validator.witness(ob.start, ob.end)) + { + for (Event e : events) + e.process(ob, check); + } + } + + private static void single(HistoryValidator validator, Observation ob, int pk, int[] seq, boolean hasWrite) + { + txn(validator, ob, hasWrite ? readWrite(pk, seq) : readOnly(pk, seq)); + } + + private static Observation ob(int id, int start, int end) + { + // why empty result? The users don't actually check the result's data, just existence + return new Observation(id, QueryResults.empty(), start, end); + } + + private static int[] seq(int... seq) + { + return seq; + } + + private HistoryValidator create() + { + return factory.create(PARTITIONS); + } + + private static IntSet set(int... values) + { + IntSet set = new IntHashSet(values.length); + for (int v : values) + set.add(v); + return set; + } + + private static Random random() + { + long seed = Long.parseLong(CassandraRelevantProperties.TEST_SEED.getString(Long.toString(Clock.Global.nanoTime()))); + logger.info("Random seed={}; set -Dcassandra.test.seed={} while reruning the tests to get the same order", seed, seed); + return new Random(seed); + } + + private static Event readWrite(int pk, int[] seq) + { + return new Event(EnumSet.of(Event.Type.READ, Event.Type.WRITE), pk, seq); + } + + private static Event readOnly(int pk, int[] seq) + { + return new Event(EnumSet.of(Event.Type.READ), pk, seq); + } + + private static Event writeOnly(int pk) + { + return new Event(EnumSet.of(Event.Type.WRITE), pk, null); + } + + private interface Operation + { + int pk(); + void check(HistoryValidator.Checker check); + void appendString(StringBuilder sb); + } + + private static class Read implements Operation + { + final int pk, id, count; + final int[] seq; + + Read(int pk, int id, int count, int[] seq) + { + this.pk = pk; + this.id = id; + this.count = count; + this.seq = seq; + } + + @Override + public int pk() + { + return pk; + } + + @Override + public void check(HistoryValidator.Checker check) + { + check.read(pk, id, count, seq); + } + + @Override + public void appendString(StringBuilder sb) + { + sb.append("read(pk=").append(pk).append(", id=").append(id).append(", count=").append(count).append(", seq=").append(Arrays.toString(seq)).append(")\n"); + } + } + + private static class Write implements Operation + { + final int pk, id; + final boolean success; + + Write(int pk, int id, boolean success) + { + this.pk = pk; + this.id = id; + this.success = success; + } + + @Override + public int pk() + { + return pk; + } + + @Override + public void check(HistoryValidator.Checker check) + { + check.write(pk, id, success); + } + + @Override + public void appendString(StringBuilder sb) + { + sb.append("write(pk=").append(pk).append(", id=").append(id).append(", success=").append(success).append(")\n"); + } + } + + private static class Witness + { + final int start, end; + final List actions = new ArrayList<>(); + + Witness(int start, int end) + { + this.start = start; + this.end = end; + } + + void read(int pk, int id, int count, int[] seq) + { + actions.add(new Read(pk, id, count, seq)); + } + + void write(int pk, int id, boolean success) + { + actions.add(new Write(pk, id, success)); + } + + void process(HistoryValidator validator) + { + try (HistoryValidator.Checker check = validator.witness(start, end)) + { + for (Operation a : actions) + a.check(check); + } + } + + IntSet pks() + { + IntSet pks = new IntHashSet(); + for (Operation action : actions) + pks.add(action.pk()); + return pks; + } + + @Override + public String toString() + { + StringBuilder sb = new StringBuilder(); + sb.append("Witness(start=").append(start).append(", end=").append(end).append(")\n"); + for (Operation a : actions) + a.appendString(sb.append('\t')); + return sb.toString(); + } + } + + private static class Parsed + { + private final int[] keys; + private final List witnesses; + + private Parsed(int[] keys, List witnesses) + { + this.keys = keys; + this.witnesses = witnesses; + } + } + + private static Parsed parse(String log) + { + IntSet pks = new IntHashSet(); + List witnesses = new ArrayList<>(); + Witness current = null; + for (String line : log.split("\n")) + { + if (line.trim().isEmpty()) + continue; + if (line.startsWith("Witness")) + { + if (current != null) + witnesses.add(current); + Matcher matcher = Pattern.compile("Witness\\(start=(.+), end=(.+)\\)").matcher(line); + if (!matcher.find()) throw new AssertionError("Unable to match start/end of " + line); + current = new Witness(Integer.parseInt(matcher.group(1)), Integer.parseInt(matcher.group(2))); + } + else if (line.startsWith("\tread")) + { + Matcher matcher = Pattern.compile("\tread\\(pk=(.+), id=(.+), count=(.+), seq=\\[(.*)\\]\\)").matcher(line); + if (!matcher.find()) throw new AssertionError("Unable to match read of " + line); + int pk = Integer.parseInt(matcher.group(1)); + pks.add(pk); + int id = Integer.parseInt(matcher.group(2)); + int count = Integer.parseInt(matcher.group(3)); + String seqStr = matcher.group(4); + int[] seq = seqStr.isEmpty() ? new int[0] : Stream.of(seqStr.split(",")).map(String::trim).mapToInt(Integer::parseInt).toArray(); + current.read(pk, id, count, seq); + } + else if (line.startsWith("\twrite")) + { + Matcher matcher = Pattern.compile("\twrite\\(pk=(.+), id=(.+), success=(.+)\\)").matcher(line); + if (!matcher.find()) throw new AssertionError("Unable to match write of " + line); + int pk = Integer.parseInt(matcher.group(1)); + pks.add(pk); + int id = Integer.parseInt(matcher.group(2)); + boolean success = Boolean.parseBoolean(matcher.group(3)); + current.write(pk, id, success); + } + else + { + throw new IllegalArgumentException("Unknow line: " + line); + } + } + if (current != null) + witnesses.add(current); + int[] keys = pks.toArray(); + Arrays.sort(keys); + return new Parsed(keys, witnesses); + } + + private void fromLog(String log) + { + Parsed parsed = parse(log); + HistoryValidator validator = factory.create(parsed.keys); + for (Witness w : parsed.witnesses) + { + try + { + w.process(validator); + } + catch (HistoryViolation e) + { + HistoryViolation hv = new HistoryViolation(e.primaryKey, "Violation detected for witnessed action " + w + "; " + e.getMessage() + ";\n" + log); + hv.setStackTrace(e.getStackTrace()); + throw hv; + } + } + } + + private static class Event + { + enum Type + {READ, WRITE} + + ; + private final EnumSet types; + private final int pk; + private final int[] seq; + + private Event(EnumSet types, int pk, int[] seq) + { + this.types = types; + this.pk = pk; + this.seq = seq; + } + + private void process(Observation ob, HistoryValidator.Checker check) + { + if (types.contains(Type.READ)) + check.read(pk, ob.id, seq.length, seq); + if (types.contains(Type.WRITE)) + check.write(pk, ob.id, ob.isSuccess()); + } + } + + private interface TestDSL + { + void txn(Event... events); + + AbstractThrowableAssert failingTxn(Event... events); + } + + private static boolean supportMultiKey(HistoryValidator validator) + { + return validator instanceof StrictSerializabilityValidator; + } + + private void test(Consumer fn) + { + HistoryValidator validator = create(); + boolean global = supportMultiKey(validator); + EventIdGen eventIdGen = global ? new AllPks() : new PerPk(); + TestDSL dsl = new TestDSL() + { + int logicalClock = 0; + + @Override + public void txn(Event... events) + { + if (global) + { + int eventId = eventIdGen.next(); + HistoryValidatorTest.txn(validator, ob(eventId, ++logicalClock, ++logicalClock), events); + } + else + { + for (Event e : events) + { + int eventId = eventIdGen.next(e.pk); + HistoryValidatorTest.txn(validator, ob(eventId, ++logicalClock, ++logicalClock), e); + } + } + } + + @Override + public AbstractThrowableAssert failingTxn(Event... events) + { + return assertThatThrownBy(() -> txn(events)); + } + }; + fn.accept(dsl); + } + + private interface EventIdGen + { + int next(int pk); + + int next(); + } + + private static class PerPk implements EventIdGen + { + private final IntIntMap map = new IntIntHashMap(); + + @Override + public int next(int pk) + { + int next = !map.containsKey(pk) ? 0 : map.get(pk) + 1; + map.put(pk, next); + return next; + } + + @Override + public int next() + { + throw new UnsupportedOperationException("next without pk not supported"); + } + } + + private static class AllPks implements EventIdGen + { + private int value = 0; + + @Override + public int next(int pk) + { + return next(); + } + + @Override + public int next() + { + return value++; + } + } +} diff --git a/test/simulator/test/org/apache/cassandra/simulator/test/AccordHarrySimulationTest.java b/test/simulator/test/org/apache/cassandra/simulator/test/AccordHarrySimulationTest.java new file mode 100644 index 000000000000..c4dc9f10fc88 --- /dev/null +++ b/test/simulator/test/org/apache/cassandra/simulator/test/AccordHarrySimulationTest.java @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.simulator.test; + +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + +import org.apache.cassandra.distributed.api.ConsistencyLevel; +import org.apache.cassandra.harry.SchemaSpec; +import org.apache.cassandra.harry.gen.Generator; +import org.apache.cassandra.harry.gen.SchemaGenerators; +import org.apache.cassandra.net.Verb; +import org.apache.cassandra.service.consensus.TransactionalMode; +import org.apache.cassandra.simulator.FixedLossNetworkScheduler; +import org.apache.cassandra.simulator.FutureActionScheduler; +import org.apache.cassandra.simulator.RandomSource; +import org.apache.cassandra.simulator.systems.SimulatedTime; +import org.apache.cassandra.simulator.utils.KindOfSequence; + +public class AccordHarrySimulationTest extends HarrySimulatorTest +{ + + @Override + public Map networkSchedulers(int nodes, SimulatedTime time, RandomSource random) + { + + Set extremelyLossy = new HashSet<>(Arrays.asList(Verb.ACCORD_SIMPLE_RSP, Verb.ACCORD_PRE_ACCEPT_RSP, Verb.ACCORD_PRE_ACCEPT_REQ, + Verb.ACCORD_ACCEPT_RSP, Verb.ACCORD_ACCEPT_REQ, Verb.ACCORD_NOT_ACCEPT_REQ, + Verb.ACCORD_READ_RSP, Verb.ACCORD_READ_REQ, Verb.ACCORD_COMMIT_REQ, + Verb.ACCORD_COMMIT_INVALIDATE_REQ, Verb.ACCORD_APPLY_RSP, Verb.ACCORD_APPLY_REQ, + Verb.ACCORD_BEGIN_RECOVER_RSP, Verb.ACCORD_BEGIN_RECOVER_REQ, Verb.ACCORD_BEGIN_INVALIDATE_RSP)); + + Set somewhatLossy = new HashSet<>(Arrays.asList(Verb.ACCORD_SYNC_NOTIFY_RSP, Verb.ACCORD_SYNC_NOTIFY_REQ, Verb.ACCORD_APPLY_AND_WAIT_REQ, + Verb.ACCORD_FETCH_TOPOLOGY_RSP, Verb.ACCORD_FETCH_TOPOLOGY_REQ)); + + Map schedulers = new HashMap<>(); + for (Verb verb : Verb.values()) + { + if (extremelyLossy.contains(verb)) + schedulers.put(verb, new FixedLossNetworkScheduler(nodes, random, time, KindOfSequence.UNIFORM, .15f, .20f)); + else if (somewhatLossy.contains(verb)) + schedulers.put(verb, new FixedLossNetworkScheduler(nodes, random, time, KindOfSequence.UNIFORM, .1f, .15f)); + } + return schedulers; + } + + protected ConsistencyLevel validateQueryConsistency() + { + return ConsistencyLevel.QUORUM; + } + + public Generator schemaSpecGen(String keyspace, String prefix) + { + return SchemaGenerators.schemaSpecGen(keyspace, prefix, 1000, SchemaSpec.optionsBuilder().withTransactionalMode(TransactionalMode.full)); + } + +} diff --git a/test/simulator/test/org/apache/cassandra/simulator/test/AccordJournalSimulationTest.java b/test/simulator/test/org/apache/cassandra/simulator/test/AccordJournalSimulationTest.java new file mode 100644 index 000000000000..36040e115c1e --- /dev/null +++ b/test/simulator/test/org/apache/cassandra/simulator/test/AccordJournalSimulationTest.java @@ -0,0 +1,267 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.simulator.test; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.List; +import java.util.zip.Checksum; + +import com.google.common.collect.ImmutableMap; +import com.google.common.jimfs.Jimfs; +import org.junit.Assert; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.cassandra.concurrent.ExecutorFactory; +import org.apache.cassandra.concurrent.ExecutorPlus; +import org.apache.cassandra.config.AccordSpec; +import org.apache.cassandra.config.Config; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.config.ParameterizedClass; +import org.apache.cassandra.db.Keyspace; +import org.apache.cassandra.config.DurationSpec; +import org.apache.cassandra.db.TypeSizes; +import org.apache.cassandra.dht.Murmur3Partitioner; +import org.apache.cassandra.io.filesystem.ListenableFileSystem; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.io.util.File; +import org.apache.cassandra.journal.Journal; +import org.apache.cassandra.journal.KeySupport; +import org.apache.cassandra.journal.RecordPointer; +import org.apache.cassandra.journal.SegmentCompactor; +import org.apache.cassandra.journal.ValueSerializer; +import org.apache.cassandra.utils.Isolated; +import org.apache.cassandra.utils.concurrent.CountDownLatch; + +public class AccordJournalSimulationTest extends SimulationTestBase +{ + @Test + public void simpleRWTest() + { + simulate(arr(() -> { + ListenableFileSystem fs = new ListenableFileSystem(Jimfs.newFileSystem()); + File.unsafeSetFilesystem(fs); + DatabaseDescriptor.daemonInitialization(); + DatabaseDescriptor.setCommitLogCompression(new ParameterizedClass("LZ4Compressor", ImmutableMap.of())); + DatabaseDescriptor.setCommitLogWriteDiskAccessMode(Config.DiskAccessMode.standard); + DatabaseDescriptor.initializeCommitLogDiskAccessMode(); + DatabaseDescriptor.setPartitionerUnsafe(Murmur3Partitioner.instance); + DatabaseDescriptor.setAccordJournalDirectory("/journal"); + new File("/journal").createDirectoriesIfNotExists(); + + DatabaseDescriptor.setDumpHeapOnUncaughtException(false); + + Keyspace.setInitialized(); + + AccordSpec.JournalSpec spec = new AccordSpec.JournalSpec(); + spec.flushPeriod = new DurationSpec.IntSecondsBound(1); + + State.journal = new Journal<>("AccordJournal", + new File("/journal"), + spec, + new IdentityKeySerializer(), + new IdentityValueSerializer(), + SegmentCompactor.noop()); + }), + () -> check()); + } + + public static void check() + { + State.journal.start(); + try + { + final int count = 100; + for (int i = 0; i < count; i++) + { + int finalI = i; + State.executor.submit(() -> { + RecordPointer ptr = State.journal.asyncWrite("test" + finalI, "test" + finalI); + State.journal.onDurable(ptr, State.latch::decrement); + }); + } + + State.latch.await(); + + for (int i = 0; i < count; i++) + { + State.logger.debug("Reading {}", i); + Assert.assertEquals(State.journal.readLast("test" + i), "test" + i); + } + } + + catch (InterruptedException e) + { + throw new RuntimeException(e); + } + finally + { + State.journal.shutdown(); + + if (!State.thrown.isEmpty()) + { + AssertionError throwable = new AssertionError("Caught exceptions"); + for (Throwable t: State.thrown) + throwable.addSuppressed(t); + throw throwable; + } + } + } + + @Isolated + public static class IdentityValueSerializer implements ValueSerializer + { + @Override + public void serialize(String key, String value, DataOutputPlus out, int userVersion) throws IOException + { + out.writeInt(key.length()); + out.writeBytes(key); + } + + @Override + public String deserialize(String key, DataInputPlus in, int userVersion) throws IOException + { + int size = in.readInt(); + byte[] value = new byte[size]; + for (int i = 0; i < size; i++) + value[i] = in.readByte(); + + return new String(value); + } + } + + @Isolated + public static class IdentityKeySerializer implements KeySupport + { + private final byte aByte = 0xd; + @Override + public int serializedSize(int userVersion) + { + return 16; + } + + @Override + public void serialize(String key, DataOutputPlus out, int userVersion) throws IOException + { + int maxSize = 16 - TypeSizes.INT_SIZE; + if (key.length() > maxSize) + throw new IllegalStateException(); + + out.writeInt(key.length()); + out.writeBytes(key); + int remaining = maxSize - key.length(); + for (int i = 0; i < remaining; i++) + out.writeByte(aByte + i); + } + + @Override + public String deserialize(DataInputPlus in, int userVersion) throws IOException + { + int size = in.readInt(); + byte[] key = new byte[size]; + for (int i = 0; i < size; i++) + key[i] = in.readByte(); + + int maxSize = 16 - TypeSizes.INT_SIZE; + int remaining = maxSize - size; + for (int i = 0; i < remaining; i++) + Assert.assertEquals(aByte + i, in.readByte()); + + return new String(key); + } + + @Override + public void serialize(String key, ByteBuffer out, int userVersion) throws IOException + { + int maxSize = 16 - TypeSizes.INT_SIZE; + if (key.length() > maxSize) + throw new IllegalStateException(); + + out.putInt(key.length()); + for (int i = 0 ; i < key.length() ; ++i) + out.put((byte)key.charAt(i)); + int remaining = maxSize - key.length(); + for (int i = 0; i < remaining; i++) + out.put((byte) (aByte + i)); + } + + @Override + public String deserialize(ByteBuffer in, int userVersion) + { + int size = in.getInt(); + byte[] key = new byte[size]; + for (int i = 0; i < size; i++) + key[i] = in.get(); + + int maxSize = 16 - TypeSizes.INT_SIZE; + int remaining = maxSize - size; + for (int i = 0; i < remaining; i++) + Assert.assertEquals(aByte + i, in.get()); + + return new String(key); + } + + @Override + public String deserialize(ByteBuffer buffer, int position, int userVersion) + { + int size = buffer.getInt(); + byte[] key = new byte[size]; + for (int i = 0; i < size; i++) + key[i] = buffer.get(); + + int maxSize = 16 - TypeSizes.INT_SIZE; + int remaining = maxSize - size; + for (int i = 0; i < remaining; i++) + Assert.assertEquals(aByte + i, buffer.get()); + + return new String(key); + } + + @Override + public void updateChecksum(Checksum crc, String key, int userVersion) + { + crc.update(key.getBytes()); + } + + @Override + public int compareWithKeyAt(String key, ByteBuffer buffer, int position, int userVersion) + { + throw new IllegalStateException(); + } + + @Override + public int compare(String o1, String o2) + { + return o1.compareTo(o2); + } + } + + @Isolated + public static class State + { + private static final Logger logger = LoggerFactory.getLogger(State.class); + static Journal journal; + static CountDownLatch latch = CountDownLatch.newCountDownLatch(100); + static List thrown = new ArrayList<>(); + static ExecutorPlus executor = ExecutorFactory.Global.executorFactory().pooled("name", 10); + } +} \ No newline at end of file diff --git a/test/simulator/test/org/apache/cassandra/simulator/test/EpochStressTest.java b/test/simulator/test/org/apache/cassandra/simulator/test/EpochStressTest.java new file mode 100644 index 000000000000..8aaab3d78ac4 --- /dev/null +++ b/test/simulator/test/org/apache/cassandra/simulator/test/EpochStressTest.java @@ -0,0 +1,232 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.simulator.test; + +import java.io.IOException; +import java.time.Duration; +import java.util.ArrayList; +import java.util.EnumMap; +import java.util.List; +import java.util.concurrent.TimeUnit; +import java.util.function.Consumer; +import java.util.function.Supplier; + +import com.google.common.util.concurrent.Uninterruptibles; +import org.junit.Test; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import accord.api.ConfigurationService; +import accord.local.Node; +import accord.primitives.Ranges; +import accord.topology.TopologyManager; +import org.apache.cassandra.service.accord.AccordConfigurationService; +import org.apache.cassandra.service.accord.AccordService; +import org.apache.cassandra.service.consensus.TransactionalMode; +import org.apache.cassandra.simulator.Action; +import org.apache.cassandra.simulator.ActionList; +import org.apache.cassandra.simulator.Debug; +import org.apache.cassandra.simulator.RandomSource; +import org.apache.cassandra.simulator.cluster.ClusterActionListener; +import org.apache.cassandra.simulator.cluster.ClusterActions; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.tcm.ClusterMetadataService; +import org.apache.cassandra.tcm.membership.NodeId; +import org.apache.cassandra.utils.Clock; + +import static org.apache.cassandra.simulator.cluster.ClusterActions.InitialConfiguration.initializeAll; +import static org.apache.cassandra.simulator.cluster.ClusterActions.Options.noActions; + +/** + * In order to run these tests in your IDE, you need to first build a simulator jara + * + * ant simulator-jars + * + * And then run your test using the following settings (omit add-* if you are running on jdk8): + * + -Dstorage-config=$MODULE_DIR$/test/conf + -Djava.awt.headless=true + -javaagent:$MODULE_DIR$/lib/jamm-0.4.0.jar + -ea + -Dcassandra.debugrefcount=true + -Xss384k + -XX:SoftRefLRUPolicyMSPerMB=0 + -XX:ActiveProcessorCount=2 + -XX:HeapDumpPath=build/test + -Dcassandra.test.driver.connection_timeout_ms=10000 + -Dcassandra.test.driver.read_timeout_ms=24000 + -Dcassandra.memtable_row_overhead_computation_step=100 + -Dcassandra.test.use_prepared=true + -Dcassandra.test.sstableformatdevelopment=true + -Djava.security.egd=file:/dev/urandom + -Dcassandra.testtag=.jdk11 + -Dcassandra.keepBriefBrief=true + -Dcassandra.allow_simplestrategy=true + -Dcassandra.strict.runtime.checks=true + -Dcassandra.reads.thresholds.coordinator.defensive_checks_enabled=true + -Dcassandra.test.flush_local_schema_changes=false + -Dcassandra.test.messagingService.nonGracefulShutdown=true + -Dcassandra.use_nix_recursive_delete=true + -Dcie-cassandra.disable_schema_drop_log=true + -Dlogback.configurationFile=file://$MODULE_DIR$/test/conf/logback-simulator.xml + -Dcassandra.ring_delay_ms=10000 + -Dcassandra.tolerate_sstable_size=true + -Dcassandra.skip_sync=true + -Dcassandra.debugrefcount=false + -Dcassandra.test.simulator.determinismcheck=strict + -Dcassandra.test.simulator.print_asm=none + -javaagent:$MODULE_DIR$/build/test/lib/jars/simulator-asm.jar + -Xbootclasspath/a:$MODULE_DIR$/build/test/lib/jars/simulator-bootstrap.jar + -XX:ActiveProcessorCount=4 + -XX:-TieredCompilation + -XX:-BackgroundCompilation + -XX:CICompilerCount=1 + -XX:Tier4CompileThreshold=1000 + -XX:ReservedCodeCacheSize=256M + -Xmx16G + -Xmx4G + --add-exports java.base/jdk.internal.misc=ALL-UNNAMED + --add-exports java.base/jdk.internal.ref=ALL-UNNAMED + --add-exports java.base/sun.nio.ch=ALL-UNNAMED + --add-exports java.management.rmi/com.sun.jmx.remote.internal.rmi=ALL-UNNAMED + --add-exports java.rmi/sun.rmi.registry=ALL-UNNAMED + --add-exports java.rmi/sun.rmi.server=ALL-UNNAMED + --add-exports java.sql/java.sql=ALL-UNNAMED + --add-exports java.rmi/sun.rmi.registry=ALL-UNNAMED + --add-opens java.base/java.lang.module=ALL-UNNAMED + --add-opens java.base/java.net=ALL-UNNAMED + --add-opens java.base/jdk.internal.loader=ALL-UNNAMED + --add-opens java.base/jdk.internal.ref=ALL-UNNAMED + --add-opens java.base/jdk.internal.reflect=ALL-UNNAMED + --add-opens java.base/jdk.internal.math=ALL-UNNAMED + --add-opens java.base/jdk.internal.module=ALL-UNNAMED + --add-opens java.base/jdk.internal.util.jar=ALL-UNNAMED + --add-opens jdk.management/com.sun.management.internal=ALL-UNNAMED + --add-opens jdk.management.jfr/jdk.management.jfr=ALL-UNNAMED + --add-opens java.desktop/com.sun.beans.introspect=ALL-UNNAMED + + */ +public class EpochStressTest extends SimulationTestBase +{ + @Test + public void manyEpochsAndAccordConverges() throws IOException + { + simulate(simulation -> { + // setup + ClusterActions.Options options = noActions(simulation.cluster.size()); + ClusterActions clusterActions = new ClusterActions(simulation.simulated, simulation.cluster, + options, new ClusterActionListener.NoOpListener(), new Debug(new EnumMap<>(Debug.Info.class), new int[0])); + return ActionList.of(clusterActions.initializeCluster(initializeAll(simulation.cluster.size())), + simulation.schemaChange(1, "CREATE KEYSPACE ks WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor' : 3}"), + simulation.schemaChange(1, "CREATE TABLE IF NOT EXISTS ks.tbl (pk int PRIMARY KEY, v int) WITH " + TransactionalMode.full.asCqlParam())); + }, + simulation -> { + // test + RandomSource random = simulation.simulated.random; + int numEpochs = 100; + List actions = new ArrayList<>(numEpochs); + for (int i = 0; i < numEpochs; i++) + { + int node = random.uniform(1, simulation.cluster.size() + 1); + actions.add(simulation.schemaChange(node, "ALTER TABLE ks.tbl WITH comment = 'step=" + i + "'")); + } + return ActionList.of(actions); + }, + simulation -> { + // teardown + List actions = new ArrayList<>(simulation.cluster.size()); + for (int i = 0; i < simulation.cluster.size(); i++) + actions.add(HarrySimulatorTest.lazy(simulation.simulated, simulation.cluster.get(i + 1), EpochStressTest::validate)); + return ActionList.of(actions); + }, + config -> config.nodes(3, 3) + .dcs(1, 1) + .threadCount(100)); + } + + private static void validate() + { + Logger logger = LoggerFactory.getLogger(EpochStressTest.class); + NodeId nodeId = ClusterMetadata.current().myNodeId(); + long maxEpoch = ClusterMetadataService.instance().log().waitForHighestConsecutive().epoch.getEpoch(); + long startNano = Clock.Global.nanoTime(); + long deadlineNanos = startNano + TimeUnit.MINUTES.toNanos(10); + + AccordService accord = (AccordService) AccordService.instance(); + Node node = accord.node(); + AccordConfigurationService configService = (AccordConfigurationService) node.configService(); + TopologyManager tm = node.topology(); + long minEpoch = tm.minEpoch(); + + logger.info("Starting validation on node {} for epochs {} -> {}", nodeId, minEpoch, maxEpoch); + + Consumer> sleep = msg -> { + long now = Clock.Global.nanoTime(); + if (now > deadlineNanos) + throw new AssertionError(msg.get()); + logger.debug("Step is not ready yet: {}", msg.get()); + Uninterruptibles.sleepUninterruptibly(50, TimeUnit.MILLISECONDS); + }; + + for (long epoch = minEpoch; epoch <= maxEpoch; epoch++) + { + long finalEpoch = epoch; + ConfigurationService.EpochReady ready = tm.epochReady(epoch); + while (!isDone(ready)) + sleep.accept(() -> "Epoch " + finalEpoch + "'s EpochReady is not done; " + ready); + + AccordConfigurationService.EpochSnapshot snapshot = configService.getEpochSnapshot(epoch); + while (!isDone(snapshot)) + { + AccordConfigurationService.SyncStatus status = snapshot.syncStatus; + sleep.accept(() -> "Epoch " + finalEpoch + "'s SyncStatus is not done; " + status); + snapshot = configService.getEpochSnapshot(epoch); + } + + Ranges expected = tm.globalForEpoch(epoch).ranges().mergeTouching(); + Ranges synced = tm.syncComplete(epoch).mergeTouching(); + while (!isDone(synced, expected)) + { + Ranges finalSynced = synced; + sleep.accept(() -> "Epoch " + finalEpoch + "'s syncComplete is not done; missing " + expected.without(finalSynced)); + synced = tm.syncComplete(epoch).mergeTouching(); + } + } + logger.info("All epochs completed in {}", Duration.ofNanos(Clock.Global.nanoTime() - startNano)); + } + + private static boolean isDone(Ranges synced, Ranges expected) + { + return synced.equals(expected); + } + + private static boolean isDone(AccordConfigurationService.EpochSnapshot snapshot) + { + return snapshot.syncStatus == AccordConfigurationService.SyncStatus.COMPLETED; + } + + private static boolean isDone(ConfigurationService.EpochReady ready) + { + return ready.metadata.isDone() + && ready.coordinate.isDone() + && ready.data.isDone() + && ready.reads.isDone(); + } +} diff --git a/test/simulator/test/org/apache/cassandra/simulator/test/HarrySimulatorTest.java b/test/simulator/test/org/apache/cassandra/simulator/test/HarrySimulatorTest.java index 9fc7e6983243..9bc9b6e759bb 100644 --- a/test/simulator/test/org/apache/cassandra/simulator/test/HarrySimulatorTest.java +++ b/test/simulator/test/org/apache/cassandra/simulator/test/HarrySimulatorTest.java @@ -52,22 +52,23 @@ import org.apache.cassandra.distributed.api.IInstanceConfig; import org.apache.cassandra.distributed.api.IInvokableInstance; import org.apache.cassandra.distributed.api.IIsolatedExecutor; +import org.apache.cassandra.distributed.api.SimpleQueryResult; import org.apache.cassandra.distributed.impl.Query; import org.apache.cassandra.distributed.shared.WithProperties; import org.apache.cassandra.harry.SchemaSpec; -import org.apache.cassandra.harry.op.Visit; -import org.apache.cassandra.harry.op.Operations; -import org.apache.cassandra.harry.gen.OperationsGenerators; import org.apache.cassandra.harry.execution.CompiledStatement; import org.apache.cassandra.harry.execution.DataTracker; import org.apache.cassandra.harry.execution.QueryBuildingVisitExecutor; import org.apache.cassandra.harry.gen.EntropySource; import org.apache.cassandra.harry.gen.Generator; +import org.apache.cassandra.harry.gen.OperationsGenerators; import org.apache.cassandra.harry.gen.SchemaGenerators; import org.apache.cassandra.harry.gen.rng.JdkRandomEntropySource; import org.apache.cassandra.harry.model.Model; import org.apache.cassandra.harry.model.QuiescentChecker; import org.apache.cassandra.harry.model.TokenPlacementModel; +import org.apache.cassandra.harry.op.Operations; +import org.apache.cassandra.harry.op.Visit; import org.apache.cassandra.locator.InetAddressAndPort; import org.apache.cassandra.net.Verb; import org.apache.cassandra.schema.ReplicationParams; @@ -201,7 +202,7 @@ public static void main(String... args) throws Throwable HarrySimulatorTest test = SingleCommand.singleCommand(HarrySimulatorTest.class).parse(args); if (test.helpOption.showHelpIfRequested()) return; - test.harryTest(); + test.testInternal(); System.exit(1); } @@ -210,11 +211,10 @@ public void test() throws Exception { // To rerun a failing test for a given seed, uncomment the below and set the seed // this.seed = ""; - this.seed = "0xdd3bb3793a6b925a"; - harryTest(); + testInternal(); } - private void harryTest() throws Exception + protected void testInternal() throws Exception { int bootstrapNode1 = 4; int bootstrapNode2 = 8; @@ -288,7 +288,7 @@ private void harryTest() throws Exception work.add(interleave("Start generating", HarrySimulatorTest.generateWrites(rowsPerPhase, simulation, cl))); work.add(work("Validate all data locally", - lazy(() -> validateAllLocal(simulation, simulation.nodeState.ring, rf)))); + lazy(() -> validateAllLocal(simulation, simulation.nodeState.ring, rf, validateQueryConsistency(), simulation.rng)))); return arr(work.toArray(new ActionSchedule.Work[0])); }, @@ -339,8 +339,8 @@ private void harryTest() throws Exception run(() -> simulation.nodeState.decommission(node)))); work.add(work("Check node state", assertNodeState(simulation.simulated, simulation.cluster, node, NodeState.LEFT))); } - work.add(work("Validate data locally", - lazy(() -> validateAllLocal(simulation, simulation.nodeState.ring, rf)))); + work.add(work("Validate data with " + validateQueryConsistency(), + lazy(() -> validateAllLocal(simulation, simulation.nodeState.ring, rf, validateQueryConsistency(), simulation.rng)))); boolean tmp = shouldBootstrap; work.add(work("Output message", run(() -> logger.warn("Finished {} of {} and data validation!\n", tmp ? "bootstrap" : "decommission", node)))); @@ -513,7 +513,7 @@ public void close() throws Exception } } - static class HarrySimulationBuilder extends ClusterSimulation.Builder + class HarrySimulationBuilder extends ClusterSimulation.Builder { protected final Consumer configUpdater; @@ -525,7 +525,7 @@ static class HarrySimulationBuilder extends ClusterSimulation.Builder perVerbFutureActionSchedulers(int nodeCount, SimulatedTime time, RandomSource random) { - return HarrySimulatorTest.networkSchedulers(nodeCount, time, random); + return networkSchedulers(nodeCount, time, random); } @Override @@ -599,7 +599,7 @@ void simulate(Consumer> configure, /** * Custom network scheduler for testing TCM. */ - public static Map networkSchedulers(int nodes, SimulatedTime time, RandomSource random) + public Map networkSchedulers(int nodes, SimulatedTime time, RandomSource random) { Set extremelyLossy = new HashSet<>(Arrays.asList(Verb.TCM_ABORT_MIG, Verb.TCM_REPLICATION, Verb.TCM_COMMIT_REQ, Verb.TCM_NOTIFY_REQ, @@ -632,7 +632,7 @@ else if (somewhatSlow.contains(verb)) return schedulers; } - public Action reconfigureCMS(SimulatedSystems simulated, Cluster cluster, int rf, boolean inEachDc) + public static Action reconfigureCMS(SimulatedSystems simulated, Cluster cluster, int rf, boolean inEachDc) { return new SimulatedActionTask("", Action.Modifiers.RELIABLE_NO_TIMEOUTS, Action.Modifiers.RELIABLE_NO_TIMEOUTS, null, simulated, new InterceptedExecution.InterceptedRunnableExecution((InterceptingExecutor) cluster.get(1).executor(), @@ -689,7 +689,7 @@ public static Action bootstrap(SimulatedSystems simulated, Cluster cluster, long cluster.get(node).transfer(runnable))); } - public Action decommission(SimulatedSystems simulated, Cluster cluster, int node) + public static Action decommission(SimulatedSystems simulated, Cluster cluster, int node) { IIsolatedExecutor.SerializableRunnable runnable = () -> { try @@ -757,18 +757,20 @@ public static ActionList generateWrites(int ops, HarrySimulation simulation, Con Visit visit = new Visit(lts, new Operations.Operation[]{ simulation.insertGen.generate(simulation.rng).toOp(lts) }); Visit prev_ = simulation.log.put(lts, visit); - Invariants.checkState(prev_ == null); + Invariants.require(prev_ == null); actions[i] = new Actions.LambdaAction("", Action.Modifiers.RELIABLE_NO_TIMEOUTS, () -> { CompiledStatement compiledStatement = simulation.queryBuilder.compile(visit); DataTracker tracker = simulation.tracker; + int[] joined = simulation.nodeState.joined(); + int coordinator = joined[simulation.rng.nextInt(joined.length)]; RetryingQuery query = new RetryingQuery(compiledStatement.cql(), cl, compiledStatement.bindings()); Action wrapper = new SimulatedActionCallable<>("Query", Action.Modifiers.RELIABLE_NO_TIMEOUTS, Action.Modifiers.RELIABLE_NO_TIMEOUTS, simulation.simulated, - simulation.cluster.get((int) ((lts % simulation.cluster.size()) + 1)), + simulation.cluster.get(coordinator), query) { @Override @@ -779,7 +781,6 @@ protected InterceptedExecution.InterceptedTaskExecution task() public void run() { tracker.begin(visit); - System.out.println("Started visit = " + visit); // we'll be invoked on the node's executor, but we need to ensure the task is loaded on its classloader try { @@ -798,15 +799,12 @@ public void run() } @Override - public void accept(Object[][] result, Throwable failure) + public void accept(SimpleQueryResult result, Throwable failure) { if (failure != null) simulated.failures.accept(failure); else - { - System.out.println("Finished visit = " + visit); tracker.end(visit); - } } }; @@ -824,7 +822,7 @@ public RetryingQuery(String query, ConsistencyLevel cl, Object[] boundValues) } @Override - public Object[][] call() + public SimpleQueryResult call() { while (true) { @@ -848,7 +846,7 @@ public Object[][] call() * Given you have used `generate` methods to generate data with Harry, you can use this method to check whether all * data has been propagated everywhere it should be, be it via streaming, read repairs, or regular writes. */ - public static Action validateAllLocal(HarrySimulation simulation, List owernship, TokenPlacementModel.ReplicationFactor rf) + public static Action validateAllLocal(HarrySimulation simulation, List owernship, TokenPlacementModel.ReplicationFactor rf, ConsistencyLevel consistencyLevel, EntropySource rng) { return new Actions.LambdaAction("Validate", Action.Modifiers.RELIABLE_NO_TIMEOUTS, () -> { @@ -863,12 +861,19 @@ public static Action validateAllLocal(HarrySimulation simulation, List visitedPds(HarrySimulation simulation) { Set pds = new HashSet<>(); @@ -1037,7 +1042,7 @@ public static T[] arr(T... arr) return arr; } - public static Generator schemaSpecGen(String keyspace, String prefix) + public Generator schemaSpecGen(String keyspace, String prefix) { return SchemaGenerators.schemaSpecGen(keyspace, prefix, 1000); } diff --git a/test/simulator/test/org/apache/cassandra/simulator/test/HarryValidatingQuery.java b/test/simulator/test/org/apache/cassandra/simulator/test/HarryValidatingQuery.java index 2a011c944c74..3e34bf69b10f 100644 --- a/test/simulator/test/org/apache/cassandra/simulator/test/HarryValidatingQuery.java +++ b/test/simulator/test/org/apache/cassandra/simulator/test/HarryValidatingQuery.java @@ -25,7 +25,9 @@ import accord.utils.Invariants; import org.apache.cassandra.distributed.Cluster; +import org.apache.cassandra.distributed.api.ConsistencyLevel; import org.apache.cassandra.distributed.api.IInstance; +import org.apache.cassandra.harry.gen.EntropySource; import org.apache.cassandra.harry.op.Visit; import org.apache.cassandra.harry.op.Operations; import org.apache.cassandra.harry.execution.CompiledStatement; @@ -39,6 +41,8 @@ import org.apache.cassandra.simulator.systems.InterceptingExecutor; import org.apache.cassandra.simulator.systems.SimulatedAction; +import static org.apache.cassandra.simulator.SimulatorUtils.failWithOOM; + public class HarryValidatingQuery extends SimulatedAction { private static final Logger logger = LoggerFactory.getLogger(HarryValidatingQuery.class); @@ -51,13 +55,17 @@ public class HarryValidatingQuery extends SimulatedAction private final HarrySimulatorTest.HarrySimulation simulation; private final Visit visit; private final QueryBuildingVisitExecutor queryBuilder; + private final ConsistencyLevel consistencyLevel; + private final EntropySource rng; public HarryValidatingQuery(HarrySimulatorTest.HarrySimulation simulation, Cluster cluster, TokenPlacementModel.ReplicationFactor rf, List owernship, Visit visit, - QueryBuildingVisitExecutor queryBuilder) + QueryBuildingVisitExecutor queryBuilder, + ConsistencyLevel consistencyLevel, + EntropySource rng) { super(visit, Modifiers.RELIABLE_NO_TIMEOUTS, Modifiers.RELIABLE_NO_TIMEOUTS, null, simulation.simulated); this.rf = rf; @@ -67,7 +75,8 @@ public HarryValidatingQuery(HarrySimulatorTest.HarrySimulation simulation, this.visit = visit; this.queryBuilder = queryBuilder; this.simulation = simulation; - + this.consistencyLevel = consistencyLevel; + this.rng = rng; } protected InterceptedExecution task() @@ -78,14 +87,25 @@ public void run() { try { - TokenPlacementModel.ReplicatedRanges ring = rf.replicate(owernship); - Invariants.checkState(visit.operations.length == 1); - Invariants.checkState(visit.operations[0] instanceof Operations.SelectStatement); - Operations.SelectStatement select = (Operations.SelectStatement) visit.operations[0]; - for (TokenPlacementModel.Replica replica : ring.replicasFor(token(select.pd))) + if (consistencyLevel == ConsistencyLevel.NODE_LOCAL) + { + TokenPlacementModel.ReplicatedRanges ring = rf.replicate(owernship); + Invariants.require(visit.operations.length == 1); + Invariants.require(visit.operations[0] instanceof Operations.SelectStatement); + Operations.SelectStatement select = (Operations.SelectStatement) visit.operations[0]; + for (TokenPlacementModel.Replica replica : ring.replicasFor(token(select.pd))) + { + CompiledStatement compiled = queryBuilder.compile(visit); + Object[][] objects = executeNodeLocal(compiled.cql(), replica.node(), compiled.bindings()); + List actualRows = InJvmDTestVisitExecutor.rowsToResultSet(simulation.schema, select, objects); + simulation.model.validate(select, actualRows); + } + } + else { + Operations.SelectStatement select = (Operations.SelectStatement) visit.operations[0]; CompiledStatement compiled = queryBuilder.compile(visit); - Object[][] objects = executeNodeLocal(compiled.cql(), replica.node(), compiled.bindings()); + Object[][] objects = execute(compiled.cql(), rng.nextInt(cluster.size()) + 1, compiled.bindings()); List actualRows = InJvmDTestVisitExecutor.rowsToResultSet(simulation.schema, select, objects); simulation.model.validate(select, actualRows); } @@ -93,6 +113,7 @@ public void run() catch (Throwable t) { logger.error("Caught an exception while validating", t); + failWithOOM(); throw t; } } @@ -113,4 +134,10 @@ protected Object[][] executeNodeLocal(String statement, TokenPlacementModel.Node .get(); return instance.executeInternal(statement, bindings); } + + protected Object[][] execute(String statement, int id, Object... bindings) + { + IInstance instance = cluster.get(id); + return instance.coordinator().execute(statement, consistencyLevel, bindings); + } } diff --git a/test/simulator/test/org/apache/cassandra/simulator/test/ShortAccordSimulationTest.java b/test/simulator/test/org/apache/cassandra/simulator/test/ShortAccordSimulationTest.java new file mode 100644 index 000000000000..cb83160be8d6 --- /dev/null +++ b/test/simulator/test/org/apache/cassandra/simulator/test/ShortAccordSimulationTest.java @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.simulator.test; + +import java.io.IOException; + +import org.junit.Test; + +import org.apache.cassandra.simulator.paxos.AccordSimulationRunner; + +/** + * In order to run these tests in your IDE, you need to first build a simulator jara + * + * ant simulator-jars + * + * And then run your test using the following settings (omit add-* if you are running on jdk8): + * + -Dstorage-config=$MODULE_DIR$/test/conf + -Djava.awt.headless=true + -javaagent:$MODULE_DIR$/lib/jamm-0.4.0.jar + -ea + -Dcassandra.debugrefcount=true + -Xss384k + -XX:SoftRefLRUPolicyMSPerMB=0 + -XX:ActiveProcessorCount=2 + -XX:HeapDumpPath=build/test + -Dcassandra.test.driver.connection_timeout_ms=10000 + -Dcassandra.test.driver.read_timeout_ms=24000 + -Dcassandra.memtable_row_overhead_computation_step=100 + -Dcassandra.test.use_prepared=true + -Dcassandra.test.sstableformatdevelopment=true + -Djava.security.egd=file:/dev/urandom + -Dcassandra.testtag=.jdk11 + -Dcassandra.keepBriefBrief=true + -Dcassandra.allow_simplestrategy=true + -Dcassandra.strict.runtime.checks=true + -Dcassandra.reads.thresholds.coordinator.defensive_checks_enabled=true + -Dcassandra.test.flush_local_schema_changes=false + -Dcassandra.test.messagingService.nonGracefulShutdown=true + -Dcassandra.use_nix_recursive_delete=true + -Dcie-cassandra.disable_schema_drop_log=true + -Dlogback.configurationFile=file://$MODULE_DIR$/test/conf/logback-simulator.xml + -Dcassandra.ring_delay_ms=10000 + -Dcassandra.tolerate_sstable_size=true + -Dcassandra.skip_sync=true + -Dcassandra.debugrefcount=false + -Dcassandra.test.simulator.determinismcheck=strict + -Dcassandra.test.simulator.print_asm=none + -javaagent:$MODULE_DIR$/build/test/lib/jars/simulator-asm.jar + -Xbootclasspath/a:$MODULE_DIR$/build/test/lib/jars/simulator-bootstrap.jar + -XX:ActiveProcessorCount=4 + -XX:-TieredCompilation + -XX:-BackgroundCompilation + -XX:CICompilerCount=1 + -XX:Tier4CompileThreshold=1000 + -XX:ReservedCodeCacheSize=256M + -Xmx16G + -Xmx4G + --add-exports java.base/jdk.internal.misc=ALL-UNNAMED + --add-exports java.base/jdk.internal.ref=ALL-UNNAMED + --add-exports java.base/sun.nio.ch=ALL-UNNAMED + --add-exports java.management.rmi/com.sun.jmx.remote.internal.rmi=ALL-UNNAMED + --add-exports java.rmi/sun.rmi.registry=ALL-UNNAMED + --add-exports java.rmi/sun.rmi.server=ALL-UNNAMED + --add-exports java.sql/java.sql=ALL-UNNAMED + --add-exports java.rmi/sun.rmi.registry=ALL-UNNAMED + --add-opens java.base/java.lang.module=ALL-UNNAMED + --add-opens java.base/java.net=ALL-UNNAMED + --add-opens java.base/jdk.internal.loader=ALL-UNNAMED + --add-opens java.base/jdk.internal.ref=ALL-UNNAMED + --add-opens java.base/jdk.internal.reflect=ALL-UNNAMED + --add-opens java.base/jdk.internal.math=ALL-UNNAMED + --add-opens java.base/jdk.internal.module=ALL-UNNAMED + --add-opens java.base/jdk.internal.util.jar=ALL-UNNAMED + --add-opens jdk.management/com.sun.management.internal=ALL-UNNAMED + --add-opens jdk.management.jfr/jdk.management.jfr=ALL-UNNAMED + --add-opens java.desktop/com.sun.beans.introspect=ALL-UNNAMED + */ +public class ShortAccordSimulationTest +{ + @Test + public void simulationTest() throws IOException + { + AccordSimulationRunner.main(new String[] { "run", "-n", "3..6", "-t", "1000", "--cluster-action-limit", "-1", "-c", "2", "-s", "30"}); + } +} diff --git a/test/simulator/test/org/apache/cassandra/simulator/test/ShortPaxosSimulationTest.java b/test/simulator/test/org/apache/cassandra/simulator/test/ShortPaxosSimulationTest.java index 7eee1a69c8ab..e83645bfae26 100644 --- a/test/simulator/test/org/apache/cassandra/simulator/test/ShortPaxosSimulationTest.java +++ b/test/simulator/test/org/apache/cassandra/simulator/test/ShortPaxosSimulationTest.java @@ -98,7 +98,21 @@ public class ShortPaxosSimulationTest @Test public void simulationTest() throws IOException { - PaxosSimulationRunner.main(new String[] { "run", "-n", "3..6", "-t", "1000", "-c", "2", "--cluster-action-limit", "2", "-s", "30" }); + PaxosSimulationRunner.main(new String[] { "run", "--variant", "v2", "-n", "3..6", "-t", "1000", "-c", "2", "--cluster-action-limit", "2", "-s", "30" }); + } + + @Test + public void casOnAccordSimulationTest() throws IOException + { + PaxosSimulationRunner.main(new String[] { "run", + "--transactional-mode", "full", + "-n", "3...6", + "-t", "1000", + "--cluster-action-limit", "0", + "--consensus-action-limit", "0", + "--consensus-actions", "ACCORD_MIGRATE", + "-c", "10", + "-s", "30"}); } @Test diff --git a/test/simulator/test/org/apache/cassandra/simulator/test/SimulationTestBase.java b/test/simulator/test/org/apache/cassandra/simulator/test/SimulationTestBase.java index 88aaf5374cb6..406725bb1c33 100644 --- a/test/simulator/test/org/apache/cassandra/simulator/test/SimulationTestBase.java +++ b/test/simulator/test/org/apache/cassandra/simulator/test/SimulationTestBase.java @@ -251,7 +251,7 @@ public static void simulate(IIsolatedExecutor.SerializableRunnable[] runnables, InstanceClassLoader classLoader = new InstanceClassLoader(1, 1, AbstractCluster.CURRENT_VERSION.classpath, Thread.currentThread().getContextClassLoader(), sharedClassPredicate, - new InterceptClasses(() -> 1.0f, () -> 1.0f, + new InterceptClasses((x) -> () -> 1.0f, (x) -> () -> 1.0f, NemesisFieldSelectors.get(), ClassLoader.getSystemClassLoader(), sharedClassPredicate.negate())::apply); diff --git a/test/unit/accord/utils/Gen.java b/test/unit/accord/utils/Gen.java deleted file mode 100644 index 523812ccf433..000000000000 --- a/test/unit/accord/utils/Gen.java +++ /dev/null @@ -1,237 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package accord.utils; - -import java.util.function.BiFunction; -import java.util.function.Function; -import java.util.function.IntPredicate; -import java.util.function.IntSupplier; -import java.util.function.IntUnaryOperator; -import java.util.function.LongPredicate; -import java.util.function.LongSupplier; -import java.util.function.LongUnaryOperator; -import java.util.function.Predicate; -import java.util.function.Supplier; -import java.util.function.ToIntFunction; -import java.util.function.ToLongFunction; -import java.util.stream.IntStream; -import java.util.stream.LongStream; -import java.util.stream.Stream; - -public interface Gen { - /** - * For cases where method handles isn't able to detect the proper type, this method acts as a cast - * to inform the compiler of the desired type. - */ - static Gen of(Gen fn) - { - return fn; - } - - A next(RandomSource random); - - default Gen map(Function fn) - { - return r -> fn.apply(this.next(r)); - } - - default Gen map(BiFunction fn) - { - return r -> fn.apply(r, this.next(r)); - } - - default IntGen mapToInt(ToIntFunction fn) - { - return r -> fn.applyAsInt(next(r)); - } - - default LongGen mapToLong(ToLongFunction fn) - { - return r -> fn.applyAsLong(next(r)); - } - - default Gen flatMap(Function> mapper) - { - return rs -> mapper.apply(this.next(rs)).next(rs); - } - - default Gen flatMap(BiFunction> mapper) - { - return rs -> mapper.apply(rs, this.next(rs)).next(rs); - } - - default Gen filter(Predicate fn) - { - Gen self = this; - return r -> { - A value; - do { - value = self.next(r); - } - while (!fn.test(value)); - return value; - }; - } - - default Gen filter(int maxAttempts, A defaultValue, Predicate fn) - { - Invariants.checkArgument(maxAttempts > 0, "Max attempts must be positive; given %d", maxAttempts); - Gen self = this; - return r -> { - for (int i = 0; i < maxAttempts; i++) - { - A v = self.next(r); - if (fn.test(v)) - return v; - - } - return defaultValue; - }; - } - - default Supplier asSupplier(RandomSource rs) - { - return () -> next(rs); - } - - default Stream asStream(RandomSource rs) - { - return Stream.generate(() -> next(rs)); - } - - interface Int2IntMapFunction - { - int applyAsInt(RandomSource rs, int value); - } - - interface Int2LongMapFunction - { - long applyAsLong(RandomSource rs, int value); - } - - interface Long2LongMapFunction - { - long applyAsLong(RandomSource rs, long value); - } - - interface IntGen extends Gen - { - int nextInt(RandomSource random); - - @Override - default Integer next(RandomSource random) - { - return nextInt(random); - } - - default IntGen mapAsInt(IntUnaryOperator fn) - { - return r -> fn.applyAsInt(nextInt(r)); - } - - default IntGen mapAsInt(Int2IntMapFunction fn) - { - return r -> fn.applyAsInt(r, nextInt(r)); - } - - default LongGen mapAsLong(Int2LongMapFunction fn) - { - return r -> fn.applyAsLong(r, nextInt(r)); - } - - default Gen.IntGen filterAsInt(IntPredicate fn) - { - return rs -> { - int value; - do - { - value = nextInt(rs); - } - while (!fn.test(value)); - return value; - }; - } - - @Override - default Gen.IntGen filter(Predicate fn) - { - return filterAsInt(i -> fn.test(i)); - } - - default IntSupplier asIntSupplier(RandomSource rs) - { - return () -> nextInt(rs); - } - - default IntStream asIntStream(RandomSource rs) - { - return IntStream.generate(() -> nextInt(rs)); - } - } - - interface LongGen extends Gen - { - long nextLong(RandomSource random); - - @Override - default Long next(RandomSource random) - { - return nextLong(random); - } - - default LongGen mapAsLong(LongUnaryOperator fn) - { - return r -> fn.applyAsLong(nextLong(r)); - } - - default LongGen mapAsLong(Long2LongMapFunction fn) - { - return r -> fn.applyAsLong(r, nextLong(r)); - } - - default Gen.LongGen filterAsLong(LongPredicate fn) - { - return rs -> { - long value; - do - { - value = nextLong(rs); - } - while (!fn.test(value)); - return value; - }; - } - - @Override - default Gen.LongGen filter(Predicate fn) - { - return filterAsLong(i -> fn.test(i)); - } - - default LongSupplier asLongSupplier(RandomSource rs) - { - return () -> nextLong(rs); - } - - default LongStream asLongStream(RandomSource rs) - { - return LongStream.generate(() -> nextLong(rs)); - } - } -} diff --git a/test/unit/accord/utils/Gens.java b/test/unit/accord/utils/Gens.java deleted file mode 100644 index 218189206f39..000000000000 --- a/test/unit/accord/utils/Gens.java +++ /dev/null @@ -1,1152 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package accord.utils; - -import java.lang.reflect.Array; -import java.math.BigDecimal; -import java.math.RoundingMode; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.Comparator; -import java.util.EnumMap; -import java.util.HashSet; -import java.util.Iterator; -import java.util.LinkedHashMap; -import java.util.LinkedHashSet; -import java.util.List; -import java.util.Map; -import java.util.NavigableSet; -import java.util.Objects; -import java.util.Set; -import java.util.function.BooleanSupplier; -import java.util.function.Function; -import java.util.function.Supplier; -import java.util.stream.IntStream; -import java.util.stream.LongStream; -import java.util.stream.Stream; - -import com.google.common.collect.Iterables; - -import accord.utils.random.Picker; - -public class Gens { - private Gens() { - } - - public static Gen flatten(Gen> gen) - { - return rs -> gen.next(rs).next(rs); - } - - public static Gen constant(T constant) - { - return ignore -> constant; - } - - public static Gen constant(Supplier constant) - { - return ignore -> constant.get(); - } - - public static Gen oneOf(Gen... gens) - { - switch (gens.length) - { - case 0: throw new IllegalArgumentException("Unable to select oneOf an empty list"); - case 1: return (Gen) gens[0]; - } - return oneOf(Arrays.asList(gens)); - } - - public static Gen oneOf(List> gens) - { - switch (gens.size()) - { - case 0: throw new IllegalArgumentException("Unable to select oneOf an empty list"); - case 1: return (Gen) gens.get(0); - } - return rs -> rs.pick(gens).next(rs); - } - - public static Gen oneOf(Map, Integer> values) - { - Gen> gen = pick(values); - return rs -> gen.next(rs).next(rs); - } - - public static OneOfBuilder oneOf() - { - return new OneOfBuilder<>(); - } - - public static class OneOfBuilder - { - private final Map, Integer> weighted = new LinkedHashMap<>(); - private final Set> unweighted = new LinkedHashSet<>(); - private Gen.IntGen unknownWeightGen = Gens.ints().between(1, 10); - - public OneOfBuilder add(Gen gen) - { - unweighted.add(gen); - return this; - } - - public OneOfBuilder add(int weight, Gen gen) - { - weighted.put(gen, weight); - return this; - } - - public OneOfBuilder unknownWeights(Gen.IntGen gen) - { - this.unknownWeightGen = gen; - return this; - } - - public Gen> buildWithDynamicWeights() - { - if (unweighted.isEmpty()) - { - Gen gen = build(); - return i -> gen; - } - return rs -> { - Map, Integer> commands = new LinkedHashMap<>(); - commands.putAll(weighted); - for (var gen : unweighted) - commands.put(gen, unknownWeightGen.nextInt(rs)); - var top = pick(commands); - return rs2 -> top.next(rs2).next(rs2); - }; - } - - public Gen build() - { - Map, Integer> commands = new LinkedHashMap<>(); - commands.putAll(weighted); - for (var gen : unweighted) - commands.put(gen, 1); - var top = pick(commands); - return rs -> top.next(rs).next(rs); - } - } - - public static Gen.IntGen pickInt(int... ts) - { - return rs -> ts[rs.nextInt(0, ts.length)]; - } - - public static Gen pick(T... ts) - { - return pick(Arrays.asList(ts)); - } - - public static Gen pick(List ts) - { - Gen.IntGen offset = ints().between(0, ts.size() - 1); - return rs -> ts.get(offset.nextInt(rs)); - } - - public static > Gen pick(Set set) - { - List list = new ArrayList<>(set); - // Non-ordered sets may have different iteration order on different environments, which would make a seed produce different histories! - // To avoid such a problem, make sure to apply a deterministic function (sort). - if (!(set instanceof NavigableSet)) - list.sort(Comparator.naturalOrder()); - return pick(list); - } - - public static Gen pick(Map values) - { - if (values == null || values.isEmpty()) - throw new IllegalArgumentException("values is empty"); - // if 2 values have the same weight we need some way to tie-break, but that isn't always possible... - // this method relies on the map having some order and will reject any map that doesn't define a deterministic order - if (!(values instanceof EnumMap || values instanceof LinkedHashMap)) - throw new IllegalArgumentException("pick(Map) requires a map with deterministic iteration; given " + values.getClass()); - if (values.size() == 1) - return constant(Objects.requireNonNull(Iterables.getFirst(values.keySet(), null))); - double totalWeight = values.values().stream().mapToDouble(Integer::intValue).sum(); - List> list = new ArrayList<>(values.size()); - Iterator> it = values.entrySet().iterator(); - for (int i = 0; it.hasNext(); i++) - { - Map.Entry e = it.next(); - list.add(new Weight<>(e.getKey(), e.getValue(), i)); - } - Collections.sort(list); - return rs -> { - double value = rs.nextDouble() * totalWeight; - for (Weight w : list) - { - value -= w.weight; - if (value <= 0) - return w.value; - } - return list.get(list.size() - 1).value; - }; - } - - public static Gen.IntGen pickZipf(int[] array) - { - if (array == null || array.length == 0) - throw new IllegalArgumentException("Empty array given"); - if (array.length == 1) - return ignore -> array[0]; - BigDecimal[] weights = new BigDecimal[array.length]; - BigDecimal base = BigDecimal.valueOf(Math.pow(2, array.length)); - weights[0] = base; - for (int i = 1; i < array.length; i++) - weights[i] = base.divide(BigDecimal.valueOf(i + 1), RoundingMode.UP); - BigDecimal totalWeights = Stream.of(weights).reduce(BigDecimal.ZERO, BigDecimal::add); - - return rs -> { - BigDecimal value = BigDecimal.valueOf(rs.nextDouble()).multiply(totalWeights); - for (int i = 0; i < weights.length; i++) - { - value = value.subtract(weights[i]); - if (value.compareTo(BigDecimal.ZERO) <= 0) - return array[i]; - } - return array[array.length - 1]; - }; - } - - public static Gen.LongGen pickZipf(long[] array) - { - if (array == null || array.length == 0) - throw new IllegalArgumentException("Empty array given"); - if (array.length == 1) - return ignore -> array[0]; - BigDecimal[] weights = new BigDecimal[array.length]; - BigDecimal base = BigDecimal.valueOf(Math.pow(2, array.length)); - weights[0] = base; - for (int i = 1; i < array.length; i++) - weights[i] = base.divide(BigDecimal.valueOf(i + 1), RoundingMode.UP); - BigDecimal totalWeights = Stream.of(weights).reduce(BigDecimal.ZERO, BigDecimal::add); - - return rs -> { - BigDecimal value = BigDecimal.valueOf(rs.nextDouble()).multiply(totalWeights); - for (int i = 0; i < weights.length; i++) - { - value = value.subtract(weights[i]); - if (value.compareTo(BigDecimal.ZERO) <= 0) - return array[i]; - } - return array[array.length - 1]; - }; - } - - public static Gen pickZipf(T... array) - { - return pickZipf(Arrays.asList(array)); - } - - public static Gen pickZipf(List array) - { - if (array == null || array.isEmpty()) - throw new IllegalArgumentException("Empty array given"); - if (array.size() == 1) - return ignore -> array.get(0); - BigDecimal[] weights = new BigDecimal[array.size()]; - BigDecimal base = BigDecimal.valueOf(Math.pow(2, array.size())); - weights[0] = base; - for (int i = 1; i < array.size(); i++) - weights[i] = base.divide(BigDecimal.valueOf(i + 1), RoundingMode.UP); - BigDecimal totalWeights = Stream.of(weights).reduce(BigDecimal.ZERO, BigDecimal::add); - - return rs -> { - BigDecimal value = BigDecimal.valueOf(rs.nextDouble()).multiply(totalWeights); - for (int i = 0; i < weights.length; i++) - { - value = value.subtract(weights[i]); - if (value.compareTo(BigDecimal.ZERO) <= 0) - return array.get(i); - } - return array.get(array.size() - 1); - }; - } - - public static Gen randomWeights(int[] array) - { - return rs -> { - float[] weights = Picker.randomWeights(rs, array.length); - return r -> array[index(r, weights)]; - }; - } - - public static Gen randomWeights(long[] array) - { - return rs -> { - float[] weights = Picker.randomWeights(rs, array.length); - return r -> array[index(r, weights)]; - }; - } - - public static Gen> randomWeights(T[] array) - { - return rs -> { - float[] weights = Picker.randomWeights(rs, array.length); - return r -> array[index(r, weights)]; - }; - } - - public static Gen> randomWeights(List array) - { - return rs -> { - float[] weights = Picker.randomWeights(rs, array.size()); - return r -> array.get(index(r, weights)); - }; - } - - private static int index(RandomSource rs, float[] weights) - { - int i = Arrays.binarySearch(weights, rs.nextFloat()); - if (i < 0) i = -1 - i; - return i; - } - - public static Gen mixedDistribution(int minInclusive, int maxExclusive, int numBuckets) - { - int domainSize = (maxExclusive - minInclusive); - if (domainSize < 0) - throw new IllegalArgumentException("Range is too large; min=" + minInclusive + ", max=" + maxExclusive); - if (numBuckets <= 0 || numBuckets > domainSize) - throw new IllegalArgumentException("Num buckets must be between 1 and " + domainSize + "; given " + numBuckets); - int[] bucket, indexes; - bucket = new int[numBuckets]; - int delta = domainSize / numBuckets; - for (int i = 0; i < numBuckets; i++) - bucket[i] = minInclusive + i * delta; - indexes = IntStream.range(0, bucket.length).toArray(); - Gen indexDistro = mixedDistribution(indexes); - return rs -> { - Gen.IntGen indexGen = indexDistro.next(rs); - switch (rs.nextInt(0, 2)) - { - case 0: // uniform - { - return r -> { - int idx = indexGen.next(rs); - int start = bucket[idx]; - int end = idx == bucket.length - 1 ? maxExclusive : bucket[idx + 1]; - return r.nextInt(start, end); - }; - } - case 1: // median biased - { - int medians[] = new int[bucket.length]; - for (int i = 0; i < medians.length; i++) - { - int start = bucket[i]; - int end = i == bucket.length - 1 ? maxExclusive : bucket[i + 1]; - medians[i] = rs.nextInt(start, end); - } - return r -> { - int idx = indexGen.next(rs); - int start = bucket[idx]; - int end = idx == bucket.length - 1 ? maxExclusive : bucket[idx + 1]; - int median = medians[idx]; - return r.nextBiasedInt(start, median, end); - }; - } - default: - throw new AssertionError(); - } - }; - } - - public static Gen mixedDistribution(int minInclusive, int maxExclusive) - { - int domainSize = (maxExclusive - minInclusive + 1); - if (domainSize < 0) - throw new IllegalArgumentException("Range is too large; min=" + minInclusive + ", max=" + maxExclusive); - int[] array, indexes; - if (domainSize > 200) // randomly selected - { - int numBuckets = 10; - int delta = domainSize / numBuckets; - array = new int[numBuckets]; - for (int i = 0; i < numBuckets; i++) - array[i] = minInclusive + i * delta; - indexes = IntStream.range(0, array.length).toArray(); - } - else - { - array = IntStream.range(minInclusive, maxExclusive).toArray(); - indexes = null; - } - return rs -> { - switch (rs.nextInt(0, 4)) - { - case 0: // uniform - return r -> r.nextInt(minInclusive, maxExclusive); - case 1: // median biased - int median = rs.nextInt(minInclusive, maxExclusive); - return r -> r.nextBiasedInt(minInclusive, median, maxExclusive); - case 2: // zipf - if (indexes == null) - return Gens.pickZipf(rs.nextBoolean() ? reverseAndCopy(array) : array); - return Gens.pickZipf(rs.nextBoolean() ? reverseAndCopy(indexes) : indexes).mapAsInt((r, index) -> { - int start = array[index]; - int end = index == array.length - 1 ? maxExclusive : array[index + 1]; - return r.nextInt(start, end); - }); - case 3: // random weight - if (indexes == null) - return randomWeights(array).next(rs); - return randomWeights(indexes).next(rs).mapAsInt((r, index) -> { - int start = array[index]; - int end = index == array.length - 1 ? maxExclusive : array[index + 1]; - return r.nextInt(start, end); - }); - default: - throw new AssertionError(); - } - }; - } - - private static int[] reverseAndCopy(int[] array) - { - array = Arrays.copyOf(array, array.length); - for (int i = 0, mid = array.length / 2, j = array.length - 1; i < mid; i++, j--) - { - int tmp = array[i]; - array[i] = array[j]; - array[j] = tmp; - } - return array; - } - - public static Gen mixedDistribution(long minInclusive, long maxExclusive) - { - long domainSize = (maxExclusive - minInclusive + 1); - if (domainSize < 0) - throw new IllegalArgumentException("Range is too large; min=" + minInclusive + ", max=" + maxExclusive); - long[] array; - int[] indexes; - if (domainSize > 200) // randomly selected - { - int numBuckets = 10; - long delta = domainSize / numBuckets; - array = new long[numBuckets]; - for (int i = 0; i < numBuckets; i++) - array[i] = minInclusive + i * delta; - indexes = IntStream.range(0, array.length).toArray(); - } - else - { - array = LongStream.range(minInclusive, maxExclusive).toArray(); - indexes = null; - } - return rs -> { - switch (rs.nextInt(0, 4)) - { - case 0: // uniform - return r -> r.nextLong(minInclusive, maxExclusive); - case 1: // median biased - long median = rs.nextLong(minInclusive, maxExclusive); - return r -> r.nextBiasedLong(minInclusive, median, maxExclusive); - case 2: // zipf - if (indexes == null) - return Gens.pickZipf(rs.nextBoolean() ? reverseAndCopy(array) : array); - return Gens.pickZipf(rs.nextBoolean() ? reverseAndCopy(indexes) : indexes).mapAsLong((r, index) -> { - long start = array[index]; - long end = index == array.length - 1 ? maxExclusive : array[index + 1]; - return r.nextLong(start, end); - }); - case 3: // random weight - if (indexes == null) - return randomWeights(array).next(rs); - return randomWeights(indexes).next(rs).mapAsLong((r, index) -> { - long start = array[index]; - long end = index == array.length - 1 ? maxExclusive : array[index + 1]; - return r.nextLong(start, end); - }); - default: - throw new AssertionError(); - } - }; - } - - private static long[] reverseAndCopy(long[] array) - { - array = Arrays.copyOf(array, array.length); - for (int i = 0, mid = array.length / 2, j = array.length - 1; i < mid; i++, j--) - { - long tmp = array[i]; - array[i] = array[j]; - array[j] = tmp; - } - return array; - } - - public static Gen> mixedDistribution(T... list) - { - return mixedDistribution(Arrays.asList(list)); - } - - public static Gen> mixedDistribution(List list) - { - return rs -> { - switch (rs.nextInt(0, 4)) - { - case 0: // uniform - return r -> list.get(rs.nextInt(0, list.size())); - case 1: // median biased - int median = rs.nextInt(0, list.size()); - return r -> list.get(r.nextBiasedInt(0, median, list.size())); - case 2: // zipf - List array = list; - if (rs.nextBoolean()) - { - array = new ArrayList<>(list); - Collections.reverse(array); - } - return pickZipf(array); - case 3: // random weight - return randomWeights(list).next(rs); - default: - throw new AssertionError(); - } - }; - } - - public static Gen mixedDistribution(int[] list) - { - return rs -> { - switch (rs.nextInt(0, 4)) - { - case 0: // uniform - return r -> list[rs.nextInt(0, list.length)]; - case 1: // median biased - int median = rs.nextInt(0, list.length); - return r -> list[r.nextBiasedInt(0, median, list.length)]; - case 2: // zipf - int[] array = list; - if (rs.nextBoolean()) - { - array = Arrays.copyOf(array, array.length); - reverse(array); - } - return pickZipf(array); - case 3: // random weight - return randomWeights(list).next(rs); - default: - throw new AssertionError(); - } - }; - } - - /** - * This is a change from accord as that uses {@link accord.utils.Utils#reverse}, which doesn't exist in this forward port. - * - * To avoid adding another class and merge conflicts to cep-15-accord, this method was inlined - */ - private static void reverse(int[] array) - { - for (int i = 0; i < array.length / 2; i++) - { - int tmp = array[i]; - array[i] = array[array.length- 1 - i]; - array[array.length - 1 - i] = tmp; - } - } - - public static Gen charArray(Gen.IntGen sizes, char[] domain) - { - return charArray(sizes, domain, (a, b) -> true); - } - - public interface IntCharBiPredicate - { - boolean test(int a, char b); - } - - public static Gen charArray(Gen.IntGen sizes, char[] domain, IntCharBiPredicate fn) - { - Gen.IntGen indexGen = ints().between(0, domain.length - 1); - return rs -> { - int size = sizes.nextInt(rs); - char[] is = new char[size]; - for (int i = 0; i != size; i++) - { - char c; - do - { - c = domain[indexGen.nextInt(rs)]; - } - while (!fn.test(i, c)); - is[i] = c; - } - return is; - }; - } - - public static Gen random() { - return r -> r; - } - - public static BooleanDSL bools() - { - return new BooleanDSL(); - } - - public static IntDSL ints() - { - return new IntDSL(); - } - - public static LongDSL longs() { - return new LongDSL(); - } - - public static ListDSL lists(Gen fn) { - return new ListDSL<>(fn); - } - - public static ArrayDSL arrays(Class type, Gen fn) { - return new ArrayDSL<>(type, fn); - } - - public static IntArrayDSL arrays(Gen.IntGen fn) { - return new IntArrayDSL(fn); - } - - public static LongArrayDSL arrays(Gen.LongGen fn) { - return new LongArrayDSL(fn); - } - - public static EnumDSL enums() - { - return new EnumDSL(); - } - - public static StringDSL strings() - { - return new StringDSL(); - } - - public static BooleanSupplier supplier(Gen gen, RandomSource rs) - { - return () -> gen.next(rs); - } - - public static class BooleanDSL - { - public Gen all() - { - return RandomSource::nextBoolean; - } - - public Gen biasedRepeatingRuns(double ratio, int maxRuns) - { - Invariants.checkArgument(ratio > 0 && ratio <= 1, "Expected %d to be larger than 0 and <= 1", ratio); - double lower = ratio * .8; - double upper = ratio * 1.2; - return new Gen() { - // run represents how many consecutaive true values should be returned; -1 implies no active "run" exists - private int run = -1; - private long falseCount = 0, trueCount = 0; - @Override - public Boolean next(RandomSource rs) - { - if (run != -1) - { - run--; - trueCount++; - return true; - } - double currentRatio = trueCount / (double) (falseCount + trueCount); - if (currentRatio < lower) - { - // not enough true - trueCount++; - return true; - } - if (currentRatio > upper) - { - // not enough false - falseCount++; - return false; - } - if (rs.decide(ratio)) - { - run = rs.nextInt(maxRuns); - run--; - trueCount++; - return true; - } - falseCount++; - return false; - } - }; - } - - public Gen> mixedDistribution() - { - return rs -> { - int selection = rs.nextInt(0, 4); - switch (selection) - { - case 0: // uniform 50/50 - return r -> r.nextBoolean(); - case 1: // variable frequency - var freq = rs.nextFloat(); - return r -> r.decide(freq); - case 2: // fixed result - boolean result = rs.nextBoolean(); - return ignore -> result; - case 3: // biased repeating runs - return biasedRepeatingRuns(rs.nextDouble(), rs.nextInt(1, 100)); - default: - throw new IllegalStateException("Unexpected int for bool selection: " + selection); - } - }; - } - } - - public static class IntDSL - { - public Gen.IntGen of(int value) - { - return r -> value; - } - - public Gen.IntGen all() - { - return RandomSource::nextInt; - } - - public Gen.IntGen between(int min, int max) - { - Invariants.checkArgument(max >= min, "max (%d) < min (%d)", max, min); - if (min == max) - return of(min); - // since bounds is exclusive, if max == max_value unable to do +1 to include... so will return a gen - // that does not include - if (max == Integer.MAX_VALUE) - return r -> r.nextInt(min, max); - return r -> r.nextInt(min, max + 1); - } - - public Gen mixedDistribution(int minInclusive, int maxExclusive) - { - return Gens.mixedDistribution(minInclusive, maxExclusive); - } - - public Gen mixedDistribution(int minInclusive, int maxExclusive, int numBuckets) - { - return Gens.mixedDistribution(minInclusive, maxExclusive, numBuckets); - } - } - - public static class LongDSL { - public Gen.LongGen of(long value) - { - return r -> value; - } - - public Gen.LongGen all() { - return RandomSource::nextLong; - } - - public Gen.LongGen between(long min, long max) { - Invariants.checkArgument(max >= min); - if (min == max) - return of(min); - // since bounds is exclusive, if max == max_value unable to do +1 to include... so will return a gen - // that does not include - if (max == Long.MAX_VALUE) - return r -> r.nextLong(min, max); - return r -> r.nextLong(min, max + 1); - } - } - - public static class EnumDSL - { - public > Gen all(Class klass) - { - return pick(klass.getEnumConstants()); - } - - public > Gen> allMixedDistribution(Class klass) - { - return mixedDistribution(klass.getEnumConstants()); - } - - public > Gen allWithWeights(Class klass, int... weights) - { - T[] constants = klass.getEnumConstants(); - if (constants.length != weights.length) - throw new IllegalArgumentException(String.format("Total number of weights (%s) does not match the enum (%s)", Arrays.toString(weights), Arrays.toString(constants))); - Map values = new EnumMap<>(klass); - for (int i = 0; i < constants.length; i++) - values.put(constants[i], weights[i]); - return pick(values); - } - } - - public static class StringDSL - { - public Gen of(Gen.IntGen sizes, char[] domain) - { - // note, map is overloaded so String::new is ambugious to javac, so need a lambda here - return charArray(sizes, domain).map(c -> new String(c)); - } - - public SizeBuilder of(char[] domain) - { - return new SizeBuilder<>(sizes -> of(sizes, domain)); - } - - public Gen of(Gen.IntGen sizes, char[] domain, IntCharBiPredicate fn) - { - // note, map is overloaded so String::new is ambugious to javac, so need a lambda here - return charArray(sizes, domain, fn).map(c -> new String(c)); - } - - public SizeBuilder of(char[] domain, IntCharBiPredicate fn) - { - return new SizeBuilder<>(sizes -> of(sizes, domain, fn)); - } - - public Gen all(Gen.IntGen sizes) - { - return betweenCodePoints(sizes, Character.MIN_CODE_POINT, Character.MAX_CODE_POINT); - } - - public SizeBuilder all() - { - return new SizeBuilder<>(this::all); - } - - public Gen ascii(Gen.IntGen sizes) - { - return betweenCodePoints(sizes, 0, 127); - } - - public SizeBuilder ascii() - { - return new SizeBuilder<>(this::ascii); - } - - public Gen betweenCodePoints(Gen.IntGen sizes, int min, int max) - { - Gen.IntGen codePointGen = ints().between(min, max).filter(Character::isDefined); - return rs -> { - int[] array = new int[sizes.nextInt(rs)]; - for (int i = 0; i < array.length; i++) - array[i] = codePointGen.nextInt(rs); - return new String(array, 0, array.length); - }; - } - - public SizeBuilder betweenCodePoints(int min, int max) - { - return new SizeBuilder<>(sizes -> betweenCodePoints(sizes, min, max)); - } - } - - public static class SizeBuilder - { - private final Function> fn; - - public SizeBuilder(Function> fn) - { - this.fn = fn; - } - - public Gen ofLength(int fixed) - { - return ofLengthBetween(fixed, fixed); - } - - public Gen ofLengthBetween(int min, int max) - { - return fn.apply(ints().between(min, max)); - } - } - - public static class ListDSL implements BaseSequenceDSL, List> { - private final Gen fn; - - public ListDSL(Gen fn) { - this.fn = Objects.requireNonNull(fn); - } - - @Override - public ListDSL unique() - { - return new ListDSL<>(new GenReset<>(fn, false)); - } - - public ListDSL uniqueBestEffort() - { - return new ListDSL<>(new GenReset<>(fn, true)); - } - - @Override - public Gen> ofSizeBetween(int minSize, int maxSize) { - Gen.IntGen sizeGen = ints().between(minSize, maxSize); - return r -> - { - Reset.tryReset(fn); - int size = sizeGen.nextInt(r); - List list = new ArrayList<>(size); - for (int i = 0; i < size; i++) - { - try - { - list.add(fn.next(r)); - } - catch (IgnoreGenResult e) - { - // ignore - } - } - return list; - }; - } - } - - public static class ArrayDSL implements BaseSequenceDSL, T[]> { - private final Class type; - private final Gen fn; - - public ArrayDSL(Class type, Gen fn) { - this.type = Objects.requireNonNull(type); - this.fn = Objects.requireNonNull(fn); - } - - @Override - public ArrayDSL unique() - { - return new ArrayDSL<>(type, new GenReset<>(fn, false)); - } - - public ArrayDSL uniqueBestEffort() - { - return new ArrayDSL<>(type, new GenReset<>(fn, true)); - } - - @Override - public Gen ofSizeBetween(int minSize, int maxSize) { - Gen.IntGen sizeGen = ints().between(minSize, maxSize); - return r -> - { - Reset.tryReset(fn); - int size = sizeGen.nextInt(r); - T[] list = (T[]) Array.newInstance(type, size); - for (int i = 0; i < size; i++) - list[i] = fn.next(r); - return list; - }; - } - } - - public static class IntArrayDSL implements BaseSequenceDSL { - private final Gen.IntGen fn; - - public IntArrayDSL(Gen.IntGen fn) { - this.fn = Objects.requireNonNull(fn); - } - - @Override - public IntArrayDSL unique() - { - return new IntArrayDSL(new IntGenReset(fn)); - } - - @Override - public Gen ofSizeBetween(int minSize, int maxSize) { - Gen.IntGen sizeGen = ints().between(minSize, maxSize); - return r -> - { - int size = sizeGen.nextInt(r); - int[] list = new int[size]; - for (int i = 0; i < size; i++) - list[i] = fn.nextInt(r); - return list; - }; - } - } - - public static class LongArrayDSL implements BaseSequenceDSL { - private final Gen.LongGen fn; - - public LongArrayDSL(Gen.LongGen fn) { - this.fn = Objects.requireNonNull(fn); - } - - @Override - public LongArrayDSL unique() - { - return new LongArrayDSL(new LongGenReset(fn)); - } - - @Override - public Gen ofSizeBetween(int minSize, int maxSize) { - Gen.IntGen sizeGen = ints().between(minSize, maxSize); - return r -> - { - int size = sizeGen.nextInt(r); - long[] list = new long[size]; - for (int i = 0; i < size; i++) - list[i] = fn.nextLong(r); - return list; - }; - } - } - - public interface BaseSequenceDSL, B> - { - A unique(); - - Gen ofSizeBetween(int min, int max); - - default Gen ofSize(int size) { - return ofSizeBetween(size, size); - } - } - - protected interface Reset { - static void tryReset(Object o) - { - if (o instanceof Reset) - ((Reset) o).reset(); - } - - void reset(); - } - - private static final class IgnoreGenResult extends RuntimeException - { - private static final IgnoreGenResult INSTANCE = new IgnoreGenResult(); - private IgnoreGenResult() - { - super(null, null, false, false); - } - } - - private static class GenReset implements Gen, Reset - { - private final Set seen = new HashSet<>(); - private final Gen fn; - private final boolean bestEffort; - - private GenReset(Gen fn, boolean bestEffort) - { - this.fn = fn; - this.bestEffort = bestEffort; - } - - @Override - public T next(RandomSource random) - { - if (!bestEffort) - { - T value; - // 10k attempts - for (int i = 0; i < 10_000; i++) - { - if (seen.add((value = fn.next(random)))) - return value; - } - - throw new IllegalArgumentException("Could not generate a unique value after 10k attempts"); - } - else - { - T value = null; - int i; - for (i = 0; i < 42 && !seen.add((value = fn.next(random))); i++) {} - if (i == 42) throw IgnoreGenResult.INSTANCE; - return value; - } - } - - @Override - public void reset() - { - seen.clear(); - } - } - - private static class IntGenReset implements Gen.IntGen, Reset - { - private final GenReset base; - - private IntGenReset(Gen.IntGen fn) - { - this.base = new GenReset<>(fn, false); - } - @Override - public int nextInt(RandomSource random) { - return base.next(random); - } - - @Override - public void reset() { - base.reset(); - } - } - - private static class LongGenReset implements Gen.LongGen, Reset - { - private final GenReset base; - - private LongGenReset(Gen.LongGen fn) - { - this.base = new GenReset<>(fn, false); - } - @Override - public long nextLong(RandomSource random) { - return base.next(random); - } - - @Override - public void reset() { - base.reset(); - } - } - - private static class Weight implements Comparable> - { - private final T value; - private final double weight; - private final int index; - - private Weight(T value, double weight, int index) { - this.value = value; - this.weight = weight; - this.index = index; - } - - @Override - public int compareTo(Weight o) { - int rc = Double.compare(weight, o.weight); - if (rc == 0) - rc = Integer.compare(index, o.index); - return rc; - } - } -} diff --git a/test/unit/accord/utils/Invariants.java b/test/unit/accord/utils/Invariants.java deleted file mode 100644 index 2977272d4aa7..000000000000 --- a/test/unit/accord/utils/Invariants.java +++ /dev/null @@ -1,339 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package accord.utils; - -import net.nicoulaj.compilecommand.annotations.Inline; - -import javax.annotation.Nullable; -import java.util.function.Predicate; -import java.util.function.Supplier; - -import static java.lang.String.format; - -public class Invariants -{ - private static final boolean PARANOID = true; - private static final boolean DEBUG = true; - - public static boolean isParanoid() - { - return PARANOID; - } - public static boolean debug() - { - return DEBUG; - } - - public static IllegalStateException createIllegalState(String msg) - { - return new IllegalStateException(msg); - } - - public static IllegalStateException illegalState(String msg) - { - throw createIllegalState(msg); - } - - private static void illegalState() - { - illegalState(null); - } - - private static void illegalArgument(String msg) - { - throw new IllegalArgumentException(msg); - } - - - private static void illegalArgument() - { - illegalArgument(null); - } - - public static T2 checkType(T1 cast) - { - return (T2)cast; - } - - public static T2 checkType(Class to, T1 cast) - { - if (cast != null && !to.isInstance(cast)) - illegalState(); - return (T2)cast; - } - - public static T2 checkType(Class to, T1 cast, String msg) - { - if (cast != null && !to.isInstance(cast)) - illegalState(msg); - return (T2)cast; - } - - public static void paranoid(boolean condition) - { - if (PARANOID && !condition) - illegalState(); - } - - public static void checkState(boolean condition) - { - if (!condition) - illegalState(); - } - - public static void checkState(boolean condition, Supplier msg) - { - if (!condition) - throw illegalState(msg.get()); - } - - public static void checkState(boolean condition, String msg) - { - if (!condition) - illegalState(msg); - } - - public static void checkState(boolean condition, String fmt, int p1) - { - if (!condition) - illegalState(format(fmt, p1)); - } - - public static void checkState(boolean condition, String fmt, int p1, int p2) - { - if (!condition) - illegalState(format(fmt, p1, p2)); - } - - public static void checkState(boolean condition, String fmt, long p1) - { - if (!condition) - illegalState(format(fmt, p1)); - } - - public static void checkState(boolean condition, String fmt, long p1, long p2) - { - if (!condition) - illegalState(format(fmt, p1, p2)); - } - - public static void checkState(boolean condition, String fmt, @Nullable Object p1) - { - if (!condition) - illegalState(format(fmt, p1)); - } - - public static void checkState(boolean condition, String fmt, @Nullable Object p1, @Nullable Object p2) - { - if (!condition) - illegalState(format(fmt, p1, p2)); - } - - public static void checkState(boolean condition, String fmt, Object... args) - { - if (!condition) - illegalState(format(fmt, args)); - } - - public static T nonNull(T param) - { - if (param == null) - throw new NullPointerException(); - return param; - } - - public static T nonNull(T param, String fmt, Object... args) - { - if (param == null) - throw new NullPointerException(format(fmt, args)); - return param; - } - - public static int isNatural(int input) - { - if (input < 0) - illegalState(); - return input; - } - - public static long isNatural(long input) - { - if (input < 0) - illegalState(); - return input; - } - - public static void checkArgument(boolean condition) - { - if (!condition) - illegalArgument(); - } - - public static void checkArgument(boolean condition, String msg) - { - if (!condition) - illegalArgument(msg); - } - - public static void checkArgument(boolean condition, String fmt, int p1) - { - if (!condition) - illegalArgument(format(fmt, p1)); - } - - public static void checkArgument(boolean condition, String fmt, int p1, int p2) - { - if (!condition) - illegalArgument(format(fmt, p1, p2)); - } - - public static void checkArgument(boolean condition, String fmt, long p1) - { - if (!condition) - illegalArgument(format(fmt, p1)); - } - - public static void checkArgument(boolean condition, String fmt, long p1, long p2) - { - if (!condition) - illegalArgument(format(fmt, p1, p2)); - } - - public static void checkArgument(boolean condition, String fmt, @Nullable Object p1) - { - if (!condition) - illegalArgument(format(fmt, p1)); - } - - public static void checkArgument(boolean condition, String fmt, @Nullable Object p1, @Nullable Object p2) - { - if (!condition) - illegalArgument(format(fmt, p1, p2)); - } - - public static void checkArgument(boolean condition, String fmt, Object... args) - { - if (!condition) - illegalArgument(format(fmt, args)); - } - - public static T checkArgument(T param, boolean condition) - { - if (!condition) - illegalArgument(); - return param; - } - - public static T checkArgument(T param, boolean condition, String msg) - { - if (!condition) - illegalArgument(msg); - return param; - } - - public static T checkArgument(T param, boolean condition, String fmt, int p1) - { - if (!condition) - illegalArgument(format(fmt, p1)); - return param; - } - - public static T checkArgument(T param, boolean condition, String fmt, int p1, int p2) - { - if (!condition) - illegalArgument(format(fmt, p1, p2)); - return param; - } - - public static T checkArgument(T param, boolean condition, String fmt, long p1) - { - if (!condition) - illegalArgument(format(fmt, p1)); - return param; - } - - public static T checkArgument(T param, boolean condition, String fmt, long p1, long p2) - { - if (!condition) - illegalArgument(format(fmt, p1, p2)); - return param; - } - - public static T checkArgument(T param, boolean condition, String fmt, @Nullable Object p1) - { - if (!condition) - illegalArgument(format(fmt, p1)); - return param; - } - - public static T checkArgument(T param, boolean condition, String fmt, @Nullable Object p1, @Nullable Object p2) - { - if (!condition) - illegalArgument(format(fmt, p1, p2)); - return param; - } - - public static T checkArgument(T param, boolean condition, String fmt, Object... args) - { - if (!condition) - illegalArgument(format(fmt, args)); - return param; - } - - @Inline - public static T checkArgument(T param, Predicate condition) - { - if (!condition.test(param)) - illegalArgument(); - return param; - } - - @Inline - public static T checkArgument(T param, Predicate condition, String msg) - { - if (!condition.test(param)) - illegalArgument(msg); - return param; - } - - public static O cast(Object o, Class klass) - { - try - { - return klass.cast(o); - } - catch (ClassCastException e) - { - throw new IllegalArgumentException(format("Unable to cast %s to %s", o, klass.getName())); - } - } - - public static void checkIndexInBounds(int realLength, int offset, int length) - { - if (realLength == 0 || length == 0) - throw new IndexOutOfBoundsException("Unable to access offset " + offset + "; empty"); - if (offset < 0) - throw new IndexOutOfBoundsException("Offset " + offset + " must not be negative"); - if (length < 0) - throw new IndexOutOfBoundsException("Length " + length + " must not be negative"); - int endOffset = offset + length; - if (endOffset > realLength) - throw new IndexOutOfBoundsException(String.format("Offset %d, length = %d; real length was %d", offset, length, realLength)); - } -} diff --git a/test/unit/accord/utils/Property.java b/test/unit/accord/utils/Property.java deleted file mode 100644 index 79c29c5a41de..000000000000 --- a/test/unit/accord/utils/Property.java +++ /dev/null @@ -1,1074 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package accord.utils; - -import accord.utils.async.TimeoutUtils; -import org.agrona.collections.LongArrayList; - -import java.time.Duration; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collection; -import java.util.LinkedHashMap; -import java.util.LinkedHashSet; -import java.util.List; -import java.util.Map; -import java.util.Objects; -import java.util.Set; -import java.util.concurrent.ExecutionException; -import java.util.concurrent.TimeoutException; -import java.util.function.BiFunction; -import java.util.function.Consumer; -import java.util.function.Function; -import java.util.function.Predicate; -import java.util.function.Supplier; -import java.util.stream.Collectors; -import javax.annotation.Nullable; - -public class Property -{ - public static abstract class Common> - { - protected long seed = SeedProvider.instance.nextSeed(); - protected int examples = 1000; - - protected boolean pure = true; - @Nullable - protected Duration timeout = null; - - protected Common() { - } - - protected Common(Common other) { - this.seed = other.seed; - this.examples = other.examples; - this.pure = other.pure; - this.timeout = other.timeout; - } - - public T withSeed(long seed) - { - this.seed = seed; - return (T) this; - } - - public T withExamples(int examples) - { - if (examples <= 0) - throw new IllegalArgumentException("Examples must be positive"); - this.examples = examples; - return (T) this; - } - - public T withPure(boolean pure) - { - this.pure = pure; - return (T) this; - } - - public T withTimeout(Duration timeout) - { - this.timeout = timeout; - this.pure = false; - return (T) this; - } - - protected void checkWithTimeout(Runnable fn) - { - try - { - TimeoutUtils.runBlocking(timeout, "property with timeout", fn::run); - } - catch (ExecutionException e) - { - throw new PropertyError(propertyError(this, e.getCause())); - } - catch (InterruptedException e) - { - throw new PropertyError(propertyError(this, e)); - } - catch (TimeoutException e) - { - TimeoutException override = new TimeoutException("property test did not complete within " + this.timeout); - override.setStackTrace(new StackTraceElement[0]); - throw new PropertyError(propertyError(this, override)); - } - } - } - - public static class ForBuilder extends Common - { - public void check(FailingConsumer fn) - { - forAll(Gens.random()).check(fn); - } - - public SingleBuilder forAll(Gen gen) - { - return new SingleBuilder<>(gen, this); - } - - public DoubleBuilder forAll(Gen a, Gen b) - { - return new DoubleBuilder<>(a, b, this); - } - - public TrippleBuilder forAll(Gen a, Gen b, Gen c) - { - return new TrippleBuilder<>(a, b, c, this); - } - } - - private static Object normalizeValue(Object value) - { - if (value == null) - return null; - // one day java arrays will have a useful toString... one day... - if (value.getClass().isArray()) - { - Class subType = value.getClass().getComponentType(); - if (!subType.isPrimitive()) - return Arrays.asList((Object[]) value); - if (Byte.TYPE == subType) - return Arrays.toString((byte[]) value); - if (Character.TYPE == subType) - return Arrays.toString((char[]) value); - if (Short.TYPE == subType) - return Arrays.toString((short[]) value); - if (Integer.TYPE == subType) - return Arrays.toString((int[]) value); - if (Long.TYPE == subType) - return Arrays.toString((long[]) value); - if (Float.TYPE == subType) - return Arrays.toString((float[]) value); - if (Double.TYPE == subType) - return Arrays.toString((double[]) value); - } - try - { - String result = value.toString(); - if (result != null && result.length() > 100 && value instanceof Collection) - result = ((Collection) value).stream().map(o -> "\n\t " + o).collect(Collectors.joining(",", "[", "]")); - return result; - } - catch (Throwable t) - { - return "Object.toString failed: " + t.getClass().getCanonicalName() + ": " + t.getMessage(); - } - } - - private static StringBuilder propertyErrorCommon(Common input, Throwable cause) - { - StringBuilder sb = new StringBuilder(); - // return "Seed=" + seed + "\nExamples=" + examples; - sb.append("Property error detected:\nSeed = ").append(input.seed).append('\n'); - sb.append("Examples = ").append(input.examples).append('\n'); - sb.append("Pure = ").append(input.pure).append('\n'); - if (cause != null) - { - String msg = cause.getMessage(); - sb.append("Error: "); - // to improve readability, if a newline is detected move the error msg to the next line - if (msg != null && msg.contains("\n")) - msg = "\n\t" + msg.replace("\n", "\n\t"); - if (msg == null) - msg = cause.getClass().getCanonicalName(); - sb.append(msg).append('\n'); - } - return sb; - } - - private static String propertyError(Common input, Throwable cause, Object... values) - { - StringBuilder sb = propertyErrorCommon(input, cause); - if (values != null) - { - sb.append("Values:\n"); - for (int i = 0; i < values.length; i++) - sb.append('\t').append(i).append(" = ").append(normalizeValue(values[i])).append(": ").append(values[i] == null ? "unknown type" : values[i].getClass().getCanonicalName()).append('\n'); - } - return sb.toString(); - } - - private static String statefulPropertyError(StatefulBuilder input, Throwable cause, Object state, List history) - { - StringBuilder sb = propertyErrorCommon(input, cause); - sb.append("Steps: ").append(input.steps).append('\n'); - sb.append("Values:\n"); - String stateStr = state == null ? null : state.toString().replace("\n", "\n\t\t"); - sb.append("\tState: ").append(stateStr).append(": ").append(state == null ? "unknown type" : state.getClass().getCanonicalName()).append('\n'); - sb.append("\tHistory:").append('\n'); - addList(sb, "\t\t", history); - return sb.toString(); - } - - private static void addList(StringBuilder sb, String prefix, List list) - { - int idx = 0; - for (var event : list) - sb.append(prefix).append(++idx).append(": ").append(event).append('\n'); - } - - public static String formatList(String prefix, List list) - { - StringBuilder sb = new StringBuilder(); - addList(sb, prefix, list); - return sb.toString(); - } - - public interface FailingConsumer - { - void accept(A value) throws Exception; - } - - public static class SingleBuilder extends Common> - { - private final Gen gen; - - private SingleBuilder(Gen gen, Common other) { - super(other); - this.gen = Objects.requireNonNull(gen); - } - - public void check(FailingConsumer fn) - { - if (timeout != null) - { - checkWithTimeout(() -> checkInternal(fn)); - return; - } - checkInternal(fn); - } - - private void checkInternal(FailingConsumer fn) - { - RandomSource random = new DefaultRandom(seed); - for (int i = 0; i < examples; i++) - { - T value = null; - try - { - checkInterrupted(); - fn.accept(value = gen.next(random)); - } - catch (Throwable t) - { - throw new PropertyError(propertyError(this, t, value), t); - } - if (pure) - { - seed = random.nextLong(); - random.setSeed(seed); - } - } - } - } - - public interface FailingBiConsumer - { - void accept(A a, B b) throws Exception; - } - - public static class DoubleBuilder extends Common> - { - private final Gen aGen; - private final Gen bGen; - - private DoubleBuilder(Gen aGen, Gen bGen, Common other) { - super(other); - this.aGen = Objects.requireNonNull(aGen); - this.bGen = Objects.requireNonNull(bGen); - } - - public void check(FailingBiConsumer fn) - { - if (timeout != null) - { - checkWithTimeout(() -> checkInternal(fn)); - return; - } - checkInternal(fn); - } - - private void checkInternal(FailingBiConsumer fn) - { - RandomSource random = new DefaultRandom(seed); - for (int i = 0; i < examples; i++) - { - A a = null; - B b = null; - try - { - checkInterrupted(); - fn.accept(a = aGen.next(random), b = bGen.next(random)); - } - catch (Throwable t) - { - throw new PropertyError(propertyError(this, t, a, b), t); - } - if (pure) - { - seed = random.nextLong(); - random.setSeed(seed); - } - } - } - } - - public interface FailingTriConsumer - { - void accept(A a, B b, C c) throws Exception; - } - - public static class TrippleBuilder extends Common> - { - private final Gen as; - private final Gen bs; - private final Gen cs; - - public TrippleBuilder(Gen as, Gen bs, Gen cs, Common other) - { - super(other); - this.as = as; - this.bs = bs; - this.cs = cs; - } - - public void check(FailingTriConsumer fn) - { - if (timeout != null) - { - checkWithTimeout(() -> checkInternal(fn)); - return; - } - checkInternal(fn); - } - - private void checkInternal(FailingTriConsumer fn) - { - RandomSource random = new DefaultRandom(seed); - for (int i = 0; i < examples; i++) - { - A a = null; - B b = null; - C c = null; - try - { - checkInterrupted(); - fn.accept(a = as.next(random), b = bs.next(random), c = cs.next(random)); - } - catch (Throwable t) - { - throw new PropertyError(propertyError(this, t, a, b, c), t); - } - if (pure) - { - seed = random.nextLong(); - random.setSeed(seed); - } - } - } - } - - private static void checkInterrupted() throws InterruptedException - { - if (Thread.currentThread().isInterrupted()) - throw new InterruptedException(); - } - - public static class PropertyError extends AssertionError - { - public PropertyError(String message, Throwable cause) - { - super(message, cause); - } - - public PropertyError(String message) - { - super(message); - } - } - - public static ForBuilder qt() - { - return new ForBuilder(); - } - - public static StatefulBuilder stateful() - { - return new StatefulBuilder(); - } - - public static class StatefulBuilder extends Common - { - protected int steps = 1000; - @Nullable - protected Duration stepTimeout = null; - - public StatefulBuilder() - { - examples = 500; - } - - public StatefulBuilder withSteps(int steps) - { - this.steps = steps; - return this; - } - - public StatefulBuilder withStepTimeout(Duration duration) - { - stepTimeout = duration; - return this; - } - - @SuppressWarnings("rawtypes") - public void check(Commands commands) - { - RandomSource rs = new DefaultRandom(seed); - for (int i = 0; i < examples; i++) - { - State state = null; - List history = new ArrayList<>(steps); - LongArrayList historyTiming = stepTimeout == null ? null : new LongArrayList(); - try - { - checkInterrupted(); - - state = commands.genInitialState().next(rs); - SystemUnderTest sut = commands.createSut(state); - - try - { - for (int j = 0; j < steps; j++) - { - Gen> cmdGen = commands.commands(state); - Command cmd = cmdGen.next(rs); - for (int a = 0; cmd.checkPreconditions(state) != PreCheckResult.Ok && a < 42; a++) - { - if (a == 41) - throw new IllegalArgumentException("Unable to find next command"); - cmd = cmdGen.next(rs); - } - if (cmd instanceof MultistepCommand) - { - for (Command sub : ((MultistepCommand) cmd)) - { - history.add(sub.detailed(state)); - process(sub, state, sut, history.size(), historyTiming); - } - } - else - { - history.add(cmd.detailed(state)); - process(cmd, state, sut, history.size(), historyTiming); - } - } - commands.destroySut(sut, null); - commands.destroyState(state, null); - commands.onSuccess(state, sut, maybeRewriteHistory(history, historyTiming)); - } - catch (Throwable t) - { - try - { - commands.destroySut(sut, t); - commands.destroyState(state, t); - } - catch (Throwable t2) - { - t.addSuppressed(t2); - } - throw t; - } - } - catch (Throwable t) - { - - throw new PropertyError(statefulPropertyError(this, t, state, maybeRewriteHistory(history, historyTiming)), t); - } - if (pure) - { - seed = rs.nextLong(); - rs.setSeed(seed); - } - } - } - - private static List maybeRewriteHistory(List history, @Nullable LongArrayList historyTiming) - { - if (historyTiming == null) return history; - List newHistory = new ArrayList<>(history.size()); - for (int i = 0; i < history.size(); i++) - { - String step = history.get(i); - long timeNanos = historyTiming.getLong(i); - newHistory.add(step + ";\tDuration " + Duration.ofNanos(timeNanos)); - } - return newHistory; - } - - private void process(Command cmd, State state, SystemUnderTest sut, int id, @Nullable LongArrayList stepTiming) throws Throwable - { - if (stepTimeout == null) - { - cmd.process(state, sut); - return; - } - long startNanos = System.nanoTime(); - try - { - TimeoutUtils.runBlocking(stepTimeout, "Stateful Step " + id + ": " + cmd.detailed(state), () -> cmd.process(state, sut)); - } - finally - { - stepTiming.add(System.nanoTime() - startNanos); - } - } - } - - public enum PreCheckResult { Ok, Ignore } - public interface Command - { - default PreCheckResult checkPreconditions(State state) {return PreCheckResult.Ok;} - Result apply(State state) throws Throwable; - Result run(SystemUnderTest sut) throws Throwable; - default void checkPostconditions(State state, Result expected, - SystemUnderTest sut, Result actual) throws Throwable {} - default String detailed(State state) {return this.toString();} - default void process(State state, SystemUnderTest sut) throws Throwable - { - checkPostconditions(state, apply(state), - sut, run(sut)); - } - } - - public static class ForwardingCommand implements Command - { - private final Command delegate; - - public ForwardingCommand(Command delegate) - { - this.delegate = delegate; - } - - protected Command delegate() - { - return delegate; - } - - @Override - public PreCheckResult checkPreconditions(State state) - { - return delegate().checkPreconditions(state); - } - - @Override - public Result apply(State state) throws Throwable - { - return delegate().apply(state); - } - - @Override - public Result run(SystemUnderTest sut) throws Throwable - { - return delegate().run(sut); - } - - @Override - public void checkPostconditions(State state, Result expected, SystemUnderTest sut, Result actual) throws Throwable - { - delegate().checkPostconditions(state, expected, sut, actual); - } - - @Override - public String detailed(State state) - { - return delegate().detailed(state); - } - - @Override - public void process(State state, SystemUnderTest sut) throws Throwable - { - // don't call delegate here else the process function calls the delegate and not this class - Command.super.process(state, sut); - } - } - - public static MultistepCommand multistep(Command... cmds) - { - return multistep(Arrays.asList(cmds)); - } - - public static MultistepCommand multistep(List> cmds) - { - List> result = new ArrayList<>(cmds.size()); - for (Command c : cmds) - { - if (c instanceof MultistepCommand) result.addAll(flatten((MultistepCommand) c)); - else result.add(c); - } - return result::iterator; - } - - private static Collection> flatten(MultistepCommand mc) - { - List> result = new ArrayList<>(); - for (Command c : mc) - { - if (c instanceof MultistepCommand) result.addAll(flatten((MultistepCommand) c)); - else result.add(c); - } - return result; - } - - public interface MultistepCommand extends Command, Iterable> - { - @Override - default PreCheckResult checkPreconditions(State state) - { - for (Command cmd : this) - { - PreCheckResult result = cmd.checkPreconditions(state); - if (result != PreCheckResult.Ok) return result; - } - return PreCheckResult.Ok; - } - - @Override - default Object apply(State state) throws Throwable - { - throw new UnsupportedOperationException(); - } - - @Override - default Object run(SystemUnderTest sut) throws Throwable - { - throw new UnsupportedOperationException(); - } - - @Override - default void checkPostconditions(State state, Object expected, SystemUnderTest sut, Object actual) throws Throwable - { - throw new UnsupportedOperationException(); - } - - @Override - default String detailed(State state) - { - throw new UnsupportedOperationException(); - } - - @Override - default void process(State state, SystemUnderTest sut) throws Throwable - { - throw new UnsupportedOperationException(); - } - } - - public static Command ignoreCommand() - { - return new Command<>() - { - @Override - public PreCheckResult checkPreconditions(State state) - { - return PreCheckResult.Ignore; - } - - @Override - public Result apply(State state) throws Throwable - { - throw new UnsupportedOperationException(); - } - - @Override - public Result run(SystemUnderTest sut) throws Throwable - { - throw new UnsupportedOperationException(); - } - - @Override - public String detailed(State state) - { - throw new UnsupportedOperationException(); - } - }; - } - - public interface UnitCommand extends Command - { - void applyUnit(State state) throws Throwable; - void runUnit(SystemUnderTest sut) throws Throwable; - - @Override - default Void apply(State state) throws Throwable - { - applyUnit(state); - return null; - } - - @Override - default Void run(SystemUnderTest sut) throws Throwable - { - runUnit(sut); - return null; - } - } - - public interface StateOnlyCommand extends UnitCommand - { - @Override - default void runUnit(Void sut) throws Throwable {} - } - - public static class SimpleCommand implements StateOnlyCommand - { - private final Function name; - private final Consumer fn; - - public SimpleCommand(String name, Consumer fn) - { - this.name = ignore -> name; - this.fn = fn; - } - - public SimpleCommand(Function name, Consumer fn) - { - this.name = name; - this.fn = fn; - } - - @Override - public String detailed(State state) - { - return name.apply(state); - } - - @Override - public void applyUnit(State state) - { - fn.accept(state); - } - } - - public interface Commands - { - Gen genInitialState() throws Throwable; - SystemUnderTest createSut(State state) throws Throwable; - default void onSuccess(State state, SystemUnderTest sut, List history) throws Throwable {} - default void destroyState(State state, @Nullable Throwable cause) throws Throwable {} - default void destroySut(SystemUnderTest sut, @Nullable Throwable cause) throws Throwable {} - Gen> commands(State state) throws Throwable; - } - - public static CommandsBuilder commands(Supplier> stateGen, Function sutFactory) - { - return new CommandsBuilder<>(stateGen, sutFactory); - } - - public static CommandsBuilder commands(Supplier> stateGen) - { - return new CommandsBuilder<>(stateGen, ignore -> null); - } - - public interface StatefulSuccess - { - void apply(State state, SystemUnderTest sut, List history) throws Throwable; - } - - public static class CommandsBuilder - { - public interface Setup - { - Command setup(RandomSource rs, State state); - } - private final Supplier> stateGen; - private final Function sutFactory; - private final Map, Integer> knownWeights = new LinkedHashMap<>(); - @Nullable - private Set> unknownWeights = null; - @Nullable - private Map, List>> conditionalCommands = null; - private Gen.IntGen unknownWeightGen = Gens.ints().between(1, 10); - @Nullable - private FailingConsumer preCommands = null; - @Nullable - private FailingBiConsumer destroyState = null; - @Nullable - private FailingBiConsumer destroySut = null; - @Nullable - private BiFunction>, Gen>> commandsTransformer = null; - private final List> onSuccess = new ArrayList<>(); - - public CommandsBuilder(Supplier> stateGen, Function sutFactory) - { - this.stateGen = stateGen; - this.sutFactory = sutFactory; - } - - public CommandsBuilder preCommands(FailingConsumer preCommands) - { - this.preCommands = preCommands; - return this; - } - - public CommandsBuilder destroyState(FailingConsumer destroyState) - { - return destroyState((success, failure) -> { - if (failure == null) - destroyState.accept(success); - }); - } - - public CommandsBuilder destroyState(FailingBiConsumer destroyState) - { - this.destroyState = destroyState; - return this; - } - - public CommandsBuilder destroySut(FailingConsumer destroySut) - { - return destroySut((success, failure) -> { - if (failure == null) - destroySut.accept(success); - }); - } - - public CommandsBuilder destroySut(FailingBiConsumer destroySut) - { - this.destroySut = destroySut; - return this; - } - - public CommandsBuilder add(int weight, Command cmd) - { - return add(weight, (i1, i2) -> cmd); - } - - public CommandsBuilder add(int weight, Gen> cmd) - { - return add(weight, (rs, state) -> cmd.next(rs)); - } - - public CommandsBuilder add(int weight, Setup cmd) - { - knownWeights.put(cmd, weight); - return this; - } - - public CommandsBuilder add(Command cmd) - { - return add((i1, i2) -> cmd); - } - - public CommandsBuilder add(Gen> cmd) - { - return add((rs, state) -> cmd.next(rs)); - } - - public CommandsBuilder add(Setup cmd) - { - if (unknownWeights == null) - unknownWeights = new LinkedHashSet<>(); - unknownWeights.add(cmd); - return this; - } - - public CommandsBuilder addIf(Predicate predicate, Gen> cmd) - { - return addIf(predicate, (rs, state) -> cmd.next(rs)); - } - - public CommandsBuilder addIf(Predicate predicate, Command cmd) - { - return addIf(predicate, (rs, state) -> cmd); - } - - public CommandsBuilder addIf(Predicate predicate, Setup cmd) - { - if (conditionalCommands == null) - conditionalCommands = new LinkedHashMap<>(); - conditionalCommands.computeIfAbsent(predicate, i -> new ArrayList<>()).add(cmd); - return this; - } - - public CommandsBuilder addAllIf(Predicate predicate, Consumer> sub) - { - sub.accept(new IfBuilder<>() - { - @Override - public IfBuilder add(Setup cmd) - { - CommandsBuilder.this.addIf(predicate, cmd); - return this; - } - - @Override - public IfBuilder addIf(Predicate nextPredicate, Setup cmd) { - CommandsBuilder.this.addIf(predicate.and(nextPredicate), cmd); - return this; - } - }); - return this; - } - - public interface IfBuilder - { - IfBuilder add(Setup cmd); - IfBuilder addIf(Predicate predicate, Setup cmd); - } - - public CommandsBuilder unknownWeight(Gen.IntGen unknownWeightGen) - { - this.unknownWeightGen = Objects.requireNonNull(unknownWeightGen); - return this; - } - - public CommandsBuilder commandsTransformer(BiFunction>, Gen>> commandsTransformer) - { - this.commandsTransformer = commandsTransformer; - return this; - } - - public CommandsBuilder onSuccess(StatefulSuccess fn) - { - onSuccess.add(fn); - return this; - } - - public Commands build() - { - Gen> commandsGen; - if (unknownWeights == null && conditionalCommands == null) - { - commandsGen = Gens.pick(new LinkedHashMap<>(knownWeights)); - } - else - { - class DynamicWeightsGen implements Gen>, Gens.Reset - { - LinkedHashMap, Integer> weights; - LinkedHashMap, Integer> conditionalWeights; - Gen> nonConditional; - @Override - public Setup next(RandomSource rs) - { - if (weights == null) - { - // create random weights - weights = new LinkedHashMap<>(knownWeights); - if (unknownWeights != null) - { - for (Setup s : unknownWeights) - weights.put(s, unknownWeightGen.nextInt(rs)); - } - nonConditional = Gens.pick(weights); - if (conditionalCommands != null) - { - conditionalWeights = new LinkedHashMap<>(); - for (List> commands : conditionalCommands.values()) - { - for (Setup c : commands) - conditionalWeights.put(c, unknownWeightGen.nextInt(rs)); - } - } - } - if (conditionalWeights == null) return nonConditional.next(rs); - return (r, s) -> { - // need to figure out what conditions apply... - LinkedHashMap, Integer> clone = new LinkedHashMap<>(weights); - for (Map.Entry, List>> e : conditionalCommands.entrySet()) - { - if (e.getKey().test(s)) - e.getValue().forEach(c -> clone.put(c, conditionalWeights.get(c))); - } - Setup select = Gens.pick(clone).next(r); - return select.setup(r, s); - }; - } - - @Override - public void reset() - { - weights = null; - nonConditional = null; - conditionalWeights = null; - } - } - commandsGen = new DynamicWeightsGen(); - } - return new Commands<>() - { - @Override - public Gen genInitialState() throws Throwable - { - return stateGen.get(); - } - - @Override - public SystemUnderTest createSut(State state) throws Throwable - { - return sutFactory.apply(state); - } - - @Override - public Gen> commands(State state) throws Throwable - { - if (preCommands != null) - preCommands.accept(state); - Gen> map = commandsGen.map((rs, setup) -> setup.setup(rs, state)); - return commandsTransformer == null ? map : commandsTransformer.apply(state, map); - } - - @Override - public void destroyState(State state, @Nullable Throwable cause) throws Throwable - { - Gens.Reset.tryReset(commandsGen); - if (destroyState != null) - destroyState.accept(state, cause); - } - - @Override - public void destroySut(SystemUnderTest sut, @Nullable Throwable cause) throws Throwable - { - if (destroySut != null) - destroySut.accept(sut, cause); - } - - @Override - public void onSuccess(State state, SystemUnderTest sut, List history) throws Throwable - { - for (var fn : onSuccess) - fn.apply(state, sut, history); - } - }; - } - - public interface FailingConsumer - { - void accept(T value) throws Throwable; - } - - public interface FailingBiConsumer - { - void accept(A a, B b) throws Throwable; - } - } -} diff --git a/test/unit/accord/utils/RandomSource.java b/test/unit/accord/utils/RandomSource.java deleted file mode 100644 index ddba6237adb1..000000000000 --- a/test/unit/accord/utils/RandomSource.java +++ /dev/null @@ -1,424 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package accord.utils; - -import java.util.ArrayList; -import java.util.Comparator; -import java.util.EnumSet; -import java.util.LinkedHashSet; -import java.util.List; -import java.util.Random; -import java.util.Set; -import java.util.SortedSet; -import java.util.function.BooleanSupplier; -import java.util.function.IntSupplier; -import java.util.function.LongSupplier; -import java.util.function.Supplier; - -import com.google.common.collect.Iterables; - -import accord.utils.random.Picker; - -// TODO (expected): merge with C* RandomSource -public interface RandomSource -{ - static RandomSource wrap(Random random) - { - return new WrappedRandomSource(random); - } - - void nextBytes(byte[] bytes); - - boolean nextBoolean(); - default BooleanSupplier uniformBools() { return this::nextBoolean; } - default BooleanSupplier biasedUniformBools(float chance) { return () -> decide(chance); } - default Supplier biasedUniformBoolsSupplier(float minChance) - { - return () -> { - float chance = minChance + (1 - minChance)*nextFloat(); - return () -> decide(chance); - }; - } - - /** - * Returns true with a probability of {@code chance}. This is logically the same as - *

    {@code nextFloat() < chance}
    - * - * @param chance cumulative probability in range [0..1] - */ - default boolean decide(float chance) - { - return nextFloat() < chance; - } - - /** - * Returns true with a probability of {@code chance}. This is logically the same as - *
    {@code nextDouble() < chance}
    - * - * @param chance cumulative probability in range [0..1] - */ - default boolean decide(double chance) - { - return nextDouble() < chance; - } - - int nextInt(); - default int nextInt(int maxExclusive) { return nextInt(0, maxExclusive); } - default int nextInt(int minInclusive, int maxExclusive) - { - // this is diff behavior than ThreadLocalRandom, which returns nextInt - if (minInclusive >= maxExclusive) - throw new IllegalArgumentException(String.format("Min (%s) should be less than max (%d).", minInclusive, maxExclusive)); - - int result = nextInt(); - int delta = maxExclusive - minInclusive; - int mask = delta - 1; - if ((delta & mask) == 0) // power of two - result = (result & mask) + minInclusive; - else if (delta > 0) - { - // reject over-represented candidates - for (int u = result >>> 1; // ensure nonnegative - u + mask - (result = u % delta) < 0; // rejection check - u = nextInt() >>> 1) // retry - ; - result += minInclusive; - } - else - { - // range not representable as int - while (result < minInclusive || result >= maxExclusive) - result = nextInt(); - } - return result; - } - default int nextBiasedInt(int minInclusive, int median, int maxExclusive) - { - checkBiasedUniform(minInclusive, median, maxExclusive); - - int range = Math.max(maxExclusive - median, median - minInclusive) * 2; - int next = nextInt(range) - range/2; - next += median; - return next >= median ? next < maxExclusive ? next : nextInt(median, maxExclusive) - : next >= minInclusive ? next : minInclusive == median ? median : nextInt(minInclusive, median); - } - - default IntSupplier uniformInts(int minInclusive, int maxExclusive) { return () -> nextInt(minInclusive, maxExclusive); } - default IntSupplier biasedUniformInts(int minInclusive, int median, int maxExclusive) - { - checkBiasedUniform(minInclusive, median, maxExclusive); - return () -> nextBiasedInt(minInclusive, median, maxExclusive); - } - default Supplier biasedUniformIntsSupplier(int absoluteMinInclusive, int absoluteMaxExclusive, int minMedian, int maxMedian, int minRange, int maxRange) - { - return biasedUniformIntsSupplier(absoluteMinInclusive, absoluteMaxExclusive, minMedian, (minMedian+maxMedian)/2, maxMedian, minRange, (minRange+maxRange)/2, maxRange); - } - default Supplier biasedUniformIntsSupplier(int absoluteMinInclusive, int absoluteMaxExclusive, int minMedian, int medianMedian, int maxMedian, int minRange, int medianRange, int maxRange) - { - checkBiasedUniform(minMedian, medianMedian, maxMedian); - checkBiasedUniform(minRange, medianRange, maxRange); - if (minMedian < absoluteMinInclusive) - throw new IllegalArgumentException(String.format("absoluteMin (%s) should be less than or equal to minMedian (%s)", absoluteMinInclusive, minMedian)); - if (maxMedian > absoluteMaxExclusive) - throw new IllegalArgumentException(String.format("absoluteMax (%s) should be greater than or equal to maxMedian (%s)", absoluteMaxExclusive, maxMedian)); - if (minRange < 1) - throw new IllegalArgumentException(String.format("minRange (%s) should be greater than or equal to 1", minRange)); - return () -> { - int median = nextBiasedInt(minMedian, medianMedian, maxMedian); - int minInclusive = Math.max(absoluteMinInclusive, median - nextBiasedInt(minRange, medianRange, maxRange)/2); - int maxExclusive = Math.min(absoluteMaxExclusive, median + (nextBiasedInt(minRange, medianRange, maxRange)+1)/2); - return biasedUniformInts(minInclusive, median, maxExclusive); - }; - } - - long nextLong(); - default long nextLong(long maxExclusive) { return nextLong(0, maxExclusive); } - default long nextLong(long minInclusive, long maxExclusive) - { - // this is diff behavior than ThreadLocalRandom, which returns nextLong - if (minInclusive >= maxExclusive) - throw new IllegalArgumentException(String.format("Min (%s) should be less than max (%d).", minInclusive, maxExclusive)); - - long result = nextLong(); - long delta = maxExclusive - minInclusive; - long mask = delta - 1; - if ((delta & mask) == 0L) // power of two - result = (result & mask) + minInclusive; - else if (delta > 0L) - { - // reject over-represented candidates - for (long u = result >>> 1; // ensure nonnegative - u + mask - (result = u % delta) < 0L; // rejection check - u = nextLong() >>> 1) // retry - ; - result += minInclusive; - } - else - { - // range not representable as long - while (result < minInclusive || result >= maxExclusive) - result = nextLong(); - } - return result; - } - default long nextBiasedLong(long minInclusive, long median, long maxExclusive) - { - checkBiasedUniform(minInclusive, median, maxExclusive); - - long range = Math.max(maxExclusive - median, median - minInclusive) * 2; - long next = nextLong(range) - range/2; - next += median; - return next >= median ? next < maxExclusive ? next : nextLong(median, maxExclusive) - : next >= minInclusive ? next : minInclusive == median ? median : nextLong(minInclusive, median); - } - - default LongSupplier uniformLongs(long minInclusive, long maxExclusive) { return () -> nextLong(minInclusive, maxExclusive); } - default LongSupplier biasedUniformLongs(long minInclusive, long median, long maxExclusive) - { - checkBiasedUniform(minInclusive, median, maxExclusive); - return () -> nextBiasedLong(minInclusive, median, maxExclusive); - } - default Supplier biasedUniformLongsSupplier(long absoluteMinInclusive, long absoluteMaxExclusive, long minMedian, long maxMedian, long minRange, long maxRange) - { - return biasedUniformLongsSupplier(absoluteMinInclusive, absoluteMaxExclusive, minMedian, (minMedian+maxMedian)/2, maxRange, minRange, (minRange+maxRange)/2, maxRange); - } - default Supplier biasedUniformLongsSupplier(long absoluteMinInclusive, long absoluteMaxExclusive, long minMedian, long medianMedian, long maxMedian, long minRange, long medianRange, long maxRange) - { - checkBiasedUniform(minMedian, medianMedian, maxMedian); - checkBiasedUniform(minRange, medianRange, maxRange); - if (minMedian < absoluteMinInclusive) - throw new IllegalArgumentException(String.format("absoluteMin (%s) should be less than or equal to minMedian (%s)", absoluteMinInclusive, minMedian)); - if (maxMedian > absoluteMaxExclusive) - throw new IllegalArgumentException(String.format("absoluteMax (%s) should be greater than or equal to maxMedian (%s)", absoluteMaxExclusive, maxMedian)); - if (minRange < 1) - throw new IllegalArgumentException(String.format("minRange (%s) should be greater than or equal to 1", minRange)); - return () -> { - long median = nextBiasedLong(minMedian, medianMedian, maxMedian); - long minInclusive = Math.max(absoluteMinInclusive, median - nextBiasedLong(minRange, medianRange, maxRange)/2); - long maxExclusive = Math.min(absoluteMaxExclusive, median + (1+nextBiasedLong(minRange, medianRange, maxRange))/2); - return biasedUniformLongs(minInclusive, median, maxExclusive); - }; - } - - static void checkBiasedUniform(long minInclusive, long median, long maxExclusive) - { - if (minInclusive > median) - throw new IllegalArgumentException(String.format("Min (%s) should be equal to or less than median (%d).", minInclusive, median)); - if (median >= maxExclusive) - throw new IllegalArgumentException(String.format("Median (%s) should be less than max (%d).", median, maxExclusive)); - } - - float nextFloat(); - - double nextDouble(); - default double nextDouble(double maxExclusive) { return nextDouble(0, maxExclusive); } - default double nextDouble(double minInclusive, double maxExclusive) - { - if (minInclusive >= maxExclusive) - throw new IllegalArgumentException(String.format("Min (%s) should be less than max (%d).", minInclusive, maxExclusive)); - - double result = nextDouble(); - result = result * (maxExclusive - minInclusive) + minInclusive; - if (result >= maxExclusive) // correct for rounding - result = Double.longBitsToDouble(Double.doubleToLongBits(maxExclusive) - 1); - return result; - } - - double nextGaussian(); - - default int pickInt(int first, int second, int... rest) - { - int offset = nextInt(0, rest.length + 2); - switch (offset) - { - case 0: return first; - case 1: return second; - default: return rest[offset - 2]; - } - } - - default int pickInt(int[] array) - { - return pickInt(array, 0, array.length); - } - - default int pickInt(int[] array, int offset, int length) - { - Invariants.checkIndexInBounds(array.length, offset, length); - if (length == 1) - return array[offset]; - return array[nextInt(offset, offset + length)]; - } - - default long pickLong(long first, long second, long... rest) - { - int offset = nextInt(0, rest.length + 2); - switch (offset) - { - case 0: return first; - case 1: return second; - default: return rest[offset - 2]; - } - } - - default long pickLong(long[] array) - { - return pickLong(array, 0, array.length); - } - - default long pickLong(long[] array, int offset, int length) - { - Invariants.checkIndexInBounds(array.length, offset, length); - if (length == 1) - return array[offset]; - return array[nextInt(offset, offset + length)]; - } - - default T pickOrderedSet(SortedSet set) - { - int offset = nextInt(0, set.size()); - return Iterables.get(set, offset); - } - - default T pickOrderedSet(LinkedHashSet set) - { - int offset = nextInt(0, set.size()); - return Iterables.get(set, offset); - } - - default > T pickOrderedSet(EnumSet set) - { - int offset = nextInt(0, set.size()); - return Iterables.get(set, offset); - } - - default > T pickUnorderedSet(Set set) - { - if (set instanceof SortedSet) - return pickOrderedSet((SortedSet) set); - List values = new ArrayList<>(set); - // Non-ordered sets may have different iteration order on different environments, which would make a seed produce different histories! - // To avoid such a problem, make sure to apply a deterministic function (sort). - values.sort(Comparator.naturalOrder()); - return pick(values); - } - - default T pick(T first, T second, T... rest) - { - int offset = nextInt(0, rest.length + 2); - switch (offset) - { - case 0: return first; - case 1: return second; - default: return rest[offset - 2]; - } - } - - default T pick(T[] array) - { - return array[nextInt(array.length)]; - } - - default T pick(List values) - { - return pick(values, 0, values.size()); - } - - default T pick(List values, int offset, int length) - { - Invariants.checkIndexInBounds(values.size(), offset, length); - if (length == 1) - return values.get(offset); - return values.get(nextInt(offset, offset + length)); - } - - default Supplier randomWeightedPicker(T[] objects) { return Picker.WeightedObjectPicker.randomWeighted(this, objects); } - default Supplier randomWeightedPicker(T[] objects, float[] bias) { return Picker.WeightedObjectPicker.randomWeighted(this, objects, bias); } - default Supplier weightedPicker(T[] objects, float[] proportionalWeights) { return Picker.WeightedObjectPicker.weighted(this, objects, proportionalWeights); } - - void setSeed(long seed); - RandomSource fork(); - - default long reset() - { - long seed = nextLong(); - setSeed(seed); - return seed; - } - - default Random asJdkRandom() - { - return new Random(nextLong()) - { - @Override - public void setSeed(long seed) - { - RandomSource.this.setSeed(seed); - } - - @Override - public void nextBytes(byte[] bytes) - { - RandomSource.this.nextBytes(bytes); - } - - @Override - public int nextInt() - { - return RandomSource.this.nextInt(); - } - - @Override - public int nextInt(int bound) - { - return RandomSource.this.nextInt(bound); - } - - @Override - public long nextLong() - { - return RandomSource.this.nextLong(); - } - - @Override - public boolean nextBoolean() - { - return RandomSource.this.nextBoolean(); - } - - @Override - public float nextFloat() - { - return RandomSource.this.nextFloat(); - } - - @Override - public double nextDouble() - { - return RandomSource.this.nextDouble(); - } - - @Override - public double nextGaussian() - { - return RandomSource.this.nextGaussian(); - } - }; - } -} diff --git a/test/unit/accord/utils/SeedProvider.java b/test/unit/accord/utils/SeedProvider.java deleted file mode 100644 index 9c7858dafed8..000000000000 --- a/test/unit/accord/utils/SeedProvider.java +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package accord.utils; - -import java.util.concurrent.atomic.AtomicLong; - -/** - * Utility class for creating seeds. This class mostly matches the semantics of {@link java.util.Random} but makes the logic work - * for any random source. This class should be used in replacement of most seed methods, and should always replace {@link java.util.concurrent.ThreadLocalRandom} - * as that randomness will have a bias twords the same seed after a restart (if you rerun randomized tests by restarting - * the JVM you will run with the same seed over and over again). - */ -public class SeedProvider -{ - public static final SeedProvider instance = new SeedProvider(); - private final AtomicLong seedUniquifier = new AtomicLong(8682522807148012L); - - private long seedUniquifier() - { - // L'Ecuyer, "Tables of Linear Congruential Generators of - // Different Sizes and Good Lattice Structure", 1999 - for (; ; ) - { - long current = seedUniquifier.get(); - long next = current * 1181783497276652981L; - if (seedUniquifier.compareAndSet(current, next)) - return next; - } - } - - public long nextSeed() - { - return seedUniquifier() ^ System.nanoTime(); - } -} diff --git a/test/unit/accord/utils/async/TimeoutUtils.java b/test/unit/accord/utils/async/TimeoutUtils.java deleted file mode 100644 index f12c0b17e964..000000000000 --- a/test/unit/accord/utils/async/TimeoutUtils.java +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package accord.utils.async; - -import java.time.Duration; -import java.util.concurrent.ExecutionException; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.TimeoutException; - -import org.apache.cassandra.utils.concurrent.AsyncPromise; - -public class TimeoutUtils -{ - public interface FailingRunnable - { - void run() throws Throwable; - } - - public static void runBlocking(Duration timeout, String threadName, FailingRunnable fn) throws ExecutionException, InterruptedException, TimeoutException - { - // MAINTENANCE: Once the accord branch merges to trunk this can be dropped and will be AsyncChain again, but since this is forked into C* (that doesn't have AsyncChain) need to use Futures -// AsyncResult.Settable promise = AsyncResults.settable(); - AsyncPromise promise = new AsyncPromise<>(); - Thread t = new Thread(() -> { - try - { - fn.run(); - promise.setSuccess(null); - } - catch (Throwable e) - { - promise.setFailure(e); - } - }); - t.setName(threadName); - t.setDaemon(true); - t.start(); - try - { -// AsyncChains.getBlocking(promise, timeout.toNanos(), TimeUnit.NANOSECONDS); - promise.get(timeout.toNanos(), TimeUnit.NANOSECONDS); - } - catch (InterruptedException e) - { - t.interrupt(); - throw e; - } - catch (TimeoutException e) - { - t.interrupt(); - throw e; - } - } -} diff --git a/test/unit/accord/utils/random/Picker.java b/test/unit/accord/utils/random/Picker.java deleted file mode 100644 index f12369d35fca..000000000000 --- a/test/unit/accord/utils/random/Picker.java +++ /dev/null @@ -1,118 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package accord.utils.random; - -import java.util.Arrays; -import java.util.function.Supplier; - -import accord.utils.Invariants; -import accord.utils.RandomSource; - -public class Picker -{ - public static float[] randomWeights(RandomSource random, int length) - { - float[] weights = new float[length - 1]; - float sum = 0; - for (int i = 0 ; i < weights.length ; ++i) - weights[i] = sum += random.nextFloat(); - sum += random.nextFloat(); - for (int i = 0 ; i < weights.length ; ++i) - weights[i] /= sum; - return weights; - } - - static abstract class Weighted - { - final RandomSource random; - final float[] weights; - - public Weighted(RandomSource random, float[] weights) - { - this.random = random; - this.weights = weights; - } - - - static float[] randomWeights(RandomSource random, float[] bias) - { - float[] weights = new float[bias.length - 1]; - float sum = 0; - for (int i = 0 ; i < weights.length ; ++i) - weights[i] = sum += random.nextFloat() * bias[i]; - sum += random.nextFloat() * bias[weights.length]; - for (int i = 0 ; i < weights.length ; ++i) - weights[i] /= sum; - return weights; - } - - static float[] normaliseWeights(float[] input) - { - float[] output = new float[input.length - 1]; - float sum = 0; - for (int i = 0 ; i < output.length ; ++i) - output[i] = sum += input[i]; - sum += input[output.length]; - for (int i = 0 ; i < output.length ; ++i) - output[i] /= sum; - return output; - } - - int pickIndex() - { - int i = Arrays.binarySearch(weights, random.nextFloat()); - if (i < 0) i = -1 - i; - return i; - } - } - - public static class WeightedObjectPicker extends Weighted implements Supplier - { - final T[] values; - - private WeightedObjectPicker(RandomSource random, T[] values, float[] weights) - { - super(random, weights); - this.values = values; - } - - @Override - public T get() - { - return values[pickIndex()]; - } - - public static WeightedObjectPicker randomWeighted(RandomSource random, T[] values) - { - return new WeightedObjectPicker<>(random, values, Picker.randomWeights(random, values.length)); - } - - public static WeightedObjectPicker randomWeighted(RandomSource random, T[] values, float[] bias) - { - Invariants.checkArgument(values.length == bias.length); - return new WeightedObjectPicker<>(random, values, randomWeights(random, bias)); - } - - public static WeightedObjectPicker weighted(RandomSource random, T[] values, float[] proportionalWeights) - { - Invariants.checkArgument(values.length == proportionalWeights.length); - return new WeightedObjectPicker<>(random, values, normaliseWeights(proportionalWeights)); - } - } -} diff --git a/test/unit/org/apache/cassandra/CassandraTestBase.java b/test/unit/org/apache/cassandra/CassandraTestBase.java new file mode 100644 index 000000000000..bfb7bc8c4123 --- /dev/null +++ b/test/unit/org/apache/cassandra/CassandraTestBase.java @@ -0,0 +1,261 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra; + +import java.lang.annotation.Annotation; +import java.lang.annotation.Retention; +import java.lang.annotation.RetentionPolicy; +import java.lang.reflect.Method; + +import org.junit.After; +import org.junit.AfterClass; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.rules.TestName; +import org.junit.rules.TestWatcher; +import org.junit.runner.Description; + +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.dht.ByteOrderedPartitioner; +import org.apache.cassandra.dht.LengthPartitioner; +import org.apache.cassandra.dht.Murmur3Partitioner; +import org.apache.cassandra.dht.OrderPreservingPartitioner; +import org.apache.cassandra.dht.RandomPartitioner; +import org.apache.cassandra.distributed.test.log.ClusterMetadataTestHelper; +import org.apache.cassandra.service.StorageService; +import org.apache.cassandra.tcm.ClusterMetadataService; + +import static org.apache.cassandra.config.CassandraRelevantProperties.ORG_APACHE_CASSANDRA_DISABLE_MBEAN_REGISTRATION; +import static org.junit.Assert.assertTrue; + +/* + * Many tests declare their own test base and duplicate functionality + * Hopefully this can serve as a place to put common initialization patterns and annotations + * So people have fewer problems to solve when authoring tests. + */ +public class CassandraTestBase +{ + @Retention(RetentionPolicy.RUNTIME) + public @interface UseMurmur3Partitioner {} + + @Retention(RetentionPolicy.RUNTIME) + public @interface UseRandomPartitioner {} + + @Retention(RetentionPolicy.RUNTIME) + public @interface UseOrderPreservingPartitioner {} + + @Retention(RetentionPolicy.RUNTIME) + public @interface UseLengthPartitioner {} + + @Retention(RetentionPolicy.RUNTIME) + public @interface UseByteOrderedPartitioner {} + + @Retention(RetentionPolicy.RUNTIME) + public @interface DDDaemonInitialization {} + + @Retention(RetentionPolicy.RUNTIME) + public @interface SchemaLoaderPrepareServer {} + + @Retention(RetentionPolicy.RUNTIME) + public @interface SchemaLoaderLoadSchema {} + + @Retention(RetentionPolicy.RUNTIME) + public @interface PrepareServerNoRegister {} + + @Retention(RetentionPolicy.RUNTIME) + public @interface DisableMBeanRegistration {} + + private static boolean classResetStorageServicePartitioner; + + private static Boolean oldMBeanRegistrationValue; + + @BeforeClass + public static void cassandraTestBaseBeforeClass() + { + if (hasClassAnnotation(DisableMBeanRegistration.class)) + { + oldMBeanRegistrationValue = ORG_APACHE_CASSANDRA_DISABLE_MBEAN_REGISTRATION.getBoolean(); + ORG_APACHE_CASSANDRA_DISABLE_MBEAN_REGISTRATION.setBoolean(true); + } + + if (hasClassAnnotation(DDDaemonInitialization.class)) + DatabaseDescriptor.daemonInitialization(); + else if (hasClassAnnotation(SchemaLoaderPrepareServer.class)) + SchemaLoader.prepareServer(); + else if (hasClassAnnotation(SchemaLoaderLoadSchema.class)) + SchemaLoader.loadSchema(); + else if (hasClassAnnotation(PrepareServerNoRegister.class)) + ServerTestUtils.daemonInitialization(); + + int partitionerAnnotationCount = 0; + if (hasClassAnnotation(UseMurmur3Partitioner.class)) + { + partitionerAnnotationCount++; + classResetStorageServicePartitioner = true; + StorageService.instance.setPartitionerUnsafe(Murmur3Partitioner.instance); + } + if (hasClassAnnotation(UseRandomPartitioner.class)) + { + partitionerAnnotationCount++; + classResetStorageServicePartitioner = true; + StorageService.instance.setPartitionerUnsafe(RandomPartitioner.instance); + } + if (hasClassAnnotation(UseOrderPreservingPartitioner.class)) + { + partitionerAnnotationCount++; + classResetStorageServicePartitioner = true; + StorageService.instance.setPartitionerUnsafe(OrderPreservingPartitioner.instance); + } + if (hasClassAnnotation(UseLengthPartitioner.class)) + { + partitionerAnnotationCount++; + classResetStorageServicePartitioner = true; + StorageService.instance.setPartitionerUnsafe(LengthPartitioner.instance); + } + if (hasClassAnnotation(UseByteOrderedPartitioner.class)) + { + partitionerAnnotationCount++; + classResetStorageServicePartitioner = true; + StorageService.instance.setPartitionerUnsafe(ByteOrderedPartitioner.instance); + } + assertTrue("At most one partitioner should be annotated", partitionerAnnotationCount <= 1); + + if (hasClassAnnotation(PrepareServerNoRegister.class)) + ServerTestUtils.prepareServerNoRegister(); + } + + @AfterClass + public static void cassandraTestBaseAfterClass() + { + if (oldMBeanRegistrationValue != null) + { + ORG_APACHE_CASSANDRA_DISABLE_MBEAN_REGISTRATION.setBoolean(oldMBeanRegistrationValue); + oldMBeanRegistrationValue = null; + } + + if (classResetStorageServicePartitioner) + { + StorageService.instance.resetPartitionerUnsafe(); + classResetStorageServicePartitioner = false; + } + } + + public static boolean hasClassAnnotation(Class annotation) + { + return hasClassAnnotation(testClass, annotation); + } + + public static boolean hasClassAnnotation(Class clazz, Class annotation) + { + if (clazz == null) + return false; + if (clazz.getAnnotation(annotation) != null) + return true; + return hasClassAnnotation(clazz.getSuperclass(), annotation); + } + + private static Class testClass; + + @ClassRule + public static TestWatcher classWatcher = new TestWatcher() + { + @Override + public void starting(Description description) + { + testClass = description.getTestClass(); + } + }; + + @Rule + public TestName testMethodName = new TestName(); + public Method testMethod; + + private boolean testResetPartitioner; + + ClusterMetadataService toRestore; + + @Before + public void cassandraTestBaseSetUp() throws Exception + { + testMethod = testClass.getMethod(testMethodName.getMethodName()); + int partitionerAnnotationCount = 0; + if (hasMethodAnnotation(UseMurmur3Partitioner.class)) + { + partitionerAnnotationCount++; + testResetPartitioner = true; + StorageService.instance.setPartitionerUnsafe(Murmur3Partitioner.instance); + } + if (hasMethodAnnotation(UseRandomPartitioner.class)) + { + partitionerAnnotationCount++; + testResetPartitioner = true; + StorageService.instance.setPartitionerUnsafe(RandomPartitioner.instance); + } + if (hasMethodAnnotation(UseOrderPreservingPartitioner.class)) + { + partitionerAnnotationCount++; + testResetPartitioner = true; + StorageService.instance.setPartitionerUnsafe(OrderPreservingPartitioner.instance); + } + if (hasMethodAnnotation(UseLengthPartitioner.class)) + { + partitionerAnnotationCount++; + testResetPartitioner = true; + StorageService.instance.setPartitionerUnsafe(LengthPartitioner.instance); + } + if (hasMethodAnnotation(UseByteOrderedPartitioner.class)) + { + partitionerAnnotationCount++; + testResetPartitioner = true; + StorageService.instance.setPartitionerUnsafe(ByteOrderedPartitioner.instance); + } + + if (testResetPartitioner) + { + toRestore = ClusterMetadataService.unsetInstance(); + ClusterMetadataService withNewPartitioner = ClusterMetadataTestHelper.instanceForTest(); + ClusterMetadataService.setInstance(withNewPartitioner); + } + assertTrue("At most one partitioner should be annotated", partitionerAnnotationCount <= 1); + } + + private boolean hasMethodAnnotation(Class annotation) + { + return testMethod.getAnnotation(annotation) != null; + } + + @After + public void cassandraTestBaseTearDown() + { + if (testResetPartitioner) + { + StorageService.instance.resetPartitionerUnsafe(); + testResetPartitioner = false; + ClusterMetadataService.unsetInstance(); + + if (toRestore != null) + { + ClusterMetadataService.setInstance(toRestore); + toRestore = null; + } + } + } +} diff --git a/test/unit/org/apache/cassandra/SchemaLoader.java b/test/unit/org/apache/cassandra/SchemaLoader.java index 73ef7285b389..53f98859e587 100644 --- a/test/unit/org/apache/cassandra/SchemaLoader.java +++ b/test/unit/org/apache/cassandra/SchemaLoader.java @@ -285,6 +285,7 @@ public static ColumnMetadata integerColumn(String ksName, String cfName) cfName, ColumnIdentifier.getInterned(IntegerType.instance.fromString("42"), IntegerType.instance), UTF8Type.instance, + ColumnMetadata.NO_UNIQUE_ID, ColumnMetadata.NO_POSITION, ColumnMetadata.Kind.REGULAR, null); @@ -296,6 +297,7 @@ public static ColumnMetadata utf8Column(String ksName, String cfName) cfName, ColumnIdentifier.getInterned("fortytwo", true), UTF8Type.instance, + ColumnMetadata.NO_UNIQUE_ID, ColumnMetadata.NO_POSITION, ColumnMetadata.Kind.REGULAR, null); @@ -303,7 +305,7 @@ public static ColumnMetadata utf8Column(String ksName, String cfName) public static TableMetadata perRowIndexedCFMD(String ksName, String cfName) { - ColumnMetadata indexedColumn = ColumnMetadata.regularColumn(ksName, cfName, "indexed", AsciiType.instance); + ColumnMetadata indexedColumn = ColumnMetadata.regularColumn(ksName, cfName, "indexed", AsciiType.instance, ColumnMetadata.NO_UNIQUE_ID); TableMetadata.Builder builder = TableMetadata.builder(ksName, cfName) diff --git a/test/unit/org/apache/cassandra/ServerTestUtils.java b/test/unit/org/apache/cassandra/ServerTestUtils.java index ead4a1a558cb..702e2b687100 100644 --- a/test/unit/org/apache/cassandra/ServerTestUtils.java +++ b/test/unit/org/apache/cassandra/ServerTestUtils.java @@ -21,15 +21,16 @@ import java.net.UnknownHostException; import java.util.Arrays; import java.util.Collections; +import java.util.Comparator; import java.util.HashSet; import java.util.List; import java.util.Set; -import java.util.function.Function; import java.util.stream.Collectors; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import accord.impl.basic.SimulatedFault; import org.apache.cassandra.audit.AuditLogManager; import org.apache.cassandra.config.CassandraRelevantProperties; import org.apache.cassandra.config.DatabaseDescriptor; @@ -44,6 +45,7 @@ import org.apache.cassandra.io.sstable.format.big.BigTableReader; import org.apache.cassandra.io.sstable.indexsummary.IndexSummarySupport; import org.apache.cassandra.io.util.File; +import org.apache.cassandra.locator.Endpoint; import org.apache.cassandra.locator.InetAddressAndPort; import org.apache.cassandra.locator.Replica; import org.apache.cassandra.locator.BaseProximity; @@ -70,6 +72,7 @@ import org.apache.cassandra.tcm.transformations.UnsafeJoin; import org.apache.cassandra.tcm.transformations.cms.Initialize; import org.apache.cassandra.utils.FBUtilities; +import org.apache.cassandra.utils.Sortable; import static org.apache.cassandra.config.CassandraRelevantProperties.ORG_APACHE_CASSANDRA_DISABLE_MBEAN_REGISTRATION; @@ -108,6 +111,18 @@ public int compareEndpoints(InetAddressAndPort target, Replica a1, Replica a2) { return 0; } + + @Override + public boolean supportCompareByEndpoint() + { + return true; + } + + @Override + public > Comparator endpointComparator(InetAddressAndPort address, C addresses) + { + return (a, b) -> 0; + } }); } @@ -168,7 +183,8 @@ public static void prepareServerNoRegister() { public void uncaughtException(Thread t, Throwable e) { - logger.error("Fatal exception in thread " + t, e); + if (e instanceof SimulatedFault) logger.error("SimulatedFault {} in thread {}", e.getMessage(), t); + else logger.error("Fatal exception in thread " + t, e); } }); @@ -208,6 +224,7 @@ public static void cleanup() if (cdcDir != null) cleanupDirectory(cdcDir); cleanupDirectory(DatabaseDescriptor.getHintsDirectory()); + cleanupDirectory(DatabaseDescriptor.getAccordJournalDirectory()); cleanupSavedCaches(); // clean up data directory which are stored as data directory/keyspace/data files @@ -221,11 +238,11 @@ private static void cleanupDirectory(File directory) { if (directory.exists()) { - Arrays.stream(directory.tryList()).forEach(File::deleteRecursive); + Arrays.stream(directory.tryList()).forEach(File::tryDeleteRecursive); } } - private static void cleanupDirectory(String dirName) + public static void cleanupDirectory(String dirName) { if (dirName != null) cleanupDirectory(new File(dirName)); @@ -263,7 +280,6 @@ public static void initCMS() // log entries is always done by the dedicated log follower thread. DatabaseDescriptor.setMetadataSnapshotFrequency(Integer.MAX_VALUE); - Function processorFactory = AtomicLongBackedProcessor::new; IPartitioner partitioner = DatabaseDescriptor.getPartitioner(); Location location = DatabaseDescriptor.getLocator().local(); boolean addListeners = true; @@ -271,15 +287,17 @@ public static void initCMS() if (!Keyspace.isInitialized()) Keyspace.setInitialized(); + AtomicLongBackedProcessor.InMemoryStorage storage = new AtomicLongBackedProcessor.InMemoryStorage(); LocalLog log = LocalLog.logSpec() .withInitialState(initial) .withDefaultListeners(addListeners) + .withStorage(storage) .createLog(); ResettableClusterMetadataService service = new ResettableClusterMetadataService(new UniformRangePlacement(), MetadataSnapshots.NO_OP, log, - processorFactory.apply(log), + new AtomicLongBackedProcessor(log), Commit.Replicator.NO_OP, true); diff --git a/test/unit/org/apache/cassandra/Util.java b/test/unit/org/apache/cassandra/Util.java index 590f7e356f12..eb41f5b864d6 100644 --- a/test/unit/org/apache/cassandra/Util.java +++ b/test/unit/org/apache/cassandra/Util.java @@ -1,4 +1,3 @@ -package org.apache.cassandra; /* * * Licensed to the Apache Software Foundation (ASF) under one @@ -18,6 +17,7 @@ * under the License. * */ +package org.apache.cassandra; import java.io.Closeable; import java.io.DataInputStream; @@ -25,6 +25,7 @@ import java.io.IOError; import java.io.IOException; import java.io.InputStream; +import java.lang.reflect.Field; import java.math.BigInteger; import java.net.UnknownHostException; import java.nio.ByteBuffer; @@ -56,16 +57,13 @@ import com.google.common.collect.Iterables; import com.google.common.collect.Iterators; -import org.apache.cassandra.distributed.test.log.ClusterMetadataTestHelper; -import org.apache.cassandra.gms.ApplicationState; -import org.apache.cassandra.gms.Gossiper; -import org.apache.cassandra.gms.VersionedValue; -import org.apache.cassandra.io.util.File; import org.apache.commons.lang3.StringUtils; import org.junit.Assume; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import accord.utils.Invariants; +import org.apache.cassandra.config.Config; import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.cql3.ColumnIdentifier; import org.apache.cassandra.db.AbstractReadCommandBuilder; @@ -84,18 +82,13 @@ import org.apache.cassandra.db.PartitionRangeReadCommand; import org.apache.cassandra.db.ReadCommand; import org.apache.cassandra.db.ReadExecutionController; +import org.apache.cassandra.db.SinglePartitionReadCommand; import org.apache.cassandra.db.compaction.AbstractCompactionTask; import org.apache.cassandra.db.compaction.ActiveCompactionsTracker; import org.apache.cassandra.db.compaction.CompactionManager; import org.apache.cassandra.db.compaction.CompactionTasks; import org.apache.cassandra.db.compaction.OperationType; import org.apache.cassandra.db.lifecycle.LifecycleTransaction; -import org.apache.cassandra.locator.InetAddressAndPort; -import org.apache.cassandra.locator.ReplicaCollection; -import org.apache.cassandra.net.MessagingService; -import org.apache.cassandra.schema.ColumnMetadata; -import org.apache.cassandra.schema.TableId; -import org.apache.cassandra.schema.TableMetadata; import org.apache.cassandra.db.marshal.AbstractType; import org.apache.cassandra.db.marshal.AsciiType; import org.apache.cassandra.db.marshal.Int32Type; @@ -121,6 +114,10 @@ import org.apache.cassandra.dht.RandomPartitioner.BigIntegerToken; import org.apache.cassandra.dht.Range; import org.apache.cassandra.dht.Token; +import org.apache.cassandra.distributed.test.log.ClusterMetadataTestHelper; +import org.apache.cassandra.gms.ApplicationState; +import org.apache.cassandra.gms.Gossiper; +import org.apache.cassandra.gms.VersionedValue; import org.apache.cassandra.index.internal.CassandraIndex; import org.apache.cassandra.io.sstable.Descriptor; import org.apache.cassandra.io.sstable.SSTableId; @@ -130,8 +127,16 @@ import org.apache.cassandra.io.sstable.format.SSTableReader; import org.apache.cassandra.io.sstable.format.SSTableReaderWithFilter; import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.File; +import org.apache.cassandra.locator.InetAddressAndPort; import org.apache.cassandra.locator.Replica; +import org.apache.cassandra.locator.ReplicaCollection; +import org.apache.cassandra.net.MessagingService; +import org.apache.cassandra.repair.autorepair.AutoRepairConfig; +import org.apache.cassandra.schema.ColumnMetadata; import org.apache.cassandra.schema.Schema; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.schema.TableMetadata; import org.apache.cassandra.schema.TableMetadataRef; import org.apache.cassandra.service.StorageService; import org.apache.cassandra.service.pager.PagingState; @@ -152,10 +157,12 @@ import org.apache.cassandra.utils.FilterFactory; import org.apache.cassandra.utils.OutputHandler; import org.awaitility.Awaitility; +import org.awaitility.core.ThrowingRunnable; import org.hamcrest.Matcher; import org.mockito.Mockito; import org.mockito.internal.stubbing.defaultanswers.ForwardsInvocations; +import static com.google.common.base.Preconditions.checkState; import static org.hamcrest.MatcherAssert.assertThat; import static org.hamcrest.Matchers.equalTo; import static org.junit.Assert.assertEquals; @@ -177,6 +184,11 @@ public static DecoratedKey dk(String key) return testPartitioner().decorateKey(ByteBufferUtil.bytes(key)); } + public static DecoratedKey dk(int key) + { + return dk(String.valueOf(key), Int32Type.instance); + } + public static DecoratedKey dk(String key, AbstractType type) { return testPartitioner().decorateKey(type.fromString(key)); @@ -230,7 +242,7 @@ public static Iterable once(final Iterator source) private AtomicBoolean exhausted = new AtomicBoolean(); public Iterator iterator() { - Preconditions.checkState(!exhausted.getAndSet(true)); + checkState(!exhausted.getAndSet(true)); return source; } }; @@ -373,7 +385,8 @@ public static void expectException(Callable callable, Class exception) } catch (Throwable e) { - assert e.getClass().equals(exception) : e.getClass().getName() + " is not " + exception.getName(); + // Use name because in-jvm dtests will have different instances of the class + Invariants.require(e.getClass().getName().equals(exception.getName()), e.getClass().getName() + " is not " + exception.getName()); thrown = true; } @@ -699,22 +712,34 @@ public static void assumeLegacySecondaryIndex() public static class PartitionerSwitcher implements AutoCloseable { - final IPartitioner oldP; final IPartitioner newP; + boolean closed; + public PartitionerSwitcher(IPartitioner partitioner) { newP = partitioner; - oldP = StorageService.instance.setPartitionerUnsafe(partitioner); + StorageService.instance.setPartitionerUnsafe(partitioner); } public void close() { - IPartitioner p = StorageService.instance.setPartitionerUnsafe(oldP); - assert p == newP; + checkState(!closed, "Already reset"); + closed = true; + StorageService.instance.resetPartitionerUnsafe(); } } + public static void spinAssertEquals(int expected, Supplier actualSupplier) + { + spinAssertEquals((long)expected, () -> ((Number)actualSupplier.get()).longValue()); + } + + public static void spinAssertEquals(Object expected, Supplier actualSupplier) + { + spinAssertEquals(null, expected, actualSupplier, 10, TimeUnit.SECONDS); + } + public static void spinAssertEquals(Object expected, Supplier actualSupplier, int timeoutInSeconds) { spinAssertEquals(null, expected, actualSupplier, timeoutInSeconds, TimeUnit.SECONDS); @@ -734,6 +759,55 @@ public static void spinAssert(String message, Matcher matcher, Supplier assertThat(message, actualSupplier.get(), matcher)); } + public static void spinAssertEquals(Object expected, int timeoutInSeconds, Callable call) + { + spinAssertEquals(null, expected, timeoutInSeconds, TimeUnit.SECONDS, call); + } + + public static void spinAssertEquals(String message, T expected, long timeout, TimeUnit timeUnit, Callable call) + { + Awaitility.await() + .pollInterval(Duration.ofMillis(100)) + .pollDelay(0, TimeUnit.MILLISECONDS) + .atMost(timeout, timeUnit) + .untilAsserted(() -> assertThat(message, call.call(), equalTo(expected))); + } + + public static void spinUntilTrue(Callable test) + { + spinUntilTrue(test, 10, TimeUnit.SECONDS); + } + + public static void spinUntilTrue(Callable test, long timeoutInSeconds) + { + spinUntilTrue(test, timeoutInSeconds, TimeUnit.SECONDS); + } + + public static void spinUntilTrue(Callable test, long timeout, TimeUnit unit) + { + Awaitility.await() + .pollInterval(Duration.ofMillis(100)) + .pollDelay(0, TimeUnit.MILLISECONDS) + .atMost(timeout, unit) + .ignoreExceptions() + .untilAsserted(() -> assertThat(test.call(), equalTo(true))); + } + + public static void spinUntilSuccess(ThrowingRunnable runnable) + { + spinUntilSuccess(runnable, 10); + } + + public static void spinUntilSuccess(ThrowingRunnable runnable, int timeoutInSeconds) + { + Awaitility.await() + .pollInterval(Duration.ofMillis(100)) + .pollDelay(0, TimeUnit.MILLISECONDS) + .atMost(timeoutInSeconds, TimeUnit.SECONDS) + .ignoreExceptions() + .untilAsserted(runnable); + } + public static void joinThread(Thread thread) throws InterruptedException { thread.join(10000); @@ -832,6 +906,13 @@ public static UnfilteredPartitionIterator executeLocally(PartitionRangeReadComma return command.queryStorage(cfs, controller); } + public static UnfilteredPartitionIterator executeLocally(SinglePartitionReadCommand command, + ColumnFamilyStore cfs, + ReadExecutionController controller) + { + return command.queryStorage(cfs, controller); + } + public static Closeable markDirectoriesUnwriteable(ColumnFamilyStore cfs) { try @@ -1281,4 +1362,13 @@ public static Map listSnapshots(ColumnFamilyStore cfs) return tagSnapshotsMap; } + // Replaces the global auto-repair config with a new config where auto-repair schedulling is enabled/disabled + public static void setAutoRepairEnabled(boolean enabled) throws Exception + { + Config config = DatabaseDescriptor.getRawConfig(); + config.auto_repair = new AutoRepairConfig(enabled); + Field configField = DatabaseDescriptor.class.getDeclaredField("conf"); + configField.setAccessible(true); + configField.set(null, config); + } } diff --git a/test/unit/org/apache/cassandra/audit/AuditLoggerTest.java b/test/unit/org/apache/cassandra/audit/AuditLoggerTest.java index f97e0bf5cef2..ee0ad12bcff0 100644 --- a/test/unit/org/apache/cassandra/audit/AuditLoggerTest.java +++ b/test/unit/org/apache/cassandra/audit/AuditLoggerTest.java @@ -34,6 +34,7 @@ import javax.management.remote.JMXConnectorFactory; import javax.management.remote.JMXServiceURL; +import org.assertj.core.api.Assertions; import org.junit.After; import org.junit.Assert; import org.junit.Before; @@ -64,7 +65,6 @@ import org.apache.cassandra.exceptions.ConfigurationException; import org.apache.cassandra.service.StorageService; import org.apache.cassandra.utils.JMXServerUtils; -import org.assertj.core.api.Assertions; import static org.apache.cassandra.config.CassandraRelevantProperties.CASSANDRA_JMX_AUTHORIZER; import static org.apache.cassandra.config.CassandraRelevantProperties.CASSANDRA_JMX_LOCAL_PORT; @@ -439,6 +439,26 @@ public void testCqlBatch_MultipleTablesAuditing() assertEquals(0, size); } + @Test + public void testTransactionAuditing() + { + createTable("CREATE TABLE %s (key int PRIMARY KEY, val int) WITH transactional_mode='full'"); + + Session session = sessionNet(); + String fqTableName = KEYSPACE + "." + currentTable(); + String query = "BEGIN TRANSACTION\n" + + " LET a = (SELECT * FROM " + fqTableName + " WHERE key = 0);\n" + + " SELECT a.val;\n" + + " IF a IS NULL THEN\n" + + " INSERT INTO " + fqTableName + " (key, val) VALUES (0, 0);\n" + + " END IF\n" + + "COMMIT TRANSACTION"; + + session.execute(query); + AuditLogEntry logEntry = ((InMemoryAuditLogger) AuditLogManager.instance.getLogger()).inMemQueue.poll(); + assertLogEntry(query, AuditLogEntryType.TRANSACTION, logEntry, true, null); + } + @Test public void testCqlKeyspaceAuditing() throws Throwable { diff --git a/test/unit/org/apache/cassandra/auth/AllowAllCIDRAuthorizerTest.java b/test/unit/org/apache/cassandra/auth/AllowAllCIDRAuthorizerTest.java index fb7a57a3521b..4f7637d9c371 100644 --- a/test/unit/org/apache/cassandra/auth/AllowAllCIDRAuthorizerTest.java +++ b/test/unit/org/apache/cassandra/auth/AllowAllCIDRAuthorizerTest.java @@ -47,16 +47,6 @@ */ public class AllowAllCIDRAuthorizerTest extends CQLTester { - private static void setupSuperUser() - { - QueryProcessor.executeInternal(String.format("INSERT INTO %s.%s (role, is_superuser, can_login, salted_hash) " - + "VALUES ('%s', true, true, '%s')", - AUTH_KEYSPACE_NAME, - AuthKeyspace.ROLES, - CassandraRoleManager.DEFAULT_SUPERUSER_NAME, - "xxx")); - } - @BeforeClass public static void defineSchema() throws ConfigurationException { @@ -65,7 +55,7 @@ public static void defineSchema() throws ConfigurationException new AuthTestUtils.LocalCassandraAuthorizer(), new AuthTestUtils.LocalCassandraNetworkAuthorizer(), new AuthTestUtils.LocalAllowAllCIDRAuthorizer()); - setupSuperUser(); + AuthTestUtils.setupSuperUser(); } @Before diff --git a/test/unit/org/apache/cassandra/auth/AuthConfigTest.java b/test/unit/org/apache/cassandra/auth/AuthConfigTest.java index b9bde913be05..580c48eb9f8f 100644 --- a/test/unit/org/apache/cassandra/auth/AuthConfigTest.java +++ b/test/unit/org/apache/cassandra/auth/AuthConfigTest.java @@ -30,6 +30,7 @@ import org.apache.cassandra.config.Config; import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.config.EncryptionOptions.ServerEncryptionOptions.Builder; import org.apache.cassandra.config.ParameterizedClass; import org.apache.cassandra.locator.InetAddressAndPort; import org.apache.cassandra.utils.MBeanWrapper; @@ -52,8 +53,10 @@ public void testNewInstanceForMutualTlsInternodeAuthenticator() throws IOExcepti Config config = load("cassandra-mtls.yaml"); config.internode_authenticator.class_name = "org.apache.cassandra.auth.MutualTlsInternodeAuthenticator"; config.internode_authenticator.parameters = Collections.singletonMap("validator_class_name", "org.apache.cassandra.auth.SpiffeCertificateValidator"); - config.server_encryption_options = config.server_encryption_options.withOutboundKeystore("test/conf/cassandra_ssl_test_outbound.keystore") - .withOutboundKeystorePassword("cassandra"); + config.server_encryption_options = new Builder(config.server_encryption_options) + .withOutboundKeystore("test/conf/cassandra_ssl_test_outbound.keystore") + .withOutboundKeystorePassword("cassandra") + .build(); DatabaseDescriptor.setConfig(config); MutualTlsInternodeAuthenticator authenticator = ParameterizedClass.newInstance(config.internode_authenticator, Arrays.asList("", "org.apache.cassandra.auth.")); diff --git a/test/unit/org/apache/cassandra/auth/AuthTestUtils.java b/test/unit/org/apache/cassandra/auth/AuthTestUtils.java index ce5a8284f8f5..228c93b3969a 100644 --- a/test/unit/org/apache/cassandra/auth/AuthTestUtils.java +++ b/test/unit/org/apache/cassandra/auth/AuthTestUtils.java @@ -35,6 +35,7 @@ import com.google.common.base.Charsets; import com.google.common.collect.ImmutableMap; +import org.apache.commons.lang3.RandomStringUtils; import org.apache.cassandra.auth.jmx.AuthorizationProxy; import org.apache.cassandra.config.DatabaseDescriptor; @@ -45,6 +46,7 @@ import org.apache.cassandra.cql3.UntypedResultSet; import org.apache.cassandra.cql3.statements.AlterRoleStatement; import org.apache.cassandra.cql3.statements.AuthenticationStatement; +import org.apache.cassandra.cql3.statements.AuthorizationStatement; import org.apache.cassandra.cql3.statements.BatchStatement; import org.apache.cassandra.cql3.statements.CreateRoleStatement; import org.apache.cassandra.cql3.statements.DropRoleStatement; @@ -444,4 +446,26 @@ public static void waitForExistingRoles() .atMost(10, SECONDS) .until(CassandraRoleManager::hasExistingRoles); } + + static void authorize(String query, Object... args) + { + CQLStatement statement = QueryProcessor.parseStatement(String.format(query, args)).prepare(ClientState.forInternalCalls()); + assert statement instanceof AuthorizationStatement; + AuthorizationStatement authStmt = (AuthorizationStatement) statement; + + // invalidate roles cache so that any changes to the underlying roles are picked up + AuthenticatedUser.permissionsCache.invalidate(); + authStmt.execute(getClientState()); + } + + static String createName() + { + return RandomStringUtils.randomAlphabetic(8).toLowerCase(); + } + + public static void setupSuperUser() + { + QueryProcessor.executeInternal(String.format("INSERT INTO %s.%s (role, is_superuser, can_login, salted_hash) VALUES ('%s', true, true, '%s')", + AUTH_KEYSPACE_NAME, AuthKeyspace.ROLES, CassandraRoleManager.DEFAULT_SUPERUSER_NAME, "xxx")); + } } diff --git a/test/unit/org/apache/cassandra/auth/CIDRGroupsMappingManagerTest.java b/test/unit/org/apache/cassandra/auth/CIDRGroupsMappingManagerTest.java index a3a899d65dcf..44d5ef435a15 100644 --- a/test/unit/org/apache/cassandra/auth/CIDRGroupsMappingManagerTest.java +++ b/test/unit/org/apache/cassandra/auth/CIDRGroupsMappingManagerTest.java @@ -32,10 +32,8 @@ import org.apache.cassandra.SchemaLoader; import org.apache.cassandra.cql3.CIDR; -import org.apache.cassandra.cql3.QueryProcessor; import org.apache.cassandra.exceptions.ConfigurationException; -import static org.apache.cassandra.schema.SchemaConstants.AUTH_KEYSPACE_NAME; import static org.assertj.core.api.Assertions.assertThatThrownBy; import static org.junit.Assert.assertEquals; @@ -43,16 +41,6 @@ public class CIDRGroupsMappingManagerTest { CIDRGroupsMappingManager cidrGroupsMappingManager; - private static void setupSuperUser() - { - QueryProcessor.executeInternal(String.format("INSERT INTO %s.%s (role, is_superuser, can_login, salted_hash) " - + "VALUES ('%s', true, true, '%s')", - AUTH_KEYSPACE_NAME, - AuthKeyspace.ROLES, - CassandraRoleManager.DEFAULT_SUPERUSER_NAME, - "xxx")); - } - @BeforeClass public static void defineSchema() throws ConfigurationException { @@ -64,7 +52,7 @@ public static void defineSchema() throws ConfigurationException new AuthTestUtils.LocalCassandraNetworkAuthorizer(), new AuthTestUtils.LocalCassandraCIDRAuthorizer()); AuthCacheService.initializeAndRegisterCaches(); - setupSuperUser(); + AuthTestUtils.setupSuperUser(); } @Before diff --git a/test/unit/org/apache/cassandra/auth/CassandraAuthorizerTest.java b/test/unit/org/apache/cassandra/auth/CassandraAuthorizerTest.java index 7ead4e4118a6..97e8fb4b08cc 100644 --- a/test/unit/org/apache/cassandra/auth/CassandraAuthorizerTest.java +++ b/test/unit/org/apache/cassandra/auth/CassandraAuthorizerTest.java @@ -36,6 +36,7 @@ public class CassandraAuthorizerTest extends CQLTester @BeforeClass public static void setupAuth() { + // This runs after the base class sets up Cassandra and might not even work CassandraRelevantProperties.ORG_APACHE_CASSANDRA_DISABLE_MBEAN_REGISTRATION.setBoolean(true); requireAuthentication(); requireNetwork(); diff --git a/test/unit/org/apache/cassandra/auth/CassandraCIDRAuthorizerEnforceModeTest.java b/test/unit/org/apache/cassandra/auth/CassandraCIDRAuthorizerEnforceModeTest.java index 0ef3917b91e8..25079e4913ed 100644 --- a/test/unit/org/apache/cassandra/auth/CassandraCIDRAuthorizerEnforceModeTest.java +++ b/test/unit/org/apache/cassandra/auth/CassandraCIDRAuthorizerEnforceModeTest.java @@ -59,16 +59,6 @@ public class CassandraCIDRAuthorizerEnforceModeTest extends CQLTester { private static final AuthTestUtils.LocalCassandraCIDRAuthorizer cidrAuthorizer = new AuthTestUtils.LocalCassandraCIDRAuthorizer(); - private static void setupSuperUser() - { - QueryProcessor.executeInternal(String.format("INSERT INTO %s.%s (role, is_superuser, can_login, salted_hash) " - + "VALUES ('%s', true, true, '%s')", - AUTH_KEYSPACE_NAME, - AuthKeyspace.ROLES, - CassandraRoleManager.DEFAULT_SUPERUSER_NAME, - "xxx")); - } - @BeforeClass public static void defineSchema() throws ConfigurationException { @@ -78,7 +68,7 @@ public static void defineSchema() throws ConfigurationException new AuthTestUtils.LocalCassandraNetworkAuthorizer(), cidrAuthorizer); AuthCacheService.initializeAndRegisterCaches(); - setupSuperUser(); + AuthTestUtils.setupSuperUser(); } @Before diff --git a/test/unit/org/apache/cassandra/auth/CassandraCIDRAuthorizerMonitorModeTest.java b/test/unit/org/apache/cassandra/auth/CassandraCIDRAuthorizerMonitorModeTest.java index 54fe18bc2d89..b9225b519861 100644 --- a/test/unit/org/apache/cassandra/auth/CassandraCIDRAuthorizerMonitorModeTest.java +++ b/test/unit/org/apache/cassandra/auth/CassandraCIDRAuthorizerMonitorModeTest.java @@ -33,7 +33,6 @@ import org.apache.cassandra.config.ParameterizedClass; import org.apache.cassandra.cql3.CIDR; import org.apache.cassandra.cql3.CQLTester; -import org.apache.cassandra.cql3.QueryProcessor; import org.apache.cassandra.db.Keyspace; import org.apache.cassandra.exceptions.ConfigurationException; import org.apache.cassandra.service.ClientState; @@ -45,16 +44,7 @@ public class CassandraCIDRAuthorizerMonitorModeTest extends CQLTester { private static final AuthTestUtils.LocalCassandraCIDRAuthorizer cidrAuthorizer = - new AuthTestUtils.LocalCassandraCIDRAuthorizer(ICIDRAuthorizer.CIDRAuthorizerMode.MONITOR); - private static void setupSuperUser() - { - QueryProcessor.executeInternal(String.format("INSERT INTO %s.%s (role, is_superuser, can_login, salted_hash) " - + "VALUES ('%s', true, true, '%s')", - AUTH_KEYSPACE_NAME, - AuthKeyspace.ROLES, - CassandraRoleManager.DEFAULT_SUPERUSER_NAME, - "xxx")); - } + new AuthTestUtils.LocalCassandraCIDRAuthorizer(ICIDRAuthorizer.CIDRAuthorizerMode.MONITOR); @BeforeClass public static void defineSchema() throws ConfigurationException @@ -65,7 +55,7 @@ public static void defineSchema() throws ConfigurationException new AuthTestUtils.LocalCassandraNetworkAuthorizer(), cidrAuthorizer); AuthCacheService.initializeAndRegisterCaches(); - setupSuperUser(); + AuthTestUtils.setupSuperUser(); } @Before diff --git a/test/unit/org/apache/cassandra/auth/CassandraNetworkAuthorizerTest.java b/test/unit/org/apache/cassandra/auth/CassandraNetworkAuthorizerTest.java index 2e233ba8512a..aa22f5be4f5b 100644 --- a/test/unit/org/apache/cassandra/auth/CassandraNetworkAuthorizerTest.java +++ b/test/unit/org/apache/cassandra/auth/CassandraNetworkAuthorizerTest.java @@ -50,16 +50,6 @@ public class CassandraNetworkAuthorizerTest extends CQLTester { - private static void setupSuperUser() - { - QueryProcessor.executeInternal(String.format("INSERT INTO %s.%s (role, is_superuser, can_login, salted_hash) " - + "VALUES ('%s', true, true, '%s')", - AUTH_KEYSPACE_NAME, - AuthKeyspace.ROLES, - CassandraRoleManager.DEFAULT_SUPERUSER_NAME, - "xxx")); - } - @BeforeClass public static void defineSchema() throws ConfigurationException { @@ -69,7 +59,7 @@ public static void defineSchema() throws ConfigurationException new AuthTestUtils.LocalCassandraNetworkAuthorizer(), new AuthTestUtils.LocalCassandraCIDRAuthorizer()); AuthCacheService.initializeAndRegisterCaches(); - setupSuperUser(); + AuthTestUtils.setupSuperUser(); } @Before diff --git a/test/unit/org/apache/cassandra/auth/GrantAndRevokeTest.java b/test/unit/org/apache/cassandra/auth/GrantAndRevokeTest.java index 49c21736973d..7b8a4cfb7777 100644 --- a/test/unit/org/apache/cassandra/auth/GrantAndRevokeTest.java +++ b/test/unit/org/apache/cassandra/auth/GrantAndRevokeTest.java @@ -36,6 +36,7 @@ import org.apache.cassandra.schema.Schema; import org.apache.cassandra.schema.SchemaConstants; import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.service.CassandraDaemon; import org.apache.cassandra.transport.ProtocolVersion; import static java.lang.String.format; @@ -59,6 +60,7 @@ public static void setUpAuth() DatabaseDescriptor.setRolesValidity(0); requireAuthentication(); requireNetwork(); + CassandraDaemon.getInstanceForTesting().setupVirtualKeyspaces(); } @After @@ -563,6 +565,18 @@ public void testGrantOnAllKeyspaces() throws Throwable executeNet(ProtocolVersion.CURRENT, "INSERT INTO system.peers_v2(peer, peer_port, data_center) VALUES ('127.0.100.100', 7012, 'invalid_dc')"); } + @Test + public void testGrantOnVirtualKeyspaces() throws Throwable + { + useSuperUser(); + executeNet(String.format("CREATE ROLE %s WITH LOGIN = TRUE AND password='%s'", user, pass)); + + executeNet(ProtocolVersion.CURRENT, format("GRANT SELECT PERMISSION ON KEYSPACE system_virtual_schema TO %s", user)); + executeNet(ProtocolVersion.CURRENT, format("GRANT SELECT PERMISSION ON KEYSPACE system_views TO %s", user)); + executeNet(ProtocolVersion.CURRENT, format("REVOKE SELECT PERMISSION ON KEYSPACE system_virtual_schema FROM %s", user)); + executeNet(ProtocolVersion.CURRENT, format("REVOKE SELECT PERMISSION ON KEYSPACE system_views FROM %s", user)); + } + private void maybeReadSystemTables(boolean superuser) throws Throwable { if (superuser) diff --git a/test/unit/org/apache/cassandra/auth/MutualTlsAuthenticatorTest.java b/test/unit/org/apache/cassandra/auth/MutualTlsAuthenticatorTest.java index 14b31a3c87bc..88e44cf0ab5b 100644 --- a/test/unit/org/apache/cassandra/auth/MutualTlsAuthenticatorTest.java +++ b/test/unit/org/apache/cassandra/auth/MutualTlsAuthenticatorTest.java @@ -48,7 +48,7 @@ import static org.apache.cassandra.auth.AuthTestUtils.getMockInetAddress; import static org.apache.cassandra.auth.AuthTestUtils.initializeIdentityRolesTable; import static org.apache.cassandra.auth.AuthTestUtils.loadCertificateChain; -import static org.apache.cassandra.config.EncryptionOptions.ClientAuth.REQUIRED; +import static org.apache.cassandra.config.EncryptionOptions.ClientEncryptionOptions.ClientAuth.REQUIRED; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertNull; @@ -79,8 +79,10 @@ public static void setup() StorageService.instance.initServer(); ((CassandraRoleManager)DatabaseDescriptor.getRoleManager()).loadIdentityStatement(); final Config config = DatabaseDescriptor.getRawConfig(); - config.client_encryption_options = config.client_encryption_options.withEnabled(true) - .withRequireClientAuth(REQUIRED); + config.client_encryption_options = new EncryptionOptions.ClientEncryptionOptions.Builder(config.client_encryption_options) + .withEnabled(true) + .withRequireClientAuth(REQUIRED) + .build(); } @After @@ -183,8 +185,10 @@ public void testValidateConfiguration() " & client_encryption_options.require_client_auth to be true"; MutualTlsAuthenticator mutualTlsAuthenticator = createAndInitializeMtlsAuthenticator(); - config.client_encryption_options = config.client_encryption_options.withEnabled(true) - .withRequireClientAuth(EncryptionOptions.ClientAuth.NOT_REQUIRED); + config.client_encryption_options = new EncryptionOptions.ClientEncryptionOptions.Builder(config.client_encryption_options) + .withEnabled(true) + .withRequireClientAuth(EncryptionOptions.ClientEncryptionOptions.ClientAuth.NOT_REQUIRED) + .build(); expectedException.expect(ConfigurationException.class); expectedException.expectMessage(msg); mutualTlsAuthenticator.validateConfiguration(); diff --git a/test/unit/org/apache/cassandra/auth/MutualTlsInternodeAuthenticatorTest.java b/test/unit/org/apache/cassandra/auth/MutualTlsInternodeAuthenticatorTest.java index 8fdd23a44411..94fe66133bef 100644 --- a/test/unit/org/apache/cassandra/auth/MutualTlsInternodeAuthenticatorTest.java +++ b/test/unit/org/apache/cassandra/auth/MutualTlsInternodeAuthenticatorTest.java @@ -39,6 +39,7 @@ import org.apache.cassandra.SchemaLoader; import org.apache.cassandra.config.Config; import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.config.EncryptionOptions.ServerEncryptionOptions.Builder; import org.apache.cassandra.exceptions.ConfigurationException; import org.apache.cassandra.locator.InetAddressAndPort; import org.apache.cassandra.transport.TlsTestUtils; @@ -81,8 +82,10 @@ public static void initialize() public void before() { Config config = DatabaseDescriptor.getRawConfig(); - config.server_encryption_options = config.server_encryption_options.withOutboundKeystore("test/conf/cassandra_ssl_test_outbound.keystore") - .withOutboundKeystorePassword("cassandra"); + config.server_encryption_options = new Builder(config.server_encryption_options) + .withOutboundKeystore("test/conf/cassandra_ssl_test_outbound.keystore") + .withOutboundKeystorePassword("cassandra") + .build(); } String getValidatorClass() @@ -164,8 +167,10 @@ public void testNoValidatorClassNameInConfig() public void testNoIdentitiesInKeystore() { Config config = DatabaseDescriptor.getRawConfig(); - config.server_encryption_options = config.server_encryption_options.withOutboundKeystore(TlsTestUtils.SERVER_KEYSTORE_PATH) - .withOutboundKeystorePassword(TlsTestUtils.SERVER_KEYSTORE_PASSWORD); + config.server_encryption_options = new Builder(config.server_encryption_options) + .withOutboundKeystore(TlsTestUtils.SERVER_KEYSTORE_PATH) + .withOutboundKeystorePassword(TlsTestUtils.SERVER_KEYSTORE_PASSWORD) + .build(); expectedException.expect(ConfigurationException.class); expectedException.expectMessage(String.format("No identity was extracted from the outbound keystore '%s'", TlsTestUtils.SERVER_KEYSTORE_PATH)); new MutualTlsInternodeAuthenticator(getParams()); diff --git a/test/unit/org/apache/cassandra/auth/MutualTlsWithPasswordFallbackAuthenticatorTest.java b/test/unit/org/apache/cassandra/auth/MutualTlsWithPasswordFallbackAuthenticatorTest.java index 95f7c133eab8..f0c642846b3b 100644 --- a/test/unit/org/apache/cassandra/auth/MutualTlsWithPasswordFallbackAuthenticatorTest.java +++ b/test/unit/org/apache/cassandra/auth/MutualTlsWithPasswordFallbackAuthenticatorTest.java @@ -50,8 +50,10 @@ public static void initialize() DatabaseDescriptor.daemonInitialization(); SchemaLoader.loadSchema(); Config config = DatabaseDescriptor.getRawConfig(); - config.client_encryption_options = config.client_encryption_options.withEnabled(true) - .withRequireClientAuth(EncryptionOptions.ClientAuth.OPTIONAL); + config.client_encryption_options = new EncryptionOptions.ClientEncryptionOptions.Builder(config.client_encryption_options) + .withEnabled(true) + .withRequireClientAuth(EncryptionOptions.ClientEncryptionOptions.ClientAuth.OPTIONAL) + .build(); Map parameters = Collections.singletonMap("validator_class_name", "org.apache.cassandra.auth.SpiffeCertificateValidator"); fallbackAuthenticator = new MutualTlsWithPasswordFallbackAuthenticator(parameters); fallbackAuthenticator.setup(); diff --git a/test/unit/org/apache/cassandra/auth/PasswordAuthenticatorTest.java b/test/unit/org/apache/cassandra/auth/PasswordAuthenticatorTest.java index fadfa82c6c40..fb6adb827650 100644 --- a/test/unit/org/apache/cassandra/auth/PasswordAuthenticatorTest.java +++ b/test/unit/org/apache/cassandra/auth/PasswordAuthenticatorTest.java @@ -44,9 +44,9 @@ import static org.apache.cassandra.auth.AuthTestUtils.ALL_ROLES; import static org.apache.cassandra.auth.CassandraRoleManager.DEFAULT_SUPERUSER_PASSWORD; -import static org.apache.cassandra.auth.CassandraRoleManager.getGensaltLogRounds; import static org.apache.cassandra.auth.PasswordAuthenticator.SaslNegotiator; import static org.apache.cassandra.auth.PasswordAuthenticator.checkpw; +import static org.apache.cassandra.auth.PasswordSaltSupplier.getGensaltLogRounds; import static org.apache.cassandra.config.CassandraRelevantProperties.AUTH_BCRYPT_GENSALT_LOG2_ROUNDS; import static org.assertj.core.api.Assertions.assertThat; import static org.junit.Assert.assertEquals; diff --git a/test/unit/org/apache/cassandra/auth/TxnAuthTest.java b/test/unit/org/apache/cassandra/auth/TxnAuthTest.java new file mode 100644 index 000000000000..e26e0259510e --- /dev/null +++ b/test/unit/org/apache/cassandra/auth/TxnAuthTest.java @@ -0,0 +1,171 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.auth; + +import java.net.InetSocketAddress; +import java.util.Collections; + +import org.apache.cassandra.transport.Dispatcher; +import org.assertj.core.api.Assertions; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Test; + +import org.apache.cassandra.SchemaLoader; +import org.apache.cassandra.config.CassandraRelevantProperties; +import org.apache.cassandra.cql3.CQLTester; +import org.apache.cassandra.cql3.QueryOptions; +import org.apache.cassandra.cql3.QueryProcessor; +import org.apache.cassandra.cql3.statements.TransactionStatement; +import org.apache.cassandra.exceptions.UnauthorizedException; +import org.apache.cassandra.service.ClientState; +import org.apache.cassandra.service.QueryState; +import org.apache.cassandra.transport.messages.ResultMessage; + +import static org.junit.Assert.assertEquals; + +import static org.apache.cassandra.auth.AuthTestUtils.auth; +import static org.apache.cassandra.db.ConsistencyLevel.NODE_LOCAL; + +public class TxnAuthTest extends CQLTester +{ + @BeforeClass + public static void setUpAuthAndAccord() + { + CassandraRelevantProperties.ENABLE_NODELOCAL_QUERIES.setBoolean(true); + + IRoleManager roleManager = new AuthTestUtils.LocalCassandraRoleManager(); + SchemaLoader.setupAuth(roleManager, + new AuthTestUtils.LocalPasswordAuthenticator(), + new AuthTestUtils.LocalCassandraAuthorizer(), + new AuthTestUtils.LocalCassandraNetworkAuthorizer(), + new AuthTestUtils.LocalCassandraCIDRAuthorizer()); + roleManager.setup(); + AuthCacheService.initializeAndRegisterCaches(); + AuthTestUtils.setupSuperUser(); + + requireNetwork(); + } + + @Before + public void setUpTest() + { + createTable("CREATE TABLE %s (k int, v int, PRIMARY KEY(k)) WITH transactional_mode='full'"); + } + + @Test + public void canSelectInTxnWithPermissions() + { + QueryProcessor.process(formatQuery("INSERT INTO %s (k, v) VALUES (0, 0)"), NODE_LOCAL); + + ClientState clientState = createUserAndLogin(); + String query = formatQuery("BEGIN TRANSACTION\n" + + " SELECT * FROM %s WHERE k = 0;\n" + + "COMMIT TRANSACTION"); + + assertUnauthorized(query, clientState); + + grantTo(clientState, Permission.SELECT); + ResultMessage.Rows message = (ResultMessage.Rows) execute(query, clientState); + assertEquals(1, message.result.size()); + } + + @Test + public void canSelectRefsInTxnWithPermissions() + { + QueryProcessor.process(formatQuery("INSERT INTO %s (k, v) VALUES (0, 0)"), NODE_LOCAL); + + ClientState clientState = createUserAndLogin(); + String query = formatQuery("BEGIN TRANSACTION\n" + + " LET row0 = (SELECT * FROM %s WHERE k = 0);\n" + + " SELECT row0.v;\n" + + "COMMIT TRANSACTION"); + + assertUnauthorized(query, clientState); + + grantTo(clientState, Permission.SELECT); + ResultMessage.Rows message = (ResultMessage.Rows) execute(query, clientState); + assertEquals(1, message.result.size()); + } + + @Test + public void canInsertOnlyInTxnWithPermissions() + { + ClientState clientState = createUserAndLogin(); + String insert = formatQuery("BEGIN TRANSACTION\n" + + " INSERT INTO %s (k, v) VALUES (0, 0);\n" + + "COMMIT TRANSACTION"); + + assertUnauthorized(insert, clientState); + + grantTo(clientState, Permission.MODIFY); + execute(insert, clientState); + } + + @Test + public void canExecuteTxnWithAutoGeneratedRead() + { + QueryProcessor.process(formatQuery("INSERT INTO %s (k, v) VALUES (0, 0)"), NODE_LOCAL); + + ClientState clientState = createUserAndLogin(); + String update = "BEGIN TRANSACTION\n" + + formatQuery("SELECT * FROM %s WHERE k = 0;\n") + + formatQuery("UPDATE %s SET v += 1 WHERE k = 0 ;\n") + + "COMMIT TRANSACTION"; + + assertUnauthorized(update, clientState); + + // We should still fail here, given we need permisions to SELECT for the generated reads. + grantTo(clientState, Permission.MODIFY); + assertUnauthorized(update, clientState); + + grantTo(clientState, Permission.SELECT); + execute(update, clientState); + } + + private void assertUnauthorized(String query, ClientState clientState) + { + Assertions.assertThatThrownBy(() -> execute(query, clientState)) + .isInstanceOf(UnauthorizedException.class) + .hasMessageContaining(clientState.getUser().getName()); + } + + private void grantTo(ClientState clientState, Permission permission) + { + AuthTestUtils.authorize(formatQuery("GRANT " + permission + " ON TABLE %s TO " + clientState.getUser().getName())); + } + + private ClientState createUserAndLogin() + { + String username = AuthTestUtils.createName(); + auth("CREATE ROLE %s WITH password = 'password' AND LOGIN = true", username); + ClientState clientState = ClientState.forExternalCalls(InetSocketAddress.createUnresolved("127.0.0.1", 123)); + clientState.login(new AuthenticatedUser(username)); + return clientState; + } + + private ResultMessage execute(String query, ClientState clientState) + { + TransactionStatement.Parsed parsed = (TransactionStatement.Parsed) QueryProcessor.parseStatement(query); + TransactionStatement statement = (TransactionStatement) parsed.prepare(clientState); + QueryOptions options = QueryOptions.forInternalCalls(NODE_LOCAL, Collections.emptyList()); + QueryState queryState = new QueryState(clientState); + return QueryProcessor.instance.process(statement, queryState, options, Dispatcher.RequestTime.forImmediateExecution()); + } +} diff --git a/test/unit/org/apache/cassandra/auth/jmx/JMXAuthJMXServerOptionsTest.java b/test/unit/org/apache/cassandra/auth/jmx/JMXAuthJMXServerOptionsTest.java index cb83a61eda17..98749ca314ab 100644 --- a/test/unit/org/apache/cassandra/auth/jmx/JMXAuthJMXServerOptionsTest.java +++ b/test/unit/org/apache/cassandra/auth/jmx/JMXAuthJMXServerOptionsTest.java @@ -42,7 +42,7 @@ private static JMXServerOptions getJMXServerOptions() throws Exception String config = Paths.get(ClassLoader.getSystemResource("auth/cassandra-test-jaas.conf").toURI()).toString(); return new JMXServerOptions(true, false, 9999, 0, true, - new EncryptionOptions(), "TestLogin", config, null, null, + new EncryptionOptions.ClientEncryptionOptions(), "TestLogin", config, null, null, NoSuperUserAuthorizationProxy.class.getName()); } } diff --git a/test/unit/org/apache/cassandra/batchlog/BatchlogManagerTest.java b/test/unit/org/apache/cassandra/batchlog/BatchlogManagerTest.java index d267a4b96aba..f8599f4c6ed7 100644 --- a/test/unit/org/apache/cassandra/batchlog/BatchlogManagerTest.java +++ b/test/unit/org/apache/cassandra/batchlog/BatchlogManagerTest.java @@ -23,15 +23,16 @@ import java.util.concurrent.ExecutionException; import com.google.common.collect.Lists; -import org.junit.AfterClass; import org.junit.Assert; import org.junit.Before; import org.junit.BeforeClass; import org.junit.Test; +import org.apache.cassandra.CassandraTestBase; +import org.apache.cassandra.CassandraTestBase.SchemaLoaderPrepareServer; +import org.apache.cassandra.CassandraTestBase.UseByteOrderedPartitioner; import org.apache.cassandra.SchemaLoader; import org.apache.cassandra.Util; -import org.apache.cassandra.Util.PartitionerSwitcher; import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.cql3.UntypedResultSet; import org.apache.cassandra.db.ColumnFamilyStore; @@ -45,7 +46,6 @@ import org.apache.cassandra.db.partitions.ImmutableBTreePartition; import org.apache.cassandra.db.partitions.PartitionUpdate; import org.apache.cassandra.db.rows.Row; -import org.apache.cassandra.dht.Murmur3Partitioner; import org.apache.cassandra.exceptions.ConfigurationException; import org.apache.cassandra.locator.InetAddressAndPort; import org.apache.cassandra.schema.KeyspaceParams; @@ -66,7 +66,9 @@ import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertTrue; -public class BatchlogManagerTest +@UseByteOrderedPartitioner +@SchemaLoaderPrepareServer +public class BatchlogManagerTest extends CassandraTestBase { private static final String KEYSPACE1 = "BatchlogManagerTest1"; private static final String CF_STANDARD1 = "Standard1"; @@ -75,14 +77,9 @@ public class BatchlogManagerTest private static final String CF_STANDARD4 = "Standard4"; private static final String CF_STANDARD5 = "Standard5"; - static PartitionerSwitcher sw; - @BeforeClass public static void defineSchema() throws ConfigurationException { - DatabaseDescriptor.daemonInitialization(); - sw = Util.switchPartitioner(Murmur3Partitioner.instance); - SchemaLoader.prepareServer(); SchemaLoader.createKeyspace(KEYSPACE1, KeyspaceParams.simple(1), SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD1, 1, BytesType.instance), @@ -92,12 +89,6 @@ public static void defineSchema() throws ConfigurationException SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD5, 1, BytesType.instance)); } - @AfterClass - public static void cleanup() - { - sw.close(); - } - @Before public void setUp() throws Exception { diff --git a/test/unit/org/apache/cassandra/cache/AutoSavingCacheTest.java b/test/unit/org/apache/cassandra/cache/AutoSavingCacheTest.java index 50145e10b363..aebe5e194b6c 100644 --- a/test/unit/org/apache/cassandra/cache/AutoSavingCacheTest.java +++ b/test/unit/org/apache/cassandra/cache/AutoSavingCacheTest.java @@ -79,7 +79,7 @@ private static void doTestSerializeAndLoadKeyCache() throws Exception cfs.truncateBlocking(); for (int i = 0; i < 2; i++) { - ColumnMetadata colDef = ColumnMetadata.regularColumn(cfs.metadata(), ByteBufferUtil.bytes("col1"), AsciiType.instance); + ColumnMetadata colDef = ColumnMetadata.regularColumn(cfs.metadata(), ByteBufferUtil.bytes("col1"), AsciiType.instance, ColumnMetadata.NO_UNIQUE_ID); RowUpdateBuilder rowBuilder = new RowUpdateBuilder(cfs.metadata(), currentTimeMillis(), "key1"); rowBuilder.add(colDef, "val1"); rowBuilder.build().apply(); diff --git a/test/unit/org/apache/cassandra/concurrent/AdaptingScheduledExecutorPlus.java b/test/unit/org/apache/cassandra/concurrent/AdaptingScheduledExecutorPlus.java new file mode 100644 index 000000000000..9a84b2600df5 --- /dev/null +++ b/test/unit/org/apache/cassandra/concurrent/AdaptingScheduledExecutorPlus.java @@ -0,0 +1,257 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.concurrent; + +import java.util.List; +import java.util.concurrent.Callable; +import java.util.concurrent.Executors; // checkstyle: permit this import +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.ScheduledFuture; +import java.util.concurrent.TimeUnit; + +import accord.utils.async.AsyncChain; +import org.apache.cassandra.utils.WithResources; +import org.apache.cassandra.utils.concurrent.AsyncPromise; +import org.apache.cassandra.utils.concurrent.Future; + +import static com.google.common.primitives.Longs.max; +import static java.util.concurrent.TimeUnit.NANOSECONDS; +import static org.apache.cassandra.utils.Clock.Global.nanoTime; + +public class AdaptingScheduledExecutorPlus implements ScheduledExecutorPlus +{ + private final ScheduledExecutorService delegate; + + public AdaptingScheduledExecutorPlus(ScheduledExecutorService delegate) + { + this.delegate = delegate; + } + + protected ScheduledExecutorService delegate() + { + return delegate; + } + + @Override + public ScheduledFuture scheduleSelfRecurring(Runnable run, long delay, TimeUnit units) + { + return schedule(run, delay, units); + } + + @Override + public ScheduledFuture scheduleAt(Runnable run, long deadline) + { + return schedule(run, max(0, deadline - nanoTime()), NANOSECONDS); + } + + @Override + public ScheduledFuture scheduleTimeoutAt(Runnable run, long deadline) + { + return scheduleTimeoutWithDelay(run, max(0, deadline - nanoTime()), NANOSECONDS); + } + + @Override + public ScheduledFuture scheduleTimeoutWithDelay(Runnable run, long delay, TimeUnit units) + { + return schedule(run, delay, units); + } + + @Override + public ScheduledFuture schedule(Runnable command, long delay, TimeUnit unit) + { + return delegate().schedule(command, delay, unit); + } + + @Override + public ScheduledFuture schedule(Callable callable, long delay, TimeUnit unit) + { + return delegate().schedule(callable, delay, unit); + } + + @Override + public ScheduledFuture scheduleAtFixedRate(Runnable command, long initialDelay, long period, TimeUnit unit) + { + return delegate().scheduleAtFixedRate(command, initialDelay, period, unit); + } + + @Override + public ScheduledFuture scheduleWithFixedDelay(Runnable command, long initialDelay, long delay, TimeUnit unit) + { + return delegate().scheduleWithFixedDelay(command, initialDelay, delay, unit); + } + + @Override + public void shutdown() + { + delegate().shutdown(); + } + + @Override + public List shutdownNow() + { + return delegate().shutdownNow(); + } + + @Override + public boolean isShutdown() + { + return delegate().isShutdown(); + } + + @Override + public boolean isTerminated() + { + return delegate().isTerminated(); + } + + @Override + public boolean awaitTermination(long timeout, TimeUnit unit) throws InterruptedException + { + return delegate().awaitTermination(timeout, unit); + } + + @Override + public Future submit(Callable task) + { + return wrap(delegate().submit(task)); + } + + @Override + public Future submit(Runnable task, T result) + { + return wrap(delegate().submit(task, result)); + } + + @Override + public Future submit(Runnable task) + { + return wrap(delegate().submit(task)); + } + + @Override + public void execute(WithResources withResources, Runnable task) + { + execute(TaskFactory.standard().toExecute(withResources, task)); + } + + @Override + public Future submit(WithResources withResources, Callable task) + { + class Catch { T value;} + Catch c = new Catch(); + Runnable exec = TaskFactory.standard().toExecute(withResources, () -> { + try + { + c.value = task.call(); + } + catch (Exception e) + { + throw new RuntimeException(e); + } + }); + return submit(() -> { + exec.run(); + return c.value; + }); + } + + @Override + public Future submit(WithResources withResources, Runnable task) + { + return submit(TaskFactory.standard().toExecute(withResources, task)); + } + + @Override + public Future submit(WithResources withResources, Runnable task, T result) + { + return submit(Executors.callable(TaskFactory.standard().toSubmit(withResources, task), result)); + } + + @Override + public boolean inExecutor() + { + return false; + } + + @Override + public void execute(Runnable command) + { + delegate().execute(command); + } + + @Override + public int getCorePoolSize() + { + return 0; + } + + @Override + public void setCorePoolSize(int newCorePoolSize) + { + + } + + @Override + public int getMaximumPoolSize() + { + return 0; + } + + @Override + public void setMaximumPoolSize(int newMaximumPoolSize) + { + + } + + @Override + public int getActiveTaskCount() + { + return 0; + } + + @Override + public long getCompletedTaskCount() + { + return 0; + } + + @Override + public int getPendingTaskCount() + { + return 0; + } + + private static org.apache.cassandra.utils.concurrent.Future wrap(java.util.concurrent.Future future) + { + if (future instanceof org.apache.cassandra.utils.concurrent.Future) + return (Future) future; + if (future instanceof AsyncChain) + { + AsyncChain chain = (AsyncChain) future; + AsyncPromise promise = new AsyncPromise<>(); + chain.begin((s, f) -> { + if (f != null) promise.setFailure(f); + else promise.setSuccess(s); + }); + + return promise; + } + throw new IllegalStateException("Unexpected future type: " + future.getClass()); + } +} diff --git a/test/unit/org/apache/cassandra/concurrent/ForwardingExecutorFactory.java b/test/unit/org/apache/cassandra/concurrent/ForwardingExecutorFactory.java new file mode 100644 index 000000000000..37b77f3dcf11 --- /dev/null +++ b/test/unit/org/apache/cassandra/concurrent/ForwardingExecutorFactory.java @@ -0,0 +1,136 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.concurrent; + +public class ForwardingExecutorFactory implements ExecutorFactory +{ + private final ExecutorFactory delegate; + + public ForwardingExecutorFactory(ExecutorFactory delegate) + { + this.delegate = delegate; + } + + protected ExecutorFactory delegate() + { + return delegate; + } + + @Override + public ExecutorPlus pooled(String name, int threads) + { + return delegate().pooled(name, threads); + } + + @Override + public ExecutorBuilderFactory withJmxInternal() + { + return delegate().withJmxInternal(); + } + + @Override + public ScheduledExecutorPlus scheduled(String name) + { + return delegate().scheduled(name); + } + + @Override + public ScheduledExecutorPlus scheduled(String name, SimulatorSemantics simulatorSemantics) + { + return delegate().scheduled(name, simulatorSemantics); + } + + @Override + public ScheduledExecutorPlus scheduled(boolean executeOnShutdown, String name) + { + return delegate().scheduled(executeOnShutdown, name); + } + + @Override + public ScheduledExecutorPlus scheduled(boolean executeOnShutdown, String name, int priority) + { + return delegate().scheduled(executeOnShutdown, name, priority); + } + + @Override + public Thread startThread(String name, Runnable runnable) + { + return delegate().startThread(name, runnable); + } + + @Override + public Interruptible infiniteLoop(String name, Interruptible.SimpleTask task, InfiniteLoopExecutor.SimulatorSafe simulatorSafe) + { + return delegate().infiniteLoop(name, task, simulatorSafe); + } + + @Override + public ExecutorBuilder configureSequential(String name) + { + return delegate().configureSequential(name); + } + + @Override + public SequentialExecutorPlus sequential(String name) + { + return delegate().sequential(name); + } + + @Override + public ExecutorBuilder configurePooled(String name, int threads) + { + return delegate().configurePooled(name, threads); + } + + @Override + public ExecutorBuilderFactory withJmx(String jmxPath) + { + return delegate().withJmx(jmxPath); + } + + @Override + public LocalAwareSubFactory localAware() + { + return delegate().localAware(); + } + + @Override + public ScheduledExecutorPlus scheduled(boolean executeOnShutdown, String name, int priority, SimulatorSemantics simulatorSemantics) + { + return delegate().scheduled(executeOnShutdown, name, priority, simulatorSemantics); + } + + @Override + public Thread startThread(String name, Runnable runnable, SystemThreadTag systemTag, SimulatorThreadTag simulatorTag) + { + return delegate().startThread(name, runnable, systemTag, simulatorTag); + } + + @Override + public Interruptible infiniteLoop(String name, Interruptible.Task task, InfiniteLoopExecutor.SimulatorSafe simulatorSafe, SystemThreadTag systemTag, InfiniteLoopExecutor.Interrupts interrupts) + { + return delegate().infiniteLoop(name, task, simulatorSafe, systemTag, interrupts); + } + + @Override + public ThreadGroup newThreadGroup(String name) + { + return delegate().newThreadGroup(name); + } +} diff --git a/test/unit/org/apache/cassandra/concurrent/ForwardingExecutorPlus.java b/test/unit/org/apache/cassandra/concurrent/ForwardingExecutorPlus.java index 71c88a947e48..cfa8d6f4496d 100644 --- a/test/unit/org/apache/cassandra/concurrent/ForwardingExecutorPlus.java +++ b/test/unit/org/apache/cassandra/concurrent/ForwardingExecutorPlus.java @@ -33,7 +33,7 @@ import org.apache.cassandra.utils.concurrent.AsyncPromise; import org.apache.cassandra.utils.concurrent.Future; -public class ForwardingExecutorPlus implements ExecutorPlus +public class ForwardingExecutorPlus implements ExecutorPlus, SequentialExecutorPlus, LocalAwareExecutorPlus { private final ExecutorService delegate; @@ -216,4 +216,10 @@ public void onFailure(Throwable t) } throw new IllegalStateException("Unexpected future type: " + submit.getClass()); } + + @Override + public AtLeastOnceTrigger atLeastOnceTrigger(Runnable runnable) + { + return new SingleThreadExecutorPlus.AtLeastOnce(this, runnable); + } } diff --git a/test/unit/org/apache/cassandra/concurrent/InfiniteLoopExecutorTest.java b/test/unit/org/apache/cassandra/concurrent/InfiniteLoopExecutorTest.java index 9ec702dd5010..29dc7ec8ac8d 100644 --- a/test/unit/org/apache/cassandra/concurrent/InfiniteLoopExecutorTest.java +++ b/test/unit/org/apache/cassandra/concurrent/InfiniteLoopExecutorTest.java @@ -30,7 +30,7 @@ import org.junit.Assert; import org.junit.Test; -import static org.apache.cassandra.concurrent.InfiniteLoopExecutor.Daemon.DAEMON; +import static org.apache.cassandra.concurrent.ExecutorFactory.SystemThreadTag.DAEMON; public class InfiniteLoopExecutorTest { diff --git a/test/unit/org/apache/cassandra/concurrent/ManualExecutor.java b/test/unit/org/apache/cassandra/concurrent/ManualExecutor.java new file mode 100644 index 000000000000..2b98d13d6abb --- /dev/null +++ b/test/unit/org/apache/cassandra/concurrent/ManualExecutor.java @@ -0,0 +1,177 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.concurrent; + +import accord.utils.Invariants; +import org.apache.cassandra.utils.Closeable; +import org.apache.cassandra.utils.WithResources; +import org.apache.cassandra.utils.concurrent.Future; + +import java.util.ArrayDeque; +import java.util.Collections; +import java.util.List; +import java.util.Queue; +import java.util.concurrent.Callable; +import java.util.concurrent.TimeUnit; + +public class ManualExecutor implements ExecutorPlus +{ + private final Queue tasks = new ArrayDeque<>(); + private int completedCount = 0; + + public void runOne() + { + Task task = tasks.remove(); + task.run(); + completedCount++; + } + + public void runAll() + { + while (!tasks.isEmpty()) + runOne(); + } + + @Override + public Future submit(Callable callable) + { + return submit((WithResources) null, callable); + } + + @Override + public Future submit(WithResources withResources, Callable callable) + { + FutureImpl future = new FutureImpl<>(); + tasks.add(new Task(null, callable, withResources, null, future)); + return future; + } + + @Override + public Future submit(Runnable runnable) + { + return submit(null, runnable, null); + } + + @Override + public Future submit(WithResources withResources, Runnable runnable) + { + return submit(withResources, runnable, null); + } + + @Override + public Future submit(Runnable runnable, T result) + { + return submit(null, runnable, result); + } + + @Override + public Future submit(WithResources withResources, Runnable runnable, T result) + { + FutureImpl future = new FutureImpl<>(); + tasks.add(new Task(runnable, null, withResources, result, future)); + return future; + } + + @Override + public void execute(Runnable runnable) + { + execute(null, runnable); + } + + @Override + public void execute(WithResources withResources, Runnable runnable) + { + tasks.add(new Task(runnable, null, withResources, null, null)); + } + + private static class Task + { + private final Runnable runnable; + private final Callable callable; + private final WithResources withResources; + private final FutureImpl future; + + private Object result; + + Task(Runnable runnable, Callable callable, WithResources withResources, Object result, FutureImpl future) + { + Invariants.requireArgument(runnable != null ^ callable != null); + + this.runnable = runnable; + this.callable = callable; + this.withResources = withResources; + this.result = result; + this.future = future; + } + + void run() + { + try (Closeable ignored = withResources == null ? null : withResources.get()) + { + if (null != runnable) + runnable.run(); + else + result = callable.call(); + + if (null != future) + future.succeed(result); + } + catch (Throwable t) + { + ExecutionFailure.handle(t); + if (null != future) + future.fail(t); + } + } + } + + private static class FutureImpl extends org.apache.cassandra.utils.concurrent.AsyncFuture + { + @SuppressWarnings("unchecked") + void succeed(Object v) + { + trySuccess((V) v); + } + + void fail(Throwable throwable) + { + tryFailure(throwable); + } + } + + @Override + public boolean inExecutor() + { + return true; + } + + @Override public int getActiveTaskCount() { return 0; } + @Override public long getCompletedTaskCount() { return completedCount; } + @Override public int getPendingTaskCount() { return tasks.size(); } + + @Override public int getCorePoolSize() { return 0; } + @Override public int getMaximumPoolSize() { return 0; } + @Override public void setCorePoolSize(int newCorePoolSize) { throw new IllegalArgumentException("Cannot resize ManualExecutor"); } + @Override public void setMaximumPoolSize(int newMaximumPoolSize) { throw new IllegalArgumentException("Cannot resize ManualExecutor"); } + + @Override public void shutdown() {} + @Override public List shutdownNow() { return Collections.emptyList(); } + @Override public boolean isShutdown() { return false; } + @Override public boolean isTerminated() { return false; } + @Override public boolean awaitTermination(long timeout, TimeUnit unit) { return true; } +} diff --git a/test/unit/org/apache/cassandra/net/ManyToOneConcurrentLinkedQueueTest.java b/test/unit/org/apache/cassandra/concurrent/ManyToOneConcurrentLinkedQueueTest.java similarity index 99% rename from test/unit/org/apache/cassandra/net/ManyToOneConcurrentLinkedQueueTest.java rename to test/unit/org/apache/cassandra/concurrent/ManyToOneConcurrentLinkedQueueTest.java index 4f60d01f240d..d1baae7e10cf 100644 --- a/test/unit/org/apache/cassandra/net/ManyToOneConcurrentLinkedQueueTest.java +++ b/test/unit/org/apache/cassandra/concurrent/ManyToOneConcurrentLinkedQueueTest.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.cassandra.net; +package org.apache.cassandra.concurrent; import java.util.BitSet; import java.util.NoSuchElementException; diff --git a/test/unit/org/apache/cassandra/concurrent/SimulatedExecutorFactory.java b/test/unit/org/apache/cassandra/concurrent/SimulatedExecutorFactory.java index 01540499f4ca..e4ae9282e73d 100644 --- a/test/unit/org/apache/cassandra/concurrent/SimulatedExecutorFactory.java +++ b/test/unit/org/apache/cassandra/concurrent/SimulatedExecutorFactory.java @@ -101,7 +101,7 @@ public String toString() public SimulatedExecutorFactory(RandomSource rs, Consumer onError) { - this(rs, toGen(Generators.TIMESTAMP_GEN.map(Timestamp::getTime)).mapToLong(TimeUnit.MILLISECONDS::toNanos).next(rs), onError); + this(rs, toGen(Generators.TIMESTAMP_GEN.map(Timestamp::getTime)).mapToLong(TimeUnit.MILLISECONDS::toNanos).nextLong(rs), onError); } public SimulatedExecutorFactory(RandomSource rs) @@ -204,7 +204,7 @@ public ScheduledExecutorPlus scheduled(boolean executeOnShutdown, String name, i } @Override - public Thread startThread(String name, Runnable runnable, InfiniteLoopExecutor.Daemon daemon) + public Thread startThread(String name, Runnable runnable, SystemThreadTag systemTag, SimulatorThreadTag simulatorTag) { throw new UnsupportedOperationException("Thread can't be simualted"); } @@ -213,7 +213,7 @@ public Thread startThread(String name, Runnable runnable, InfiniteLoopExecutor.D public Interruptible infiniteLoop(String name, Interruptible.Task task, InfiniteLoopExecutor.SimulatorSafe simulatorSafe, - InfiniteLoopExecutor.Daemon daemon, + SystemThreadTag systemTag, InfiniteLoopExecutor.Interrupts interrupts) { var delegate = new UnorderedScheduledExecutorService(); diff --git a/test/unit/org/apache/cassandra/config/ConfigCompatibilityTest.java b/test/unit/org/apache/cassandra/config/ConfigCompatibilityTest.java index f5d10a6b526d..2d38db65308a 100644 --- a/test/unit/org/apache/cassandra/config/ConfigCompatibilityTest.java +++ b/test/unit/org/apache/cassandra/config/ConfigCompatibilityTest.java @@ -119,6 +119,7 @@ public class ConfigCompatibilityTest .add("Property role_manager used to be a value-type, but now is nested type class org.apache.cassandra.config.ParameterizedClass") .add("Property network_authorizer used to be a value-type, but now is nested type class org.apache.cassandra.config.ParameterizedClass") .add("require_client_auth types do not match; java.lang.String != java.lang.Boolean") + .add("available_processors types do not match; org.apache.cassandra.config.OptionaldPositiveInt != java.lang.Integer") .build(); /** diff --git a/test/unit/org/apache/cassandra/config/DatabaseDescriptorRefTest.java b/test/unit/org/apache/cassandra/config/DatabaseDescriptorRefTest.java index 34656eac55ae..d44dea211548 100644 --- a/test/unit/org/apache/cassandra/config/DatabaseDescriptorRefTest.java +++ b/test/unit/org/apache/cassandra/config/DatabaseDescriptorRefTest.java @@ -72,11 +72,18 @@ public class DatabaseDescriptorRefTest "org.apache.cassandra.auth.Cacheable", "org.apache.cassandra.auth.IAuthenticator", "org.apache.cassandra.auth.IAuthorizer", - "org.apache.cassandra.auth.IInternodeAuthenticator", "org.apache.cassandra.auth.ICIDRAuthorizer", "org.apache.cassandra.auth.ICIDRAuthorizer$CIDRAuthorizerMode", + "org.apache.cassandra.auth.IInternodeAuthenticator", "org.apache.cassandra.auth.INetworkAuthorizer", "org.apache.cassandra.auth.IRoleManager", + "org.apache.cassandra.config.AccordSpec", + "org.apache.cassandra.config.AccordSpec$JournalSpec", + "org.apache.cassandra.config.AccordSpec$MinEpochRetrySpec", + "org.apache.cassandra.config.AccordSpec$FetchRetrySpec", + "org.apache.cassandra.config.AccordSpec$TransactionalRangeMigration", + "org.apache.cassandra.config.AccordSpec$QueueShardModel", + "org.apache.cassandra.config.AccordSpec$QueueSubmissionModel", "org.apache.cassandra.config.CassandraRelevantProperties", "org.apache.cassandra.config.CassandraRelevantProperties$PropertyConverter", "org.apache.cassandra.config.Config", @@ -90,12 +97,15 @@ public class DatabaseDescriptorRefTest "org.apache.cassandra.config.Config$DiskOptimizationStrategy", "org.apache.cassandra.config.Config$FlushCompression", "org.apache.cassandra.config.Config$InternodeCompression", + "org.apache.cassandra.config.Config$LWTStrategy", + "org.apache.cassandra.config.Config$NonSerialWriteStrategy", "org.apache.cassandra.config.Config$MemtableAllocationType", "org.apache.cassandra.config.Config$PaxosOnLinearizabilityViolation", "org.apache.cassandra.config.Config$PaxosStatePurging", "org.apache.cassandra.config.Config$PaxosVariant", "org.apache.cassandra.config.Config$RepairCommandPoolFullStrategy", "org.apache.cassandra.config.Config$SSTableConfig", + "org.apache.cassandra.config.Config$TransactionalRangeMigration", "org.apache.cassandra.config.Config$TriggersPolicy", "org.apache.cassandra.config.Config$UserFunctionTimeoutPolicy", "org.apache.cassandra.config.ConfigBeanInfo", @@ -104,6 +114,16 @@ public class DatabaseDescriptorRefTest "org.apache.cassandra.config.Config$CorruptedTombstoneStrategy", "org.apache.cassandra.config.Config$BatchlogEndpointStrategy", "org.apache.cassandra.config.Config$TombstonesMetricGranularity", + "org.apache.cassandra.repair.autorepair.AutoRepairConfig", + "org.apache.cassandra.repair.autorepair.AutoRepairConfig$Options", + "org.apache.cassandra.repair.autorepair.AutoRepairConfig$RepairType", + "org.apache.cassandra.repair.autorepair.AutoRepairState", + "org.apache.cassandra.repair.autorepair.FixedSplitTokenRangeSplitter", + "org.apache.cassandra.repair.autorepair.FullRepairState", + "org.apache.cassandra.repair.autorepair.IAutoRepairTokenRangeSplitter", + "org.apache.cassandra.repair.autorepair.IncrementalRepairState", + "org.apache.cassandra.repair.autorepair.PreviewRepairedState", + "org.apache.cassandra.repair.autorepair.RepairTokenRangeSplitter", "org.apache.cassandra.config.DatabaseDescriptor$ByteUnit", "org.apache.cassandra.config.DataRateSpec", "org.apache.cassandra.config.DataRateSpec$DataRateUnit", @@ -128,8 +148,8 @@ public class DatabaseDescriptorRefTest "org.apache.cassandra.config.DurationSpec$IntMillisecondsBound", "org.apache.cassandra.config.DurationSpec$IntMinutesBound", "org.apache.cassandra.config.DurationSpec$IntSecondsBound", - "org.apache.cassandra.config.DurationSpec$LongMillisecondsBound", "org.apache.cassandra.config.DurationSpec$LongMicrosecondsBound", + "org.apache.cassandra.config.DurationSpec$LongMillisecondsBound", "org.apache.cassandra.config.DurationSpec$LongNanosecondsBound", "org.apache.cassandra.config.DurationSpec$LongSecondsBound", "org.apache.cassandra.config.EncryptionOptions", @@ -146,27 +166,29 @@ public class DatabaseDescriptorRefTest "org.apache.cassandra.config.GuardrailsOptions$ConsistencyLevels", "org.apache.cassandra.config.GuardrailsOptions$TableProperties", "org.apache.cassandra.config.JMXServerOptions", + "org.apache.cassandra.config.OptionaldPositiveInt", "org.apache.cassandra.config.ParameterizedClass", "org.apache.cassandra.config.RepairConfig", "org.apache.cassandra.config.RepairRetrySpec", + "org.apache.cassandra.config.ReplicaFilteringProtectionOptions", "org.apache.cassandra.config.RetrySpec", "org.apache.cassandra.config.RetrySpec$MaxAttempt", "org.apache.cassandra.config.RetrySpec$Type", - "org.apache.cassandra.config.ReplicaFilteringProtectionOptions", "org.apache.cassandra.config.StartupChecksOptions", + "org.apache.cassandra.config.StartupChecksOptions", + "org.apache.cassandra.config.StorageAttachedIndexOptions", + "org.apache.cassandra.config.StringRetryStrategy", + "org.apache.cassandra.config.SubnetGroups", "org.apache.cassandra.config.SubnetGroups", "org.apache.cassandra.config.TrackWarnings", + "org.apache.cassandra.config.TrackWarnings", + "org.apache.cassandra.config.TransparentDataEncryptionOptions", "org.apache.cassandra.config.TransparentDataEncryptionOptions", "org.apache.cassandra.config.YamlConfigurationLoader", "org.apache.cassandra.config.YamlConfigurationLoader$CustomConstructor", + "org.apache.cassandra.config.YamlConfigurationLoader$CustomConstructor", "org.apache.cassandra.config.YamlConfigurationLoader$PropertiesChecker", "org.apache.cassandra.config.YamlConfigurationLoader$PropertiesChecker$1", - "org.apache.cassandra.config.YamlConfigurationLoader$CustomConstructor", - "org.apache.cassandra.config.TransparentDataEncryptionOptions", - "org.apache.cassandra.config.StartupChecksOptions", - "org.apache.cassandra.config.SubnetGroups", - "org.apache.cassandra.config.TrackWarnings", - "org.apache.cassandra.config.StorageAttachedIndexOptions", "org.apache.cassandra.db.ConsistencyLevel", "org.apache.cassandra.db.commitlog.AbstractCommitLogSegmentManager", "org.apache.cassandra.db.commitlog.CommitLog", @@ -183,6 +205,7 @@ public class DatabaseDescriptorRefTest "org.apache.cassandra.db.guardrails.Values$Config", "org.apache.cassandra.db.rows.UnfilteredSource", "org.apache.cassandra.dht.IPartitioner", + "org.apache.cassandra.dht.RingPosition", "org.apache.cassandra.distributed.api.IInstance", "org.apache.cassandra.distributed.api.IInvokableInstance", "org.apache.cassandra.distributed.api.IIsolatedExecutor", @@ -211,6 +234,7 @@ public class DatabaseDescriptorRefTest "org.apache.cassandra.gms.IFailureDetector", "org.apache.cassandra.io.FSError", "org.apache.cassandra.io.FSWriteError", + "org.apache.cassandra.io.MessageVersionProvider", "org.apache.cassandra.io.compress.ICompressor", "org.apache.cassandra.io.compress.ICompressor$Uses", "org.apache.cassandra.io.compress.LZ4Compressor", @@ -234,9 +258,9 @@ public class DatabaseDescriptorRefTest "org.apache.cassandra.io.sstable.format.SSTableReaderLoadingBuilder", "org.apache.cassandra.io.sstable.format.SSTableReaderWithFilter", "org.apache.cassandra.io.sstable.format.SSTableReaderWithFilter$Builder", - "org.apache.cassandra.io.sstable.format.SortedTableReaderLoadingBuilder", "org.apache.cassandra.io.sstable.format.SSTableWriter", "org.apache.cassandra.io.sstable.format.SSTableWriter$Builder", + "org.apache.cassandra.io.sstable.format.SortedTableReaderLoadingBuilder", "org.apache.cassandra.io.sstable.format.SortedTableWriter", "org.apache.cassandra.io.sstable.format.SortedTableWriter$Builder", "org.apache.cassandra.io.sstable.format.Version", @@ -269,6 +293,10 @@ public class DatabaseDescriptorRefTest "org.apache.cassandra.io.util.PathUtils$IOToLongFunction", "org.apache.cassandra.io.util.RebufferingInputStream", "org.apache.cassandra.io.util.SpinningDiskOptimizationStrategy", + "org.apache.cassandra.journal.Params", + "org.apache.cassandra.journal.Params$FailurePolicy", + "org.apache.cassandra.journal.Params$FlushMode", + "org.apache.cassandra.locator.Endpoint", "org.apache.cassandra.locator.IEndpointSnitch", "org.apache.cassandra.locator.InetAddressAndPort", "org.apache.cassandra.locator.Locator", @@ -280,12 +308,33 @@ public class DatabaseDescriptorRefTest "org.apache.cassandra.locator.SeedProvider", "org.apache.cassandra.locator.SimpleSeedProvider", "org.apache.cassandra.locator.SnitchAdapter", + "org.apache.cassandra.net.MessagingService$Version", + "org.apache.cassandra.security.AbstractCryptoProvider", "org.apache.cassandra.security.EncryptionContext", "org.apache.cassandra.security.ISslContextFactory", "org.apache.cassandra.security.SSLFactory", + "org.apache.cassandra.service.accord.serializers.Version", "org.apache.cassandra.service.CacheService$CacheType", "org.apache.cassandra.security.AbstractCryptoProvider", "org.apache.cassandra.tcm.RegistrationStateCallbacks", + "org.apache.cassandra.service.consensus.TransactionalMode", + "org.apache.cassandra.service.TimeoutStrategy$LatencySourceFactory", + "org.apache.cassandra.service.TimeoutStrategy$LatencySource", + "org.apache.cassandra.service.RetryStrategy", + "org.apache.cassandra.service.RetryStrategy$WaitRandomizerFactory", + "org.apache.cassandra.service.RetryStrategy$1", + "org.apache.cassandra.service.RetryStrategy$WaitRandomizer", + "org.apache.cassandra.service.TimeoutStrategy", + "org.apache.cassandra.service.TimeoutStrategy$LatencyModifierFactory", + "org.apache.cassandra.service.TimeoutStrategy$Wait", + "org.apache.cassandra.service.TimeoutStrategy$LatencySupplier", + "org.apache.cassandra.service.TimeoutStrategy$1", + "org.apache.cassandra.service.TimeoutStrategy$LatencySupplier$Constant", + "org.apache.cassandra.service.TimeoutStrategy$LatencyModifier", + "org.apache.cassandra.service.TimeoutStrategy$Wait$Modifying", + "org.apache.cassandra.service.TimeoutStrategy$Wait$Constant", + "org.apache.cassandra.service.RetryStrategy$WaitRandomizerFactory$Uniform", + "org.apache.cassandra.service.WaitStrategy", "org.apache.cassandra.transport.ProtocolException", "org.apache.cassandra.utils.Closeable", "org.apache.cassandra.utils.CloseableIterator", @@ -294,12 +343,13 @@ public class DatabaseDescriptorRefTest "org.apache.cassandra.utils.LocalizeString", "org.apache.cassandra.utils.SystemInfo", "org.apache.cassandra.utils.Pair", + "org.apache.cassandra.utils.StorageCompatibilityMode", "org.apache.cassandra.utils.binlog.BinLogOptions", "org.apache.cassandra.utils.concurrent.RefCounted", "org.apache.cassandra.utils.concurrent.SelfRefCounted", "org.apache.cassandra.utils.concurrent.Transactional", + "org.apache.cassandra.utils.progress.ProgressListener", "org.apache.cassandra.utils.concurrent.UncheckedInterruptedException", - "org.apache.cassandra.utils.StorageCompatibilityMode" }; static final Set checkedClasses = new HashSet<>(Arrays.asList(validClasses)); diff --git a/test/unit/org/apache/cassandra/config/EncryptionOptionsEqualityTest.java b/test/unit/org/apache/cassandra/config/EncryptionOptionsEqualityTest.java index 5e6d26b8173e..b2770c23618c 100644 --- a/test/unit/org/apache/cassandra/config/EncryptionOptionsEqualityTest.java +++ b/test/unit/org/apache/cassandra/config/EncryptionOptionsEqualityTest.java @@ -27,8 +27,8 @@ import org.apache.cassandra.security.DummySslContextFactoryImpl; import org.apache.cassandra.transport.TlsTestUtils; -import static org.apache.cassandra.config.EncryptionOptions.ClientAuth.NOT_REQUIRED; -import static org.apache.cassandra.config.EncryptionOptions.ClientAuth.REQUIRED; +import static org.apache.cassandra.config.EncryptionOptions.ClientEncryptionOptions.ClientAuth.NOT_REQUIRED; +import static org.apache.cassandra.config.EncryptionOptions.ClientEncryptionOptions.ClientAuth.REQUIRED; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotEquals; @@ -40,23 +40,25 @@ public class EncryptionOptionsEqualityTest { private EncryptionOptions.ServerEncryptionOptions createServerEncryptionOptions() { - return new EncryptionOptions.ServerEncryptionOptions() + EncryptionOptions.ServerEncryptionOptions.Builder serverEncryptionOptionsBuilder = new EncryptionOptions.ServerEncryptionOptions.Builder(); + return serverEncryptionOptionsBuilder + .withOutboundKeystore(TlsTestUtils.SERVER_OUTBOUND_KEYSTORE_PATH) + .withOutboundKeystorePassword(TlsTestUtils.SERVER_OUTBOUND_KEYSTORE_PASSWORD) .withStoreType("JKS") .withKeyStore(TlsTestUtils.SERVER_KEYSTORE_PATH) .withKeyStorePassword(TlsTestUtils.SERVER_KEYSTORE_PASSWORD) .withTrustStore(TlsTestUtils.SERVER_TRUSTSTORE_PATH) .withTrustStorePassword(TlsTestUtils.SERVER_TRUSTSTORE_PASSWORD) - .withOutboundKeystore(TlsTestUtils.SERVER_OUTBOUND_KEYSTORE_PATH) - .withOutboundKeystorePassword(TlsTestUtils.SERVER_OUTBOUND_KEYSTORE_PASSWORD) .withProtocol("TLSv1.1") .withRequireClientAuth(REQUIRED) - .withRequireEndpointVerification(false); + .withRequireEndpointVerification(false) + .build(); } @Test public void testKeystoreOptions() { - EncryptionOptions encryptionOptions1 = - new EncryptionOptions() + EncryptionOptions.ServerEncryptionOptions encryptionOptions1 = + new EncryptionOptions.ServerEncryptionOptions.Builder() .withStoreType("JKS") .withKeyStore(TlsTestUtils.SERVER_KEYSTORE_PATH) .withKeyStorePassword(TlsTestUtils.SERVER_KEYSTORE_PASSWORD) @@ -64,10 +66,11 @@ public void testKeystoreOptions() { .withTrustStorePassword(TlsTestUtils.SERVER_TRUSTSTORE_PASSWORD) .withProtocol("TLSv1.1") .withRequireClientAuth(REQUIRED) - .withRequireEndpointVerification(false); + .withRequireEndpointVerification(false) + .build(); - EncryptionOptions encryptionOptions2 = - new EncryptionOptions() + EncryptionOptions.ServerEncryptionOptions encryptionOptions2 = + new EncryptionOptions.ServerEncryptionOptions.Builder() .withStoreType("JKS") .withKeyStore(TlsTestUtils.SERVER_KEYSTORE_PATH) .withKeyStorePassword(TlsTestUtils.SERVER_KEYSTORE_PASSWORD) @@ -75,7 +78,8 @@ public void testKeystoreOptions() { .withTrustStorePassword(TlsTestUtils.SERVER_TRUSTSTORE_PASSWORD) .withProtocol("TLSv1.1") .withRequireClientAuth(REQUIRED) - .withRequireEndpointVerification(false); + .withRequireEndpointVerification(false) + .build(); assertEquals(encryptionOptions1, encryptionOptions2); assertEquals(encryptionOptions1.hashCode(), encryptionOptions2.hashCode()); @@ -83,8 +87,8 @@ public void testKeystoreOptions() { @Test public void testKeystoreOptionsWithPasswordFile() { - EncryptionOptions encryptionOptions1 = - new EncryptionOptions() + EncryptionOptions.ServerEncryptionOptions encryptionOptions1 = + new EncryptionOptions.ServerEncryptionOptions.Builder() .withStoreType("JKS") .withKeyStore(TlsTestUtils.SERVER_KEYSTORE_PATH) .withKeyStorePasswordFile(TlsTestUtils.SERVER_KEYSTORE_PASSWORD_FILE) @@ -92,10 +96,11 @@ public void testKeystoreOptionsWithPasswordFile() { .withTrustStorePasswordFile(TlsTestUtils.SERVER_TRUSTSTORE_PASSWORD_FILE) .withProtocol("TLSv1.1") .withRequireClientAuth(REQUIRED) - .withRequireEndpointVerification(false); + .withRequireEndpointVerification(false) + .build(); - EncryptionOptions encryptionOptions2 = - new EncryptionOptions() + EncryptionOptions.ServerEncryptionOptions encryptionOptions2 = + new EncryptionOptions.ServerEncryptionOptions.Builder() .withStoreType("JKS") .withKeyStore(TlsTestUtils.SERVER_KEYSTORE_PATH) .withKeyStorePasswordFile(TlsTestUtils.SERVER_KEYSTORE_PASSWORD_FILE) @@ -103,7 +108,8 @@ public void testKeystoreOptionsWithPasswordFile() { .withTrustStorePasswordFile(TlsTestUtils.SERVER_TRUSTSTORE_PASSWORD_FILE) .withProtocol("TLSv1.1") .withRequireClientAuth(REQUIRED) - .withRequireEndpointVerification(false); + .withRequireEndpointVerification(false) + .build(); assertEquals(encryptionOptions1, encryptionOptions2); assertEquals(encryptionOptions1.hashCode(), encryptionOptions2.hashCode()); @@ -115,15 +121,17 @@ public void testMismatchForKeystoreOptionsWithPasswordFile() EncryptionOptions.ServerEncryptionOptions encryptionOptions1 = createServerEncryptionOptions(); EncryptionOptions.ServerEncryptionOptions encryptionOptions2 = createServerEncryptionOptions(); - encryptionOptions1 = encryptionOptions1 + encryptionOptions1 = new EncryptionOptions.ServerEncryptionOptions.Builder(encryptionOptions1) .withKeyStore(TlsTestUtils.SERVER_KEYSTORE_PATH) .withKeyStorePassword(null) - .withKeyStorePasswordFile(TlsTestUtils.SERVER_KEYSTORE_PASSWORD_FILE); + .withKeyStorePasswordFile(TlsTestUtils.SERVER_KEYSTORE_PASSWORD_FILE) + .build(); - encryptionOptions2 = encryptionOptions2 + encryptionOptions2 = new EncryptionOptions.ServerEncryptionOptions.Builder(encryptionOptions2) .withKeyStore(TlsTestUtils.SERVER_KEYSTORE_PATH) .withKeyStorePassword(null) - .withKeyStorePasswordFile(TlsTestUtils.SERVER_TRUSTSTORE_PASSWORD_FILE); + .withKeyStorePasswordFile(TlsTestUtils.SERVER_TRUSTSTORE_PASSWORD_FILE) + .build(); assertNotEquals(encryptionOptions1, encryptionOptions2); assertNotEquals(encryptionOptions1.hashCode(), encryptionOptions2.hashCode()); @@ -135,22 +143,24 @@ public void testSameCustomSslContextFactoryImplementation() { Map parameters1 = new HashMap<>(); parameters1.put("key1", "value1"); parameters1.put("key2", "value2"); - EncryptionOptions encryptionOptions1 = - new EncryptionOptions() + EncryptionOptions.ClientEncryptionOptions encryptionOptions1 = + new EncryptionOptions.ClientEncryptionOptions.Builder() .withSslContextFactory(new ParameterizedClass(DummySslContextFactoryImpl.class.getName(), parameters1)) .withProtocol("TLSv1.1") .withRequireClientAuth(REQUIRED) - .withRequireEndpointVerification(false); + .withRequireEndpointVerification(false) + .build(); Map parameters2 = new HashMap<>(); parameters2.put("key1", "value1"); parameters2.put("key2", "value2"); - EncryptionOptions encryptionOptions2 = - new EncryptionOptions() + EncryptionOptions.ClientEncryptionOptions encryptionOptions2 = + new EncryptionOptions.ClientEncryptionOptions.Builder() .withSslContextFactory(new ParameterizedClass(DummySslContextFactoryImpl.class.getName(), parameters2)) .withProtocol("TLSv1.1") .withRequireClientAuth(REQUIRED) - .withRequireEndpointVerification(false); + .withRequireEndpointVerification(false) + .build(); assertEquals(encryptionOptions1, encryptionOptions2); assertEquals(encryptionOptions1.hashCode(), encryptionOptions2.hashCode()); @@ -162,22 +172,24 @@ public void testDifferentCustomSslContextFactoryImplementations() { Map parameters1 = new HashMap<>(); parameters1.put("key1", "value1"); parameters1.put("key2", "value2"); - EncryptionOptions encryptionOptions1 = - new EncryptionOptions() + EncryptionOptions.ClientEncryptionOptions encryptionOptions1 = + new EncryptionOptions.ClientEncryptionOptions.Builder() .withSslContextFactory(new ParameterizedClass(DummySslContextFactoryImpl.class.getName(), parameters1)) .withProtocol("TLSv1.1") .withRequireClientAuth(NOT_REQUIRED) - .withRequireEndpointVerification(true); + .withRequireEndpointVerification(true) + .build(); Map parameters2 = new HashMap<>(); parameters2.put("key1", "value1"); parameters2.put("key2", "value2"); - EncryptionOptions encryptionOptions2 = - new EncryptionOptions() + EncryptionOptions.ClientEncryptionOptions encryptionOptions2 = + new EncryptionOptions.ClientEncryptionOptions.Builder() .withSslContextFactory(new ParameterizedClass(DefaultSslContextFactory.class.getName(), parameters2)) .withProtocol("TLSv1.1") .withRequireClientAuth(NOT_REQUIRED) - .withRequireEndpointVerification(true); + .withRequireEndpointVerification(true) + .build(); assertNotEquals(encryptionOptions1, encryptionOptions2); assertNotEquals(encryptionOptions1.hashCode(), encryptionOptions2.hashCode()); @@ -189,18 +201,20 @@ public void testDifferentCustomSslContextFactoryParameters() { Map parameters1 = new HashMap<>(); parameters1.put("key1", "value11"); parameters1.put("key2", "value12"); - EncryptionOptions encryptionOptions1 = - new EncryptionOptions() + EncryptionOptions.ClientEncryptionOptions encryptionOptions1 = + new EncryptionOptions.ClientEncryptionOptions.Builder() .withSslContextFactory(new ParameterizedClass(DummySslContextFactoryImpl.class.getName(), parameters1)) - .withProtocol("TLSv1.1"); + .withProtocol("TLSv1.1") + .build(); Map parameters2 = new HashMap<>(); parameters2.put("key1", "value21"); parameters2.put("key2", "value22"); - EncryptionOptions encryptionOptions2 = - new EncryptionOptions() + EncryptionOptions.ClientEncryptionOptions encryptionOptions2 = + new EncryptionOptions.ClientEncryptionOptions.Builder() .withSslContextFactory(new ParameterizedClass(DummySslContextFactoryImpl.class.getName(), parameters2)) - .withProtocol("TLSv1.1"); + .withProtocol("TLSv1.1") + .build(); assertNotEquals(encryptionOptions1, encryptionOptions2); assertNotEquals(encryptionOptions1.hashCode(), encryptionOptions2.hashCode()); @@ -222,13 +236,15 @@ public void testServerEncryptionOptionsMismatchForOutboundKeystore() EncryptionOptions.ServerEncryptionOptions encryptionOptions1 = createServerEncryptionOptions(); EncryptionOptions.ServerEncryptionOptions encryptionOptions2 = createServerEncryptionOptions(); - encryptionOptions1 = encryptionOptions1 + encryptionOptions1 = new EncryptionOptions.ServerEncryptionOptions.Builder(encryptionOptions1) .withOutboundKeystore("test/conf/cassandra_outbound1.keystore") - .withOutboundKeystorePassword("cassandra1"); + .withOutboundKeystorePassword("cassandra1") + .build(); - encryptionOptions2 = encryptionOptions2 + encryptionOptions2 = new EncryptionOptions.ServerEncryptionOptions.Builder(encryptionOptions2) .withOutboundKeystore("test/conf/cassandra_outbound2.keystore") - .withOutboundKeystorePassword("cassandra2"); + .withOutboundKeystorePassword("cassandra2") + .build(); assertNotEquals(encryptionOptions1, encryptionOptions2); assertNotEquals(encryptionOptions1.hashCode(), encryptionOptions2.hashCode()); @@ -240,13 +256,15 @@ public void testServerEncryptionOptionsMismatchForInboundKeystore() EncryptionOptions.ServerEncryptionOptions encryptionOptions1 = createServerEncryptionOptions(); EncryptionOptions.ServerEncryptionOptions encryptionOptions2 = createServerEncryptionOptions(); - encryptionOptions1 = encryptionOptions1 + encryptionOptions1 = new EncryptionOptions.ServerEncryptionOptions.Builder(encryptionOptions1) .withKeyStore("test/conf/cassandra1.keystore") - .withKeyStorePassword("cassandra1"); + .withKeyStorePassword("cassandra1") + .build(); - encryptionOptions2 = encryptionOptions2 + encryptionOptions2 = new EncryptionOptions.ServerEncryptionOptions.Builder(encryptionOptions2) .withKeyStore("test/conf/cassandra2.keystore") - .withKeyStorePassword("cassandra2"); + .withKeyStorePassword("cassandra2") + .build(); assertNotEquals(encryptionOptions1, encryptionOptions2); assertNotEquals(encryptionOptions1.hashCode(), encryptionOptions2.hashCode()); diff --git a/test/unit/org/apache/cassandra/config/EncryptionOptionsTest.java b/test/unit/org/apache/cassandra/config/EncryptionOptionsTest.java index 5ef08eb060a0..cec982d0c52b 100644 --- a/test/unit/org/apache/cassandra/config/EncryptionOptionsTest.java +++ b/test/unit/org/apache/cassandra/config/EncryptionOptionsTest.java @@ -23,11 +23,11 @@ import java.util.Map; import com.google.common.collect.ImmutableMap; -import org.apache.cassandra.io.util.File; import org.junit.Assert; import org.junit.Test; import org.apache.cassandra.exceptions.ConfigurationException; +import org.apache.cassandra.io.util.File; import org.assertj.core.api.Assertions; import org.yaml.snakeyaml.constructor.ConstructorException; @@ -46,11 +46,11 @@ public class EncryptionOptionsTest { static class EncryptionOptionsTestCase { - final EncryptionOptions encryptionOptions; + final EncryptionOptions.ClientEncryptionOptions encryptionOptions; final EncryptionOptions.TlsEncryptionPolicy expected; final String description; - public EncryptionOptionsTestCase(EncryptionOptions encryptionOptions, EncryptionOptions.TlsEncryptionPolicy expected, String description) + public EncryptionOptionsTestCase(EncryptionOptions.ClientEncryptionOptions encryptionOptions, EncryptionOptions.TlsEncryptionPolicy expected, String description) { this.encryptionOptions = encryptionOptions; this.expected = expected; @@ -59,25 +59,25 @@ public EncryptionOptionsTestCase(EncryptionOptions encryptionOptions, Encryption public static EncryptionOptionsTestCase of(Boolean optional, String keystorePath, Boolean enabled, EncryptionOptions.TlsEncryptionPolicy expected) { - return new EncryptionOptionsTestCase(new EncryptionOptions(new ParameterizedClass("org.apache.cassandra.security.DefaultSslContextFactory", - new HashMap<>()), - keystorePath, "dummypass", null, - "dummytruststore", "dummypass", null, - Collections.emptyList(), null, null, null, "JKS", "false", false, enabled, optional, null, null) + return new EncryptionOptionsTestCase(new EncryptionOptions.ClientEncryptionOptions(new ParameterizedClass("org.apache.cassandra.security.DefaultSslContextFactory", + new HashMap<>()), + keystorePath, "dummypass", null, + "dummytruststore", "dummypass", null, + Collections.emptyList(), null, null, null, "JKS", "false", false, enabled, optional, null, null) .applyConfig(), expected, String.format("optional=%s keystore=%s enabled=%s", optional, keystorePath, enabled)); } public static EncryptionOptionsTestCase of(Boolean optional, String keystorePath, Boolean enabled, - Map customSslContextFactoryParams, + Map customSslContextFactoryParams, EncryptionOptions.TlsEncryptionPolicy expected) { - return new EncryptionOptionsTestCase(new EncryptionOptions(new ParameterizedClass("org.apache.cassandra.security.DefaultSslContextFactory", - customSslContextFactoryParams), - keystorePath, "dummypass", null, - "dummytruststore", "dummypass", null, - Collections.emptyList(), null, null, null, "JKS", "false", false, enabled, optional, null, null) + return new EncryptionOptionsTestCase(new EncryptionOptions.ClientEncryptionOptions(new ParameterizedClass("org.apache.cassandra.security.DefaultSslContextFactory", + customSslContextFactoryParams), + keystorePath, "dummypass", null, + "dummytruststore", "dummypass", null, + Collections.emptyList(), null, null, null, "JKS", "false", false, enabled, optional, null, null) .applyConfig(), expected, String.format("optional=%s keystore=%s enabled=%s", optional, keystorePath, enabled)); @@ -87,15 +87,15 @@ public static EncryptionOptionsTestCase of(Boolean optional, String keystorePath static final String absentKeystore = "test/conf/missing-keystore-is-not-here"; static final String presentKeystore = "test/conf/keystore.jks"; final EncryptionOptionsTestCase[] encryptionOptionTestCases = { - // Optional Keystore Enabled Expected - EncryptionOptionsTestCase.of(null, absentKeystore, false, UNENCRYPTED), - EncryptionOptionsTestCase.of(null, absentKeystore, true, ENCRYPTED), - EncryptionOptionsTestCase.of(null, presentKeystore, false, OPTIONAL), - EncryptionOptionsTestCase.of(null, presentKeystore, true, ENCRYPTED), - EncryptionOptionsTestCase.of(false, absentKeystore, false, UNENCRYPTED), - EncryptionOptionsTestCase.of(false, absentKeystore, true, ENCRYPTED), - EncryptionOptionsTestCase.of(true, presentKeystore, false, OPTIONAL), - EncryptionOptionsTestCase.of(true, presentKeystore, true, OPTIONAL) + // Optional Keystore Enabled Expected + EncryptionOptionsTestCase.of(null, absentKeystore, false, UNENCRYPTED), + EncryptionOptionsTestCase.of(null, absentKeystore, true, ENCRYPTED), + EncryptionOptionsTestCase.of(null, presentKeystore, false, OPTIONAL), + EncryptionOptionsTestCase.of(null, presentKeystore, true, ENCRYPTED), + EncryptionOptionsTestCase.of(false, absentKeystore, false, UNENCRYPTED), + EncryptionOptionsTestCase.of(false, absentKeystore, true, ENCRYPTED), + EncryptionOptionsTestCase.of(true, presentKeystore, false, OPTIONAL), + EncryptionOptionsTestCase.of(true, presentKeystore, true, OPTIONAL) }; @Test @@ -111,11 +111,11 @@ public void testEncryptionOptionPolicy() static class ServerEncryptionOptionsTestCase { - final EncryptionOptions encryptionOptions; + final EncryptionOptions.ServerEncryptionOptions encryptionOptions; final EncryptionOptions.TlsEncryptionPolicy expected; final String description; - public ServerEncryptionOptionsTestCase(EncryptionOptions encryptionOptions, EncryptionOptions.TlsEncryptionPolicy expected, String description) + public ServerEncryptionOptionsTestCase(EncryptionOptions.ServerEncryptionOptions encryptionOptions, EncryptionOptions.TlsEncryptionPolicy expected, String description) { this.encryptionOptions = encryptionOptions; this.expected = expected; @@ -131,10 +131,10 @@ public static ServerEncryptionOptionsTestCase of(Boolean optional, String keysto keystorePath, "dummypass", null, keystorePath, "dummypass", null, "dummytruststore", "dummypass", null, - Collections.emptyList(), null, null, null, "JKS", "false", false, optional, internodeEncryption, false, null, null) + Collections.emptyList(), null, null, null, "JKS", "false", false, optional, internodeEncryption, false, null, null) .applyConfig(), - expected, - String.format("optional=%s keystore=%s internode=%s", optional, keystorePath, internodeEncryption)); + expected, + String.format("optional=%s keystore=%s internode=%s", optional, keystorePath, internodeEncryption)); } } @@ -143,8 +143,8 @@ public void isEnabledServer() { Map yaml = ImmutableMap.of( "server_encryption_options", ImmutableMap.of( - "isEnabled", false - ) + "isEnabled", false + ) ); Assertions.assertThatThrownBy(() -> YamlConfigurationLoader.fromMap(yaml, Config.class)) @@ -157,8 +157,8 @@ public void isOptionalServer() { Map yaml = ImmutableMap.of( "server_encryption_options", ImmutableMap.of( - "isOptional", false - ) + "isOptional", false + ) ); Assertions.assertThatThrownBy(() -> YamlConfigurationLoader.fromMap(yaml, Config.class)) @@ -171,11 +171,11 @@ public void testMaxCertificateValidityPeriod() { Map yaml = ImmutableMap.of( "server_encryption_options", ImmutableMap.of( - "max_certificate_validity_period", "2d" - ), + "max_certificate_validity_period", "2d" + ), "client_encryption_options", ImmutableMap.of( - "max_certificate_validity_period", "10d" - ) + "max_certificate_validity_period", "10d" + ) ); Config config = YamlConfigurationLoader.fromMap(yaml, Config.class); @@ -188,8 +188,8 @@ public void testFailsToParseInvalidMaxCertificateValidityPeriodValue() { Map yaml = ImmutableMap.of( "server_encryption_options", ImmutableMap.of( - "max_certificate_validity_period", "not-a-valid-input" - ) + "max_certificate_validity_period", "not-a-valid-input" + ) ); Assertions.assertThatThrownBy(() -> YamlConfigurationLoader.fromMap(yaml, Config.class)) @@ -202,8 +202,8 @@ public void testFailsToParseNegativeMaxCertificateValidityPeriod() { Map yaml = ImmutableMap.of( "server_encryption_options", ImmutableMap.of( - "max_certificate_validity_period", "-2d" - ) + "max_certificate_validity_period", "-2d" + ) ); Assertions.assertThatThrownBy(() -> YamlConfigurationLoader.fromMap(yaml, Config.class)) @@ -213,26 +213,26 @@ public void testFailsToParseNegativeMaxCertificateValidityPeriod() final ServerEncryptionOptionsTestCase[] serverEncryptionOptionTestCases = { - // Optional Keystore Internode Expected - ServerEncryptionOptionsTestCase.of(null, absentKeystore, none, UNENCRYPTED), - ServerEncryptionOptionsTestCase.of(null, absentKeystore, rack, OPTIONAL), - ServerEncryptionOptionsTestCase.of(null, absentKeystore, dc, OPTIONAL), - ServerEncryptionOptionsTestCase.of(null, absentKeystore, all, ENCRYPTED), - - ServerEncryptionOptionsTestCase.of(null, presentKeystore, none, OPTIONAL), - ServerEncryptionOptionsTestCase.of(null, presentKeystore, rack, OPTIONAL), - ServerEncryptionOptionsTestCase.of(null, absentKeystore, dc, OPTIONAL), - ServerEncryptionOptionsTestCase.of(null, absentKeystore, all, ENCRYPTED), - - ServerEncryptionOptionsTestCase.of(false, absentKeystore, none, UNENCRYPTED), - ServerEncryptionOptionsTestCase.of(false, absentKeystore, rack, OPTIONAL), - ServerEncryptionOptionsTestCase.of(false, absentKeystore, dc, OPTIONAL), - ServerEncryptionOptionsTestCase.of(false, absentKeystore, all, ENCRYPTED), - - ServerEncryptionOptionsTestCase.of(true, presentKeystore, none, OPTIONAL), - ServerEncryptionOptionsTestCase.of(true, presentKeystore, rack, OPTIONAL), - ServerEncryptionOptionsTestCase.of(true, absentKeystore, dc, OPTIONAL), - ServerEncryptionOptionsTestCase.of(true, absentKeystore, all, OPTIONAL), + // Optional Keystore Internode Expected + ServerEncryptionOptionsTestCase.of(null, absentKeystore, none, UNENCRYPTED), + ServerEncryptionOptionsTestCase.of(null, absentKeystore, rack, OPTIONAL), + ServerEncryptionOptionsTestCase.of(null, absentKeystore, dc, OPTIONAL), + ServerEncryptionOptionsTestCase.of(null, absentKeystore, all, ENCRYPTED), + + ServerEncryptionOptionsTestCase.of(null, presentKeystore, none, OPTIONAL), + ServerEncryptionOptionsTestCase.of(null, presentKeystore, rack, OPTIONAL), + ServerEncryptionOptionsTestCase.of(null, absentKeystore, dc, OPTIONAL), + ServerEncryptionOptionsTestCase.of(null, absentKeystore, all, ENCRYPTED), + + ServerEncryptionOptionsTestCase.of(false, absentKeystore, none, UNENCRYPTED), + ServerEncryptionOptionsTestCase.of(false, absentKeystore, rack, OPTIONAL), + ServerEncryptionOptionsTestCase.of(false, absentKeystore, dc, OPTIONAL), + ServerEncryptionOptionsTestCase.of(false, absentKeystore, all, ENCRYPTED), + + ServerEncryptionOptionsTestCase.of(true, presentKeystore, none, OPTIONAL), + ServerEncryptionOptionsTestCase.of(true, presentKeystore, rack, OPTIONAL), + ServerEncryptionOptionsTestCase.of(true, absentKeystore, dc, OPTIONAL), + ServerEncryptionOptionsTestCase.of(true, absentKeystore, all, OPTIONAL), }; @Test @@ -246,12 +246,12 @@ public void testServerEncryptionOptionPolicy() } } - @Test(expected = IllegalArgumentException.class) + @Test(expected = IllegalArgumentException.class) public void testMisplacedConfigKey() { Map customSslContextFactoryParams = new HashMap<>(); - for(EncryptionOptions.ConfigKey configKey: EncryptionOptions.ConfigKey.values()) + for (EncryptionOptions.ConfigKey configKey : EncryptionOptions.ConfigKey.values()) { customSslContextFactoryParams.put(configKey.toString(), "my-custom-value"); } diff --git a/test/unit/org/apache/cassandra/config/YamlConfigurationLoaderTest.java b/test/unit/org/apache/cassandra/config/YamlConfigurationLoaderTest.java index 11833783a3ad..68d5302047be 100644 --- a/test/unit/org/apache/cassandra/config/YamlConfigurationLoaderTest.java +++ b/test/unit/org/apache/cassandra/config/YamlConfigurationLoaderTest.java @@ -27,14 +27,19 @@ import java.util.Collections; import java.util.HashMap; import java.util.Map; +import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; import java.util.function.Predicate; import com.google.common.collect.ImmutableMap; import org.junit.Test; +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.dataformat.yaml.YAMLFactory; import org.apache.cassandra.distributed.shared.WithProperties; import org.apache.cassandra.io.util.File; +import org.apache.cassandra.repair.autorepair.AutoRepairConfig; import org.yaml.snakeyaml.error.YAMLException; import static org.apache.cassandra.config.CassandraRelevantProperties.CONFIG_ALLOW_SYSTEM_PROPERTIES; @@ -42,6 +47,7 @@ import static org.apache.cassandra.config.YamlConfigurationLoader.SYSTEM_PROPERTY_PREFIX; import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatThrownBy; + import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; @@ -104,8 +110,8 @@ public void validateTypes() assertEquals("You have wrongly defined a config parameter of abstract type DurationSpec, DataStorageSpec or DataRateSpec." + "Please check the config docs, otherwise Cassandra won't be able to start with this parameter being set in cassandra.yaml.", Arrays.stream(Config.class.getFields()) - .filter(f -> !Modifier.isStatic(f.getModifiers())) - .filter(isDurationSpec.or(isDataRateSpec).or(isDataStorageSpec)).count(), 0); + .filter(f -> !Modifier.isStatic(f.getModifiers())) + .filter(isDurationSpec.or(isDataRateSpec).or(isDataStorageSpec)).count(), 0); } @Test @@ -113,12 +119,12 @@ public void updateInPlace() { Config config = new Config(); Map map = ImmutableMap.builder().put("storage_port", 123) - .put("commitlog_sync", Config.CommitLogSync.batch) - .put("seed_provider.class_name", "org.apache.cassandra.locator.SimpleSeedProvider") - .put("client_encryption_options.cipher_suites", Collections.singletonList("FakeCipher")) - .put("client_encryption_options.optional", false) - .put("client_encryption_options.enabled", true) - .build(); + .put("commitlog_sync", Config.CommitLogSync.batch) + .put("seed_provider.class_name", "org.apache.cassandra.locator.SimpleSeedProvider") + .put("client_encryption_options.cipher_suites", Collections.singletonList("FakeCipher")) + .put("client_encryption_options.optional", false) + .put("client_encryption_options.enabled", true) + .build(); Config updated = YamlConfigurationLoader.updateFromMap(map, true, config); assert updated == config : "Config pointers do not match"; assertThat(config.storage_port).isEqualTo(123); @@ -271,6 +277,12 @@ public void fromMapTest() Map encryptionOptions = ImmutableMap.of("cipher_suites", Collections.singletonList("FakeCipher"), "optional", false, "enabled", true); + Map autoRepairConfig = ImmutableMap.of("enabled", true, + "global_settings", + ImmutableMap.of("number_of_repair_threads", 1), + "repair_type_overrides", + ImmutableMap.of("full", + ImmutableMap.of("number_of_repair_threads", 2))); Map map = new ImmutableMap.Builder() .put("storage_port", storagePort) .put("commitlog_sync", commitLogSync) @@ -279,6 +291,7 @@ public void fromMapTest() .put("internode_socket_send_buffer_size", "5B") .put("internode_socket_receive_buffer_size", "5B") .put("commitlog_sync_group_window_in_ms", "42") + .put("auto_repair", autoRepairConfig) .build(); Config config = YamlConfigurationLoader.fromMap(map, Config.class); @@ -289,6 +302,9 @@ public void fromMapTest() assertEquals(true, config.client_encryption_options.enabled); // Check a nested object assertEquals(new DataStorageSpec.IntBytesBound("5B"), config.internode_socket_send_buffer_size); // Check names backward compatibility (CASSANDRA-17141 and CASSANDRA-15234) assertEquals(new DataStorageSpec.IntBytesBound("5B"), config.internode_socket_receive_buffer_size); // Check names backward compatibility (CASSANDRA-17141 and CASSANDRA-15234) + assertTrue(config.auto_repair.enabled); + assertEquals(new DurationSpec.IntSecondsBound("6h"), config.auto_repair.getAutoRepairTableMaxRepairTime(AutoRepairConfig.RepairType.INCREMENTAL)); + config.auto_repair.setMaterializedViewRepairEnabled(AutoRepairConfig.RepairType.INCREMENTAL, false); } @Test @@ -426,6 +442,38 @@ public void converters() assertThat(from("compaction_tombstone_warning_threshold", "0").partition_tombstones_warn_threshold).isEqualTo(0); } + @Test + public void process() + { + for (Type type : Type.values()) + { + Config c = fromType(type, "available_processors", 4); + assertThat(c.available_processors).isEqualTo(new OptionaldPositiveInt(4)); + assertThat(c.accord.command_store_shard_count).isEqualTo(OptionaldPositiveInt.UNDEFINED); + assertThat(c.accord.queue_shard_count).isEqualTo(OptionaldPositiveInt.UNDEFINED); + + c = fromType(type, "available_processors", 3, "accord.queue_shard_count", 1, "accord.command_store_shard_count", 1); + assertThat(c.available_processors).isEqualTo(new OptionaldPositiveInt(3)); + assertThat(c.accord.command_store_shard_count).isEqualTo(new OptionaldPositiveInt(1)); + assertThat(c.accord.queue_shard_count).isEqualTo(new OptionaldPositiveInt(1)); + } + } + + private enum Type { MAP, YAML } + + private static Config fromType(Type type, Object... values) + { + switch (type) + { + case MAP: + return from(values); + case YAML: + return fromYaml(values); + default: + throw new AssertionError("Unexpected type: " + type); + } + } + private static Config from(Object... values) { assert values.length % 2 == 0 : "Map can only be created with an even number of inputs: given " + values.length; @@ -469,6 +517,38 @@ public void testBackwardCompatibilityOfAuthenticatorPropertyAsString() throws IO assertTrue(config.authenticator.parameters.isEmpty()); } + @Test + public void testAccordConfig() + { + Map accordSpec = ImmutableMap.of("fast_path_update_delay", "60s", + "durability_txnid_lag", "60s", + "shard_durability_cycle", "60s", + "global_durability_cycle", "60s"); + AccordSpec spec = from("accord", accordSpec).accord; + assertThat(spec.fast_path_update_delay.to(TimeUnit.NANOSECONDS)).isEqualTo(60000000000L); + assertThat(spec.durability_txnid_lag.to(TimeUnit.NANOSECONDS)).isEqualTo(60000000000L); + assertThat(spec.shard_durability_cycle.to(TimeUnit.NANOSECONDS)).isEqualTo(60000000000L); + assertThat(spec.global_durability_cycle.to(TimeUnit.NANOSECONDS)).isEqualTo(60000000000L); + } + + private static Config fromYaml(Object... values) + { + assert values.length % 2 == 0 : "Map can only be created with an even number of inputs: given " + values.length; + ImmutableMap.Builder builder = ImmutableMap.builder(); + for (int i = 0; i < values.length; i += 2) + builder.put((String) values[i], values[i + 1]); + ObjectMapper mapper = new ObjectMapper(new YAMLFactory()); // checkstyle: permit this instantiation + try + { + byte[] bytes = mapper.writeValueAsBytes(builder.build()); + return YamlConfigurationLoader.loadConfig(bytes); + } + catch (JsonProcessingException e) + { + throw new AssertionError("Unable to convert map to YAML", e); + } + } + public static Config load(String path) { URL url = YamlConfigurationLoaderTest.class.getClassLoader().getResource(path); diff --git a/test/unit/org/apache/cassandra/constraints/AlterTableWithTableConstraintValidationTest.java b/test/unit/org/apache/cassandra/constraints/AlterTableWithTableConstraintValidationTest.java index d5f3f4a32c50..9ecae738dd45 100644 --- a/test/unit/org/apache/cassandra/constraints/AlterTableWithTableConstraintValidationTest.java +++ b/test/unit/org/apache/cassandra/constraints/AlterTableWithTableConstraintValidationTest.java @@ -22,10 +22,8 @@ import org.apache.cassandra.exceptions.InvalidRequestException; - public class AlterTableWithTableConstraintValidationTest extends CqlConstraintValidationTester { - @Test public void testCreateTableWithColumnNamedConstraintDescribeTableNonFunction() throws Throwable { @@ -118,12 +116,12 @@ public void testCreateTableAddMultipleMixedConstraints() throws Throwable table, tableCreateStatement)); - execute("ALTER TABLE %s ALTER ck2 CHECK LENGTH(ck2) = 4"); + execute("ALTER TABLE %s ALTER ck2 CHECK LENGTH() = 4"); tableCreateStatement = "CREATE TABLE " + KEYSPACE + "." + table + " (\n" + " pk int,\n" + " ck1 int CHECK ck1 < 100,\n" + - " ck2 text CHECK LENGTH(ck2) = 4,\n" + + " ck2 text CHECK LENGTH() = 4,\n" + " v int,\n" + " PRIMARY KEY (pk, ck1, ck2)\n" + ") WITH CLUSTERING ORDER BY (ck1 ASC, ck2 ASC)\n" + @@ -135,13 +133,13 @@ public void testCreateTableAddMultipleMixedConstraints() throws Throwable table, tableCreateStatement)); - execute("ALTER TABLE %s ALTER v CHECK NOT_NULL(v)"); + execute("ALTER TABLE %s ALTER v CHECK NOT NULL"); tableCreateStatement = "CREATE TABLE " + KEYSPACE + "." + table + " (\n" + " pk int,\n" + " ck1 int CHECK ck1 < 100,\n" + - " ck2 text CHECK LENGTH(ck2) = 4,\n" + - " v int CHECK NOT_NULL(v),\n" + + " ck2 text CHECK LENGTH() = 4,\n" + + " v int CHECK NOT NULL,\n" + " PRIMARY KEY (pk, ck1, ck2)\n" + ") WITH CLUSTERING ORDER BY (ck1 ASC, ck2 ASC)\n" + " AND " + tableParametersCql(); @@ -204,7 +202,7 @@ public void testAlterWithConstraintsAndCdcEnabled() throws Throwable @Test public void testAlterWithCdcAndPKConstraintsEnabled() throws Throwable { - createTable("CREATE TABLE %s (pk text CHECK length(pk) = 100, ck1 int, ck2 int, PRIMARY KEY ((pk), ck1, ck2));"); + createTable("CREATE TABLE %s (pk text CHECK length() = 100, ck1 int, ck2 int, PRIMARY KEY ((pk), ck1, ck2));"); // It works execute("ALTER TABLE %s WITH cdc = true"); } @@ -239,4 +237,44 @@ public void testCreateTableAddConstraintWithNonExistingColumn() throws Throwable String expectedErrorMessage = "Column 'foo' doesn't exist"; assertInvalidThrowMessage(expectedErrorMessage, InvalidRequestException.class, "ALTER TABLE %s ALTER foo CHECK foo < 100"); } + + @Test + public void testAlterTableAlterExistingColumnWithCheckOnNonExistingColumn() throws Throwable + { + createTable("CREATE TABLE %s (pk int, ck1 text, ck2 text, v int, PRIMARY KEY ((pk),ck1, ck2));"); + assertInvalidThrowMessage("Constraint ck3 < 100 was not specified on a column it operates on: ck1 but on: ck3", + InvalidRequestException.class, + "ALTER TABLE %s ALTER ck1 CHECK ck3 < 100"); + } + + @Test + public void testAlterTableAddNewColumnWithCheckOnNonExistingColumn() throws Throwable + { + createTable("CREATE TABLE %s (pk int, ck1 text, ck2 text, v int, PRIMARY KEY ((pk),ck1, ck2));"); + + assertInvalidThrowMessage("Constraint v3 < 100 was not specified on a column it operates on: v2 but on: v3", + InvalidRequestException.class, + "ALTER TABLE %s ADD v2 int CHECK v3 < 100"); + } + + @Test + public void testAlterTableAddColumnWithCheck() + { + createTable("CREATE TABLE %s (pk text, col1 int, primary key (pk));"); + execute("ALTER TABLE %s ADD col2 int CHECK col2 > 0"); + } + + @Test + public void testNotNullSyntax() throws Throwable + { + createTable("CREATE TABLE %s (pk text, col1 int NOT NULL, primary key (pk));"); + createTable("CREATE TABLE %s (pk text, col1 int CHECK NOT NULL, primary key (pk));"); + createTable("CREATE TABLE %s (pk text, col1 int NOT NULL CHECK col1 > 0, primary key (pk));"); + execute("ALTER TABLE %s ALTER col1 CHECK col1 > 100"); + execute("ALTER TABLE %s ALTER col1 CHECK NOT NULL AND col1 > 100"); + + assertInvalidThrowMessage("Duplicate definition of NOT NULL constraint", + InvalidRequestException.class, + "CREATE TABLE %s (pk text, col1 int NOT NULL CHECK NOT NULL, primary key (pk));"); + } } diff --git a/test/unit/org/apache/cassandra/constraints/ConstraintArgumentsTest.java b/test/unit/org/apache/cassandra/constraints/ConstraintArgumentsTest.java new file mode 100644 index 000000000000..289c8ceb64d6 --- /dev/null +++ b/test/unit/org/apache/cassandra/constraints/ConstraintArgumentsTest.java @@ -0,0 +1,212 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.constraints; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.List; + +import org.junit.Test; + +import org.apache.cassandra.cql3.ColumnIdentifier; +import org.apache.cassandra.cql3.Operator; +import org.apache.cassandra.cql3.constraints.ColumnConstraint; +import org.apache.cassandra.cql3.constraints.ColumnConstraints; +import org.apache.cassandra.cql3.constraints.ConstraintFunction; +import org.apache.cassandra.cql3.constraints.ConstraintViolationException; +import org.apache.cassandra.cql3.constraints.InvalidConstraintDefinitionException; +import org.apache.cassandra.cql3.constraints.UnaryConstraintFunction; +import org.apache.cassandra.cql3.constraints.UnaryFunctionColumnConstraint; +import org.apache.cassandra.db.marshal.AbstractType; +import org.apache.cassandra.db.marshal.AsciiType; +import org.apache.cassandra.db.marshal.UTF8Type; +import org.apache.cassandra.io.util.DataInputBuffer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputBuffer; +import org.apache.cassandra.schema.ColumnMetadata; +import org.apache.cassandra.tcm.serialization.MetadataSerializer; +import org.apache.cassandra.tcm.serialization.Version; + +import static java.lang.String.format; +import static org.apache.cassandra.schema.ColumnMetadata.Kind.REGULAR; +import static org.assertj.core.api.Assertions.assertThatThrownBy; +import static org.junit.Assert.assertEquals; + +public class ConstraintArgumentsTest +{ + private static final ColumnIdentifier columnIdentifier = new ColumnIdentifier("a_column", false); + private static final ColumnMetadata columnMetadata = new ColumnMetadata("a", "b", columnIdentifier, UTF8Type.instance, ColumnMetadata.NO_UNIQUE_ID, -1, REGULAR, null); + + @Test + public void testDeserOfContraintsWithArguments() throws Throwable + { + List> checkConstraints = new ArrayList<>(); + checkConstraints.add(new UnaryFunctionColumnConstraint(new Enumeration(List.of("a", "b", "c")))); + ColumnConstraints constraints = new ColumnConstraints(checkConstraints); + constraints.setColumnName(columnIdentifier); + + MetadataSerializer serializer = new TestingSerializer(); + + DataOutputBuffer dataOutputBuffer = new DataOutputBuffer(); + serializer.serialize(constraints, dataOutputBuffer, Version.V7); + + DataInputBuffer dataInputBuffer = new DataInputBuffer(dataOutputBuffer.getData()); + ColumnConstraints deserialize = serializer.deserialize(dataInputBuffer, Version.V7); + + List> deserializeConstraints = deserialize.getConstraints(); + assertEquals(1, deserializeConstraints.size()); + ColumnConstraint constraint = deserializeConstraints.get(0); + + assertEquals(Enumeration.FUNCTION_NAME, constraint.name()); + assertEquals(ColumnConstraint.ConstraintType.UNARY_FUNCTION, constraint.getConstraintType()); + + constraint.validate(columnMetadata); + + UnaryFunctionColumnConstraint c = ((UnaryFunctionColumnConstraint) constraint); + List arguments = c.function().arguments(); + assertEquals(List.of("a", "b", "c"), arguments); + } + + @Test + public void testDeserOfContraintsWithoutArguments() throws Throwable + { + List> checkConstraints = new ArrayList<>(); + checkConstraints.add(new UnaryFunctionColumnConstraint(new ParamerterlessContraint(List.of("a", "b", "c")))); + ColumnConstraints constraints = new ColumnConstraints(checkConstraints); + constraints.setColumnName(columnIdentifier); + + MetadataSerializer serializer = new TestingSerializer(); + + DataOutputBuffer dataOutputBuffer = new DataOutputBuffer(); + serializer.serialize(constraints, dataOutputBuffer, Version.V7); + + DataInputBuffer dataInputBuffer = new DataInputBuffer(dataOutputBuffer.getData()); + ColumnConstraints deserialize = serializer.deserialize(dataInputBuffer, Version.V7); + + List> deserializeConstraints = deserialize.getConstraints(); + assertEquals(1, deserializeConstraints.size()); + ColumnConstraint constraint = deserializeConstraints.get(0); + + assertEquals(ParamerterlessContraint.FUNCTION_NAME, constraint.name()); + assertEquals(ColumnConstraint.ConstraintType.UNARY_FUNCTION, constraint.getConstraintType()); + + assertThatThrownBy(() -> constraint.validate(columnMetadata)) + .isInstanceOf(InvalidConstraintDefinitionException.class) + .hasMessage("Constraint PARAMERTERLESS does not accept any arguments."); + } + + private static class TestingUnaryFunctionSerializer extends UnaryFunctionColumnConstraint.Serializer + { + @Override + public ConstraintFunction getConstraintFunction(String functionName, List args) + { + if (functionName.equals(Enumeration.FUNCTION_NAME)) + return new Enumeration(args); + + if (functionName.equals(ParamerterlessContraint.FUNCTION_NAME)) + return new ParamerterlessContraint(args); + + throw new IllegalStateException("not supported"); + } + } + + private static class TestingSerializer extends ColumnConstraints.Serializer + { + private static final TestingUnaryFunctionSerializer constraintSerializer = new TestingUnaryFunctionSerializer(); + + @Override + public ColumnConstraint deserializeConstraint(DataInputPlus in, int serializerPosition, Version version) throws IOException + { + return constraintSerializer.deserialize(in, version); + } + } + + private static class ParamerterlessContraint extends UnaryConstraintFunction + { + public static final String FUNCTION_NAME = "PARAMERTERLESS"; + + public ParamerterlessContraint(List args) + { + super(FUNCTION_NAME, args); + } + + @Override + protected void internalEvaluate(AbstractType valueType, Operator relationType, String term, ByteBuffer columnValue) + { + + } + + @Override + public List> getSupportedTypes() + { + return null; + } + } + + private static class Enumeration extends UnaryConstraintFunction + { + private static final List> SUPPORTED_TYPES = List.of(UTF8Type.instance, AsciiType.instance); + + public static final String FUNCTION_NAME = "ENUM"; + + public Enumeration(List args) + { + super(FUNCTION_NAME, args); + } + + @Override + public void internalEvaluate(AbstractType valueType, Operator relationType, String term, ByteBuffer columnValue) + { + if (!args.contains(valueType.getString(columnValue))) + { + throw new ConstraintViolationException(format("Value for column '%s' violated %s constraint as its value is not one of %s.", + columnName.toCQLString(), + name, + args)); + } + } + + @Override + public List> getSupportedTypes() + { + return SUPPORTED_TYPES; + } + + @Override + public boolean isParameterless() + { + return false; + } + + @Override + public boolean equals(Object o) + { + if (this == o) + return true; + + if (!(o instanceof Enumeration)) + return false; + + Enumeration other = (Enumeration) o; + + return columnName.equals(other.columnName) && name.equals(other.name); + } + } +} diff --git a/test/unit/org/apache/cassandra/constraints/ConstraintsSatisfiabilityTest.java b/test/unit/org/apache/cassandra/constraints/ConstraintsSatisfiabilityTest.java index 6f087c5850b5..c22bb7ea533d 100644 --- a/test/unit/org/apache/cassandra/constraints/ConstraintsSatisfiabilityTest.java +++ b/test/unit/org/apache/cassandra/constraints/ConstraintsSatisfiabilityTest.java @@ -18,6 +18,8 @@ package org.apache.cassandra.constraints; +import java.util.List; + import org.junit.Test; import org.apache.cassandra.cql3.ColumnIdentifier; @@ -45,8 +47,8 @@ public class ConstraintsSatisfiabilityTest { private static final ColumnIdentifier columnIdentifier = new ColumnIdentifier("a_column", false); private static final ColumnIdentifier lengthFunctionIdentifier = new ColumnIdentifier("LENGTH", false); - private static final ColumnMetadata regularIntColumn = new ColumnMetadata("a", "b", columnIdentifier, IntegerType.instance, -1, ColumnMetadata.Kind.REGULAR, null); - private static final ColumnMetadata regularStringColumn = new ColumnMetadata("a", "b", columnIdentifier, UTF8Type.instance, -1, ColumnMetadata.Kind.REGULAR, null); + private static final ColumnMetadata regularIntColumn = new ColumnMetadata("a", "b", columnIdentifier, IntegerType.instance, ColumnMetadata.NO_UNIQUE_ID, -1, ColumnMetadata.Kind.REGULAR, null); + private static final ColumnMetadata regularStringColumn = new ColumnMetadata("a", "b", columnIdentifier, UTF8Type.instance, ColumnMetadata.NO_UNIQUE_ID, -1, ColumnMetadata.Kind.REGULAR, null); @Test public void testScalarSatisfiability() throws Throwable @@ -71,13 +73,13 @@ private void run(QuadFunction quadFunction, ColumnMetadata columnMetadata if (op1 == NEQ) { // a_column != 0 and a_column != 10 -> valid - check(op1, 0, op2, 100, quadFunction, null, columnMetadata); + check(op1, 50, op2, 100, quadFunction, null, columnMetadata); // does not make sense to check twice // check a_column != 0 and a_column != 0 check(op1, 0, op2, 0, quadFunction, "There are duplicate constraint definitions on column", columnMetadata); } else - check(op1, 0, op2, 100, quadFunction, "There are duplicate constraint definitions on column", columnMetadata); + check(op1, 50, op2, 100, quadFunction, "There are duplicate constraint definitions on column", columnMetadata); } else if ((op1 == GT && op2 == GTE) || (op1 == GTE && op2 == GT) || @@ -85,19 +87,33 @@ else if ((op1 == GT && op2 == GTE) || (op1 == LTE && op2 == LT) || (op1 == EQ || op2 == EQ)) { - check(op1, 0, op2, 100, quadFunction, "not supported", columnMetadata); + check(op1, 50, op2, 100, quadFunction, "not supported", columnMetadata); } else if ((op1 == LTE && op2 == GT) || (op1 == LT && op2 == GT) || (op1 == LTE && op2 == GTE) || (op1 == LT && op2 == GTE)) { - check(op1, 0, op2, 100, quadFunction, "are not satisfiable", columnMetadata); + check(op1, 50, op2, 100, quadFunction, "are not satisfiable", columnMetadata); } - else if (!(op1 == NEQ || op2 == NEQ)) + else if ((op1 == GT && op2 == LTE) || + (op1 == GT && op2 == LT) || + (op1 == GTE && op2 == LTE) || + (op1 == GTE && op2 == LT)) + { + check(op1, 50, op2, 100, quadFunction, null, columnMetadata); + } + else if ((op1 == GT && op2 == LTE) || + (op1 == GT && op2 == LT) || + (op1 == GTE && op2 == LTE) || + (op1 == GTE && op2 == LT)) { check(op1, 0, op2, 100, quadFunction, null, columnMetadata); } + else if (!(op1 == NEQ || op2 == NEQ)) + { + check(op1, 50, op2, 100, quadFunction, null, columnMetadata); + } else { // this is valid @@ -167,7 +183,7 @@ private ScalarColumnConstraint scalar(Operator operator, Integer term) private FunctionColumnConstraint length(Operator operator, Integer term) { return new FunctionColumnConstraint.Raw(lengthFunctionIdentifier, - columnIdentifier, + List.of(), operator, term.toString()).prepare(); } diff --git a/test/unit/org/apache/cassandra/constraints/CqlConstraintValidationTester.java b/test/unit/org/apache/cassandra/constraints/CqlConstraintValidationTester.java index 8d2e52ff0e4f..cb853dab1b30 100644 --- a/test/unit/org/apache/cassandra/constraints/CqlConstraintValidationTester.java +++ b/test/unit/org/apache/cassandra/constraints/CqlConstraintValidationTester.java @@ -63,6 +63,7 @@ static String tableParametersCql() " AND compression = {'chunk_length_in_kb': '16', 'class': 'org.apache.cassandra.io.compress.LZ4Compressor'}\n" + " AND memtable = 'default'\n" + " AND crc_check_chance = 1.0\n" + + " AND fast_path = 'keyspace'\n" + " AND default_time_to_live = 0\n" + " AND extensions = {}\n" + " AND gc_grace_seconds = 864000\n" + @@ -71,6 +72,8 @@ static String tableParametersCql() " AND memtable_flush_period_in_ms = 0\n" + " AND min_index_interval = 128\n" + " AND read_repair = 'BLOCKING'\n" + + " AND transactional_mode = 'off'\n" + + " AND transactional_migration_from = 'none'\n" + " AND speculative_retry = '99p';"; } diff --git a/test/unit/org/apache/cassandra/constraints/CreateTableWithColumnCqlConstraintValidationTest.java b/test/unit/org/apache/cassandra/constraints/CreateTableWithColumnCqlConstraintValidationTest.java index 6dc160468642..f68ee1ab6920 100644 --- a/test/unit/org/apache/cassandra/constraints/CreateTableWithColumnCqlConstraintValidationTest.java +++ b/test/unit/org/apache/cassandra/constraints/CreateTableWithColumnCqlConstraintValidationTest.java @@ -26,9 +26,11 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; +import org.apache.cassandra.cql3.constraints.InvalidConstraintDefinitionException; import org.apache.cassandra.exceptions.InvalidRequestException; import org.apache.cassandra.utils.Generators; +import static org.assertj.core.api.Assertions.assertThatThrownBy; import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; import static accord.utils.Property.qt; @@ -96,11 +98,11 @@ public void testCreateTableWithColumnMultipleConstraintsDescribeTableNonFunction @Test public void testCreateTableWithColumnNotNamedConstraintDescribeTableFunction() throws Throwable { - String table = createTable(KEYSPACE_PER_TEST, "CREATE TABLE %s (pk int, ck1 text CHECK LENGTH(ck1) = 4, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 ASC);"); + String table = createTable(KEYSPACE_PER_TEST, "CREATE TABLE %s (pk int, ck1 text CHECK LENGTH() = 4, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 ASC);"); String tableCreateStatement = "CREATE TABLE " + KEYSPACE_PER_TEST + "." + table + " (\n" + " pk int,\n" + - " ck1 text CHECK LENGTH(ck1) = 4,\n" + + " ck1 text CHECK LENGTH() = 4,\n" + " ck2 int,\n" + " v int,\n" + " PRIMARY KEY (pk, ck1, ck2)\n" + @@ -117,13 +119,13 @@ public void testCreateTableWithColumnNotNamedConstraintDescribeTableFunction() t @Test public void testCreateTableWithColumnNotNullConstraintDescribe() throws Throwable { - String table = createTable(KEYSPACE_PER_TEST, "CREATE TABLE %s (pk int, ck1 int, ck2 int, v int CHECK NOT_NULL(v), PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 ASC);"); + String table = createTable(KEYSPACE_PER_TEST, "CREATE TABLE %s (pk int, ck1 int, ck2 int, v int CHECK NOT NULL, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 ASC);"); String tableCreateStatement = "CREATE TABLE " + KEYSPACE_PER_TEST + "." + table + " (\n" + " pk int,\n" + " ck1 int,\n" + " ck2 int,\n" + - " v int CHECK NOT_NULL(v),\n" + + " v int CHECK NOT NULL,\n" + " PRIMARY KEY (pk, ck1, ck2)\n" + ") WITH CLUSTERING ORDER BY (ck1 ASC, ck2 ASC)\n" + " AND " + tableParametersCql(); @@ -930,7 +932,7 @@ public void testCreateTableWithColumnWithNotNullCheckScalarFloatConstraints() th @Test public void testCreateTableWithColumnWithClusteringColumnLengthEqualToConstraint() throws Throwable { - createTable("CREATE TABLE %s (pk int, ck1 text CHECK LENGTH(ck1) = 4, ck2 int, v int, PRIMARY KEY ((pk), ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 ASC);"); + createTable("CREATE TABLE %s (pk int, ck1 text CHECK LENGTH() = 4, ck2 int, v int, PRIMARY KEY ((pk), ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 ASC);"); // Valid execute("INSERT INTO %s (pk, ck1, ck2, v) VALUES (1, 'fooo', 2, 3)"); @@ -944,7 +946,7 @@ public void testCreateTableWithColumnWithClusteringColumnLengthEqualToConstraint @Test public void testCreateTableWithColumnWithClusteringColumnLengthDifferentThanConstraint() throws Throwable { - createTable("CREATE TABLE %s (pk int, ck1 text CHECK LENGTH(ck1) != 4, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 ASC);"); + createTable("CREATE TABLE %s (pk int, ck1 text CHECK LENGTH() != 4, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 ASC);"); final String expectedErrorMessage = "Column value does not satisfy value constraint for column 'ck1'. It has a length of"; // Valid @@ -958,7 +960,7 @@ public void testCreateTableWithColumnWithClusteringColumnLengthDifferentThanCons @Test public void testCreateTableWithColumnWithClusteringColumnLengthBiggerThanConstraint() throws Throwable { - createTable("CREATE TABLE %s (pk int, ck1 text CHECK LENGTH(ck1) > 4, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 ASC);"); + createTable("CREATE TABLE %s (pk int, ck1 text CHECK LENGTH() > 4, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 ASC);"); // Valid execute("INSERT INTO %s (pk, ck1, ck2, v) VALUES (1, 'foooo', 2, 3)"); @@ -972,7 +974,7 @@ public void testCreateTableWithColumnWithClusteringColumnLengthBiggerThanConstra @Test public void testCreateTableWithColumnWithClusteringColumnLengthBiggerOrEqualThanConstraint() throws Throwable { - createTable("CREATE TABLE %s (pk int, ck1 text CHECK LENGTH(ck1) >= 4, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 ASC);"); + createTable("CREATE TABLE %s (pk int, ck1 text CHECK LENGTH() >= 4, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 ASC);"); // Valid execute("INSERT INTO %s (pk, ck1, ck2, v) VALUES (1, 'foooo', 2, 3)"); @@ -986,7 +988,7 @@ public void testCreateTableWithColumnWithClusteringColumnLengthBiggerOrEqualThan @Test public void testCreateTableWithColumnWithClusteringColumnLengthSmallerThanConstraint() throws Throwable { - createTable("CREATE TABLE %s (pk int, ck1 text CHECK LENGTH(ck1) < 4, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 ASC);"); + createTable("CREATE TABLE %s (pk int, ck1 text CHECK LENGTH() < 4, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 ASC);"); // Valid execute("INSERT INTO %s (pk, ck1, ck2, v) VALUES (1, 'foo', 2, 3)"); @@ -1000,7 +1002,7 @@ public void testCreateTableWithColumnWithClusteringColumnLengthSmallerThanConstr @Test public void testCreateTableWithColumnWithClusteringColumnLengthSmallerOrEqualThanConstraint() throws Throwable { - createTable("CREATE TABLE %s (pk int, ck1 text CHECK LENGTH(ck1) <= 4, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 ASC);"); + createTable("CREATE TABLE %s (pk int, ck1 text CHECK LENGTH() <= 4, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 ASC);"); // Valid execute("INSERT INTO %s (pk, ck1, ck2, v) VALUES (1, 'foo', 2, 3)"); @@ -1014,7 +1016,7 @@ public void testCreateTableWithColumnWithClusteringColumnLengthSmallerOrEqualTha @Test public void testCreateTableWithColumnWithClusteringBlobColumnLengthEqualToConstraint() throws Throwable { - createTable("CREATE TABLE %s (pk int, ck1 blob CHECK LENGTH(ck1) = 4, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 ASC);"); + createTable("CREATE TABLE %s (pk int, ck1 blob CHECK LENGTH() = 4, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 ASC);"); // Valid execute("INSERT INTO %s (pk, ck1, ck2, v) VALUES (1, textAsBlob('fooo'), 2, 3)"); @@ -1028,7 +1030,7 @@ public void testCreateTableWithColumnWithClusteringBlobColumnLengthEqualToConstr @Test public void testCreateTableWithColumnWithClusteringBlobColumnLengthDifferentThanConstraint() throws Throwable { - createTable("CREATE TABLE %s (pk int, ck1 blob CHECK LENGTH(ck1) != 4, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 ASC);"); + createTable("CREATE TABLE %s (pk int, ck1 blob CHECK LENGTH() != 4, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 ASC);"); // Valid execute("INSERT INTO %s (pk, ck1, ck2, v) VALUES (1, textAsBlob('foo'), 2, 3)"); @@ -1042,7 +1044,7 @@ public void testCreateTableWithColumnWithClusteringBlobColumnLengthDifferentThan @Test public void testCreateTableWithColumnWithClusteringBlobColumnLengthBiggerThanConstraint() throws Throwable { - createTable("CREATE TABLE %s (pk int, ck1 blob CHECK LENGTH(ck1) > 4, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 ASC);"); + createTable("CREATE TABLE %s (pk int, ck1 blob CHECK LENGTH() > 4, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 ASC);"); // Valid execute("INSERT INTO %s (pk, ck1, ck2, v) VALUES (1, textAsBlob('foooo'), 2, 3)"); @@ -1056,7 +1058,7 @@ public void testCreateTableWithColumnWithClusteringBlobColumnLengthBiggerThanCon @Test public void testCreateTableWithColumnWithClusteringBlobColumnLengthBiggerOrEqualThanConstraint() throws Throwable { - createTable("CREATE TABLE %s (pk int, ck1 blob CHECK LENGTH(ck1) >= 4, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 ASC);"); + createTable("CREATE TABLE %s (pk int, ck1 blob CHECK LENGTH() >= 4, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 ASC);"); // Valid execute("INSERT INTO %s (pk, ck1, ck2, v) VALUES (1, textAsBlob('foooo'), 2, 3)"); @@ -1070,7 +1072,7 @@ public void testCreateTableWithColumnWithClusteringBlobColumnLengthBiggerOrEqual @Test public void testCreateTableWithColumnWithClusteringBlobColumnLengthSmallerThanConstraint() throws Throwable { - createTable("CREATE TABLE %s (pk int, ck1 blob CHECK LENGTH(ck1) < 4, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 ASC);"); + createTable("CREATE TABLE %s (pk int, ck1 blob CHECK LENGTH() < 4, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 ASC);"); // Valid execute("INSERT INTO %s (pk, ck1, ck2, v) VALUES (1, textAsBlob('foo'), 2, 3)"); @@ -1084,7 +1086,7 @@ public void testCreateTableWithColumnWithClusteringBlobColumnLengthSmallerThanCo @Test public void testCreateTableWithColumnWithClusteringBlobColumnLengthSmallerOrEqualThanConstraint() throws Throwable { - createTable("CREATE TABLE %s (pk int, ck1 blob CHECK LENGTH(ck1) <= 4, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 ASC);"); + createTable("CREATE TABLE %s (pk int, ck1 blob CHECK LENGTH() <= 4, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 ASC);"); // Valid execute("INSERT INTO %s (pk, ck1, ck2, v) VALUES (1, textAsBlob('foo'), 2, 3)"); @@ -1099,7 +1101,7 @@ public void testCreateTableWithColumnWithClusteringBlobColumnLengthSmallerOrEqua @Test public void testCreateTableWithColumnWithPkColumnLengthEqualToConstraint() throws Throwable { - createTable("CREATE TABLE %s (pk text CHECK LENGTH(pk) = 4, ck1 int, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); + createTable("CREATE TABLE %s (pk text CHECK LENGTH() = 4, ck1 int, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); // Valid execute("INSERT INTO %s (pk, ck1, ck2, v) VALUES ('fooo', 1, 2, 3)"); @@ -1113,7 +1115,7 @@ public void testCreateTableWithColumnWithPkColumnLengthEqualToConstraint() throw @Test public void testCreateTableWithColumnWithPkColumnLengthDifferentThanConstraint() throws Throwable { - createTable("CREATE TABLE %s (pk text CHECK LENGTH(pk) != 4, ck1 int, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); + createTable("CREATE TABLE %s (pk text CHECK LENGTH() != 4, ck1 int, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); // Valid execute("INSERT INTO %s (pk, ck1, ck2, v) VALUES ('foo', 1, 2, 3)"); @@ -1127,7 +1129,7 @@ public void testCreateTableWithColumnWithPkColumnLengthDifferentThanConstraint() @Test public void testCreateTableWithColumnWithPkColumnLengthBiggerThanConstraint() throws Throwable { - createTable("CREATE TABLE %s (pk text CHECK LENGTH(pk) > 4, ck1 int, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); + createTable("CREATE TABLE %s (pk text CHECK LENGTH() > 4, ck1 int, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); // Valid execute("INSERT INTO %s (pk, ck1, ck2, v) VALUES ('foooo', 1, 2, 3)"); @@ -1141,7 +1143,7 @@ public void testCreateTableWithColumnWithPkColumnLengthBiggerThanConstraint() th @Test public void testCreateTableWithColumnWithPkColumnLengthBiggerOrEqualThanConstraint() throws Throwable { - createTable("CREATE TABLE %s (pk text CHECK LENGTH(pk) >= 4, ck1 int, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); + createTable("CREATE TABLE %s (pk text CHECK LENGTH() >= 4, ck1 int, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); // Valid execute("INSERT INTO %s (pk, ck1, ck2, v) VALUES ('foooo', 1, 2, 3)"); @@ -1155,7 +1157,7 @@ public void testCreateTableWithColumnWithPkColumnLengthBiggerOrEqualThanConstrai @Test public void testCreateTableWithColumnWithPkColumnLengthSmallerThanConstraint() throws Throwable { - createTable("CREATE TABLE %s (pk text CHECK LENGTH(pk) < 4, ck1 int, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); + createTable("CREATE TABLE %s (pk text CHECK LENGTH() < 4, ck1 int, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); // Valid execute("INSERT INTO %s (pk, ck1, ck2, v) VALUES ('foo', 1, 2, 3)"); @@ -1169,7 +1171,7 @@ public void testCreateTableWithColumnWithPkColumnLengthSmallerThanConstraint() t @Test public void testCreateTableWithColumnWithPkColumnLengthSmallerOrEqualThanConstraint() throws Throwable { - createTable("CREATE TABLE %s (pk text CHECK LENGTH(pk) <= 4, ck1 int, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); + createTable("CREATE TABLE %s (pk text CHECK LENGTH() <= 4, ck1 int, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); // Valid execute("INSERT INTO %s (pk, ck1, ck2, v) VALUES ('foo', 1, 2, 3)"); @@ -1184,7 +1186,7 @@ public void testCreateTableWithColumnWithPkColumnLengthSmallerOrEqualThanConstra @Test public void testCreateTableWithColumnWithRegularColumnLengthEqualToConstraint() throws Throwable { - createTable("CREATE TABLE %s (pk int, ck1 int, ck2 int, v text CHECK LENGTH(v) = 4, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); + createTable("CREATE TABLE %s (pk int, ck1 int, ck2 int, v text CHECK LENGTH() = 4, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); // Valid execute("INSERT INTO %s (pk, ck1, ck2, v) VALUES (1, 2, 3, 'fooo')"); @@ -1198,7 +1200,7 @@ public void testCreateTableWithColumnWithRegularColumnLengthEqualToConstraint() @Test public void testCreateTableWithColumnWithRegularColumnLengthDifferentThanConstraint() throws Throwable { - createTable("CREATE TABLE %s (pk int, ck1 int, ck2 int, v text CHECK LENGTH(v) != 4, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); + createTable("CREATE TABLE %s (pk int, ck1 int, ck2 int, v text CHECK LENGTH() != 4, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); // Valid execute("INSERT INTO %s (pk, ck1, ck2, v) VALUES (1, 2, 3, 'foo')"); @@ -1212,7 +1214,7 @@ public void testCreateTableWithColumnWithRegularColumnLengthDifferentThanConstra @Test public void testCreateTableWithColumnWithRegularColumnLengthBiggerThanConstraint() throws Throwable { - createTable("CREATE TABLE %s (pk int, ck1 int, ck2 int, v text CHECK LENGTH(v) > 4, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); + createTable("CREATE TABLE %s (pk int, ck1 int, ck2 int, v text CHECK LENGTH() > 4, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); // Valid execute("INSERT INTO %s (pk, ck1, ck2, v) VALUES (1, 2, 3, 'foooo')"); @@ -1226,7 +1228,7 @@ public void testCreateTableWithColumnWithRegularColumnLengthBiggerThanConstraint @Test public void testCreateTableWithColumnWithRegularColumnLengthBiggerOrEqualThanConstraint() throws Throwable { - createTable("CREATE TABLE %s (pk int, ck1 int, ck2 int, v text CHECK LENGTH(v) >= 4, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); + createTable("CREATE TABLE %s (pk int, ck1 int, ck2 int, v text CHECK LENGTH() >= 4, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); // Valid execute("INSERT INTO %s (pk, ck1, ck2, v) VALUES (1, 2, 3, 'foooo')"); @@ -1240,7 +1242,7 @@ public void testCreateTableWithColumnWithRegularColumnLengthBiggerOrEqualThanCon @Test public void testCreateTableWithColumnWithRegularColumnLengthSmallerThanConstraint() throws Throwable { - createTable("CREATE TABLE %s (pk int, ck1 int, ck2 int, v text CHECK LENGTH(v) < 4, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); + createTable("CREATE TABLE %s (pk int, ck1 int, ck2 int, v text CHECK LENGTH() < 4, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); // Valid execute("INSERT INTO %s (pk, ck1, ck2, v) VALUES (1, 2, 3, 'foo')"); @@ -1254,7 +1256,7 @@ public void testCreateTableWithColumnWithRegularColumnLengthSmallerThanConstrain @Test public void testCreateTableWithColumnWithRegularColumnLengthSmallerOrEqualThanConstraint() throws Throwable { - createTable("CREATE TABLE %s (pk int, ck1 int, ck2 int, v text CHECK LENGTH(v) <= 4, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); + createTable("CREATE TABLE %s (pk int, ck1 int, ck2 int, v text CHECK LENGTH() <= 4, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); // Valid execute("INSERT INTO %s (pk, ck1, ck2, v) VALUES (1, 2, 3, 'foo')"); @@ -1268,7 +1270,7 @@ public void testCreateTableWithColumnWithRegularColumnLengthSmallerOrEqualThanCo @Test public void testCreateTableWithColumnWithRegularColumnLengthCheckNullTextConstraint() throws Throwable { - createTable("CREATE TABLE %s (pk int, ck1 int, ck2 int, v text CHECK LENGTH(v) <= 4, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); + createTable("CREATE TABLE %s (pk int, ck1 int, ck2 int, v text CHECK LENGTH() <= 4, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); final String expectedErrorMessage = "Column value does not satisfy value constraint for column 'v' as it is null."; assertInvalidThrowMessage(expectedErrorMessage, InvalidRequestException.class, "INSERT INTO %s (pk, ck1, ck2, v) VALUES (1, 2, 3, null)"); } @@ -1276,7 +1278,7 @@ public void testCreateTableWithColumnWithRegularColumnLengthCheckNullTextConstra @Test public void testCreateTableWithColumnWithRegularColumnLengthCheckNullVarcharConstraint() throws Throwable { - createTable("CREATE TABLE %s (pk int, ck1 int, ck2 int, v varchar CHECK LENGTH(v) <= 4, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); + createTable("CREATE TABLE %s (pk int, ck1 int, ck2 int, v varchar CHECK LENGTH() <= 4, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); final String expectedErrorMessage = "Column value does not satisfy value constraint for column 'v' as it is null."; assertInvalidThrowMessage(expectedErrorMessage, InvalidRequestException.class, "INSERT INTO %s (pk, ck1, ck2, v) VALUES (1, 2, 3, null)"); } @@ -1284,7 +1286,7 @@ public void testCreateTableWithColumnWithRegularColumnLengthCheckNullVarcharCons @Test public void testCreateTableWithColumnWithRegularColumnLengthCheckNullAsciiConstraint() throws Throwable { - createTable("CREATE TABLE %s (pk int, ck1 int, ck2 int, v ascii CHECK LENGTH(v) <= 4, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); + createTable("CREATE TABLE %s (pk int, ck1 int, ck2 int, v ascii CHECK LENGTH() <= 4, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); final String expectedErrorMessage = "Column value does not satisfy value constraint for column 'v' as it is null."; assertInvalidThrowMessage(expectedErrorMessage, InvalidRequestException.class, "INSERT INTO %s (pk, ck1, ck2, v) VALUES (1, 2, 3, null)"); } @@ -1292,7 +1294,7 @@ public void testCreateTableWithColumnWithRegularColumnLengthCheckNullAsciiConstr @Test public void testCreateTableWithColumnWithRegularColumnLengthCheckNullBlobConstraint() throws Throwable { - createTable("CREATE TABLE %s (pk int, ck1 int, ck2 int, v blob CHECK LENGTH(v) <= 4, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); + createTable("CREATE TABLE %s (pk int, ck1 int, ck2 int, v blob CHECK LENGTH() <= 4, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); final String expectedErrorMessage = "Column value does not satisfy value constraint for column 'v' as it is null."; assertInvalidThrowMessage(expectedErrorMessage, InvalidRequestException.class, "INSERT INTO %s (pk, ck1, ck2, v) VALUES (1, 2, 3, null)"); } @@ -1300,7 +1302,7 @@ public void testCreateTableWithColumnWithRegularColumnLengthCheckNullBlobConstra @Test public void testCreateTableWithColumnMixedColumnsLengthConstraint() throws Throwable { - createTable("CREATE TABLE %s (pk text CHECK LENGTH(pk) = 4, ck1 int, ck2 int, v text CHECK LENGTH(v) = 4, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); + createTable("CREATE TABLE %s (pk text CHECK LENGTH() = 4, ck1 int, ck2 int, v text CHECK LENGTH() = 4, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); // Valid execute("INSERT INTO %s (pk, ck1, ck2, v) VALUES ('fooo', 2, 3, 'fooo')"); @@ -1321,7 +1323,7 @@ public void testCreateTableWithWrongColumnConstraint() throws Throwable { try { - createTable("CREATE TABLE %s (pk text, ck1 int CHECK LENGTH(pk) = 4, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); + createTable("CREATE TABLE %s (pk text, ck1 int CHECK LENGTH() = 4, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); fail(); } catch (InvalidRequestException e) @@ -1336,7 +1338,7 @@ public void testCreateTableWithWrongColumnMultipleConstraint() throws Throwable { try { - createTable("CREATE TABLE %s (pk text, ck1 int CHECK LENGTH(pk) = 4 AND ck1 < 4, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); + createTable("CREATE TABLE %s (pk text, ck1 int CHECK LENGTH() = 4 AND ck1 < 4, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); fail(); } catch (InvalidRequestException e) @@ -1351,7 +1353,7 @@ public void testCreateTableWithColumnWithClusteringColumnInvalidTypeConstraint() { try { - createTable("CREATE TABLE %s (pk int, ck1 int CHECK LENGTH(ck1) = 4, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); + createTable("CREATE TABLE %s (pk int, ck1 int CHECK LENGTH() = 4, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); fail(); } catch (InvalidRequestException e) @@ -1382,7 +1384,7 @@ public void testCreateTableInvalidFunction() throws Throwable { try { - createTable("CREATE TABLE %s (pk text CHECK not_a_function(pk) = 4, ck1 int, ck2 int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); + createTable("CREATE TABLE %s (pk text CHECK not_a_function() = 4, ck1 int, ck2 int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); fail(); } catch (InvalidRequestException e) @@ -1396,7 +1398,7 @@ public void testCreateTableInvalidFunction() throws Throwable public void testCreateTableWithPKConstraintsAndCDCEnabled() throws Throwable { // It works - createTable("CREATE TABLE %s (pk text CHECK length(pk) = 4, ck1 int, ck2 int, PRIMARY KEY ((pk), ck1, ck2)) WITH cdc = true;"); + createTable("CREATE TABLE %s (pk text CHECK length() = 4, ck1 int, ck2 int, PRIMARY KEY ((pk), ck1, ck2)) WITH cdc = true;"); } @Test @@ -1438,4 +1440,12 @@ public void testCreateTableWithColumnWithClusteringColumnLessThanScalarConstrain } }); } + + @Test + public void testCreateTableAddConstraintWithCheckOnNonExistingColumn() throws Throwable + { + assertThatThrownBy(() -> createTable("CREATE TABLE %s (pk int, ck1 int CHECK ck3 > 5, ck2 text, v int, PRIMARY KEY ((pk),ck1, ck2));")) + .hasRootCauseMessage("Constraint ck3 > 5 was not specified on a column it operates on: ck1 but on: ck3") + .rootCause().isInstanceOf(InvalidConstraintDefinitionException.class); + } } diff --git a/test/unit/org/apache/cassandra/constraints/CreateTableWithColumnNotNullConstraintInvalidTest.java b/test/unit/org/apache/cassandra/constraints/CreateTableWithColumnNotNullConstraintInvalidTest.java index dcb172163807..0fab10870ae5 100644 --- a/test/unit/org/apache/cassandra/constraints/CreateTableWithColumnNotNullConstraintInvalidTest.java +++ b/test/unit/org/apache/cassandra/constraints/CreateTableWithColumnNotNullConstraintInvalidTest.java @@ -59,7 +59,7 @@ public static Collection data() @Test public void testCreateTableWithColumnNotNullCheckNonExisting() throws Throwable { - createTable("CREATE TABLE %s (pk int, ck1 " + typeString + " CHECK NOT_NULL(ck1), ck2 int, v int, PRIMARY KEY (pk));"); + createTable("CREATE TABLE %s (pk int, ck1 " + typeString + " CHECK NOT NULL, ck2 int, v int, PRIMARY KEY (pk));"); // Invalid assertInvalidThrowMessage("Column 'ck1' has to be specified as part of this query.", InvalidRequestException.class, "INSERT INTO %s (pk, ck2, v) VALUES (1, 2, 3)"); @@ -71,12 +71,12 @@ public void testCreateTableWithColumnNotNullCheckNonExisting() throws Throwable @Test public void testInvalidSpecificationOfNotNullConstraintOnPrimaryKeys() throws Throwable { - assertThatThrownBy(() -> createTable("CREATE TABLE %s (pk " + typeString + " CHECK NOT_NULL(pk) PRIMARY KEY)")) + assertThatThrownBy(() -> createTable("CREATE TABLE %s (pk " + typeString + " CHECK NOT NULL PRIMARY KEY)")) .isInstanceOf(InvalidRequestException.class) .hasRootCauseInstanceOf(InvalidConstraintDefinitionException.class) .hasRootCauseMessage("NOT_NULL constraint can not be specified on a partition key column 'pk'"); - assertThatThrownBy(() -> createTable("CREATE TABLE %s (pk int, cl " + typeString + " CHECK NOT_NULL(cl), PRIMARY KEY (pk, cl))")) + assertThatThrownBy(() -> createTable("CREATE TABLE %s (pk int, cl " + typeString + " CHECK NOT NULL, PRIMARY KEY (pk, cl))")) .isInstanceOf(InvalidRequestException.class) .hasRootCauseInstanceOf(InvalidConstraintDefinitionException.class) .hasRootCauseMessage("NOT_NULL constraint can not be specified on a clustering key column 'cl'"); diff --git a/test/unit/org/apache/cassandra/constraints/CreateTableWithColumnNotNullConstraintValidTest.java b/test/unit/org/apache/cassandra/constraints/CreateTableWithColumnNotNullConstraintValidTest.java index b1ad79a43f1a..bf91fdbd6bdb 100644 --- a/test/unit/org/apache/cassandra/constraints/CreateTableWithColumnNotNullConstraintValidTest.java +++ b/test/unit/org/apache/cassandra/constraints/CreateTableWithColumnNotNullConstraintValidTest.java @@ -55,7 +55,7 @@ public static Collection data() @Test public void testCreateTableWithColumnNotNullCheckValid() throws Throwable { - createTable("CREATE TABLE %s (pk int, ck1 " + typeString + " CHECK NOT_NULL(ck1), ck2 int, v int, PRIMARY KEY (pk));"); + createTable("CREATE TABLE %s (pk int, ck1 " + typeString + " CHECK NOT NULL, ck2 int, v int, PRIMARY KEY (pk));"); // Valid execute("INSERT INTO %s (pk, ck1, ck2, v) VALUES (1, " + value + ", 2, 3)"); diff --git a/test/unit/org/apache/cassandra/constraints/CreateTableWithColumnOctetLengthConstraintValidationTest.java b/test/unit/org/apache/cassandra/constraints/CreateTableWithColumnOctetLengthConstraintValidationTest.java index 6f9260f022ec..092f21f77063 100644 --- a/test/unit/org/apache/cassandra/constraints/CreateTableWithColumnOctetLengthConstraintValidationTest.java +++ b/test/unit/org/apache/cassandra/constraints/CreateTableWithColumnOctetLengthConstraintValidationTest.java @@ -53,7 +53,7 @@ public static Collection generateData() @Test public void testCreateTableWithColumnWithClusteringColumnSerializedSizeEqualToConstraint() throws Throwable { - createTable("CREATE TABLE %s (pk int, ck1 text CHECK OCTET_LENGTH(ck1) = 4, ck2 int, v int, PRIMARY KEY ((pk), ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); + createTable("CREATE TABLE %s (pk int, ck1 text CHECK OCTET_LENGTH() = 4, ck2 int, v int, PRIMARY KEY ((pk), ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); // Valid execute("INSERT INTO %s (pk, ck1, ck2, v) VALUES (1, 'fooo', 2, 3)"); @@ -69,7 +69,7 @@ public void testCreateTableWithColumnWithClusteringColumnSerializedSizeEqualToCo @Test public void testCreateTableWithColumnWithClusteringColumnSerializedSizeDifferentThanConstraint() throws Throwable { - createTable("CREATE TABLE %s (pk int, ck1 text CHECK OCTET_LENGTH(ck1) != 4, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); + createTable("CREATE TABLE %s (pk int, ck1 text CHECK OCTET_LENGTH() != 4, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); final String expectedErrorMessage = "Column value does not satisfy value constraint for column 'ck1'. It has a length of"; // Valid @@ -83,7 +83,7 @@ public void testCreateTableWithColumnWithClusteringColumnSerializedSizeDifferent @Test public void testCreateTableWithColumnWithClusteringColumnSerializedSizeBiggerThanConstraint() throws Throwable { - createTable("CREATE TABLE %s (pk int, ck1 text CHECK OCTET_LENGTH(ck1) > 4, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); + createTable("CREATE TABLE %s (pk int, ck1 text CHECK OCTET_LENGTH() > 4, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); // Valid execute("INSERT INTO %s (pk, ck1, ck2, v) VALUES (1, 'fñoo', 2, 3)"); @@ -97,7 +97,7 @@ public void testCreateTableWithColumnWithClusteringColumnSerializedSizeBiggerTha @Test public void testCreateTableWithColumnWithClusteringColumnSerializedSizeBiggerOrEqualThanConstraint() throws Throwable { - createTable("CREATE TABLE %s (pk int, ck1 text CHECK OCTET_LENGTH(ck1) >= 4, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); + createTable("CREATE TABLE %s (pk int, ck1 text CHECK OCTET_LENGTH() >= 4, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); // Valid execute("INSERT INTO %s (pk, ck1, ck2, v) VALUES (1, 'fñoo', 2, 3)"); @@ -111,7 +111,7 @@ public void testCreateTableWithColumnWithClusteringColumnSerializedSizeBiggerOrE @Test public void testCreateTableWithColumnWithClusteringColumnSerializedSizeSmallerThanConstraint() throws Throwable { - createTable("CREATE TABLE %s (pk int, ck1 text CHECK OCTET_LENGTH(ck1) < 4, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); + createTable("CREATE TABLE %s (pk int, ck1 text CHECK OCTET_LENGTH() < 4, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); // Valid execute("INSERT INTO %s (pk, ck1, ck2, v) VALUES (1, 'fñ', 2, 3)"); @@ -125,7 +125,7 @@ public void testCreateTableWithColumnWithClusteringColumnSerializedSizeSmallerTh @Test public void testCreateTableWithColumnWithClusteringColumnSerializedSizeSmallerOrEqualThanConstraint() throws Throwable { - createTable("CREATE TABLE %s (pk int, ck1 text CHECK OCTET_LENGTH(ck1) <= 4, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); + createTable("CREATE TABLE %s (pk int, ck1 text CHECK OCTET_LENGTH() <= 4, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); // Valid execute("INSERT INTO %s (pk, ck1, ck2, v) VALUES (1, 'fñ', 2, 3)"); @@ -139,7 +139,7 @@ public void testCreateTableWithColumnWithClusteringColumnSerializedSizeSmallerOr @Test public void testCreateTableWithColumnWithClusteringBlobColumnSerializedSizeEqualToConstraint() throws Throwable { - createTable("CREATE TABLE %s (pk int, ck1 blob CHECK OCTET_LENGTH(ck1) = 4, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); + createTable("CREATE TABLE %s (pk int, ck1 blob CHECK OCTET_LENGTH() = 4, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); // Valid execute("INSERT INTO %s (pk, ck1, ck2, v) VALUES (1, textAsBlob('fño'), 2, 3)"); @@ -153,7 +153,7 @@ public void testCreateTableWithColumnWithClusteringBlobColumnSerializedSizeEqual @Test public void testCreateTableWithColumnWithClusteringBlobColumnSerializedSizeDifferentThanConstraint() throws Throwable { - createTable("CREATE TABLE %s (pk int, ck1 blob CHECK OCTET_LENGTH(ck1) != 4, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); + createTable("CREATE TABLE %s (pk int, ck1 blob CHECK OCTET_LENGTH() != 4, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); // Valid execute("INSERT INTO %s (pk, ck1, ck2, v) VALUES (1, textAsBlob('fñ'), 2, 3)"); @@ -167,7 +167,7 @@ public void testCreateTableWithColumnWithClusteringBlobColumnSerializedSizeDiffe @Test public void testCreateTableWithColumnWithClusteringBlobColumnSerializedSizeBiggerThanConstraint() throws Throwable { - createTable("CREATE TABLE %s (pk int, ck1 blob CHECK OCTET_LENGTH(ck1) > 4, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); + createTable("CREATE TABLE %s (pk int, ck1 blob CHECK OCTET_LENGTH() > 4, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); // Valid execute("INSERT INTO %s (pk, ck1, ck2, v) VALUES (1, textAsBlob('fñoo'), 2, 3)"); @@ -181,7 +181,7 @@ public void testCreateTableWithColumnWithClusteringBlobColumnSerializedSizeBigge @Test public void testCreateTableWithColumnWithClusteringBlobColumnSerializedSizeBiggerOrEqualThanConstraint() throws Throwable { - createTable("CREATE TABLE %s (pk int, ck1 blob CHECK OCTET_LENGTH(ck1) >= 4, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); + createTable("CREATE TABLE %s (pk int, ck1 blob CHECK OCTET_LENGTH() >= 4, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); // Valid execute("INSERT INTO %s (pk, ck1, ck2, v) VALUES (1, textAsBlob('fñoo'), 2, 3)"); @@ -195,7 +195,7 @@ public void testCreateTableWithColumnWithClusteringBlobColumnSerializedSizeBigge @Test public void testCreateTableWithColumnWithClusteringBlobColumnSerializedSizeSmallerThanConstraint() throws Throwable { - createTable("CREATE TABLE %s (pk int, ck1 blob CHECK OCTET_LENGTH(ck1) < 4, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); + createTable("CREATE TABLE %s (pk int, ck1 blob CHECK OCTET_LENGTH() < 4, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); // Valid execute("INSERT INTO %s (pk, ck1, ck2, v) VALUES (1, textAsBlob('fñ'), 2, 3)"); @@ -209,7 +209,7 @@ public void testCreateTableWithColumnWithClusteringBlobColumnSerializedSizeSmall @Test public void testCreateTableWithColumnWithClusteringBlobColumnSerializedSizeSmallerOrEqualThanConstraint() throws Throwable { - createTable("CREATE TABLE %s (pk int, ck1 blob CHECK OCTET_LENGTH(ck1) <= 4, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); + createTable("CREATE TABLE %s (pk int, ck1 blob CHECK OCTET_LENGTH() <= 4, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); // Valid execute("INSERT INTO %s (pk, ck1, ck2, v) VALUES (1, textAsBlob('fñ'), 2, 3)"); @@ -224,7 +224,7 @@ public void testCreateTableWithColumnWithClusteringBlobColumnSerializedSizeSmall @Test public void testCreateTableWithColumnWithPkColumnSerializedSizeEqualToConstraint() throws Throwable { - createTable("CREATE TABLE %s (pk text CHECK OCTET_LENGTH(pk) = 4, ck1 int, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); + createTable("CREATE TABLE %s (pk text CHECK OCTET_LENGTH() = 4, ck1 int, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); // Valid execute("INSERT INTO %s (pk, ck1, ck2, v) VALUES ('fño', 1, 2, 3)"); @@ -238,7 +238,7 @@ public void testCreateTableWithColumnWithPkColumnSerializedSizeEqualToConstraint @Test public void testCreateTableWithColumnWithPkColumnSerializedSizeDifferentThanConstraint() throws Throwable { - createTable("CREATE TABLE %s (pk text CHECK OCTET_LENGTH(pk) != 4, ck1 int, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); + createTable("CREATE TABLE %s (pk text CHECK OCTET_LENGTH() != 4, ck1 int, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); // Valid execute("INSERT INTO %s (pk, ck1, ck2, v) VALUES ('fñ', 1, 2, 3)"); @@ -252,7 +252,7 @@ public void testCreateTableWithColumnWithPkColumnSerializedSizeDifferentThanCons @Test public void testCreateTableWithColumnWithPkColumnSerializedSizeBiggerThanConstraint() throws Throwable { - createTable("CREATE TABLE %s (pk text CHECK OCTET_LENGTH(pk) > 4, ck1 int, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); + createTable("CREATE TABLE %s (pk text CHECK OCTET_LENGTH() > 4, ck1 int, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); // Valid execute("INSERT INTO %s (pk, ck1, ck2, v) VALUES ('fñoo', 1, 2, 3)"); @@ -266,7 +266,7 @@ public void testCreateTableWithColumnWithPkColumnSerializedSizeBiggerThanConstra @Test public void testCreateTableWithColumnWithPkColumnSerializedSizeBiggerOrEqualThanConstraint() throws Throwable { - createTable("CREATE TABLE %s (pk text CHECK OCTET_LENGTH(pk) >= 4, ck1 int, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); + createTable("CREATE TABLE %s (pk text CHECK OCTET_LENGTH() >= 4, ck1 int, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); // Valid execute("INSERT INTO %s (pk, ck1, ck2, v) VALUES ('fñoo', 1, 2, 3)"); @@ -280,7 +280,7 @@ public void testCreateTableWithColumnWithPkColumnSerializedSizeBiggerOrEqualThan @Test public void testCreateTableWithColumnWithPkColumnSerializedSizeSmallerThanConstraint() throws Throwable { - createTable("CREATE TABLE %s (pk text CHECK OCTET_LENGTH(pk) < 4, ck1 int, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); + createTable("CREATE TABLE %s (pk text CHECK OCTET_LENGTH() < 4, ck1 int, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); // Valid execute("INSERT INTO %s (pk, ck1, ck2, v) VALUES ('fñ', 1, 2, 3)"); @@ -294,7 +294,7 @@ public void testCreateTableWithColumnWithPkColumnSerializedSizeSmallerThanConstr @Test public void testCreateTableWithColumnWithPkColumnSerializedSizeSmallerOrEqualThanConstraint() throws Throwable { - createTable("CREATE TABLE %s (pk text CHECK OCTET_LENGTH(pk) <= 4, ck1 int, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); + createTable("CREATE TABLE %s (pk text CHECK OCTET_LENGTH() <= 4, ck1 int, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); // Valid execute("INSERT INTO %s (pk, ck1, ck2, v) VALUES ('fñ', 1, 2, 3)"); @@ -309,7 +309,7 @@ public void testCreateTableWithColumnWithPkColumnSerializedSizeSmallerOrEqualTha @Test public void testCreateTableWithColumnWithRegularColumnSerializedSizeEqualToConstraint() throws Throwable { - createTable("CREATE TABLE %s (pk int, ck1 int, ck2 int, v text CHECK OCTET_LENGTH(v) = 4, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); + createTable("CREATE TABLE %s (pk int, ck1 int, ck2 int, v text CHECK OCTET_LENGTH() = 4, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); // Valid execute("INSERT INTO %s (pk, ck1, ck2, v) VALUES (1, 2, 3, 'fño')"); @@ -323,7 +323,7 @@ public void testCreateTableWithColumnWithRegularColumnSerializedSizeEqualToConst @Test public void testCreateTableWithColumnWithRegularColumnSerializedSizeDifferentThanConstraint() throws Throwable { - createTable("CREATE TABLE %s (pk int, ck1 int, ck2 int, v text CHECK OCTET_LENGTH(v) != 4, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); + createTable("CREATE TABLE %s (pk int, ck1 int, ck2 int, v text CHECK OCTET_LENGTH() != 4, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); // Valid execute("INSERT INTO %s (pk, ck1, ck2, v) VALUES (1, 2, 3, 'fñ')"); @@ -337,7 +337,7 @@ public void testCreateTableWithColumnWithRegularColumnSerializedSizeDifferentTha @Test public void testCreateTableWithColumnWithRegularColumnSerializedSizeBiggerThanConstraint() throws Throwable { - createTable("CREATE TABLE %s (pk int, ck1 int, ck2 int, v text CHECK OCTET_LENGTH(v) > 4, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); + createTable("CREATE TABLE %s (pk int, ck1 int, ck2 int, v text CHECK OCTET_LENGTH() > 4, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); // Valid execute("INSERT INTO %s (pk, ck1, ck2, v) VALUES (1, 2, 3, 'fñoo')"); @@ -351,7 +351,7 @@ public void testCreateTableWithColumnWithRegularColumnSerializedSizeBiggerThanCo @Test public void testCreateTableWithColumnWithRegularColumnSerializedSizeBiggerOrEqualThanConstraint() throws Throwable { - createTable("CREATE TABLE %s (pk int, ck1 int, ck2 int, v text CHECK OCTET_LENGTH(v) >= 4, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); + createTable("CREATE TABLE %s (pk int, ck1 int, ck2 int, v text CHECK OCTET_LENGTH() >= 4, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); // Valid execute("INSERT INTO %s (pk, ck1, ck2, v) VALUES (1, 2, 3, 'fñoo')"); @@ -365,7 +365,7 @@ public void testCreateTableWithColumnWithRegularColumnSerializedSizeBiggerOrEqua @Test public void testCreateTableWithColumnWithRegularColumnSerializedSizeSmallerThanConstraint() throws Throwable { - createTable("CREATE TABLE %s (pk int, ck1 int, ck2 int, v text CHECK OCTET_LENGTH(v) < 4, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); + createTable("CREATE TABLE %s (pk int, ck1 int, ck2 int, v text CHECK OCTET_LENGTH() < 4, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); // Valid execute("INSERT INTO %s (pk, ck1, ck2, v) VALUES (1, 2, 3, 'fñ')"); @@ -379,7 +379,7 @@ public void testCreateTableWithColumnWithRegularColumnSerializedSizeSmallerThanC @Test public void testCreateTableWithColumnWithRegularColumnSerializedSizeSmallerOrEqualThanConstraint() throws Throwable { - createTable("CREATE TABLE %s (pk int, ck1 int, ck2 int, v text CHECK OCTET_LENGTH(v) <= 4, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); + createTable("CREATE TABLE %s (pk int, ck1 int, ck2 int, v text CHECK OCTET_LENGTH() <= 4, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); // Valid execute("INSERT INTO %s (pk, ck1, ck2, v) VALUES (1, 2, 3, 'fñ')"); @@ -393,7 +393,7 @@ public void testCreateTableWithColumnWithRegularColumnSerializedSizeSmallerOrEqu @Test public void testCreateTableWithColumnWithRegularColumnSerializedSizeCheckNullTextConstraint() throws Throwable { - createTable("CREATE TABLE %s (pk int, ck1 int, ck2 int, v text CHECK OCTET_LENGTH(v) <= 4, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); + createTable("CREATE TABLE %s (pk int, ck1 int, ck2 int, v text CHECK OCTET_LENGTH() <= 4, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); final String expectedErrorMessage = "Column value does not satisfy value constraint for column 'v' as it is null."; assertInvalidThrowMessage(expectedErrorMessage, InvalidRequestException.class, "INSERT INTO %s (pk, ck1, ck2, v) VALUES (1, 2, 3, null)"); } @@ -401,7 +401,7 @@ public void testCreateTableWithColumnWithRegularColumnSerializedSizeCheckNullTex @Test public void testCreateTableWithColumnWithRegularColumnSerializedSizeCheckNullVarcharConstraint() throws Throwable { - createTable("CREATE TABLE %s (pk int, ck1 int, ck2 int, v varchar CHECK OCTET_LENGTH(v) <= 4, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); + createTable("CREATE TABLE %s (pk int, ck1 int, ck2 int, v varchar CHECK OCTET_LENGTH() <= 4, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); final String expectedErrorMessage = "Column value does not satisfy value constraint for column 'v' as it is null."; assertInvalidThrowMessage(expectedErrorMessage, InvalidRequestException.class, "INSERT INTO %s (pk, ck1, ck2, v) VALUES (1, 2, 3, null)"); } @@ -409,7 +409,7 @@ public void testCreateTableWithColumnWithRegularColumnSerializedSizeCheckNullVar @Test public void testCreateTableWithColumnWithRegularColumnSerializedSizeCheckNullAsciiConstraint() throws Throwable { - createTable("CREATE TABLE %s (pk int, ck1 int, ck2 int, v ascii CHECK OCTET_LENGTH(v) <= 4, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); + createTable("CREATE TABLE %s (pk int, ck1 int, ck2 int, v ascii CHECK OCTET_LENGTH() <= 4, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); final String expectedErrorMessage = "Column value does not satisfy value constraint for column 'v' as it is null."; assertInvalidThrowMessage(expectedErrorMessage, InvalidRequestException.class, "INSERT INTO %s (pk, ck1, ck2, v) VALUES (1, 2, 3, null)"); } @@ -417,7 +417,7 @@ public void testCreateTableWithColumnWithRegularColumnSerializedSizeCheckNullAsc @Test public void testCreateTableWithColumnWithRegularColumnSerializedSizeCheckNullBlobConstraint() throws Throwable { - createTable("CREATE TABLE %s (pk int, ck1 int, ck2 int, v blob CHECK OCTET_LENGTH(v) <= 4, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); + createTable("CREATE TABLE %s (pk int, ck1 int, ck2 int, v blob CHECK OCTET_LENGTH() <= 4, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); final String expectedErrorMessage = "Column value does not satisfy value constraint for column 'v' as it is null."; assertInvalidThrowMessage(expectedErrorMessage, InvalidRequestException.class, "INSERT INTO %s (pk, ck1, ck2, v) VALUES (1, 2, 3, null)"); } @@ -425,7 +425,7 @@ public void testCreateTableWithColumnWithRegularColumnSerializedSizeCheckNullBlo @Test public void testCreateTableWithColumnMixedColumnsSerializedSizeConstraint() throws Throwable { - createTable("CREATE TABLE %s (pk text CHECK OCTET_LENGTH(pk) = 4, ck1 int, ck2 int, v text CHECK OCTET_LENGTH(v) = 4, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); + createTable("CREATE TABLE %s (pk text CHECK OCTET_LENGTH() = 4, ck1 int, ck2 int, v text CHECK OCTET_LENGTH() = 4, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); // Valid execute("INSERT INTO %s (pk, ck1, ck2, v) VALUES ('fño', 2, 3, 'fño')"); @@ -446,7 +446,7 @@ public void testCreateTableWithWrongColumnConstraint() throws Throwable { try { - createTable("CREATE TABLE %s (pk text, ck1 int CHECK OCTET_LENGTH(pk) = 4, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); + createTable("CREATE TABLE %s (pk text, ck1 int CHECK OCTET_LENGTH() = 4, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); fail(); } catch (InvalidRequestException e) @@ -461,7 +461,7 @@ public void testCreateTableWithWrongColumnMultipleConstraint() throws Throwable { try { - createTable("CREATE TABLE %s (pk text, ck1 int CHECK OCTET_LENGTH(pk) = 4 AND ck1 < 4, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); + createTable("CREATE TABLE %s (pk text, ck1 int CHECK OCTET_LENGTH() = 4 AND ck1 < 4, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); fail(); } catch (InvalidRequestException e) @@ -476,7 +476,7 @@ public void testCreateTableWithColumnWithClusteringColumnInvalidTypeConstraint() { try { - createTable("CREATE TABLE %s (pk int, ck1 int CHECK OCTET_LENGTH(ck1) = 4, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); + createTable("CREATE TABLE %s (pk int, ck1 int CHECK OCTET_LENGTH() = 4, ck2 int, v int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); fail(); } catch (InvalidRequestException e) @@ -507,7 +507,7 @@ public void testCreateTableInvalidFunction() throws Throwable { try { - createTable("CREATE TABLE %s (pk text CHECK not_a_function(pk) = 4, ck1 int, ck2 int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); + createTable("CREATE TABLE %s (pk text CHECK not_a_function() = 4, ck1 int, ck2 int, PRIMARY KEY ((pk),ck1, ck2)) WITH CLUSTERING ORDER BY (ck1 " + order + ");"); fail(); } catch (InvalidRequestException e) @@ -521,7 +521,7 @@ public void testCreateTableInvalidFunction() throws Throwable public void testCreateTableWithPKConstraintsAndCDCEnabled() throws Throwable { // It works - createTable("CREATE TABLE %s (pk text CHECK length(pk) = 4, ck1 int, ck2 int, PRIMARY KEY ((pk), ck1, ck2)) WITH cdc = true;"); + createTable("CREATE TABLE %s (pk text CHECK length() = 4, ck1 int, ck2 int, PRIMARY KEY ((pk), ck1, ck2)) WITH cdc = true;"); } @Test diff --git a/test/unit/org/apache/cassandra/constraints/JsonConstraintTest.java b/test/unit/org/apache/cassandra/constraints/JsonConstraintTest.java index 95db5b7604de..9dc964dd60a1 100644 --- a/test/unit/org/apache/cassandra/constraints/JsonConstraintTest.java +++ b/test/unit/org/apache/cassandra/constraints/JsonConstraintTest.java @@ -42,17 +42,19 @@ public class JsonConstraintTest private static final ColumnMetadata regularStringColumn = getColumnOfType(UTF8Type.instance); private static final ColumnMetadata regularAsciiColumn = getColumnOfType(AsciiType.instance); - private static final ColumnConstraints json = new ColumnConstraints(of(new Raw(jsonFunctionIdentifier, columnIdentifier).prepare())); + private final ColumnConstraints json = new ColumnConstraints(of(new Raw(jsonFunctionIdentifier).prepare())); @Test public void testJsonConstraint() throws Throwable { + json.setColumnName(columnIdentifier); run("{}"); run("{\"a\": 5, \"b\": \"1\", \"c\": [1,2,3]}"); run("nonsense", "Value for column 'a_column' violated JSON constraint as it is not a valid JSON."); - run("", "Column value does not satisfy value constraint for column 'a_column' as it is null."); + run("", "Value for column 'a_column' violated JSON constraint as it is not a valid JSON."); } + @Test public void testInvalidTypes() { @@ -86,6 +88,6 @@ private void run(String jsonToCheck, String exceptionMessage) throws Throwable private static ColumnMetadata getColumnOfType(AbstractType type) { - return new ColumnMetadata("a", "b", columnIdentifier, type, -1, REGULAR, null); + return new ColumnMetadata("a", "b", columnIdentifier, type, ColumnMetadata.NO_UNIQUE_ID, -1, REGULAR, null); } } diff --git a/test/unit/org/apache/cassandra/constraints/NotNullConstraintTest.java b/test/unit/org/apache/cassandra/constraints/NotNullConstraintTest.java new file mode 100644 index 000000000000..e033c62c5909 --- /dev/null +++ b/test/unit/org/apache/cassandra/constraints/NotNullConstraintTest.java @@ -0,0 +1,171 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.constraints; + +import java.nio.ByteBuffer; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.junit.Test; + +import org.apache.cassandra.cql3.ColumnIdentifier; +import org.apache.cassandra.cql3.constraints.ColumnConstraints; +import org.apache.cassandra.cql3.constraints.ConstraintViolationException; +import org.apache.cassandra.cql3.constraints.FunctionColumnConstraint; +import org.apache.cassandra.cql3.constraints.InvalidConstraintDefinitionException; +import org.apache.cassandra.cql3.constraints.NotNullConstraint; +import org.apache.cassandra.cql3.constraints.ScalarColumnConstraint; +import org.apache.cassandra.cql3.constraints.UnaryFunctionColumnConstraint; +import org.apache.cassandra.db.marshal.AbstractType; +import org.apache.cassandra.db.marshal.Int32Type; +import org.apache.cassandra.db.marshal.ListType; +import org.apache.cassandra.db.marshal.MapType; +import org.apache.cassandra.db.marshal.SetType; +import org.apache.cassandra.db.marshal.UTF8Type; +import org.apache.cassandra.db.marshal.UUIDType; +import org.apache.cassandra.schema.ColumnMetadata; + +import static java.util.List.of; +import static org.apache.cassandra.cql3.Operator.GT; +import static org.apache.cassandra.schema.ColumnMetadata.Kind.REGULAR; +import static org.apache.cassandra.utils.ByteBufferUtil.EMPTY_BYTE_BUFFER; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +/** + * TODO - UDTs are not supported yet in constraints as such + */ +public class NotNullConstraintTest +{ + private static final ColumnIdentifier columnIdentifier = new ColumnIdentifier("a_column", false); + private static final ColumnConstraints unaryConstraint = new ColumnConstraints(of(new UnaryFunctionColumnConstraint.Raw(new ColumnIdentifier(NotNullConstraint.FUNCTION_NAME, false), List.of()).prepare())); + private static final ColumnConstraints scalarConstraint = new ColumnConstraints(of(new ScalarColumnConstraint.Raw(columnIdentifier, GT, "5").prepare())); + private static final ColumnConstraints functionConstraint = new ColumnConstraints(of(new FunctionColumnConstraint.Raw(new ColumnIdentifier("LENGTH", false), List.of(), GT, "5").prepare())); + + @Test + public void testNotNullConstraintValidation() + { + unaryConstraint.setColumnName(columnIdentifier); + scalarConstraint.setColumnName(columnIdentifier); + functionConstraint.setColumnName(columnIdentifier); + // unary + unaryConstraint.validate(getColumnOfType(UTF8Type.instance)); + assertThatThrownBy(() -> unaryConstraint.evaluate(UTF8Type.instance, EMPTY_BYTE_BUFFER)) + .hasMessage("Column value does not satisfy value constraint for column 'a_column' as it is null.") + .isInstanceOf(ConstraintViolationException.class); + + // not null / empty + unaryConstraint.evaluate(UTF8Type.instance, UTF8Type.instance.fromString("a value")); + + // scalar + scalarConstraint.validate(getColumnOfType(Int32Type.instance)); + assertThatThrownBy(() -> scalarConstraint.evaluate(Int32Type.instance, EMPTY_BYTE_BUFFER)) + .hasMessage("Column value does not satisfy value constraint for column 'a_column' as it is null.") + .isInstanceOf(ConstraintViolationException.class); + + // function, e.g. length + functionConstraint.validate(getColumnOfType(UTF8Type.instance)); + assertThatThrownBy(() -> functionConstraint.evaluate(UTF8Type.instance, EMPTY_BYTE_BUFFER)) + .hasMessage("Column value does not satisfy value constraint for column 'a_column' as it is null.") + .isInstanceOf(ConstraintViolationException.class); + + // empty string is not _null_ string so this passes + unaryConstraint.evaluate(UTF8Type.instance, UTF8Type.instance.fromString("")); + + // test a type for which empty value is meaningless + + assertThatThrownBy(() -> unaryConstraint.evaluate(UUIDType.instance, ByteBuffer.allocate(0))) + .hasMessage("Column value does not satisfy value constraint for column 'a_column' as it is empty.") + .isInstanceOf(ConstraintViolationException.class); + } + + @Test + public void testCollections() + { + unaryConstraint.setColumnName(columnIdentifier); + scalarConstraint.setColumnName(columnIdentifier); + functionConstraint.setColumnName(columnIdentifier); + checkList(false); + checkSet(false); + checkMap(false); + + checkList(true); + checkSet(true); + checkMap(true); + } + + private static ColumnMetadata getColumnOfType(AbstractType type) + { + return new ColumnMetadata("a", "b", columnIdentifier, type, -1, -1, REGULAR, null); + } + + private void checkList(boolean frozen) + { + if (frozen) + { + ListType listType = ListType.getInstance(Int32Type.instance, false); + ByteBuffer payload = listType.getSerializer().serialize(List.of(1, 2, 3)); + checkFrozenCollection(listType, payload); + } + else + checkUnfrozenCollection(ListType.getInstance(Int32Type.instance, true)); + } + + private void checkMap(boolean frozen) + { + if (frozen) + { + MapType mapType = MapType.getInstance(Int32Type.instance, Int32Type.instance, false); + ByteBuffer payload = mapType.getSerializer().serialize(Map.of(1, 1, 2, 2, 3, 3)); + checkFrozenCollection(mapType, payload); + } + else + checkUnfrozenCollection(MapType.getInstance(Int32Type.instance, Int32Type.instance, true)); + } + + private void checkSet(boolean frozen) + { + if (frozen) + { + SetType setType = SetType.getInstance(Int32Type.instance, false); + ByteBuffer payload = setType.getSerializer().serialize(Set.of(1, 2, 3)); + checkFrozenCollection(setType, payload); + } + else + checkUnfrozenCollection(SetType.getInstance(Int32Type.instance, true)); + } + + private void checkFrozenCollection(AbstractType type, ByteBuffer payload) + { + unaryConstraint.validate(getColumnOfType(type)); + unaryConstraint.evaluate(type, payload); + + assertThatThrownBy(() -> unaryConstraint.evaluate(type, EMPTY_BYTE_BUFFER)) + .hasMessage("Column value does not satisfy value constraint for column 'a_column' as it is null.") + .isInstanceOf(ConstraintViolationException.class); + } + + private void checkUnfrozenCollection(AbstractType type) + { + assertThatThrownBy(() -> unaryConstraint.validate(getColumnOfType(type))) + .hasMessageContaining("Constraint cannot be defined on the column") + .hasMessageContaining("When using collections, constraints can be used only of frozen collections") + .isInstanceOf(InvalidConstraintDefinitionException.class); + } +} diff --git a/test/unit/org/apache/cassandra/constraints/RegexpConstraintTest.java b/test/unit/org/apache/cassandra/constraints/RegexpConstraintTest.java index 3c865509416c..ddbff0c45d1f 100644 --- a/test/unit/org/apache/cassandra/constraints/RegexpConstraintTest.java +++ b/test/unit/org/apache/cassandra/constraints/RegexpConstraintTest.java @@ -18,6 +18,8 @@ package org.apache.cassandra.constraints; +import java.util.List; + import org.junit.Test; import org.apache.cassandra.cql3.ColumnIdentifier; @@ -44,8 +46,8 @@ public class RegexpConstraintTest private static final ColumnMetadata regularStringColumn = getColumnOfType(UTF8Type.instance); private static final ColumnMetadata regularAsciiColumn = getColumnOfType(AsciiType.instance); - private static final ColumnConstraints regexp = new ColumnConstraints(of(new Raw(regexpFunctionIdentifier, columnIdentifier, Operator.EQ, "'a..b'").prepare())); - private static final ColumnConstraints negatedRegexp = new ColumnConstraints(of(new Raw(regexpFunctionIdentifier, columnIdentifier, Operator.NEQ, "'a..b'").prepare())); + private static final ColumnConstraints regexp = new ColumnConstraints(of(new Raw(regexpFunctionIdentifier, List.of(), Operator.EQ, "'a..b'").prepare())); + private static final ColumnConstraints negatedRegexp = new ColumnConstraints(of(new Raw(regexpFunctionIdentifier, List.of(), Operator.NEQ, "'a..b'").prepare())); @Test public void testRegexpConstraint() throws Throwable @@ -59,7 +61,7 @@ public void testRegexpConstraint() throws Throwable @Test public void testInvalidPattern() { - ColumnConstraints invalid = new ColumnConstraints(of(new Raw(regexpFunctionIdentifier, columnIdentifier, Operator.EQ, "'*abc'").prepare())); + ColumnConstraints invalid = new ColumnConstraints(of(new Raw(regexpFunctionIdentifier, List.of(), Operator.EQ, "'*abc'").prepare())); assertThatThrownBy(() -> invalid.validate(regularStringColumn)) .hasMessage("String '*abc' is not a valid regular expression") .isInstanceOf(InvalidConstraintDefinitionException.class); @@ -98,6 +100,6 @@ private void run(ColumnConstraints regexp, String input, String exceptionMessage private static ColumnMetadata getColumnOfType(AbstractType type) { - return new ColumnMetadata("a", "b", columnIdentifier, type, -1, REGULAR, null); + return new ColumnMetadata("a", "b", columnIdentifier, type, ColumnMetadata.NO_UNIQUE_ID, -1, REGULAR, null); } } diff --git a/test/unit/org/apache/cassandra/constraints/TimeConstraintsTest.java b/test/unit/org/apache/cassandra/constraints/TimeConstraintsTest.java index 830abb444a8c..7fcf89476dee 100644 --- a/test/unit/org/apache/cassandra/constraints/TimeConstraintsTest.java +++ b/test/unit/org/apache/cassandra/constraints/TimeConstraintsTest.java @@ -182,6 +182,6 @@ private String cantParse(String value) private static ColumnMetadata getColumnOfType(AbstractType type) { - return new ColumnMetadata("a", "b", columnIdentifier, type, -1, REGULAR, null); + return new ColumnMetadata("a", "b", columnIdentifier, type, ColumnMetadata.NO_UNIQUE_ID, -1, REGULAR, null); } } diff --git a/test/unit/org/apache/cassandra/cql3/CDCStatementTest.java b/test/unit/org/apache/cassandra/cql3/CDCStatementTest.java index 44ada8e9176b..86bc1141516e 100644 --- a/test/unit/org/apache/cassandra/cql3/CDCStatementTest.java +++ b/test/unit/org/apache/cassandra/cql3/CDCStatementTest.java @@ -22,7 +22,6 @@ import org.junit.BeforeClass; import org.junit.Test; -import org.apache.cassandra.ServerTestUtils; import org.apache.cassandra.config.DatabaseDescriptor; public class CDCStatementTest extends CQLTester @@ -30,19 +29,18 @@ public class CDCStatementTest extends CQLTester @BeforeClass public static void enableCDC() { - ServerTestUtils.daemonInitialization(); DatabaseDescriptor.setCDCEnabled(true); } @Test - public void testEnableOnCreate() throws Throwable + public void testEnableOnCreate() { createTable("CREATE TABLE %s (key text, val int, primary key(key)) WITH cdc = true;"); Assert.assertTrue(currentTableMetadata().params.cdc); } @Test - public void testEnableOnAlter() throws Throwable + public void testEnableOnAlter() { createTable("CREATE TABLE %s (key text, val int, primary key(key));"); Assert.assertFalse(currentTableMetadata().params.cdc); @@ -51,7 +49,7 @@ public void testEnableOnAlter() throws Throwable } @Test - public void testDisableOnAlter() throws Throwable + public void testDisableOnAlter() { createTable("CREATE TABLE %s (key text, val int, primary key(key)) WITH cdc = true;"); Assert.assertTrue(currentTableMetadata().params.cdc); diff --git a/test/unit/org/apache/cassandra/cql3/CQLTester.java b/test/unit/org/apache/cassandra/cql3/CQLTester.java index 22ab64c12fca..88ab4fa27010 100644 --- a/test/unit/org/apache/cassandra/cql3/CQLTester.java +++ b/test/unit/org/apache/cassandra/cql3/CQLTester.java @@ -66,6 +66,9 @@ import com.google.common.base.Strings; import com.google.common.collect.ImmutableSet; import com.google.common.collect.Iterables; + +import org.assertj.core.api.Assertions; +import org.awaitility.Awaitility; import org.apache.commons.lang3.ArrayUtils; import org.apache.commons.lang3.StringUtils; import org.junit.After; @@ -126,6 +129,7 @@ import org.apache.cassandra.db.SystemKeyspace; import org.apache.cassandra.db.marshal.AbstractType; import org.apache.cassandra.db.marshal.BooleanType; +import org.apache.cassandra.db.marshal.ByteBufferAccessor; import org.apache.cassandra.db.marshal.ByteType; import org.apache.cassandra.db.marshal.BytesType; import org.apache.cassandra.db.marshal.CollectionType; @@ -147,6 +151,7 @@ import org.apache.cassandra.db.marshal.UTF8Type; import org.apache.cassandra.db.marshal.UUIDType; import org.apache.cassandra.db.marshal.VectorType; +import org.apache.cassandra.db.virtual.SystemViewsKeyspace; import org.apache.cassandra.db.virtual.VirtualKeyspace; import org.apache.cassandra.db.virtual.VirtualKeyspaceRegistry; import org.apache.cassandra.db.virtual.VirtualSchemaKeyspace; @@ -157,10 +162,10 @@ import org.apache.cassandra.index.Index; import org.apache.cassandra.index.SecondaryIndexManager; import org.apache.cassandra.io.filesystem.ListenableFileSystem; +import org.apache.cassandra.io.sstable.Descriptor; import org.apache.cassandra.io.util.File; import org.apache.cassandra.io.util.FileSystems; import org.apache.cassandra.io.util.FileUtils; -import org.apache.cassandra.locator.InetAddressAndPort; import org.apache.cassandra.metrics.CassandraMetricsRegistry; import org.apache.cassandra.metrics.ClientMetrics; import org.apache.cassandra.net.MessagingService; @@ -175,6 +180,7 @@ import org.apache.cassandra.service.ClientState; import org.apache.cassandra.service.QueryState; import org.apache.cassandra.service.StorageService; +import org.apache.cassandra.service.accord.AccordCache; import org.apache.cassandra.service.snapshot.SnapshotManager; import org.apache.cassandra.tcm.ClusterMetadataService; import org.apache.cassandra.transport.Event; @@ -185,14 +191,22 @@ import org.apache.cassandra.transport.TlsTestUtils; import org.apache.cassandra.transport.messages.ResultMessage; import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.CassandraGenerators; import org.apache.cassandra.utils.ConfigGenBuilder; import org.apache.cassandra.utils.FBUtilities; +import org.apache.cassandra.utils.Generators; import org.apache.cassandra.utils.JMXServerUtils; +import org.apache.cassandra.utils.LazyToString; import org.apache.cassandra.utils.Pair; import org.apache.cassandra.utils.TimeUUID; -import org.assertj.core.api.Assertions; -import org.awaitility.Awaitility; +import static org.apache.cassandra.utils.CassandraGenerators.regularKeyspace; +import static org.apache.cassandra.utils.CassandraGenerators.regularTable; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; +import static org.apache.cassandra.config.CassandraRelevantProperties.SYSTEM_DISTRIBUTED_DEFAULT_RF; import static org.apache.cassandra.config.CassandraRelevantProperties.TEST_DRIVER_CONNECTION_TIMEOUT_MS; import static org.apache.cassandra.config.CassandraRelevantProperties.TEST_DRIVER_READ_TIMEOUT_MS; import static org.apache.cassandra.config.CassandraRelevantProperties.TEST_RANDOM_SEED; @@ -207,10 +221,6 @@ import static org.apache.cassandra.metrics.CassandraMetricsRegistry.createMetricsKeyspaceTables; import static org.apache.cassandra.schema.SchemaConstants.VIRTUAL_METRICS; import static org.apache.cassandra.utils.LocalizeString.toLowerCaseLocalized; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertNotNull; -import static org.junit.Assert.assertTrue; -import static org.junit.Assert.fail; /** * Base class for CQL tests. @@ -250,7 +260,6 @@ public abstract class CQLTester protected static int nativePort; protected static final InetAddress nativeAddr; - protected static final Set remoteAddrs = new HashSet<>(); private static final Map clusters = new HashMap<>(); private static final Map sessions = new HashMap<>(); @@ -298,6 +307,7 @@ public static final ProtocolVersion getDefaultVersion() private List keyspaces = new ArrayList<>(); private List tables = new ArrayList<>(); + private List indexes = new ArrayList<>(); private List views = new ArrayList<>(); private List types = new ArrayList<>(); private List functions = new ArrayList<>(); @@ -381,6 +391,7 @@ private static void checkProtocolVersion() public static void prepareServer() { ServerTestUtils.prepareServer(); + AccordCache.validateLoadOnEvict(true); } public static void cleanup() @@ -443,12 +454,19 @@ public static void setUpClass() protected static void prePrepareServer() { CassandraRelevantProperties.SUPERUSER_SETUP_DELAY_MS.setLong(0); - ServerTestUtils.daemonInitialization(); + daemonInitialization(); if (ROW_CACHE_SIZE_IN_MIB > 0) DatabaseDescriptor.setRowCacheSizeInMiB(ROW_CACHE_SIZE_IN_MIB); StorageService.instance.registerMBeans(); StorageService.instance.setPartitionerUnsafe(Murmur3Partitioner.instance); SnapshotManager.instance.registerMBean(); + SYSTEM_DISTRIBUTED_DEFAULT_RF.setInt(1); + } + + // So derived classes can get enough intialization to start setting DatabaseDescriptor options + public static void daemonInitialization() + { + ServerTestUtils.daemonInitialization(); } @AfterClass @@ -497,6 +515,7 @@ public void afterTest() throws Throwable keyspaces = null; tables = null; + indexes = null; views = null; types = null; functions = null; @@ -509,6 +528,32 @@ protected static void addMetricsKeyspace() VirtualKeyspaceRegistry.instance.register(new VirtualKeyspace(VIRTUAL_METRICS, createMetricsKeyspaceTables())); } + protected static void addVirtualKeyspace() + { + VirtualKeyspaceRegistry.instance.register(SystemViewsKeyspace.instance); + } + + protected void clearSchema() + { + ServerTestUtils.resetCMS(); + keyspaces.clear(); + tables.clear(); + indexes.clear(); + views.clear(); + types.clear(); + functions.clear(); + aggregates.clear(); + } + + protected void clearState() + { + clearSchema(); + usePrepared = USE_PREPARED_VALUES; + reusePrepared = REUSE_PREPARED; + + seqNumber.set(0); + } + protected void resetSchema() throws Throwable { for (TableMetadata table : SchemaKeyspace.metadata().tables) @@ -620,13 +665,15 @@ public void shouldUseClientCertificate(boolean useClientCert) public static void requireNativeProtocolClientEncryption() { DatabaseDescriptor.updateNativeProtocolEncryptionOptions((encryptionOptions) -> - encryptionOptions.withEnabled(true) - .withKeyStore(TlsTestUtils.SERVER_KEYSTORE_PATH) - .withKeyStorePassword(TlsTestUtils.SERVER_KEYSTORE_PASSWORD) - .withTrustStore(TlsTestUtils.SERVER_TRUSTSTORE_PATH) - .withTrustStorePassword(TlsTestUtils.SERVER_TRUSTSTORE_PASSWORD) - .withRequireEndpointVerification(false) - .withRequireClientAuth(EncryptionOptions.ClientAuth.OPTIONAL)); + new EncryptionOptions.ClientEncryptionOptions.Builder(encryptionOptions) + .withEnabled(true) + .withKeyStore(TlsTestUtils.SERVER_KEYSTORE_PATH) + .withKeyStorePassword(TlsTestUtils.SERVER_KEYSTORE_PASSWORD) + .withTrustStore(TlsTestUtils.SERVER_TRUSTSTORE_PATH) + .withTrustStorePassword(TlsTestUtils.SERVER_TRUSTSTORE_PASSWORD) + .withRequireEndpointVerification(false) + .withRequireClientAuth(EncryptionOptions.ClientEncryptionOptions.ClientAuth.OPTIONAL) + .build()); } /** @@ -1648,7 +1695,7 @@ protected Cluster getCluster(ProtocolVersion protocolVersion) protected SimpleClient newSimpleClient(ProtocolVersion version) throws IOException { - return new SimpleClient(nativeAddr.getHostAddress(), nativePort, version, version.isBeta(), new EncryptionOptions().applyConfig()) + return new SimpleClient(nativeAddr.getHostAddress(), nativePort, version, version.isBeta(), new EncryptionOptions.ClientEncryptionOptions()) .connect(false, false); } @@ -1776,9 +1823,10 @@ protected void assertRowsNet(ProtocolVersion protocolVersion, ResultSet result, Object[] expected = rows[i]; Row actual = iter.next(); - Assert.assertEquals(String.format("Invalid number of (expected) values provided for row %d (using protocol version %s)", - i, protocolVersion), - meta.size(), expected.length); + Assertions.assertThat(meta.size()) + .describedAs("Invalid number of (expected) values provided for row %d (using protocol version %s); expected=%s, actual=%s", + i, protocolVersion, LazyToString.lazy(() -> Arrays.toString(expected)), LazyToString.lazy(() -> Arrays.toString(toObjectArray(actual)))) + .isEqualTo(expected.length); for (int j = 0; j < meta.size(); j++) { @@ -2036,6 +2084,14 @@ private boolean equalsWithoutKsTb(ColumnMetadata left, ColumnMetadata right) && left.type.equals(right.type); } + private static Object[] toObjectArray(Row actual) + { + Object[] row = new Object[actual.getColumnDefinitions().size()]; + for (int i = 0; i < row.length; i++) + row[i] = actual.getObject(i); + return row; + } + protected void assertRowCountNet(ResultSet r1, int expectedCount) { Assert.assertFalse("Received a null resultset when expected count was > 0", expectedCount > 0 && r1 == null); @@ -2043,6 +2099,133 @@ protected void assertRowCountNet(ResultSet r1, int expectedCount) Assert.assertEquals(String.format("expected %d rows but received %d", expectedCount, actualRowCount), expectedCount, actualRowCount); } + public abstract static class CellValidator + { + public abstract ByteBuffer expected(); + public abstract boolean equals(ByteBuffer bb); + + @Override + public boolean equals(Object obj) + { + if (obj instanceof ByteBuffer) + return equals((ByteBuffer) obj); + return false; + } + + public abstract String describe(); + } + + protected static CellValidator any() + { + return new CellValidator() + { + @Override + public ByteBuffer expected() + { + return ByteBufferUtil.EMPTY_BYTE_BUFFER; + } + + @Override + public boolean equals(ByteBuffer bb) + { + return true; + } + + @Override + public String describe() + { + return "any"; + } + }; + } + + protected static CellValidator anyNonNull() + { + return new CellValidator() + { + @Override + public ByteBuffer expected() + { + return ByteBufferUtil.EMPTY_BYTE_BUFFER; + } + + @Override + public boolean equals(ByteBuffer bb) + { + return !(bb == null || !bb.hasRemaining()); + } + + @Override + public String describe() + { + return "any non-null"; + } + }; + } + + protected static CellValidator anyInt() + { + return new CellValidator() + { + @Override + public ByteBuffer expected() + { + return ByteBufferUtil.bytes(0); + } + + @Override + public boolean equals(ByteBuffer bb) + { + if (bb == null) return false; + Int32Type.instance.validate(bb); + return bb.hasRemaining(); + } + + @Override + public String describe() + { + return "any non-null int"; + } + }; + } + + protected static CellValidator anyOf(String... values) + { + return anyOf(UTF8Type.instance, values); + } + + protected static CellValidator anyOf(AbstractType type, T... values) + { + assert values.length > 0; + ByteBuffer[] bbs = new ByteBuffer[values.length]; + for (int i = 0; i < values.length; i++) + bbs[i] = type.decompose(values[i]); + return new CellValidator() + { + @Override + public ByteBuffer expected() + { + return bbs[0]; + } + + @Override + public boolean equals(ByteBuffer bb) + { + for (int i = 0; i < bbs.length; i++) + { + if (Objects.equal(bbs[i], bb)) return true; + } + return false; + } + + @Override + public String describe() + { + return formatValue(bbs[0], type); + } + }; + } + public static void assertRows(UntypedResultSet result, Object[]... rows) { if (result == null) @@ -2066,24 +2249,25 @@ public static void assertRows(UntypedResultSet result, Object[]... rows) for (int j = 0; j < meta.size(); j++) { ColumnSpecification column = meta.get(j); - ByteBuffer expectedByteValue = makeByteBuffer(expected == null ? null : expected[j], column.type); + CellValidator cellValidator = makeCellValidator(expected == null ? null : expected[j], column.type); ByteBuffer actualValue = actual.getBytes(column.name.toString()); - if (expectedByteValue != null) - expectedByteValue = expectedByteValue.duplicate(); - if (!Objects.equal(expectedByteValue, actualValue)) + if (!((cellValidator == null && actualValue == null) || (cellValidator != null && cellValidator.equals(actualValue)))) { Object actualValueDecoded = actualValue == null ? null : column.type.getSerializer().deserialize(actualValue); - if (!Objects.equal(expected != null ? expected[j] : null, actualValueDecoded)) + Object expectedValueDecoded = expected != null ? expected[j] : null; + if (expectedValueDecoded instanceof ByteBuffer && !(actualValueDecoded instanceof ByteBuffer)) + expectedValueDecoded = column.type.getSerializer().deserialize(((ByteBuffer) expectedValueDecoded).duplicate()); + if (!Objects.equal(expectedValueDecoded, actualValueDecoded)) { - if (isEmptyContainerNull(column.type, expectedByteValue, actualValue)) + if (isEmptyContainerNull(column.type, cellValidator != null ? cellValidator.expected() : null, actualValue)) continue; error.append(String.format("Invalid value for row %d column %d (%s of type %s), expected <%s> but got <%s>", i, j, column.name, column.type.asCQL3Type(), - formatValue(expectedByteValue != null ? expectedByteValue.duplicate() : null, column.type), + cellValidator != null ? cellValidator.describe() : "null", formatValue(actualValue, column.type))).append("\n"); } } @@ -2107,14 +2291,30 @@ public static void assertRows(UntypedResultSet result, Object[]... rows) ByteBuffer actualValue = actual.getBytes(column.name.toString()); str.append(String.format("%s=%s ", column.name, formatValue(actualValue, column.type))); } - logger.info("Extra row num {}: {}", i, str.toString()); + logger.info("Extra row num {}: {}", i, str); } - Assert.fail(String.format("Got more rows than expected. Expected %d but got %d.", rows.length, i)); + Assert.fail(String.format("Got more rows than expected. Expected %d but got %d.\nExpected: %s\nActual: %s", rows.length, i, toString(rows), result.toStringUnsafe())); } Assert.assertTrue(String.format("Got %s rows than expected. Expected %d but got %d", rows.length>i ? "less" : "more", rows.length, i), i == rows.length); } + private static String toString(Object o) + { + if (o == null) + return "null"; + if (o instanceof CellValidator) + return ((CellValidator) o).describe(); + if (o instanceof Object[]) + return toString((Object[]) o); + return o.toString(); + } + + private static String toString(Object[] array) + { + return Stream.of(array).map(CQLTester::toString).collect(Collectors.joining(", ", "[", "]")); + } + /** * Like assertRows(), but ignores the ordering of rows. */ @@ -2488,6 +2688,86 @@ protected void assertUnauthorizedQuery(String errorMessage, String query, Object values); } + protected CassandraGenerators.KeyspaceMetadataBuilder createKeyspaceMetadataBuilder() + { + return regularKeyspace() + .withName(createKeyspaceName()) + .withReplication(new CassandraGenerators.AbstractReplicationStrategyBuilder() + .withUserAllowed() + .withDatacenters("datacenter1") + .withRf(1)); + } + + protected KeyspaceMetadata createKeyspace(RandomSource rs) + { + KeyspaceMetadata metadata = Generators.toGen(createKeyspaceMetadataBuilder().build()).next(rs); + String fullQuery = metadata.toCqlString(false, false, false); + logger.info(fullQuery); + schemaChange(fullQuery); + return metadata; + } + + protected CassandraGenerators.TableMetadataBuilder createTableMetadataBuilder() + { + String ks = currentKeyspace(); + if (ks == null) + ks = KEYSPACE; + return createTableMetadataBuilder(ks); + } + + protected CassandraGenerators.TableMetadataBuilder createTableMetadataBuilder(String ks) + { + return regularTable() + .withKeyspaceName(ks) + .withSimpleColumnNames(); + } + + protected TableMetadata createTable(RandomSource rs) + { + TableMetadata metadata = Generators.toGen(createTableMetadataBuilder().build()).next(rs); + maybeCreateUDTs(metadata); + String fullQuery = metadata.toCqlString(false, false, false); + logger.info(fullQuery); + schemaChange(fullQuery); + return metadata; + } + + protected TableMetadata createTable(RandomSource rs, String keyspace) + { + TableMetadata metadata = Generators.toGen(createTableMetadataBuilder(keyspace).build()).next(rs); + maybeCreateUDTs(metadata); + String fullQuery = metadata.toCqlString(false, false, false); + logger.info(fullQuery); + schemaChange(fullQuery); + return Schema.instance.getTableMetadata(keyspace, metadata.name); + } + + protected void maybeCreateUDTs(TableMetadata metadata) + { + CassandraGenerators.visitUDTs(metadata, next -> { + String cql = next.toCqlString(false, false, true); + logger.warn("Creating UDT {}", cql); + schemaChange(cql); + }); + } + + protected String createIndexName() + { + String name = createSchemaElementName(SchemaElement.SchemaElementType.INDEX, null); + indexes.add(name); + return name; + } + + protected UntypedResultSet execute(org.apache.cassandra.cql3.ast.Statement stmt) + { + return executeFormattedQuery(stmt.toCQL(), (Object[]) stmt.bindsEncoded()); + } + + protected ResultSet executeNet(ProtocolVersion protocolVersion, org.apache.cassandra.cql3.ast.Statement stmt) + { + return sessionNet(protocolVersion).execute(stmt.toCQL(), (Object[]) stmt.bindsEncoded()); + } + @FunctionalInterface public interface CheckedFunction { @@ -2712,7 +2992,7 @@ else if (type instanceof BytesType) return s; } - protected static ByteBuffer makeByteBuffer(Object value, AbstractType type) + public static ByteBuffer makeByteBuffer(Object value, AbstractType type) { if (value == null) return null; @@ -2721,11 +3001,42 @@ protected static ByteBuffer makeByteBuffer(Object value, AbstractType type) return ((TupleValue)value).toByteBuffer(); if (value instanceof ByteBuffer) - return (ByteBuffer)value; + return ((ByteBuffer)value); return type.decomposeUntyped(serializeTuples(value)); } + public static CellValidator makeCellValidator(Object value, AbstractType type) + { + if (value == null) + return null; + if (value instanceof CellValidator) + return (CellValidator) value; + + ByteBuffer byteBuffer = makeByteBuffer(value, type); + return new CellValidator() + { + @Override + public ByteBuffer expected() + { + return byteBuffer; + } + + @Override + public boolean equals(ByteBuffer bb) + { + if (bb == null) return false; + return byteBuffer.equals(bb); + } + + @Override + public String describe() + { + return formatValue(byteBuffer, type); + } + }; + } + private static String formatValue(ByteBuffer bb, AbstractType type) { if (bb == null) @@ -2736,8 +3047,15 @@ private static String formatValue(ByteBuffer bb, AbstractType type) // CollectionType override getString() to use hexToBytes. We can't change that // without breaking SSTable2json, but the serializer for collection have the // right getString so using it directly instead. - TypeSerializer ser = type.getSerializer(); - return ser.toString(ser.deserialize(bb)); + try + { + TypeSerializer ser = type.getSerializer(); + return ser.toString(ser.deserialize(bb)); + } + catch (Throwable t) + { + return "TypeSerializer.toString failed for type " + type.asCQL3Type() + ": " + t.getMessage(); + } } try @@ -2750,12 +3068,12 @@ private static String formatValue(ByteBuffer bb, AbstractType type) } } - protected TupleValue tuple(Object...values) + public static TupleValue tuple(Object...values) { return new TupleValue(values); } - protected Object userType(Object... values) + public static UserTypeValue userType(Object... values) { if (values.length % 2 != 0) throw new IllegalArgumentException("userType() requires an even number of arguments"); @@ -2967,6 +3285,26 @@ private static AbstractType typeFor(Object value) throw new IllegalArgumentException("Unsupported value type (value is " + value + ")"); } + protected static String wrapInTxn(String... stmts) + { + return wrapInTxn(Arrays.asList(stmts)); + } + + protected static String wrapInTxn(List stmts) + { + StringBuilder sb = new StringBuilder(); + sb.append("BEGIN TRANSACTION\n"); + for (String stmt : stmts) + { + sb.append('\t').append(stmt); + if (!stmt.endsWith(";")) + sb.append(';'); + sb.append('\n'); + } + sb.append("COMMIT TRANSACTION"); + return sb.toString(); + } + private static class TupleValue { protected final Object[] values; @@ -2986,7 +3324,7 @@ public ByteBuffer toByteBuffer() types.add(type); bbs.add(makeByteBuffer(value, type)); } - return new TupleType(types).pack(bbs); + return new TupleType(types).pack(bbs, ByteBufferAccessor.instance); } public String toCQLString() @@ -3024,7 +3362,7 @@ public int hashCode() } } - private static class UserTypeValue extends TupleValue + public static class UserTypeValue extends TupleValue { private final String[] fieldNames; @@ -3153,7 +3491,7 @@ protected static long seed() return SEED; } - protected static void setupSeed() + public static void setupSeed() { if (RANDOM != null) return; SEED = TEST_RANDOM_SEED.getLong(new DefaultRandom().nextLong()); @@ -3166,7 +3504,7 @@ public void resetSeed() RANDOM.setSeed(SEED); } - protected static void updateConfigs() + public static void updateConfigs() { if (CONFIG_GEN == null) CONFIG_GEN = new ConfigGenBuilder().build(); @@ -3210,17 +3548,31 @@ public static abstract class InMemory extends CQLTester * Used by {@link #cleanupFileSystemListeners()} to know if file system listeners should be removed at the start * of a test; can disable for cases where listeners are needed cross mutliple tests. */ - protected boolean cleanupFileSystemListeners = true; + protected static boolean cleanupFileSystemListeners = true; @BeforeClass public static void setUpClass() + { + prePrepareServer(); + + // Once per-JVM is enough + prepareServer(); + } + + protected static void prePrepareServer() + { + setupFileSystem(); + + CQLTester.prePrepareServer(); + } + + protected static void setupFileSystem() { fs = FileSystems.newGlobalInMemoryFileSystem(); CassandraRelevantProperties.IGNORE_MISSING_NATIVE_FILE_HINTS.setBoolean(true); FileSystems.maybeCreateTmp(); - - CQLTester.setUpClass(); } + @Before public void cleanupFileSystemListeners() { @@ -3228,6 +3580,18 @@ public void cleanupFileSystemListeners() return; fs.clearListeners(); } + + protected ListenableFileSystem.PathFilter isCurrentTableIndexFile(String keyspace) + { + return path -> { + if (!path.getFileName().toString().endsWith("Index.db")) + return false; + Descriptor desc = Descriptor.fromFile(new File(path)); + if (!desc.ksname.equals(keyspace) && desc.cfname.equals(currentTable())) + return false; + return true; + }; + } } private static class ClusterSettings diff --git a/test/unit/org/apache/cassandra/cql3/ColumnSpecificationTest.java b/test/unit/org/apache/cassandra/cql3/ColumnSpecificationTest.java new file mode 100644 index 000000000000..3731ead11351 --- /dev/null +++ b/test/unit/org/apache/cassandra/cql3/ColumnSpecificationTest.java @@ -0,0 +1,209 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.cql3; + +import java.util.Map; + +import org.junit.Before; +import org.junit.Test; + +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.schema.CompactionParams; + +/** + * Test various "extensions" to a column spec when altering / creating a table + */ +public class ColumnSpecificationTest extends CQLTester +{ + @Before + public void before() + { + DatabaseDescriptor.setDynamicDataMaskingEnabled(true); + } + + @Test + public void testCreateTableWithColumnHavingMaskBeforeCheck() + { + createTable("CREATE TABLE %s (pk text primary key, name text MASKED WITH system.mask_default() CHECK NOT NULL AND LENGTH() > 1);"); + verifyColumnSpec("name text MASKED WITH system.mask_default() CHECK NOT NULL AND LENGTH() > 1"); + } + + @Test + public void testAlterTableAlterColumnWithMaskAndCheckStandalone() + { + createTable("CREATE TABLE %s (pk text, name text, primary key (pk));"); + execute("ALTER TABLE %s ALTER name MASKED WITH system.mask_default()"); + execute("ALTER TABLE %s ALTER name CHECK NOT NULL AND LENGTH() > 1;"); + verifyColumnSpec("name text MASKED WITH system.mask_default() CHECK NOT NULL AND LENGTH() > 1"); + } + + @Test + public void testAlterTableAlterColumnWithMask() + { + createTable("CREATE TABLE %s (pk text, name text, primary key (pk));"); + execute("ALTER TABLE %s ALTER name MASKED WITH system.mask_default()"); + verifyColumnSpec("name text MASKED WITH system.mask_default()"); + } + + @Test + public void testAlterTableAlterColumnWithCheck() + { + createTable("CREATE TABLE %s (pk text, name text, primary key (pk));"); + execute("ALTER TABLE %s ALTER name CHECK NOT NULL AND LENGTH() > 1;"); + verifyColumnSpec("name text CHECK NOT NULL AND LENGTH() > 1"); + } + + @Test + public void testAddingCheckToColumnWithMask() + { + createTable("CREATE TABLE %s (pk text primary key, name text MASKED WITH system.mask_default());"); + execute("ALTER TABLE %s ALTER name CHECK NOT NULL AND LENGTH() > 1"); + verifyColumnSpec("name text MASKED WITH system.mask_default() CHECK NOT NULL AND LENGTH() > 1"); + } + + @Test + public void testAddingMaskToColumnWithCheck() + { + createTable("CREATE TABLE %s (pk text primary key, name text CHECK NOT NULL AND LENGTH() > 1);"); + execute("ALTER TABLE %s ALTER name MASKED WITH system.mask_default()"); + verifyColumnSpec("name text MASKED WITH system.mask_default() CHECK NOT NULL AND LENGTH() > 1"); + } + + @Test + public void testDroppingCheckKeepsMask() + { + createTable("CREATE TABLE %s (pk text primary key, name text MASKED WITH system.mask_default() CHECK NOT NULL AND LENGTH() > 1);"); + execute("ALTER TABLE %s ALTER name DROP CHECK"); + verifyColumnSpec("name text MASKED WITH system.mask_default()"); + } + + @Test + public void droppingMaskKeepsCheck() + { + createTable("CREATE TABLE %s (pk text primary key, name text MASKED WITH system.mask_default() CHECK NOT NULL AND LENGTH() > 1);"); + execute("ALTER TABLE %s ALTER name DROP MASKED"); + verifyColumnSpec("name text CHECK NOT NULL AND LENGTH() > 1"); + } + + @Test + public void testAlterTableAddColumnWithCheck() + { + createTable("CREATE TABLE %s (pk text primary key);"); + execute("ALTER TABLE %s ADD name text CHECK NOT NULL AND LENGTH() > 1"); + verifyColumnSpec("name text CHECK NOT NULL AND LENGTH() > 1"); + } + + @Test + public void testAlterTableAddColumnWithMask() + { + createTable("CREATE TABLE %s (pk text primary key);"); + execute("ALTER TABLE %s ADD name text MASKED WITH system.mask_default()"); + verifyColumnSpec("name text MASKED WITH system.mask_default()"); + } + + @Test + public void testAlterTableAddColumnWithMaskAndCheck() + { + createTable("CREATE TABLE %s (pk text primary key);"); + execute("ALTER TABLE %s ADD name text MASKED WITH system.mask_default() CHECK NOT NULL"); + verifyColumnSpec("name text MASKED WITH system.mask_default() CHECK NOT NULL"); + } + + @Test + public void testAlterTableAddColumnWithMaskAndMultipleChecks() + { + createTable("CREATE TABLE %s (pk text primary key);"); + execute("ALTER TABLE %s ADD name text MASKED WITH system.mask_default() CHECK NOT NULL AND LENGTH() > 1"); + verifyColumnSpec("name text MASKED WITH system.mask_default() CHECK NOT NULL AND LENGTH() > 1"); + } + + /** + * TODO - investigate if it is possible to specify checks before mask when creating a table + */ + @Test(expected = RuntimeException.class) + public void testFailingCreateTableWithColumnHavingMaskAfterCheck() + { + createTable("CREATE TABLE %s (pk text primary key, name text CHECK NOT NULL AND LENGTH() > 1 MASKED WITH system.mask_default());"); + } + + /** + * TODO - investigate if it is possible to specify both check and mask, check being first + */ + @Test(expected = RuntimeException.class) + public void testFailingAlterTableAlterColumnWithCheckAndMask() + { + createTable("CREATE TABLE %s (pk text, name text, primary key (pk));"); + execute("ALTER TABLE %s ALTER name CHECK NOT NULL AND LENGTH() > 1 MASKED WITH system.mask_default();"); + verifyColumnSpec("name text MASKED WITH system.mask_default() CHECK NOT NULL AND LENGTH() > 1"); + } + + /** + * TODO - investigate if it is possible to specify both check and mask, mask being first + */ + @Test(expected = RuntimeException.class) + public void testFailingAlterTableAlterColumnWithMaskAndCheck() + { + createTable("CREATE TABLE %s (pk text, name text, primary key (pk));"); + execute("ALTER TABLE %s ALTER name MASKED WITH system.mask_default() CHECK NOT NULL AND LENGTH() > 1"); + verifyColumnSpec("name text MASKED WITH system.mask_default() CHECK NOT NULL AND LENGTH() > 1"); + } + + private void verifyColumnSpec(String modifiedColumn) + { + assertRowsContains(executeNetWithoutPaging("DESCRIBE TABLE " + KEYSPACE + '.' + currentTable()), + row(KEYSPACE, + "table", + currentTable(), + "CREATE TABLE " + KEYSPACE + '.' + currentTable() + " (\n" + + " pk text PRIMARY KEY,\n" + + " " + modifiedColumn + '\n' + + ") WITH " + tableParametersCql())); + } + + static String tableParametersCql() + { + return "additional_write_policy = '99p'\n" + + " AND allow_auto_snapshot = true\n" + + " AND bloom_filter_fp_chance = 0.01\n" + + " AND caching = {'keys': 'ALL', 'rows_per_partition': 'NONE'}\n" + + " AND cdc = false\n" + + " AND comment = ''\n" + + " AND compaction = " + cqlQuoted(CompactionParams.DEFAULT.asMap()) + '\n' + + " AND compression = {'chunk_length_in_kb': '16', 'class': 'org.apache.cassandra.io.compress.LZ4Compressor'}\n" + + " AND memtable = 'default'\n" + + " AND crc_check_chance = 1.0\n" + + " AND fast_path = 'keyspace'\n" + + " AND default_time_to_live = 0\n" + + " AND extensions = {}\n" + + " AND gc_grace_seconds = 864000\n" + + " AND incremental_backups = true\n" + + " AND max_index_interval = 2048\n" + + " AND memtable_flush_period_in_ms = 0\n" + + " AND min_index_interval = 128\n" + + " AND read_repair = 'BLOCKING'\n" + + " AND transactional_mode = 'off'\n" + + " AND transactional_migration_from = 'none'\n" + + " AND speculative_retry = '99p';"; + } + + private static String cqlQuoted(Map map) + { + return new CqlBuilder().append(map).toString(); + } +} diff --git a/test/unit/org/apache/cassandra/cql3/KnownIssue.java b/test/unit/org/apache/cassandra/cql3/KnownIssue.java index be2dfe75248c..19378ce496f1 100644 --- a/test/unit/org/apache/cassandra/cql3/KnownIssue.java +++ b/test/unit/org/apache/cassandra/cql3/KnownIssue.java @@ -43,6 +43,12 @@ public enum KnownIssue "When doing an SAI query, if the where clause also contains a vector column bad results can be produced"), CAS_CONDITION_ON_UDT_W_EMPTY_BYTES("https://issues.apache.org/jira/browse/CASSANDRA-20479", "WHERE clause blocks operations on UDTs but CAS allows in IF clause. During this path empty can be confused with null which allows non-existing rows to match empty bytes"), + CAS_ON_STATIC_ROW("", + "When you do a CAS to the partition level the read is SELECT statics LIMIT 1, if the CAS doesn't apply the response includes the first row in the partition with its values redacted... this statement is partition level and not row level, would expect just the applied column like the other cases where the static row isn't present"), + STATIC_LIST_APPEND_WITH_CLUSTERING_IN("", + "When an 'UPDATE SET s += [0] WHERE pk = ? AND ck IN (?, ?)' happens the static operation happens twice, so the list append adds 2 elements!"), + ACCORD_JOURNAL_SUPPORT_DROP_TABLE("", + "When DROP TABLE is done, this is currently not plumbed through to Journals snapshot logic, which leads to unknown keyspace/table errors"), ; KnownIssue(String url, String description) diff --git a/test/unit/org/apache/cassandra/cql3/NodeLocalConsistencyTest.java b/test/unit/org/apache/cassandra/cql3/NodeLocalConsistencyTest.java index a9f3d267df63..be9fcb613674 100644 --- a/test/unit/org/apache/cassandra/cql3/NodeLocalConsistencyTest.java +++ b/test/unit/org/apache/cassandra/cql3/NodeLocalConsistencyTest.java @@ -22,12 +22,13 @@ import org.apache.cassandra.config.CassandraRelevantProperties; +import static org.junit.Assert.assertEquals; + import static org.apache.cassandra.db.ConsistencyLevel.NODE_LOCAL; import static org.apache.cassandra.metrics.ClientRequestsMetricsHolder.readMetrics; import static org.apache.cassandra.metrics.ClientRequestsMetricsHolder.readMetricsForLevel; import static org.apache.cassandra.metrics.ClientRequestsMetricsHolder.writeMetrics; import static org.apache.cassandra.metrics.ClientRequestsMetricsHolder.writeMetricsForLevel; -import static org.junit.Assert.assertEquals; public class NodeLocalConsistencyTest extends CQLTester { @@ -35,6 +36,7 @@ public class NodeLocalConsistencyTest extends CQLTester public static void setUp() throws Exception { CassandraRelevantProperties.ENABLE_NODELOCAL_QUERIES.setBoolean(true); + requireNetwork(); } @Test @@ -87,4 +89,18 @@ public void testSelect() assertEquals(1, afterLevel - beforeLevel); assertEquals(1, afterGlobal - beforeGlobal); } -} \ No newline at end of file + + @Test + public void testTransaction() + { + createTable("CREATE TABLE %s (key text, val int, PRIMARY KEY(key)) WITH transactional_mode='full'"); + QueryProcessor.process(formatQuery("INSERT INTO %s (key, val) VALUES ('foo', 0)"), NODE_LOCAL); + + String query = "BEGIN TRANSACTION\n" + + " SELECT * FROM %s WHERE key = 'foo';\n" + + "COMMIT TRANSACTION"; + + UntypedResultSet rows = QueryProcessor.process(formatQuery(query), NODE_LOCAL); + assertEquals(1, rows.size()); + } +} diff --git a/test/unit/org/apache/cassandra/cql3/PreparedStatementsTest.java b/test/unit/org/apache/cassandra/cql3/PreparedStatementsTest.java index 39c641daa278..47cae295d879 100644 --- a/test/unit/org/apache/cassandra/cql3/PreparedStatementsTest.java +++ b/test/unit/org/apache/cassandra/cql3/PreparedStatementsTest.java @@ -22,9 +22,13 @@ import java.util.Collections; import java.util.EnumSet; import java.util.List; +import java.util.concurrent.TimeUnit; import java.util.stream.Collectors; +import com.google.common.util.concurrent.Uninterruptibles; +import org.junit.Assume; import org.junit.Before; +import org.junit.BeforeClass; import org.junit.Test; import com.datastax.driver.core.Cluster; @@ -32,6 +36,9 @@ import com.datastax.driver.core.ResultSet; import com.datastax.driver.core.Session; import com.datastax.driver.core.exceptions.SyntaxError; +import com.datastax.driver.core.exceptions.WriteTimeoutException; +import org.apache.cassandra.ServerTestUtils; +import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.db.ConsistencyLevel; import org.apache.cassandra.exceptions.PreparedQueryNotFoundException; import org.apache.cassandra.index.StubIndex; @@ -39,10 +46,16 @@ import org.apache.cassandra.serializers.Int32Serializer; import org.apache.cassandra.service.ClientState; import org.apache.cassandra.service.ClientWarn; +import org.apache.cassandra.service.accord.AccordService; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.tcm.ClusterMetadataService; +import org.apache.cassandra.tcm.Epoch; import org.apache.cassandra.transport.ProtocolVersion; import org.apache.cassandra.transport.SimpleClient; import org.apache.cassandra.transport.messages.ResultMessage; +import org.assertj.core.api.Assertions; +import static org.apache.cassandra.service.consensus.TransactionalMode.test_unsafe; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNull; import static org.junit.Assert.assertTrue; @@ -55,10 +68,39 @@ public class PreparedStatementsTest extends CQLTester " WITH REPLICATION = { 'class' : 'SimpleStrategy', 'replication_factor' : 1 };"; private static final String dropKsStatement = "DROP KEYSPACE IF EXISTS " + KEYSPACE; + @BeforeClass + public static void setUpClass() + { + ServerTestUtils.daemonInitialization(); + DatabaseDescriptor.setAccordTransactionsEnabled(true); + CQLTester.setUpClass(); + } + @Before public void setup() { requireNetwork(); + for (int i=0; i<10; i++) + ClusterMetadataService.instance().log().waitForHighestConsecutive(); + } + + private static void runAndAwaitNextEpoch(Runnable runnable) + { + try + { + Epoch current = ClusterMetadata.current().epoch; + runnable.run(); + ClusterMetadataService.instance().awaitAtLeast(Epoch.create(current.getEpoch() + 1)); + } + catch (Throwable e) + { + throw new RuntimeException(e); + } + } + + private static void sessionSchemaUpdate(Session session, String update) + { + runAndAwaitNextEpoch(() -> session.execute(update)); } @Test @@ -156,33 +198,42 @@ else if (expectWarn) public void testInvalidatePreparedStatementsOnDrop() { Session session = sessionNet(ProtocolVersion.V5); - session.execute(dropKsStatement); - session.execute(createKsStatement); + sessionSchemaUpdate(session, dropKsStatement); + sessionSchemaUpdate(session, createKsStatement); - String createTableStatement = "CREATE TABLE IF NOT EXISTS " + KEYSPACE + ".qp_cleanup (id int PRIMARY KEY, cid int, val text);"; + String createTableStatement = "CREATE TABLE IF NOT EXISTS " + KEYSPACE + ".qp_cleanup (id int PRIMARY KEY, cid int, val text) WITH transactional_mode='" + test_unsafe + "';"; String dropTableStatement = "DROP TABLE IF EXISTS " + KEYSPACE + ".qp_cleanup;"; - session.execute(createTableStatement); + sessionSchemaUpdate(session, createTableStatement); + + String insert = "INSERT INTO " + KEYSPACE + ".qp_cleanup (id, cid, val) VALUES (?, ?, ?)"; + PreparedStatement prepared = session.prepare(insert); + PreparedStatement preparedBatch = session.prepare(batch(insert)); + PreparedStatement preparedTxn = session.prepare(txn(insert)); + preparedTxn.setConsistencyLevel(com.datastax.driver.core.ConsistencyLevel.QUORUM); + + sessionSchemaUpdate(session, dropTableStatement); + sessionSchemaUpdate(session, createTableStatement); + updateTxnState(); - PreparedStatement prepared = session.prepare("INSERT INTO " + KEYSPACE + ".qp_cleanup (id, cid, val) VALUES (?, ?, ?)"); - PreparedStatement preparedBatch = session.prepare("BEGIN BATCH " + - "INSERT INTO " + KEYSPACE + ".qp_cleanup (id, cid, val) VALUES (?, ?, ?);" + - "APPLY BATCH;"); - session.execute(dropTableStatement); - session.execute(createTableStatement); session.execute(prepared.bind(1, 1, "value")); session.execute(preparedBatch.bind(2, 2, "value2")); + session.execute(preparedTxn.bind(3, 3, "value3")); - session.execute(dropKsStatement); - session.execute(createKsStatement); - session.execute(createTableStatement); + sessionSchemaUpdate(session, dropTableStatement); // since this is an accord table, need to drop the table before the keyspace + sessionSchemaUpdate(session, dropKsStatement); + sessionSchemaUpdate(session, createKsStatement); + sessionSchemaUpdate(session, createTableStatement); + updateTxnState(); // The driver will get a response about the prepared statement being invalid, causing it to transparently // re-prepare the statement. We'll rely on the fact that we get no errors while executing this to show that // the statements have been invalidated. session.execute(prepared.bind(1, 1, "value")); session.execute(preparedBatch.bind(2, 2, "value2")); - session.execute(dropKsStatement); + session.execute(preparedTxn.bind(3, 3, "value3")); + sessionSchemaUpdate(session, dropTableStatement); // since this is an accord table, need to drop the table before the keyspace + sessionSchemaUpdate(session, dropKsStatement); } @Test @@ -200,14 +251,18 @@ public void testInvalidatePreparedStatementOnAlterV4() private void testInvalidatePreparedStatementOnAlter(ProtocolVersion version, boolean supportsMetadataChange) { Session session = sessionNet(version); - String createTableStatement = "CREATE TABLE IF NOT EXISTS " + KEYSPACE + ".qp_cleanup (a int PRIMARY KEY, b int, c int);"; + String createTableStatement = "CREATE TABLE IF NOT EXISTS " + KEYSPACE + ".qp_cleanup (a int PRIMARY KEY, b int, c int) WITH transactional_mode='" + test_unsafe + "';"; String alterTableStatement = "ALTER TABLE " + KEYSPACE + ".qp_cleanup ADD d int;"; + String dropTableStatement = "DROP TABLE IF EXISTS " + KEYSPACE + ".qp_cleanup;"; - session.execute(dropKsStatement); - session.execute(createKsStatement); - session.execute(createTableStatement); + sessionSchemaUpdate(session, dropKsStatement); + sessionSchemaUpdate(session, createKsStatement); + sessionSchemaUpdate(session, createTableStatement); + updateTxnState(); - PreparedStatement preparedSelect = session.prepare("SELECT * FROM " + KEYSPACE + ".qp_cleanup"); + String select = "SELECT * FROM " + KEYSPACE + ".qp_cleanup"; + PreparedStatement preparedSelect = session.prepare(select); + PreparedStatement preparedSelectTxn = session.prepare(txn(select + " WHERE a = ?")); session.execute("INSERT INTO " + KEYSPACE + ".qp_cleanup (a, b, c) VALUES (?, ?, ?);", 1, 2, 3); session.execute("INSERT INTO " + KEYSPACE + ".qp_cleanup (a, b, c) VALUES (?, ?, ?);", @@ -216,8 +271,14 @@ private void testInvalidatePreparedStatementOnAlter(ProtocolVersion version, boo assertRowsNet(session.execute(preparedSelect.bind()), row(1, 2, 3), row(2, 3, 4)); + assertRowsNet(session.execute(preparedSelectTxn.bind(1)), + row(1, 2, 3)); + assertRowsNet(session.execute(preparedSelectTxn.bind(2)), + row(2, 3, 4)); + + sessionSchemaUpdate(session, alterTableStatement); + updateTxnState(); - session.execute(alterTableStatement); session.execute("INSERT INTO " + KEYSPACE + ".qp_cleanup (a, b, c, d) VALUES (?, ?, ?, ?);", 3, 4, 5, 6); @@ -231,18 +292,36 @@ private void testInvalidatePreparedStatementOnAlter(ProtocolVersion version, boo row(2, 3, 4, null), row(3, 4, 5, 6)); assertEquals(rs.getColumnDefinitions().size(), 4); + + for (int i = 1; i <= 3; i++) + { + rs = session.execute(preparedSelectTxn.bind(i)); + assertRowsNet(version, + rs, + row(i, i + 1, i + 2, i == 3 ? 6 : null)); + assertEquals(rs.getColumnDefinitions().size(), 4); + } } else { rs = session.execute(preparedSelect.bind()); - assertRowsNet(rs, + assertRowsNet(version, + rs, row(1, 2, 3), row(2, 3, 4), row(3, 4, 5)); - assertEquals(rs.getColumnDefinitions().size(), 3); + assertEquals(3, rs.getColumnDefinitions().size()); + for (int i = 1; i <= 3; i++) + { + rs = session.execute(preparedSelectTxn.bind(i)); + Assertions.assertThat(columnNames(rs)) + .containsExactlyInAnyOrder("a", "b", "c"); + assertRowsNet(version, rs, row(i, i + 1, i + 2)); + } } - session.execute(dropKsStatement); + sessionSchemaUpdate(session, dropTableStatement); + sessionSchemaUpdate(session, dropKsStatement); } @Test @@ -260,60 +339,84 @@ public void testInvalidatePreparedStatementOnAlterUnchangedMetadataV5() private void testInvalidatePreparedStatementOnAlterUnchangedMetadata(ProtocolVersion version) { Session session = sessionNet(version); - String createTableStatement = "CREATE TABLE IF NOT EXISTS " + KEYSPACE + ".qp_cleanup (a int PRIMARY KEY, b int, c int);"; + String createTableStatement = "CREATE TABLE IF NOT EXISTS " + KEYSPACE + ".qp_cleanup (a int PRIMARY KEY, b int, c int) WITH transactional_mode='" + test_unsafe + "';"; String alterTableStatement = "ALTER TABLE " + KEYSPACE + ".qp_cleanup ADD d int;"; + String dropTableStatement = "DROP TABLE IF EXISTS " + KEYSPACE + ".qp_cleanup;"; - session.execute(dropKsStatement); - session.execute(createKsStatement); - session.execute(createTableStatement); + sessionSchemaUpdate(session, dropKsStatement); + sessionSchemaUpdate(session, createKsStatement); + sessionSchemaUpdate(session, createTableStatement); + updateTxnState(); - PreparedStatement preparedSelect = session.prepare("SELECT a, b, c FROM " + KEYSPACE + ".qp_cleanup"); + String select = "SELECT a, b, c FROM " + KEYSPACE + ".qp_cleanup"; + PreparedStatement preparedSelect = session.prepare(select); + PreparedStatement preparedSelectTxn = session.prepare(txn(select + " WHERE a = ?")); session.execute("INSERT INTO " + KEYSPACE + ".qp_cleanup (a, b, c) VALUES (?, ?, ?);", 1, 2, 3); session.execute("INSERT INTO " + KEYSPACE + ".qp_cleanup (a, b, c) VALUES (?, ?, ?);", 2, 3, 4); ResultSet rs = session.execute(preparedSelect.bind()); - assertRowsNet(rs, row(1, 2, 3), row(2, 3, 4)); assertEquals(rs.getColumnDefinitions().size(), 3); - session.execute(alterTableStatement); + for (int i = 1; i <= 2; i++) + { + rs = session.execute(preparedSelectTxn.bind(i)); + assertRowsNet(rs, row(i, i + 1, i + 2)); + Assertions.assertThat(columnNames(rs)).containsExactlyInAnyOrder("a", "b", "c"); + } + + sessionSchemaUpdate(session, alterTableStatement); + updateTxnState(); + session.execute("INSERT INTO " + KEYSPACE + ".qp_cleanup (a, b, c, d) VALUES (?, ?, ?, ?);", 3, 4, 5, 6); rs = session.execute(preparedSelect.bind()); - assertRowsNet(rs, + assertRowsNet(version, rs, row(1, 2, 3), row(2, 3, 4), row(3, 4, 5)); assertEquals(rs.getColumnDefinitions().size(), 3); - session.execute(dropKsStatement); + for (int i = 1; i <= 3; i++) + { + rs = session.execute(preparedSelectTxn.bind(i)); + assertRowsNet(rs, row(i, i + 1, i + 2)); + Assertions.assertThat(columnNames(rs)).containsExactlyInAnyOrder("a", "b", "c"); + } + + sessionSchemaUpdate(session, dropTableStatement); + sessionSchemaUpdate(session, dropKsStatement); } @Test - public void testStatementRePreparationOnReconnect() + public void testStatementRePreparationOnReconnect() throws Throwable { Session session = sessionNet(ProtocolVersion.V5); session.execute("USE " + keyspace()); - session.execute(dropKsStatement); - session.execute(createKsStatement); - - createTable("CREATE TABLE %s (id int PRIMARY KEY, cid int, val text);"); - + sessionSchemaUpdate(session, dropKsStatement); + sessionSchemaUpdate(session, createKsStatement); + runAndAwaitNextEpoch(() -> createTable("CREATE TABLE %s (id int PRIMARY KEY, cid int, val text) WITH transactional_mode='" + test_unsafe + "';")); + updateTxnState(); String insertCQL = "INSERT INTO " + currentTable() + " (id, cid, val) VALUES (?, ?, ?)"; String selectCQL = "Select * from " + currentTable() + " where id = ?"; PreparedStatement preparedInsert = session.prepare(insertCQL); PreparedStatement preparedSelect = session.prepare(selectCQL); + PreparedStatement preparedTxn = session.prepare(txn(selectCQL, insertCQL)); + preparedTxn.setConsistencyLevel(com.datastax.driver.core.ConsistencyLevel.QUORUM); session.execute(preparedInsert.bind(1, 1, "value")); assertEquals(1, session.execute(preparedSelect.bind(1)).all().size()); + // txn will return state before mutations are applied, so null result + assertRowsNet(ProtocolVersion.V5, + session.execute(preparedTxn.bind(2, 2, 2, "value2"))); try (Cluster newCluster = Cluster.builder() .addContactPoints(nativeAddr) @@ -328,46 +431,50 @@ public void testStatementRePreparationOnReconnect() newSession.execute("USE " + keyspace()); preparedInsert = newSession.prepare(insertCQL); preparedSelect = newSession.prepare(selectCQL); - newSession.execute(preparedInsert.bind(1, 1, "value")); + newSession.execute(preparedInsert.bind(1, 1, "value")); assertEquals(1, newSession.execute(preparedSelect.bind(1)).all().size()); + + assertRowsNet(ProtocolVersion.V5, + session.execute(preparedTxn.bind(2, 2, 2, "value2")), + row(2, 2, "value2")); } } } @Test - public void prepareAndExecuteWithCustomExpressions() throws Throwable + public void prepareAndExecuteWithCustomExpressions() { Session session = sessionNet(ProtocolVersion.V5); - session.execute(dropKsStatement); - session.execute(createKsStatement); + sessionSchemaUpdate(session, dropKsStatement); + sessionSchemaUpdate(session, createKsStatement); String table = "custom_expr_test"; String index = "custom_index"; - session.execute(String.format("CREATE TABLE IF NOT EXISTS %s.%s (id int PRIMARY KEY, cid int, val text);", + sessionSchemaUpdate(session, String.format("CREATE TABLE IF NOT EXISTS %s.%s (id int PRIMARY KEY, cid int, val text) WITH transactional_mode='" + test_unsafe + "';", KEYSPACE, table)); - session.execute(String.format("CREATE CUSTOM INDEX %s ON %s.%s(val) USING '%s'", + sessionSchemaUpdate(session, String.format("CREATE CUSTOM INDEX %s ON %s.%s(val) USING '%s'", index, KEYSPACE, table, StubIndex.class.getName())); - session.execute(String.format("INSERT INTO %s.%s(id, cid, val) VALUES (0, 0, 'test')", KEYSPACE, table)); - - PreparedStatement prepared1 = session.prepare(String.format("SELECT * FROM %s.%s WHERE expr(%s, 'foo')", - KEYSPACE, table, index)); - assertEquals(1, session.execute(prepared1.bind()).all().size()); + updateTxnState(); - PreparedStatement prepared2 = session.prepare(String.format("SELECT * FROM %s.%s WHERE expr(%s, ?)", - KEYSPACE, table, index)); - assertEquals(1, session.execute(prepared2.bind("foo bar baz")).all().size()); + session.execute(String.format("INSERT INTO %s.%s(id, cid, val) VALUES (0, 0, 'test')", KEYSPACE, table)); - try - { - session.prepare(String.format("SELECT * FROM %s.%s WHERE expr(?, 'foo bar baz')", KEYSPACE, table)); - fail("Expected syntax exception, but none was thrown"); - } - catch(SyntaxError e) - { - assertEquals("Bind variables cannot be used for index names", e.getMessage()); - } + String select = String.format("SELECT * FROM %s.%s WHERE expr(%s, 'foo')", KEYSPACE, table, index); + assertEquals(1, session.execute(session.prepare(select).bind()).all().size()); + assertEquals(1, session.execute(session.prepare(txn(select + " AND id = ?")).bind(0)).all().size()); + + String select2 = String.format("SELECT * FROM %s.%s WHERE expr(%s, ?)", KEYSPACE, table, index); + assertEquals(1, session.execute(session.prepare(select2).bind("foo bar baz")).all().size()); + assertEquals(1, session.execute(session.prepare(txn(select2 + " AND id = ?")).bind("foo bar baz", 0)).all().size()); + + String badSelect = String.format("SELECT * FROM %s.%s WHERE expr(?, 'foo bar baz')", KEYSPACE, table); + Assertions.assertThatThrownBy(() -> session.prepare(badSelect)) + .isInstanceOf(SyntaxError.class) + .hasMessage("Bind variables cannot be used for index names"); + Assertions.assertThatThrownBy(() -> session.prepare(txn(badSelect + " AND id = ?"))) + .isInstanceOf(SyntaxError.class) + .hasMessage("Bind variables cannot be used for index names"); } @Test @@ -386,7 +493,7 @@ public void testMetadataFlagsWithLWTs() throws Throwable // Note: this test does not cover all aspects of 10786 (yet) - it was intended to test the // changes for CASSANDRA-13992. - createTable("CREATE TABLE %s (pk int, v1 int, v2 int, PRIMARY KEY (pk))"); + runAndAwaitNextEpoch(() -> createTable("CREATE TABLE %s (pk int, v1 int, v2 int, PRIMARY KEY (pk))")); execute("INSERT INTO %s (pk, v1, v2) VALUES (1,1,1)"); try (SimpleClient simpleClient = newSimpleClient(ProtocolVersion.BETA.orElse(ProtocolVersion.CURRENT))) @@ -572,7 +679,7 @@ private void testPrepareWithLWT(ProtocolVersion version) throws Throwable { Session session = sessionNet(version); session.execute("USE " + keyspace()); - createTable("CREATE TABLE %s (pk int, v1 int, v2 int, PRIMARY KEY (pk))"); + runAndAwaitNextEpoch(() -> createTable("CREATE TABLE %s (pk int, v1 int, v2 int, PRIMARY KEY (pk))")); PreparedStatement prepared1 = session.prepare(String.format("UPDATE %s SET v1 = ?, v2 = ? WHERE pk = 1 IF v1 = ?", currentTable())); PreparedStatement prepared2 = session.prepare(String.format("INSERT INTO %s (pk, v1, v2) VALUES (?, 200, 300) IF NOT EXISTS", currentTable())); @@ -636,7 +743,7 @@ private void testPrepareWithBatchLWT(ProtocolVersion version) throws Throwable { Session session = sessionNet(version); session.execute("USE " + keyspace()); - createTable("CREATE TABLE %s (pk int, v1 int, v2 int, PRIMARY KEY (pk))"); + runAndAwaitNextEpoch(() -> createTable("CREATE TABLE %s (pk int, v1 int, v2 int, PRIMARY KEY (pk))")); PreparedStatement prepared1 = session.prepare("BEGIN BATCH " + "UPDATE " + currentTable() + " SET v1 = ? WHERE pk = 1 IF v1 = ?;" + @@ -671,11 +778,132 @@ private void testPrepareWithBatchLWT(ProtocolVersion version) throws Throwable row(false, 1, 10, 20)); assertEquals(rs.getColumnDefinitions().size(), 4); - alterTable("ALTER TABLE %s ADD v3 int;"); + runAndAwaitNextEpoch(() -> alterTable("ALTER TABLE %s ADD v3 int;")); rs = session.execute(prepared2.bind()); assertRowsNet(rs, row(false, 1, 10, 20, null)); assertEquals(rs.getColumnDefinitions().size(), 5); } + + @Test + public void testPrepareWithAccordV4() + { + testPrepareWithAccord(ProtocolVersion.V4); + } + + @Test + public void testPrepareWithAccordV5() + { + Assume.assumeTrue("Protocol v5 is CURRENT", ProtocolVersion.CURRENT != ProtocolVersion.V5); + testPrepareWithAccord(ProtocolVersion.V5); + } + + @Test + public void testPrepareWithAccordCurrent() + { + testPrepareWithAccord(ProtocolVersion.CURRENT); + } + + private void testPrepareWithAccord(ProtocolVersion version) + { + int maxAttempts = 3; + Session session = sessionNet(version); + session.execute("USE " + keyspace()); + runAndAwaitNextEpoch(() -> createTable("CREATE TABLE %s (pk int, v1 int, v2 int, PRIMARY KEY (pk)) WITH transactional_mode='full'")); + updateTxnState(); + + PreparedStatement writeOnly = session.prepare(txn( + "INSERT INTO " + currentTable() + " (pk, v1, v2) VALUES (?, ?, ?)" + )); + PreparedStatement returnSelect = session.prepare(txn( + "SELECT * FROM " + currentTable() + " WHERE pk=?", + "UPDATE " + currentTable() + " SET v1 += 1, v2 += 2 WHERE pk = ?" + )); + PreparedStatement returnRef = session.prepare(txn( + "LET a = (SELECT * FROM " + currentTable() + " WHERE pk=?)", + "SELECT a.pk, a.v1, a.v2", + "UPDATE " + currentTable() + " SET v1 += 1, v2 += 2 WHERE pk = ?" + )); + // populate every row + int numPartitions = 5; + int[][] model = new int[numPartitions][]; + for (int writePk = 0; writePk < numPartitions; writePk++) + { + model[writePk] = new int[] {0, 0}; + assertRowsNet(version, session.execute(writeOnly.bind(writePk, 0, 0))); + } + + for (int writePk = 0; writePk < numPartitions; writePk++) + { + for (int readPk = 0; readPk < numPartitions; readPk++) + { + int[] expected = model[readPk]; + int[] mutated = model[writePk]; + for (boolean select : Arrays.asList(true, false)) + { + for (int retries = 0; retries < maxAttempts; retries++) + { + try + { + ResultSet rs = session.execute(select ? returnSelect.bind(readPk, writePk) + : returnRef.bind(readPk, writePk)); + assertRowsNet(version, rs, row(readPk, expected[0], expected[1])); + break; + } + catch (WriteTimeoutException e) + { + logger.warn("Write timeout seen", e); + if (retries >= maxAttempts - 1) throw e; + Uninterruptibles.sleepUninterruptibly(500, TimeUnit.MILLISECONDS); + } + finally + { + // update to account for counter bumps + mutated[0]++; + mutated[1] = mutated[1] + 2; + } + } + } + } + } + } + + private static String txn(String... stmts) + { + StringBuilder sb = new StringBuilder(); + sb.append("BEGIN TRANSACTION\n"); + for (String stmt : stmts) + { + sb.append(" ").append(stmt); + if (!stmt.endsWith(";")) sb.append(';'); + sb.append('\n'); + } + sb.append("COMMIT TRANSACTION"); + return sb.toString(); + } + + private static String batch(String... stmts) + { + StringBuilder sb = new StringBuilder(); + sb.append("BEGIN BATCH\n"); + for (String stmt : stmts) + { + sb.append(" ").append(stmt); + if (!stmt.endsWith(";")) sb.append(';'); + sb.append('\n'); + } + sb.append("APPLY BATCH"); + return sb.toString(); + } + + private static List columnNames(ResultSet rs) + { + return rs.getColumnDefinitions().asList().stream().map(d -> d.getName()).collect(Collectors.toList()); + } + + private static void updateTxnState() + { + AccordService.instance().setCacheSize(0); + } } diff --git a/test/unit/org/apache/cassandra/cql3/PstmtPersistenceTest.java b/test/unit/org/apache/cassandra/cql3/PstmtPersistenceTest.java index 910651091d1c..59dfdadf752d 100644 --- a/test/unit/org/apache/cassandra/cql3/PstmtPersistenceTest.java +++ b/test/unit/org/apache/cassandra/cql3/PstmtPersistenceTest.java @@ -21,34 +21,65 @@ import java.util.ArrayList; import java.util.Collections; import java.util.List; +import java.util.Map; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentMap; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; import org.junit.Assert; import org.junit.Before; import org.junit.Test; +import org.junit.runner.RunWith; +import org.apache.cassandra.db.ReadQuery; import org.apache.cassandra.db.SystemKeyspace; import org.apache.cassandra.db.marshal.Int32Type; import org.apache.cassandra.db.marshal.UTF8Type; import org.apache.cassandra.schema.SchemaConstants; import org.apache.cassandra.schema.SchemaKeyspaceTables; +import org.apache.cassandra.schema.TableMetadata; import org.apache.cassandra.service.ClientState; import org.apache.cassandra.transport.Dispatcher; import org.apache.cassandra.utils.ByteBufferUtil; import org.apache.cassandra.utils.MD5Digest; +import org.jboss.byteman.contrib.bmunit.BMRule; +import org.jboss.byteman.contrib.bmunit.BMRules; +import org.jboss.byteman.contrib.bmunit.BMUnitRunner; import static java.util.Collections.emptyMap; import static org.apache.cassandra.service.QueryState.forInternalCalls; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotEquals; +import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; +@RunWith(BMUnitRunner.class) public class PstmtPersistenceTest extends CQLTester { + private static final CompletableFuture[] futureArray = new CompletableFuture[0]; + + private static final ConcurrentMap preparedStatementLoadTimestamps = new ConcurrentHashMap<>(); + private static final ConcurrentMap preparedStatementRemoveTimestamps = new ConcurrentHashMap<>(); + + // page size passed to preloadPreparedStatements + private static final int PRELOAD_PAGE_SIZE = 100; + + // recorded page invocations in preloadPreparedStatements + private static final AtomicInteger pageInvocations = new AtomicInteger(); + @Before public void setUp() { + preparedStatementLoadTimestamps.clear(); + preparedStatementRemoveTimestamps.clear(); + QueryProcessor.clearPreparedStatements(false); } - + @Test public void testCachedPreparedStatements() throws Throwable { @@ -105,7 +136,7 @@ public void testCachedPreparedStatements() throws Throwable Assert.assertNotNull(prepared); } - // add anther prepared statement and sync it to table + // add another prepared statement and sync it to table prepareStatement(statement2, "foo", "bar", clientState); // statement1 will have two statements prepared because of `setKeyspace` usage @@ -143,12 +174,24 @@ public void testPstmtInvalidation() throws Throwable createTable("CREATE TABLE %s (key int primary key, val int)"); + long initialEvicted = numberOfEvictedStatements(); + for (int cnt = 1; cnt < 10000; cnt++) { prepareStatement("INSERT INTO %s (key, val) VALUES (?, ?) USING TIMESTAMP " + cnt, clientState); - if (numberOfEvictedStatements() > 0) + if (numberOfEvictedStatements() - initialEvicted > 0) { + assertEquals("Number of statements in table and in cache don't match", numberOfStatementsInMemory(), numberOfStatementsOnDisk()); + + // prepare more statements to trigger more evictions + for (int cnt2 = cnt + 1; cnt2 < cnt + 10; cnt2++) + prepareStatement("INSERT INTO %s (key, val) VALUES (?, ?) USING TIMESTAMP " + cnt2, clientState); + + // each new prepared statement should have caused an eviction + assertEquals("eviction count didn't increase by the expected number", 10, numberOfEvictedStatements() - initialEvicted); + assertEquals("Number of statements in memory (expected) and table (actual) don't match", numberOfStatementsInMemory(), numberOfStatementsOnDisk()); + return; } } @@ -156,6 +199,196 @@ public void testPstmtInvalidation() throws Throwable fail("Prepared statement eviction does not work"); } + @Test + @BMRules(rules= { + @BMRule(name = "CaptureWriteTimestamps", + targetClass = "SystemKeyspace", + targetMethod = "writePreparedStatement(String, MD5Digest, String, long)", + targetLocation = "AT INVOKE executeInternal", + action = "org.apache.cassandra.cql3.PstmtPersistenceTest.preparedStatementLoadTimestamps.put($key, $timestamp);" + ), + @BMRule(name = "CaptureEvictTimestamps", + targetClass = "QueryProcessor", + targetMethod = "evictPreparedStatement(MD5Digest, RemovalCause)", + action = "org.apache.cassandra.cql3.PstmtPersistenceTest.preparedStatementRemoveTimestamps.put($key, org.apache.cassandra.service.ClientState.getTimestamp());" + ) + }) + public void testAsyncPstmtInvalidation() throws Throwable + { + ClientState clientState = ClientState.forInternalCalls(); + createTable("CREATE TABLE %s (key int primary key, val int)"); + + // prepare statements concurrently in a thread pool to exercise bug encountered in CASSANDRA-19703 where + // delete from table occurs before the insert due to early eviction. + final ExecutorService executor = Executors.newFixedThreadPool(10); + + long initialEvicted = numberOfEvictedStatements(); + try + { + int initialMaxStatementsToPrepare = 10000; + int maxStatementsToPrepare = initialMaxStatementsToPrepare; + boolean hasEvicted = false; + int concurrency = 100; + List> prepareFutures = new ArrayList<>(concurrency); + + for (int cnt = 1; cnt <= maxStatementsToPrepare; cnt++) + { + final int localCnt = cnt; + prepareFutures.add(CompletableFuture.supplyAsync(() -> prepareStatement("INSERT INTO %s (key, val) VALUES (?, ?) USING TIMESTAMP " + localCnt, clientState), executor)); + + if (prepareFutures.size() == concurrency) + { + // Await completion of current inflight futures + CompletableFuture.allOf(prepareFutures.toArray(futureArray)).get(10, TimeUnit.SECONDS); + prepareFutures.clear(); + } + + // Once we've detected evictions, prepare as many statements as we've prepared so far to initialMaxStatementsToPrepare and then stop. + if (!hasEvicted && numberOfEvictedStatements() - initialEvicted > 0) + { + maxStatementsToPrepare = Math.min(cnt * 2, initialMaxStatementsToPrepare); + hasEvicted = true; + } + } + + long evictedStatements = numberOfEvictedStatements() - initialEvicted; + assertNotEquals("Should have evicted some prepared statements", 0, evictedStatements); + + // Recorded prepared statement removals should match metrics + assertEquals("Actual evicted statements does not match metrics", evictedStatements, preparedStatementRemoveTimestamps.size()); + + // For each prepared statement evicted, assert the time it was deleted is greater than the timestamp + // used for when it was loaded. + for (Map.Entry evictedStatementEntry : preparedStatementRemoveTimestamps.entrySet()) + { + MD5Digest key = evictedStatementEntry.getKey(); + long deletionTimestamp = evictedStatementEntry.getValue(); + long insertionTimestamp = preparedStatementLoadTimestamps.get(key); + + assertTrue(String.format("Expected deletion timestamp for prepared statement (%d) to be greater than insertion timestamp (%d)", + deletionTimestamp, insertionTimestamp), + deletionTimestamp > insertionTimestamp); + } + + // ensure the number of statements on disk match the number in memory, if number of statements on disk eclipses in memory, there was a leak. + assertEquals("Number of statements in memory (expected) and table (actual) don't match", numberOfStatementsInMemory(), numberOfStatementsOnDisk()); + } + finally + { + executor.shutdown(); + } + } + + /** + * Invoked whenever paging happens in testPreloadPreparedStatements, increments PAGE_INVOCATIONS when we detect + * paging happening in the path of QueryProcessor.preloadPreparedStatements with the expected page size. + */ + @SuppressWarnings("unused") + private static void nextPageReadQuery(ReadQuery query, int pageSize) + { + TableMetadata metadata = query.metadata(); + if (metadata.keyspace.equals(SchemaConstants.SYSTEM_KEYSPACE_NAME) && + metadata.name.equals(SystemKeyspace.PREPARED_STATEMENTS) && + pageSize == PRELOAD_PAGE_SIZE) + { + for (StackTraceElement stackTraceElement : Thread.currentThread().getStackTrace()) + { + if (stackTraceElement.getClassName().equals(QueryProcessor.class.getName()) && stackTraceElement.getMethodName().equals("preloadPreparedStatements")) + { + pageInvocations.incrementAndGet(); + return; + } + } + } + } + + @Test + @BMRule(name = "CapturePageInvocations", + targetClass = "PartitionRangeQueryPager", + targetMethod = "nextPageReadQuery(int)", + action = "org.apache.cassandra.cql3.PstmtPersistenceTest.nextPageReadQuery($this.query, $pageSize)") + public void testPreloadPreparedStatements() throws Throwable + { + ClientState clientState = ClientState.forInternalCalls(); + createTable("CREATE TABLE %s (key int primary key, val int)"); + + // Prepare more statements than the paging size to ensure paging works properly. + int statementsToPrepare = 750; + + for (int cnt = 1; cnt <= statementsToPrepare; cnt++) + { + prepareStatement("INSERT INTO %s (key, val) VALUES (?, ?) USING TIMESTAMP " + cnt, clientState); + } + + // Capture how many statements are in memory before clearing cache. + long statementsInMemory = numberOfStatementsInMemory(); + long statementsOnDisk = numberOfStatementsOnDisk(); + assertEquals(statementsOnDisk, statementsInMemory); + + // Drop prepared statements from cache only and ensure the cache empties out. + QueryProcessor.clearPreparedStatements(true); + assertEquals(0, numberOfStatementsInMemory()); + + // Load prepared statements and ensure the cache size matches max + QueryProcessor.instance.preloadPreparedStatements(PRELOAD_PAGE_SIZE); + + long statementsInMemoryAfterLoading = numberOfStatementsInMemory(); + // Ensure size of cache matches statements that were on disk before preload + assertEquals("Statements prepared - evicted (expected) does not match statements in memory (actual)", + statementsOnDisk, statementsInMemoryAfterLoading); + + // Number of statements on disk shold match memory + assertEquals(statementsInMemoryAfterLoading, numberOfStatementsOnDisk()); + + // Ensure only executed the expected amount of pages. + int expectedPageInvocations = (int) Math.ceil(statementsInMemoryAfterLoading / (double) PRELOAD_PAGE_SIZE); + assertEquals(expectedPageInvocations, pageInvocations.get()); + } + + @Test + public void testPreloadPreparedStatementsUntilCacheFull() + { + QueryHandler handler = ClientState.getCQLQueryHandler(); + ClientState clientState = ClientState.forInternalCalls(); + createTable("CREATE TABLE %s (key int primary key, val int)"); + + // Fill up and clear the prepared statement cache several times to load up the system.prepared_statements table. + // This simulates a 'leak' of prepared statements akin to CASSANDRA-19703 as the system.prepared_statements + // table is able to grow to a larger size than the in memory prepared statement cache. In such a case we + // should detect a possible leak and defer paging indefinitely by returning early in preloadPreparedStatements. + int statementsLoadedWhenFull = -1; + long accumulatedSize = 0; + // load enough prepared statements to fill the cache 5 times. + for (int cnt = 0; accumulatedSize < QueryProcessor.PREPARED_STATEMENT_CACHE_SIZE_BYTES * 5; cnt++) + { + MD5Digest id = prepareStatement("INSERT INTO %s (key, val) VALUES (?, ?) USING TIMESTAMP " + cnt, clientState); + QueryHandler.Prepared prepared = handler.getPrepared(id); + assertTrue(prepared.pstmntSize > -1); + accumulatedSize += prepared.pstmntSize; + if (statementsLoadedWhenFull == -1 && accumulatedSize > QueryProcessor.PREPARED_STATEMENT_CACHE_SIZE_BYTES) + { + statementsLoadedWhenFull = cnt; + } + // clear cache repeatedly to avoid eviction. + QueryProcessor.clearPreparedStatements(true); + } + + + int preloadedStatements = QueryProcessor.instance.preloadPreparedStatements(PRELOAD_PAGE_SIZE); + + // Should have loaded as many statements as we detected were loaded before cache would be full. + assertTrue(String.format("Preloaded %d statements, expected at least %d", + preloadedStatements, statementsLoadedWhenFull), + preloadedStatements > statementsLoadedWhenFull); + + // We should only expect to load how many statements we were able to load before filling the cache + // + a buffer of 110%, set to 1.5x just to deal with sensitivity of detecting cache filling up. + int atMostPreloadedExpected = (int) (statementsLoadedWhenFull * 1.5); + assertTrue(String.format("Preloaded %d statements, but only expected that we'd load at most %d", + preloadedStatements, atMostPreloadedExpected), + preloadedStatements <= atMostPreloadedExpected); + } + private long numberOfStatementsOnDisk() throws Throwable { UntypedResultSet.Row row = execute("SELECT COUNT(*) FROM " + SchemaConstants.SYSTEM_KEYSPACE_NAME + '.' + SystemKeyspace.PREPARED_STATEMENTS).one(); @@ -179,7 +412,6 @@ private MD5Digest prepareStatement(String stmt, ClientState clientState) private MD5Digest prepareStatement(String stmt, String keyspace, String table, ClientState clientState) { - System.out.println(stmt + String.format(stmt, keyspace + "." + table)); - return QueryProcessor.instance.prepare(String.format(stmt, keyspace + "." + table), clientState).statementId; + return QueryProcessor.instance.prepare(String.format(stmt, keyspace + '.' + table), clientState).statementId; } } diff --git a/test/unit/org/apache/cassandra/cql3/RandomSchemaTest.java b/test/unit/org/apache/cassandra/cql3/RandomSchemaTest.java index 00abdc9ca166..075fbf8e91df 100644 --- a/test/unit/org/apache/cassandra/cql3/RandomSchemaTest.java +++ b/test/unit/org/apache/cassandra/cql3/RandomSchemaTest.java @@ -20,27 +20,18 @@ import java.io.IOException; import java.nio.ByteBuffer; -import java.util.ArrayDeque; import java.util.ArrayList; import java.util.Arrays; -import java.util.Deque; -import java.util.HashSet; import java.util.Iterator; import java.util.List; -import java.util.NavigableMap; -import java.util.Set; -import java.util.TreeMap; import java.util.stream.Collectors; import com.google.common.collect.ImmutableList; import org.junit.Assert; import org.junit.Test; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import org.apache.cassandra.config.CassandraRelevantProperties; import org.apache.cassandra.config.DatabaseDescriptor; -import org.apache.cassandra.db.marshal.UserType; import org.apache.cassandra.io.sstable.format.SSTableFormat; import org.apache.cassandra.io.util.DataInputBuffer; import org.apache.cassandra.io.util.DataOutputBuffer; @@ -65,8 +56,6 @@ public class RandomSchemaTest extends CQLTester.InMemory { - private static final Logger logger = LoggerFactory.getLogger(RandomSchemaTest.class); - static { // make sure blob is always the same @@ -80,20 +69,17 @@ public void test() { // in accord branch there is a much cleaner api for this pattern... Gen domainGen = SourceDSL.integers().between(1, 100).map(i -> i < 2 ? AbstractTypeGenerators.ValueDomain.NULL : i < 4 ? AbstractTypeGenerators.ValueDomain.EMPTY_BYTES : AbstractTypeGenerators.ValueDomain.NORMAL); - // make sure ordering is determanstic, else repeatability breaks - NavigableMap> formats = new TreeMap<>(DatabaseDescriptor.getSSTableFormats()); - Gen> ssTableFormatGen = SourceDSL.arbitrary().pick(new ArrayList<>(formats.values())); + + Gen> sstableFormatGen = CassandraGenerators.sstableFormat(); qt().checkAssert(random -> { resetSchema(); // TODO : when table level override of sstable format is allowed, migrate to that - SSTableFormat sstableFormat = ssTableFormatGen.generate(random); - DatabaseDescriptor.setSelectedSSTableFormat(sstableFormat); + DatabaseDescriptor.setSelectedSSTableFormat(sstableFormatGen.generate(random)); Gen udtName = Generators.unique(IDENTIFIER_GEN); TypeGenBuilder withoutUnsafeEquality = AbstractTypeGenerators.withoutUnsafeEquality() - .withUserTypeKeyspace(KEYSPACE) .withUDTNames(udtName); TableMetadata metadata = new TableMetadataBuilder() .withKeyspaceName(KEYSPACE) @@ -101,7 +87,6 @@ public void test() .withKnownMemtables() .withDefaultTypeGen(AbstractTypeGenerators.builder() .withoutEmpty() - .withUserTypeKeyspace(KEYSPACE) .withMaxDepth(2) .withDefaultSetKey(withoutUnsafeEquality) .withoutTypeKinds(AbstractTypeGenerators.TypeKind.COUNTER) @@ -187,36 +172,6 @@ private void serde(ClusterMetadata metadata, TableMetadata tableMetadata) throws } } - private void maybeCreateUDTs(TableMetadata metadata) - { - Set udts = CassandraGenerators.extractUDTs(metadata); - if (!udts.isEmpty()) - { - Deque pending = new ArrayDeque<>(udts); - Set created = new HashSet<>(); - while (!pending.isEmpty()) - { - UserType next = pending.poll(); - Set subTypes = AbstractTypeGenerators.extractUDTs(next); - subTypes.remove(next); // it includes self - if (subTypes.isEmpty() || subTypes.stream().allMatch(t -> created.contains(t.name))) - { - String cql = next.toCqlString(true, false, false); - logger.warn("Creating UDT {}", cql); - schemaChange(cql); - created.add(next.name); - } - else - { - logger.warn("Unable to create UDT {}; following sub-types still not created: {}", - next.getCqlTypeName(), - subTypes.stream().filter(t -> !created.contains(t.name)).collect(Collectors.toSet())); - pending.add(next); - } - } - } - } - private static int primaryColumnCount(TableMetadata metadata) { return metadata.partitionKeyColumns().size() + metadata.clusteringColumns().size(); diff --git a/test/unit/org/apache/cassandra/cql3/ReservedKeywordsTest.java b/test/unit/org/apache/cassandra/cql3/ReservedKeywordsTest.java index eb860ebda02f..fc891bfba4a3 100644 --- a/test/unit/org/apache/cassandra/cql3/ReservedKeywordsTest.java +++ b/test/unit/org/apache/cassandra/cql3/ReservedKeywordsTest.java @@ -18,10 +18,14 @@ package org.apache.cassandra.cql3; +import java.lang.reflect.Modifier; + import org.junit.Test; import org.junit.Assert; + import org.apache.cassandra.exceptions.SyntaxException; +import org.assertj.core.api.SoftAssertions; public class ReservedKeywordsTest { @@ -30,14 +34,51 @@ public void testReservedWordsForColumns() { for (String reservedWord : ReservedKeywords.reservedKeywords) { - try - { - QueryProcessor.parseStatement(String.format("ALTER TABLE ks.t ADD %s TEXT", reservedWord)); + if (isAllowed(reservedWord)) Assert.fail(String.format("Reserved keyword %s should not have parsed", reservedWord)); - } - catch (SyntaxException ignore) - { - } + } + } + + @Test + public void parserAndTextFileMatch() + { + // If this test starts to fail that means that the lexer added a new keyword, and this keyword was not updated + // to be unreserved. + // + // To mark a keyword as unreserved, open "Parser.g" and search for + // basic_unreserved_keyword returns [String str] + // or + // unreserved_keyword returns [String str] + // Add your keyword there and rebuild the jar (to generate the parser). + // + // If it is desired to make this keyword reserved, then you must first go to the mailing list and request a vote + // on this change, if that vote passes then you can update "reserved_keywords.txt" (and pylib/cqlshlib/cqlhandling.py::cql_keywords_reserved). + // Never update "reserved_keywords.txt" without a vote on the mailing list! + SoftAssertions asserts = new SoftAssertions(); + for (var f : Cql_Lexer.class.getDeclaredFields()) + { + if (!Modifier.isStatic(f.getModifiers())) continue; + if (!f.getName().startsWith("K_")) continue; + String name = f.getName(); + String keyword = name.replaceFirst("K_", ""); + + asserts.assertThat(ReservedKeywords.isReserved(keyword)) + .describedAs(keyword) + .isEqualTo(!isAllowed(keyword)); + } + asserts.assertAll(); + } + + private static boolean isAllowed(String keyword) + { + try + { + QueryProcessor.parseStatement(String.format("ALTER TABLE ks.t ADD %s TEXT", keyword)); + return true; + } + catch (SyntaxException ignore) + { + return false; } } } diff --git a/test/unit/org/apache/cassandra/cql3/SimpleQueryTest.java b/test/unit/org/apache/cassandra/cql3/SimpleQueryTest.java index 0c89f9b599cb..fe160f0af6ad 100644 --- a/test/unit/org/apache/cassandra/cql3/SimpleQueryTest.java +++ b/test/unit/org/apache/cassandra/cql3/SimpleQueryTest.java @@ -559,5 +559,17 @@ public void testSStableTimestampOrdering() throws Throwable execute("DELETE FROM %s USING TIMESTAMP 6 WHERE k1 = 1"); assertRows(execute("SELECT * FROM %s WHERE k1=1"), row(1, 1, 2)); - } + } + + @Test + public void testTokenRestriction() + { + createTable("CREATE TABLE %s (id int primary key)"); + for (int i = 0; i < 10; i++) + execute("INSERT INTO %s (id) values (?)", i); + + assertRows(execute("SELECT * FROM %s where token(id) > 0 AND token(id) < " + Long.MIN_VALUE), row(7), row(6), row(9), row(3)); + assertRows(execute("SELECT * FROM %s where token(id) > 0 AND token(id) <= " + Long.MIN_VALUE), row(7), row(6), row(9), row(3)); + assertRows(execute("SELECT * FROM %s where token(id) BETWEEN 0 AND " + Long.MIN_VALUE), row(7), row(6), row(9), row(3)); + } } diff --git a/test/unit/org/apache/cassandra/cql3/StatementSourceTest.java b/test/unit/org/apache/cassandra/cql3/StatementSourceTest.java new file mode 100644 index 000000000000..b6362747d575 --- /dev/null +++ b/test/unit/org/apache/cassandra/cql3/StatementSourceTest.java @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.cql3; + +import org.junit.Test; + +import org.antlr.runtime.Token; +import org.mockito.Mockito; + +import static org.apache.cassandra.cql3.StatementSource.create; +import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.Mockito.when; + +public class StatementSourceTest +{ + private static Token token(int line, int pos) + { + Token token = Mockito.mock(Token.class); + when(token.getLine()).thenReturn(line); + when(token.getCharPositionInLine()).thenReturn(pos); + when(token.getType()).thenReturn(1); + return token; + } + + private static Token eof() + { + Token token = Mockito.mock(Token.class); + when(token.getLine()).thenThrow(UnsupportedOperationException.class); + when(token.getCharPositionInLine()).thenThrow(UnsupportedOperationException.class); + when(token.getType()).thenReturn(Token.EOF); + return token; + } + + @Test + public void test() + { + assertThat(create(token(1, 4))).hasToString("at [1:5]"); + assertThat(create(token(3, 8))).hasToString("at [3:9]"); + assertThat(create(token(6, 8))).hasToString("at [6:9]"); + assertThat(create(token(1, 0))).hasToString("at [1:1]"); + assertThat(create(eof()).toString()).isEmpty(); + + assertThat(StatementSource.INTERNAL).hasToString("<<>>"); + } +} diff --git a/test/unit/org/apache/cassandra/cql3/UntypedResultSetTest.java b/test/unit/org/apache/cassandra/cql3/UntypedResultSetTest.java new file mode 100644 index 000000000000..4973cae87de2 --- /dev/null +++ b/test/unit/org/apache/cassandra/cql3/UntypedResultSetTest.java @@ -0,0 +1,106 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.cql3; + +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.List; +import java.util.stream.Collectors; + +import org.junit.Test; + +import accord.utils.Gen; +import accord.utils.Gens; +import org.apache.cassandra.db.marshal.AbstractType; +import org.apache.cassandra.db.marshal.DoubleType; +import org.apache.cassandra.utils.AbstractTypeGenerators; +import org.apache.cassandra.utils.Generators; + +import static accord.utils.Property.qt; +import static org.apache.cassandra.utils.AccordGenerators.fromQT; +import static org.assertj.core.api.Assertions.assertThat; + +public class UntypedResultSetTest +{ + @Test + public void rowToString() + { + qt().forAll(row()).check(row -> { + String str = row.toString(); + assertThat(str.split(" \\| ")).hasSize(row.getColumns().size()); + assertThat(str).doesNotContain("null"); + }); + } + + @Test + public void resultSetToString() + { + qt().forAll(resultSet().map(UntypedResultSet::create)).check(rs -> { + String str = rs.toStringUnsafe(); + assertThat(str.split("\n")).describedAs("toStringUnsafe expected to return different size of rows", str).hasSize(rs.size() + 2); // header + footer + }); + } + + private static Gen> columns() + { + Gen identifierGen = fromQT(Generators.IDENTIFIER_GEN); + // this is testing toString so don't really need a complex type... + return rs -> { + int numColumns = rs.nextInt(1, 10); + String ks = identifierGen.next(rs); + String tableName = identifierGen.next(rs); + List names = Gens.lists(identifierGen).unique().ofSize(numColumns).next(rs); + // rather than generate the type, use a simple type like double as it doesn't matter... the type is not epxected to be parsable, so conflicts in output format doen't matter + List> types = names.stream().map(ignore -> DoubleType.instance).collect(Collectors.toList()); + List columns = new ArrayList<>(numColumns); + for (int i = 0; i < numColumns; i++) + columns.add(new ColumnSpecification(ks, tableName, new ColumnIdentifier(names.get(i), true), types.get(i))); + return columns; + }; + } + + private static Gen row() + { + return columns().flatMap(columns -> rs -> { + List data = new ArrayList<>(columns.size()); + for (int i = 0; i < columns.size(); i++) + { + AbstractTypeGenerators.TypeSupport support = AbstractTypeGenerators.getTypeSupport(columns.get(i).type); + data.add(fromQT(support.bytesGen()).next(rs)); + } + return new UntypedResultSet.Row(columns, data); + }); + } + + private static Gen resultSet() + { + Gen> columnsGen = columns(); + return rs -> { + ResultSet result = new ResultSet(new ResultSet.ResultMetadata(columnsGen.next(rs))); + List> dataGens = result.metadata.names.stream().map(c -> fromQT(AbstractTypeGenerators.getTypeSupport(c.type).bytesGen())).collect(Collectors.toList()); + int numRows = rs.nextInt(0, 10); + for (int i = 0; i < numRows; i++) + { + List row = dataGens.stream().map(g -> g.next(rs)).collect(Collectors.toList()); + result.addRow(row); + } + return result; + }; + } +} \ No newline at end of file diff --git a/test/unit/org/apache/cassandra/cql3/ast/AssignmentOperator.java b/test/unit/org/apache/cassandra/cql3/ast/AssignmentOperator.java index e3918f70da7c..499b36551f48 100644 --- a/test/unit/org/apache/cassandra/cql3/ast/AssignmentOperator.java +++ b/test/unit/org/apache/cassandra/cql3/ast/AssignmentOperator.java @@ -123,4 +123,10 @@ public Expression visit(Visitor v) if (r == right) return this; return new AssignmentOperator(kind, r); } + + @Override + public String toString() + { + return debugCQL(); + } } diff --git a/test/unit/org/apache/cassandra/cql3/ast/CasCondition.java b/test/unit/org/apache/cassandra/cql3/ast/CasCondition.java index d0d4d0e35b09..c60e27a85df2 100644 --- a/test/unit/org/apache/cassandra/cql3/ast/CasCondition.java +++ b/test/unit/org/apache/cassandra/cql3/ast/CasCondition.java @@ -24,6 +24,11 @@ public interface CasCondition extends Element { CasCondition visit(Visitor v); + default String debugCQL() + { + return visit(StandardVisitors.DEBUG).toCQL(); + } + enum Simple implements CasCondition { NotExists("IF NOT EXISTS"), @@ -80,5 +85,11 @@ public CasCondition visit(Visitor v) if (c == conditional) return this; return new IfCondition(c); } + + @Override + public String toString() + { + return toCQL(); + } } } diff --git a/test/unit/org/apache/cassandra/cql3/ast/Conditional.java b/test/unit/org/apache/cassandra/cql3/ast/Conditional.java index 66012c060cba..4fc8a2085085 100644 --- a/test/unit/org/apache/cassandra/cql3/ast/Conditional.java +++ b/test/unit/org/apache/cassandra/cql3/ast/Conditional.java @@ -50,6 +50,11 @@ default Conditional visit(Visitor v) return v.visit(this); } + default String debugCQL() + { + return visit(StandardVisitors.DEBUG).toCQL(); + } + default List simplify() { return Collections.singletonList(this); diff --git a/test/unit/org/apache/cassandra/cql3/ast/CreateIndexDDL.java b/test/unit/org/apache/cassandra/cql3/ast/CreateIndexDDL.java index ce86d6bbb500..3cdf2eba0cc4 100644 --- a/test/unit/org/apache/cassandra/cql3/ast/CreateIndexDDL.java +++ b/test/unit/org/apache/cassandra/cql3/ast/CreateIndexDDL.java @@ -23,12 +23,18 @@ import java.util.List; import java.util.Map; import java.util.Optional; +import java.util.Set; import java.util.stream.Stream; +import com.google.common.collect.ImmutableSet; + import org.apache.cassandra.cql3.ast.Symbol.UnquotedSymbol; import org.apache.cassandra.db.marshal.AbstractType; +import org.apache.cassandra.db.marshal.AsciiType; +import org.apache.cassandra.db.marshal.BooleanType; import org.apache.cassandra.db.marshal.CompositeType; import org.apache.cassandra.db.marshal.UTF8Type; +import org.apache.cassandra.db.marshal.UUIDType; import org.apache.cassandra.index.sai.StorageAttachedIndex; import org.apache.cassandra.index.sai.utils.IndexTermType; import org.apache.cassandra.schema.ColumnMetadata; @@ -101,6 +107,10 @@ public boolean supported(TableMetadata table, ColumnMetadata column) } }; + private static final Set> SAI_EQ_ONLY = ImmutableSet.of(UTF8Type.instance, AsciiType.instance, + BooleanType.instance, + UUIDType.instance); + public static final Indexer SAI = new Indexer() { @Override diff --git a/test/unit/org/apache/cassandra/cql3/ast/Elements.java b/test/unit/org/apache/cassandra/cql3/ast/Elements.java index f750c4efc10d..4713c30c15be 100644 --- a/test/unit/org/apache/cassandra/cql3/ast/Elements.java +++ b/test/unit/org/apache/cassandra/cql3/ast/Elements.java @@ -18,6 +18,8 @@ package org.apache.cassandra.cql3.ast; +import java.util.stream.Stream; + public final class Elements { private Elements() @@ -29,4 +31,11 @@ public static void newLine(StringBuilder sb, int indent) for (int i = 0; i < indent; i++) sb.append(' '); } + + public static Stream symbols(Element element) + { + return element.streamRecursive(true) + .filter(e -> e instanceof Symbol) + .map(e -> (Symbol) e); + } } diff --git a/test/unit/org/apache/cassandra/cql3/ast/Expression.java b/test/unit/org/apache/cassandra/cql3/ast/Expression.java index 96bc41a78c9e..9a0e554968df 100644 --- a/test/unit/org/apache/cassandra/cql3/ast/Expression.java +++ b/test/unit/org/apache/cassandra/cql3/ast/Expression.java @@ -32,4 +32,9 @@ default Expression visit(Visitor v) { return v.visit(this); } + + default String debugCQL() + { + return visit(StandardVisitors.DEBUG).toCQL(); + } } diff --git a/test/unit/org/apache/cassandra/cql3/ast/ExpressionTest.java b/test/unit/org/apache/cassandra/cql3/ast/ExpressionTest.java index 333f3dec8e1d..a2805e85690c 100644 --- a/test/unit/org/apache/cassandra/cql3/ast/ExpressionTest.java +++ b/test/unit/org/apache/cassandra/cql3/ast/ExpressionTest.java @@ -122,4 +122,4 @@ private static Gen expressions() return rs.nextBoolean() ? Literal.of(value) : Bind.of(value); }; } -} \ No newline at end of file +} diff --git a/test/unit/org/apache/cassandra/cql3/ast/Literal.java b/test/unit/org/apache/cassandra/cql3/ast/Literal.java index 4bd2f9b6319c..eb6d83df41e0 100644 --- a/test/unit/org/apache/cassandra/cql3/ast/Literal.java +++ b/test/unit/org/apache/cassandra/cql3/ast/Literal.java @@ -23,6 +23,7 @@ import org.apache.cassandra.db.marshal.AbstractType; import org.apache.cassandra.db.marshal.BytesType; import org.apache.cassandra.db.marshal.Int32Type; +import org.apache.cassandra.db.marshal.LongType; import org.apache.cassandra.db.marshal.StringType; public class Literal implements Value @@ -41,6 +42,11 @@ public static Literal of(int value) return new Literal(value, Int32Type.instance); } + public static Literal of(long value) + { + return new Literal(value, LongType.instance); + } + @Override public AbstractType type() { diff --git a/test/unit/org/apache/cassandra/cql3/ast/Mutation.java b/test/unit/org/apache/cassandra/cql3/ast/Mutation.java index 95987dc657ac..7c764853e5b5 100644 --- a/test/unit/org/apache/cassandra/cql3/ast/Mutation.java +++ b/test/unit/org/apache/cassandra/cql3/ast/Mutation.java @@ -27,10 +27,10 @@ import java.util.List; import java.util.Optional; import java.util.Set; +import java.util.function.Function; import java.util.stream.Stream; import javax.annotation.Nullable; -import org.apache.cassandra.cql3.ColumnIdentifier; import org.apache.cassandra.db.marshal.Int32Type; import org.apache.cassandra.db.marshal.LongType; import org.apache.cassandra.schema.ColumnMetadata; @@ -50,6 +50,23 @@ protected Mutation(Kind kind, TableReference table) this.table = table; } + public Insert asInsert() + { + return (Insert) this; + } + + public Update asUpdate() + { + return (Update) this; + } + + public Delete asDelete() + { + return (Delete) this; + } + + public abstract long timestampOrDefault(long defaultValue); + public abstract boolean isCas(); public abstract Mutation withoutTimestamp(); @@ -61,6 +78,9 @@ public Mutation withTimestamp(long timestamp) public abstract Mutation withTimestamp(Timestamp timestamp); + + public abstract Optional casCondition(); + public final Kind mutationKind() { return kind; @@ -161,6 +181,13 @@ public Stream stream() { return Stream.of(value); } + + public long get() + { + if (value.value() instanceof Long) + return (long) value.value(); + return LongType.instance.compose(value.valueEncoded()); + } } public static class Using implements Element @@ -168,15 +195,24 @@ public static class Using implements Element public final Optional ttl; public final Optional timestamp; - public Using(Optional ttl, Optional timestamp) + private Using(Optional ttl, Optional timestamp) { this.ttl = ttl; this.timestamp = timestamp; + if (ttl.isEmpty() && timestamp.isEmpty()) + throw new IllegalStateException("Empty USING isnt allowed"); + } + + public static Optional create(Optional ttl, Optional timestamp) + { + if (ttl.isEmpty() && timestamp.isEmpty()) return Optional.empty(); + return Optional.of(new Using(ttl, timestamp)); } - public Using withoutTimestamp() + public Optional withoutTimestamp() { - return new Using(ttl, Optional.empty()); + if (ttl.isEmpty()) return Optional.empty(); + return Optional.of(new Using(ttl, Optional.empty())); } public Using withTimestamp(Timestamp timestamp) @@ -187,8 +223,6 @@ public Using withTimestamp(Timestamp timestamp) @Override public void toCQL(StringBuilder sb, CQLFormatter formatter) { - if (ttl.isEmpty() && timestamp.isEmpty()) - return; sb.append("USING "); if (ttl.isPresent()) ttl.get().toCQL(sb, formatter); @@ -227,6 +261,16 @@ public Insert(TableReference table, LinkedHashMap values, bo this.using = using; } + @Override + public long timestampOrDefault(long defaultValue) + { + if (using.isEmpty()) return defaultValue; + var opt = using.get().timestamp; + if (opt.isEmpty()) return defaultValue; + var timestamp = opt.get(); + return timestamp.get(); + } + @Override public void toCQL(StringBuilder sb, CQLFormatter formatter) { @@ -311,7 +355,7 @@ public Mutation withoutTimestamp() { return new Insert(table, values, ifNotExists, using.isEmpty() ? using - : using.map(u -> u.withoutTimestamp())); + : using.flatMap(u -> u.withoutTimestamp())); } @Override @@ -321,6 +365,12 @@ public Insert withTimestamp(Timestamp timestamp) ? Optional.of(new Using(Optional.empty(), Optional.of(timestamp))) : using.map(u -> u.withTimestamp(timestamp))); } + + @Override + public Optional casCondition() + { + return ifNotExists ? Optional.of(CasCondition.Simple.NotExists) : Optional.empty(); + } } public static class Update extends Mutation @@ -339,6 +389,16 @@ public Update(TableReference table, Optional using, LinkedHashMap u.withoutTimestamp()), set, where, casCondition); + return new Update(table, using.isEmpty() ? using : using.flatMap(u -> u.withoutTimestamp()), set, where, casCondition); } @Override @@ -455,6 +515,12 @@ public Update withTimestamp(Timestamp timestamp) : using.map(u -> u.withTimestamp(timestamp)); return new Update(table, updated, set, where, casCondition); } + + @Override + public Optional casCondition() + { + return casCondition; + } } public static class Delete extends Mutation @@ -477,6 +543,15 @@ public Delete(List columns, this.casCondition = casCondition; } + @Override + public long timestampOrDefault(long defaultValue) + { + var opt = timestamp; + if (opt.isEmpty()) return defaultValue; + var timestamp = opt.get(); + return timestamp.get(); + } + /* DELETE [column_name (term)][, ...] FROM [keyspace_name.] table_name @@ -585,6 +660,12 @@ public Delete withTimestamp(Timestamp timestamp) { return new Delete(columns, table, Optional.of(timestamp), where, casCondition); } + + @Override + public Optional casCondition() + { + return casCondition; + } } public static abstract class BaseBuilder> implements Conditional.EqBuilderPlus @@ -612,6 +693,11 @@ protected BaseBuilder(Kind kind, TableMetadata table) neededPks.addAll(partitionColumns); } + protected Symbol find(String name) + { + return allColumns.stream().filter(s -> s.symbol.equals(name)).findAny().get(); + } + public abstract T build(); @Override @@ -678,6 +764,11 @@ public InsertBuilder ifNotExists() return this; } + public InsertBuilder timestamp(long value) + { + return timestamp(Literal.of(value)); + } + public InsertBuilder timestamp(Value value) { this.timestamp = new Timestamp(value); @@ -727,6 +818,11 @@ protected UpdateBuilder(TableMetadata table) super(Kind.UPDATE, table); } + public UpdateBuilder timestamp(long value) + { + return timestamp(Literal.of(value)); + } + public UpdateBuilder timestamp(Value value) { this.timestamp = new Timestamp(value); @@ -766,18 +862,32 @@ public UpdateBuilder set(Symbol column, Expression value) public UpdateBuilder set(String column, int value) { - return set(new Symbol(column, Int32Type.instance), Bind.of(value)); + Symbol symbol = find(column); + if (!symbol.type().equals(Int32Type.instance)) + throw new AssertionError("Expected int type but given " + symbol.type().asCQL3Type()); + return set(symbol, Bind.of(value)); + } + + public UpdateBuilder set(String column, Object value) + { + Symbol symbol = find(column); + return set(symbol, new Bind(value, symbol.type())); } public UpdateBuilder set(String column, Expression expression) { - Symbol symbol = new Symbol(metadata.getColumn(new ColumnIdentifier(column, true))); - return set(symbol, expression); + return set(find(column), expression); + } + + public UpdateBuilder set(String column, Function fn) + { + Symbol symbol = find(column); + return set(symbol, fn.apply(symbol)); } public UpdateBuilder set(String column, String value) { - Symbol symbol = new Symbol(metadata.getColumn(new ColumnIdentifier(column, true))); + Symbol symbol = find(column); return set(symbol, new Bind(symbol.type().asCQL3Type().fromCQLLiteral(value), symbol.type())); } @@ -857,9 +967,15 @@ public List columns() return Collections.unmodifiableList(columns); } + public DeleteBuilder columns(String... names) + { + Stream.of(names).map(this::find).forEach(this::column); + return this; + } + public DeleteBuilder column(String columnName) { - return column(Symbol.from(metadata.getColumn(new ColumnIdentifier(columnName, true)))); + return column(find(columnName)); } public DeleteBuilder column(Symbol symbol) @@ -881,6 +997,11 @@ public DeleteBuilder column(List symbols) return this; } + public DeleteBuilder timestamp(long value) + { + return timestamp(Literal.of(value)); + } + public DeleteBuilder timestamp(Value value) { this.timestamp = new Timestamp(value); diff --git a/test/unit/org/apache/cassandra/cql3/ast/Operator.java b/test/unit/org/apache/cassandra/cql3/ast/Operator.java index d0baa10ec01c..35d745142d63 100644 --- a/test/unit/org/apache/cassandra/cql3/ast/Operator.java +++ b/test/unit/org/apache/cassandra/cql3/ast/Operator.java @@ -103,4 +103,10 @@ public Expression visit(Visitor v) if (left == this.left && right == this.right) return this; return new Operator(kind, left, right); } + + @Override + public String toString() + { + return visit(StandardVisitors.DEBUG).toCQL(); + } } diff --git a/test/unit/org/apache/cassandra/cql3/ast/Select.java b/test/unit/org/apache/cassandra/cql3/ast/Select.java index 28134dde8e95..10d98dee6305 100644 --- a/test/unit/org/apache/cassandra/cql3/ast/Select.java +++ b/test/unit/org/apache/cassandra/cql3/ast/Select.java @@ -29,6 +29,7 @@ import org.apache.cassandra.db.marshal.AbstractType; import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.utils.ImmutableUniqueList; public class Select implements Statement { @@ -479,11 +480,15 @@ public Builder table(TableMetadata table) public static class TableBasedBuilder extends BaseBuilder implements Conditional.ConditionalBuilderPlus { private final TableMetadata metadata; + private final ImmutableUniqueList columns; public TableBasedBuilder(TableMetadata metadata) { this.metadata = metadata; source = Optional.of(TableReference.from(metadata)); + var builder = ImmutableUniqueList.builder(); + metadata.allColumnsInSelectOrder().forEachRemaining(c -> builder.add(Symbol.from(c))); + columns = builder.buildAndClear(); } @Override @@ -491,5 +496,15 @@ public TableMetadata metadata() { return metadata; } + + private Symbol find(String name) + { + return columns.stream().filter(s -> s.symbol.equals(name)).findAny().get(); + } + + public TableBasedBuilder columnSelection(String name) + { + return selection(find(name)); + } } } diff --git a/test/unit/org/apache/cassandra/cql3/ast/StandardVisitors.java b/test/unit/org/apache/cassandra/cql3/ast/StandardVisitors.java index 4cbf3d989f03..85c8a8bc7040 100644 --- a/test/unit/org/apache/cassandra/cql3/ast/StandardVisitors.java +++ b/test/unit/org/apache/cassandra/cql3/ast/StandardVisitors.java @@ -30,6 +30,16 @@ public Value visit(Value v) return new Literal(b.value(), b.type()); } }; + public static final Visitor LITERAL_TO_BIND = new Visitor() + { + @Override + public Value visit(Value v) + { + if (!(v instanceof Literal)) return v; + Literal b = (Literal) v; + return new Bind(b.value(), b.type()); + } + }; public static final Visitor UNWRAP_TYPE_HINT = new Visitor() { diff --git a/test/unit/org/apache/cassandra/cql3/ast/Statement.java b/test/unit/org/apache/cassandra/cql3/ast/Statement.java index ffcba03465b8..fd9a9ab2a964 100644 --- a/test/unit/org/apache/cassandra/cql3/ast/Statement.java +++ b/test/unit/org/apache/cassandra/cql3/ast/Statement.java @@ -51,12 +51,17 @@ default String detailedToString() { Object[] binds = binds(); return "CQL:\n" + toCQL() + "\nBinds:\n" + IntStream.range(0, binds.length) - .mapToObj(i -> i + " -> " + binds[i].getClass().getCanonicalName() + "(" + normalize(binds[i]) + ")") + .mapToObj(i -> i + " -> " + binds[i] == null ? "null" : binds[i].getClass().getCanonicalName() + "(" + normalize(binds[i]) + ")") .collect(Collectors.joining("\n")); } Statement visit(Visitor v); + default String debugCQL() + { + return visit(StandardVisitors.DEBUG).toCQL(); + } + static boolean hasByteBuffer(Object value) { if (value == null) diff --git a/test/unit/org/apache/cassandra/cql3/ast/Value.java b/test/unit/org/apache/cassandra/cql3/ast/Value.java index 92ef0101831d..ab467c3b8378 100644 --- a/test/unit/org/apache/cassandra/cql3/ast/Value.java +++ b/test/unit/org/apache/cassandra/cql3/ast/Value.java @@ -37,4 +37,9 @@ default Value visit(Visitor v) { return v.visit(this); } + + default String debugCQL() + { + return visit(StandardVisitors.DEBUG).toCQL(); + } } diff --git a/test/unit/org/apache/cassandra/cql3/ast/Visitor.java b/test/unit/org/apache/cassandra/cql3/ast/Visitor.java index c87415e98148..36fb3f33d63c 100644 --- a/test/unit/org/apache/cassandra/cql3/ast/Visitor.java +++ b/test/unit/org/apache/cassandra/cql3/ast/Visitor.java @@ -92,7 +92,7 @@ public static CompositeVisitor of(Visitor... visitors) public static CompositeVisitor of(List visitors) { - Invariants.checkArgument(!visitors.isEmpty(), "Visitors may not be empty"); + Invariants.requireArgument(!visitors.isEmpty(), "Visitors may not be empty"); if (Stream.of(visitors).noneMatch(v -> v instanceof CompositeVisitor)) return new CompositeVisitor(visitors); diff --git a/test/unit/org/apache/cassandra/cql3/conditions/ColumnConditionTest.java b/test/unit/org/apache/cassandra/cql3/conditions/ColumnConditionTest.java index 771c8c357064..ea19dd381d87 100644 --- a/test/unit/org/apache/cassandra/cql3/conditions/ColumnConditionTest.java +++ b/test/unit/org/apache/cassandra/cql3/conditions/ColumnConditionTest.java @@ -18,46 +18,80 @@ package org.apache.cassandra.cql3.conditions; import java.nio.ByteBuffer; -import java.util.*; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.SortedMap; +import java.util.SortedSet; +import java.util.TreeMap; +import java.util.TreeSet; -import org.apache.cassandra.cql3.terms.*; import org.junit.Assert; import org.junit.Test; -import org.apache.cassandra.cql3.*; +import accord.utils.Gen; +import accord.utils.Gens; +import accord.utils.RandomSource; +import org.apache.cassandra.cql3.ColumnIdentifier; +import org.apache.cassandra.cql3.ColumnsExpression; +import org.apache.cassandra.cql3.FieldIdentifier; +import org.apache.cassandra.cql3.Operator; +import org.apache.cassandra.cql3.QueryOptions; import org.apache.cassandra.cql3.terms.Constants; +import org.apache.cassandra.cql3.terms.InMarker; +import org.apache.cassandra.cql3.terms.Marker; import org.apache.cassandra.cql3.terms.MultiElements; +import org.apache.cassandra.cql3.terms.Sets; import org.apache.cassandra.cql3.terms.Term; import org.apache.cassandra.cql3.terms.Terms; import org.apache.cassandra.db.Clustering; +import org.apache.cassandra.db.marshal.AbstractType; import org.apache.cassandra.db.marshal.Int32Type; import org.apache.cassandra.db.marshal.ListType; import org.apache.cassandra.db.marshal.MapType; import org.apache.cassandra.db.marshal.SetType; -import org.apache.cassandra.db.rows.*; +import org.apache.cassandra.db.marshal.UserType; +import org.apache.cassandra.db.rows.BTreeRow; +import org.apache.cassandra.db.rows.BufferCell; +import org.apache.cassandra.db.rows.Cell; +import org.apache.cassandra.db.rows.CellPath; +import org.apache.cassandra.db.rows.Row; import org.apache.cassandra.exceptions.InvalidRequestException; +import org.apache.cassandra.io.Serializers; +import org.apache.cassandra.io.util.DataOutputBuffer; import org.apache.cassandra.schema.ColumnMetadata; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.service.accord.serializers.TableMetadatas; +import org.apache.cassandra.utils.AbstractTypeGenerators; +import org.apache.cassandra.utils.AbstractTypeGenerators.TypeKind; import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.CassandraGenerators; +import org.apache.cassandra.utils.Generators; +import org.apache.cassandra.utils.Pair; import org.apache.cassandra.utils.TimeUUID; +import org.assertj.core.api.Assertions; +import org.quicktheories.generators.SourceDSL; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.fail; +import static accord.utils.Property.qt; import static java.util.Arrays.asList; - +import static org.apache.cassandra.cql3.Operator.CONTAINS; +import static org.apache.cassandra.cql3.Operator.CONTAINS_KEY; import static org.apache.cassandra.cql3.Operator.EQ; -import static org.apache.cassandra.cql3.Operator.NEQ; -import static org.apache.cassandra.cql3.Operator.LT; -import static org.apache.cassandra.cql3.Operator.LTE; import static org.apache.cassandra.cql3.Operator.GT; import static org.apache.cassandra.cql3.Operator.GTE; -import static org.apache.cassandra.cql3.Operator.CONTAINS; -import static org.apache.cassandra.cql3.Operator.CONTAINS_KEY; -import static org.apache.cassandra.cql3.conditions.ColumnCondition.Raw.simpleCondition; +import static org.apache.cassandra.cql3.Operator.LT; +import static org.apache.cassandra.cql3.Operator.LTE; +import static org.apache.cassandra.cql3.Operator.NEQ; import static org.apache.cassandra.cql3.conditions.ColumnCondition.Raw.collectionElementCondition; +import static org.apache.cassandra.cql3.conditions.ColumnCondition.Raw.simpleCondition; import static org.apache.cassandra.cql3.conditions.ColumnCondition.Raw.udtFieldCondition; import static org.apache.cassandra.utils.ByteBufferUtil.EMPTY_BYTE_BUFFER; +import static org.assertj.core.api.Assertions.assertThatThrownBy; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; public class ColumnConditionTest @@ -65,6 +99,17 @@ public class ColumnConditionTest public static final ByteBuffer ZERO = Int32Type.instance.fromString("0"); public static final ByteBuffer ONE = Int32Type.instance.fromString("1"); public static final ByteBuffer TWO = Int32Type.instance.fromString("2"); + public static final String KEYSPACE = "ks"; + public static final FieldIdentifier UDT_FIELD_A = FieldIdentifier.forUnquoted("a"); + public static final FieldIdentifier UDT_FIELD_B = FieldIdentifier.forUnquoted("b"); + public static final UserType UDT_FROZEN = new UserType(KEYSPACE, ByteBufferUtil.bytes("simple"), + Arrays.asList(UDT_FIELD_A, UDT_FIELD_B), + Arrays.asList(Int32Type.instance, Int32Type.instance), + false); + public static final UserType UDT_MULTI_CELL = new UserType(KEYSPACE, ByteBufferUtil.bytes("simple"), + Arrays.asList(UDT_FIELD_A, UDT_FIELD_B), + Arrays.asList(Int32Type.instance, Int32Type.instance), + true); private static Row newRow(ColumnMetadata definition, ByteBuffer value) { @@ -138,8 +183,8 @@ private static Row newRow(ColumnMetadata definition, Map private static boolean appliesSimpleCondition(ByteBuffer rowValue, Operator op, ByteBuffer conditionValue) { - ColumnMetadata definition = ColumnMetadata.regularColumn("ks", "cf", "c", Int32Type.instance); - ColumnsExpression column = ColumnsExpression.singleColumn(definition); + ColumnMetadata definition = ColumnMetadata.regularColumn("ks", "cf", "c", Int32Type.instance, ColumnMetadata.NO_UNIQUE_ID); + ColumnsExpression column = ColumnsExpression.singleColumn(definition, null); Terms terms = Terms.of(new Constants.Value(conditionValue)); ColumnCondition condition = new ColumnCondition(column, op, terms); ColumnCondition.Bound bound = condition.bind(QueryOptions.DEFAULT); @@ -149,8 +194,8 @@ private static boolean appliesSimpleCondition(ByteBuffer rowValue, Operator op, private static boolean appliesListCondition(List rowValue, Operator op, List conditionValue) { ListType type = ListType.getInstance(Int32Type.instance, true); - ColumnMetadata definition = ColumnMetadata.regularColumn("ks", "cf", "c", type); - ColumnsExpression column = ColumnsExpression.singleColumn(definition); + ColumnMetadata definition = ColumnMetadata.regularColumn("ks", "cf", "c", type, ColumnMetadata.NO_UNIQUE_ID); + ColumnsExpression column = ColumnsExpression.singleColumn(definition, null); Term term = conditionValue == null ? Constants.NULL_VALUE : new MultiElements.Value(type, conditionValue); ColumnCondition condition = new ColumnCondition(column, op, Terms.of(term)); ColumnCondition.Bound bound = condition.bind(QueryOptions.DEFAULT); @@ -159,8 +204,8 @@ private static boolean appliesListCondition(List rowValue, Operator private static boolean conditionContainsApplies(List rowValue, Operator op, ByteBuffer conditionValue) { - ColumnMetadata definition = ColumnMetadata.regularColumn("ks", "cf", "c", ListType.getInstance(Int32Type.instance, true)); - ColumnsExpression column = ColumnsExpression.singleColumn(definition); + ColumnMetadata definition = ColumnMetadata.regularColumn("ks", "cf", "c", ListType.getInstance(Int32Type.instance, true), ColumnMetadata.NO_UNIQUE_ID); + ColumnsExpression column = ColumnsExpression.singleColumn(definition, null); Terms terms = Terms.of(new Constants.Value(conditionValue)); ColumnCondition condition = new ColumnCondition(column, op, terms); ColumnCondition.Bound bound = condition.bind(QueryOptions.DEFAULT); @@ -169,8 +214,8 @@ private static boolean conditionContainsApplies(List rowValue, Opera private static boolean conditionContainsApplies(Map rowValue, Operator op, ByteBuffer conditionValue) { - ColumnMetadata definition = ColumnMetadata.regularColumn("ks", "cf", "c", MapType.getInstance(Int32Type.instance, Int32Type.instance, true)); - ColumnsExpression column = ColumnsExpression.singleColumn(definition); + ColumnMetadata definition = ColumnMetadata.regularColumn("ks", "cf", "c", MapType.getInstance(Int32Type.instance, Int32Type.instance, true), ColumnMetadata.NO_UNIQUE_ID); + ColumnsExpression column = ColumnsExpression.singleColumn(definition, null); Terms terms = Terms.of(new Constants.Value(conditionValue)); ColumnCondition condition = new ColumnCondition(column, op, terms); ColumnCondition.Bound bound = condition.bind(QueryOptions.DEFAULT); @@ -180,8 +225,8 @@ private static boolean conditionContainsApplies(Map rowV private static boolean appliesSetCondition(SortedSet rowValue, Operator op, SortedSet conditionValue) { SetType type = SetType.getInstance(Int32Type.instance, true); - ColumnMetadata definition = ColumnMetadata.regularColumn("ks", "cf", "c", type); - ColumnsExpression column = ColumnsExpression.singleColumn(definition); + ColumnMetadata definition = ColumnMetadata.regularColumn("ks", "cf", "c", type, ColumnMetadata.NO_UNIQUE_ID); + ColumnsExpression column = ColumnsExpression.singleColumn(definition, null); Term term = conditionValue == null ? Constants.NULL_VALUE : new MultiElements.Value(type, new ArrayList<>(conditionValue)); ColumnCondition condition = new ColumnCondition(column, op, Terms.of(term)); ColumnCondition.Bound bound = condition.bind(QueryOptions.DEFAULT); @@ -190,8 +235,8 @@ private static boolean appliesSetCondition(SortedSet rowValue, Opera private static boolean conditionContainsApplies(SortedSet rowValue, Operator op, ByteBuffer conditionValue) { - ColumnMetadata definition = ColumnMetadata.regularColumn("ks", "cf", "c", SetType.getInstance(Int32Type.instance, true)); - ColumnsExpression column = ColumnsExpression.singleColumn(definition); + ColumnMetadata definition = ColumnMetadata.regularColumn("ks", "cf", "c", SetType.getInstance(Int32Type.instance, true), ColumnMetadata.NO_UNIQUE_ID); + ColumnsExpression column = ColumnsExpression.singleColumn(definition, null); Terms terms = Terms.of(new Constants.Value(conditionValue)); ColumnCondition condition = new ColumnCondition(column, op, terms); @@ -199,10 +244,51 @@ private static boolean conditionContainsApplies(SortedSet rowValue, return bound.appliesTo(newRow(definition, rowValue)); } + private boolean conditionUDTApplies(ByteBuffer rowValue, Operator op, ByteBuffer conditionValue) + { + boolean frozen = conditionUDTApplies(UDT_FROZEN, rowValue, op, conditionValue); + boolean multi = conditionUDTApplies(UDT_MULTI_CELL, rowValue, op, conditionValue); + Assertions.assertThat(frozen).isEqualTo(multi); + return frozen; + } + + private boolean conditionUDTApplies(UserType ut, ByteBuffer rowValue, Operator op, ByteBuffer conditionValue) + { + ColumnMetadata column = ColumnMetadata.regularColumn(KEYSPACE, "tbl", "c", ut, ColumnMetadata.NO_UNIQUE_ID); + ColumnCondition.ElementOrFieldAccessBound bounds = new ColumnCondition.ElementOrFieldAccessBound(column, null, UDT_FIELD_A.bytes, op, conditionValue); + Row row; + if (ut.isMultiCell()) + { + Row.Builder builder = BTreeRow.sortedBuilder(); + builder.newRow(Clustering.EMPTY); + if (rowValue != null) + { + builder.addCell(new BufferCell(column, + 0L, + Cell.NO_TTL, + Cell.NO_DELETION_TIME, + rowValue, + ut.cellPathForField(UDT_FIELD_A))); + builder.addCell(new BufferCell(column, + 0L, + Cell.NO_TTL, + Cell.NO_DELETION_TIME, + EMPTY_BYTE_BUFFER, + ut.cellPathForField(UDT_FIELD_B))); + } + row = builder.build(); + } + else + { + row = newRow(column, ut.pack(rowValue, EMPTY_BYTE_BUFFER)); + } + return bounds.appliesTo(row); + } + private static boolean appliesMapCondition(Map rowValue, Operator op, SortedMap conditionValue) { MapType type = MapType.getInstance(Int32Type.instance, Int32Type.instance, true); - ColumnMetadata definition = ColumnMetadata.regularColumn("ks", "cf", "c", type); + ColumnMetadata definition = ColumnMetadata.regularColumn("ks", "cf", "c", type, ColumnMetadata.NO_UNIQUE_ID); Term term; if (conditionValue == null) { @@ -218,7 +304,7 @@ private static boolean appliesMapCondition(Map rowValue, } term = new MultiElements.Value(type, value); } - ColumnsExpression column = ColumnsExpression.singleColumn(definition); + ColumnsExpression column = ColumnsExpression.singleColumn(definition, null); ColumnCondition condition = new ColumnCondition(column, op, Terms.of(term)); ColumnCondition.Bound bound = condition.bind(QueryOptions.DEFAULT); return bound.appliesTo(newRow(definition, rowValue)); @@ -738,4 +824,160 @@ public void toCQLStringTest() assertEquals("col.f1 = ?", udtFieldCondition(col, f, EQ, Terms.Raw.of(marker)).toCQLString()); assertEquals("col.f1 = 1", udtFieldCondition(col, f, EQ, Terms.Raw.of(one)).toCQLString()); } + + @Test + public void testUDTBound() throws InvalidRequestException + { + // EQ + assertTrue(conditionUDTApplies(ONE, EQ, ONE)); + assertFalse(conditionUDTApplies(ONE, EQ, ZERO)); + assertFalse(conditionUDTApplies(ZERO, EQ, ONE)); + assertFalse(conditionUDTApplies(ONE, EQ, null)); + + assertFalse(conditionUDTApplies(ONE, EQ, null)); + assertFalse(conditionUDTApplies(null, EQ, ONE)); + assertTrue(conditionUDTApplies(null, EQ, null)); + + assertFalse(conditionUDTApplies(ONE, EQ, ByteBufferUtil.EMPTY_BYTE_BUFFER)); + assertFalse(conditionUDTApplies(ByteBufferUtil.EMPTY_BYTE_BUFFER, EQ, ONE)); + assertTrue(conditionUDTApplies(ByteBufferUtil.EMPTY_BYTE_BUFFER, EQ, ByteBufferUtil.EMPTY_BYTE_BUFFER)); + + // NEQ + assertFalse(conditionUDTApplies(ONE, NEQ, ONE)); + assertTrue(conditionUDTApplies(ONE, NEQ, ZERO)); + assertTrue(conditionUDTApplies(ZERO, NEQ, ONE)); + assertTrue(conditionUDTApplies(ONE, NEQ, null)); + assertTrue(conditionUDTApplies(null, NEQ, ONE)); + + assertTrue(conditionUDTApplies(ONE, NEQ, null)); + assertTrue(conditionUDTApplies(null, NEQ, ONE)); + assertFalse(conditionUDTApplies(null, NEQ, null)); + + assertTrue(conditionUDTApplies(ONE, NEQ, ByteBufferUtil.EMPTY_BYTE_BUFFER)); + assertTrue(conditionUDTApplies(ByteBufferUtil.EMPTY_BYTE_BUFFER, NEQ, ONE)); + assertFalse(conditionUDTApplies(ByteBufferUtil.EMPTY_BYTE_BUFFER, NEQ, ByteBufferUtil.EMPTY_BYTE_BUFFER)); + + // LT + assertFalse(conditionUDTApplies(ONE, LT, ONE)); + assertThatThrownBy(() -> conditionUDTApplies(null, LT, null)).isInstanceOf(InvalidRequestException.class); + assertFalse(conditionUDTApplies(ONE, LT, ZERO)); + assertTrue(conditionUDTApplies(ZERO, LT, ONE)); + assertThatThrownBy(() -> conditionUDTApplies(ONE, LT, null)).isInstanceOf(InvalidRequestException.class); + + assertFalse(conditionUDTApplies(ONE, LT, ByteBufferUtil.EMPTY_BYTE_BUFFER)); + assertTrue(conditionUDTApplies(ByteBufferUtil.EMPTY_BYTE_BUFFER, LT, ONE)); + assertFalse(conditionUDTApplies(ByteBufferUtil.EMPTY_BYTE_BUFFER, LT, ByteBufferUtil.EMPTY_BYTE_BUFFER)); + + // LTE + assertTrue(conditionUDTApplies(ONE, LTE, ONE)); + assertFalse(conditionUDTApplies(ONE, LTE, ZERO)); + assertTrue(conditionUDTApplies(ZERO, LTE, ONE)); + assertThatThrownBy(() -> conditionUDTApplies(ONE, LTE, null)).isInstanceOf(InvalidRequestException.class); + + assertFalse(conditionUDTApplies(ONE, LTE, ByteBufferUtil.EMPTY_BYTE_BUFFER)); + assertTrue(conditionUDTApplies(ByteBufferUtil.EMPTY_BYTE_BUFFER, LTE, ONE)); + assertTrue(conditionUDTApplies(ByteBufferUtil.EMPTY_BYTE_BUFFER, LTE, ByteBufferUtil.EMPTY_BYTE_BUFFER)); + + // GT + assertFalse(conditionUDTApplies(ONE, GT, ONE)); + assertTrue(conditionUDTApplies(ONE, GT, ZERO)); + assertFalse(conditionUDTApplies(ZERO, GT, ONE)); + assertThatThrownBy(() -> conditionUDTApplies(ONE, GT, null)).isInstanceOf(InvalidRequestException.class); + + assertTrue(conditionUDTApplies(ONE, GT, ByteBufferUtil.EMPTY_BYTE_BUFFER)); + assertFalse(conditionUDTApplies(ByteBufferUtil.EMPTY_BYTE_BUFFER, GT, ONE)); + assertFalse(conditionUDTApplies(ByteBufferUtil.EMPTY_BYTE_BUFFER, GT, ByteBufferUtil.EMPTY_BYTE_BUFFER)); + + // GTE + assertTrue(conditionUDTApplies(ONE, GTE, ONE)); + assertTrue(conditionUDTApplies(ONE, GTE, ZERO)); + assertFalse(conditionUDTApplies(ZERO, GTE, ONE)); + assertTrue(conditionUDTApplies(ONE, GTE, ONE)); + assertThatThrownBy(() -> conditionUDTApplies(ONE, GTE, null)).isInstanceOf(InvalidRequestException.class); + + assertTrue(conditionUDTApplies(ONE, GTE, ByteBufferUtil.EMPTY_BYTE_BUFFER)); + assertFalse(conditionUDTApplies(ByteBufferUtil.EMPTY_BYTE_BUFFER, GTE, ONE)); + assertTrue(conditionUDTApplies(ByteBufferUtil.EMPTY_BYTE_BUFFER, GTE, ByteBufferUtil.EMPTY_BYTE_BUFFER)); + } + + @Test + public void serde() + { + DataOutputBuffer out = new DataOutputBuffer(); + qt().forAll(boundGen()).check(bounds -> { + TableMetadatas tables = TableMetadatas.of(bounds.table); + Serializers.testSerde(out, ColumnCondition.Bound.serializer, bounds, tables); + }); + } + + private static Gen columnMetadataGen(ColumnCondition.BoundKind kind) + { + var typeGen = selectTypes(kind); + var columnKindGen = selectColumnKinds(kind); + return Generators.toGen(CassandraGenerators.columnMetadataGen(columnKindGen, typeGen)); + } + + private static org.quicktheories.core.Gen selectColumnKinds(ColumnCondition.BoundKind kind) + { + if (kind == ColumnCondition.BoundKind.MultiCell || kind == ColumnCondition.BoundKind.ElementOrFieldAccess) + return SourceDSL.arbitrary().pick(ColumnMetadata.Kind.STATIC, ColumnMetadata.Kind.REGULAR); + return SourceDSL.arbitrary().enumValues(ColumnMetadata.Kind.class); + } + + private static Pair createColumnMetadata(RandomSource rs, ColumnCondition.BoundKind kind) + { + ColumnMetadata cm = columnMetadataGen(kind).next(rs); + TableMetadata.Builder tmb = TableMetadata.builder(cm.ksName, cm.cfName).addColumn(cm); + tmb.addPartitionKeyColumn("", Int32Type.instance); + TableMetadata tm = tmb.build(); + cm = tm.getColumn(cm.name); + return Pair.create(cm, tm); + } + + private static org.quicktheories.core.Gen> selectTypes(ColumnCondition.BoundKind kind) + { + switch (kind) + { + // A condition on a single non-collection column. + case Simple: + return new AbstractTypeGenerators.TypeGenBuilder().build(); + // A condition on a multicell column. + // assert column.type.isMultiCell(); + case MultiCell: + return new AbstractTypeGenerators.TypeGenBuilder().withTypeKinds(TypeKind.UDT, TypeKind.LIST, TypeKind.MAP, TypeKind.SET).withMultiCell(true).build(); + // The map key, list index or UDT fieldname. + case ElementOrFieldAccess: + return new AbstractTypeGenerators.TypeGenBuilder().withTypeKinds(TypeKind.UDT, TypeKind.LIST, TypeKind.MAP).withMultiCell(true).build(); + default: throw new UnsupportedOperationException(kind.name()); + } + } + + public static Gen boundGen() + { + Gen kindGen = Gens.enums().all(ColumnCondition.BoundKind.class); + Gen operatorGen = Gens.enums().all(Operator.class); + Gen nonNullValuesGen = Generators.toGen(Generators.directAndHeapBytes(1, 100)); + Gen valueGen = rs -> { + if (rs.decide(.2)) return null; + return nonNullValuesGen.next(rs); + }; + + return rs -> { + ColumnCondition.BoundKind kind = kindGen.next(rs); + Pair column = createColumnMetadata(rs, kind); + Operator operator = operatorGen.next(rs); + ByteBuffer value = valueGen.next(rs); + switch (kind) + { + // A condition on a single non-collection column. + case Simple: return new ColumnCondition.SimpleBound(column.left, column.right, operator, value); + // A condition on a multicell column. + // assert column.type.isMultiCell(); + case MultiCell: return new ColumnCondition.MultiCellBound(column.left, column.right, operator, value); + // The map key, list index or UDT fieldname. + case ElementOrFieldAccess: return new ColumnCondition.ElementOrFieldAccessBound(column.left, column.right, Generators.toGen(AbstractTypeGenerators.elementAccess(column.left.type).bytesGen()).next(rs), operator, value); + default: throw new UnsupportedOperationException(kind.name()); + } + }; + } } diff --git a/test/unit/org/apache/cassandra/cql3/restrictions/ClusteringColumnRestrictionsTest.java b/test/unit/org/apache/cassandra/cql3/restrictions/ClusteringColumnRestrictionsTest.java index a8fda344bc30..3229e8ecd785 100644 --- a/test/unit/org/apache/cassandra/cql3/restrictions/ClusteringColumnRestrictionsTest.java +++ b/test/unit/org/apache/cassandra/cql3/restrictions/ClusteringColumnRestrictionsTest.java @@ -1644,7 +1644,7 @@ private static TableMetadata newTableMetadata(Sort... sorts) private static Restriction newSingleRestriction(TableMetadata tableMetadata, int index, Operator operator, ByteBuffer... values) { ColumnMetadata column = getClusteringColumnDefinition(tableMetadata, index); - return new SimpleRestriction(ColumnsExpression.singleColumn(column), operator, toTerms(values)); + return new SimpleRestriction(ColumnsExpression.singleColumn(column, tableMetadata), operator, toTerms(values)); } /** @@ -1666,7 +1666,7 @@ private static Restriction newMultiEq(TableMetadata tableMetadata, int firstInde types.add(column.type); } TupleType tupleType = new TupleType(types); - return new SimpleRestriction(ColumnsExpression.multiColumns(columns), + return new SimpleRestriction(ColumnsExpression.multiColumns(columns, tableMetadata), Operator.EQ, Terms.of(new MultiElements.Value(tupleType, asList(values)))); } @@ -1699,7 +1699,7 @@ private static Restriction newMultiIN(TableMetadata tableMetadata, int firstInde { terms.add(new MultiElements.Value(tupleType, values[i])); } - return new SimpleRestriction(ColumnsExpression.multiColumns(columns), Operator.IN, Terms.of(terms)); + return new SimpleRestriction(ColumnsExpression.multiColumns(columns, tableMetadata), Operator.IN, Terms.of(terms)); } /** @@ -1734,7 +1734,7 @@ private static Restriction newMultiSlice(TableMetadata tableMetadata, int firstI types.add(column.type); } TupleType type = new TupleType(types); - return new SimpleRestriction(ColumnsExpression.multiColumns(columns), + return new SimpleRestriction(ColumnsExpression.multiColumns(columns, tableMetadata), operator, Terms.of(new MultiElements.Value(type, asList(values)))); } diff --git a/test/unit/org/apache/cassandra/cql3/restrictions/ClusteringElementsTest.java b/test/unit/org/apache/cassandra/cql3/restrictions/ClusteringElementsTest.java index 60e2508be20b..8ad94d4638c9 100644 --- a/test/unit/org/apache/cassandra/cql3/restrictions/ClusteringElementsTest.java +++ b/test/unit/org/apache/cassandra/cql3/restrictions/ClusteringElementsTest.java @@ -31,6 +31,7 @@ import org.apache.cassandra.db.marshal.AbstractType; import org.apache.cassandra.db.marshal.Int32Type; import org.apache.cassandra.db.marshal.ReversedType; +import org.apache.cassandra.dht.Murmur3Partitioner; import org.apache.cassandra.harry.util.ByteUtils; import org.apache.cassandra.schema.ColumnMetadata; @@ -187,7 +188,7 @@ public void testAtMostWithOneColumn() ClusteringElements four = elements(type, 4); ClusteringElements six = elements(type, 6); - RangeSet rangeSet = ClusteringElements.atMost(four); + RangeSet rangeSet = ClusteringElements.atMost(four, Murmur3Partitioner.instance); assertTrue(rangeSet.contains(one)); assertTrue(rangeSet.contains(four)); assertFalse(rangeSet.contains(six)); @@ -208,7 +209,7 @@ public void testAtMostWithTwoColumns() ClusteringElements oneFive = elements(columns, 1, 5); ClusteringElements twoFive = elements(columns, 2, 5); - RangeSet rangeSet = ClusteringElements.atMost(oneThree); + RangeSet rangeSet = ClusteringElements.atMost(oneThree, Murmur3Partitioner.instance); assertTrue(rangeSet.contains(zeroZero)); assertTrue(rangeSet.contains(oneZero)); @@ -258,7 +259,7 @@ public void testAtMostWithThreeColumns() ClusteringElements oneFiveOne = elements(columns, 1, 5, 1); ClusteringElements twoFiveFive = elements(columns, 2, 5, 5); - RangeSet rangeSet = ClusteringElements.atMost(oneThreeOne); + RangeSet rangeSet = ClusteringElements.atMost(oneThreeOne, Murmur3Partitioner.instance); assertTrue(rangeSet.contains(zeroZeroZero)); assertTrue(rangeSet.contains(oneZeroOne)); @@ -279,7 +280,7 @@ public void testLessThanWithOneColumn() ClusteringElements four = elements(column, 4); ClusteringElements six = elements(column, 6); - RangeSet rangeSet = ClusteringElements.lessThan(four); + RangeSet rangeSet = ClusteringElements.lessThan(four, Murmur3Partitioner.instance); assertTrue(rangeSet.contains(one)); assertFalse(rangeSet.contains(four)); assertFalse(rangeSet.contains(six)); @@ -300,7 +301,7 @@ public void testLessThanWithTwoColumns() ClusteringElements oneFive = elements(columns, 1, 5); ClusteringElements twoFive = elements(columns, 2, 5); - RangeSet rangeSet = ClusteringElements.lessThan(oneThree); + RangeSet rangeSet = ClusteringElements.lessThan(oneThree, Murmur3Partitioner.instance); assertTrue(rangeSet.contains(zeroZero)); assertTrue(rangeSet.contains(oneZero)); @@ -351,7 +352,7 @@ public void testLessThanWithThreeColumns() ClusteringElements oneFiveOne = elements(columns, 1, 5, 1); ClusteringElements twoFiveFive = elements(columns, 2, 5, 5); - RangeSet rangeSet = ClusteringElements.lessThan(oneThreeOne); + RangeSet rangeSet = ClusteringElements.lessThan(oneThreeOne, Murmur3Partitioner.instance); assertTrue(rangeSet.contains(zeroZeroZero)); assertTrue(rangeSet.contains(oneZeroOne)); @@ -372,7 +373,7 @@ public void testAtLeastWithOneColumn() ClusteringElements four = elements(column, 4); ClusteringElements six = elements(column, 6); - RangeSet rangeSet = ClusteringElements.atLeast(four); + RangeSet rangeSet = ClusteringElements.atLeast(four, Murmur3Partitioner.instance); assertFalse(rangeSet.contains(one)); assertTrue(rangeSet.contains(four)); assertTrue(rangeSet.contains(six)); @@ -393,7 +394,7 @@ public void testAtLeastWithTwoColumns() ClusteringElements oneFive = elements(columns, 1, 5); ClusteringElements twoFive = elements(columns, 2, 5); - RangeSet rangeSet = ClusteringElements.atLeast(oneThree); + RangeSet rangeSet = ClusteringElements.atLeast(oneThree, Murmur3Partitioner.instance); assertFalse(rangeSet.contains(zeroZero)); assertFalse(rangeSet.contains(oneZero)); @@ -444,7 +445,7 @@ public void testAtLeastWithThreeColumns() ClusteringElements oneFiveOne = elements(columns, 1, 5, 1); ClusteringElements twoFiveFive = elements(columns, 2, 5, 5); - RangeSet rangeSet = ClusteringElements.atLeast(oneThreeOne); + RangeSet rangeSet = ClusteringElements.atLeast(oneThreeOne, Murmur3Partitioner.instance); assertFalse(rangeSet.contains(zeroZeroZero)); assertFalse(rangeSet.contains(oneZeroOne)); @@ -465,7 +466,7 @@ public void testGreaterThanWithOneColumn() ClusteringElements four = elements(column, 4); ClusteringElements six = elements(column, 6); - RangeSet rangeSet = ClusteringElements.greaterThan(four); + RangeSet rangeSet = ClusteringElements.greaterThan(four, Murmur3Partitioner.instance); assertFalse(rangeSet.contains(one)); assertFalse(rangeSet.contains(four)); assertTrue(rangeSet.contains(six)); @@ -486,7 +487,7 @@ public void testGreaterThanWithTwoColumns() ClusteringElements oneFive = elements(columns, 1, 5); ClusteringElements twoFive = elements(columns, 2, 5); - RangeSet rangeSet = ClusteringElements.greaterThan(oneThree); + RangeSet rangeSet = ClusteringElements.greaterThan(oneThree, Murmur3Partitioner.instance); assertFalse(rangeSet.contains(zeroZero)); assertFalse(rangeSet.contains(oneZero)); @@ -537,7 +538,7 @@ public void testGreaterThanWithThreeColumns() ClusteringElements oneFiveOne = elements(columns, 1, 5, 1); ClusteringElements twoFiveFive = elements(columns, 2, 5, 5); - RangeSet rangeSet = ClusteringElements.greaterThan(oneThreeOne); + RangeSet rangeSet = ClusteringElements.greaterThan(oneThreeOne, Murmur3Partitioner.instance); assertFalse(rangeSet.contains(zeroZeroZero)); assertFalse(rangeSet.contains(oneZeroOne)); @@ -662,7 +663,7 @@ private void assertUnsupported(String expectedMsg, Runnable r) private static ClusteringElements elements(ColumnMetadata column, int value) { - return ClusteringElements.of(column, bytes(value)); + return ClusteringElements.of(column, bytes(value), false); } private static ClusteringElements elements(List columns, int... values) diff --git a/test/unit/org/apache/cassandra/cql3/selection/SelectionColumnMappingTest.java b/test/unit/org/apache/cassandra/cql3/selection/SelectionColumnMappingTest.java index 9f20aea4c0fa..58f4a3364e44 100644 --- a/test/unit/org/apache/cassandra/cql3/selection/SelectionColumnMappingTest.java +++ b/test/unit/org/apache/cassandra/cql3/selection/SelectionColumnMappingTest.java @@ -40,7 +40,6 @@ import org.apache.cassandra.utils.ByteBufferUtil; import static java.util.Arrays.asList; -import static org.apache.cassandra.ServerTestUtils.daemonInitialization; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; diff --git a/test/unit/org/apache/cassandra/cql3/statements/DescribeStatementTest.java b/test/unit/org/apache/cassandra/cql3/statements/DescribeStatementTest.java index 506a69e352ff..5d376dcf0721 100644 --- a/test/unit/org/apache/cassandra/cql3/statements/DescribeStatementTest.java +++ b/test/unit/org/apache/cassandra/cql3/statements/DescribeStatementTest.java @@ -45,6 +45,12 @@ import org.apache.cassandra.transport.ProtocolVersion; import static java.lang.String.format; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; +import static org.apache.cassandra.schema.SchemaConstants.ACCORD_KEYSPACE_NAME; import static org.apache.cassandra.schema.SchemaConstants.AUTH_KEYSPACE_NAME; import static org.apache.cassandra.schema.SchemaConstants.DISTRIBUTED_KEYSPACE_NAME; import static org.apache.cassandra.schema.SchemaConstants.METADATA_KEYSPACE_NAME; @@ -52,11 +58,6 @@ import static org.apache.cassandra.schema.SchemaConstants.SYSTEM_KEYSPACE_NAME; import static org.apache.cassandra.schema.SchemaConstants.TRACE_KEYSPACE_NAME; import static org.apache.cassandra.schema.SchemaConstants.VIRTUAL_SCHEMA; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNotEquals; -import static org.junit.Assert.assertNotNull; -import static org.junit.Assert.assertTrue; -import static org.junit.Assert.fail; public class DescribeStatementTest extends CQLTester { @@ -269,6 +270,19 @@ public void testDescribeVirtualTables() throws Throwable @Test public void testDescribe() throws Throwable + { + DatabaseDescriptor.getAutoRepairConfig().setAutoRepairSchedulingEnabled(false); + helperTestDescribe(); + } + + @Test + public void testDescribeWithAutoRepair() throws Throwable + { + DatabaseDescriptor.getAutoRepairConfig().setAutoRepairSchedulingEnabled(true); + helperTestDescribe(); + } + + public void helperTestDescribe() throws Throwable { try { @@ -290,11 +304,11 @@ public void testDescribe() throws Throwable row(KEYSPACE, "keyspace", KEYSPACE, "CREATE KEYSPACE " + KEYSPACE + " WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '1'}" + - " AND durable_writes = true;"), + " AND durable_writes = true AND fast_path = 'simple';"), row(KEYSPACE_PER_TEST, "keyspace", KEYSPACE_PER_TEST, "CREATE KEYSPACE " + KEYSPACE_PER_TEST + " WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '1'}" + - " AND durable_writes = true;"), + " AND durable_writes = true AND fast_path = 'simple';"), row("test", "keyspace", "test", keyspaceOutput()), row("test", "table", "has_all_types", allTypesTable()), row("test", "table", "\"Test\"", testTableOutput()), @@ -313,6 +327,7 @@ public void testDescribe() throws Throwable Object[][] testKeyspacesOutput = rows(row(KEYSPACE, "keyspace", KEYSPACE), row(KEYSPACE_PER_TEST, "keyspace", KEYSPACE_PER_TEST), row(SYSTEM_KEYSPACE_NAME, "keyspace", SYSTEM_KEYSPACE_NAME), + row(ACCORD_KEYSPACE_NAME, "keyspace", ACCORD_KEYSPACE_NAME), row(AUTH_KEYSPACE_NAME, "keyspace", AUTH_KEYSPACE_NAME), row(METADATA_KEYSPACE_NAME, "keyspace", METADATA_KEYSPACE_NAME), row(DISTRIBUTED_KEYSPACE_NAME, "keyspace", DISTRIBUTED_KEYSPACE_NAME), @@ -695,7 +710,8 @@ public void testDescribeTypes() throws Throwable assertRowsNet(executeDescribeNet(KEYSPACE_PER_TEST, "DESCRIBE KEYSPACE " + KEYSPACE_PER_TEST), row(KEYSPACE_PER_TEST, "keyspace", KEYSPACE_PER_TEST, "CREATE KEYSPACE " + KEYSPACE_PER_TEST + " WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '1'}" + - " AND durable_writes = true;"), + " AND durable_writes = true" + + " AND fast_path = 'simple';"), row(KEYSPACE_PER_TEST, "type", type2, "CREATE TYPE " + KEYSPACE_PER_TEST + "." + type2 + " (\n" + " x text,\n" + " y text\n" + @@ -800,7 +816,8 @@ public void testDescribeWithCustomIndex() throws Throwable String expectedKeyspaceStmt = "CREATE KEYSPACE " + KEYSPACE_PER_TEST + " WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '1'}" + - " AND durable_writes = true;"; + " AND durable_writes = true" + + " AND fast_path = 'simple';"; String expectedTableStmt = "CREATE TABLE " + KEYSPACE_PER_TEST + "." + table + " (\n" + " id int PRIMARY KEY,\n" + @@ -829,7 +846,7 @@ public void testDescribeCreateLikeTable() throws Throwable requireNetwork(); DatabaseDescriptor.setDynamicDataMaskingEnabled(true); String souceTable = createTable(KEYSPACE_PER_TEST, - "CREATE TABLE %s (" + + "CREATE TABLE %s (" + " pk1 text, " + " pk2 int MASKED WITH DEFAULT, " + " ck1 int, " + @@ -1118,25 +1135,57 @@ private static String testTableOutput() private static String tableParametersCql() { - return "additional_write_policy = '99p'\n" + - " AND allow_auto_snapshot = true\n" + - " AND bloom_filter_fp_chance = 0.01\n" + - " AND caching = {'keys': 'ALL', 'rows_per_partition': 'NONE'}\n" + - " AND cdc = false\n" + - " AND comment = ''\n" + - " AND compaction = " + cqlQuoted(CompactionParams.DEFAULT.asMap()) + "\n" + - " AND compression = {'chunk_length_in_kb': '16', 'class': 'org.apache.cassandra.io.compress.LZ4Compressor'}\n" + - " AND memtable = 'default'\n" + - " AND crc_check_chance = 1.0\n" + - " AND default_time_to_live = 0\n" + - " AND extensions = {}\n" + - " AND gc_grace_seconds = 864000\n" + - " AND incremental_backups = true\n" + - " AND max_index_interval = 2048\n" + - " AND memtable_flush_period_in_ms = 0\n" + - " AND min_index_interval = 128\n" + - " AND read_repair = 'BLOCKING'\n" + - " AND speculative_retry = '99p';"; + if (!DatabaseDescriptor.getAutoRepairConfig().isAutoRepairSchedulingEnabled()) + { + return "additional_write_policy = '99p'\n" + + " AND allow_auto_snapshot = true\n" + + " AND bloom_filter_fp_chance = 0.01\n" + + " AND caching = {'keys': 'ALL', 'rows_per_partition': 'NONE'}\n" + + " AND cdc = false\n" + + " AND comment = ''\n" + + " AND compaction = " + cqlQuoted(CompactionParams.DEFAULT.asMap()) + "\n" + + " AND compression = {'chunk_length_in_kb': '16', 'class': 'org.apache.cassandra.io.compress.LZ4Compressor'}\n" + + " AND memtable = 'default'\n" + + " AND crc_check_chance = 1.0\n" + + " AND fast_path = 'keyspace'\n" + + " AND default_time_to_live = 0\n" + + " AND extensions = {}\n" + + " AND gc_grace_seconds = 864000\n" + + " AND incremental_backups = true\n" + + " AND max_index_interval = 2048\n" + + " AND memtable_flush_period_in_ms = 0\n" + + " AND min_index_interval = 128\n" + + " AND read_repair = 'BLOCKING'\n" + + " AND transactional_mode = 'off'\n" + + " AND transactional_migration_from = 'none'\n" + + " AND speculative_retry = '99p';"; + } + else + { + return "additional_write_policy = '99p'\n" + + " AND allow_auto_snapshot = true\n" + + " AND bloom_filter_fp_chance = 0.01\n" + + " AND caching = {'keys': 'ALL', 'rows_per_partition': 'NONE'}\n" + + " AND cdc = false\n" + + " AND comment = ''\n" + + " AND compaction = " + cqlQuoted(CompactionParams.DEFAULT.asMap()) + "\n" + + " AND compression = {'chunk_length_in_kb': '16', 'class': 'org.apache.cassandra.io.compress.LZ4Compressor'}\n" + + " AND memtable = 'default'\n" + + " AND crc_check_chance = 1.0\n" + + " AND fast_path = 'keyspace'\n" + + " AND default_time_to_live = 0\n" + + " AND extensions = {}\n" + + " AND gc_grace_seconds = 864000\n" + + " AND incremental_backups = true\n" + + " AND max_index_interval = 2048\n" + + " AND memtable_flush_period_in_ms = 0\n" + + " AND min_index_interval = 128\n" + + " AND read_repair = 'BLOCKING'\n" + + " AND transactional_mode = 'off'\n" + + " AND transactional_migration_from = 'none'\n" + + " AND speculative_retry = '99p'\n" + + " AND auto_repair = {'full_enabled': 'true', 'incremental_enabled': 'true', 'preview_repaired_enabled': 'true', 'priority': '0'};"; + } } private static String cqlQuoted(Map map) @@ -1146,29 +1195,54 @@ private static String cqlQuoted(Map map) private static String mvParametersCql() { - return "additional_write_policy = '99p'\n" + - " AND allow_auto_snapshot = true\n" + - " AND bloom_filter_fp_chance = 0.01\n" + - " AND caching = {'keys': 'ALL', 'rows_per_partition': 'NONE'}\n" + - " AND cdc = false\n" + - " AND comment = ''\n" + - " AND compaction = " + cqlQuoted(CompactionParams.DEFAULT.asMap()) + "\n" + - " AND compression = {'chunk_length_in_kb': '16', 'class': 'org.apache.cassandra.io.compress.LZ4Compressor'}\n" + - " AND memtable = 'default'\n" + - " AND crc_check_chance = 1.0\n" + - " AND extensions = {}\n" + - " AND gc_grace_seconds = 864000\n" + - " AND incremental_backups = true\n" + - " AND max_index_interval = 2048\n" + - " AND memtable_flush_period_in_ms = 0\n" + - " AND min_index_interval = 128\n" + - " AND read_repair = 'BLOCKING'\n" + - " AND speculative_retry = '99p';"; + if (!DatabaseDescriptor.getAutoRepairConfig().isAutoRepairSchedulingEnabled()) + { + return "additional_write_policy = '99p'\n" + + " AND allow_auto_snapshot = true\n" + + " AND bloom_filter_fp_chance = 0.01\n" + + " AND caching = {'keys': 'ALL', 'rows_per_partition': 'NONE'}\n" + + " AND cdc = false\n" + + " AND comment = ''\n" + + " AND compaction = " + cqlQuoted(CompactionParams.DEFAULT.asMap()) + "\n" + + " AND compression = {'chunk_length_in_kb': '16', 'class': 'org.apache.cassandra.io.compress.LZ4Compressor'}\n" + + " AND memtable = 'default'\n" + + " AND crc_check_chance = 1.0\n" + + " AND extensions = {}\n" + + " AND gc_grace_seconds = 864000\n" + + " AND incremental_backups = true\n" + + " AND max_index_interval = 2048\n" + + " AND memtable_flush_period_in_ms = 0\n" + + " AND min_index_interval = 128\n" + + " AND read_repair = 'BLOCKING'\n" + + " AND speculative_retry = '99p';"; + } + else + { + return "additional_write_policy = '99p'\n" + + " AND allow_auto_snapshot = true\n" + + " AND bloom_filter_fp_chance = 0.01\n" + + " AND caching = {'keys': 'ALL', 'rows_per_partition': 'NONE'}\n" + + " AND cdc = false\n" + + " AND comment = ''\n" + + " AND compaction = " + cqlQuoted(CompactionParams.DEFAULT.asMap()) + "\n" + + " AND compression = {'chunk_length_in_kb': '16', 'class': 'org.apache.cassandra.io.compress.LZ4Compressor'}\n" + + " AND memtable = 'default'\n" + + " AND crc_check_chance = 1.0\n" + + " AND extensions = {}\n" + + " AND gc_grace_seconds = 864000\n" + + " AND incremental_backups = true\n" + + " AND max_index_interval = 2048\n" + + " AND memtable_flush_period_in_ms = 0\n" + + " AND min_index_interval = 128\n" + + " AND read_repair = 'BLOCKING'\n" + + " AND speculative_retry = '99p'\n" + + " AND auto_repair = {'full_enabled': 'true', 'incremental_enabled': 'true', 'preview_repaired_enabled': 'true', 'priority': '0'};"; + } } private static String keyspaceOutput() { - return "CREATE KEYSPACE test WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '1'} AND durable_writes = true;"; + return "CREATE KEYSPACE test WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '1'} AND durable_writes = true AND fast_path = 'simple';"; } private void describeError(String cql, String msg) throws Throwable diff --git a/test/unit/org/apache/cassandra/cql3/statements/TransactionStatementTest.java b/test/unit/org/apache/cassandra/cql3/statements/TransactionStatementTest.java new file mode 100644 index 000000000000..dd4c816e84a9 --- /dev/null +++ b/test/unit/org/apache/cassandra/cql3/statements/TransactionStatementTest.java @@ -0,0 +1,575 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.cql3.statements; + +import java.util.Arrays; + +import org.assertj.core.api.Assertions; +import org.junit.BeforeClass; +import org.junit.Test; + +import org.apache.cassandra.SchemaLoader; +import org.apache.cassandra.cql3.CQLStatement; +import org.apache.cassandra.cql3.QueryOptions; +import org.apache.cassandra.cql3.QueryProcessor; +import org.apache.cassandra.cql3.ast.Conditional.Is; +import org.apache.cassandra.cql3.ast.FunctionCall; +import org.apache.cassandra.cql3.ast.Literal; +import org.apache.cassandra.cql3.ast.Mutation; +import org.apache.cassandra.cql3.ast.Select; +import org.apache.cassandra.cql3.ast.Txn; +import org.apache.cassandra.db.Keyspace; +import org.apache.cassandra.exceptions.InvalidRequestException; +import org.apache.cassandra.exceptions.SyntaxException; +import org.apache.cassandra.schema.KeyspaceParams; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.service.ClientState; +import org.apache.cassandra.service.QueryState; +import org.apache.cassandra.transport.Dispatcher; +import org.apache.cassandra.transport.messages.ResultMessage; + +import static org.apache.cassandra.cql3.statements.TransactionStatement.DUPLICATE_TUPLE_NAME_MESSAGE; +import static org.apache.cassandra.cql3.statements.TransactionStatement.EMPTY_TRANSACTION_MESSAGE; +import static org.apache.cassandra.cql3.statements.TransactionStatement.ILLEGAL_RANGE_QUERY_MESSAGE; +import static org.apache.cassandra.cql3.statements.TransactionStatement.INCOMPLETE_PARTITION_KEY_SELECT_MESSAGE; +import static org.apache.cassandra.cql3.statements.TransactionStatement.INCOMPLETE_PRIMARY_KEY_SELECT_MESSAGE; +import static org.apache.cassandra.cql3.statements.TransactionStatement.NO_AGGREGATION_IN_TXNS_MESSAGE; +import static org.apache.cassandra.cql3.statements.TransactionStatement.NO_CONDITIONS_IN_UPDATES_MESSAGE; +import static org.apache.cassandra.cql3.statements.TransactionStatement.NO_GROUP_BY_IN_TXNS_MESSAGE; +import static org.apache.cassandra.cql3.statements.TransactionStatement.NO_ORDER_BY_IN_TXNS_MESSAGE; +import static org.apache.cassandra.cql3.statements.TransactionStatement.NO_PARTITION_IN_CLAUSE_WITH_LIMIT; +import static org.apache.cassandra.cql3.statements.TransactionStatement.NO_TIMESTAMPS_IN_UPDATES_MESSAGE; +import static org.apache.cassandra.cql3.statements.TransactionStatement.NO_TTLS_IN_UPDATES_MESSAGE; +import static org.apache.cassandra.cql3.statements.TransactionStatement.SELECT_REFS_NEED_COLUMN_MESSAGE; +import static org.apache.cassandra.cql3.statements.TransactionStatement.TRANSACTIONS_DISABLED_ON_TABLE_MESSAGE; +import static org.apache.cassandra.cql3.statements.UpdateStatement.CANNOT_SET_KEY_WITH_REFERENCE_MESSAGE; +import static org.apache.cassandra.cql3.statements.UpdateStatement.UPDATING_PRIMARY_KEY_MESSAGE; +import static org.apache.cassandra.cql3.statements.schema.CreateTableStatement.parse; +import static org.apache.cassandra.cql3.transactions.RowDataReference.CANNOT_FIND_TUPLE_MESSAGE; +import static org.apache.cassandra.cql3.transactions.RowDataReference.COLUMN_NOT_IN_TUPLE_MESSAGE; +import static org.apache.cassandra.schema.TableMetadata.UNDEFINED_COLUMN_NAME_MESSAGE; + +public class TransactionStatementTest +{ + private static final TableId TABLE1_ID = TableId.fromString("00000000-0000-0000-0000-000000000001"); + private static final TableId TABLE2_ID = TableId.fromString("00000000-0000-0000-0000-000000000002"); + private static final TableId TABLE3_ID = TableId.fromString("00000000-0000-0000-0000-000000000003"); + private static final TableId TABLE4_ID = TableId.fromString("00000000-0000-0000-0000-000000000004"); + private static final TableId TABLE5_ID = TableId.fromString("00000000-0000-0000-0000-000000000005"); + private static final TableId TABLE6_ID = TableId.fromString("00000000-0000-0000-0000-000000000006"); + private static final TableId TABLE7_ID = TableId.fromString("00000000-0000-0000-0000-000000000007"); + + @BeforeClass + public static void beforeClass() throws Exception + { + SchemaLoader.prepareServer(); + SchemaLoader.createKeyspace("ks", KeyspaceParams.simple(1), + parse("CREATE TABLE tbl1 (k int, c int, v int, PRIMARY KEY (k, c)) WITH transactional_mode = 'full'", "ks").id(TABLE1_ID), + parse("CREATE TABLE tbl2 (k int, c int, v int, primary key (k, c)) WITH transactional_mode = 'full'", "ks").id(TABLE2_ID), + parse("CREATE TABLE tbl3 (k int PRIMARY KEY, \"with spaces\" int, \"with\"\"quote\" int, \"MiXeD_CaSe\" int) WITH transactional_mode = 'full'", "ks").id(TABLE3_ID), + parse("CREATE TABLE tbl4 (k int PRIMARY KEY, int_list list) WITH transactional_mode = 'full'", "ks").id(TABLE4_ID), + parse("CREATE TABLE tbl5 (k int PRIMARY KEY, v int) WITH transactional_mode = 'full'", "ks").id(TABLE5_ID), + parse("CREATE TABLE tbl6 (k int PRIMARY KEY, v int) WITH transactional_mode = 'off'", "ks").id(TABLE6_ID), + parse("CREATE TABLE tbl7 (k int PRIMARY KEY, v vector) WITH transactional_mode = 'full'", "ks").id(TABLE7_ID)); + } + + private static TableMetadata tbl(int num) + { + return Keyspace.open("ks").getColumnFamilyStore("tbl" + num).metadata(); + } + + private static TableMetadata tbl5() + { + return tbl(5); + } + + @Test + public void shouldRejectReferenceSelectOutsideTxn() + { + String query = "SELECT row1.v, row2.v;"; + Assertions.assertThatThrownBy(() -> prepare(query)) + .isInstanceOf(SyntaxException.class) + .hasMessageContaining("expecting K_FROM"); + } + + @Test + public void shouldRejectReferenceUpdateOutsideTxn() + { + String query = "UPDATE ks.tbl1 SET v = row2.v WHERE k=1 AND c=2;"; + Assertions.assertThatThrownBy(() -> prepare(query)) + .isInstanceOf(SyntaxException.class) + .hasMessageContaining("failed predicate"); + } + + @Test + public void shouldRejectConditionalWithNoEndIf() + { + String query = "BEGIN TRANSACTION\n" + + " IF row1 IS NOT NULL AND row1.v = 3 AND row2.v=4 THEN\n" + + " UPDATE ks.tbl1 SET v=1 WHERE k=1 AND c=2;\n" + + "COMMIT TRANSACTION"; + + Assertions.assertThatThrownBy(() -> prepare(query)) + .isInstanceOf(SyntaxException.class) + .hasMessageContaining("failed predicate"); + } + + @Test + public void shouldRejectConditionalWithEndIfButNoIf() + { + String query = "BEGIN TRANSACTION\n" + + " UPDATE ks.tbl1 SET v=1 WHERE k=1 AND c=2;\n" + + " END IF\n" + + "COMMIT TRANSACTION"; + + Assertions.assertThatThrownBy(() -> prepare(query)) + .isInstanceOf(SyntaxException.class) + .hasMessageContaining("failed predicate"); + } + + @Test + public void shouldRejectLetOnlyStatement() + { + String query = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT * FROM ks.tbl1 WHERE k=1 AND c=2);\n" + + "COMMIT TRANSACTION"; + + Assertions.assertThatThrownBy(() -> prepare(query)) + .isInstanceOf(InvalidRequestException.class) + .hasMessage(EMPTY_TRANSACTION_MESSAGE); + } + + @Test + public void shouldRejectEntireTupleSelect() + { + String query = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT * FROM ks.tbl1 WHERE k=1 AND c=2);\n" + + " SELECT row1;\n" + + "COMMIT TRANSACTION"; + + Assertions.assertThatThrownBy(() -> prepare(query)) + .isInstanceOf(InvalidRequestException.class) + .hasMessage(SELECT_REFS_NEED_COLUMN_MESSAGE); + } + + @Test + public void shouldRejectDuplicateTupleName() + { + String query = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT * FROM ks.tbl1 WHERE k=1 AND c=2);\n" + + " LET row1 = (SELECT * FROM ks.tbl2 WHERE k=2 AND c=2);\n" + + " SELECT row1.v;\n" + + "COMMIT TRANSACTION"; + + Assertions.assertThatThrownBy(() -> prepare(query)) + .isInstanceOf(InvalidRequestException.class) + .hasMessageContaining(String.format(DUPLICATE_TUPLE_NAME_MESSAGE, "row1")); + } + + @Test + public void shouldRejectIllegalLimitInLet() + { + String letSelect = "SELECT * FROM ks.tbl1 WHERE k = 1 LIMIT 2"; + String query = "BEGIN TRANSACTION\n" + + " LET row1 = (" + letSelect + ");\n" + + " SELECT row1.v;\n" + + "COMMIT TRANSACTION"; + + Assertions.assertThatThrownBy(() -> prepare(query)) + .isInstanceOf(InvalidRequestException.class) + .hasMessageContaining(String.format(INCOMPLETE_PRIMARY_KEY_SELECT_MESSAGE, "LET assignment row1", "at [2:15]")); + } + + @Test + public void shouldRejectIllegalBindLimitInLet() + { + String letSelect = "SELECT * FROM ks.tbl1 WHERE k = 1 LIMIT ?"; + String query = "BEGIN TRANSACTION\n" + + " LET row1 = (" + letSelect + ");\n" + + " SELECT row1.v;\n" + + "COMMIT TRANSACTION"; + + Assertions.assertThatThrownBy(() -> execute(query, 2)) + .isInstanceOf(InvalidRequestException.class) + .hasMessageContaining(String.format(INCOMPLETE_PRIMARY_KEY_SELECT_MESSAGE, "LET assignment", "at [2:15]")); + } + + @Test + public void shouldRejectIncompletePrimaryKeyInLet() + { + String letSelect = "SELECT * FROM ks.tbl1 WHERE k = 1"; + String query = "BEGIN TRANSACTION\n" + + " LET row1 = (" + letSelect + ");\n" + + " SELECT row1.v;\n" + + "COMMIT TRANSACTION"; + + Assertions.assertThatThrownBy(() -> prepare(query)) + .isInstanceOf(InvalidRequestException.class) + .hasMessageContaining(String.format(INCOMPLETE_PRIMARY_KEY_SELECT_MESSAGE, "LET assignment row1", "at [2:15]")); + } + + @Test + public void shouldRejectUpdateWithCondition() + { + String query = "BEGIN TRANSACTION\n" + + " INSERT INTO ks.tbl1 (k, c, v) VALUES (0, 0, 1) IF NOT EXISTS;\n" + + "COMMIT TRANSACTION"; + + Assertions.assertThatThrownBy(() -> prepare(query)) + .isInstanceOf(InvalidRequestException.class) + .hasMessageContaining(NO_CONDITIONS_IN_UPDATES_MESSAGE, "INSERT", "at [2:3]"); + } + + @Test + public void shouldRejectUpdateWithCustomTimestamp() + { + String query = "BEGIN TRANSACTION\n" + + " INSERT INTO ks.tbl1 (k, c, v) VALUES (0, 0, 1) USING TIMESTAMP 1;\n" + + "COMMIT TRANSACTION"; + + Assertions.assertThatThrownBy(() -> prepare(query)) + .isInstanceOf(InvalidRequestException.class) + .hasMessageContaining(NO_TIMESTAMPS_IN_UPDATES_MESSAGE, "INSERT", "at [2:3]"); + } + + @Test + public void shouldRejectBothFullSelectAndSelectWithReferences() + { + String query = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT * FROM ks.tbl1 WHERE k=1 AND c=2);\n" + + " SELECT v FROM ks.tbl1 WHERE k=2 AND c=2;\n" + + " SELECT row1.v;\n" + + " IF row1 IS NOT NULL AND row1.v = 3 AND row2.v=4 THEN\n" + + " UPDATE ks.tbl1 SET v=1 WHERE k=1 AND c=2;\n" + + " END IF\n" + + "COMMIT TRANSACTION"; + + Assertions.assertThatThrownBy(() -> prepare(query)) + .isInstanceOf(SyntaxException.class) + .hasMessageContaining("no viable alternative"); + } + + @Test + public void shouldRejectPrimaryKeyValueReference() + { + String query = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT * FROM ks.tbl1 WHERE k=1 AND c=1);\n" + + " IF row1 IS NULL THEN\n" + + " UPDATE ks.tbl1 SET c = row1.c WHERE k=1 AND c=2;\n" + + " END IF\n" + + "COMMIT TRANSACTION"; + + Assertions.assertThatThrownBy(() -> prepare(query)) + .isInstanceOf(InvalidRequestException.class) + .hasMessageContaining(String.format(UPDATING_PRIMARY_KEY_MESSAGE, "c")); + } + + @Test + public void shouldRejectShorthandAssignmentToUnknownColumn() + { + String query = "BEGIN TRANSACTION\n" + + " UPDATE ks.tbl1 SET q += 1 WHERE k=1 AND c=2;\n" + + "COMMIT TRANSACTION"; + + Assertions.assertThatThrownBy(() -> prepare(query)) + .isInstanceOf(InvalidRequestException.class) + .hasMessageContaining(String.format(UNDEFINED_COLUMN_NAME_MESSAGE, "q", "ks.tbl1")); + } + + @Test + public void shouldRejectAdditionToUnknownColumn() + { + String query = "BEGIN TRANSACTION\n" + + " UPDATE ks.tbl1 SET v = q + 1 WHERE k=1 AND c=2;\n" + + "COMMIT TRANSACTION"; + + Assertions.assertThatThrownBy(() -> QueryProcessor.parseStatement(query)) + .isInstanceOf(SyntaxException.class) + .hasMessageContaining("Only expressions of the form X = X + are supported."); + } + + @Test + public void shouldRejectUnknownSubstitutionTuple() + { + String query = "BEGIN TRANSACTION\n" + + " UPDATE ks.tbl1 SET v = row1.v WHERE k=1 AND c=2;\n" + + "COMMIT TRANSACTION"; + + Assertions.assertThatThrownBy(() -> prepare(query)) + .isInstanceOf(InvalidRequestException.class) + .hasMessageContaining(String.format(CANNOT_FIND_TUPLE_MESSAGE, "row1")); + } + + @Test + public void shouldRejectUnknownSubstitutionColumn() + { + String query = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT * FROM ks.tbl1 WHERE k=1 AND c=2);\n" + + " UPDATE ks.tbl1 SET v = row1.q WHERE k=1 AND c=2;\n" + + "COMMIT TRANSACTION"; + + Assertions.assertThatThrownBy(() -> prepare(query)) + .isInstanceOf(InvalidRequestException.class) + .hasMessageContaining(String.format(COLUMN_NOT_IN_TUPLE_MESSAGE, "q", "row1")); + } + + @Test + public void shouldRejectInsertPartiitonKeyReference() + { + String query = "BEGIN TRANSACTION\n" + + " LET row0 = (SELECT * FROM ks.tbl1 WHERE k = 0 AND c = 0);\n" + + " INSERT INTO ks.tbl1 (k, c, v) VALUES (row0.k, 1, 1);\n" + + "COMMIT TRANSACTION"; + + Assertions.assertThatThrownBy(() -> prepare(query)) + .isInstanceOf(InvalidRequestException.class) + .hasMessageContaining(String.format(CANNOT_SET_KEY_WITH_REFERENCE_MESSAGE, "row0.k", "k")); + } + + @Test + public void shouldRejectNormalSelectWithIncompletePartitionKey() + { + String select = "SELECT k, v FROM ks.tbl5 LIMIT 1"; + String query = "BEGIN TRANSACTION\n" + + select + ";\n" + + "COMMIT TRANSACTION;\n"; + + Assertions.assertThatThrownBy(() -> prepare(query)) + .isInstanceOf(InvalidRequestException.class) + .hasMessageContaining(String.format(INCOMPLETE_PARTITION_KEY_SELECT_MESSAGE, "returning select", "at [2:1]")); + } + + @Test + public void shouldRejectLetSelectWithIncompletePartitionKey() + { + String select = "SELECT k, v FROM ks.tbl5 WHERE token(k) > token(123) LIMIT 1"; + String query = "BEGIN TRANSACTION\n" + + " LET row1 = (" + select + "); \n" + + " SELECT row1.k, row1.v;\n" + + "COMMIT TRANSACTION;\n"; + + Assertions.assertThatThrownBy(() -> prepare(query)) + .isInstanceOf(InvalidRequestException.class) + .hasMessageContaining(String.format(ILLEGAL_RANGE_QUERY_MESSAGE, "LET assignment row1", "at [2:15]")); + } + + @Test + public void shouldRejectTTL() + { + for (Mutation.Kind kind : Mutation.Kind.values()) + { + if (kind == Mutation.Kind.DELETE) continue; // deletes don't support TTL + Mutation mutation; + switch (kind) + { + case INSERT: + mutation = Mutation.insert(tbl5()) + .value("k", 1) + .value("v", 2) + .ttl(42) + .build(); + break; + case UPDATE: + mutation = Mutation.update(tbl5()) + .value("k", 1) + .set("v", 2) + .ttl(42) + .build(); + break; + default: + throw new UnsupportedOperationException(kind.name()); + } + String query = Txn.wrap(mutation).toCQL(); + Assertions.assertThatThrownBy(() -> prepare(query)) + .isInstanceOf(InvalidRequestException.class) + .hasMessageContaining(String.format(NO_TTLS_IN_UPDATES_MESSAGE, kind.name(), "at")); + + var txn = Txn.builder() + .addLet("a", Select.builder() + .table(tbl5()) + .value("k", 1) + .build()) + .addIf(new Is("a", Is.Kind.Null), mutation) + .build(); + Assertions.assertThatThrownBy(() -> prepare(txn.toCQL())) + .isInstanceOf(InvalidRequestException.class) + .hasMessageContaining(String.format(NO_TTLS_IN_UPDATES_MESSAGE, kind.name(), "at")); + } + } + + @Test + public void shouldRejectAggFunctions() + { + var select = Select.builder() + .selection(FunctionCall.count("v")) + .table(tbl5()) + .value("k",0) + .build(); + + Assertions.assertThatThrownBy(() -> prepare(Txn.wrap(select).toCQL())) + .isInstanceOf(InvalidRequestException.class) + .hasMessageContaining(String.format(NO_AGGREGATION_IN_TXNS_MESSAGE, "SELECT", "at")); + + var txn = Txn.builder() + .addLet("a", select) + .addReturnReferences("a.count") + .build(); + + Assertions.assertThatThrownBy(() -> prepare(txn.toCQL())) + .isInstanceOf(InvalidRequestException.class) + .hasMessageContaining(String.format(NO_AGGREGATION_IN_TXNS_MESSAGE, "SELECT", "at")); + } + + @Test + public void shouldRejectOrderBy() + { + String query = "BEGIN TRANSACTION\n" + + " SELECT * FROM ks.tbl7 WHERE k=0 ORDER BY v ANN OF [42] LIMIT 1;" + + "COMMIT TRANSACTION;"; + Assertions.assertThatThrownBy(() -> prepare(query)) + .isInstanceOf(InvalidRequestException.class) + .hasMessageContaining(String.format(NO_ORDER_BY_IN_TXNS_MESSAGE, "SELECT", "at")); + + // The below code is left commented out as a reminder to think about this case... As of this writing ORDER BY does not parse in a LET clause... if that is ever fixed we should block it right away! +// String query2 = "BEGIN TRANSACTION\n" + +// " LET a = (SELECT * FROM ks.tbl7 WHERE k=0 ORDER BY v ANN OF [42] LIMIT 1;)" + +// " SELECT a.v" + +// "COMMIT TRANSACTION;"; +// Assertions.assertThatThrownBy(() -> prepare(query2)) +// .isInstanceOf(InvalidRequestException.class) +// .hasMessageContaining(String.format(NO_ORDER_BY_IN_TXNS_MESSAGE, "SELECT", "at")); + } + + @Test + public void shouldRejectGroupBy() + { + String query = "BEGIN TRANSACTION\n" + + " SELECT * FROM ks.tbl1 WHERE k=0 GROUP BY c LIMIT 1;" + + "COMMIT TRANSACTION;"; + Assertions.assertThatThrownBy(() -> prepare(query)) + .isInstanceOf(InvalidRequestException.class) + .hasMessageContaining(String.format(NO_GROUP_BY_IN_TXNS_MESSAGE, "SELECT", "at")); + + // The below code is left commented out as a reminder to think about this case... As of this writing GROUP BY does not parse in a LET clause... if that is ever fixed we should block it right away! +// String query2 = "BEGIN TRANSACTION\n" + +// " LET a = (SELECT * FROM ks.tbl1 WHERE k=0 GROUP BY c LIMIT 1;)" + +// " SELECT a.v" + +// "COMMIT TRANSACTION;"; +// Assertions.assertThatThrownBy(() -> prepare(query2)) +// .isInstanceOf(InvalidRequestException.class) +// .hasMessageContaining(String.format(NO_GROUP_BY_IN_TXNS_MESSAGE, "SELECT", "at")); + } + + @Test + public void shouldRejectInClauseInLet() + { + // this is blocked not because this isn't safe, but that the logic to handle this is currently in the read coordinator, which Accord doesn't call. + // So rather than return bad results to users, IN w/ LIMIT is blocked... until we can fix + Select select = Select.builder() + .table(tbl(1)) + .in("k", 0, 1) + .limit(Literal.of(1)) + .build(); + + Assertions.assertThatThrownBy(() -> prepare(Txn.wrap(select).toCQL())) + .isInstanceOf(InvalidRequestException.class) + .hasMessageContaining(String.format(NO_PARTITION_IN_CLAUSE_WITH_LIMIT, "SELECT", "at")); + + Assertions.assertThatThrownBy(() -> prepare(Txn.builder() + .addLet("a", select) + .addReturnReferences("a.k") + .build().toCQL())) + .isInstanceOf(InvalidRequestException.class) + .hasMessageContaining(String.format(NO_PARTITION_IN_CLAUSE_WITH_LIMIT, "SELECT", "at")); + } + + @Test + public void shouldRejectInClauseInLetWithBind() + { + Select select = Select.builder() + .table(tbl(1)) + .in("k", 0, 1) + .limit(1) + .build(); + + TransactionStatement stmt = (TransactionStatement) prepare(Txn.wrap(select).toCQL()); + QueryState state = QueryState.forInternalCalls(); + Dispatcher.RequestTime now = Dispatcher.RequestTime.forImmediateExecution(); + Assertions.assertThatThrownBy(() -> stmt.execute(state, QueryOptions.forInternalCalls(Arrays.asList(select.bindsEncoded())), now)).isInstanceOf(InvalidRequestException.class) + .hasMessageContaining(String.format(NO_PARTITION_IN_CLAUSE_WITH_LIMIT, "SELECT", "at")); + + Txn txn = Txn.builder() + .addLet("a", select) + .addReturnReferences("a.v") + .build(); + TransactionStatement stmt2 = (TransactionStatement) prepare(txn.toCQL()); + Assertions.assertThatThrownBy(() -> stmt2.execute(state, QueryOptions.forInternalCalls(Arrays.asList(txn.bindsEncoded())), now)).isInstanceOf(InvalidRequestException.class) + .hasMessageContaining(String.format(NO_PARTITION_IN_CLAUSE_WITH_LIMIT, "SELECT", "at")); + } + + @Test + public void shouldRejectLetSelectOnNonTransactionalTable() + { + String query = "BEGIN TRANSACTION\n" + + " LET row1 = (SELECT * FROM ks.tbl6 WHERE k = 0);\n" + + " INSERT INTO ks.tbl5 (k, v) VALUES (1, 2);\n" + + "COMMIT TRANSACTION;"; + + Assertions.assertThatThrownBy(() -> prepare(query)) + .isInstanceOf(InvalidRequestException.class) + .hasMessageContaining(String.format(TRANSACTIONS_DISABLED_ON_TABLE_MESSAGE, "SELECT", "at [2:15]")); + } + + @Test + public void shouldRejectSelectOnNonTransactionalTable() + { + String query = "BEGIN TRANSACTION\n" + + " SELECT * FROM ks.tbl6 WHERE k = 0;\n" + + "COMMIT TRANSACTION;"; + + Assertions.assertThatThrownBy(() -> prepare(query)) + .isInstanceOf(InvalidRequestException.class) + .hasMessageContaining(String.format(TRANSACTIONS_DISABLED_ON_TABLE_MESSAGE, "SELECT", "at [2:3]")); + } + + @Test + public void shouldRejectUpdateOnNonTransactionalTable() + { + String query = "BEGIN TRANSACTION\n" + + " INSERT INTO ks.tbl6 (k, v) VALUES (1, 2);\n" + + "COMMIT TRANSACTION;"; + + Assertions.assertThatThrownBy(() -> prepare(query)) + .isInstanceOf(InvalidRequestException.class) + .hasMessageContaining(String.format(TRANSACTIONS_DISABLED_ON_TABLE_MESSAGE, "INSERT", "at [2:3]")); + } + + private static CQLStatement prepare(String query) + { + TransactionStatement.Parsed parsed = (TransactionStatement.Parsed) QueryProcessor.parseStatement(query); + return parsed.prepare(ClientState.forInternalCalls()); + } + + private static ResultMessage execute(String query, Object... binds) + { + CQLStatement stmt = prepare(query); + return stmt.execute(QueryState.forInternalCalls(), QueryProcessor.makeInternalOptions(stmt, binds), Dispatcher.RequestTime.forImmediateExecution()); + } +} diff --git a/test/unit/org/apache/cassandra/cql3/statements/TxnDataNameTest.java b/test/unit/org/apache/cassandra/cql3/statements/TxnDataNameTest.java new file mode 100644 index 000000000000..22703609f595 --- /dev/null +++ b/test/unit/org/apache/cassandra/cql3/statements/TxnDataNameTest.java @@ -0,0 +1,94 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.cql3.statements; + +import org.junit.Test; + +import org.apache.cassandra.service.accord.txn.TxnData.TxnDataNameKind; +import org.quicktheories.core.Gen; +import org.quicktheories.generators.SourceDSL; +import org.quicktheories.impl.Constraint; + +import static org.apache.cassandra.service.accord.txn.TxnData.TXN_DATA_NAME_INDEX_MAX; +import static org.apache.cassandra.service.accord.txn.TxnData.txnDataName; +import static org.apache.cassandra.service.accord.txn.TxnData.txnDataNameIndex; +import static org.apache.cassandra.service.accord.txn.TxnData.txnDataNameKind; +import static org.apache.cassandra.utils.FailingConsumer.orFail; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.fail; +import static org.quicktheories.QuickTheory.qt; + +public class TxnDataNameTest +{ + @Test + public void buildAndAccess() + { + qt().forAll(gen()).checkAssert(orFail(test -> { + if (test.index < 0 || test.index > TXN_DATA_NAME_INDEX_MAX) + { + try + { + txnDataName(test.kind, test.index); + fail("Expect IllegalArgumentException"); + } + catch (IllegalArgumentException e) + { + // expected + } + return; + } + + int txnDataName = txnDataName(test.kind, test.index); + assertEquals(test.kind, txnDataNameKind(txnDataName)); + assertEquals(test.index, txnDataNameIndex(txnDataName)); + })); + } + + @Test + public void testIndex() + { + TxnDataNameKind kind = TxnDataNameKind.values()[TxnDataNameKind.values().length - 1]; + int txnDataName = txnDataName(kind, 0); + assertEquals(0, txnDataNameIndex(txnDataName)); + txnDataName = txnDataName(kind, TXN_DATA_NAME_INDEX_MAX); + assertEquals(TXN_DATA_NAME_INDEX_MAX, txnDataNameIndex(txnDataName)); + } + + static class TestData + { + final TxnDataNameKind kind; + final int index; + + public TestData(TxnDataNameKind kind, int index) + { + this.kind = kind; + this.index = index; + } + } + + public static Gen gen() + { + Gen kindGen = SourceDSL.arbitrary().enumValues(TxnDataNameKind.class); + return rnd -> { + TxnDataNameKind kind = kindGen.generate(rnd); + int index = (int)(rnd.next(Constraint.zeroToOne()) == 0 ? rnd.next(Constraint.between(0, TXN_DATA_NAME_INDEX_MAX)) : rnd.next(Constraint.between(Integer.MIN_VALUE, Integer.MAX_VALUE))); + return new TestData(kind, index); + }; + } +} \ No newline at end of file diff --git a/test/unit/org/apache/cassandra/cql3/terms/ListsTest.java b/test/unit/org/apache/cassandra/cql3/terms/ListsTest.java index 30921a8870b1..a45c2930cba9 100644 --- a/test/unit/org/apache/cassandra/cql3/terms/ListsTest.java +++ b/test/unit/org/apache/cassandra/cql3/terms/ListsTest.java @@ -147,7 +147,6 @@ private void testPrepender_execute(List terms) ByteBuffer keyBuf = ByteBufferUtil.bytes("key"); DecoratedKey key = Murmur3Partitioner.instance.decorateKey(keyBuf); UpdateParameters parameters = new UpdateParameters(metaData, - null, ClientState.forInternalCalls(), QueryOptions.DEFAULT, System.currentTimeMillis(), diff --git a/test/unit/org/apache/cassandra/cql3/validation/entities/FrozenCollectionsTest.java b/test/unit/org/apache/cassandra/cql3/validation/entities/FrozenCollectionsTest.java index 9ea3e7b05890..2d935eb978ad 100644 --- a/test/unit/org/apache/cassandra/cql3/validation/entities/FrozenCollectionsTest.java +++ b/test/unit/org/apache/cassandra/cql3/validation/entities/FrozenCollectionsTest.java @@ -29,7 +29,12 @@ import org.apache.cassandra.Util; import org.apache.cassandra.cql3.CQLTester; import org.apache.cassandra.cql3.restrictions.StatementRestrictions; -import org.apache.cassandra.db.marshal.*; +import org.apache.cassandra.db.marshal.AbstractType; +import org.apache.cassandra.db.marshal.Int32Type; +import org.apache.cassandra.db.marshal.ListType; +import org.apache.cassandra.db.marshal.MapType; +import org.apache.cassandra.db.marshal.SetType; +import org.apache.cassandra.db.marshal.TupleType; import org.apache.cassandra.dht.ByteOrderedPartitioner; import org.apache.cassandra.exceptions.ConfigurationException; import org.apache.cassandra.exceptions.InvalidRequestException; @@ -37,7 +42,6 @@ import org.apache.cassandra.service.StorageService; import org.apache.cassandra.utils.FBUtilities; -import static org.apache.cassandra.ServerTestUtils.daemonInitialization; import static org.junit.Assert.assertEquals; public class FrozenCollectionsTest extends CQLTester diff --git a/test/unit/org/apache/cassandra/cql3/validation/entities/JsonTest.java b/test/unit/org/apache/cassandra/cql3/validation/entities/JsonTest.java index d05391afccd0..2976c014bf67 100644 --- a/test/unit/org/apache/cassandra/cql3/validation/entities/JsonTest.java +++ b/test/unit/org/apache/cassandra/cql3/validation/entities/JsonTest.java @@ -39,7 +39,6 @@ import java.util.*; import java.util.concurrent.*; -import static org.apache.cassandra.ServerTestUtils.daemonInitialization; import static org.apache.cassandra.utils.Clock.Global.nanoTime; import static org.junit.Assert.assertEquals; import static org.junit.Assert.fail; diff --git a/test/unit/org/apache/cassandra/cql3/validation/entities/SecondaryIndexTest.java b/test/unit/org/apache/cassandra/cql3/validation/entities/SecondaryIndexTest.java index c1365e4cc36d..6888ff3a9314 100644 --- a/test/unit/org/apache/cassandra/cql3/validation/entities/SecondaryIndexTest.java +++ b/test/unit/org/apache/cassandra/cql3/validation/entities/SecondaryIndexTest.java @@ -25,12 +25,11 @@ import java.util.concurrent.CountDownLatch; import com.google.common.collect.ImmutableSet; + import org.apache.commons.lang3.StringUtils; import org.junit.BeforeClass; import org.junit.Test; -import org.apache.cassandra.index.internal.CassandraIndex; -import org.apache.cassandra.index.sai.StorageAttachedIndex; import org.apache.cassandra.schema.ColumnMetadata; import org.apache.cassandra.schema.TableMetadata; import org.apache.cassandra.config.DatabaseDescriptor; @@ -46,10 +45,12 @@ import org.apache.cassandra.db.rows.Row; import org.apache.cassandra.exceptions.ConfigurationException; import org.apache.cassandra.exceptions.SyntaxException; -import org.apache.cassandra.index.IndexNotAvailableException; +import org.apache.cassandra.index.IndexBuildInProgressException; import org.apache.cassandra.index.SecondaryIndexManager; import org.apache.cassandra.index.StubIndex; +import org.apache.cassandra.index.internal.CassandraIndex; import org.apache.cassandra.index.internal.CustomCassandraIndex; +import org.apache.cassandra.index.sai.StorageAttachedIndex; import org.apache.cassandra.index.sasi.SASIIndex; import org.apache.cassandra.schema.IndexMetadata; import org.apache.cassandra.service.ClientState; @@ -1090,7 +1091,7 @@ public void testIndexQueriesWithIndexNotReady() throws Throwable execute("SELECT value FROM %s WHERE value = 2"); fail(); } - catch (IndexNotAvailableException e) + catch (IndexBuildInProgressException e) { assertTrue(true); } @@ -1124,7 +1125,7 @@ public void testReadOnlyIndex() throws Throwable indexName = createIndexAsync("CREATE CUSTOM INDEX ON %s (value) USING '" + ReadOnlyOnFailureIndex.class.getName() + "'"); index = (ReadOnlyOnFailureIndex) getCurrentColumnFamilyStore().indexManager.getIndexByName(indexName); waitForIndexBuilds(indexName); - assertInvalidThrow(IndexNotAvailableException.class, "SELECT value FROM %s WHERE value = 1"); + assertInvalidThrow(IndexBuildInProgressException.class, "SELECT value FROM %s WHERE value = 1"); execute("INSERT INTO %s (pk, ck, value) VALUES (?, ?, ?)", 1, 1, 1); assertEquals(0, index.rowsInserted.size()); @@ -1164,7 +1165,7 @@ public void testWriteOnlyIndex() throws Throwable waitForIndexBuilds(indexName); execute("INSERT INTO %s (pk, ck, value) VALUES (?, ?, ?)", 1, 1, 1); assertEquals(1, index.rowsInserted.size()); - assertInvalidThrow(IndexNotAvailableException.class, "SELECT value FROM %s WHERE value = 1"); + assertInvalidThrow(IndexBuildInProgressException.class, "SELECT value FROM %s WHERE value = 1"); // Upon recovery, we can query data again index.reset(); diff --git a/test/unit/org/apache/cassandra/cql3/validation/entities/TupleTypeTest.java b/test/unit/org/apache/cassandra/cql3/validation/entities/TupleTypeTest.java index 90c9778df41d..2a7bb8e74a13 100644 --- a/test/unit/org/apache/cassandra/cql3/validation/entities/TupleTypeTest.java +++ b/test/unit/org/apache/cassandra/cql3/validation/entities/TupleTypeTest.java @@ -174,7 +174,7 @@ public void testTupleWithUnsetValues() throws Throwable createIndex("CREATE INDEX tuple_index ON %s (t)"); // select using unset - assertInvalidMessage("Invalid unset value for tuple field number 0", "SELECT * FROM %s WHERE k = ? and t = (?,?,?)", unset(), unset(), unset(), unset()); + assertInvalidMessage("Invalid unset value for tuple field number 0", "SELECT * FROM %s WHERE k = ? and t = (?,?,?)", 42, unset(), unset(), unset()); } /** diff --git a/test/unit/org/apache/cassandra/cql3/validation/entities/UserTypesTest.java b/test/unit/org/apache/cassandra/cql3/validation/entities/UserTypesTest.java index 530d55ba11cd..2d30f1946ca6 100644 --- a/test/unit/org/apache/cassandra/cql3/validation/entities/UserTypesTest.java +++ b/test/unit/org/apache/cassandra/cql3/validation/entities/UserTypesTest.java @@ -27,8 +27,6 @@ import org.apache.cassandra.exceptions.InvalidRequestException; import org.apache.cassandra.service.StorageService; -import static org.apache.cassandra.ServerTestUtils.daemonInitialization; - public class UserTypesTest extends CQLTester { @BeforeClass diff --git a/test/unit/org/apache/cassandra/cql3/validation/entities/VectorsTest.java b/test/unit/org/apache/cassandra/cql3/validation/entities/VectorsTest.java index 7154d0295ec7..2d77428dae59 100644 --- a/test/unit/org/apache/cassandra/cql3/validation/entities/VectorsTest.java +++ b/test/unit/org/apache/cassandra/cql3/validation/entities/VectorsTest.java @@ -25,8 +25,6 @@ import org.apache.cassandra.cql3.CQLTester; -import static org.apache.cassandra.ServerTestUtils.daemonInitialization; - public class VectorsTest extends CQLTester { @BeforeClass diff --git a/test/unit/org/apache/cassandra/cql3/validation/entities/VirtualTableTest.java b/test/unit/org/apache/cassandra/cql3/validation/entities/VirtualTableTest.java index 3b9ce8ac80b8..38a6df7c45c4 100644 --- a/test/unit/org/apache/cassandra/cql3/validation/entities/VirtualTableTest.java +++ b/test/unit/org/apache/cassandra/cql3/validation/entities/VirtualTableTest.java @@ -39,7 +39,6 @@ import org.junit.Test; import com.datastax.driver.core.exceptions.InvalidQueryException; -import org.apache.cassandra.ServerTestUtils; import org.apache.cassandra.cql3.CQLTester; import org.apache.cassandra.db.Mutation; import org.apache.cassandra.db.marshal.Int32Type; @@ -57,7 +56,6 @@ import org.apache.cassandra.service.StorageServiceMBean; import org.apache.cassandra.triggers.ITrigger; - import static java.lang.String.format; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; @@ -209,8 +207,6 @@ private static Pair updateColumn(Pair row, @BeforeClass public static void setUpVirtualTables() { - ServerTestUtils.daemonInitialization(); - TableMetadata vt1Metadata = TableMetadata.builder(KS_NAME, VT1_NAME) .kind(TableMetadata.Kind.VIRTUAL) .addPartitionKeyColumn("pk", UTF8Type.instance) @@ -1051,7 +1047,7 @@ public void testMBeansMethods() throws Throwable } @Test - public void testDisallowedFilteringOnRegularColumn() throws Throwable + public void testDisallowedFilteringOnRegularColumn() { try { @@ -1065,7 +1061,7 @@ public void testDisallowedFilteringOnRegularColumn() throws Throwable } @Test - public void testDisallowedFilteringOnClusteringColumn() throws Throwable + public void testDisallowedFilteringOnClusteringColumn() { try { @@ -1079,13 +1075,13 @@ public void testDisallowedFilteringOnClusteringColumn() throws Throwable } @Test - public void testAllowedFilteringOnRegularColumn() throws Throwable + public void testAllowedFilteringOnRegularColumn() { executeNet(format("SELECT * FROM %s.%s WHERE v2 = 5", KS_NAME, VT1_NAME)); } @Test - public void testAllowedFilteringOnClusteringColumn() throws Throwable + public void testAllowedFilteringOnClusteringColumn() { executeNet(format("SELECT * FROM %s.%s WHERE c = 'abc'", KS_NAME, VT1_NAME)); } diff --git a/test/unit/org/apache/cassandra/cql3/validation/operations/AggregationTest.java b/test/unit/org/apache/cassandra/cql3/validation/operations/AggregationTest.java index 24afbb7e802c..72bbc53b76b8 100644 --- a/test/unit/org/apache/cassandra/cql3/validation/operations/AggregationTest.java +++ b/test/unit/org/apache/cassandra/cql3/validation/operations/AggregationTest.java @@ -31,18 +31,8 @@ import java.util.concurrent.ThreadLocalRandom; import org.apache.commons.lang3.time.DateUtils; - import org.junit.Test; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import ch.qos.logback.classic.LoggerContext; -import ch.qos.logback.classic.joran.ReconfigureOnChangeTask; -import ch.qos.logback.classic.spi.TurboFilterList; -import ch.qos.logback.classic.turbo.ReconfigureOnChangeFilter; -import ch.qos.logback.classic.turbo.TurboFilter; -import org.apache.cassandra.schema.SchemaConstants; import org.apache.cassandra.cql3.CQLTester; import org.apache.cassandra.cql3.QueryProcessor; import org.apache.cassandra.cql3.UntypedResultSet; @@ -51,13 +41,13 @@ import org.apache.cassandra.db.marshal.TypeParser; import org.apache.cassandra.exceptions.FunctionExecutionException; import org.apache.cassandra.exceptions.InvalidRequestException; +import org.apache.cassandra.schema.SchemaConstants; import org.apache.cassandra.service.ClientState; import org.apache.cassandra.transport.Event.SchemaChange.Change; import org.apache.cassandra.transport.Event.SchemaChange.Target; import org.apache.cassandra.transport.ProtocolVersion; import org.apache.cassandra.transport.messages.ResultMessage; -import static ch.qos.logback.core.CoreConstants.RECONFIGURE_ON_CHANGE_TASK; import static org.assertj.core.api.Assertions.assertThatThrownBy; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; @@ -1893,91 +1883,48 @@ public void testLogbackReload() throws Throwable { // see https://issues.apache.org/jira/browse/CASSANDRA-11033 - // make logback's scan interval 1ms - boilerplate, but necessary for this test - configureLogbackScanPeriod(1L); - try - { - - createTable("CREATE TABLE %s (" + - " year int PRIMARY KEY," + - " country text," + - " title text)"); - - String[] countries = Locale.getISOCountries(); - ThreadLocalRandom rand = ThreadLocalRandom.current(); - for (int i = 0; i < 10000; i++) - { - execute("INSERT INTO %s (year, country, title) VALUES (1980,?,?)", - countries[rand.nextInt(countries.length)], - "title-" + i); - } - - String albumCountByCountry = createFunction(KEYSPACE, - "map,text,text", - "CREATE FUNCTION IF NOT EXISTS %s(state map,country text, album_title text)\n" + - " RETURNS NULL ON NULL INPUT\n" + - " RETURNS map\n" + - " LANGUAGE java\n" + - " AS $$\n" + - " if(state.containsKey(country)) {\n" + - " Long newCount = (Long)state.get(country) + 1;\n" + - " state.put(country, newCount);\n" + - " } else {\n" + - " state.put(country, 1L);\n" + - " }\n" + - " return state;\n" + - " $$;"); - - String releasesByCountry = createAggregate(KEYSPACE, - "text, text", - " CREATE AGGREGATE IF NOT EXISTS %s(text, text)\n" + - " SFUNC " + shortFunctionName(albumCountByCountry) + '\n' + - " STYPE map\n" + - " INITCOND { };"); - - long tEnd = System.currentTimeMillis() + 150; - while (System.currentTimeMillis() < tEnd) - { - execute("SELECT " + releasesByCountry + "(country,title) FROM %s WHERE year=1980"); - } - } - finally - { - configureLogbackScanPeriod(60000L); - } - } + createTable("CREATE TABLE %s (" + + " year int PRIMARY KEY," + + " country text," + + " title text)"); - private static void configureLogbackScanPeriod(long millis) - { - Logger l = LoggerFactory.getLogger(AggregationTest.class); - ch.qos.logback.classic.Logger logbackLogger = (ch.qos.logback.classic.Logger) l; - LoggerContext ctx = logbackLogger.getLoggerContext(); - TurboFilterList turboFilterList = ctx.getTurboFilterList(); - boolean done = false; - for (TurboFilter turboFilter : turboFilterList) + String[] countries = Locale.getISOCountries(); + ThreadLocalRandom rand = ThreadLocalRandom.current(); + for (int i = 0; i < 10000; i++) { - if (turboFilter instanceof ReconfigureOnChangeFilter) - { - ReconfigureOnChangeFilter reconfigureFilter = (ReconfigureOnChangeFilter) turboFilter; - reconfigureFilter.setContext(ctx); - reconfigureFilter.setRefreshPeriod(millis); - reconfigureFilter.stop(); - reconfigureFilter.start(); // start() sets the next check timestammp - done = true; - break; - } + execute("INSERT INTO %s (year, country, title) VALUES (1980,?,?)", + countries[rand.nextInt(countries.length)], + "title-" + i); } - ReconfigureOnChangeTask roct = (ReconfigureOnChangeTask) ctx.getObject(RECONFIGURE_ON_CHANGE_TASK); - if (roct != null) + String albumCountByCountry = createFunction(KEYSPACE, + "map,text,text", + "CREATE FUNCTION IF NOT EXISTS %s(state map,country text, album_title text)\n" + + " RETURNS NULL ON NULL INPUT\n" + + " RETURNS map\n" + + " LANGUAGE java\n" + + " AS $$\n" + + " if(state.containsKey(country)) {\n" + + " Long newCount = (Long)state.get(country) + 1;\n" + + " state.put(country, newCount);\n" + + " } else {\n" + + " state.put(country, 1L);\n" + + " }\n" + + " return state;\n" + + " $$;"); + + String releasesByCountry = createAggregate(KEYSPACE, + "text, text", + " CREATE AGGREGATE IF NOT EXISTS %s(text, text)\n" + + " SFUNC " + shortFunctionName(albumCountByCountry) + '\n' + + " STYPE map\n" + + " INITCOND { };"); + + long tEnd = System.currentTimeMillis() + 150; + while (System.currentTimeMillis() < tEnd) { - // New functionality in logback - they replaced ReconfigureOnChangeFilter (which runs in the logging code) - // with an async ReconfigureOnChangeTask - i.e. in a thread that does not become sandboxed. - // Let the test run anyway, just we cannot reconfigure it (and it is pointless to reconfigure). - return; + execute("SELECT " + releasesByCountry + "(country,title) FROM %s WHERE year=1980"); } - - assertTrue("ReconfigureOnChangeFilter not in logback's turbo-filter list - do that by adding scan=\"true\" to logback-test.xml's configuration element", done); } @Test diff --git a/test/unit/org/apache/cassandra/cql3/validation/operations/CQLVectorTest.java b/test/unit/org/apache/cassandra/cql3/validation/operations/CQLVectorTest.java index 2c6ce589dae4..6b034af523c2 100644 --- a/test/unit/org/apache/cassandra/cql3/validation/operations/CQLVectorTest.java +++ b/test/unit/org/apache/cassandra/cql3/validation/operations/CQLVectorTest.java @@ -203,7 +203,7 @@ public void sandwichBetweenUDTs() execute("INSERT INTO %s (pk, value) VALUES (0, {z: [{y:1}, {y:2}]})"); assertRows(execute("SELECT * FROM %s"), - row(0, userType("z", vector(userType("y", 1), userType("y", 2))))); + row(0, userType("z", vector((Object)userType("y", 1), (Object)userType("y", 2))))); } @Test diff --git a/test/unit/org/apache/cassandra/cql3/validation/operations/InsertTest.java b/test/unit/org/apache/cassandra/cql3/validation/operations/InsertTest.java index 70b8d3e9da77..f382675955c9 100644 --- a/test/unit/org/apache/cassandra/cql3/validation/operations/InsertTest.java +++ b/test/unit/org/apache/cassandra/cql3/validation/operations/InsertTest.java @@ -66,7 +66,7 @@ public void testInsertZeroDuration() throws Throwable row(12, expectedDuration), row(13, expectedDuration), row(14, expectedDuration)); - assertInvalidMessage("no viable alternative at input ')' (... b) VALUES (15, [P]))","INSERT INTO %s (a, b) VALUES (15, P)"); + assertInvalid("no viable alternative at input ')' (... b) VALUES (15, [P]))","INSERT INTO %s (a, b) VALUES (15, P)"); } @Test diff --git a/test/unit/org/apache/cassandra/cql3/validation/operations/SelectLimitTest.java b/test/unit/org/apache/cassandra/cql3/validation/operations/SelectLimitTest.java index 3cbc9d79c083..31ef09ec86e4 100644 --- a/test/unit/org/apache/cassandra/cql3/validation/operations/SelectLimitTest.java +++ b/test/unit/org/apache/cassandra/cql3/validation/operations/SelectLimitTest.java @@ -28,8 +28,6 @@ import org.apache.cassandra.dht.ByteOrderedPartitioner; import org.apache.cassandra.service.StorageService; -import static org.apache.cassandra.ServerTestUtils.daemonInitialization; - public class SelectLimitTest extends CQLTester { // This method will be ran instead of the CQLTester#setUpClass diff --git a/test/unit/org/apache/cassandra/db/CellSpecTest.java b/test/unit/org/apache/cassandra/db/CellSpecTest.java index a72a4650e422..b35ab0b0712e 100644 --- a/test/unit/org/apache/cassandra/db/CellSpecTest.java +++ b/test/unit/org/apache/cassandra/db/CellSpecTest.java @@ -147,11 +147,11 @@ public static Collection data() { tests.add(new NativeCell(allocator, order.getCurrent(), column, 1234, 1, 1, bbBytes, path)); }; // simple - fn.accept(ColumnMetadata.regularColumn(table, bytes("simple"), BytesType.instance), null); + fn.accept(ColumnMetadata.regularColumn(table, bytes("simple"), BytesType.instance, ColumnMetadata.NO_UNIQUE_ID), null); // complex // seems NativeCell does not allow CellPath.TOP, or CellPath.BOTTOM - fn.accept(ColumnMetadata.regularColumn(table, bytes("complex"), ListType.getInstance(BytesType.instance, true)), CellPath.create(TimeUUID.Serializer.instance.serialize(nextTimeUUID()))); + fn.accept(ColumnMetadata.regularColumn(table, bytes("complex"), ListType.getInstance(BytesType.instance, true), ColumnMetadata.NO_UNIQUE_ID), CellPath.create(TimeUUID.Serializer.instance.serialize(nextTimeUUID()))); return tests.stream().map(a -> new Object[] {a.getClass().getSimpleName() + ":" + (a.path() == null ? "simple" : "complex"), a}).collect(Collectors.toList()); } diff --git a/test/unit/org/apache/cassandra/db/CellTest.java b/test/unit/org/apache/cassandra/db/CellTest.java index 9540c8de7456..1e96e8c51a6b 100644 --- a/test/unit/org/apache/cassandra/db/CellTest.java +++ b/test/unit/org/apache/cassandra/db/CellTest.java @@ -50,10 +50,12 @@ import org.apache.cassandra.schema.TableMetadata; import org.apache.cassandra.serializers.MarshalException; import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.FBUtilities; import org.assertj.core.api.Assertions; import org.assertj.core.api.ThrowableAssert; import static java.util.Arrays.asList; +import static org.junit.Assert.assertEquals; public class CellTest { @@ -74,6 +76,8 @@ public class CellTest .addRegularColumn("v", IntegerType.instance) .addRegularColumn("m", MapType.getInstance(IntegerType.instance, IntegerType.instance, true)) .build(); + public static final ByteBuffer TEST_VALUE = ByteBufferUtil.bytes("a"); + @BeforeClass public static void defineSchema() throws ConfigurationException @@ -88,6 +92,7 @@ private static ColumnMetadata fakeColumn(String name, AbstractType type) "fakeTable", ColumnIdentifier.getInterned(name, false), type, + ColumnMetadata.NO_UNIQUE_ID, ColumnMetadata.NO_POSITION, ColumnMetadata.Kind.REGULAR, null); @@ -284,6 +289,86 @@ public void testExpiringCellReconile() Assert.assertEquals(-1, testExpiring("val", "b", 2, 1, null, "a", null, 1)); } + + public static void assertCellsEqual(Cell cellA, Cell cellB) + { + assertEquals(cellA.timestamp(), cellB.timestamp()); + assertEquals(cellA.ttl(), cellB.ttl()); + assertEquals(cellA.localDeletionTime(), cellB.localDeletionTime()); + assertEquals(cellA.buffer(), cellB.buffer()); + } + + static void checkCommutes(ColumnMetadata cmd, long timestamp, long tsDiff, int ttl, int ttlDiff, long nowInSeconds, int nowDiff) + { + long timestampA = timestamp; + long timestampB = timestampA + tsDiff; + int ttlA = ttl; + int ttlB = ttl + ttlDiff; + long nowInSecsA = nowInSeconds; + long nowInSecsB = nowInSecsA + nowDiff; + if (nowInSecsA < 0 || nowInSecsB < 0) + return; + + Cell cellA = ttlA == 0 ? BufferCell.tombstone(cmd, timestampA, nowInSecsA) : + ttlA < 0 ? BufferCell.live(cmd, timestampA, TEST_VALUE) : + BufferCell.expiring(cmd, timestampA, ttlA, nowInSecsA, TEST_VALUE); + Cell cellB = ttlB == 0 ? BufferCell.tombstone(cmd, timestampB, nowInSecsB) : + ttlB < 0 ? BufferCell.live(cmd, timestampB, TEST_VALUE) : + BufferCell.expiring(cmd, timestampB, ttlB, nowInSecsB, TEST_VALUE); + + Cell cellAB = Cells.reconcile(cellA, cellB); + Cell cellBA = Cells.reconcile(cellB, cellA); + + assertCellsEqual(cellAB, cellBA); + } + + @Test + public void checkSameValueDifferentLivenessCommutes() + { + ColumnMetadata cmd = fakeColumn("c", UTF8Type.instance); + long[] tsDiffs = new long[] {0L, + 1L, // microsecond + 1000L, // millisecond + 1000000L, // second + 60000000L}; // minute + int[] ttls = new int[] { -1, 0, 1, 3600, 24 * 3600, 7 * 24 * 3600, 60 * 24 * 3600, 366 * 24 * 3600 }; + int[] ttlDiffs = new int[] { 0, 1, 60, 3600, 24 * 3600, 7 * 24 * 3600, 60 * 24 * 3600, 366 * 24 * 3600 }; + + long nowInSeconds = FBUtilities.nowInSeconds(); + long timestamp = FBUtilities.timestampMicros(); + + for (long tsDiff: tsDiffs) + { + for (int ttl: ttls) + { + for (int ttlDiff : ttlDiffs) + { + for (Integer nowDiff : ttlDiffs) + checkCommutes(cmd, timestamp, tsDiff, ttl, ttlDiff, nowInSeconds, nowDiff); + } + } + } + } + + // Checks that reconciling a cell with a smaller TTL reconcile commutatively + // Similar to rewriting data retrieved with SELECT v, TTL(v), WRITETIMESTAMP(v) with + // INSERT SET v=? USING TTL ? AND TIMESTAMP ? + @Test + public void rewriteCellWithSmallerTTL() + { + ColumnMetadata cmd = fakeColumn("c", UTF8Type.instance); + int[] nowDiffs = new int[] { 0, 1, 60, 3600, 24 * 3600, 7 * 24 * 3600, 60 * 24 * 3600, 366 * 24 * 3600 }; + long timestamp = FBUtilities.timestampMicros(); + long nowInSeconds = FBUtilities.nowInSeconds(); + int ttl = 3600; + + for (Integer nowDiff : nowDiffs) + { + checkCommutes(cmd, timestamp, 0L, ttl, -nowDiff, nowInSeconds, nowDiff); + } + } + + class SimplePurger implements DeletionPurger { private final long gcBefore; diff --git a/test/unit/org/apache/cassandra/db/CleanupTransientTest.java b/test/unit/org/apache/cassandra/db/CleanupTransientTest.java index d1a275c65722..33877c9b8e0d 100644 --- a/test/unit/org/apache/cassandra/db/CleanupTransientTest.java +++ b/test/unit/org/apache/cassandra/db/CleanupTransientTest.java @@ -26,8 +26,10 @@ import org.junit.BeforeClass; import org.junit.Test; +import org.apache.cassandra.CassandraTestBase; +import org.apache.cassandra.CassandraTestBase.PrepareServerNoRegister; +import org.apache.cassandra.CassandraTestBase.UseRandomPartitioner; import org.apache.cassandra.SchemaLoader; -import org.apache.cassandra.ServerTestUtils; import org.apache.cassandra.Util; import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.db.compaction.CompactionManager; @@ -45,10 +47,11 @@ import static org.junit.Assert.assertEquals; -public class CleanupTransientTest +@PrepareServerNoRegister +@UseRandomPartitioner +public class CleanupTransientTest extends CassandraTestBase { private static final IPartitioner partitioner = RandomPartitioner.instance; - private static IPartitioner oldPartitioner; public static final int LOOPS = 200; public static final String KEYSPACE1 = "CleanupTest1"; @@ -70,10 +73,7 @@ public class CleanupTransientTest @BeforeClass public static void setup() throws Exception { - DatabaseDescriptor.daemonInitialization(); DatabaseDescriptor.setTransientReplicationEnabledUnsafe(true); - oldPartitioner = StorageService.instance.setPartitionerUnsafe(partitioner); - ServerTestUtils.prepareServerNoRegister(); SchemaLoader.createKeyspace(KEYSPACE1, KeyspaceParams.simple("2/1"), SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD1), diff --git a/test/unit/org/apache/cassandra/db/ClusteringHeapSizeTest.java b/test/unit/org/apache/cassandra/db/ClusteringHeapSizeTest.java index 3ba4ae650e13..07561760f34b 100644 --- a/test/unit/org/apache/cassandra/db/ClusteringHeapSizeTest.java +++ b/test/unit/org/apache/cassandra/db/ClusteringHeapSizeTest.java @@ -25,6 +25,7 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; +import org.apache.cassandra.db.marshal.ValueAccessor; import org.apache.cassandra.utils.ObjectSizes; import org.apache.cassandra.utils.concurrent.ImmediateFuture; import org.apache.cassandra.utils.concurrent.OpOrder; @@ -64,13 +65,14 @@ public void unsharedHeapSizeExcludingDataLTEUnsharedHeapSize() @Test public void testSingletonClusteringHeapSize() { - Clustering clustering = this.clustering.accessor().factory().staticClustering(); + ValueAccessor.ObjectFactory factory = this.clustering.ensureAccessorFactorySupport().accessor().factory(); + Clustering clustering = factory.staticClustering(); Assertions.assertThat(clustering.unsharedHeapSize()) .isEqualTo(0); Assertions.assertThat(clustering.unsharedHeapSizeExcludingData()) .isEqualTo(0); - clustering = this.clustering.accessor().factory().clustering(); + clustering = factory.clustering(); Assertions.assertThat(clustering.unsharedHeapSize()) .isEqualTo(0); Assertions.assertThat(clustering.unsharedHeapSizeExcludingData()) diff --git a/test/unit/org/apache/cassandra/db/ClusteringPrefixTest.java b/test/unit/org/apache/cassandra/db/ClusteringPrefixTest.java index a295b2278694..0bbe535348d0 100644 --- a/test/unit/org/apache/cassandra/db/ClusteringPrefixTest.java +++ b/test/unit/org/apache/cassandra/db/ClusteringPrefixTest.java @@ -180,7 +180,7 @@ public void testRetainable(ValueAccessor.ObjectFactory factory, public void testRetainable(ValueAccessor.ObjectFactory factory, Function allocator, - Function, ClusteringPrefix> mapper) + Function, ClusteringPrefix> mapper) { ClusteringPrefix[] clusterings = new ClusteringPrefix[] { diff --git a/test/unit/org/apache/cassandra/db/ColumnFamilyStoreTest.java b/test/unit/org/apache/cassandra/db/ColumnFamilyStoreTest.java index 0febbe660cfa..c134eaed4316 100644 --- a/test/unit/org/apache/cassandra/db/ColumnFamilyStoreTest.java +++ b/test/unit/org/apache/cassandra/db/ColumnFamilyStoreTest.java @@ -842,7 +842,7 @@ private Memtable fakeMemTableWithMinTS(ColumnFamilyStore cfs, long minTS) { @Override - public long put(PartitionUpdate update, UpdateTransaction indexer, Group opGroup) + public long put(PartitionUpdate update, UpdateTransaction indexer, Group opGroup, boolean assumeMissing) { return 0; } diff --git a/test/unit/org/apache/cassandra/db/ColumnsTest.java b/test/unit/org/apache/cassandra/db/ColumnsTest.java index 810e26f945ed..00a9b1a443c3 100644 --- a/test/unit/org/apache/cassandra/db/ColumnsTest.java +++ b/test/unit/org/apache/cassandra/db/ColumnsTest.java @@ -485,18 +485,18 @@ private static void addClustering(List names, List resul private static void addRegular(List names, List results) { for (String name : names) - results.add(ColumnMetadata.regularColumn(TABLE_METADATA, bytes(name), UTF8Type.instance)); + results.add(ColumnMetadata.regularColumn(TABLE_METADATA, bytes(name), UTF8Type.instance, ColumnMetadata.NO_UNIQUE_ID)); } private static void addComplex(List names, List results) { for (String name : names) - results.add(ColumnMetadata.regularColumn(TABLE_METADATA, bytes(name), SetType.getInstance(UTF8Type.instance, true))); + results.add(ColumnMetadata.regularColumn(TABLE_METADATA, bytes(name), SetType.getInstance(UTF8Type.instance, true), ColumnMetadata.NO_UNIQUE_ID)); } private static ColumnMetadata def(String name, AbstractType type, ColumnMetadata.Kind kind) { - return new ColumnMetadata(TABLE_METADATA, bytes(name), type, ColumnMetadata.NO_POSITION, kind, null); + return new ColumnMetadata(TABLE_METADATA, bytes(name), type, ColumnMetadata.NO_UNIQUE_ID, ColumnMetadata.NO_POSITION, kind, null); } private static TableMetadata mock(Columns columns) diff --git a/test/unit/org/apache/cassandra/db/CounterMutationVerbHandlerOutOfRangeTest.java b/test/unit/org/apache/cassandra/db/CounterMutationVerbHandlerOutOfRangeTest.java index 9ae05e3e4ee1..da58eccfa02e 100644 --- a/test/unit/org/apache/cassandra/db/CounterMutationVerbHandlerOutOfRangeTest.java +++ b/test/unit/org/apache/cassandra/db/CounterMutationVerbHandlerOutOfRangeTest.java @@ -37,7 +37,7 @@ import org.apache.cassandra.db.rows.Row; import org.apache.cassandra.distributed.test.log.ClusterMetadataTestHelper; import org.apache.cassandra.exceptions.InvalidRoutingException; -import org.apache.cassandra.exceptions.RequestFailureReason; +import org.apache.cassandra.exceptions.RequestFailure; import org.apache.cassandra.metrics.StorageMetrics; import org.apache.cassandra.net.Message; import org.apache.cassandra.net.MessagingService; @@ -51,12 +51,18 @@ import org.apache.cassandra.tcm.membership.NodeState; import org.apache.cassandra.utils.FBUtilities; -import static org.apache.cassandra.distributed.test.log.ClusterMetadataTestHelper.*; -import static org.apache.cassandra.utils.ByteBufferUtil.bytes; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; +import static org.apache.cassandra.distributed.test.log.ClusterMetadataTestHelper.MessageDelivery; +import static org.apache.cassandra.distributed.test.log.ClusterMetadataTestHelper.broadcastAddress; +import static org.apache.cassandra.distributed.test.log.ClusterMetadataTestHelper.bytesToken; +import static org.apache.cassandra.distributed.test.log.ClusterMetadataTestHelper.node1; +import static org.apache.cassandra.distributed.test.log.ClusterMetadataTestHelper.randomInt; +import static org.apache.cassandra.distributed.test.log.ClusterMetadataTestHelper.registerOutgoingMessageSink; +import static org.apache.cassandra.utils.ByteBufferUtil.bytes; + public class CounterMutationVerbHandlerOutOfRangeTest { private static final String KEYSPACE = "CounterCacheTest"; @@ -168,7 +174,7 @@ private static void verifyFailureResponse(ListenableFuture mess MessageDelivery response = messageSink.get(100, TimeUnit.MILLISECONDS); assertEquals(Verb.FAILURE_RSP, response.message.verb()); assertEquals(broadcastAddress, response.message.from()); - assertTrue(response.message.payload instanceof RequestFailureReason); + assertTrue(response.message.payload instanceof RequestFailure); assertEquals(messageId, response.message.id()); assertEquals(node1, response.to); } diff --git a/test/unit/org/apache/cassandra/db/DiskBoundaryManagerTest.java b/test/unit/org/apache/cassandra/db/DiskBoundaryManagerTest.java index 0b25f19d99c4..606bdf33deac 100644 --- a/test/unit/org/apache/cassandra/db/DiskBoundaryManagerTest.java +++ b/test/unit/org/apache/cassandra/db/DiskBoundaryManagerTest.java @@ -186,7 +186,7 @@ public void testGetDataDirectoriesForFiles() SSTableReader disk1Boundary = MockSchema.sstable(gen++, (long)sstableFirstDisk1.getTokenValue(), (long)tokens.get(0).getTokenValue(), 0, mock); SSTableReader disk2Full = MockSchema.sstable(gen++, (long)tokens.get(0).nextValidToken().getTokenValue(), (long)tokens.get(1).getTokenValue(), 0, mock); - SSTableReader disk3Full = MockSchema.sstable(gen++, (long)tokens.get(1).nextValidToken().getTokenValue(), (long)partitioner.getMaximumToken().getTokenValue(), 0, mock); + SSTableReader disk3Full = MockSchema.sstable(gen++, (long)tokens.get(1).nextValidToken().getTokenValue(), (long)partitioner.getMaximumTokenForSplitting().getTokenValue(), 0, mock); Assert.assertEquals(tableDirs, mock.getDirectoriesForFiles(ImmutableSet.of())); Assert.assertEquals(Lists.newArrayList(tableDirs.get(0)), mock.getDirectoriesForFiles(ImmutableSet.of(containedDisk1))); diff --git a/test/unit/org/apache/cassandra/db/LivenessInfoTest.java b/test/unit/org/apache/cassandra/db/LivenessInfoTest.java index 193649727865..124231810f54 100644 --- a/test/unit/org/apache/cassandra/db/LivenessInfoTest.java +++ b/test/unit/org/apache/cassandra/db/LivenessInfoTest.java @@ -76,6 +76,11 @@ public void testSupersedes() first = LivenessInfo.withExpirationTime(100, LivenessInfo.EXPIRED_LIVENESS_TTL, nowInSeconds + 1); second = LivenessInfo.withExpirationTime(100, LivenessInfo.EXPIRED_LIVENESS_TTL, nowInSeconds); assertSupersedes(first, second); + + // rewritten expiring with the same expiration time and a lower TTL, take the lower TTL as likely to be more recent + first = LivenessInfo.withExpirationTime(100, 4, nowInSeconds); + second = LivenessInfo.withExpirationTime(100, 5, nowInSeconds); + assertSupersedes(first, second); } @Test diff --git a/test/unit/org/apache/cassandra/db/MutationVerbHandlerOutOfRangeTest.java b/test/unit/org/apache/cassandra/db/MutationVerbHandlerOutOfRangeTest.java index 0421e26aa575..77ef47192ebb 100644 --- a/test/unit/org/apache/cassandra/db/MutationVerbHandlerOutOfRangeTest.java +++ b/test/unit/org/apache/cassandra/db/MutationVerbHandlerOutOfRangeTest.java @@ -37,7 +37,7 @@ import org.apache.cassandra.db.rows.Row; import org.apache.cassandra.distributed.test.log.ClusterMetadataTestHelper; import org.apache.cassandra.exceptions.InvalidRoutingException; -import org.apache.cassandra.exceptions.RequestFailureReason; +import org.apache.cassandra.exceptions.RequestFailure; import org.apache.cassandra.metrics.StorageMetrics; import org.apache.cassandra.net.IVerbHandler; import org.apache.cassandra.net.Message; @@ -51,11 +51,17 @@ import org.apache.cassandra.tcm.membership.NodeState; import org.apache.cassandra.utils.FBUtilities; -import static org.apache.cassandra.distributed.test.log.ClusterMetadataTestHelper.*; -import static org.apache.cassandra.utils.ByteBufferUtil.bytes; import static org.junit.Assert.assertEquals; import static org.junit.Assert.fail; +import static org.apache.cassandra.distributed.test.log.ClusterMetadataTestHelper.MessageDelivery; +import static org.apache.cassandra.distributed.test.log.ClusterMetadataTestHelper.broadcastAddress; +import static org.apache.cassandra.distributed.test.log.ClusterMetadataTestHelper.bytesToken; +import static org.apache.cassandra.distributed.test.log.ClusterMetadataTestHelper.node1; +import static org.apache.cassandra.distributed.test.log.ClusterMetadataTestHelper.randomInt; +import static org.apache.cassandra.distributed.test.log.ClusterMetadataTestHelper.registerOutgoingMessageSink; +import static org.apache.cassandra.utils.ByteBufferUtil.bytes; + public class MutationVerbHandlerOutOfRangeTest { private static final String TEST_NAME = "mutation_vh_test_"; @@ -190,7 +196,7 @@ private void getAndVerifyResponse(ListenableFuture messageSink, MessageDelivery response = messageSink.get(100, TimeUnit.MILLISECONDS); assertEquals(isOutOfRange ? Verb.FAILURE_RSP : Verb.MUTATION_RSP, response.message.verb()); assertEquals(broadcastAddress, response.message.from()); - assertEquals(isOutOfRange, response.message.payload instanceof RequestFailureReason); + assertEquals(isOutOfRange, response.message.payload instanceof RequestFailure); assertEquals(messageId, response.message.id()); assertEquals(node1, response.to); assertEquals(startingTotalMetricCount + (isOutOfRange ? 1 : 0), StorageMetrics.totalOpsForInvalidToken.getCount()); diff --git a/test/unit/org/apache/cassandra/db/NativeCellTest.java b/test/unit/org/apache/cassandra/db/NativeCellTest.java index cf2ba4ed0dd1..ebc956947e3b 100644 --- a/test/unit/org/apache/cassandra/db/NativeCellTest.java +++ b/test/unit/org/apache/cassandra/db/NativeCellTest.java @@ -17,6 +17,7 @@ */ package org.apache.cassandra.db; +import java.io.IOException; import java.nio.ByteBuffer; import java.util.Iterator; import java.util.Random; @@ -30,6 +31,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.cassandra.io.util.DataOutputBuffer; import org.apache.cassandra.schema.ColumnMetadata; import org.apache.cassandra.cql3.CQLTester; import org.apache.cassandra.cql3.ColumnIdentifier; @@ -37,6 +39,7 @@ import org.apache.cassandra.db.marshal.SetType; import org.apache.cassandra.db.marshal.UTF8Type; import org.apache.cassandra.db.rows.*; +import org.apache.cassandra.utils.ByteBufferUtil; import org.apache.cassandra.utils.concurrent.ImmediateFuture; import org.apache.cassandra.utils.concurrent.OpOrder; import org.apache.cassandra.utils.memory.HeapCloner; @@ -63,7 +66,7 @@ public static void setUp() } @Test - public void testCells() + public void testCells() throws Exception { for (int run = 0 ; run < 1000 ; run++) { @@ -119,6 +122,7 @@ private static ColumnMetadata rndcol() "", ColumnIdentifier.getInterned(uuid.toString(), false), isComplex ? new SetType<>(BytesType.instance, true) : BytesType.instance, + ColumnMetadata.NO_UNIQUE_ID, -1, ColumnMetadata.Kind.REGULAR, null); @@ -152,7 +156,7 @@ private static int sanesize(int randomsize) return Math.min(Math.max(1, randomsize), 1 << 26); } - private static void test(Row row) + private static void test(Row row) throws Exception { Row nrow = row.clone(nativeAllocator.cloner(group)); Row brow = row.clone(HeapCloner.instance); @@ -160,21 +164,170 @@ private static void test(Row row) Assert.assertEquals(row, brow); Assert.assertEquals(nrow, brow); - Assert.assertEquals(row.clustering(), nrow.clustering()); - Assert.assertEquals(row.clustering(), brow.clustering()); - Assert.assertEquals(nrow.clustering(), brow.clustering()); + Digest rowDigest = Digest.forReadResponse(); + Digest nativeRowDigest = Digest.forReadResponse(); + Digest byteBufferRowDigest = Digest.forReadResponse(); + row.digest(rowDigest); + nrow.digest(nativeRowDigest); + brow.digest(byteBufferRowDigest); + byte[] rowDigestValue = rowDigest.digest(); + Assert.assertArrayEquals(rowDigestValue, nativeRowDigest.digest()); + Assert.assertArrayEquals(rowDigestValue, byteBufferRowDigest.digest()); - Assert.assertEquals(row.clustering().dataSize(), nrow.clustering().dataSize()); - Assert.assertEquals(row.clustering().dataSize(), brow.clustering().dataSize()); + Assert.assertEquals(row.dataSize(), nrow.dataSize()); + Assert.assertEquals(row.dataSize(), brow.dataSize()); - ClusteringComparator comparator = new ClusteringComparator(UTF8Type.instance); - Assert.assertEquals(0, comparator.compare(row.clustering(), nrow.clustering())); - Assert.assertEquals(0, comparator.compare(row.clustering(), brow.clustering())); - Assert.assertEquals(0, comparator.compare(nrow.clustering(), brow.clustering())); + assertClustering(row, brow, nrow); assertCellsDataSize(row, nrow); assertCellsDataSize(row, brow); + assertCellsWrittenToOutput(row, nrow); + assertCellsWrittenToOutput(row, brow); + + assertCellsSlicing(row, nrow); + assertCellsSlicing(row, brow); + } + + private static void assertClustering(Row row, Row byteBufferRow, Row nativeRow) throws Exception + { + Assert.assertEquals(row.clustering(), nativeRow.clustering()); + Assert.assertEquals(row.clustering(), byteBufferRow.clustering()); + Assert.assertEquals(nativeRow.clustering(), byteBufferRow.clustering()); + + ClusteringComparator comparator = new ClusteringComparator(UTF8Type.instance); + Assert.assertEquals(0, comparator.compare(row.clustering(), nativeRow.clustering())); + Assert.assertEquals(0, comparator.compare(row.clustering(), byteBufferRow.clustering())); + Assert.assertEquals(0, comparator.compare(nativeRow.clustering(), byteBufferRow.clustering())); + Assert.assertEquals(0, comparator.compare(nativeRow.clustering(), row.clustering())); + Assert.assertEquals(0, comparator.compare(nativeRow.clustering(), nativeRow.clustering())); + + + Assert.assertEquals(row.clustering().size(), nativeRow.clustering().size()); + Assert.assertEquals(row.clustering().size(), byteBufferRow.clustering().size()); + + assertByteBufferArrayEquals(row.clustering().getBufferArray(), nativeRow.clustering().getBufferArray()); + assertByteBufferArrayEquals(row.clustering().getBufferArray(), byteBufferRow.clustering().getBufferArray()); + + assertRawValuesEquals(row.clustering(), nativeRow.clustering()); + assertRawValuesEquals(row.clustering(), byteBufferRow.clustering()); + + + for (int i = 0; i < row.clustering().size(); i++) + { + Assert.assertEquals(row.clustering().isEmpty(i), byteBufferRow.clustering().isEmpty(i)); + Assert.assertEquals(row.clustering().isEmpty(i), nativeRow.clustering().isEmpty(i)); + + Assert.assertEquals(row.clustering().isNull(i), byteBufferRow.clustering().isNull(i)); + Assert.assertEquals(row.clustering().isNull(i), nativeRow.clustering().isNull(i)); + } + + assertClusteringElementSizes(row.clustering(), byteBufferRow.clustering()); + assertClusteringElementSizes(row.clustering(), nativeRow.clustering()); + + assertClusteringElementWrittenToOutput(row.clustering(), byteBufferRow.clustering()); + assertClusteringElementWrittenToOutput(row.clustering(), nativeRow.clustering()); + + assertClusteringSlicing(row.clustering(), byteBufferRow.clustering()); + assertClusteringSlicing(row.clustering(), nativeRow.clustering()); + + } + + private static void assertRawValuesEquals(Clustering c1, Clustering c2) + { + V1[] rawValues1 = c1.getRawValues(); + V2[] rawValues2 = c2.getRawValues(); + Assert.assertEquals(rawValues1.length, rawValues2.length); + for (int i = 0; i < c1.size(); i++) + { + if (rawValues1[i] != null) + Assert.assertEquals(0, c1.accessor().compare(rawValues1[i], rawValues2[i], c2.accessor())); + } + } + + private static void assertClusteringElementSizes(Clustering c1, Clustering c2) + { + for (int i = 0; i < c1.size(); i++) + { + if (c1.get(i) != null) + { + int sizeC1 = c1.accessor().size(c1.get(i)); + int sizeC2 = c2.accessor().size(c2.get(i)); + Assert.assertEquals(sizeC1, sizeC2); + } + } + } + + private static void assertClusteringElementWrittenToOutput(Clustering c1, Clustering c2) throws IOException + { + for (int i = 0; i < c1.size(); i++) + { + if (c1.get(i) != null) + { + DataOutputBuffer outputC1 = new DataOutputBuffer(c1.dataSize()); + DataOutputBuffer outputC2 = new DataOutputBuffer(c2.dataSize()); + c1.accessor().write(c1.get(i), outputC1); + c2.accessor().write(c2.get(i), outputC2); + Assert.assertArrayEquals(outputC1.toByteArray(), outputC2.toByteArray()); + } + } + } + + private static void assertClusteringSlicing(Clustering c1, Clustering c2) throws IOException + { + for (int i = 0; i < c1.size(); i++) + { + if (c1.get(i) != null) + { + int offset = c1.accessor().size(c1.get(i)) / 3; + int length = c1.accessor().size(c1.get(i)) / 2; + V1 slice1 = c1.accessor().slice(c1.get(i), offset, length); + V2 slice2 = c2.accessor().slice(c2.get(i), offset, length); + Assert.assertEquals(0, c1.accessor().compare(slice1, slice2, c2.accessor())); + Assert.assertEquals(0, c2.accessor().compare(slice2, slice1, c1.accessor())); + } + } + } + + private static void assertByteBufferArrayEquals(ByteBuffer[] array1, ByteBuffer[] array2) { + Assert.assertEquals(array1.length, array2.length); + for (int i = 0; i < array1.length; i++) { + if (array1[i] != null) + Assert.assertEquals(0, ByteBufferUtil.compareUnsigned(array1[i], array2[i])); + } + } + + private static void assertCellsWrittenToOutput(Row row1, Row row2) throws IOException + { + Iterator> row1Iterator = row1.cells().iterator(); + Iterator> row2Iterator = row2.cells().iterator(); + while (row1Iterator.hasNext()) + { + Cell cell1 = row1Iterator.next(); + Cell cell2 = row2Iterator.next(); + DataOutputBuffer output1 = new DataOutputBuffer(cell1.dataSize()); + DataOutputBuffer output2 = new DataOutputBuffer(cell2.dataSize()); + cell1.accessor().write(cell1.value(), output1); + cell2.accessor().write(cell2.value(), output2); + Assert.assertArrayEquals(output1.toByteArray(), output2.toByteArray()); + } + } + + private static void assertCellsSlicing(Row row1, Row row2) + { + Iterator> row1Iterator = row1.cells().iterator(); + Iterator> row2Iterator = row2.cells().iterator(); + while (row1Iterator.hasNext()) + { + Cell cell1 = row1Iterator.next(); + Cell cell2 = row2Iterator.next(); + int offset = cell1.accessor().size(cell1.value()) / 3; + int length = cell1.accessor().size(cell1.value()) / 2; + Object slice1 = cell1.accessor().slice(cell1.value(), offset, length); + Object slice2 = cell2.accessor().slice(cell2.value(), offset, length); + Assert.assertEquals(0, cell1.accessor().compare(slice1, slice2, cell2.accessor())); + Assert.assertEquals(0, cell2.accessor().compare(slice2, slice1, cell1.accessor())); + } } private static void assertCellsDataSize(Row row1, Row row2) diff --git a/test/unit/org/apache/cassandra/db/ReadCommandVerbHandlerOutOfRangeTest.java b/test/unit/org/apache/cassandra/db/ReadCommandVerbHandlerOutOfRangeTest.java index 847326ca4d5a..180aa45d4fe3 100644 --- a/test/unit/org/apache/cassandra/db/ReadCommandVerbHandlerOutOfRangeTest.java +++ b/test/unit/org/apache/cassandra/db/ReadCommandVerbHandlerOutOfRangeTest.java @@ -37,7 +37,7 @@ import org.apache.cassandra.dht.Token; import org.apache.cassandra.distributed.test.log.ClusterMetadataTestHelper; import org.apache.cassandra.exceptions.InvalidRoutingException; -import org.apache.cassandra.exceptions.RequestFailureReason; +import org.apache.cassandra.exceptions.RequestFailure; import org.apache.cassandra.metrics.StorageMetrics; import org.apache.cassandra.net.Message; import org.apache.cassandra.net.MessagingService; @@ -50,7 +50,12 @@ import org.apache.cassandra.utils.ByteBufferUtil; import org.apache.cassandra.utils.FBUtilities; -import static org.apache.cassandra.distributed.test.log.ClusterMetadataTestHelper.*; +import static org.apache.cassandra.distributed.test.log.ClusterMetadataTestHelper.MessageDelivery; +import static org.apache.cassandra.distributed.test.log.ClusterMetadataTestHelper.broadcastAddress; +import static org.apache.cassandra.distributed.test.log.ClusterMetadataTestHelper.bytesToken; +import static org.apache.cassandra.distributed.test.log.ClusterMetadataTestHelper.node1; +import static org.apache.cassandra.distributed.test.log.ClusterMetadataTestHelper.randomInt; +import static org.apache.cassandra.distributed.test.log.ClusterMetadataTestHelper.registerOutgoingMessageSink; import static org.apache.cassandra.net.Verb.READ_REQ; import static org.junit.Assert.assertEquals; @@ -184,7 +189,7 @@ private void getAndVerifyResponse(ListenableFuture messageSink, MessageDelivery response = messageSink.get(100, TimeUnit.MILLISECONDS); assertEquals(isOutOfRange ? Verb.FAILURE_RSP : Verb.READ_RSP, response.message.verb()); assertEquals(broadcastAddress, response.message.from()); - assertEquals(isOutOfRange, response.message.payload instanceof RequestFailureReason); + assertEquals(isOutOfRange, response.message.payload instanceof RequestFailure); assertEquals(messageId, response.message.id()); assertEquals(node1, response.to); assertEquals(startingTotalMetricCount + (isOutOfRange ? 1 : 0), StorageMetrics.totalOpsForInvalidToken.getCount()); @@ -213,6 +218,7 @@ private static class StubReadCommand extends SinglePartitionReadCommand false, 0, false, + PotentialTxnConflicts.DISALLOW, tmd, FBUtilities.nowInSeconds(), ColumnFilter.all(tmd), @@ -248,6 +254,7 @@ private static class StubRangeReadCommand extends PartitionRangeReadCommand false, 0, false, + PotentialTxnConflicts.DISALLOW, tmd, FBUtilities.nowInSeconds(), ColumnFilter.all(tmd), diff --git a/test/unit/org/apache/cassandra/db/ReadCommandVerbHandlerTest.java b/test/unit/org/apache/cassandra/db/ReadCommandVerbHandlerTest.java index f71b1ff7ca6b..c00501105138 100644 --- a/test/unit/org/apache/cassandra/db/ReadCommandVerbHandlerTest.java +++ b/test/unit/org/apache/cassandra/db/ReadCommandVerbHandlerTest.java @@ -173,6 +173,7 @@ private static class TrackingSinglePartitionReadCommand extends SinglePartitionR false, 0, false, + PotentialTxnConflicts.DISALLOW, metadata, FBUtilities.nowInSeconds(), ColumnFilter.all(metadata), diff --git a/test/unit/org/apache/cassandra/db/ReadResponseTest.java b/test/unit/org/apache/cassandra/db/ReadResponseTest.java index 988677a83b64..ebe1cf0322b4 100644 --- a/test/unit/org/apache/cassandra/db/ReadResponseTest.java +++ b/test/unit/org/apache/cassandra/db/ReadResponseTest.java @@ -255,6 +255,7 @@ private static class StubReadCommand extends SinglePartitionReadCommand isDigest, 0, false, + PotentialTxnConflicts.DISALLOW, metadata, FBUtilities.nowInSeconds(), ColumnFilter.all(metadata), diff --git a/test/unit/org/apache/cassandra/db/RecoveryManagerFlushedTest.java b/test/unit/org/apache/cassandra/db/RecoveryManagerFlushedTest.java index 4db364f24ed2..42ba2ed6f174 100644 --- a/test/unit/org/apache/cassandra/db/RecoveryManagerFlushedTest.java +++ b/test/unit/org/apache/cassandra/db/RecoveryManagerFlushedTest.java @@ -97,6 +97,7 @@ public static void defineSchema() throws ConfigurationException /* test that commit logs do not replay flushed data */ public void testWithFlush() throws Exception { + // Flush everything that may be in the commit log now to start fresh CompactionManager.instance.disableAutoCompaction(); for (String ks : Schema.instance.getKeyspaces()) Util.flush(Keyspace.open(ks)); diff --git a/test/unit/org/apache/cassandra/db/RowCacheTest.java b/test/unit/org/apache/cassandra/db/RowCacheTest.java index ec5e6176c168..f7a87b134144 100644 --- a/test/unit/org/apache/cassandra/db/RowCacheTest.java +++ b/test/unit/org/apache/cassandra/db/RowCacheTest.java @@ -83,6 +83,7 @@ public static void defineSchema() throws ConfigurationException SchemaLoader.standardCFMD(KEYSPACE_CACHED, CF_CACHED).caching(CachingParams.CACHE_EVERYTHING), SchemaLoader.standardCFMD(KEYSPACE_CACHED, CF_CACHEDINT, 1, IntegerType.instance) .caching(new CachingParams(true, 100))); + StorageService.instance.initServer(); } @AfterClass @@ -299,7 +300,6 @@ public void testRowCacheLoad() throws Exception @Test public void testRowCacheCleanup() throws Exception { - StorageService.instance.initServer(); CacheService.instance.setRowCacheCapacityInMB(1); rowCacheLoad(100, Integer.MAX_VALUE, 1000); @@ -321,7 +321,6 @@ public void testRowCacheCleanup() throws Exception @Test public void testInvalidateRowCache() throws Exception { - StorageService.instance.initServer(); CacheService.instance.setRowCacheCapacityInMB(1); rowCacheLoad(100, Integer.MAX_VALUE, 1000); diff --git a/test/unit/org/apache/cassandra/db/SchemaCQLHelperTest.java b/test/unit/org/apache/cassandra/db/SchemaCQLHelperTest.java index d045584412c6..da5f6d1853cb 100644 --- a/test/unit/org/apache/cassandra/db/SchemaCQLHelperTest.java +++ b/test/unit/org/apache/cassandra/db/SchemaCQLHelperTest.java @@ -22,6 +22,7 @@ import com.google.common.collect.ImmutableMap; import com.google.common.io.Files; +import org.apache.cassandra.service.accord.fastpath.FastPathStrategy; import org.junit.Assert; import org.junit.Test; @@ -309,6 +310,7 @@ public void testCfmOptionsCQL() .compaction(CompactionParams.lcs(Collections.singletonMap("sstable_size_in_mb", "1"))) .compression(CompressionParams.lz4(1 << 16, 1 << 15)) .crcCheckChance(0.3) + .fastPath(FastPathStrategy.simple()) .defaultTimeToLive(4) .gcGraceSeconds(5) .minIndexInterval(6) @@ -317,7 +319,7 @@ public void testCfmOptionsCQL() .speculativeRetry(SpeculativeRetryPolicy.fromString("always")) .additionalWritePolicy(SpeculativeRetryPolicy.fromString("always")) .extensions(ImmutableMap.of("ext1", ByteBuffer.wrap("val1".getBytes()))) - .recordColumnDrop(ColumnMetadata.regularColumn(keyspace, table, "reg1", AsciiType.instance), + .recordColumnDrop(ColumnMetadata.regularColumn(keyspace, table, "reg1", AsciiType.instance, ColumnMetadata.NO_UNIQUE_ID), droppedTimestamp); SchemaLoader.createKeyspace(keyspace, KeyspaceParams.simple(1), builder); @@ -336,6 +338,7 @@ public void testCfmOptionsCQL() " AND compression = {'chunk_length_in_kb': '64', 'class': 'org.apache.cassandra.io.compress.LZ4Compressor', 'min_compress_ratio': '2.0'}\n" + " AND memtable = 'default'\n" + " AND crc_check_chance = 0.3\n" + + " AND fast_path = 'simple'\n" + " AND default_time_to_live = 4\n" + " AND extensions = {'ext1': 0x76616c31}\n" + " AND gc_grace_seconds = 5\n" + @@ -344,6 +347,8 @@ public void testCfmOptionsCQL() " AND memtable_flush_period_in_ms = 8\n" + " AND min_index_interval = 6\n" + " AND read_repair = 'BLOCKING'\n" + + " AND transactional_mode = 'off'\n" + + " AND transactional_migration_from = 'none'\n" + " AND speculative_retry = 'ALWAYS';" )); } diff --git a/test/unit/org/apache/cassandra/db/compaction/CancelCompactionsTest.java b/test/unit/org/apache/cassandra/db/compaction/CancelCompactionsTest.java index 757ad1c08670..3c792442e4c1 100644 --- a/test/unit/org/apache/cassandra/db/compaction/CancelCompactionsTest.java +++ b/test/unit/org/apache/cassandra/db/compaction/CancelCompactionsTest.java @@ -464,7 +464,7 @@ public void testSubrangeCompactionWith2i() throws Throwable try (LifecycleTransaction txn = idx.getTracker().tryModify(idx.getLiveSSTables(), OperationType.COMPACTION)) { IPartitioner partitioner = getCurrentColumnFamilyStore().getPartitioner(); - getCurrentColumnFamilyStore().forceCompactionForTokenRange(Collections.singleton(new Range<>(partitioner.getMinimumToken(), partitioner.getMaximumToken()))); + getCurrentColumnFamilyStore().forceCompactionForTokenRange(Collections.singleton(new Range<>(partitioner.getMinimumToken(), partitioner.getMaximumTokenForSplitting()))); } } diff --git a/test/unit/org/apache/cassandra/db/compaction/CompactionAccordIteratorsTest.java b/test/unit/org/apache/cassandra/db/compaction/CompactionAccordIteratorsTest.java new file mode 100644 index 000000000000..0d6d11216ffd --- /dev/null +++ b/test/unit/org/apache/cassandra/db/compaction/CompactionAccordIteratorsTest.java @@ -0,0 +1,401 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.db.compaction; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Random; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicLong; +import java.util.concurrent.locks.LockSupport; +import java.util.function.Consumer; +import java.util.stream.Collectors; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.Iterators; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import accord.api.Agent; +import accord.api.Key; +import accord.api.Result; +import accord.local.CheckedCommands; +import accord.local.Command; +import accord.local.CommandStore; +import accord.local.DurableBefore; +import accord.local.RedundantBefore; +import accord.local.StoreParticipants; +import accord.local.cfk.CommandsForKey; +import accord.local.cfk.Serialize; +import accord.primitives.Ballot; +import accord.primitives.Deps; +import accord.primitives.FullRoute; +import accord.primitives.PartialDeps; +import accord.primitives.PartialTxn; +import accord.primitives.Ranges; +import accord.primitives.Route; +import accord.primitives.SaveStatus; +import accord.primitives.Seekable; +import accord.primitives.Status; +import accord.primitives.Txn; +import accord.primitives.Txn.Kind; +import accord.primitives.TxnId; +import accord.primitives.Writes; +import org.apache.cassandra.SchemaLoader; +import org.apache.cassandra.config.CassandraRelevantProperties; +import org.apache.cassandra.cql3.QueryProcessor; +import org.apache.cassandra.cql3.UntypedResultSet; +import org.apache.cassandra.db.ColumnFamilyStore; +import org.apache.cassandra.db.ColumnFamilyStore.FlushReason; +import org.apache.cassandra.db.Keyspace; +import org.apache.cassandra.db.compaction.CompactionIteratorTest.Scanner; +import org.apache.cassandra.db.partitions.ImmutableBTreePartition; +import org.apache.cassandra.db.partitions.Partition; +import org.apache.cassandra.db.rows.Row; +import org.apache.cassandra.db.rows.UnfilteredRowIterator; +import org.apache.cassandra.distributed.shared.WithProperties; +import org.apache.cassandra.io.sstable.ISSTableScanner; +import org.apache.cassandra.io.sstable.format.SSTableReader; +import org.apache.cassandra.journal.TestParams; +import org.apache.cassandra.schema.KeyspaceParams; +import org.apache.cassandra.schema.SchemaConstants; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.service.StorageService; +import org.apache.cassandra.service.accord.AccordCommandStore; +import org.apache.cassandra.service.accord.AccordExecutor; +import org.apache.cassandra.service.accord.AccordService; +import org.apache.cassandra.service.accord.AccordTestUtils; +import org.apache.cassandra.service.accord.IAccordService; +import org.apache.cassandra.service.accord.api.TokenKey; +import org.apache.cassandra.utils.FBUtilities; +import org.apache.cassandra.utils.Pair; + +import static accord.local.KeyHistory.SYNC; +import static accord.local.PreLoadContext.contextFor; +import static accord.local.RedundantStatus.SomeStatus.GC_BEFORE_AND_LOCALLY_APPLIED; +import static accord.primitives.Routable.Domain.Range; +import static accord.primitives.Timestamp.Flag.HLC_BOUND; +import static accord.primitives.Timestamp.Flag.SHARD_BOUND; +import static accord.utils.async.AsyncChains.getUninterruptibly; +import static org.apache.cassandra.Util.spinAssertEquals; +import static org.apache.cassandra.cql3.statements.schema.CreateTableStatement.parse; +import static org.apache.cassandra.schema.SchemaConstants.ACCORD_KEYSPACE_NAME; +import static org.apache.cassandra.service.accord.AccordKeyspace.COMMANDS_FOR_KEY; +import static org.apache.cassandra.service.accord.AccordKeyspace.CFKAccessor; +import static org.junit.Assert.assertEquals; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +public class CompactionAccordIteratorsTest +{ + private static final Logger logger = LoggerFactory.getLogger(CompactionAccordIteratorsTest.class); + private static final long CLOCK_START = 44; + private static final long HLC_START = 41; + private static final int NODE = 1; + private static final int EPOCH = 1; + private static final AtomicLong clock = new AtomicLong(CLOCK_START); + private static final TxnId LT_TXN_ID = AccordTestUtils.txnId(EPOCH, HLC_START, NODE); + private static final TxnId TXN_ID = AccordTestUtils.txnId(EPOCH, LT_TXN_ID.hlc() + 1, NODE); + private static final TxnId SECOND_TXN_ID = AccordTestUtils.txnId(EPOCH, TXN_ID.hlc() + 1, NODE, Kind.Read); + private static final TxnId RANGE_TXN_ID = AccordTestUtils.txnId(EPOCH, TXN_ID.hlc() + 2, NODE, Kind.Read, Range); + private static final TxnId GT_TXN_ID = SECOND_TXN_ID.addFlag(HLC_BOUND); + // For CommandsForKey where we test with two commands + private static final TxnId[] TXN_IDS = new TxnId[]{ TXN_ID, SECOND_TXN_ID }; + private static final TxnId GT_SECOND_TXN_ID = AccordTestUtils.txnId(EPOCH, SECOND_TXN_ID.hlc() + 1, NODE).addFlag(HLC_BOUND); + + static ColumnFamilyStore commandsForKey; + static TableMetadata table; + static FullRoute route; + Random random; + + /* + * Whether to compact all tables at once in a single merge or forcing two random tables + * to merge at a time + */ + private boolean singleCompaction; + + @BeforeClass + public static void beforeClass() throws Throwable + { + SchemaLoader.prepareServer(); + // Schema doesn't matter since this is a metadata only test + SchemaLoader.createKeyspace("ks", KeyspaceParams.simple(1), + parse("CREATE TABLE tbl (k int, c int, v int, PRIMARY KEY (k, c)) WITH transactional_mode = 'full'", "ks")); + StorageService.instance.initServer(); + + commandsForKey = ColumnFamilyStore.getIfExists(SchemaConstants.ACCORD_KEYSPACE_NAME, COMMANDS_FOR_KEY); + commandsForKey.disableAutoCompaction(); + + table = ColumnFamilyStore.getIfExists("ks", "tbl").metadata(); + route = AccordTestUtils.keys(table, 42).toRoute(AccordTestUtils.key(table, 42).toUnseekable()); + } + + @Before + public void setUp() + { + // This attempt at determinism doesn't work because the order of the SSTableScanners is not determinisitc + long seed = System.nanoTime(); + logger.info("Seed " + seed + "L"); + random = new Random(seed); + } + + @Test + public void testAccordCommandsForKeyPurgerSingleCompaction() throws Throwable + { + testAccordCommandsForKeyPurger(true); + } + + @Test + public void testAccordCommandsForKeyPurgerMultipleCompactions() throws Throwable + { + testAccordCommandsForKeyPurger(false); + } + + private void testAccordCommandsForKeyPurger(boolean singleCompaction) throws Throwable + { + this.singleCompaction = singleCompaction; + testAccordCommandsForKeyPurger(RedundantBefore.EMPTY, expectedAccordCommandsForKeyNoChange()); + testAccordCommandsForKeyPurger(redundantBefore(LT_TXN_ID), expectedAccordCommandsForKeyNoChange()); + // will erase one more than expected as converted to ExclusiveSyncPoint id which is > base id + testAccordCommandsForKeyPurger(redundantBefore(TXN_ID), expectedAccordCommandsForKeyEraseOne()); + testAccordCommandsForKeyPurger(redundantBefore(GT_TXN_ID), expectedAccordCommandsForKeyEraseAll()); + testAccordCommandsForKeyPurger(redundantBefore(GT_SECOND_TXN_ID), expectedAccordCommandsForKeyEraseAll()); + } + + private static Consumer> expectedAccordCommandsForKeyNoChange() + { + return partitions -> { + assertEquals(1, partitions.size()); + Partition partition = partitions.get(0); + TokenKey partitionKey = new TokenKey(partition.metadata().id, partition.partitionKey().getToken()); + CommandsForKey cfk = CFKAccessor.fromRow(partitionKey, ((Row) partition.unfilteredIterator().next())); + assertEquals(TXN_IDS.length, cfk.size()); + for (int i = 0; i < TXN_IDS.length; ++i) + assertEquals(TXN_IDS[i], cfk.txnId(i)); + }; + } + + private static Consumer> expectedAccordTimestampsForKeyEraseOne() + { + return partitions -> assertEquals(0, partitions.size()); + } + + private static Consumer> expectedAccordCommandsForKeyEraseOne() + { + return partitions -> { + assertEquals(1, partitions.size()); + Partition partition = partitions.get(0); + assertEquals(1, Iterators.size(partition.unfilteredIterator())); + UnfilteredRowIterator rows = partition.unfilteredIterator(); +// assertEquals(TXN_IDS[1], CommandsForKeysAccessor.getTimestamp((Row)rows.next())); + }; + } + + private static Consumer> expectedAccordCommandsForKeyEraseAll() + { + return partitions -> assertEquals(0, partitions.size()); + } + + private void testAccordCommandsForKeyPurger(RedundantBefore redundantBefore, Consumer> expectedResult) throws Throwable + { + testWithCommandStore((commandStore) -> { + IAccordService mockAccordService = mockAccordService(commandStore, redundantBefore, DurableBefore.EMPTY); + ColumnFamilyStore cfs = ColumnFamilyStore.getIfExists(ACCORD_KEYSPACE_NAME, COMMANDS_FOR_KEY); + List result = compactCFS(mockAccordService, cfs); + expectedResult.accept(result); + }, true); + } + + private static RedundantBefore redundantBefore(TxnId txnId) + { + Ranges ranges = AccordTestUtils.fullRange(AccordTestUtils.keys(table, 42)); + txnId = txnId.as(Kind.ExclusiveSyncPoint, Range).addFlag(SHARD_BOUND); + return RedundantBefore.create(ranges, Long.MIN_VALUE, Long.MAX_VALUE, txnId, GC_BEFORE_AND_LOCALLY_APPLIED, LT_TXN_ID.as(Range)); + } + + enum DurableBeforeType + { + UNIVERSAL, + MAJORITY, + NOT_DURABLE, + EMPTY + } + + private static DurableBefore durableBefore(DurableBeforeType durableBeforeType) + { + Ranges ranges = AccordTestUtils.fullRange(AccordTestUtils.keys(table, 42)); + switch (durableBeforeType) + { + case UNIVERSAL: + return DurableBefore.create(ranges, GT_TXN_ID, GT_TXN_ID); + case MAJORITY: + return DurableBefore.create(ranges, GT_TXN_ID, LT_TXN_ID); + case NOT_DURABLE: + return DurableBefore.create(ranges, LT_TXN_ID, LT_TXN_ID); + case EMPTY: + return DurableBefore.EMPTY; + default: + throw new IllegalStateException(); + } + } + + private static IAccordService mockAccordService(CommandStore commandStore, RedundantBefore redundantBefore, DurableBefore durableBefore) + { + IAccordService mockAccordService = mock(IAccordService.class); + IAccordService.AccordCompactionInfo compactionInfo = new IAccordService.AccordCompactionInfo(commandStore.id(), redundantBefore, commandStore.unsafeGetRangesForEpoch(), ((AccordCommandStore)commandStore).tableId()); + IAccordService.AccordCompactionInfos compactionInfos = new IAccordService.AccordCompactionInfos(durableBefore, 0); + compactionInfos.put(commandStore.id(), compactionInfo); + when(mockAccordService.agent()).thenReturn(mock(Agent.class)); + when(mockAccordService.getCompactionInfo()).thenReturn(compactionInfos); + when(mockAccordService.journalConfiguration()).thenReturn(new TestParams()); + return mockAccordService; + } + + interface TestWithCommandStore + { + void test(AccordCommandStore commandStore) throws Throwable; + } + + + private static void flush(AccordCommandStore commandStore) + { + commandStore.executeBlocking(() -> { + // clear cache and wait for post-eviction writes to complete + try (AccordExecutor.ExclusiveGlobalCaches cache = commandStore.executor().lockCaches();) + { + long cacheSize = cache.global.capacity(); + cache.global.setCapacity(0); + cache.global.setCapacity(cacheSize); + } + }); + commandsForKey.forceBlockingFlush(FlushReason.UNIT_TESTS); + while (commandStore.executor().hasTasks()) + LockSupport.parkNanos(TimeUnit.MILLISECONDS.toNanos(100)); + } + + private void testWithCommandStore(TestWithCommandStore test, boolean additionalCommand) throws Throwable + { + try (WithProperties wp = new WithProperties().set(CassandraRelevantProperties.DTEST_ACCORD_JOURNAL_SANITY_CHECK_ENABLED, "true")) + { + testWithCommandStoreInternal(test, additionalCommand); + } + } + + private void testWithCommandStoreInternal(TestWithCommandStore test, boolean additionalCommand) throws Throwable + { + Keyspace.open(ACCORD_KEYSPACE_NAME).getColumnFamilyStores().forEach(ColumnFamilyStore::truncateBlocking); + ((AccordService) AccordService.instance()).journal().truncateForTesting(); + clock.set(CLOCK_START); + AccordCommandStore commandStore = AccordTestUtils.createAccordCommandStore(clock::incrementAndGet, "ks", "tbl"); + TxnId[] txnIds = additionalCommand ? TXN_IDS : new TxnId[]{ TXN_ID }; + Txn writeTxn = AccordTestUtils.createWriteTxn(42); + Txn readTxn = AccordTestUtils.createTxn(42); + Seekable key = writeTxn.keys().get(0); + for (TxnId txnId : txnIds) + { + Txn txn = txnId.kind().isWrite() ? writeTxn : readTxn; + PartialDeps partialDeps = Deps.NONE.intersecting(AccordTestUtils.fullRange(txn)); + PartialTxn partialTxn = txn.slice(commandStore.unsafeGetRangesForEpoch().currentRanges(), true); + Route partialRoute = route.slice(commandStore.unsafeGetRangesForEpoch().currentRanges()); + getUninterruptibly(commandStore.execute(contextFor(txnId, route, SYNC), safe -> { + CheckedCommands.preaccept(safe, txnId, partialTxn, route, (a, b) -> {}); + }).beginAsResult()); + flush(commandStore); + getUninterruptibly(commandStore.execute(contextFor(txnId, route, SYNC), safe -> { + CheckedCommands.accept(safe, txnId, Ballot.ZERO, partialRoute, txnId, partialDeps, (a, b) -> {}); + }).beginAsResult()); + flush(commandStore); + getUninterruptibly(commandStore.execute(contextFor(txnId, route, SYNC), safe -> { + CheckedCommands.commit(safe, SaveStatus.Stable, Ballot.ZERO, txnId, route, partialTxn, txnId, partialDeps, (a, b) -> {}); + }).beginAsResult()); + flush(commandStore); + getUninterruptibly(commandStore.execute(contextFor(txnId, route, SYNC), safe -> { + Pair result = AccordTestUtils.processTxnResultDirect(safe, txnId, partialTxn, txnId); + CheckedCommands.apply(safe, txnId, route, txnId, partialDeps, partialTxn, result.left, result.right, (a, b) -> {}); + }).beginAsResult()); + flush(commandStore); + // The apply chain is asychronous, so it is easiest to just spin until it is applied + // in order to have the updated state in the system table + spinAssertEquals(true, 5, () -> { + return getUninterruptibly(commandStore.submit(contextFor(txnId, route, SYNC), safe -> { + StoreParticipants participants = StoreParticipants.all(route); + Command command = safe.get(txnId, participants).current(); + return command.hasBeen(Status.Applied); + }).beginAsResult()); + }); + flush(commandStore); + } + UntypedResultSet commandsForKeyTable = QueryProcessor.executeInternal("SELECT * FROM " + ACCORD_KEYSPACE_NAME + "." + COMMANDS_FOR_KEY + ";"); + logger.info(commandsForKeyTable.toStringUnsafe()); + assertEquals(1, commandsForKeyTable.size()); + CommandsForKey cfk = Serialize.fromBytes(((Key) key).toUnseekable(), commandsForKeyTable.iterator().next().getBytes("data")); + assertEquals(txnIds.length, cfk.size()); + for (int i = 0; i < txnIds.length; ++i) + assertEquals(txnIds[i], cfk.txnId(i)); + test.test(commandStore); + } + + private List compactCFS(IAccordService mockAccordService, ColumnFamilyStore cfs) + { + List scanners = cfs.getLiveSSTables().stream().map(SSTableReader::getScanner).collect(Collectors.toList()); + int numScanners = scanners.size(); + List result = null; + do + { + List outputPartitions = new ArrayList<>(); + List nextInputScanners = new ArrayList<>(); + if (singleCompaction || numScanners == 1) + { + nextInputScanners = ImmutableList.copyOf(scanners); + scanners.clear(); + } + else + { + // Process the rows only two sstables at a time to force compacting random slices of command state + nextInputScanners.add(scanners.remove(random.nextInt(scanners.size()))); + nextInputScanners.add(scanners.remove(random.nextInt(scanners.size()))); + } + try (CompactionController controller = new CompactionController(ColumnFamilyStore.getIfExists(ACCORD_KEYSPACE_NAME, cfs.name), Collections.emptySet(), 0); + CompactionIterator compactionIterator = new CompactionIterator(OperationType.COMPACTION, nextInputScanners, controller, FBUtilities.nowInSeconds(), null, ActiveCompactionsTracker.NOOP, null, mockAccordService)) + { + while (compactionIterator.hasNext()) + { + try (UnfilteredRowIterator partition = compactionIterator.next()) + { + outputPartitions.add(ImmutableBTreePartition.create(partition)); + } + } + } + + if (scanners.isEmpty()) + result = outputPartitions; + else + scanners.add(random.nextInt(scanners.size()), new Scanner(cfs.metadata(), outputPartitions.stream().map(Partition::unfilteredIterator).collect(Collectors.toList()))); + } while (!scanners.isEmpty()); + + verify(mockAccordService, times(singleCompaction || numScanners == 1 ? 1 : numScanners - 1)).getCompactionInfo(); + return result; + } +} diff --git a/test/unit/org/apache/cassandra/db/compaction/CompactionIteratorTest.java b/test/unit/org/apache/cassandra/db/compaction/CompactionIteratorTest.java index 076ef9876f87..d09c9551730c 100644 --- a/test/unit/org/apache/cassandra/db/compaction/CompactionIteratorTest.java +++ b/test/unit/org/apache/cassandra/db/compaction/CompactionIteratorTest.java @@ -17,18 +17,25 @@ */ package org.apache.cassandra.db.compaction; -import static org.apache.cassandra.config.CassandraRelevantProperties.DIAGNOSTIC_SNAPSHOT_INTERVAL_NANOS; -import static org.apache.cassandra.db.transform.DuplicateRowCheckerTest.assertCommandIssued; -import static org.apache.cassandra.db.transform.DuplicateRowCheckerTest.makeRow; -import static org.apache.cassandra.db.transform.DuplicateRowCheckerTest.partition; -import static org.junit.Assert.*; - -import java.util.*; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.NavigableMap; +import java.util.Random; +import java.util.Set; +import java.util.TreeMap; import java.util.regex.Matcher; import java.util.regex.Pattern; -import com.google.common.collect.*; - +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Iterables; +import com.google.common.collect.Iterators; +import com.google.common.collect.Lists; import org.junit.BeforeClass; import org.junit.Test; @@ -44,17 +51,30 @@ import org.apache.cassandra.db.marshal.Int32Type; import org.apache.cassandra.db.marshal.UTF8Type; import org.apache.cassandra.db.partitions.AbstractUnfilteredPartitionIterator; -import org.apache.cassandra.db.rows.*; +import org.apache.cassandra.db.rows.RangeTombstoneBoundaryMarker; +import org.apache.cassandra.db.rows.Unfiltered; +import org.apache.cassandra.db.rows.UnfilteredRowIterator; +import org.apache.cassandra.db.rows.UnfilteredRowsGenerator; import org.apache.cassandra.io.sstable.ISSTableScanner; import org.apache.cassandra.io.sstable.format.SSTableReader; import org.apache.cassandra.locator.InetAddressAndPort; import org.apache.cassandra.net.Message; +import org.apache.cassandra.net.MessagingService; import org.apache.cassandra.schema.KeyspaceParams; import org.apache.cassandra.schema.TableMetadata; -import org.apache.cassandra.net.MessagingService; import org.apache.cassandra.utils.ByteBufferUtil; import org.apache.cassandra.utils.FBUtilities; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + +import static org.apache.cassandra.config.CassandraRelevantProperties.DIAGNOSTIC_SNAPSHOT_INTERVAL_NANOS; +import static org.apache.cassandra.db.transform.DuplicateRowCheckerTest.assertCommandIssued; +import static org.apache.cassandra.db.transform.DuplicateRowCheckerTest.makeRow; +import static org.apache.cassandra.db.transform.DuplicateRowCheckerTest.partition; + public class CompactionIteratorTest extends CQLTester { @@ -274,7 +294,7 @@ private List compact(Iterable> sources, Iterable
  • listToIterator(list, kk))); try (CompactionController controller = new Controller(Keyspace.openAndGetStore(metadata), transformedSources, GC_BEFORE); CompactionIterator iter = new CompactionIterator(OperationType.COMPACTION, - Lists.transform(content, x -> new Scanner(x)), + Lists.transform(content, x -> new Scanner(metadata, x)), controller, NOW, null)) { List result = new ArrayList<>(); @@ -336,7 +356,7 @@ public void transformTest() transformedSources.put(kk, Iterables.transform(tombstoneLists, list -> listToIterator(list, kk))); try (CompactionController controller = new Controller(Keyspace.openAndGetStore(metadata), transformedSources, GC_BEFORE); CompactionIterator iter = new CompactionIterator(OperationType.COMPACTION, - Lists.transform(content, x -> new Scanner(x)), + Lists.transform(content, x -> new Scanner(metadata, x)), controller, NOW, null)) { assertTrue(iter.hasNext()); @@ -369,7 +389,7 @@ public void transformPartitionTest() transformedSources.put(kk, Iterables.transform(tombstoneLists, list -> listToIterator(list, kk))); try (CompactionController controller = new Controller(Keyspace.openAndGetStore(metadata), transformedSources, GC_BEFORE); CompactionIterator iter = new CompactionIterator(OperationType.COMPACTION, - Lists.transform(content, x -> new Scanner(x)), + Lists.transform(content, x -> new Scanner(metadata, x)), controller, NOW, null)) { iter.stop(); @@ -404,12 +424,14 @@ public Iterable shadowSources(DecoratedKey key, boolean t } } - class Scanner extends AbstractUnfilteredPartitionIterator implements ISSTableScanner + static class Scanner extends AbstractUnfilteredPartitionIterator implements ISSTableScanner { Iterator iter; + TableMetadata metadata; - Scanner(Iterable content) + Scanner(TableMetadata metadata, Iterable content) { + this.metadata = metadata; iter = content.iterator(); } @@ -500,7 +522,7 @@ private void iterate(Unfiltered...unfiltereds) DecoratedKey key = cfs.getPartitioner().decorateKey(ByteBufferUtil.bytes("key")); try (CompactionController controller = new CompactionController(cfs, Integer.MAX_VALUE); UnfilteredRowIterator rows = partition(cfs.metadata(), key, false, unfiltereds); - ISSTableScanner scanner = new Scanner(Collections.singletonList(rows)); + ISSTableScanner scanner = new Scanner(cfs.metadata(), Collections.singletonList(rows)); CompactionIterator iter = new CompactionIterator(OperationType.COMPACTION, Collections.singletonList(scanner), controller, FBUtilities.nowInSeconds(), null)) diff --git a/test/unit/org/apache/cassandra/db/compaction/CompactionStrategyManagerTest.java b/test/unit/org/apache/cassandra/db/compaction/CompactionStrategyManagerTest.java index 19b758f73eac..20420aeef3f5 100644 --- a/test/unit/org/apache/cassandra/db/compaction/CompactionStrategyManagerTest.java +++ b/test/unit/org/apache/cassandra/db/compaction/CompactionStrategyManagerTest.java @@ -42,6 +42,9 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.cassandra.CassandraTestBase; +import org.apache.cassandra.CassandraTestBase.SchemaLoaderPrepareServer; +import org.apache.cassandra.CassandraTestBase.UseByteOrderedPartitioner; import org.apache.cassandra.SchemaLoader; import org.apache.cassandra.Util; import org.apache.cassandra.config.DatabaseDescriptor; @@ -53,15 +56,12 @@ import org.apache.cassandra.db.PartitionPosition; import org.apache.cassandra.db.RowUpdateBuilder; import org.apache.cassandra.db.compaction.AbstractStrategyHolder.GroupedSSTableContainer; -import org.apache.cassandra.dht.ByteOrderedPartitioner; -import org.apache.cassandra.dht.IPartitioner; import org.apache.cassandra.io.sstable.format.SSTableReader; import org.apache.cassandra.io.util.File; import org.apache.cassandra.notifications.SSTableAddedNotification; import org.apache.cassandra.notifications.SSTableDeletingNotification; import org.apache.cassandra.schema.CompactionParams; import org.apache.cassandra.schema.KeyspaceParams; -import org.apache.cassandra.service.StorageService; import org.apache.cassandra.tcm.Epoch; import org.apache.cassandra.utils.ByteBufferUtil; @@ -73,27 +73,27 @@ import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; -public class CompactionStrategyManagerTest +/** + * We use byte ordered partitioner in this test to be able to easily infer an SSTable + * disk assignment based on its generation - See {@link this#getSSTableIndex(Integer[], SSTableReader)} + */ +@SchemaLoaderPrepareServer +@UseByteOrderedPartitioner +public class CompactionStrategyManagerTest extends CassandraTestBase { private static final Logger logger = LoggerFactory.getLogger(CompactionStrategyManagerTest.class); private static final String KS_PREFIX = "Keyspace1"; private static final String TABLE_PREFIX = "CF_STANDARD"; - private static IPartitioner originalPartitioner; private static boolean backups; @BeforeClass public static void beforeClass() { - SchemaLoader.prepareServer(); backups = DatabaseDescriptor.isIncrementalBackupsEnabled(); DatabaseDescriptor.setIncrementalBackupsEnabled(false); - /** - * We use byte ordered partitioner in this test to be able to easily infer an SSTable - * disk assignment based on its generation - See {@link this#getSSTableIndex(Integer[], SSTableReader)} - */ - originalPartitioner = StorageService.instance.setPartitionerUnsafe(ByteOrderedPartitioner.instance); + SchemaLoader.createKeyspace(KS_PREFIX, KeyspaceParams.simple(1), SchemaLoader.standardCFMD(KS_PREFIX, TABLE_PREFIX) @@ -110,7 +110,6 @@ public void setUp() throws Exception @AfterClass public static void afterClass() { - DatabaseDescriptor.setPartitionerUnsafe(originalPartitioner); DatabaseDescriptor.setIncrementalBackupsEnabled(backups); } diff --git a/test/unit/org/apache/cassandra/db/compaction/CompactionsBytemanTest.java b/test/unit/org/apache/cassandra/db/compaction/CompactionsBytemanTest.java index d338c8b1690e..e49847b443f2 100644 --- a/test/unit/org/apache/cassandra/db/compaction/CompactionsBytemanTest.java +++ b/test/unit/org/apache/cassandra/db/compaction/CompactionsBytemanTest.java @@ -179,7 +179,7 @@ public void testStopSubRangeCompactionRepaired() throws Throwable { testStopCompactionRepaired((cfs) -> { Collection> ranges = Collections.singleton(new Range<>(cfs.getPartitioner().getMinimumToken(), - cfs.getPartitioner().getMaximumToken())); + cfs.getPartitioner().getMaximumTokenForSplitting())); CompactionManager.instance.forceCompactionForTokenRange(cfs, ranges); }); } diff --git a/test/unit/org/apache/cassandra/db/compaction/CompactionsCQLTest.java b/test/unit/org/apache/cassandra/db/compaction/CompactionsCQLTest.java index ce0422217281..b18a20ec98b7 100644 --- a/test/unit/org/apache/cassandra/db/compaction/CompactionsCQLTest.java +++ b/test/unit/org/apache/cassandra/db/compaction/CompactionsCQLTest.java @@ -542,10 +542,10 @@ public void testAbortNotifications() throws Throwable getCurrentColumnFamilyStore().forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS); LeveledCompactionStrategy lcs = (LeveledCompactionStrategy) getCurrentColumnFamilyStore().getCompactionStrategyManager().getUnrepairedUnsafe().first(); - LeveledCompactionTask lcsTask; + AbstractCompactionTask lcsTask; while (true) { - lcsTask = (LeveledCompactionTask) Iterables.getOnlyElement(lcs.getNextBackgroundTasks(0), null); + lcsTask = Iterables.getOnlyElement(lcs.getNextBackgroundTasks(0), null); if (lcsTask != null) { lcsTask.execute(CompactionManager.instance.active); @@ -591,7 +591,7 @@ public void testAbortNotifications() throws Throwable // ignored } - lcsTask = (LeveledCompactionTask) Iterables.getOnlyElement(lcs.getNextBackgroundTasks(0), null); + lcsTask = Iterables.getOnlyElement(lcs.getNextBackgroundTasks(0), null); try { assertNotNull(lcsTask); @@ -921,7 +921,7 @@ private void loadTestSStables(ColumnFamilyStore cfs, File ksDir) throws IOExcept File tableDir = new File(ksDir, cfs.name); Assert.assertTrue("The table directory " + tableDir + " was not found", tableDir.isDirectory()); for (File file : tableDir.tryList()) - LegacySSTableTest.copyFile(cfDir, file); + LegacySSTableTest.copyFileToDir(file, cfDir); } cfs.loadNewSSTables(); } diff --git a/test/unit/org/apache/cassandra/db/compaction/LeveledCompactionStrategyTest.java b/test/unit/org/apache/cassandra/db/compaction/LeveledCompactionStrategyTest.java index 0a339b5b2dc5..00bb8b25ff57 100644 --- a/test/unit/org/apache/cassandra/db/compaction/LeveledCompactionStrategyTest.java +++ b/test/unit/org/apache/cassandra/db/compaction/LeveledCompactionStrategyTest.java @@ -599,9 +599,20 @@ private int getTaskLevel(ColumnFamilyStore cfs) { try { - assertTrue(task instanceof LeveledCompactionTask); - LeveledCompactionTask lcsTask = (LeveledCompactionTask) task; - level = Math.max(level, lcsTask.getLevel()); + if (task instanceof LeveledCompactionTask) + { + LeveledCompactionTask lcsTask = (LeveledCompactionTask) task; + level = Math.max(level, lcsTask.getLevel()); + } + else if (task instanceof SingleSSTableLCSTask) + { + SingleSSTableLCSTask singleSSTableLCSTask = (SingleSSTableLCSTask) task; + level = Math.max(level, singleSSTableLCSTask.getLevel()); + } + else + { + Assert.fail("Got unexpected task of type " + task.getClass().getCanonicalName()); + } } finally { diff --git a/test/unit/org/apache/cassandra/db/compaction/PartialCompactionsTest.java b/test/unit/org/apache/cassandra/db/compaction/PartialCompactionsTest.java index 1d877904a25e..92c479ffb5c4 100644 --- a/test/unit/org/apache/cassandra/db/compaction/PartialCompactionsTest.java +++ b/test/unit/org/apache/cassandra/db/compaction/PartialCompactionsTest.java @@ -41,8 +41,8 @@ import org.apache.cassandra.utils.CloseableIterator; import org.apache.cassandra.utils.FBUtilities; -import static org.hamcrest.MatcherAssert.assertThat; import static org.hamcrest.CoreMatchers.instanceOf; +import static org.hamcrest.MatcherAssert.assertThat; import static org.junit.Assert.assertEquals; public class PartialCompactionsTest extends SchemaLoader @@ -120,7 +120,7 @@ private static long enoughSpaceForAllButTheLargestSSTable(ColumnFamilyStore cfs) private static int liveRows(ColumnFamilyStore cfs) { return Util.getAll(Util.cmd(cfs, "key1").build()).stream() - .map(partition -> count(partition.rowIterator())) + .map(partition -> count(partition.rowIterator(false))) .reduce(Integer::sum) .orElse(0); } diff --git a/test/unit/org/apache/cassandra/db/compaction/ShardManagerTest.java b/test/unit/org/apache/cassandra/db/compaction/ShardManagerTest.java index ced5d7882144..7917b48f6158 100644 --- a/test/unit/org/apache/cassandra/db/compaction/ShardManagerTest.java +++ b/test/unit/org/apache/cassandra/db/compaction/ShardManagerTest.java @@ -543,7 +543,7 @@ List mockNonOverlappingSSTables(int numSSTables) private Token boundary(int numSSTables, int i) { - return partitioner.split(partitioner.getMinimumToken(), partitioner.getMaximumToken(), i * 1.0 / numSSTables); + return partitioner.split(partitioner.getMinimumToken(), partitioner.getMaximumTokenForSplitting(), i * 1.0 / numSSTables); } private SSTableReader mockSSTable(DecoratedKey first, DecoratedKey last) diff --git a/test/unit/org/apache/cassandra/db/compaction/UnifiedCompactionStrategyTest.java b/test/unit/org/apache/cassandra/db/compaction/UnifiedCompactionStrategyTest.java index 422d67e946fb..d03cfe85fadb 100644 --- a/test/unit/org/apache/cassandra/db/compaction/UnifiedCompactionStrategyTest.java +++ b/test/unit/org/apache/cassandra/db/compaction/UnifiedCompactionStrategyTest.java @@ -277,7 +277,7 @@ private void testGetBucketsOneArena(Map sstableMap, int[] Ws, IPartitioner partitioner = cfs.getPartitioner(); DecoratedKey first = new BufferDecoratedKey(partitioner.getMinimumToken(), ByteBuffer.allocate(0)); - DecoratedKey last = new BufferDecoratedKey(partitioner.getMaximumToken(), ByteBuffer.allocate(0)); + DecoratedKey last = new BufferDecoratedKey(partitioner.getMaximumTokenForSplitting(), ByteBuffer.allocate(0)); List sstables = new ArrayList<>(); long dataSetSizeBytes = 0; @@ -517,7 +517,7 @@ private List createSStables(IPartitioner partitioner, { List mockSSTables = new ArrayList<>(); Token min = partitioner.getMinimumToken(); - Token max = partitioner.getMaximumToken(); + Token max = partitioner.getMaximumTokenForSplitting(); ByteBuffer bb = ByteBuffer.allocate(0); sstablesMap.forEach((size, num) -> { Token first = min.getPartitioner().split(min, max, 0.01); @@ -1118,7 +1118,7 @@ List mockNonOverlappingSSTables(int numSSTables, int level, long private Token boundary(int numSSTables, double i) { - return partitioner.split(partitioner.getMinimumToken(), partitioner.getMaximumToken(), i / numSSTables); + return partitioner.split(partitioner.getMinimumToken(), partitioner.getMaximumTokenForSplitting(), i / numSSTables); } } diff --git a/test/unit/org/apache/cassandra/db/compaction/unified/ControllerTest.java b/test/unit/org/apache/cassandra/db/compaction/unified/ControllerTest.java index 7b8f0477e792..eaa4d6f77a27 100644 --- a/test/unit/org/apache/cassandra/db/compaction/unified/ControllerTest.java +++ b/test/unit/org/apache/cassandra/db/compaction/unified/ControllerTest.java @@ -121,6 +121,64 @@ public void testValidateOptionsIntegers() testValidateOptions(true); } + public void targetSSTableSizeValidator(String inputSize) + { + Map options = new HashMap<>(); + options.putIfAbsent(Controller.TARGET_SSTABLE_SIZE_OPTION, inputSize); + assertThatExceptionOfType(ConfigurationException.class) + .describedAs("Should have thrown a ConfigurationException when target_sstable_size is greater than Long.MAX_VALUE") + .isThrownBy(() -> Controller.validateOptions(options)) + .withMessageContaining(format("target_sstable_size %s is out of range of Long.", inputSize)); + } + + @Test + public void testCassandra20398Values() + { + //TARGET_SSTABLE_SIZE_OPTION = 12E899, the value reported in CASSANDRA-20398 + String inputSize = "12E899 B"; + targetSSTableSizeValidator(inputSize); + } + + @Test + public void testValidateOptionsTargetSSTableSizeGTLongMax() + { + //TARGET_SSTABLE_SIZE_OPTION > LONG.MAX_VALUE + // the inputSize is Long.MAX_VALUE + 100 + String inputSize = "9223372036854775907 B"; + targetSSTableSizeValidator(inputSize); + } + + @Test + public void testValidateOptionsTargetSSTableSizeLTMinTargetSize() + { + // TARGET_SSTABLE_SIZE_OPTION < Default MIN_TARGET_SSTABLE_SIZE (1048576) + Map options = new HashMap<>(); + String inputSize = "1048000 B"; + options.putIfAbsent(Controller.TARGET_SSTABLE_SIZE_OPTION, inputSize); + assertThatExceptionOfType(ConfigurationException.class) + .describedAs("Should have thrown a ConfigurationException when target_sstable_size is less than default MIN_TARGET_SSTABLE_SIZE") + .isThrownBy(() -> Controller.validateOptions(options)) + .withMessageContaining(format("target_sstable_size %s is not acceptable, size must be at least %s", inputSize, FBUtilities.prettyPrintMemory(Controller.MIN_TARGET_SSTABLE_SIZE))); + } + + @Test + public void testValidateOptionsTargetSSTableSizeGTIntMax() + { + //TEST 4: Verifying if TARGET_SSTABLE_SIZE_OPTION (3650722199) < MIN_TARGET_SSTABLE_SIZE (2581450423) + // Previously, TARGET_SSTABLE_SIZE_OPTION * 0.7 was stored as Integer which would 3650722199 * 0.7 = 2147483647 + // By storing it in a Long, 3650722199 * 0.7 = 2581450424. If TARGET_SSTABLE_SIZE_OPTION * 0.7 is truncated, + //this test case will fail + try + { + Map options = new HashMap<>(); + options.putIfAbsent(Controller.TARGET_SSTABLE_SIZE_OPTION, "3650722199 B"); + options.putIfAbsent(Controller.MIN_SSTABLE_SIZE_OPTION, "2581450423 B"); + Controller.validateOptions(options); + } catch(ConfigurationException e) { + fail("3650722199 * 0.7 got truncated. " + e.getMessage()); + } + } + void testValidateOptions(boolean useIntegers) { Map options = new HashMap<>(); @@ -579,7 +637,7 @@ public void testMinSSTableSize() assertThatExceptionOfType(ConfigurationException.class) .describedAs("Should have thrown a ConfigurationException when min_sstable_size is greater than target_sstable_size") .isThrownBy(() -> Controller.validateOptions(options)) - .withMessageContaining(format("less than the target size minimum: %s", FBUtilities.prettyPrintMemory(limit))); + .withMessageContaining(format("Invalid configuration, %s (%s) should be less than 70%% of the targetSSTableSize (%s)", Controller.MIN_SSTABLE_SIZE_OPTION, FBUtilities.prettyPrintMemory(limit+1), FBUtilities.prettyPrintMemory(Controller.DEFAULT_TARGET_SSTABLE_SIZE))); // test min < configured target table size * INV_SQRT_2 limit = (int) Math.ceil(Controller.MIN_TARGET_SSTABLE_SIZE * 2 * Controller.INVERSE_SQRT_2); @@ -589,6 +647,6 @@ public void testMinSSTableSize() assertThatExceptionOfType(ConfigurationException.class) .describedAs("Should have thrown a ConfigurationException when min_sstable_size is greater than target_sstable_size") .isThrownBy(() -> Controller.validateOptions(options)) - .withMessageContaining(format("less than the target size minimum: %s", FBUtilities.prettyPrintMemory(limit))); + .withMessageContaining(format("Invalid configuration, %s (%s) should be less than 70%% of the targetSSTableSize (%s)", Controller.MIN_SSTABLE_SIZE_OPTION, FBUtilities.prettyPrintMemory(limit + 1), FBUtilities.prettyPrintMemory(Controller.MIN_TARGET_SSTABLE_SIZE * 2))); } } \ No newline at end of file diff --git a/test/unit/org/apache/cassandra/db/compaction/writers/CompactionAwareWriterTest.java b/test/unit/org/apache/cassandra/db/compaction/writers/CompactionAwareWriterTest.java index 5202fab76e1f..dc8c78c54e17 100644 --- a/test/unit/org/apache/cassandra/db/compaction/writers/CompactionAwareWriterTest.java +++ b/test/unit/org/apache/cassandra/db/compaction/writers/CompactionAwareWriterTest.java @@ -34,6 +34,7 @@ import org.junit.After; import org.junit.AfterClass; import org.junit.BeforeClass; +import org.junit.Ignore; import org.junit.Test; import org.apache.cassandra.ServerTestUtils; @@ -171,6 +172,7 @@ public int compare(SSTableReader o1, SSTableReader o2) cfs.truncateBlocking(); } + @Ignore @Test public void testMajorLeveledCompactionWriter() throws Throwable { diff --git a/test/unit/org/apache/cassandra/db/filter/ColumnFilterTest.java b/test/unit/org/apache/cassandra/db/filter/ColumnFilterTest.java index f2b47cbf6eab..cbd38ba8e7e5 100644 --- a/test/unit/org/apache/cassandra/db/filter/ColumnFilterTest.java +++ b/test/unit/org/apache/cassandra/db/filter/ColumnFilterTest.java @@ -90,7 +90,7 @@ public static void beforeClass() SchemaLoader.prepareServer(); DatabaseDescriptor.setSeedProvider(Arrays::asList); DatabaseDescriptor.setDefaultFailureDetector(); - DatabaseDescriptor.setPartitionerUnsafe(new Murmur3Partitioner()); + DatabaseDescriptor.setPartitionerUnsafe(Murmur3Partitioner.instance); } // Select all diff --git a/test/unit/org/apache/cassandra/db/filter/RowFilterTest.java b/test/unit/org/apache/cassandra/db/filter/RowFilterTest.java index 6cd08d000130..d73f796c9010 100644 --- a/test/unit/org/apache/cassandra/db/filter/RowFilterTest.java +++ b/test/unit/org/apache/cassandra/db/filter/RowFilterTest.java @@ -20,9 +20,10 @@ import java.nio.ByteBuffer; import java.util.ArrayList; +import java.util.Collections; import java.util.concurrent.atomic.AtomicBoolean; -import org.junit.Assert; +import com.google.common.collect.ImmutableList; import org.junit.Test; import org.apache.cassandra.cql3.ColumnIdentifier; @@ -33,6 +34,7 @@ import org.apache.cassandra.db.LivenessInfo; import org.apache.cassandra.db.RegularAndStaticColumns; import org.apache.cassandra.db.marshal.Int32Type; +import org.apache.cassandra.db.marshal.UTF8Type; import org.apache.cassandra.db.partitions.SingletonUnfilteredPartitionIterator; import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator; import org.apache.cassandra.db.rows.BTreeRow; @@ -45,8 +47,12 @@ import org.apache.cassandra.db.rows.UnfilteredRowIterator; import org.apache.cassandra.schema.ColumnMetadata; import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.utils.ByteBufferUtil; import org.apache.cassandra.utils.btree.BTree; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + public class RowFilterTest { @@ -89,8 +95,8 @@ public void close() closed.set(true); } }), 1); - Assert.assertFalse(iter.hasNext()); - Assert.assertTrue(closed.get()); + assertFalse(iter.hasNext()); + assertTrue(closed.get()); filter = RowFilter.none().withNewExpressions(new ArrayList<>()); filter.add(r, Operator.NEQ, one); @@ -123,9 +129,35 @@ public void close() closed.set(true); } }), 1); - Assert.assertFalse(iter.hasNext()); - Assert.assertTrue(closed.get()); + assertFalse(iter.hasNext()); + assertTrue(closed.get()); } + @Test + public void testMutableIntersections() + { + TableMetadata metadata = TableMetadata.builder("testks", "testcf") + .addPartitionKeyColumn("pk", Int32Type.instance) + .addRegularColumn("r", Int32Type.instance) + .addRegularColumn("t", UTF8Type.instance) + .offline() + .build(); + + RowFilter filter = RowFilter.none().withNewExpressions(new ArrayList<>()); + assertFalse(filter.isMutableIntersection()); + + ColumnMetadata r = metadata.getColumn(new ColumnIdentifier("r", true)); + RowFilter.Expression gt = new RowFilter.SimpleExpression(r, Operator.GT, ByteBufferUtil.EMPTY_BYTE_BUFFER); + filter = filter.withNewExpressions(Collections.singletonList(gt)); + assertFalse(filter.isMutableIntersection()); + + RowFilter.Expression lt = new RowFilter.SimpleExpression(r, Operator.LT, ByteBufferUtil.EMPTY_BYTE_BUFFER); + filter = filter.withNewExpressions(ImmutableList.of(gt, lt)); + assertFalse(filter.isMutableIntersection()); + ColumnMetadata t = metadata.getColumn(new ColumnIdentifier("t", true)); + RowFilter.Expression eq = new RowFilter.SimpleExpression(t, Operator.EQ, ByteBufferUtil.EMPTY_BYTE_BUFFER); + filter = filter.withNewExpressions(ImmutableList.of(gt, lt, eq)); + assertTrue(filter.isMutableIntersection()); + } } diff --git a/test/unit/org/apache/cassandra/db/lifecycle/LogTransactionTest.java b/test/unit/org/apache/cassandra/db/lifecycle/LogTransactionTest.java index 1f010bf30e33..096cd02c082d 100644 --- a/test/unit/org/apache/cassandra/db/lifecycle/LogTransactionTest.java +++ b/test/unit/org/apache/cassandra/db/lifecycle/LogTransactionTest.java @@ -17,7 +17,6 @@ */ package org.apache.cassandra.db.lifecycle; - import java.io.IOException; import java.io.UncheckedIOException; import java.nio.file.Files; @@ -38,6 +37,7 @@ import com.google.common.collect.ImmutableSet; import com.google.common.collect.Iterables; import com.google.common.collect.Sets; +import org.apache.cassandra.db.streaming.ComponentContext; import org.junit.Test; import org.apache.cassandra.Util; @@ -79,6 +79,7 @@ import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertNotEquals; import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; @@ -1260,12 +1261,173 @@ public void testGetTemporaryFilesThrowsIfCompletingAfterObsoletion() throws Thro logs.finish(); } + @Test + public void testStatsTSMatchOnStart() throws Throwable + { + ColumnFamilyStore cfs = MockSchema.newCFS(KEYSPACE); + File dataFolder = new Directories(cfs.metadata()).getDirectoryForNewSSTables(); + SSTableReader sstable = sstable(dataFolder, cfs, 0, 128); + + try(LogTransaction log = new LogTransaction(OperationType.COMPACTION)) + { + assertNotNull(log); + log.trackNew(sstable); + + // Confirm we can remove leftovers when they match + LogTransaction.removeUnfinishedLeftovers(cfs.metadata()); + } + + File sFile = sstable.descriptor.fileFor(SSTableFormat.Components.STATS); + assertFalse("Found STATS file but expected it to be cleaned up.", Files.exists(sFile.toPath())); + sstable.selfRef().release(); + } + + @Test + public void testStatsTSMatchDuringList() throws Throwable + { + ColumnFamilyStore cfs = MockSchema.newCFS(KEYSPACE); + File dataFolder = new Directories(cfs.metadata()).getDirectoryForNewSSTables(); + SSTableReader sstable = sstable(dataFolder, cfs, 0, 128); + + try(LogTransaction log = new LogTransaction(OperationType.COMPACTION)) + { + assertNotNull(log); + log.trackNew(sstable); + + // Confirm we can successfully classify files when they match - this triggers the LogAwareFileLister verify + listFiles(dataFolder, Directories.OnTxnErr.THROW, Directories.FileType.FINAL); + } + sstable.selfRef().release(); + } + + @Test + public void testStatsTSMismatchDuringStart() throws Throwable + { + ColumnFamilyStore cfs = MockSchema.newCFS(KEYSPACE); + File dataFolder = new Directories(cfs.metadata()).getDirectoryForNewSSTables(); + SSTableReader sstable = sstable(dataFolder, cfs, 0, 128); + + File sFile = sstable.descriptor.fileFor(SSTableFormat.Components.STATS); + assertTrue("STATS file not created successfully in test setup", Files.exists(sFile.toPath())); + + // Confirm we can remove leftovers even if the STATS file doesn't match + try(LogTransaction log = new LogTransaction(OperationType.COMPACTION)) + { + assertNotNull(log); + + // Need to flag the transaction as having a REMOVE entry so it'll trigger the path to calculate stats on list + log.obsoleted(sstable); + + // Need to sleep for long enough to bypass the millisecond truncation logic due to jdk8 and jdk11 change + Thread.sleep(2000); + assertTrue("Failed to set mtime for STATS file to currentTimeMillis()", sFile.trySetLastModified(System.currentTimeMillis())); + + // Confirm we have an mtime mismatch + File dFile = sstable.descriptor.fileFor(SSTableFormat.Components.DATA); + assertNotEquals(sFile.lastModified(), dFile.lastModified()); + + // We need to add another LogRecord as we allow partial or incorrect entries as the last record... + log.trackNew(sstable(dataFolder, cfs, 2, 128)); + + assertTrue("STATS file gone before removeUnfinished...", Files.exists(sFile.toPath())); + // Confirm we can remove leftovers when the STATS file mismatches + log.prepareToCommit(); // commit so that obsolete sstable components will be removed. + log.commit(); + ComponentContext.create(sstable); + assertTrue(LogTransaction.removeUnfinishedLeftovers(cfs.metadata())); + } + + sstable.selfRef().release(); + } + + @Test + public void testWrongTimestampInTxnFile() throws IOException, InterruptedException + { + ColumnFamilyStore cfs = MockSchema.newCFS(KEYSPACE); + File dataFolder = new Directories(cfs.metadata()).getDirectoryForNewSSTables(); + SSTableReader sstable = sstable(dataFolder, cfs, 0, 128); + + File sFile = sstable.descriptor.fileFor(SSTableFormat.Components.STATS); + assertTrue("STATS file not created successfully in test setup", Files.exists(sFile.toPath())); + + LogRecord.INCLUDE_STATS_FOR_TESTS = true; + + Thread.sleep(2000); + assertTrue("Failed to set mtime for STATS file to currentTimeMillis()", sFile.trySetLastModified(System.currentTimeMillis())); + + // Confirm we can remove leftovers even if the STATS file doesn't match + try(LogTransaction log = new LogTransaction(OperationType.COMPACTION)) + { + assertNotNull(log); + // Need to flag the transaction as having a REMOVE entry so it'll trigger the path to calculate stats on list + log.obsoleted(sstable); + // Need to sleep for long enough to bypass the millisecond truncation logic due to jdk8 and jdk11 change + // Confirm we have an mtime mismatch + File dFile = sstable.descriptor.fileFor(SSTableFormat.Components.DATA); + assertNotEquals(sFile.lastModified(), dFile.lastModified()); + + // We need to add another LogRecord as we allow partial or incorrect entries as the last record... + log.trackNew(sstable(dataFolder, cfs, 2, 128)); + + assertTrue("STATS file gone before removeUnfinished...", Files.exists(sFile.toPath())); + // Confirm we can remove leftovers when the STATS file mismatches + LogRecord.INCLUDE_STATS_FOR_TESTS = false; + assertTrue(LogTransaction.removeUnfinishedLeftovers(cfs.metadata())); + } + + sstable.selfRef().release(); + } + + /** + * We do not consider the stats file's ts for any cases at this point + */ + @Test + public void testStatsTSMismatchDuringList() throws Throwable + { + SSTableReader sstable = null; + try + { + ColumnFamilyStore cfs = MockSchema.newCFS(KEYSPACE); + File dataFolder = new Directories(cfs.metadata()).getDirectoryForNewSSTables(); + sstable = sstable(dataFolder, cfs, 0, 128); + File sFile = sstable.descriptor.fileFor(SSTableFormat.Components.STATS); + assertTrue("STATS file not created successfully in test setup", Files.exists(sFile.toPath())); + + try(LogTransaction log = new LogTransaction(OperationType.COMPACTION)) + { + assertNotNull(log); + + // Need to flag the transaction as having a REMOVE entry so it'll trigger the path to calculate stats on list + log.obsoleted(sstable); + + // Need to sleep for long enough to bypass the millisecond truncation logic due to jdk8 and jdk11 change + Thread.sleep(2000); + assertTrue("Failed to set mtime for STATS file to currentTimeMillis()", sFile.trySetLastModified(System.currentTimeMillis())); + + // Confirm we have an mtime mismatch + File dFile = sstable.descriptor.fileFor(SSTableFormat.Components.DATA); + assertNotEquals(sFile.lastModified(), dFile.lastModified()); + + // We need to add another LogRecord as we allow partial or incorrect entries as the last record... + log.trackNew(sstable(dataFolder, cfs, 2, 128)); + + // Confirm we don't get a mismatch LogRecord error when the STATS file is different even on listFiles case + listFiles(dataFolder, Directories.OnTxnErr.THROW, Directories.FileType.FINAL); + } + } + finally + { + if (sstable != null) + sstable.selfRef().release(); + } + } + private static SSTableReader sstable(File dataFolder, ColumnFamilyStore cfs, int generation, int size) throws IOException { Descriptor descriptor = new Descriptor(dataFolder, cfs.getKeyspaceName(), cfs.getTableName(), new SequenceBasedSSTableId(generation), DatabaseDescriptor.getSelectedSSTableFormat()); if (BigFormat.isSelected()) { - Set components = ImmutableSet.of(Components.DATA, Components.PRIMARY_INDEX, Components.FILTER, Components.TOC); + Set components = ImmutableSet.of(Components.DATA, Components.PRIMARY_INDEX, Components.FILTER, Components.TOC, Components.STATS); for (Component component : components) { File file = descriptor.fileFor(component); @@ -1399,12 +1561,12 @@ private static void assertFiles(Iterable existingFiles, Set tempor static Set getTemporaryFiles(File folder) { - return listFiles(folder, Directories.FileType.TEMPORARY); + return listFiles(folder, Directories.OnTxnErr.IGNORE, Directories.FileType.TEMPORARY); } static Set getFinalFiles(File folder) { - return listFiles(folder, Directories.FileType.FINAL); + return listFiles(folder, Directories.OnTxnErr.IGNORE, Directories.FileType.FINAL); } // Used by listFiles - this test is deliberately racing with files being @@ -1429,12 +1591,12 @@ private static Stream toCanonicalIgnoringNotFound(File file) } } - static Set listFiles(File folder, Directories.FileType... types) + static Set listFiles(File folder, Directories.OnTxnErr err, Directories.FileType... types) { Collection match = Arrays.asList(types); return new LogAwareFileLister(folder.toPath(), (file, type) -> match.contains(type), - Directories.OnTxnErr.IGNORE).list() + err).list() .stream() .flatMap(LogTransactionTest::toCanonicalIgnoringNotFound) .collect(Collectors.toSet()); diff --git a/test/unit/org/apache/cassandra/db/lifecycle/ViewTest.java b/test/unit/org/apache/cassandra/db/lifecycle/ViewTest.java index 3364b88133f7..6e5b04941733 100644 --- a/test/unit/org/apache/cassandra/db/lifecycle/ViewTest.java +++ b/test/unit/org/apache/cassandra/db/lifecycle/ViewTest.java @@ -119,7 +119,7 @@ public void testCompaction() testFailure(View.updateCompacting(emptySet(), of(r2)), cur); // update one compacting, one non-compacting, of the liveset to another instance of the same readers; // confirm liveset changes but compacting does not - cur = View.updateLiveSet(copyOf(readers.subList(1, 3)), of(r1, r2)).apply(cur); + cur = View.updateLiveSet(copyOf(readers.subList(1, 3)), of(r1, r2), cfs.metric.viewSSTableIntervalTree).apply(cur); Assert.assertSame(readers.get(0), cur.sstablesMap.get(r0)); Assert.assertSame(r1, cur.sstablesMap.get(r1)); Assert.assertSame(r2, cur.sstablesMap.get(r2)); @@ -179,7 +179,7 @@ public void testFlushing() Assert.assertEquals(memtable2, cur.liveMemtables.get(1)); Assert.assertEquals(memtable3, cur.getCurrentMemtable()); - testFailure(View.replaceFlushed(memtable2, null), cur); + testFailure(View.replaceFlushed(memtable2, null, cfs.metric.viewSSTableIntervalTree), cur); cur = View.markFlushing(memtable2).apply(cur); Assert.assertTrue(cur.flushingMemtables.contains(memtable2)); @@ -196,14 +196,14 @@ public void testFlushing() Assert.assertEquals(memtable2, cur.flushingMemtables.get(1)); Assert.assertEquals(memtable3, cur.getCurrentMemtable()); - cur = View.replaceFlushed(memtable2, null).apply(cur); + cur = View.replaceFlushed(memtable2, null, cfs.metric.viewSSTableIntervalTree).apply(cur); Assert.assertEquals(1, cur.liveMemtables.size()); Assert.assertEquals(1, cur.flushingMemtables.size()); Assert.assertEquals(memtable1, cur.flushingMemtables.get(0)); Assert.assertEquals(memtable3, cur.getCurrentMemtable()); SSTableReader sstable = MockSchema.sstable(1, cfs); - cur = View.replaceFlushed(memtable1, singleton(sstable)).apply(cur); + cur = View.replaceFlushed(memtable1, singleton(sstable), cfs.metric.viewSSTableIntervalTree).apply(cur); Assert.assertEquals(0, cur.flushingMemtables.size()); Assert.assertEquals(1, cur.liveMemtables.size()); Assert.assertEquals(memtable3, cur.getCurrentMemtable()); diff --git a/test/unit/org/apache/cassandra/db/marshal/AbstractTypeTest.java b/test/unit/org/apache/cassandra/db/marshal/AbstractTypeTest.java index 5cb051b35e77..84701a211559 100644 --- a/test/unit/org/apache/cassandra/db/marshal/AbstractTypeTest.java +++ b/test/unit/org/apache/cassandra/db/marshal/AbstractTypeTest.java @@ -233,6 +233,8 @@ public void allTypesCovered() continue; if (isTestType(klass)) continue; + if (isPrefixCompositeType(klass)) + continue; String name = klass.getCanonicalName(); if (name == null) name = klass.getName(); @@ -259,14 +261,26 @@ private boolean isTestType(Class klass) return "test".equals(new File(src.getLocation().getPath()).name()); } + @SuppressWarnings("rawtypes") + private boolean isPrefixCompositeType(Class klass) + { + String name = klass.getCanonicalName(); + return name.contains("PrefixCompositeType"); + } + @Test public void isConstrainedTest() { qt().forAll(genBuilder().build()).checkAssert(type -> { - if (type instanceof MapType || type instanceof TupleType || type instanceof AbstractCompositeType) + if (type instanceof TupleType || type instanceof AbstractCompositeType) assertThat(type.isConstrainable()).isEqualTo(false); else - assertThat(type.isConstrainable()).isEqualTo(true); + { + if (type.isCollection() && !type.isFrozenCollection()) + assertThat(type.isConstrainable()).isEqualTo(false); + else + assertThat(type.isConstrainable()).isEqualTo(true); + } }); } @@ -697,7 +711,7 @@ private static void assertBytesEquals(ByteBuffer actual, ByteBuffer expected, St private static ColumnMetadata fake(AbstractType type) { - return new ColumnMetadata(null, null, new ColumnIdentifier("", true), type, 0, ColumnMetadata.Kind.PARTITION_KEY, null); + return new ColumnMetadata(null, null, new ColumnIdentifier("", true), type, ColumnMetadata.NO_UNIQUE_ID, 0, ColumnMetadata.Kind.PARTITION_KEY, null); } private static ByteBuffer parseLiteralType(AbstractType type, String literal) @@ -939,10 +953,10 @@ private static void verifyTypesCompatibility(AbstractType left, AbstractType rig if (!left.isValueCompatibleWith(right)) return; - ColumnMetadata rightColumn1 = new ColumnMetadata("k", "t", ColumnIdentifier.getInterned("c", false), right, ColumnMetadata.NO_POSITION, ColumnMetadata.Kind.REGULAR, null); - ColumnMetadata rightColumn2 = new ColumnMetadata("k", "t", ColumnIdentifier.getInterned("d", false), right, ColumnMetadata.NO_POSITION, ColumnMetadata.Kind.REGULAR, null); - ColumnMetadata leftColumn1 = new ColumnMetadata("k", "t", ColumnIdentifier.getInterned("c", false), left, ColumnMetadata.NO_POSITION, ColumnMetadata.Kind.REGULAR, null); - ColumnMetadata leftColumn2 = new ColumnMetadata("k", "t", ColumnIdentifier.getInterned("d", false), left, ColumnMetadata.NO_POSITION, ColumnMetadata.Kind.REGULAR, null); + ColumnMetadata rightColumn1 = new ColumnMetadata("k", "t", ColumnIdentifier.getInterned("c", false), right, ColumnMetadata.NO_UNIQUE_ID, ColumnMetadata.NO_POSITION, ColumnMetadata.Kind.REGULAR, null); + ColumnMetadata rightColumn2 = new ColumnMetadata("k", "t", ColumnIdentifier.getInterned("d", false), right, ColumnMetadata.NO_UNIQUE_ID, ColumnMetadata.NO_POSITION, ColumnMetadata.Kind.REGULAR, null); + ColumnMetadata leftColumn1 = new ColumnMetadata("k", "t", ColumnIdentifier.getInterned("c", false), left, ColumnMetadata.NO_UNIQUE_ID, ColumnMetadata.NO_POSITION, ColumnMetadata.Kind.REGULAR, null); + ColumnMetadata leftColumn2 = new ColumnMetadata("k", "t", ColumnIdentifier.getInterned("d", false), left, ColumnMetadata.NO_UNIQUE_ID, ColumnMetadata.NO_POSITION, ColumnMetadata.Kind.REGULAR, null); TableMetadata leftTable = TableMetadata.builder("k", "t").addPartitionKeyColumn("pk", EmptyType.instance).addColumn(leftColumn1).addColumn(leftColumn2).build(); TableMetadata rightTable = TableMetadata.builder("k", "t").addPartitionKeyColumn("pk", EmptyType.instance).addColumn(rightColumn1).addColumn(rightColumn2).build(); diff --git a/test/unit/org/apache/cassandra/db/marshal/ByteBufferAccessorTest.java b/test/unit/org/apache/cassandra/db/marshal/ByteBufferAccessorTest.java index 6f39d8378d4f..52d7922ca345 100644 --- a/test/unit/org/apache/cassandra/db/marshal/ByteBufferAccessorTest.java +++ b/test/unit/org/apache/cassandra/db/marshal/ByteBufferAccessorTest.java @@ -20,7 +20,9 @@ import java.nio.ByteBuffer; +import org.junit.AfterClass; import org.junit.Assert; +import org.junit.BeforeClass; import org.junit.Test; import org.apache.cassandra.utils.ByteBufferUtil; @@ -29,6 +31,21 @@ public class ByteBufferAccessorTest extends ValueAccessorTester { + + private static final TestNativeDataAllocator allocator = new TestNativeDataAllocator(); + @BeforeClass + public static void setSetMemoryAllocator() + { + NativeAccessor.setNativeMemoryAllocator(allocator); + } + + @AfterClass + public static void releaseMemory() + { + allocator.close(); + } + + private static byte[] array(int start, int size) { byte[] a = new byte[size]; diff --git a/test/unit/org/apache/cassandra/db/marshal/CollectionTypesTest.java b/test/unit/org/apache/cassandra/db/marshal/CollectionTypesTest.java index 889364b59710..6250c0fea2b5 100644 --- a/test/unit/org/apache/cassandra/db/marshal/CollectionTypesTest.java +++ b/test/unit/org/apache/cassandra/db/marshal/CollectionTypesTest.java @@ -27,7 +27,9 @@ import java.util.Random; import java.util.Set; +import org.junit.AfterClass; import org.junit.Assert; +import org.junit.BeforeClass; import org.junit.Test; import org.apache.cassandra.cql3.CQL3Type; @@ -36,6 +38,20 @@ public class CollectionTypesTest { + + private static final TestNativeDataAllocator allocator = new TestNativeDataAllocator(); + @BeforeClass + public static void setSetMemoryAllocator() + { + NativeAccessor.setNativeMemoryAllocator(allocator); + } + + @AfterClass + public static void releaseMemory() + { + allocator.close(); + } + interface TypeFactory { T createType(AbstractType keyType, AbstractType valType); } interface ValueFactory { T createValue(ValueGenerator keyGen, ValueGenerator valGen, int size, Random random); } @@ -68,6 +84,7 @@ static void testSerializationDeserialization(Type Assert.assertEquals(srcString, dstString); } } + allocator.releaseMemory(); } } } diff --git a/test/unit/org/apache/cassandra/db/marshal/CompositeAndTupleTypesTest.java b/test/unit/org/apache/cassandra/db/marshal/CompositeAndTupleTypesTest.java index 0c86871c28e8..65163924c1df 100644 --- a/test/unit/org/apache/cassandra/db/marshal/CompositeAndTupleTypesTest.java +++ b/test/unit/org/apache/cassandra/db/marshal/CompositeAndTupleTypesTest.java @@ -24,7 +24,9 @@ import java.util.List; import java.util.Random; +import org.junit.AfterClass; import org.junit.Assert; +import org.junit.BeforeClass; import org.junit.Test; import org.apache.cassandra.cql3.FieldIdentifier; @@ -34,6 +36,20 @@ public class CompositeAndTupleTypesTest { + + private static final TestNativeDataAllocator allocator = new TestNativeDataAllocator(); + @BeforeClass + public static void setSetMemoryAllocator() + { + NativeAccessor.setNativeMemoryAllocator(allocator); + } + + @AfterClass + public static void releaseMemory() + { + allocator.close(); + } + interface TypeFactory> { T createType(List> types); } interface ValueCombiner { V combine(AbstractType type, ValueAccessor accessor, V[] values); } @@ -107,6 +123,7 @@ public > void testSerializationDeserialization(TypeFa Assert.assertEquals(srcString, dstString); } } + allocator.releaseMemory(); } } } diff --git a/test/unit/org/apache/cassandra/db/marshal/CompositeTypeTest.java b/test/unit/org/apache/cassandra/db/marshal/CompositeTypeTest.java index 0f3714870af7..994b026c7bd5 100644 --- a/test/unit/org/apache/cassandra/db/marshal/CompositeTypeTest.java +++ b/test/unit/org/apache/cassandra/db/marshal/CompositeTypeTest.java @@ -23,6 +23,7 @@ import java.util.*; import com.google.common.collect.Lists; +import org.junit.AfterClass; import org.junit.Assert; import org.junit.BeforeClass; import org.junit.Test; @@ -70,6 +71,20 @@ public class CompositeTypeTest uuids[i] = nextTimeUUID(); } + + private static final TestNativeDataAllocator allocator = new TestNativeDataAllocator(); + @BeforeClass + public static void setSetMemoryAllocator() + { + NativeAccessor.setNativeMemoryAllocator(allocator); + } + + @AfterClass + public static void releaseMemory() + { + allocator.close(); + } + @BeforeClass public static void defineSchema() throws ConfigurationException { diff --git a/test/unit/org/apache/cassandra/db/marshal/NativeAccessorTest.java b/test/unit/org/apache/cassandra/db/marshal/NativeAccessorTest.java new file mode 100644 index 000000000000..f53d5361545c --- /dev/null +++ b/test/unit/org/apache/cassandra/db/marshal/NativeAccessorTest.java @@ -0,0 +1,367 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.db.marshal; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.FloatBuffer; +import java.nio.charset.CharacterCodingException; +import java.nio.charset.StandardCharsets; +import java.util.UUID; +import java.util.function.BiFunction; +import java.util.function.Function; + +import com.google.common.primitives.UnsignedBytes; +import org.junit.AfterClass; +import org.junit.Assert; +import org.junit.BeforeClass; +import org.junit.Test; + +import org.apache.cassandra.db.Digest; +import org.apache.cassandra.io.util.DataOutputBuffer; +import org.apache.cassandra.service.paxos.Ballot; +import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.TimeUUID; +import org.apache.cassandra.utils.UUIDGen; +import org.apache.cassandra.utils.memory.BigEndianMemoryUtil; + +import static org.apache.cassandra.utils.TimeUUID.Generator.nextTimeUUID; +import static org.quicktheories.QuickTheory.qt; +import static org.quicktheories.generators.SourceDSL.integers; + +public class NativeAccessorTest extends ValueAccessorTester +{ + private static final TestNativeDataAllocator allocator = new TestNativeDataAllocator(); + @BeforeClass + public static void setSetMemoryAllocator() + { + NativeAccessor.setNativeMemoryAllocator(allocator); + } + + @AfterClass + public static void releaseMemory() + { + allocator.close(); + } + + private final ValueAccessor nativeAccessor = NativeAccessor.instance; + private final ValueAccessor bufferAccessor = ByteBufferAccessor.instance; + + @Test + public void testCompare() + { + qt().forAll(accessors(), + byteArrays(integers().between(0, 200)), + byteArrays(integers().between(0, 200)) + ).checkAssert(this::testCompare); + } + + private void testCompare(ValueAccessor rightAccessor, byte[] leftArray, byte[] rightArray) + { + NativeData left = NativeAccessor.instance.valueOf(leftArray); + V right = rightAccessor.valueOf(rightArray); + int expectedResult = Integer.signum(UnsignedBytes.lexicographicalComparator().compare(leftArray, rightArray)); + int actualResult = Integer.signum(NativeAccessor.instance.compare(left, right, rightAccessor)); + Assert.assertEquals(expectedResult, actualResult); + } + + @Test + public void testCopy() + { + qt().forAll(accessors(), + byteArrays(integers().between(10, 100)), + integers().between(0, 9), + integers().between(0, 9) + ).checkAssert(this::testCopy); + } + + private void testCopy(ValueAccessor dstAccessor, byte[] dataToCopy, int srcOffset, int dstOffset) + { + ValueAccessor srcAcccessor = NativeAccessor.instance; + NativeData src = srcAcccessor.valueOf(dataToCopy); + V dst = dstAccessor.valueOf(new byte[dataToCopy.length + dstOffset - srcOffset]); + NativeAccessor.instance.copyTo(src, srcOffset, dst, dstAccessor, dstOffset, dataToCopy.length - srcOffset); + V dstSlice = dstAccessor.slice(dst, dstOffset, dataToCopy.length - srcOffset); + NativeData expectedData = srcAcccessor.slice(src, srcOffset, dataToCopy.length - srcOffset); + + Assert.assertArrayEquals(srcAcccessor.toArray(src, srcOffset, dataToCopy.length - srcOffset), dstAccessor.toArray(dstSlice)); + Assert.assertArrayEquals(srcAcccessor.toArray(expectedData), dstAccessor.toArray(dstSlice)); + } + + @Test + public void testPutMethods() + { + testNativePut((byte) 42, nativeAccessor::putByte, bufferAccessor::getByte); + + testNativePut((short )(Short.MAX_VALUE - 3), nativeAccessor::putShort, bufferAccessor::getShort); + + testNativePut(Integer.MAX_VALUE - 5, nativeAccessor::putInt, bufferAccessor::getInt); + + testNativePut((float) Math.PI, nativeAccessor::putFloat, bufferAccessor::getFloat); + + testNativePut(Long.MAX_VALUE - 2, nativeAccessor::putLong, bufferAccessor::getLong); + + testNativePut(0L, nativeAccessor::putVInt, bufferAccessor::getVInt); + testNativePut(42L, nativeAccessor::putVInt, bufferAccessor::getVInt); + testNativePut(0xFFFFFFL, nativeAccessor::putVInt, bufferAccessor::getVInt); + testNativePut(Long.MAX_VALUE - 1, nativeAccessor::putVInt, bufferAccessor::getVInt); + + testNativePut(42, nativeAccessor::putVInt32, bufferAccessor::getVInt32); + testNativePut(0xFFFFF, nativeAccessor::putVInt32, bufferAccessor::getVInt32); + testNativePut(Integer.MAX_VALUE - 1, nativeAccessor::putVInt32, bufferAccessor::getVInt32); + + testNativePut(42L, nativeAccessor::putUnsignedVInt, bufferAccessor::getUnsignedVInt); + testNativePut(0xFFFFFL, nativeAccessor::putUnsignedVInt, bufferAccessor::getUnsignedVInt); + testNativePut(0xFFFFFFFL, nativeAccessor::putUnsignedVInt, bufferAccessor::getUnsignedVInt); + testNativePut(Long.MAX_VALUE - 1, nativeAccessor::putUnsignedVInt, bufferAccessor::getUnsignedVInt); + } + + @Test + public void testPutDouble() // there is no putDouble method to test it like others + { + Double originalValue = Math.PI; // Double conversion is used to compare values as bit values + NativeData nativeData = nativeAccessor.allocate(25); + ByteBuffer bufferData = bufferAccessor.allocate(25); + int offset = 7; + bufferData.putDouble(offset, originalValue); + nativeAccessor.copyByteBufferTo(bufferData, 0, nativeData, 0, bufferAccessor.size(bufferData)); + Double getValue = nativeAccessor.getDouble(nativeData, offset); + Assert.assertEquals(originalValue, getValue); + + NativeData nativeDataSlice = nativeAccessor.slice(nativeData, offset, nativeData.nativeDataSize() - offset); + Double toValue = nativeAccessor.toDouble(nativeDataSlice); + Assert.assertEquals(originalValue, toValue); + + } + + private void testNativePut(V originalValue, TriFunction putMethod, + BiFunction getMethod) + { + NativeData nativeData = nativeAccessor.allocate(25); + int offset = 2; + putMethod.apply(nativeData, offset, originalValue); + ByteBuffer buffer = nativeAccessor.toBuffer(nativeData); + V getValue = getMethod.apply(buffer, offset); + Assert.assertEquals(originalValue, getValue); + } + + @Test + public void testGetMethods() + { + testNativeGet((byte) 42, bufferAccessor::putByte, nativeAccessor::getByte, nativeAccessor::toByte); + + testNativeGet((short )(Short.MAX_VALUE - 3), bufferAccessor::putShort, nativeAccessor::getShort, nativeAccessor::toShort); + + // nativeAccessor::getUnsignedShort is already tested by org.apache.cassandra.db.marshal.ValueAccessorTest.testUnsignedShort() + + testNativeGet(Integer.MAX_VALUE - 5, bufferAccessor::putInt, nativeAccessor::getInt, nativeAccessor::toInt); + + testNativeGet((float) Math.PI, bufferAccessor::putFloat, nativeAccessor::getFloat, nativeAccessor::toFloat); + + testNativeGet(Long.MAX_VALUE - 2, bufferAccessor::putLong, nativeAccessor::getLong, nativeAccessor::toLong); + + testNativeGet(0L, bufferAccessor::putVInt, nativeAccessor::getVInt, null); + testNativeGet(42L, bufferAccessor::putVInt, nativeAccessor::getVInt, null); + testNativeGet(0xFFFFFFL, bufferAccessor::putVInt, nativeAccessor::getVInt, null); + testNativeGet(Long.MAX_VALUE - 1, bufferAccessor::putVInt, nativeAccessor::getVInt, null); + + testNativeGet(42, bufferAccessor::putVInt32, nativeAccessor::getVInt32, null); + testNativeGet(0xFFFFF, bufferAccessor::putVInt32, nativeAccessor::getVInt32, null); + testNativeGet(Integer.MAX_VALUE - 1, bufferAccessor::putVInt32, nativeAccessor::getVInt32, null); + + testNativeGet(42L, bufferAccessor::putUnsignedVInt, nativeAccessor::getUnsignedVInt, null); + testNativeGet(0xFFFFFL, bufferAccessor::putUnsignedVInt, nativeAccessor::getUnsignedVInt, null); + testNativeGet(0xFFFFFFFL, bufferAccessor::putUnsignedVInt, nativeAccessor::getUnsignedVInt, null); + testNativeGet(Long.MAX_VALUE - 1, bufferAccessor::putUnsignedVInt, nativeAccessor::getUnsignedVInt, null); + } + + private void testNativeGet(V originalValue, TriFunction putMethod, + BiFunction getMethod, Function toMethod) + { + ByteBuffer bufferData = bufferAccessor.allocate(25); + NativeData nativeData = nativeAccessor.allocate(25); + int offset = 2; + putMethod.apply(bufferData, offset, originalValue); + nativeAccessor.copyByteBufferTo(bufferData, 0, nativeData, 0, bufferAccessor.size(bufferData)); + V getValue = getMethod.apply(nativeData, offset); + Assert.assertEquals(originalValue, getValue); + + if (toMethod != null) + { + NativeData nativeDataSlice = nativeAccessor.slice(nativeData, offset, nativeData.nativeDataSize() - offset); + V toValue = toMethod.apply(nativeDataSlice); + Assert.assertEquals(originalValue, toValue); + } + } + + @Test + public void testToUUID() { + UUID originalValue = UUID.randomUUID(); + ByteBuffer encodedOriginalValue = UUIDGen.toByteBuffer(originalValue); + int size = encodedOriginalValue.remaining(); + NativeData nativeData = nativeAccessor.allocate(size); + nativeAccessor.copyByteBufferTo(encodedOriginalValue, 0, nativeData, 0, size); + + UUID nativeUUID = nativeAccessor.toUUID(nativeData); + Assert.assertEquals(originalValue, nativeUUID); + } + + @Test + public void testToTimeUUID() { + TimeUUID originalValue = nextTimeUUID(); + ByteBuffer encodedOriginalValue = originalValue.toBytes(); + int size = encodedOriginalValue.remaining(); + NativeData nativeData = nativeAccessor.allocate(size); + nativeAccessor.copyByteBufferTo(encodedOriginalValue, 0, nativeData, 0, size); + + TimeUUID nativeUUID = nativeAccessor.toTimeUUID(nativeData); + Assert.assertEquals(originalValue, nativeUUID); + } + + @Test + public void testToBullot() { + Ballot originalValue = Ballot.fromUuid(nextTimeUUID().asUUID()); + ByteBuffer encodedOriginalValue = originalValue.toBytes(); + int size = encodedOriginalValue.remaining(); + NativeData nativeData = nativeAccessor.allocate(size); + nativeAccessor.copyByteBufferTo(encodedOriginalValue, 0, nativeData, 0, size); + + Ballot nativeBallot = nativeAccessor.toBallot(nativeData); + Assert.assertEquals(originalValue, nativeBallot); + } + + @Test + public void testToHex() { + int valueSize = 42; + byte[] originalData = new byte[valueSize]; + for (int i = 0; i < valueSize; i++) + originalData[i] = (byte) i; + + ByteBuffer bufferData = bufferAccessor.valueOf(originalData); + String bufferHex = bufferAccessor.toHex(bufferData); + + NativeData nativeData = nativeAccessor.valueOf(originalData); + String nativeHex = nativeAccessor.toHex(nativeData); + Assert.assertEquals(bufferHex, nativeHex); + } + + @Test + public void test() { + NativeData nativeData = nativeAccessor.allocate(4); + BigEndianMemoryUtil.setInt(nativeData.getAddress(), 0x00FF); + + String nativeHex = nativeAccessor.toHex(nativeData); + System.out.println(nativeHex); + } + + @Test + public void testToString() throws CharacterCodingException + { + String originalData = "test string value"; + NativeData nativeData = nativeAccessor.valueOf(originalData, StandardCharsets.UTF_8); + String nativeToString = nativeAccessor.toString(nativeData, StandardCharsets.UTF_8); + Assert.assertEquals(originalData, nativeToString); + } + + @Test + public void testToFloatArray() { + int valueSize = 42; + ByteBuffer buffer = ByteBuffer.allocate(valueSize * Float.BYTES); + FloatBuffer floatBuffer = buffer.asFloatBuffer(); + for (int i = 0; i < valueSize; i++) + floatBuffer.put((float) i); + + NativeData nativeData = nativeAccessor.valueOf(buffer); + float[] decodedFloatArray = nativeAccessor.toFloatArray(nativeData, valueSize); + + for (int i = 0; i < valueSize; i++) + Assert.assertEquals((Float) floatBuffer.get(i), (Float) decodedFloatArray[i]); + // Float conversion is used to compare values as bit values + } + + @Test + public void testDataOutputPlusWrite() throws IOException + { + int valueSize = 25; + NativeData nativeData = nativeAccessor.allocate(valueSize); + byte[] originalData = new byte[valueSize]; + for (int i = 0; i < valueSize; i++) + originalData[i] = (byte) i; + nativeAccessor.putBytes(nativeData, 0, originalData); + + try(DataOutputBuffer dataOutput = new DataOutputBuffer()) + { + nativeAccessor.write(nativeData, dataOutput); + byte[] writenData = dataOutput.toByteArray(); + Assert.assertArrayEquals(originalData, writenData); + } + } + + @Test + public void testHeapByteBufferWrite() + { + testHeapByteBufferWrite(ByteBuffer.allocate(25), 23); + } + + @Test + public void testDirectByteBufferWrite() + { + testHeapByteBufferWrite(ByteBuffer.allocateDirect(25), 23); + } + + private void testHeapByteBufferWrite(ByteBuffer buffer, int valueSize) + { + NativeData nativeData = nativeAccessor.allocate(valueSize); + byte[] originalData = new byte[valueSize]; + for (int i = 0; i < valueSize; i++) + originalData[i] = (byte) i; + nativeAccessor.putBytes(nativeData, 0, originalData); + + int initialPosition = buffer.position(); + nativeAccessor.write(nativeData, buffer); + Assert.assertEquals(valueSize, buffer.position() - initialPosition); + buffer.flip(); + Assert.assertArrayEquals(originalData, ByteBufferUtil.getArray(buffer)); + } + + @Test + public void testDigest() + { + int valueSize = 25; + NativeData nativeData = nativeAccessor.allocate(valueSize); + byte[] originalData = new byte[valueSize]; + for (int i = 0; i < valueSize; i++) + originalData[i] = (byte) i; + nativeAccessor.putBytes(nativeData, 0, originalData); + + Digest byteArrayDigest = Digest.forReadResponse(); + byteArrayDigest.update(originalData, 0, originalData.length); + + Digest nativeDigest = Digest.forReadResponse(); + nativeAccessor.digest(nativeData, nativeDigest); + + Assert.assertArrayEquals(byteArrayDigest.digest(), nativeDigest.digest()); + } + + @FunctionalInterface + interface TriFunction + { + R apply(A a, B b, C c); + } +} diff --git a/test/unit/org/apache/cassandra/db/marshal/TestNativeDataAllocator.java b/test/unit/org/apache/cassandra/db/marshal/TestNativeDataAllocator.java new file mode 100644 index 000000000000..cd45e77f7df2 --- /dev/null +++ b/test/unit/org/apache/cassandra/db/marshal/TestNativeDataAllocator.java @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.db.marshal; + +import java.io.Closeable; +import java.nio.ByteBuffer; +import java.util.concurrent.TimeUnit; + +import org.apache.cassandra.utils.concurrent.ImmediateFuture; +import org.apache.cassandra.utils.concurrent.OpOrder; +import org.apache.cassandra.utils.memory.MemoryUtil; +import org.apache.cassandra.utils.memory.NativeAllocator; +import org.apache.cassandra.utils.memory.NativePool; + +/** + * A primitive NativeData allocator is used for test purposes only + * It releases memory only when releaseMemory() or close() are called + */ +public class TestNativeDataAllocator implements NativeDataAllocator, Closeable +{ + private final NativePool nativePool = new NativePool(0, 10 * 1024 * 1024, 1.0f, + () -> ImmediateFuture.success(true)); + private NativeAllocator nativeAllocator = nativePool.newAllocator("test"); + private final OpOrder order = new OpOrder(); + + @Override + public NativeData allocateBasedOnBuffer(ByteBuffer data) + { + try(OpOrder.Group group = order.start()) + { + long address = nativeAllocator.allocate(data.remaining(), group); + MemoryUtil.setBytes(address, data); + return new AddressBasedNativeData(address, data.remaining()); + } + } + + public void releaseMemory() { + nativeAllocator.setDiscarding(); + nativeAllocator.setDiscarded(); + nativeAllocator = nativePool.newAllocator("test"); + } + + public void close() { + nativeAllocator.setDiscarding(); + nativeAllocator.setDiscarded(); + try + { + nativePool.shutdownAndWait(5, TimeUnit.SECONDS); + } + catch (Exception e) + { + throw new RuntimeException(e); + } + } + +} diff --git a/test/unit/org/apache/cassandra/db/marshal/TypeValidationTest.java b/test/unit/org/apache/cassandra/db/marshal/TypeValidationTest.java index 6897661cf6d4..ef264dfa14ab 100644 --- a/test/unit/org/apache/cassandra/db/marshal/TypeValidationTest.java +++ b/test/unit/org/apache/cassandra/db/marshal/TypeValidationTest.java @@ -204,7 +204,7 @@ private static void buildAndSplit(Gen baseGen) qt().forAll(tupleWithValueGen(baseGen)).checkAssert(pair -> { TupleType tuple = pair.left; ByteBuffer value = pair.right; - Assertions.assertThat(tuple.pack(tuple.unpack(value))) + Assertions.assertThat(tuple.pack(tuple.unpack(value), ByteBufferAccessor.instance)) .as("tuple.pack(tuple.unpack(value)) == value") .isEqualTo(value); }); diff --git a/test/unit/org/apache/cassandra/db/marshal/ValueAccessorTest.java b/test/unit/org/apache/cassandra/db/marshal/ValueAccessorTest.java index cd6681705f1d..cb9d6ee322f3 100644 --- a/test/unit/org/apache/cassandra/db/marshal/ValueAccessorTest.java +++ b/test/unit/org/apache/cassandra/db/marshal/ValueAccessorTest.java @@ -22,7 +22,9 @@ import java.nio.ByteBuffer; import java.util.Arrays; +import org.junit.AfterClass; import org.junit.Assert; +import org.junit.BeforeClass; import org.junit.Test; import org.apache.cassandra.io.util.DataOutputBuffer; @@ -69,6 +71,19 @@ private static void testHashCodeAndEquals(byte[] rawBytes, buffer1, buffer2); } + private static final TestNativeDataAllocator allocator = new TestNativeDataAllocator(); + @BeforeClass + public static void setSetMemoryAllocator() + { + NativeAccessor.setNativeMemoryAllocator(allocator); + } + + @AfterClass + public static void releaseMemory() + { + allocator.close(); + } + /** * Identical data should yield identical hashcodes even if the underlying format is different */ diff --git a/test/unit/org/apache/cassandra/db/marshal/ValueAccessors.java b/test/unit/org/apache/cassandra/db/marshal/ValueAccessors.java index e04de761e802..2c65237f9655 100644 --- a/test/unit/org/apache/cassandra/db/marshal/ValueAccessors.java +++ b/test/unit/org/apache/cassandra/db/marshal/ValueAccessors.java @@ -22,7 +22,8 @@ public class ValueAccessors { - public static final ValueAccessor[] ACCESSORS = new ValueAccessor[]{ ByteBufferAccessor.instance, ByteArrayAccessor.instance }; + public static final ValueAccessor[] ACCESSORS = new ValueAccessor[]{ ByteBufferAccessor.instance, ByteArrayAccessor.instance, NativeAccessor.instance }; + public static final ValueAccessor[] FACTORY_SUPPORTED_ACCESSORS = new ValueAccessor[]{ ByteBufferAccessor.instance, ByteArrayAccessor.instance }; public static void assertDataEquals(V1 expected, ValueAccessor expectedAccessor, V2 actual, ValueAccessor actualAccessor) { diff --git a/test/unit/org/apache/cassandra/db/rows/ColumnMetadataVersionComparatorTest.java b/test/unit/org/apache/cassandra/db/rows/ColumnMetadataVersionComparatorTest.java index 854421a91e20..d2988624a8b0 100644 --- a/test/unit/org/apache/cassandra/db/rows/ColumnMetadataVersionComparatorTest.java +++ b/test/unit/org/apache/cassandra/db/rows/ColumnMetadataVersionComparatorTest.java @@ -194,8 +194,8 @@ private void checkComparisonResults(AbstractType oldVersion, AbstractType private static int compare(AbstractType left, AbstractType right) { - ColumnMetadata v1 = ColumnMetadata.regularColumn("ks", "t", "c", left); - ColumnMetadata v2 = ColumnMetadata.regularColumn("ks", "t", "c", right); + ColumnMetadata v1 = ColumnMetadata.regularColumn("ks", "t", "c", left, ColumnMetadata.NO_UNIQUE_ID); + ColumnMetadata v2 = ColumnMetadata.regularColumn("ks", "t", "c", right, ColumnMetadata.NO_UNIQUE_ID); return ColumnMetadataVersionComparator.INSTANCE.compare(v1, v2); } } diff --git a/test/unit/org/apache/cassandra/db/rows/RowsTest.java b/test/unit/org/apache/cassandra/db/rows/RowsTest.java index a4436da88a7a..865d33a8b6a1 100644 --- a/test/unit/org/apache/cassandra/db/rows/RowsTest.java +++ b/test/unit/org/apache/cassandra/db/rows/RowsTest.java @@ -45,6 +45,8 @@ import org.apache.cassandra.utils.ByteBufferUtil; import org.apache.cassandra.utils.FBUtilities; +import static org.apache.cassandra.db.CellTest.assertCellsEqual; + public class RowsTest { private static final String KEYSPACE = "rows_test"; @@ -526,6 +528,58 @@ public void mergeRowDeletionSupercedesLiveness() Assert.assertEquals(0, merged.columns().size()); } + + public static BufferCell expiringWithExpirationTime(ColumnMetadata column, long timestamp, int ttl, long localDeletionTime, ByteBuffer value) + { + return expiringWithExpirationTime(column, timestamp, ttl, localDeletionTime, value, null); + } + + public static BufferCell expiringWithExpirationTime(ColumnMetadata column, long timestamp, int ttl, long localDeletionTime, ByteBuffer value, CellPath path) + { + assert ttl != Cell.NO_TTL; + return new BufferCell(column, timestamp, ttl, localDeletionTime, value, path); + } + + @Test + public void mergeRowsWithSameExpiryDifferentTTLCommutesLiveness() + { + long now1 = FBUtilities.nowInSeconds(); + long ts1 = secondToTs(now1); + long ldt = now1 + 1000; + + Row.Builder r1Builder = BTreeRow.unsortedBuilder(); + r1Builder.newRow(c1); + LivenessInfo originalLiveness = LivenessInfo.withExpirationTime(ts1, 100, ldt); + r1Builder.addPrimaryKeyLivenessInfo(originalLiveness); + + Row.Builder r2Builder = BTreeRow.unsortedBuilder(); + r2Builder.newRow(c1); + LivenessInfo loweredTTL = LivenessInfo.withExpirationTime(ts1, 50, ldt); + r2Builder.addPrimaryKeyLivenessInfo(loweredTTL); + + Cell r2v = expiringWithExpirationTime(v, ts1, 75, ldt, BB1); + Cell r2m2 = expiringWithExpirationTime(m, ts1, 50, ldt, BB1, CellPath.create(BB2)); + Cell r2m3 = expiringWithExpirationTime(m, ts1, 75, ldt, BB2, CellPath.create(BB3)); + Cell r2m4 = expiringWithExpirationTime(m, ts1, 100, ldt, BB3, CellPath.create(BB4)); + List> expectedCells = Lists.newArrayList(r2v, r2m2, r2m3, r2m4); + + expectedCells.forEach(r1Builder::addCell); + expectedCells.forEach(r2Builder::addCell); + + Row r1 = r1Builder.build(); + Row r2 = r2Builder.build(); + + Row r1r2 = Rows.merge(r1, r2); + Row r2r1 = Rows.merge(r2, r1); + + DiffListener mergedListener = new DiffListener(); + Rows.diff(mergedListener, r1r2, r2r1); + + mergedListener.liveness.forEach(pair -> Assert.assertEquals(pair.merged, pair.original)); + mergedListener.cells.forEach(pair -> assertCellsEqual(pair.merged, pair.original)); + } + + // Creates a dummy cell for a (regular) column for the provided name and without a cellPath. private static Cell liveCell(ColumnMetadata name) { diff --git a/test/unit/org/apache/cassandra/db/streaming/CassandraEntireSSTableStreamWriterTest.java b/test/unit/org/apache/cassandra/db/streaming/CassandraEntireSSTableStreamWriterTest.java index c29f059000c7..abb6c2dc1b1e 100644 --- a/test/unit/org/apache/cassandra/db/streaming/CassandraEntireSSTableStreamWriterTest.java +++ b/test/unit/org/apache/cassandra/db/streaming/CassandraEntireSSTableStreamWriterTest.java @@ -62,6 +62,7 @@ import org.apache.cassandra.utils.ByteBufferUtil; import org.apache.cassandra.utils.FBUtilities; +import static java.util.Collections.emptyList; import static org.apache.cassandra.utils.TimeUUID.Generator.nextTimeUUID; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; @@ -145,14 +146,14 @@ public void testBlockReadingAndWritingOverWire() throws Throwable CassandraEntireSSTableStreamWriter writer = new CassandraEntireSSTableStreamWriter(sstable, session, context); writer.write(out); - session.prepareReceiving(new StreamSummary(sstable.metadata().id, 1, 5104)); + session.prepareReceiving(new StreamSummary(sstable.metadata().id, emptyList(), 1, 5104)); CassandraStreamHeader header = CassandraStreamHeader.builder() .withSSTableVersion(sstable.descriptor.version) .withSSTableLevel(0) .withEstimatedKeys(sstable.estimatedKeys()) - .withSections(Collections.emptyList()) + .withSections(emptyList()) .withSerializationHeader(sstable.header.toComponent()) .withComponentManifest(context.manifest()) .isEntireSSTable(true) @@ -208,7 +209,7 @@ private StreamSession setupStreamingSessionForTest() StreamResultFuture future = StreamResultFuture.createInitiator(nextTimeUUID(), StreamOperation.BOOTSTRAP, Collections.emptyList(), streamCoordinator); InetAddressAndPort peer = FBUtilities.getBroadcastAddressAndPort(); - streamCoordinator.addSessionInfo(new SessionInfo(peer, 0, peer, Collections.emptyList(), Collections.emptyList(), StreamSession.State.INITIALIZED, null)); + streamCoordinator.addSessionInfo(new SessionInfo(peer, 0, peer, emptyList(), emptyList(), StreamSession.State.INITIALIZED, null)); StreamSession session = streamCoordinator.getOrCreateOutboundSession(peer); session.init(future); diff --git a/test/unit/org/apache/cassandra/db/streaming/CassandraStreamManagerTest.java b/test/unit/org/apache/cassandra/db/streaming/CassandraStreamManagerTest.java index 41dcdcb66b88..7085ddf18a47 100644 --- a/test/unit/org/apache/cassandra/db/streaming/CassandraStreamManagerTest.java +++ b/test/unit/org/apache/cassandra/db/streaming/CassandraStreamManagerTest.java @@ -33,15 +33,13 @@ import com.google.common.collect.Lists; import com.google.common.collect.Sets; import com.google.common.util.concurrent.Uninterruptibles; - -import org.apache.cassandra.Util; -import org.apache.cassandra.locator.RangesAtEndpoint; import org.junit.Assert; import org.junit.Before; import org.junit.BeforeClass; import org.junit.Test; import org.apache.cassandra.SchemaLoader; +import org.apache.cassandra.Util; import org.apache.cassandra.concurrent.NamedThreadFactory; import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.cql3.QueryProcessor; @@ -53,6 +51,7 @@ import org.apache.cassandra.io.sstable.Descriptor; import org.apache.cassandra.io.sstable.format.SSTableReader; import org.apache.cassandra.locator.InetAddressAndPort; +import org.apache.cassandra.locator.RangesAtEndpoint; import org.apache.cassandra.net.MessagingService; import org.apache.cassandra.schema.CompactionParams; import org.apache.cassandra.schema.KeyspaceParams; @@ -69,6 +68,7 @@ import org.apache.cassandra.utils.TimeUUID; import org.apache.cassandra.utils.concurrent.Ref; +import static java.util.Collections.emptyList; import static org.apache.cassandra.service.ActiveRepairService.NO_PENDING_REPAIR; import static org.apache.cassandra.service.ActiveRepairService.UNREPAIRED_SSTABLE; import static org.apache.cassandra.utils.TimeUUID.Generator.nextTimeUUID; @@ -272,7 +272,7 @@ private Collection createSummaries() Collection summaries = new ArrayList<>(); for (int i = 0; i < 10; i++) { - StreamSummary summary = new StreamSummary(tbm.id, i, (i + 1) * 10); + StreamSummary summary = new StreamSummary(tbm.id, emptyList(), i, (i + 1) * 10); summaries.add(summary); } return summaries; diff --git a/test/unit/org/apache/cassandra/db/streaming/CassandraStreamReceiverTest.java b/test/unit/org/apache/cassandra/db/streaming/CassandraStreamReceiverTest.java new file mode 100644 index 000000000000..8bf96329bacb --- /dev/null +++ b/test/unit/org/apache/cassandra/db/streaming/CassandraStreamReceiverTest.java @@ -0,0 +1,153 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.db.streaming; + +import java.util.Collections; + +import org.apache.cassandra.config.DatabaseDescriptor; +import org.junit.Before; +import org.junit.Test; + +import org.apache.cassandra.cql3.CQLTester; +import org.apache.cassandra.cql3.QueryProcessor; +import org.apache.cassandra.db.ColumnFamilyStore; +import org.apache.cassandra.db.Keyspace; +import org.apache.cassandra.streaming.StreamOperation; +import org.apache.cassandra.streaming.StreamSession; +import org.mockito.Mock; +import org.mockito.MockitoAnnotations; + +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; +import static org.mockito.Mockito.when; + +/** + * Unit tests for {@link org.apache.cassandra.db.streaming.CassandraStreamReceiver} + */ +public class CassandraStreamReceiverTest extends CQLTester +{ + @Mock + private StreamSession session; + + private static final String CDC_TABLE = "cdc_table"; + private static final String MV_TABLE = "mv_table"; + private static final String CDC_MV_TABLE = "cdc_mv_table"; + private static final String NO_CDC_MV_TABLE = "no_cdc_mv_table"; + + @Before + public void setup() + { + // Set cdc_on_repair_enabled materialized_views_on_repair to true + DatabaseDescriptor.setCDCOnRepairEnabled(true); + DatabaseDescriptor.setMaterializedViewsOnRepairEnabled(true); + // Enable materialized views + DatabaseDescriptor.setMaterializedViewsEnabled(true); + + MockitoAnnotations.initMocks(this); + QueryProcessor.executeInternal(String.format("CREATE TABLE IF NOT EXISTS %s.%s (pk int PRIMARY KEY, v int) WITH cdc=true", KEYSPACE, CDC_TABLE)); + QueryProcessor.executeInternal(String.format("CREATE TABLE IF NOT EXISTS %s.%s (pk int PRIMARY KEY, v int) WITH cdc=false", KEYSPACE, MV_TABLE)); + QueryProcessor.executeInternal(String.format("CREATE MATERIALIZED VIEW IF NOT EXISTS %s.mv AS SELECT * FROM %s.%s WHERE pk IS NOT NULL PRIMARY KEY (pk)", KEYSPACE, KEYSPACE, MV_TABLE)); + QueryProcessor.executeInternal(String.format("CREATE TABLE IF NOT EXISTS %s.%s (pk int PRIMARY KEY, v int) WITH cdc=true", KEYSPACE, CDC_MV_TABLE)); + QueryProcessor.executeInternal(String.format("CREATE MATERIALIZED VIEW IF NOT EXISTS %s.mv2 AS SELECT * FROM %s.%s WHERE pk IS NOT NULL PRIMARY KEY (pk)", KEYSPACE, KEYSPACE, CDC_MV_TABLE)); + QueryProcessor.executeInternal(String.format("CREATE TABLE IF NOT EXISTS %s.%s (pk int PRIMARY KEY, v int) WITH cdc=false", KEYSPACE, NO_CDC_MV_TABLE)); + } + + @Test + public void testRequiresWritePathRepair() + { + // given a CDC table with a materialized view attached to it. + ColumnFamilyStore cfs = Keyspace.open(KEYSPACE).getColumnFamilyStore(CDC_MV_TABLE); + when(session.streamOperation()).thenReturn(StreamOperation.REPAIR); + CassandraStreamReceiver receiver = new CassandraStreamReceiver(cfs, session, Collections.EMPTY_LIST, 1); + + // Should require write path since cdc_on_repair_enabled and materialized_views_on_repair_enabled are both true. + assertTrue(receiver.requiresWritePath(cfs)); + } + + @Test + public void testRequiresWritePathBulkLoad() + { + // given a CDC table with a materialized view attached to it. + ColumnFamilyStore cfs = Keyspace.open(KEYSPACE).getColumnFamilyStore(CDC_MV_TABLE); + when(session.streamOperation()).thenReturn(StreamOperation.BULK_LOAD); + CassandraStreamReceiver receiver = new CassandraStreamReceiver(cfs, session, Collections.EMPTY_LIST, 1); + + // Should require write path since cdc_on_repair_enabled and materialized_views_on_repair_enabled are both true. + assertTrue(receiver.requiresWritePath(cfs)); + } + + @Test + public void testDoesNotRequireWritePathNoCDCOrMV() + { + // Given cdc_on_repaired_enabled and materialized_views_on_repair_enabled are false + // requiresWritePath should still return false for a non-CDC table. + DatabaseDescriptor.setCDCOnRepairEnabled(false); + DatabaseDescriptor.setMaterializedViewsOnRepairEnabled(false); + + ColumnFamilyStore cfs = Keyspace.open(KEYSPACE).getColumnFamilyStore(NO_CDC_MV_TABLE); + when(session.streamOperation()).thenReturn(StreamOperation.BULK_LOAD); + CassandraStreamReceiver receiver = new CassandraStreamReceiver(cfs, session, Collections.EMPTY_LIST, 1); + + assertFalse(receiver.requiresWritePath(cfs)); + } + + @Test + public void testRequiresWritePathRepairMVOnly() + { + // Given cdc_on_repaired_enabled and materialized_views_on_repair_enabled are true + // requiresWritePath should return true for a table with materialized views. + ColumnFamilyStore cfs = Keyspace.open(KEYSPACE).getColumnFamilyStore(MV_TABLE); + when(session.streamOperation()).thenReturn(StreamOperation.REPAIR); + CassandraStreamReceiver receiver = new CassandraStreamReceiver(cfs, session, Collections.EMPTY_LIST, 1); + + assertTrue(receiver.requiresWritePath(cfs)); + } + + @Test + public void testRequiresWritePathRepairCDCOnRepairEnabled() + { + // Given cdc_on_repaired_enabled and materialized_views_on_repair_enabled are true + // requiresWritePath should return true for a table with CDC enabled. + ColumnFamilyStore cfs = Keyspace.open(KEYSPACE).getColumnFamilyStore(CDC_TABLE); + when(session.streamOperation()).thenReturn(StreamOperation.REPAIR); + CassandraStreamReceiver receiver = new CassandraStreamReceiver(cfs, session, Collections.EMPTY_LIST, 1); + DatabaseDescriptor.setCDCOnRepairEnabled(true); + assertTrue(receiver.requiresWritePath(cfs)); + } + + @Test + public void testDoesNotRequireWritePathRepairCDCOnRepairEnabledFalse() + { + // Given cdc_on_repaired_enabled and materialized_views_on_repair_enabled are false + // requiresWritePath should return false for a table with CDC enabled. + DatabaseDescriptor.setCDCOnRepairEnabled(false); + DatabaseDescriptor.setMaterializedViewsOnRepairEnabled(false); + + ColumnFamilyStore cfs = Keyspace.open(KEYSPACE).getColumnFamilyStore(CDC_TABLE); + when(session.streamOperation()).thenReturn(StreamOperation.BULK_LOAD); + CassandraStreamReceiver receiver1 = new CassandraStreamReceiver(cfs, session, Collections.EMPTY_LIST, 1); + assertFalse(receiver1.requiresWritePath(cfs)); + + // When flipping cdc_on_repair_enabled to true + // requiresWritePath should return true. + DatabaseDescriptor.setCDCOnRepairEnabled(true); + CassandraStreamReceiver receiver2 = new CassandraStreamReceiver(cfs, session, Collections.EMPTY_LIST, 1); + assertTrue(receiver2.requiresWritePath(cfs)); + } +} diff --git a/test/unit/org/apache/cassandra/db/streaming/EntireSSTableStreamConcurrentComponentMutationTest.java b/test/unit/org/apache/cassandra/db/streaming/EntireSSTableStreamConcurrentComponentMutationTest.java index 7f0fe2c9c6e4..55793c9cacbc 100644 --- a/test/unit/org/apache/cassandra/db/streaming/EntireSSTableStreamConcurrentComponentMutationTest.java +++ b/test/unit/org/apache/cassandra/db/streaming/EntireSSTableStreamConcurrentComponentMutationTest.java @@ -87,6 +87,7 @@ import org.jboss.byteman.contrib.bmunit.BMRule; import org.jboss.byteman.contrib.bmunit.BMUnitRunner; +import static java.util.Collections.emptyList; import static org.apache.cassandra.service.ActiveRepairService.NO_PENDING_REPAIR; import static org.apache.cassandra.utils.TimeUUID.Generator.nextTimeUUID; import static org.junit.Assert.assertTrue; @@ -228,7 +229,7 @@ private void testStreamWithConcurrentComponentMutation(Callable runBeforeStre streaming.get(3, TimeUnit.MINUTES); concurrentMutations.get(3, TimeUnit.MINUTES); - session.prepareReceiving(new StreamSummary(sstable.metadata().id, 1, 5104)); + session.prepareReceiving(new StreamSummary(sstable.metadata().id, emptyList(), 1, 5104)); StreamMessageHeader messageHeader = new StreamMessageHeader(sstable.metadata().id, peer, session.planId(), false, 0, 0, 0, null); try (DataInputBuffer in = new DataInputBuffer(serializedFile.nioBuffer(), false)) @@ -321,10 +322,10 @@ public void write(ChannelHandlerContext ctx, Object msg, ChannelPromise promise) private StreamSession setupStreamingSessionForTest() { StreamCoordinator streamCoordinator = new StreamCoordinator(StreamOperation.BOOTSTRAP, 1, new NettyStreamingConnectionFactory(), false, false, null, PreviewKind.NONE); - StreamResultFuture future = StreamResultFuture.createInitiator(nextTimeUUID(), StreamOperation.BOOTSTRAP, Collections.emptyList(), streamCoordinator); + StreamResultFuture future = StreamResultFuture.createInitiator(nextTimeUUID(), StreamOperation.BOOTSTRAP, emptyList(), streamCoordinator); InetAddressAndPort peer = FBUtilities.getBroadcastAddressAndPort(); - streamCoordinator.addSessionInfo(new SessionInfo(peer, 0, peer, Collections.emptyList(), Collections.emptyList(), StreamSession.State.INITIALIZED, null)); + streamCoordinator.addSessionInfo(new SessionInfo(peer, 0, peer, emptyList(), emptyList(), StreamSession.State.INITIALIZED, null)); StreamSession session = streamCoordinator.getOrCreateOutboundSession(peer); session.init(future); diff --git a/test/unit/org/apache/cassandra/db/view/ViewUtilsTest.java b/test/unit/org/apache/cassandra/db/view/ViewUtilsTest.java index 2e6870e6a638..5a292bc1955e 100644 --- a/test/unit/org/apache/cassandra/db/view/ViewUtilsTest.java +++ b/test/unit/org/apache/cassandra/db/view/ViewUtilsTest.java @@ -23,35 +23,39 @@ import java.util.Map; import java.util.Optional; -import org.junit.*; - +import org.junit.Assert; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Test; + +import org.apache.cassandra.CassandraTestBase; +import org.apache.cassandra.CassandraTestBase.DDDaemonInitialization; +import org.apache.cassandra.CassandraTestBase.UseOrderPreservingPartitioner; import org.apache.cassandra.ServerTestUtils; -import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.db.Keyspace; -import org.apache.cassandra.dht.OrderPreservingPartitioner; import org.apache.cassandra.dht.OrderPreservingPartitioner.StringToken; import org.apache.cassandra.distributed.test.log.ClusterMetadataTestHelper; import org.apache.cassandra.exceptions.ConfigurationException; import org.apache.cassandra.locator.InetAddressAndPort; import org.apache.cassandra.locator.NetworkTopologyStrategy; import org.apache.cassandra.locator.Replica; -import org.apache.cassandra.tcm.ClusterMetadataService; import org.apache.cassandra.schema.KeyspaceMetadata; import org.apache.cassandra.schema.KeyspaceParams; import org.apache.cassandra.schema.ReplicationParams; import org.apache.cassandra.schema.SchemaTestUtil; import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.tcm.ClusterMetadataService; import org.apache.cassandra.tcm.StubClusterMetadataService; -public class ViewUtilsTest +@DDDaemonInitialization +@UseOrderPreservingPartitioner +public class ViewUtilsTest extends CassandraTestBase { private final String KS = "Keyspace1"; @BeforeClass public static void setUp() throws ConfigurationException, IOException { - DatabaseDescriptor.daemonInitialization(); - DatabaseDescriptor.setPartitionerUnsafe(OrderPreservingPartitioner.instance); ServerTestUtils.cleanupAndLeaveDirs(); Keyspace.setInitialized(); } diff --git a/test/unit/org/apache/cassandra/db/virtual/AbstractLoggerVirtualTableTest.java b/test/unit/org/apache/cassandra/db/virtual/AbstractLoggerVirtualTableTest.java new file mode 100644 index 000000000000..40a926fda73c --- /dev/null +++ b/test/unit/org/apache/cassandra/db/virtual/AbstractLoggerVirtualTableTest.java @@ -0,0 +1,169 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.db.virtual; + +import java.time.Instant; +import java.util.Date; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; + +import com.google.common.collect.ImmutableList; +import org.junit.Ignore; +import org.junit.Test; + +import ch.qos.logback.classic.Level; +import ch.qos.logback.classic.spi.LoggingEvent; +import com.datastax.driver.core.Row; +import org.apache.cassandra.cql3.CQLTester; +import org.apache.cassandra.db.DataRange; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +@Ignore +public abstract class AbstractLoggerVirtualTableTest extends CQLTester +{ + protected final String keyspace = createKeyspaceName(); + + protected AbstractLoggerVirtualTable table; + + @Test + public void testTruncate() + { + registerTable(); + + int numberOfRows = 100; + List loggingEvents = getLoggingEvents(numberOfRows); + loggingEvents.forEach(table::add); + + execute(query("truncate %s")); + + assertTrue(executeNet(query("select timestamp from %s")).all().isEmpty()); + } + + @Test + public void testEmpty() throws Throwable + { + registerTable(); + assertEmpty(execute(query("select * from %s"))); + } + + @Test + public void testInsert() + { + registerTable(); + + int numberOfRows = 1000; + List loggingEvents = getLoggingEvents(numberOfRows); + loggingEvents.forEach(table::add); + + assertEquals(numberOfRows, execute(query("select * from %s")).size()); + } + + @Test + public void testLimitedCapacity() + { + registerTable(100); + + int numberOfRows = 1000; + List loggingEvents = getLoggingEvents(numberOfRows); + loggingEvents.forEach(table::add); + + // even we inserted 1000 rows, only 100 are present as its capacity is bounded + assertEquals(100, numberOfPartitions()); + + // the first record in the table will be the last one which we inserted + LoggingEvent firstEvent = loggingEvents.get(999); + assertRowsNet(executeNet(query("select timestamp from %s limit 1")), + new Object[]{ new Date(firstEvent.getTimeStamp()) }); + + // the last record in the table will be 900th we inserted + List all = executeNet(query("select timestamp from %s")).all(); + assertEquals(100, all.size()); + Row row = all.get(all.size() - 1); + Date timestamp = row.getTimestamp(0); + assertEquals(loggingEvents.get(900).getTimeStamp(), timestamp.getTime()); + } + + protected abstract void registerTable(int maxSize); + + protected abstract void registerTable(); + + protected void registerVirtualTable(AbstractLoggerVirtualTable table) + { + this.table = table; + VirtualKeyspaceRegistry.instance.register(new VirtualKeyspace(table.metadata.keyspace, ImmutableList.of(this.table))); + } + + protected String query(String query) + { + return String.format(query, table.toString()); + } + + protected List getLoggingEvents(int size) + { + return getLoggingEvents(size, Instant.now(), 1); + } + + protected List getLoggingEvents(int size, Instant firstTimestamp, int logsInMillisecond) + { + List logs = new LinkedList<>(); + int partitions = size / logsInMillisecond; + + for (int i = 0; i < partitions; i++) + { + firstTimestamp = firstTimestamp.plusSeconds(i); + + for (int j = 0; j < logsInMillisecond; j++) + logs.add(getLoggingEvent(firstTimestamp.toEpochMilli())); + } + + return logs; + } + + protected int numberOfPartitions() + { + AbstractVirtualTable.DataSet data = table.data(); + Iterator partitions = data.getPartitions(DataRange.allData(table.metadata.partitioner)); + int numberOfPartitions = 0; + + while (partitions.hasNext()) + { + partitions.next(); + numberOfPartitions += 1; + } + + return numberOfPartitions; + } + + protected LoggingEvent getLoggingEvent(long timestamp) + { + LoggingEvent event = new LoggingEvent(); + event.setLevel(Level.INFO); + event.setMessage(getMessage(timestamp)); + event.setLoggerName(AbstractLoggerVirtualTableTest.class.getName()); + event.setThreadName(Thread.currentThread().getName()); + event.setTimeStamp(timestamp); + + return event; + } + + protected abstract String getMessage(long timestamp); +} diff --git a/test/unit/org/apache/cassandra/db/virtual/AccordDebugKeyspaceTest.java b/test/unit/org/apache/cassandra/db/virtual/AccordDebugKeyspaceTest.java new file mode 100644 index 000000000000..fec005868d6d --- /dev/null +++ b/test/unit/org/apache/cassandra/db/virtual/AccordDebugKeyspaceTest.java @@ -0,0 +1,253 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.db.virtual; + +import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentMap; +import java.util.concurrent.ConcurrentSkipListSet; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeUnit; +import java.util.function.BiPredicate; + +import org.junit.BeforeClass; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import accord.api.ProtocolModifiers; +import accord.messages.TxnRequest; +import accord.primitives.Routable; +import accord.primitives.SaveStatus; +import accord.primitives.Txn; +import accord.primitives.TxnId; +import accord.utils.async.AsyncChains; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.config.OptionaldPositiveInt; +import org.apache.cassandra.cql3.CQLTester; +import org.apache.cassandra.cql3.UntypedResultSet; +import org.apache.cassandra.locator.InetAddressAndPort; +import org.apache.cassandra.net.Message; +import org.apache.cassandra.net.MessagingService; +import org.apache.cassandra.net.Verb; +import org.apache.cassandra.schema.SchemaConstants; +import org.apache.cassandra.service.accord.AccordService; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.concurrent.Condition; +import org.awaitility.Awaitility; + +import static accord.primitives.TxnId.FastPath.Unoptimised; +import static org.apache.cassandra.Util.spinUntilSuccess; +import static org.apache.cassandra.service.accord.AccordTestUtils.createTxn; + +public class AccordDebugKeyspaceTest extends CQLTester +{ + private static final Logger logger = LoggerFactory.getLogger(AccordDebugKeyspaceTest.class); + + private static final String QUERY_TXN_BLOCKED_BY = + String.format("SELECT * FROM %s.%s WHERE txn_id=?", SchemaConstants.VIRTUAL_ACCORD_DEBUG, AccordDebugKeyspace.TXN_BLOCKED_BY); + + @BeforeClass + public static void setUpClass() + { + daemonInitialization(); + DatabaseDescriptor.getAccord().queue_shard_count = new OptionaldPositiveInt(1); + DatabaseDescriptor.getAccord().command_store_shard_count = new OptionaldPositiveInt(1); + + CQLTester.setUpClass(); + + AccordService.startup(ClusterMetadata.current().myNodeId()); + VirtualKeyspaceRegistry.instance.register(AccordDebugKeyspace.instance); + requireNetwork(); + } + + @Test + public void unknownIsEmpty() + { + createTable("CREATE TABLE %s (k int, c int, v int, PRIMARY KEY (k, c)) WITH transactional_mode = 'full'"); + assertRows(execute(QUERY_TXN_BLOCKED_BY, TxnId.NONE.toString())); + } + + @Test + public void completedTxn() throws ExecutionException, InterruptedException + { + String tableName = createTable("CREATE TABLE %s (k int, c int, v int, PRIMARY KEY (k, c)) WITH transactional_mode = 'full'"); + var accord = accord(); + TxnId id = accord.node().nextTxnId(Txn.Kind.Write, Routable.Domain.Key); + Txn txn = createTxn(wrapInTxn(String.format("INSERT INTO %s.%s(k, c, v) VALUES (?, ?, ?)", KEYSPACE, tableName)), 0, 0, 0); + AsyncChains.getBlocking(accord.node().coordinate(id, txn)); + + spinUntilSuccess(() -> assertRows(execute(QUERY_TXN_BLOCKED_BY, id.toString()), + row(id.toString(), KEYSPACE, tableName, anyInt(), 0, ByteBufferUtil.EMPTY_BYTE_BUFFER, "Self", any(), null, anyOf(SaveStatus.ReadyToExecute.name(), SaveStatus.Applying.name(), SaveStatus.Applied.name())))); + } + + @Test + public void inflight() throws ExecutionException, InterruptedException + { + AccordMsgFilter filter = new AccordMsgFilter(); + MessagingService.instance().outboundSink.add(filter); + try + { + String tableName = createTable("CREATE TABLE %s (k int, c int, v int, PRIMARY KEY (k, c)) WITH transactional_mode = 'full'"); + var accord = accord(); + TxnId id = accord.node().nextTxnId(Txn.Kind.Write, Routable.Domain.Key); + String insertTxn = String.format("BEGIN TRANSACTION\n" + + " LET r = (SELECT * FROM %s.%s WHERE k = ? AND c = ?);\n" + + " IF r IS NULL THEN\n " + + " INSERT INTO %s.%s (k, c, v) VALUES (?, ?, ?);\n" + + " END IF\n" + + "COMMIT TRANSACTION", KEYSPACE, tableName, KEYSPACE, tableName); + Txn txn = createTxn(insertTxn, 0, 0, 0, 0, 0); + accord.node().coordinate(id, txn); + + filter.preAccept.awaitThrowUncheckedOnInterrupt(); + + assertRows(execute(QUERY_TXN_BLOCKED_BY, id.toString()), + row(id.toString(), KEYSPACE, tableName, anyInt(), 0, ByteBufferUtil.EMPTY_BYTE_BUFFER, "Self", any(), null, anyOf(SaveStatus.PreAccepted.name(), SaveStatus.ReadyToExecute.name()))); + + filter.apply.awaitThrowUncheckedOnInterrupt(); + assertRows(execute(QUERY_TXN_BLOCKED_BY, id.toString()), + row(id.toString(), KEYSPACE, tableName, anyInt(), 0, ByteBufferUtil.EMPTY_BYTE_BUFFER, "Self", any(), null, SaveStatus.ReadyToExecute.name())); + } + finally + { + MessagingService.instance().outboundSink.remove(filter); + } + } + + @Test + public void blocked() throws ExecutionException, InterruptedException + { + ProtocolModifiers.Toggles.setPermitLocalExecution(false); + ProtocolModifiers.Toggles.setPermittedFastPaths(new TxnId.FastPaths(Unoptimised)); + AccordMsgFilter filter = new AccordMsgFilter(); + MessagingService.instance().outboundSink.add(filter); + try + { + String tableName = createTable("CREATE TABLE %s (k int, c int, v int, PRIMARY KEY (k, c)) WITH transactional_mode = 'full'"); + var accord = accord(); + TxnId first = accord.node().nextTxnId(Txn.Kind.Write, Routable.Domain.Key); + String insertTxn = String.format("BEGIN TRANSACTION\n" + + " LET r = (SELECT * FROM %s.%s WHERE k = ? AND c = ?);\n" + + " IF r IS NULL THEN\n " + + " INSERT INTO %s.%s (k, c, v) VALUES (?, ?, ?);\n" + + " END IF\n" + + "COMMIT TRANSACTION", KEYSPACE, tableName, KEYSPACE, tableName); + accord.node().coordinate(first, createTxn(insertTxn, 0, 0, 0, 0, 0)); + + filter.preAccept.awaitThrowUncheckedOnInterrupt(); + assertRows(execute(QUERY_TXN_BLOCKED_BY, first.toString()), + row(first.toString(), KEYSPACE, tableName, anyInt(), 0, ByteBufferUtil.EMPTY_BYTE_BUFFER, "Self", any(), null, anyOf(SaveStatus.PreAccepted.name(), SaveStatus.ReadyToExecute.name()))); + + filter.apply.awaitThrowUncheckedOnInterrupt(); + assertRows(execute(QUERY_TXN_BLOCKED_BY, first.toString()), + row(first.toString(), KEYSPACE, tableName, anyInt(), 0, ByteBufferUtil.EMPTY_BYTE_BUFFER, "Self", anyNonNull(), null, SaveStatus.ReadyToExecute.name())); + + filter.reset(); + + TxnId second = accord.node().nextTxnId(Txn.Kind.Write, Routable.Domain.Key); + accord.node().coordinate(second, createTxn(insertTxn, 0, 0, 0, 0, 0)); + + filter.commit.awaitThrowUncheckedOnInterrupt(); + + Awaitility.await("waiting on key").atMost(1, TimeUnit.MINUTES) + .until(() -> { + UntypedResultSet rs = execute(QUERY_TXN_BLOCKED_BY, second.toString()); + return rs.size() == 2; + }); + assertRows(execute(QUERY_TXN_BLOCKED_BY, second.toString()), + row(second.toString(), KEYSPACE, tableName, anyInt(), 0, ByteBufferUtil.EMPTY_BYTE_BUFFER, "Self", anyNonNull(), null, SaveStatus.Stable.name()), + row(second.toString(), KEYSPACE, tableName, anyInt(), 1, first.toString(), "Key", anyNonNull(), anyNonNull(), SaveStatus.ReadyToExecute.name())); + } + finally + { + MessagingService.instance().outboundSink.remove(filter); + } + } + + private static AccordService accord() + { + return (AccordService) AccordService.instance(); + } + + private static class AccordMsgFilter implements BiPredicate, InetAddressAndPort> + { + volatile Condition preAccept = Condition.newOneTimeCondition(); + volatile Condition commit = Condition.newOneTimeCondition(); + volatile Condition apply = Condition.newOneTimeCondition(); + + void reset() + { + preAccept = Condition.newOneTimeCondition(); + commit = Condition.newOneTimeCondition(); + apply = Condition.newOneTimeCondition(); + } + + ConcurrentMap> txnToVerbs = new ConcurrentHashMap<>(); + + @Override + public boolean test(Message msg, InetAddressAndPort to) + { + if (!msg.verb().name().startsWith("ACCORD_")) + return true; + TxnId txnId = null; + if (msg.payload instanceof TxnRequest) + { + txnId = ((TxnRequest) msg.payload).txnId; + } + Set seen; + if (txnId != null) + { + seen = txnToVerbs.computeIfAbsent(txnId, ignore -> new ConcurrentSkipListSet<>()); + seen.add(msg.verb()); + } + switch (msg.verb()) + { + case ACCORD_APPLY_REQ: + case ACCORD_APPLY_AND_WAIT_REQ: + apply.signalAll(); + case ACCORD_BEGIN_RECOVER_REQ: + return false; + case ACCORD_PRE_ACCEPT_RSP: + preAccept.signalAll(); + return true; + case ACCORD_COMMIT_REQ: + case ACCORD_STABLE_THEN_READ_REQ: + commit.signalAll(); + return true; + case ACCORD_PRE_ACCEPT_REQ: + case ACCORD_ACCEPT_REQ: + case ACCORD_ACCEPT_RSP: + case ACCORD_CHECK_STATUS_REQ: + case ACCORD_CHECK_STATUS_RSP: + case ACCORD_READ_RSP: + case ACCORD_AWAIT_REQ: + case ACCORD_AWAIT_RSP: + case ACCORD_AWAIT_ASYNC_RSP_REQ: + return true; + default: + // many code paths don't log the error... + UnsupportedOperationException e = new UnsupportedOperationException(msg.verb().name()); + logger.error("Unexpected verb {}", msg.verb(), e); + throw e; + } + } + } +} \ No newline at end of file diff --git a/test/unit/org/apache/cassandra/db/virtual/AccordVirtualTablesTest.java b/test/unit/org/apache/cassandra/db/virtual/AccordVirtualTablesTest.java new file mode 100644 index 000000000000..d53906cc3c7d --- /dev/null +++ b/test/unit/org/apache/cassandra/db/virtual/AccordVirtualTablesTest.java @@ -0,0 +1,177 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.db.virtual; + +import java.util.Collections; +import java.util.List; +import java.util.Set; + +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Test; + +import accord.api.ConfigurationService; +import accord.api.TopologySorter; +import accord.local.Node; +import accord.primitives.Ranges; +import accord.topology.Shard; +import accord.topology.Topologies; +import accord.topology.Topology; +import accord.topology.TopologyManager; +import accord.utils.SortedArrays; +import accord.utils.async.AsyncResults; +import org.apache.cassandra.ServerTestUtils; +import org.apache.cassandra.cql3.CQLTester; +import org.apache.cassandra.schema.Schema; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.service.accord.AccordService; +import org.apache.cassandra.service.accord.IAccordService; +import org.apache.cassandra.service.accord.TokenRange; +import org.mockito.Mockito; + +import static org.apache.cassandra.config.DatabaseDescriptor.getPartitioner; +import static org.apache.cassandra.schema.SchemaConstants.VIRTUAL_VIEWS; + +public class AccordVirtualTablesTest extends CQLTester +{ + public static final Node.Id N1 = new Node.Id(1); + public static final SortedArrays.SortedArrayList ALL = SortedArrays.SortedArrayList.ofSorted(N1); + public static final Set FP = Collections.singleton(N1); + public static final String SUCCESS = "success"; + public static final String PENDING = "pending"; + public static final List FULL_RANGE = List.of("(-Inf, +Inf]"); + + public static TableId T1; + public static TableMetadata T1_META; + + @BeforeClass + public static void setup() + { + addVirtualKeyspace(); + } + + @Before + public void setupTables() + { + if (T1_META != null) return; + + String tbl1 = createTable("CREATE TABLE %s(pk int primary key)"); + T1_META = Schema.instance.getTableMetadata(keyspace(), tbl1); + T1 = T1_META.id; + + ServerTestUtils.markCMS(); + } + + @Test + public void emptyEpochs() + { + TopologyManager tm = empty(); + assertRows(execute("SELECT * FROM " + VIRTUAL_VIEWS + "." + AccordVirtualTables.EPOCHS)); + assertRows(execute("SELECT * FROM " + VIRTUAL_VIEWS + "." + AccordVirtualTables.TABLE_EPOCHS)); + } + + @Test + public void epochUpdates() + { + TopologyManager tm = empty(); + long e1 = 1; + tm.onTopologyUpdate(topology(e1, T1), () -> ConfigurationService.EpochReady.done(e1), e -> {}); + assertRows(execute("SELECT * FROM " + VIRTUAL_VIEWS + "." + AccordVirtualTables.EPOCHS), + row(e1, true, SUCCESS, SUCCESS, SUCCESS, SUCCESS)); + + long e2 = 2; + tm.onTopologyUpdate(topology(e2, T1), () -> pendingReady(e1), e -> {}); + assertRows(execute("SELECT * FROM " + VIRTUAL_VIEWS + "." + AccordVirtualTables.EPOCHS), + row(e2, false, PENDING, PENDING, PENDING, PENDING), + row(e1, true, SUCCESS, SUCCESS, SUCCESS, SUCCESS)); + } + + @Test + public void tableUpdates() + { + TopologyManager tm = empty(); + long e1 = 1; + tm.onTopologyUpdate(topology(e1, T1), () -> ConfigurationService.EpochReady.done(e1), e -> {}); + + // the range was added in the first epoch, so its fully synced + assertRows(execute("SELECT * FROM " + VIRTUAL_VIEWS + "." + AccordVirtualTables.TABLE_EPOCHS), + row(e1, T1_META.keyspace, T1_META.name, FULL_RANGE, List.of(), List.of(), List.of(), FULL_RANGE)); + + // range is no longer "added" so doesn't show up as synced! + long e2 = 2; + tm.onTopologyUpdate(topology(e2, T1), () -> ConfigurationService.EpochReady.done(e2), e -> {}); + assertRows(execute("SELECT * FROM " + VIRTUAL_VIEWS + "." + AccordVirtualTables.TABLE_EPOCHS), + row(e1, T1_META.keyspace, T1_META.name, FULL_RANGE, List.of(), List.of(), List.of(), FULL_RANGE)); + + // sync the range + tm.onEpochSyncComplete(N1, e2); + assertRows(execute("SELECT * FROM " + VIRTUAL_VIEWS + "." + AccordVirtualTables.TABLE_EPOCHS), + row(e2, T1_META.keyspace, T1_META.name, List.of(), List.of(), List.of(), List.of(), FULL_RANGE), + row(e1, T1_META.keyspace, T1_META.name, FULL_RANGE, List.of(), List.of(), List.of(), FULL_RANGE)); + + // lets close e2 + tm.onEpochClosed(Ranges.single(TokenRange.fullRange(T1, getPartitioner())), e2); + assertRows(execute("SELECT * FROM " + VIRTUAL_VIEWS + "." + AccordVirtualTables.TABLE_EPOCHS), + row(e2, T1_META.keyspace, T1_META.name, List.of(), FULL_RANGE, List.of(), List.of(), FULL_RANGE), + row(e1, T1_META.keyspace, T1_META.name, FULL_RANGE, FULL_RANGE, List.of(), List.of(), FULL_RANGE)); + + // enjoy retirement! + tm.onEpochRetired(Ranges.single(TokenRange.fullRange(T1, getPartitioner())), e2); + assertRows(execute("SELECT * FROM " + VIRTUAL_VIEWS + "." + AccordVirtualTables.TABLE_EPOCHS), + row(e2, T1_META.keyspace, T1_META.name, List.of(), FULL_RANGE, List.of(), FULL_RANGE, FULL_RANGE), + row(e1, T1_META.keyspace, T1_META.name, FULL_RANGE, FULL_RANGE, List.of(), FULL_RANGE, FULL_RANGE)); + } + + private static ConfigurationService.EpochReady pendingReady(long epoch) + { + return new ConfigurationService.EpochReady(epoch, AsyncResults.settable(), AsyncResults.settable(), AsyncResults.settable(), AsyncResults.settable()); + } + + private static Topology topology(long epoch, TableId tableId) + { + TokenRange all = TokenRange.fullRange(tableId, getPartitioner()); + return new Topology(epoch, Shard.create(all, ALL, FP)); + } + + private static TopologyManager empty() + { + TopologySorter sorter = (TopologySorter.StaticSorter) (node1, node2, shards) -> 0; + TopologySorter.Supplier supplier = new TopologySorter.Supplier() + { + @Override + public TopologySorter get(Topology topologies) + { + return sorter; + } + + @Override + public TopologySorter get(Topologies topologies) + { + return sorter; + } + }; + TopologyManager tm = new TopologyManager(supplier, null, N1, null, null); + + var mock = Mockito.mock(IAccordService.class); + Mockito.when(mock.topology()).thenReturn(tm); + AccordService.unsafeSetNewAccordService(mock); + return tm; + } +} \ No newline at end of file diff --git a/test/unit/org/apache/cassandra/db/virtual/CIDRFilteringMetricsTableTest.java b/test/unit/org/apache/cassandra/db/virtual/CIDRFilteringMetricsTableTest.java index b357c00e0fb7..b9d78ac464de 100644 --- a/test/unit/org/apache/cassandra/db/virtual/CIDRFilteringMetricsTableTest.java +++ b/test/unit/org/apache/cassandra/db/virtual/CIDRFilteringMetricsTableTest.java @@ -35,14 +35,11 @@ import com.codahale.metrics.Snapshot; import org.apache.cassandra.SchemaLoader; import org.apache.cassandra.auth.AuthCacheService; -import org.apache.cassandra.auth.AuthKeyspace; import org.apache.cassandra.auth.AuthTestUtils; import org.apache.cassandra.auth.AuthenticatedUser; -import org.apache.cassandra.auth.CassandraRoleManager; import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.cql3.CIDR; import org.apache.cassandra.cql3.CQLTester; -import org.apache.cassandra.cql3.QueryProcessor; import org.apache.cassandra.cql3.UntypedResultSet; import org.apache.cassandra.db.Keyspace; import org.apache.cassandra.exceptions.ConfigurationException; @@ -58,16 +55,6 @@ public class CIDRFilteringMetricsTableTest extends CQLTester { private static final String KS_NAME = "vts"; - private static void setupSuperUser() - { - QueryProcessor.executeInternal(String.format("INSERT INTO %s.%s (role, is_superuser, can_login, salted_hash) " + - "VALUES ('%s', true, true, '%s')", - AUTH_KEYSPACE_NAME, - AuthKeyspace.ROLES, - CassandraRoleManager.DEFAULT_SUPERUSER_NAME, - "xxx")); - } - @BeforeClass public static void defineSchema() throws ConfigurationException { @@ -77,7 +64,7 @@ public static void defineSchema() throws ConfigurationException new AuthTestUtils.LocalCassandraNetworkAuthorizer(), new AuthTestUtils.LocalCassandraCIDRAuthorizer()); AuthCacheService.initializeAndRegisterCaches(); - setupSuperUser(); + AuthTestUtils.setupSuperUser(); } @Before diff --git a/test/unit/org/apache/cassandra/db/virtual/LocalRepairTablesTest.java b/test/unit/org/apache/cassandra/db/virtual/LocalRepairTablesTest.java index 96f09400ecaf..5aa94cc666d5 100644 --- a/test/unit/org/apache/cassandra/db/virtual/LocalRepairTablesTest.java +++ b/test/unit/org/apache/cassandra/db/virtual/LocalRepairTablesTest.java @@ -40,6 +40,7 @@ import org.apache.cassandra.repair.CommonRange; import org.apache.cassandra.repair.RepairJobDesc; import org.apache.cassandra.repair.RepairCoordinator; +import org.apache.cassandra.repair.SharedContext; import org.apache.cassandra.repair.messages.PrepareMessage; import org.apache.cassandra.repair.messages.RepairOption; import org.apache.cassandra.repair.state.Completable; @@ -291,7 +292,7 @@ private static InetAddressAndPort address(int a, int b, int c, int d) private static CoordinatorState coordinator() { RepairOption options = RepairOption.parse(Collections.emptyMap(), DatabaseDescriptor.getPartitioner()); - CoordinatorState state = new CoordinatorState(Clock.Global.clock(), 0, "test", options); + CoordinatorState state = new CoordinatorState(SharedContext.Global.instance, 0, "test", options); ActiveRepairService.instance().register(state); return state; } @@ -299,7 +300,7 @@ private static CoordinatorState coordinator() private static SessionState session() { CoordinatorState parent = coordinator(); - SessionState state = new SessionState(Clock.Global.clock(), parent.id, REPAIR_KS, new String[]{ REPAIR_TABLE }, COMMON_RANGE); + SessionState state = new SessionState(SharedContext.Global.instance, parent.id, REPAIR_KS, new String[]{ REPAIR_TABLE }, COMMON_RANGE); parent.register(state); return state; } diff --git a/test/unit/org/apache/cassandra/db/virtual/LogMessagesTableTest.java b/test/unit/org/apache/cassandra/db/virtual/LogMessagesTableTest.java index dd32058533db..7025e8ad4cb9 100644 --- a/test/unit/org/apache/cassandra/db/virtual/LogMessagesTableTest.java +++ b/test/unit/org/apache/cassandra/db/virtual/LogMessagesTableTest.java @@ -20,69 +20,34 @@ import java.time.Instant; import java.util.Date; -import java.util.Iterator; -import java.util.LinkedList; import java.util.List; -import com.google.common.collect.ImmutableList; import org.junit.Test; -import ch.qos.logback.classic.Level; import ch.qos.logback.classic.spi.LoggingEvent; import com.datastax.driver.core.Row; -import org.apache.cassandra.cql3.CQLTester; -import org.apache.cassandra.db.DataRange; -import org.apache.cassandra.db.marshal.TimestampType; -import org.apache.cassandra.db.virtual.AbstractVirtualTable.DataSet; -import org.apache.cassandra.db.virtual.AbstractVirtualTable.Partition; -import org.apache.cassandra.dht.LocalPartitioner; import static org.apache.cassandra.config.CassandraRelevantProperties.LOGS_VIRTUAL_TABLE_MAX_ROWS; +import static org.apache.cassandra.db.virtual.LogMessagesTable.LOGS_VIRTUAL_TABLE_DEFAULT_ROWS; import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; -public class LogMessagesTableTest extends CQLTester +public class LogMessagesTableTest extends AbstractLoggerVirtualTableTest { - private String keyspace = createKeyspaceName(); - private LogMessagesTable table; - - @Test - public void testTruncate() throws Throwable - { - registerVirtualTable(); - - int numberOfRows = 100; - List loggingEvents = getLoggingEvents(numberOfRows); - loggingEvents.forEach(table::add); - - execute(query("truncate %s")); - - assertTrue(executeNet(query("select timestamp from %s")).all().isEmpty()); - } - - @Test - public void empty() throws Throwable - { - registerVirtualTable(); - assertEmpty(execute(query("select * from %s"))); - } - @Test - public void testInsert() + public void testMultipleLogsInSameMillisecond() { - registerVirtualTable(); - - int numberOfRows = 1000; - List loggingEvents = getLoggingEvents(numberOfRows); + registerTable(); + List loggingEvents = getLoggingEvents(10, Instant.now(), 5); loggingEvents.forEach(table::add); - assertEquals(numberOfRows, numberOfPartitions()); + // 2 partitions, 5 rows in each + assertEquals(2, numberOfPartitions()); } @Test - public void testLimitedCapacity() throws Throwable + public void testLimitedCapacity() { - registerVirtualTable(100); + registerTable(100); int numberOfRows = 1000; List loggingEvents = getLoggingEvents(numberOfRows); @@ -94,7 +59,7 @@ public void testLimitedCapacity() throws Throwable // the first record in the table will be the last one which we inserted LoggingEvent firstEvent = loggingEvents.get(999); assertRowsNet(executeNet(query("select timestamp from %s limit 1")), - new Object[] { new Date(firstEvent.getTimeStamp()) }); + new Object[]{ new Date(firstEvent.getTimeStamp()) }); // the last record in the table will be 900th we inserted List all = executeNet(query("select timestamp from %s")).all(); @@ -104,100 +69,47 @@ public void testLimitedCapacity() throws Throwable assertEquals(loggingEvents.get(900).getTimeStamp(), timestamp.getTime()); } - @Test - public void testMultipleLogsInSameMillisecond() - { - registerVirtualTable(10); - List loggingEvents = getLoggingEvents(10, Instant.now(), 5); - loggingEvents.forEach(table::add); - - // 2 partitions, 5 rows in each - assertEquals(2, numberOfPartitions()); - } - @Test public void testResolvingBufferSize() { LOGS_VIRTUAL_TABLE_MAX_ROWS.setInt(-1); - assertEquals(LogMessagesTable.LOGS_VIRTUAL_TABLE_DEFAULT_ROWS, LogMessagesTable.resolveBufferSize()); + assertEquals(LOGS_VIRTUAL_TABLE_DEFAULT_ROWS, resolveBufferSize()); LOGS_VIRTUAL_TABLE_MAX_ROWS.setInt(0); - assertEquals(LogMessagesTable.LOGS_VIRTUAL_TABLE_DEFAULT_ROWS, LogMessagesTable.resolveBufferSize()); + assertEquals(LOGS_VIRTUAL_TABLE_DEFAULT_ROWS, resolveBufferSize()); LOGS_VIRTUAL_TABLE_MAX_ROWS.setInt(1000001); - assertEquals(LogMessagesTable.LOGS_VIRTUAL_TABLE_DEFAULT_ROWS, LogMessagesTable.resolveBufferSize()); + assertEquals(LOGS_VIRTUAL_TABLE_DEFAULT_ROWS, resolveBufferSize()); LOGS_VIRTUAL_TABLE_MAX_ROWS.setInt(999); - assertEquals(LogMessagesTable.LOGS_VIRTUAL_TABLE_DEFAULT_ROWS, LogMessagesTable.resolveBufferSize()); + assertEquals(999, resolveBufferSize()); LOGS_VIRTUAL_TABLE_MAX_ROWS.setInt(50001); - assertEquals(50001, LogMessagesTable.resolveBufferSize()); + assertEquals(50001, resolveBufferSize()); } - private void registerVirtualTable() + private int resolveBufferSize() { - registerVirtualTable(LogMessagesTable.LOGS_VIRTUAL_TABLE_MIN_ROWS); + return AbstractLoggerVirtualTable.resolveBufferSize(LOGS_VIRTUAL_TABLE_MAX_ROWS.getInt(), + LogMessagesTable.LOGS_VIRTUAL_TABLE_MAX_ROWS, + LOGS_VIRTUAL_TABLE_DEFAULT_ROWS); } - private void registerVirtualTable(int size) + @Override + protected void registerTable(int maxSize) { - table = new LogMessagesTable(keyspace, size); - VirtualKeyspaceRegistry.instance.register(new VirtualKeyspace(keyspace, ImmutableList.of(table))); + registerVirtualTable(new LogMessagesTable(keyspace, maxSize)); } - private int numberOfPartitions() + @Override + protected void registerTable() { - DataSet data = table.data(); - - Iterator partitions = data.getPartitions(DataRange.allData(new LocalPartitioner(TimestampType.instance))); - - int numberOfPartitions = 0; - - while (partitions.hasNext()) - { - partitions.next(); - numberOfPartitions += 1; - } - - return numberOfPartitions; - } - - private String query(String query) - { - return String.format(query, table.toString()); - } - - private List getLoggingEvents(int size) - { - return getLoggingEvents(size, Instant.now(), 1); - } - - private List getLoggingEvents(int size, Instant firstTimestamp, int logsInMillisecond) - { - List logs = new LinkedList<>(); - int partitions = size / logsInMillisecond; - - for (int i = 0; i < partitions; i++) - { - long timestamp = firstTimestamp.toEpochMilli(); - firstTimestamp = firstTimestamp.plusSeconds(1); - - for (int j = 0; j < logsInMillisecond; j++) - logs.add(getLoggingEvent(timestamp)); - } - - return logs; + registerTable(1000); } - private LoggingEvent getLoggingEvent(long timestamp) + @Override + protected String getMessage(long timestamp) { - LoggingEvent event = new LoggingEvent(); - event.setLevel(Level.INFO); - event.setMessage("message " + timestamp); - event.setLoggerName("logger " + timestamp); - event.setThreadName(Thread.currentThread().getName()); - event.setTimeStamp(timestamp); - - return event; + return "message " + timestamp; } } diff --git a/test/unit/org/apache/cassandra/db/virtual/SettingsTableTest.java b/test/unit/org/apache/cassandra/db/virtual/SettingsTableTest.java index 71b9172da798..78fb0d47d6ec 100644 --- a/test/unit/org/apache/cassandra/db/virtual/SettingsTableTest.java +++ b/test/unit/org/apache/cassandra/db/virtual/SettingsTableTest.java @@ -31,6 +31,7 @@ import com.datastax.driver.core.Row; import org.apache.cassandra.config.Config; import org.apache.cassandra.config.DurationSpec; +import org.apache.cassandra.config.EncryptionOptions.ServerEncryptionOptions.Builder; import org.apache.cassandra.config.EncryptionOptions.ServerEncryptionOptions.InternodeEncryption; import org.apache.cassandra.config.JMXServerOptions; import org.apache.cassandra.config.ParameterizedClass; @@ -38,7 +39,7 @@ import org.apache.cassandra.security.SSLFactory; import org.yaml.snakeyaml.introspector.Property; -import static org.apache.cassandra.config.EncryptionOptions.ClientAuth.REQUIRED; +import static org.apache.cassandra.config.EncryptionOptions.ClientEncryptionOptions.ClientAuth.REQUIRED; public class SettingsTableTest extends CQLTester { @@ -176,52 +177,53 @@ public void testEncryptionOverride() throws Throwable List expectedNames = SettingsTable.PROPERTIES.keySet().stream().filter(n -> n.startsWith("server_encryption")).collect(Collectors.toList()); Assert.assertEquals(expectedNames.size(), executeNet(all).all().size()); + Builder serverEncryptionOptionsBuilder = new Builder(config.server_encryption_options); check(pre + "algorithm", null); - config.server_encryption_options = config.server_encryption_options.withAlgorithm("SUPERSSL"); + config.server_encryption_options = serverEncryptionOptionsBuilder.withAlgorithm("SUPERSSL").build(); check(pre + "algorithm", "SUPERSSL"); check(pre + "cipher_suites", null); - config.server_encryption_options = config.server_encryption_options.withCipherSuites("c1", "c2"); + config.server_encryption_options = serverEncryptionOptionsBuilder.withCipherSuites("c1", "c2").build(); check(pre + "cipher_suites", "[c1, c2]"); // name doesn't match yaml check(pre + "protocol", null); - config.server_encryption_options = config.server_encryption_options.withProtocol("TLSv5"); + config.server_encryption_options = serverEncryptionOptionsBuilder.withProtocol("TLSv5").build(); check(pre + "protocol", "[TLSv5]"); - config.server_encryption_options = config.server_encryption_options.withProtocol("TLS"); + config.server_encryption_options = serverEncryptionOptionsBuilder.withProtocol("TLS").build(); check(pre + "protocol", SSLFactory.tlsInstanceProtocolSubstitution().toString()); - config.server_encryption_options = config.server_encryption_options.withProtocol("TLS"); - config.server_encryption_options = config.server_encryption_options.withAcceptedProtocols(ImmutableList.of("TLSv1.2","TLSv1.1")); + config.server_encryption_options = serverEncryptionOptionsBuilder.withProtocol("TLS").build(); + config.server_encryption_options = serverEncryptionOptionsBuilder.withAcceptedProtocols(ImmutableList.of("TLSv1.2","TLSv1.1")).build(); check(pre + "protocol", "[TLSv1.2, TLSv1.1]"); - config.server_encryption_options = config.server_encryption_options.withProtocol("TLSv2"); - config.server_encryption_options = config.server_encryption_options.withAcceptedProtocols(ImmutableList.of("TLSv1.2","TLSv1.1")); + config.server_encryption_options = serverEncryptionOptionsBuilder.withProtocol("TLSv2").build(); + config.server_encryption_options = serverEncryptionOptionsBuilder.withAcceptedProtocols(ImmutableList.of("TLSv1.2","TLSv1.1")).build(); check(pre + "protocol", "[TLSv1.2, TLSv1.1, TLSv2]"); // protocol goes after the explicit accept list if non-TLS check(pre + "optional", "false"); - config.server_encryption_options = config.server_encryption_options.withOptional(true); + config.server_encryption_options = serverEncryptionOptionsBuilder.withOptional(true).build(); check(pre + "optional", "true"); // name doesn't match yaml check(pre + "client_auth", "false"); - config.server_encryption_options = config.server_encryption_options.withRequireClientAuth(REQUIRED); + config.server_encryption_options = serverEncryptionOptionsBuilder.withRequireClientAuth(REQUIRED).build(); check(pre + "client_auth", "true"); // name doesn't match yaml check(pre + "endpoint_verification", "false"); - config.server_encryption_options = config.server_encryption_options.withRequireEndpointVerification(true); + config.server_encryption_options = serverEncryptionOptionsBuilder.withRequireEndpointVerification(true).build(); check(pre + "endpoint_verification", "true"); check(pre + "internode_encryption", "none"); - config.server_encryption_options = config.server_encryption_options.withInternodeEncryption(InternodeEncryption.all); + config.server_encryption_options = serverEncryptionOptionsBuilder.withInternodeEncryption(InternodeEncryption.all).build(); check(pre + "internode_encryption", "all"); check(pre + "enabled", "true"); // name doesn't match yaml check(pre + "legacy_ssl_storage_port", "false"); - config.server_encryption_options = config.server_encryption_options.withLegacySslStoragePort(true); + config.server_encryption_options = serverEncryptionOptionsBuilder.withLegacySslStoragePort(true).build(); check(pre + "legacy_ssl_storage_port", "true"); } diff --git a/test/unit/org/apache/cassandra/db/virtual/SlowQueriesTableTest.java b/test/unit/org/apache/cassandra/db/virtual/SlowQueriesTableTest.java new file mode 100644 index 000000000000..1c261e7cc95b --- /dev/null +++ b/test/unit/org/apache/cassandra/db/virtual/SlowQueriesTableTest.java @@ -0,0 +1,145 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.db.virtual; + +import java.util.List; +import java.util.Random; + +import org.junit.Test; + +import ch.qos.logback.classic.spi.LoggingEvent; +import org.apache.cassandra.db.monitoring.MonitorableImpl; +import org.apache.cassandra.db.monitoring.MonitoringTask; +import org.apache.cassandra.db.monitoring.MonitoringTask.Operation; +import org.apache.cassandra.utils.Generators; +import org.quicktheories.impl.JavaRandom; + +import static org.apache.cassandra.config.CassandraRelevantProperties.LOGS_SLOW_QUERIES_VIRTUAL_TABLE_MAX_ROWS; +import static org.apache.cassandra.db.virtual.SlowQueriesTable.LOGS_VIRTUAL_TABLE_DEFAULT_ROWS; +import static org.apache.cassandra.db.virtual.SlowQueriesTable.LOGS_VIRTUAL_TABLE_MAX_ROWS; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +public class SlowQueriesTableTest extends AbstractLoggerVirtualTableTest +{ + private final Random random = new Random(); + private final JavaRandom javaRandom = new JavaRandom(random); + + @Override + protected void registerTable(int maxSize) + { + registerVirtualTable(new SlowQueriesTable(keyspace, maxSize)); + } + + @Override + protected void registerTable() + { + registerTable(1000); + } + + @Test + public void testLimitedCapacity() + { + registerTable(100); + + int numberOfRows = 1000; + List loggingEvents = getLoggingEvents(numberOfRows); + assertEquals(1000, loggingEvents.size()); + loggingEvents.forEach(table::add); + + // even we inserted 1000 rows, only 100 are present as its capacity is bounded + assertEquals(100, executeNet(query("select * from %s")).all().size()); + } + + @Test + public void testDelete() + { + registerTable(); + + int numberOfRows = 100; + List loggingEvents = getLoggingEvents(numberOfRows); + loggingEvents.forEach(table::add); + + Operation operation = table.buffer.get(0); + + assertEquals(100, executeNet(query("select * from %s")).all().size()); + execute(query("delete from %s where keyspace_name = '" + operation.keyspace() + '\'')); + assertTrue(executeNet(query("select * from %s")).all().size() < 100); + } + + @Test + public void testResolvingBufferSize() + { + LOGS_SLOW_QUERIES_VIRTUAL_TABLE_MAX_ROWS.setInt(-1); + assertEquals(LOGS_VIRTUAL_TABLE_DEFAULT_ROWS, resolveBufferSize()); + + LOGS_SLOW_QUERIES_VIRTUAL_TABLE_MAX_ROWS.setInt(0); + assertEquals(LOGS_VIRTUAL_TABLE_DEFAULT_ROWS, resolveBufferSize()); + + LOGS_SLOW_QUERIES_VIRTUAL_TABLE_MAX_ROWS.setInt(1000001); + assertEquals(LOGS_VIRTUAL_TABLE_DEFAULT_ROWS, resolveBufferSize()); + + LOGS_SLOW_QUERIES_VIRTUAL_TABLE_MAX_ROWS.setInt(999); + assertEquals(999, resolveBufferSize()); + + LOGS_SLOW_QUERIES_VIRTUAL_TABLE_MAX_ROWS.setInt(50001); + assertEquals(50001, resolveBufferSize()); + } + + private int resolveBufferSize() + { + return AbstractLoggerVirtualTable.resolveBufferSize(LOGS_SLOW_QUERIES_VIRTUAL_TABLE_MAX_ROWS.getInt(), + LOGS_VIRTUAL_TABLE_MAX_ROWS, + LOGS_VIRTUAL_TABLE_DEFAULT_ROWS); + } + + + @Override + protected String getMessage(long timestamp) + { + MonitoringTask.SlowOperation slowOperation = new MonitoringTask.SlowOperation(new MonitorableImpl() + { + @Override + public String name() + { + return Generators.SYMBOL_GEN.generate(javaRandom); + } + + @Override + public String monitoredOnKeyspace() + { + return Generators.SYMBOL_GEN.generate(javaRandom); + } + + @Override + public String monitoredOnTable() + { + return Generators.SYMBOL_GEN.generate(javaRandom); + } + + @Override + public boolean isCrossNode() + { + return random.nextBoolean(); + } + }, timestamp); + + return Operation.serialize(List.of(slowOperation)); + } +} diff --git a/test/unit/org/apache/cassandra/db/virtual/StreamingVirtualTableTest.java b/test/unit/org/apache/cassandra/db/virtual/StreamingVirtualTableTest.java index 30a70338f94b..b00815197842 100644 --- a/test/unit/org/apache/cassandra/db/virtual/StreamingVirtualTableTest.java +++ b/test/unit/org/apache/cassandra/db/virtual/StreamingVirtualTableTest.java @@ -54,6 +54,7 @@ import org.apache.cassandra.utils.FBUtilities; import org.assertj.core.util.Throwables; +import static java.util.Collections.emptyList; import static org.apache.cassandra.utils.LocalizeString.toLowerCaseLocalized; import static org.apache.cassandra.utils.TimeUUID.Generator.nextTimeUUID; @@ -90,15 +91,15 @@ public void single() throws Throwable { StreamingState state = stream(true); assertRows(execute(t("select id, follower, operation, peers, status, progress_percentage, last_updated_at, failure_cause, success_message from %s")), - new Object[] { state.id(), true, "Repair", Collections.emptyList(), "init", 0F, new Date(state.lastUpdatedAtMillis()), null, null }); + new Object[] { state.id(), true, "Repair", emptyList(), "init", 0F, new Date(state.lastUpdatedAtMillis()), null, null }); state.phase.start(); assertRows(execute(t("select id, follower, operation, peers, status, progress_percentage, last_updated_at, failure_cause, success_message from %s")), - new Object[] { state.id(), true, "Repair", Collections.emptyList(), "start", 0F, new Date(state.lastUpdatedAtMillis()), null, null }); + new Object[] { state.id(), true, "Repair", emptyList(), "start", 0F, new Date(state.lastUpdatedAtMillis()), null, null }); - state.handleStreamEvent(new StreamEvent.SessionPreparedEvent(state.id(), new SessionInfo(PEER2, 1, PEER1, Collections.emptyList(), Collections.emptyList(), StreamSession.State.PREPARING, null), StreamSession.PrepareDirection.ACK)); + state.handleStreamEvent(new StreamEvent.SessionPreparedEvent(state.id(), new SessionInfo(PEER2, 1, PEER1, emptyList(), emptyList(), StreamSession.State.PREPARING, null), StreamSession.PrepareDirection.ACK)); - state.onSuccess(new StreamState(state.id(), StreamOperation.REPAIR, ImmutableSet.of(new SessionInfo(PEER2, 1, PEER1, Collections.emptyList(), Collections.emptyList(), StreamSession.State.COMPLETE, null)))); + state.onSuccess(new StreamState(state.id(), StreamOperation.REPAIR, ImmutableSet.of(new SessionInfo(PEER2, 1, PEER1, emptyList(), emptyList(), StreamSession.State.COMPLETE, null)))); assertRows(execute(t("select id, follower, operation, peers, status, progress_percentage, last_updated_at, failure_cause, success_message from %s")), new Object[] { state.id(), true, "Repair", Arrays.asList(address(127, 0, 0, 2).toString()), "success", 100F, new Date(state.lastUpdatedAtMillis()), null, null }); } @@ -222,7 +223,7 @@ private List deterministic(Collection summaries) private static StreamSummary streamSummary() { int files = ThreadLocalRandom.current().nextInt(2, 10); - return new StreamSummary(TableId.fromUUID(UUID.randomUUID()), files, files * 1024); + return new StreamSummary(TableId.fromUUID(UUID.randomUUID()), emptyList(), files, files * 1024); } @Test @@ -232,7 +233,7 @@ public void failed() throws Throwable RuntimeException t = new RuntimeException("You failed!"); state.onFailure(t); assertRows(execute(t("select id, follower, peers, status, progress_percentage, last_updated_at, failure_cause, success_message from %s")), - new Object[] { state.id(), true, Collections.emptyList(), "failure", 100F, new Date(state.lastUpdatedAtMillis()), Throwables.getStackTrace(t), null }); + new Object[] { state.id(), true, emptyList(), "failure", 100F, new Date(state.lastUpdatedAtMillis()), Throwables.getStackTrace(t), null }); } private static String t(String query) diff --git a/test/unit/org/apache/cassandra/dht/AccordSplitterTest.java b/test/unit/org/apache/cassandra/dht/AccordSplitterTest.java new file mode 100644 index 000000000000..ec05ebc6fad9 --- /dev/null +++ b/test/unit/org/apache/cassandra/dht/AccordSplitterTest.java @@ -0,0 +1,121 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.dht; + +import java.math.BigInteger; +import java.util.ArrayList; +import java.util.List; + +import org.junit.BeforeClass; +import org.junit.Test; + +import accord.local.ShardDistributor; +import accord.primitives.Range; +import accord.primitives.Ranges; +import accord.utils.Gens; +import accord.utils.RandomSource; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.service.accord.api.TokenKey; +import org.apache.cassandra.utils.AccordGenerators; +import org.assertj.core.api.Assertions; + +import static accord.utils.Property.qt; + +public class AccordSplitterTest +{ + @BeforeClass + public static void setup() throws NoSuchFieldException, IllegalAccessException + { + DatabaseDescriptor.clientInitialization(); + DatabaseDescriptor.setPartitionerUnsafe(Murmur3Partitioner.instance); + } + + @Test + public void split() + { + qt().forAll(AccordGenerators.range(), Gens.random()).check((range, rs) -> { + TokenKey startKey = (TokenKey) range.start(); + TokenKey endKey = (TokenKey) range.end(); + IPartitioner partitioner = startKey.token().getPartitioner(); + // this section is filtering out known bugs + // TODO (now): fix the fact accordSplitter returns AccordBytesSplitter which will fail for java.lang.ClassCastException: org.apache.cassandra.dht.LocalPartitioner$LocalToken cannot be cast to org.apache.cassandra.dht.ByteOrderedPartitioner$BytesToken + // spoke with Benedict and he agrees that it doesn't make sense to split a local partitioner range, but this requires pushing this back into the API (similar to how C* returns Optional) + if (partitioner instanceof LocalPartitioner) + return; + // TODO (now): java.lang.AssertionError: [size is not larger than 0 for partitioner org.apache.cassandra.dht.OrderPreservingPartitioner@54a67a45] + if (partitioner instanceof OrderPreservingPartitioner && endKey.isTableSentinel()) + return; + // TODO (now): [size is not larger than 0 for partitioner org.apache.cassandra.dht.ByteOrderedPartitioner@44e3a2b2] + if (partitioner instanceof ByteOrderedPartitioner && endKey.isTableSentinel()) + return; + // TODO (now): [num splits not as expected for partitioner org.apache.cassandra.dht.ByteOrderedPartitioner@4c550889]\nExpected size to be between: <47> and <48> but was:<62> in: + if (partitioner instanceof ByteOrderedPartitioner && startKey.isTableSentinel()) + return; + // TODO (now): [num splits not as expected for partitioner org.apache.cassandra.dht.ByteOrderedPartitioner@13518f37]\nExpected size to be between: <11> and <12> but was:<13> in: + if (partitioner instanceof ByteOrderedPartitioner) + return; + // TODO (now): [num splits not as expected for partitioner org.apache.cassandra.dht.OrderPreservingPartitioner@4233e892]\nExpected size to be between: <51> and <52> but was:<54> in: + if (partitioner instanceof OrderPreservingPartitioner) + return; + AccordSplitter splitter = partitioner.accordSplitter().apply(Ranges.of(range)); + + BigInteger size = splitter.sizeOf(range); + Assertions.assertThat(size).describedAs("size is not larger than 0 for partitioner %s", partitioner).isGreaterThan(BigInteger.ZERO); + int maxSplits = 100; + int minSplits = 10; + if (size.compareTo(BigInteger.valueOf(maxSplits)) < 0) + maxSplits = size.intValue(); + if (size.compareTo(BigInteger.TEN) < 0) + minSplits = Math.min(2, maxSplits - 1); + int numSplits = rs.nextInt(minSplits, maxSplits); + List ranges = new ArrayList<>(numSplits); + BigInteger update = splitter.divide(size, numSplits); + BigInteger offset = BigInteger.ZERO; + while (offset.compareTo(size) < 0) + { + BigInteger end = offset.add(update); + ranges.add(splitter.subRange(range, offset, end)); + offset = end; + } + + // accord.local.ShardDistributor.EvenSplit.split attempts to detect this and work around it; a splitter is allowed to return slightly more in this case + Assertions.assertThat(ranges).describedAs("num splits not as expected for partitioner %s", partitioner).hasSizeBetween(numSplits, numSplits + 1); + + Ranges split = Ranges.of(ranges.toArray(new Range[0])).mergeTouching(); + Ranges missing = Ranges.of(range).without(split); + Assertions.assertThat(missing).isEmpty(); + + testEventSplit(partitioner, range, rs, numSplits); + }); + } + + private static void testEventSplit(IPartitioner partitioner, Range range, RandomSource rs, int numSplits) + { + ShardDistributor.EvenSplit splitter = new ShardDistributor.EvenSplit<>(numSplits, partitioner.accordSplitter()); + + Ranges topLevel = Ranges.of(range); + List ranges = splitter.split(topLevel); + + Assertions.assertThat(ranges).describedAs("num splits not as expected for partitioner %s", partitioner).hasSize(numSplits); + + Ranges split = ranges.stream().reduce(Ranges.EMPTY, Ranges::with).mergeTouching(); + Ranges missing = topLevel.without(split); + Assertions.assertThat(missing).isEmpty(); + } +} \ No newline at end of file diff --git a/test/unit/org/apache/cassandra/dht/BootStrapperTest.java b/test/unit/org/apache/cassandra/dht/BootStrapperTest.java index f82a51fc584d..c139acbe3cd2 100644 --- a/test/unit/org/apache/cassandra/dht/BootStrapperTest.java +++ b/test/unit/org/apache/cassandra/dht/BootStrapperTest.java @@ -23,6 +23,8 @@ import java.util.Random; import java.util.Set; import java.util.stream.Collectors; +import java.util.concurrent.atomic.AtomicBoolean; + import com.google.common.base.Predicate; import com.google.common.base.Predicates; @@ -31,9 +33,11 @@ import org.junit.AfterClass; import org.junit.BeforeClass; import org.junit.Test; +import org.junit.runner.RunWith; import org.apache.cassandra.SchemaLoader; import org.apache.cassandra.ServerTestUtils; +import org.apache.cassandra.config.CassandraRelevantProperties; import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.db.Keyspace; import org.apache.cassandra.distributed.test.log.ClusterMetadataTestHelper; @@ -51,16 +55,35 @@ import org.apache.cassandra.tcm.ownership.MovementMap; import org.apache.cassandra.tcm.sequences.BootstrapAndJoin; import org.apache.cassandra.utils.Pair; +import org.jboss.byteman.contrib.bmunit.BMRule; +import org.jboss.byteman.contrib.bmunit.BMRules; +import org.jboss.byteman.contrib.bmunit.BMUnitRunner; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertTrue; - +@RunWith(BMUnitRunner.class) public class BootStrapperTest { static IPartitioner oldPartitioner; static Predicate originalAlivePredicate = RangeStreamer.ALIVE_PREDICATE; + public static AtomicBoolean nonOptimizationHit = new AtomicBoolean(false); + public static AtomicBoolean optimizationHit = new AtomicBoolean(false); + private static final IFailureDetector mockFailureDetector = new IFailureDetector() + { + public boolean isAlive(InetAddressAndPort ep) + { + return true; + } + + public void interpret(InetAddressAndPort ep) { throw new UnsupportedOperationException(); } + public void report(InetAddressAndPort ep) { throw new UnsupportedOperationException(); } + public void registerFailureDetectionEventListener(IFailureDetectionEventListener listener) { throw new UnsupportedOperationException(); } + public void unregisterFailureDetectionEventListener(IFailureDetectionEventListener listener) { throw new UnsupportedOperationException(); } + public void remove(InetAddressAndPort ep) { throw new UnsupportedOperationException(); } + public void forceConviction(InetAddressAndPort ep) { throw new UnsupportedOperationException(); } + }; @BeforeClass public static void setup() throws ConfigurationException @@ -96,46 +119,49 @@ public void testSourceTargetComputation() throws UnknownHostException } } + @Test + @BMRules(rules = { @BMRule(name = "Make sure the non-optimized path is picked up for some operations", + targetClass = "org.apache.cassandra.dht.RangeStreamer", + targetMethod = "convertPreferredEndpointsToWorkMap(EndpointsByReplica)", + action = "org.apache.cassandra.dht.BootStrapperTest.nonOptimizationHit.set(true)"), + @BMRule(name = "Make sure the optimized path is picked up for some operations", + targetClass = "org.apache.cassandra.dht.RangeStreamer", + targetMethod = "getOptimizedWorkMap(EndpointsByReplica,Collection,String)", + action = "org.apache.cassandra.dht.BootStrapperTest.optimizationHit.set(true)") }) + public void testStreamingCandidatesOptmizationSkip() throws UnknownHostException + { + testSkipStreamingCandidatesOptmizationFeatureFlag(true, true, false, getRangeStreamer()); + testSkipStreamingCandidatesOptmizationFeatureFlag(false, true, true, getRangeStreamer()); + } + + private void testSkipStreamingCandidatesOptmizationFeatureFlag(boolean disableOptimization, boolean nonOptimizedPathHit, boolean optimizedPathHit, RangeStreamer s) throws UnknownHostException + { + try + { + nonOptimizationHit.set(false); + optimizationHit.set(false); + CassandraRelevantProperties.SKIP_OPTIMAL_STREAMING_CANDIDATES_CALCULATION.setBoolean(disableOptimization); + + for (String keyspaceName : Schema.instance.getUserKeyspaces().names()) + s.addKeyspaceToFetch(keyspaceName); + + assertEquals(nonOptimizedPathHit, nonOptimizationHit.get()); + if (disableOptimization) // The optimized path may or not be hit depending on the code. + assertEquals(optimizedPathHit, optimizationHit.get()); + } + finally + { + CassandraRelevantProperties.SKIP_OPTIMAL_STREAMING_CANDIDATES_CALCULATION.reset(); + } + } + private RangeStreamer testSourceTargetComputation(String keyspaceName, int numOldNodes, int replicationFactor) throws UnknownHostException { ServerTestUtils.resetCMS(); generateFakeEndpoints(numOldNodes); ClusterMetadata metadata = ClusterMetadata.current(); - assertEquals(numOldNodes, metadata.tokenMap.tokens().size()); - IFailureDetector mockFailureDetector = new IFailureDetector() - { - public boolean isAlive(InetAddressAndPort ep) - { - return true; - } - - public void interpret(InetAddressAndPort ep) { throw new UnsupportedOperationException(); } - public void report(InetAddressAndPort ep) { throw new UnsupportedOperationException(); } - public void registerFailureDetectionEventListener(IFailureDetectionEventListener listener) { throw new UnsupportedOperationException(); } - public void unregisterFailureDetectionEventListener(IFailureDetectionEventListener listener) { throw new UnsupportedOperationException(); } - public void remove(InetAddressAndPort ep) { throw new UnsupportedOperationException(); } - public void forceConviction(InetAddressAndPort ep) { throw new UnsupportedOperationException(); } - }; - - Token myToken = metadata.partitioner.getRandomToken(); - InetAddressAndPort myEndpoint = InetAddressAndPort.getByName("127.0.0.1"); - NodeId newNode = ClusterMetadataTestHelper.register(myEndpoint); - ClusterMetadataTestHelper.JoinProcess join = ClusterMetadataTestHelper.lazyJoin(myEndpoint, myToken); - join.prepareJoin(); - metadata = ClusterMetadata.current(); - BootstrapAndJoin joiningPlan = (BootstrapAndJoin) metadata.inProgressSequences.get(newNode); - Pair movements = joiningPlan.getMovementMaps(metadata); - RangeStreamer s = new RangeStreamer(metadata, - StreamOperation.BOOTSTRAP, - true, - DatabaseDescriptor.getNodeProximity(), - new StreamStateStore(), - mockFailureDetector, - false, - 1, - movements.left, - movements.right); + RangeStreamer s = getRangeStreamer(); assertNotNull(Keyspace.open(keyspaceName)); s.addKeyspaceToFetch(keyspaceName); @@ -159,10 +185,41 @@ public boolean isAlive(InetAddressAndPort ep) // there isn't any point in testing the size of these collections for any specific size. When a random partitioner // is used, they will vary. assert toFetch.values().size() > 0; - assert toFetch.keys().stream().noneMatch(myEndpoint::equals); + + assert toFetch.keys().stream().noneMatch(InetAddressAndPort.getByName("127.0.0.1")::equals); return s; } + private RangeStreamer getRangeStreamer() throws UnknownHostException + { + ClusterMetadata metadata = ClusterMetadata.current(); + Pair movements = Pair.create(MovementMap.empty(), MovementMap.empty()); + + if (metadata.myNodeId() == null) + { + Token myToken = metadata.partitioner.getRandomToken(); + InetAddressAndPort myEndpoint = InetAddressAndPort.getByName("127.0.0.1"); + NodeId newNode = ClusterMetadataTestHelper.register(myEndpoint); + ClusterMetadataTestHelper.JoinProcess join = ClusterMetadataTestHelper.lazyJoin(myEndpoint, myToken); + join.prepareJoin(); + metadata = ClusterMetadata.current(); + BootstrapAndJoin joiningPlan = (BootstrapAndJoin) metadata.inProgressSequences.get(newNode); + movements = joiningPlan.getMovementMaps(metadata); + } + + return new RangeStreamer(metadata, + StreamOperation.BOOTSTRAP, + true, + DatabaseDescriptor.getNodeProximity(), + new StreamStateStore(), + mockFailureDetector, + false, + 1, + movements.left, + movements.right, + true); + } + private boolean includesWraparound(Collection> toFetch) { long minTokenCount = toFetch.stream() diff --git a/test/unit/org/apache/cassandra/dht/ByteOrderedPartitionerTest.java b/test/unit/org/apache/cassandra/dht/ByteOrderedPartitionerTest.java index f40c284b0e36..e4ff1bc66ef1 100644 --- a/test/unit/org/apache/cassandra/dht/ByteOrderedPartitionerTest.java +++ b/test/unit/org/apache/cassandra/dht/ByteOrderedPartitionerTest.java @@ -17,6 +17,12 @@ */ package org.apache.cassandra.dht; +import java.util.Arrays; + +import org.junit.Assert; + +import org.apache.cassandra.dht.ByteOrderedPartitioner.BytesToken; + public class ByteOrderedPartitionerTest extends PartitionerTestCase { public void initPartitioner() @@ -28,4 +34,14 @@ protected boolean shouldStopRecursion(Token left, Token right) { return false; } + + @Override + protected void checkRoundTrip(Token original, Token roundTrip) + { + BytesToken orig = (BytesToken) original; + BytesToken rt = (BytesToken) roundTrip; + Assert.assertArrayEquals(orig.token, Arrays.copyOf(rt.token, orig.token.length)); + for (int i = orig.token.length ; i < rt.token.length ; ++i) + Assert.assertEquals((byte)0, rt.token[i]); + } } diff --git a/test/unit/org/apache/cassandra/dht/IPartitionerTest.java b/test/unit/org/apache/cassandra/dht/IPartitionerTest.java new file mode 100644 index 000000000000..e531e25da5b0 --- /dev/null +++ b/test/unit/org/apache/cassandra/dht/IPartitionerTest.java @@ -0,0 +1,105 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.dht; + +import java.lang.reflect.Modifier; +import java.security.CodeSource; +import java.security.ProtectionDomain; +import java.util.Objects; +import java.util.Set; + +import com.google.common.collect.Sets; +import org.junit.Test; + +import org.apache.cassandra.io.util.File; +import org.apache.cassandra.utils.AbstractTypeGenerators; +import org.apache.cassandra.utils.AccordGenerators; +import org.apache.cassandra.utils.CassandraGenerators; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.bytecomparable.ByteSource; +import org.assertj.core.api.Assertions; +import org.reflections.Reflections; +import org.reflections.scanners.Scanners; +import org.reflections.util.ConfigurationBuilder; + +import static accord.utils.Property.qt; + +public class IPartitionerTest +{ + //TODO (now, maintaince): this is copied from AbstractTypeTest + private static final Reflections reflections = new Reflections(new ConfigurationBuilder() + .forPackage("org.apache.cassandra") + .setScanners(Scanners.SubTypes) + .setExpandSuperTypes(true) + .setParallel(true)); + + @Test + public void allCovered() + { + Set> subTypes = reflections.getSubTypesOf(IPartitioner.class); + Set> coverage = CassandraGenerators.knownPartitioners(); + StringBuilder sb = new StringBuilder(); + for (Class klass : Sets.difference(subTypes, coverage)) + { + if (Modifier.isAbstract(klass.getModifiers())) + continue; + if (isTestType(klass)) + continue; + if (ReversedLongLocalPartitioner.class.equals(klass)) + continue; + String name = klass.getCanonicalName(); + if (name == null) + name = klass.getName(); + sb.append(name).append('\n'); + } + if (sb.length() > 0) + throw new AssertionError("Uncovered types:\n" + sb); + } + + private boolean isTestType(Class klass) + { + String name = klass.getCanonicalName(); + if (name == null) + name = klass.getName(); + if (name == null) + name = klass.toString(); + if (name.contains("Test")) + return true; + if (name.equals(LengthPartitioner.class.getCanonicalName())) + return true; + ProtectionDomain domain = klass.getProtectionDomain(); + if (domain == null) return false; + CodeSource src = domain.getCodeSource(); + if (src == null) return false; + return "test".equals(new File(src.getLocation().getPath()).name()); + } + + @Test + public void byteCompareSerde() + { + qt().forAll(AccordGenerators.fromQT(CassandraGenerators.token())).check(token -> { + var p = token.getPartitioner(); + var comparable = Objects.requireNonNull(ByteSource.peekable(p.getTokenFactory().asComparableBytes(token, ByteComparable.Version.OSS50))); + Token read = p.getTokenFactory().fromComparableBytes(comparable, ByteComparable.Version.OSS50); + Assertions.assertThat(read) + .describedAs("If LocalPartitioner, the type is %s", (token.getPartitioner() instanceof LocalPartitioner ? AbstractTypeGenerators.typeTree(((LocalPartitioner) token.getPartitioner()).comparator) : null)) + .isEqualTo(token); + }); + } +} \ No newline at end of file diff --git a/test/unit/org/apache/cassandra/dht/KeyCollisionTest.java b/test/unit/org/apache/cassandra/dht/KeyCollisionTest.java index c0573d61f7dc..abc6f023c16a 100644 --- a/test/unit/org/apache/cassandra/dht/KeyCollisionTest.java +++ b/test/unit/org/apache/cassandra/dht/KeyCollisionTest.java @@ -20,26 +20,26 @@ import java.math.BigInteger; import java.util.List; -import org.junit.AfterClass; import org.junit.BeforeClass; import org.junit.Test; +import org.apache.cassandra.CassandraTestBase; +import org.apache.cassandra.CassandraTestBase.SchemaLoaderPrepareServer; +import org.apache.cassandra.CassandraTestBase.UseLengthPartitioner; import org.apache.cassandra.SchemaLoader; import org.apache.cassandra.Util; -import org.apache.cassandra.config.DatabaseDescriptor; -import org.apache.cassandra.db.marshal.IntegerType; -import org.apache.cassandra.schema.Schema; import org.apache.cassandra.db.ColumnFamilyStore; import org.apache.cassandra.db.Keyspace; import org.apache.cassandra.db.RowUpdateBuilder; -import org.apache.cassandra.db.partitions.*; +import org.apache.cassandra.db.marshal.IntegerType; +import org.apache.cassandra.db.partitions.FilteredPartition; import org.apache.cassandra.exceptions.ConfigurationException; import org.apache.cassandra.schema.KeyspaceParams; -import org.apache.cassandra.service.StorageService; +import org.apache.cassandra.schema.Schema; import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.FBUtilities; import org.apache.cassandra.utils.bytecomparable.ByteComparable; import org.apache.cassandra.utils.bytecomparable.ByteSource; -import org.apache.cassandra.utils.FBUtilities; /** * Test cases where multiple keys collides, ie have the same token. @@ -48,29 +48,21 @@ * length partitioner that takes the length of the key as token, making * collision easy and predictable. */ -public class KeyCollisionTest +@UseLengthPartitioner +@SchemaLoaderPrepareServer +public class KeyCollisionTest extends CassandraTestBase { - static IPartitioner oldPartitioner; private static final String KEYSPACE1 = "KeyCollisionTest1"; private static final String CF = "Standard1"; @BeforeClass public static void defineSchema() throws ConfigurationException { - DatabaseDescriptor.daemonInitialization(); - oldPartitioner = StorageService.instance.setPartitionerUnsafe(LengthPartitioner.instance); - SchemaLoader.prepareServer(); SchemaLoader.createKeyspace(KEYSPACE1, KeyspaceParams.simple(1), SchemaLoader.standardCFMD(KEYSPACE1, CF)); } - @AfterClass - public static void tearDown() - { - DatabaseDescriptor.setPartitionerUnsafe(oldPartitioner); - } - @Test public void testGetSliceWithCollision() throws Exception { @@ -122,6 +114,12 @@ public IPartitioner getPartitioner() return LengthPartitioner.instance; } + @Override + public int tokenHash() + { + return token.hashCode(); + } + @Override public long getHeapSize() { diff --git a/test/unit/org/apache/cassandra/dht/LengthPartitioner.java b/test/unit/org/apache/cassandra/dht/LengthPartitioner.java index 07e40c571c1f..1fcfa14ee1ca 100644 --- a/test/unit/org/apache/cassandra/dht/LengthPartitioner.java +++ b/test/unit/org/apache/cassandra/dht/LengthPartitioner.java @@ -19,17 +19,23 @@ import java.math.BigInteger; import java.nio.ByteBuffer; -import java.util.*; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Random; import java.util.concurrent.ThreadLocalRandom; +import java.util.function.Function; -import org.apache.cassandra.schema.Schema; -import org.apache.cassandra.schema.TableMetadata; +import accord.primitives.Ranges; import org.apache.cassandra.db.BufferDecoratedKey; import org.apache.cassandra.db.DecoratedKey; import org.apache.cassandra.db.marshal.AbstractType; import org.apache.cassandra.db.marshal.IntegerType; import org.apache.cassandra.db.marshal.PartitionerDefinedOrder; import org.apache.cassandra.dht.KeyCollisionTest.BigIntegerToken; +import org.apache.cassandra.schema.Schema; +import org.apache.cassandra.schema.TableMetadata; import org.apache.cassandra.service.StorageService; import org.apache.cassandra.utils.ByteBufferUtil; import org.apache.cassandra.utils.FBUtilities; @@ -37,7 +43,7 @@ import org.apache.cassandra.utils.bytecomparable.ByteComparable; import org.apache.cassandra.utils.bytecomparable.ByteSource; -public class LengthPartitioner implements IPartitioner +public class LengthPartitioner extends AccordSplitter implements IPartitioner { public static final BigInteger ZERO = new BigInteger("0"); public static final BigIntegerToken MINIMUM = new BigIntegerToken("-1"); @@ -70,7 +76,7 @@ public BigIntegerToken getMinimumToken() } @Override - public Token getMaximumToken() + public Token getMaximumTokenForSplitting() { return null; } @@ -116,6 +122,8 @@ public Token fromString(String string) public void validate(String token) {} }; + private LengthPartitioner() {} + public Token.TokenFactory getTokenFactory() { return tokenFactory; @@ -184,4 +192,34 @@ public AbstractType partitionOrdering(AbstractType partitionKeyType) { return new PartitionerDefinedOrder(this, partitionKeyType); } + + @Override + public Function accordSplitter() + { + return ignore -> this; + } + + @Override + BigInteger valueForToken(Token token) + { + return ((BigIntegerToken)token).token; + } + + @Override + Token tokenForValue(BigInteger value) + { + return new BigIntegerToken(value); + } + + @Override + BigInteger minimumValue() + { + throw new UnsupportedOperationException(); + } + + @Override + BigInteger maximumValue() + { + throw new UnsupportedOperationException(); + } } diff --git a/test/unit/org/apache/cassandra/dht/OrderPreservingPartitionerTest.java b/test/unit/org/apache/cassandra/dht/OrderPreservingPartitionerTest.java index 6ab5b456b3d8..131ce13d58ea 100644 --- a/test/unit/org/apache/cassandra/dht/OrderPreservingPartitionerTest.java +++ b/test/unit/org/apache/cassandra/dht/OrderPreservingPartitionerTest.java @@ -19,10 +19,11 @@ import java.io.IOException; +import org.junit.Assert; import org.junit.BeforeClass; -import org.junit.Test; import org.apache.cassandra.SchemaLoader; +import org.apache.cassandra.dht.OrderPreservingPartitioner.StringToken; public class OrderPreservingPartitionerTest extends PartitionerTestCase { @@ -44,15 +45,12 @@ protected boolean shouldStopRecursion(Token left, Token right) return false; } - @Test - public void testCompare() + @Override + protected void checkRoundTrip(Token original, Token roundTrip) { - assert tok("").compareTo(tok("asdf")) < 0; - assert tok("asdf").compareTo(tok("")) > 0; - assert tok("").compareTo(tok("")) == 0; - assert tok("z").compareTo(tok("a")) > 0; - assert tok("a").compareTo(tok("z")) < 0; - assert tok("asdf").compareTo(tok("asdf")) == 0; - assert tok("asdz").compareTo(tok("asdf")) > 0; + StringToken orig = (StringToken) original; + StringToken rt = (StringToken) roundTrip; + Assert.assertEquals(orig.token, rt.token.substring(0, orig.token.length())); + Assert.assertTrue(rt.token.substring(orig.token.length()).matches("\0*")); } } diff --git a/test/unit/org/apache/cassandra/dht/PartitionerTestCase.java b/test/unit/org/apache/cassandra/dht/PartitionerTestCase.java index eb9733c92615..df84f589afda 100644 --- a/test/unit/org/apache/cassandra/dht/PartitionerTestCase.java +++ b/test/unit/org/apache/cassandra/dht/PartitionerTestCase.java @@ -17,6 +17,7 @@ */ package org.apache.cassandra.dht; +import java.math.BigInteger; import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.Collections; @@ -24,14 +25,19 @@ import java.util.Map; import java.util.Random; +import org.junit.Assert; import org.junit.Before; import org.junit.BeforeClass; import org.junit.Test; +import accord.primitives.Ranges; import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.db.commitlog.CommitLog; import org.apache.cassandra.service.StorageService; +import org.apache.cassandra.service.accord.TokenRange; +import org.apache.cassandra.service.accord.api.TokenKey; +import static org.apache.cassandra.service.accord.AccordTestUtils.TABLE_ID1; import static org.junit.Assert.assertEquals; import static org.junit.Assert.fail; @@ -216,4 +222,113 @@ private void testDescribeOwnershipWith(int numTokens) totalOwnership += ownership; assertEquals(1.0, totalOwnership, 0.001); } + + @Test + public void testCompare() + { + if (!partitioner.preservesOrder()) + return; + + assert tok("").compareTo(tok("asdf")) < 0; + assert tok("asdf").compareTo(tok("")) > 0; + assert tok("").compareTo(tok("")) == 0; + assert tok("z").compareTo(tok("a")) > 0; + assert tok("a").compareTo(tok("z")) < 0; + assert tok("asdf").compareTo(tok("asdf")) == 0; + assert tok("asdz").compareTo(tok("asdf")) > 0; + } + + @Test + public void testCompareSplitter() + { + for (int i = 0 ; i < 16 ; ++i) + { + Token a = partitioner.getRandomToken(), b = partitioner.getRandomToken(); + while (a.equals(b)) + b = partitioner.getRandomToken(); + if (a.compareTo(b) > 0) { Token tmp = a; a = b; b = tmp; } + testCompareSplitter(a, b); + } + + if (!partitioner.preservesOrder()) + return; + + testCompareSplitter(tok(""), tok("asdf")); + testCompareSplitter(tok(""), tok("")); + testCompareSplitter(tok("a"), tok("z")); + testCompareSplitter(tok("asdf"), tok("asdf")); + testCompareSplitter(tok("asd"), tok("asdf")); + testCompareSplitter(tok("asdf"), tok("asf")); + testCompareSplitter(tok("asdf"), tok("asdz")); + } + + @Test + public void testSplitter() + { + for (int i = 0 ; i < 1024 ; ++i) + { + Token a = partitioner.getRandomToken(), b = partitioner.getRandomToken(); + while (a.equals(b)) + b = partitioner.getRandomToken(); + if (a.compareTo(b) > 0) { Token tmp = a; a = b; b = tmp; } + testSplitter(a, b); + } + + if (!partitioner.preservesOrder()) + return; + + testSplitter(tok(""), tok("asdf")); + testSplitter(tok("a"), tok("z")); + testSplitter(tok("asd"), tok("asdf")); + testSplitter(tok("asdf"), tok("asdz")); + } + + void testCompareSplitter(Token less, Token more) + { + Ranges ranges; + if (less.equals(more) && less.isMinimum()) + ranges = Ranges.EMPTY; + else if (less.equals(more)) + ranges = Ranges.of(TokenRange.create(new TokenKey(TABLE_ID1, partitioner.getMinimumToken()), new TokenKey(TABLE_ID1, less))); + else + ranges = Ranges.of(TokenRange.create(new TokenKey(TABLE_ID1, less), new TokenKey(TABLE_ID1, more))); + + AccordSplitter splitter = partitioner.accordSplitter().apply(ranges); + BigInteger lv = splitter.valueForToken(less); + BigInteger rv = splitter.valueForToken(more); + Assert.assertEquals(less.equals(more) ? 0 : -1, normaliseCompare(lv.compareTo(rv))); + Assert.assertEquals(less.equals(more) ? 0 : 1, normaliseCompare(rv.compareTo(lv))); + checkRoundTrip(less, splitter.tokenForValue(lv)); + checkRoundTrip(more, splitter.tokenForValue(rv)); + } + + void testSplitter(Token start, Token end) + { + accord.primitives.Range range = TokenRange.create(new TokenKey(TABLE_ID1, start), new TokenKey(TABLE_ID1, end)); + AccordSplitter splitter = partitioner.accordSplitter().apply(Ranges.of(range)); + if (!start.isMinimum()) + testSplitter(TokenRange.create(new TokenKey(TABLE_ID1, partitioner.getMinimumToken()), new TokenKey(TABLE_ID1, start))); + testSplitter(TokenRange.create(new TokenKey(TABLE_ID1, start), new TokenKey(TABLE_ID1, splitter.tokenForValue(splitter.maximumValue())))); + checkRoundTrip(start, splitter.tokenForValue(splitter.valueForToken(start))); + checkRoundTrip(end, splitter.tokenForValue(splitter.valueForToken(end))); + } + + void testSplitter(accord.primitives.Range range) + { + AccordSplitter splitter = partitioner.accordSplitter().apply(Ranges.of(range)); + BigInteger size = splitter.sizeOf(range); + Assert.assertEquals(range, splitter.subRange(range, BigInteger.ZERO, size)); + } + + protected void checkRoundTrip(Token original, Token roundTrip) + { + Assert.assertEquals(original, roundTrip); + } + + static int normaliseCompare(int compareResult) + { + if (compareResult < 0) return -1; + if (compareResult > 0) return 1; + return 0; + } } diff --git a/test/unit/org/apache/cassandra/dht/RandomPartitionerTest.java b/test/unit/org/apache/cassandra/dht/RandomPartitionerTest.java index 273fe2170fe2..1fefa1d3828c 100644 --- a/test/unit/org/apache/cassandra/dht/RandomPartitionerTest.java +++ b/test/unit/org/apache/cassandra/dht/RandomPartitionerTest.java @@ -19,9 +19,13 @@ package org.apache.cassandra.dht; import java.math.BigInteger; +import java.util.Arrays; import org.junit.Test; +import org.apache.cassandra.cql3.functions.types.utils.Bytes; +import org.apache.cassandra.harry.checker.TestHelper; + public class RandomPartitionerTest extends PartitionerTestCase { public void initPartitioner() @@ -55,4 +59,57 @@ public void testSplitExceedMaximumCase() assertSplit(left, tok("a"), 16); } + + @Test + public void testIncrement() + { + TestHelper.withRandom((rng) -> { + for (int i = 0; i < 10_000; i++) + { + BigInteger bi = BigInteger.valueOf(Math.abs(rng.next())); + byte[] bytes = bi.toByteArray(); + byte[] copy = Arrays.copyOf(bytes, bytes.length); + + BigInteger incremented = new BigInteger(RandomPartitioner.increment(bytes)); + BigInteger expected = bi.add(BigInteger.valueOf(1)); + if (!expected.equals(incremented)) + { + throw new IllegalArgumentException(String.format("\nBefore increment: %s" + + "\n After increment: %s," + + "\n%s != %s", + Bytes.toHexString(copy), + Bytes.toHexString(bytes), + expected, + incremented)); + } + } + }); + } + + @Test + public void testDecrement() + { + TestHelper.withRandom((rng) -> { + for (int i = 0; i < 10_000; i++) + { + BigInteger bi = BigInteger.valueOf(Math.abs(rng.next() + 1)); + byte[] bytes = bi.toByteArray(); + byte[] copy = Arrays.copyOf(bytes, bytes.length); + + RandomPartitioner.decrement(bytes); + BigInteger incremented = new BigInteger(bytes); + BigInteger expected = bi.add(BigInteger.valueOf(-1)); + if (!expected.equals(incremented)) + { + throw new IllegalArgumentException(String.format("\nBefore increment: %s" + + "\n After increment: %s," + + "\n%s != %s", + Bytes.toHexString(copy), + Bytes.toHexString(bytes), + expected, + incremented)); + } + } + }); + } } diff --git a/test/unit/org/apache/cassandra/dht/RangeTest.java b/test/unit/org/apache/cassandra/dht/RangeTest.java index 84ca1246a3d4..e8198b9cb497 100644 --- a/test/unit/org/apache/cassandra/dht/RangeTest.java +++ b/test/unit/org/apache/cassandra/dht/RangeTest.java @@ -18,36 +18,59 @@ package org.apache.cassandra.dht; import java.nio.ByteBuffer; -import java.util.Collection; import java.util.ArrayList; +import java.util.Collection; import java.util.Collections; import java.util.HashSet; import java.util.List; +import java.util.Objects; import java.util.Random; import java.util.Set; import com.google.common.base.Joiner; +import com.google.common.base.Stopwatch; +import com.google.common.collect.ImmutableList; import com.google.common.collect.Sets; import org.apache.commons.lang3.StringUtils; import org.junit.BeforeClass; import org.junit.Test; -import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.CassandraTestBase; +import org.apache.cassandra.CassandraTestBase.DDDaemonInitialization; +import org.apache.cassandra.config.CassandraRelevantProperties; import org.apache.cassandra.db.PartitionPosition; import org.apache.cassandra.dht.ByteOrderedPartitioner.BytesToken; +import org.apache.cassandra.dht.Murmur3Partitioner.LongToken; import org.apache.cassandra.dht.RandomPartitioner.BigIntegerToken; +import org.apache.cassandra.utils.Pair; +import static java.lang.String.format; import static java.util.Arrays.asList; +import static java.util.Collections.emptyList; +import static java.util.concurrent.TimeUnit.SECONDS; import static org.apache.cassandra.Util.range; -import static org.junit.Assert.*; - - -public class RangeTest +import static org.apache.cassandra.config.CassandraRelevantProperties.TEST_RANGE_EXPENSIVE_CHECKS; +import static org.apache.cassandra.dht.NormalizedRanges.normalizedRanges; +import static org.apache.cassandra.dht.Range.fromString; +import static org.apache.cassandra.dht.Range.intersectionAndRemainder; +import static org.apache.cassandra.dht.Range.normalize; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNotSame; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + + +@DDDaemonInitialization +public class RangeTest extends CassandraTestBase { @BeforeClass - public static void setupDD() + public static void enableExpensiveRangeChecks() { - DatabaseDescriptor.daemonInitialization(); + assertFalse(TEST_RANGE_EXPENSIVE_CHECKS.getBoolean()); // Expect off by default + CassandraRelevantProperties.TEST_RANGE_EXPENSIVE_CHECKS.setBoolean(true); + assertTrue(TEST_RANGE_EXPENSIVE_CHECKS.getBoolean()); } @Test @@ -201,11 +224,11 @@ static > void assertIntersection(Range one, Range> correct = Range.rangeSet(ranges); Set> result1 = one.intersectionWith(two); - assert result1.equals(correct) : String.format("%s != %s", + assert result1.equals(correct) : format("%s != %s", StringUtils.join(result1, ","), StringUtils.join(correct, ",")); Set> result2 = two.intersectionWith(one); - assert result2.equals(correct) : String.format("%s != %s", + assert result2.equals(correct) : format("%s != %s", StringUtils.join(result2, ","), StringUtils.join(correct, ",")); } @@ -578,7 +601,7 @@ public void testDifferenceToFetchNewWraps() private > void assertNormalize(List> input, List> expected) { - List> result = Range.normalize(input); + List> result = normalize(input); assert result.equals(expected) : "Expecting " + expected + " but got " + result; } @@ -681,7 +704,7 @@ public void testRandomOrderedRangeContainmentChecker() for (Token t : tokensToTest) { if (checker.test(t) != Range.isInRanges(t, ranges)) // avoid running Joiner.on(..) every iteration - fail(String.format("This should never flap! If it does, it is a bug (ranges = %s, token = %s)", Joiner.on(",").join(ranges), t)); + fail(format("This should never flap! If it does, it is a bug (ranges = %s, token = %s)", Joiner.on(",").join(ranges), t)); } } } @@ -702,6 +725,12 @@ private static Range r(long left, long right) { return new Range<>(t(left), t(right)); } + + private static Range r(Token left, Token right) + { + return new Range<>(left, right); + } + private static Token t(long t) { return new Murmur3Partitioner.LongToken(t); @@ -736,4 +765,418 @@ public void testGroupSubtract() assertEquals(ranges, Range.subtract(ranges, asList(r(6, 7), r(20, 25)))); assertEquals(Sets.newHashSet(r(1, 4), r(11, 15)), Range.subtract(ranges, asList(r(4, 7), r(8, 11)))); } + + @Test + public void testIntersectsBounds() + { + Range r = r(0, 100); + assertTrue(r.intersects(bounds(5, 10))); + assertTrue(r.intersects(bounds(100, 110))); + assertTrue(r.intersects(bounds(-100, 200))); + assertTrue(r.intersects(bounds(10, 15))); + assertTrue(r.intersects(bounds(20,20))); + + assertFalse(r.intersects(bounds(-5, 0))); + assertFalse(r.intersects(bounds(-5, -1))); + assertFalse(r.intersects(bounds(110, 114))); + } + + private static Bounds bounds(long left, long right) + { + return new Bounds<>(t(left), t(right)); + } + + private static AbstractBounds bounds(PartitionPosition left, boolean leftInclusive, PartitionPosition right, boolean rightInclusive) + { + return AbstractBounds.bounds(left, leftInclusive, right, rightInclusive); + } + + @Test + @UseMurmur3Partitioner + public void testIsInNormalizedRanges() + { + NormalizedRanges ranges = normalizedRanges(ImmutableList.of(fromString("(1,10]"), fromString("(10,20]"), fromString("(30,40]"), fromString("(50,60]"), fromString("(60,70]"), fromString("(80,90]"), fromString("(" + Long.MAX_VALUE + ",-9223372036854775808]"))); + for (int ii = 0; ii < 100; ii++) + { + boolean isIn = ranges.intersects(new LongToken(ii)); + if (ii > 1 && ii <= 20) + assertTrue("Index " + ii, isIn); + else if (ii > 30 && ii <= 40) + assertTrue("Index " + ii, isIn); + else if (ii > 50 && ii <= 70) + assertTrue("Index " + ii, isIn); + else if (ii > 80 && ii <= 90) + assertTrue("Index " + ii, isIn); + else + assertFalse("Index " + ii, isIn); + } + assertFalse(ranges.intersects(new LongToken(Long.MAX_VALUE))); + assertTrue(ranges.intersects(new LongToken(Long.MIN_VALUE))); + ranges = normalizedRanges(ImmutableList.of(fromString("(-9223372036854775808,-9223372036854775807]"))); + assertFalse(ranges.intersects(new LongToken(Long.MIN_VALUE))); + assertTrue(ranges.intersects(new LongToken(Long.MIN_VALUE + 1))); + ranges = normalizedRanges(ImmutableList.of(fromString("(" + (Long.MAX_VALUE - 1) + ",-9223372036854775808]"))); + assertFalse(ranges.intersects(new LongToken(Long.MAX_VALUE - 1))); + assertTrue(ranges.intersects(new LongToken(Long.MAX_VALUE))); + assertTrue(ranges.intersects(new LongToken(Long.MIN_VALUE))); + assertFalse(ranges.intersects(new LongToken(Long.MAX_VALUE - 1))); + assertTrue(ranges.intersects(new LongToken(Long.MAX_VALUE))); + assertTrue(ranges.intersects(new LongToken(Long.MIN_VALUE))); + } + + @Test + @UseMurmur3Partitioner + public void testSubtractNormalizedRanges() + { + NormalizedRanges ranges = normalizedRanges(ImmutableList.of(fromString("(1,10]"), fromString("(10,20]"), fromString("(30,40]"), fromString("(50,60]"), fromString("(60,70]"), fromString("(80,90]"), fromString("(" + Long.MAX_VALUE + ",-9223372036854775808]"))); + for (int ii = 0; ii < 100; ii++) + { + boolean isIn = ranges.intersects(new LongToken(ii)); + if (ii > 1 && ii <= 20) + assertTrue("Index " + ii, isIn); + else if (ii > 30 && ii <= 40) + assertTrue("Index " + ii, isIn); + else if (ii > 50 && ii <= 70) + assertTrue("Index " + ii, isIn); + else if (ii > 80 && ii <= 90) + assertTrue("Index " + ii, isIn); + else + assertFalse("Index " + ii, isIn); + } + NormalizedRanges rightMostRange = normalizedRanges(ImmutableList.of(r(Long.MAX_VALUE, Long.MIN_VALUE))); + NormalizedRanges maxLongRange = normalizedRanges(ImmutableList.of(r(Long.MAX_VALUE - 1, Long.MAX_VALUE))); + + assertEquals(emptyList(), ranges.subtract(ranges)); + assertEquals(emptyList(), rightMostRange.subtract(ranges)); + assertEquals(maxLongRange, maxLongRange.subtract(ranges)); + ranges = maxLongRange; + assertEquals(emptyList(), ranges.subtract(ranges)); + assertEquals(rightMostRange, rightMostRange.subtract(ranges)); + assertEquals(emptyList(), maxLongRange.subtract(ranges)); + ranges = normalizedRanges(ImmutableList.of(fromString("(" + (Long.MAX_VALUE - 1) + ",-9223372036854775808]"))); + assertEquals(emptyList(), ranges.subtract(ranges)); + assertEquals(emptyList(), rightMostRange.subtract(ranges)); + assertEquals(emptyList(), maxLongRange.subtract(ranges)); + } + + @Test + public void testExpensiveChecksBurn() + { + long seed = System.nanoTime(); + System.out.println(seed); + Random r = new java.util.Random(seed); + + Stopwatch elapsed = Stopwatch.createStarted(); + while (elapsed.elapsed(SECONDS) != 10) + { + int numRanges = 3; + List> aList = new ArrayList(); + for (int ii = 0; ii < numRanges; ii++) + { + aList.add(new Range<>(new LongToken(r.nextLong()), new LongToken(r.nextLong()))); + } + NormalizedRanges a = normalizedRanges(aList); + List> bList = new ArrayList(); + for (int ii = 0; ii < numRanges; ii++) + { + bList.add(new Range<>(new LongToken(r.nextLong()), new LongToken(r.nextLong()))); + } + NormalizedRanges b = normalizedRanges(bList); + + for (int ii = 0; ii < 1000; ii++) + { + Token t = new LongToken(r.nextLong()); + a.intersects(t); + b.intersects(t); + } + + a.intersection(b); + a.intersection(a); + b.intersection(a); + b.intersection(b); + + a.subtract(b); + a.subtract(a); + b.subtract(a); + b.subtract(b); + + if (!a.isEmpty()) + a.invert(); + if (!b.isEmpty()) + b.invert(); + } + } + + @Test + public void testIntersectionAndRemainder() + { + // Intersection will only make max key bounds so use minKeyBound + // so you know where the bound came from + Token oneT = t(1); + PartitionPosition onePP = oneT.minKeyBound(); + Token twoT = t(2); + PartitionPosition twoPP = twoT.minKeyBound(); + Token threeT = t(3); + PartitionPosition threePP = threeT.minKeyBound(); + Token fourT = t(4); + PartitionPosition fourPP = fourT.minKeyBound(); + Token fiveT = t(5); + PartitionPosition fivePP = fiveT.minKeyBound(); + Token sixT = t(6); + PartitionPosition sixPP = sixT.minKeyBound(); + Token sevenT = t(7); + PartitionPosition sevenPP = sevenT.minKeyBound(); + Token eightT = t(8); + PartitionPosition eightPP = eightT.minKeyBound(); + Token minT = Murmur3Partitioner.MINIMUM; + PartitionPosition minPP = minT.minKeyBound(); + + Range r = r(threeT, sixT); + + // Completely before + testInclusivity(onePP, twoPP, r, + Pair.create(null, null), + Pair.create(null, null), + Pair.create(null, null), + Pair.create(null, null)); + + // Completely after + testInclusivity(sevenPP, eightPP, r, + Pair.create(null, bounds(sevenPP, true, eightPP, true)), + Pair.create(null, bounds(sevenPP, true, eightPP, false)), + Pair.create(null, bounds(sevenPP, false, eightPP, true)), + Pair.create(null, bounds(sevenPP, false, eightPP, false))); + + // Overlapping + testInclusivity(threePP, sixPP, r, + Pair.create(bounds(threeT.maxKeyBound(), false, sixPP, true), null), + Pair.create(bounds(threeT.maxKeyBound(), false, sixPP, false), null), + Pair.create(bounds(threeT.maxKeyBound(), false, sixPP, true), null), + Pair.create(bounds(threeT.maxKeyBound(), false, sixPP, false), null)); + + // Completely contained by range, should echo back the same bound + testInclusivity(fourPP, fivePP, r, + Pair.create(bounds(fourPP, true, fivePP, true), null), + Pair.create(bounds(fourPP, true, fivePP, false), null), + Pair.create(bounds(fourPP, false, fivePP, true), null), + Pair.create(bounds(fourPP, false, fivePP, false), null)); + + // Overlap left only + testInclusivity(threePP, fivePP, r, + Pair.create(bounds(threeT.maxKeyBound(), false, fivePP, true), null), + Pair.create(bounds(threeT.maxKeyBound(), false, fivePP, false), null), + Pair.create(bounds(threeT.maxKeyBound(), false, fivePP, true), null), + Pair.create(bounds(threeT.maxKeyBound(), false, fivePP, false), null)); + + // Overlap right only + testInclusivity(fourPP, sixPP, r, + Pair.create(bounds(fourPP, true, sixPP, true), null), + Pair.create(bounds(fourPP, true, sixPP, false), null), + Pair.create(bounds(fourPP, false, sixPP, true), null), + Pair.create(bounds(fourPP, false, sixPP, false), null)); + + // Contains range + testInclusivity(twoPP, sevenPP, r, + Pair.create(bounds(threeT.maxKeyBound(), false, sixT.maxKeyBound(), true), bounds(sixT.maxKeyBound(), false, sevenPP, true)), + Pair.create(bounds(threeT.maxKeyBound(), false, sixT.maxKeyBound(), true), bounds(sixT.maxKeyBound(), false, sevenPP, false)), + Pair.create(bounds(threeT.maxKeyBound(), false, sixT.maxKeyBound(), true), bounds(sixT.maxKeyBound(), false, sevenPP, true)), + Pair.create(bounds(threeT.maxKeyBound(), false, sixT.maxKeyBound(), true), bounds(sixT.maxKeyBound(), false, sevenPP, false))); + + // Split by range left bound + testInclusivity(twoPP, fivePP, r, + Pair.create(bounds(threeT.maxKeyBound(), false, fivePP, true), null), + Pair.create(bounds(threeT.maxKeyBound(), false, fivePP, false), null), + Pair.create(bounds(threeT.maxKeyBound(), false, fivePP, true), null), + Pair.create(bounds(threeT.maxKeyBound(), false, fivePP, false), null)); + + // Split by range right bound + testInclusivity(fivePP, sevenPP, r, + Pair.create(bounds(fivePP, true, sixT.maxKeyBound(), true), bounds(sixT.maxKeyBound(), false, sevenPP, true)), + Pair.create(bounds(fivePP, true, sixT.maxKeyBound(), true), bounds(sixT.maxKeyBound(), false, sevenPP, false)), + Pair.create(bounds(fivePP, false, sixT.maxKeyBound(), true), bounds(sixT.maxKeyBound(), false, sevenPP, true)), + Pair.create(bounds(fivePP, false, sixT.maxKeyBound(), true), bounds(sixT.maxKeyBound(), false, sevenPP, false))); + + /* + * Test size 1 bound + */ + // Completely before + testInclusivity(onePP, onePP, r, + Pair.create(null, null)); + + // Completely after + testInclusivity(eightPP, eightPP, r, + Pair.create(null, bounds(eightPP, true, eightPP, true))); + + // Completely contained by range, should echo back the same bound + testInclusivity(fivePP, fivePP, r, + Pair.create(bounds(fivePP, true, fivePP, true), null)); + + // Overlap left only + testInclusivity(threePP, threePP, r, + Pair.create(null, null)); + + // Overlap right only + testInclusivity(sixPP, sixPP, r, + Pair.create(bounds(sixPP, true, sixPP, true), null)); + + /* + * Test all cases where the right of Bounds is minimum + */ + // Completely after + testInclusivity(sevenPP, minPP, r, + Pair.create(null, bounds(sevenPP, true, minPP, true)), + Pair.create(null, bounds(sevenPP, true, minPP, false)), + Pair.create(null, bounds(sevenPP, false, minPP, true)), + Pair.create(null, bounds(sevenPP, false, minPP, false))); + + // Contains range + testInclusivity(twoPP, minPP, r, + Pair.create(bounds(threeT.maxKeyBound(), false, sixT.maxKeyBound(), true), bounds(sixT.maxKeyBound(), false, minPP, true)), + Pair.create(bounds(threeT.maxKeyBound(), false, sixT.maxKeyBound(), true), bounds(sixT.maxKeyBound(), false, minPP, false)), + Pair.create(bounds(threeT.maxKeyBound(), false, sixT.maxKeyBound(), true), bounds(sixT.maxKeyBound(), false, minPP, true)), + Pair.create(bounds(threeT.maxKeyBound(), false, sixT.maxKeyBound(), true), bounds(sixT.maxKeyBound(), false, minPP, false))); + + // Split by range right bound + testInclusivity(fivePP, minPP, r, + Pair.create(bounds(fivePP, true, sixT.maxKeyBound(), true), bounds(sixT.maxKeyBound(), false, minPP, true)), + Pair.create(bounds(fivePP, true, sixT.maxKeyBound(), true), bounds(sixT.maxKeyBound(), false, minPP, false)), + Pair.create(bounds(fivePP, false, sixT.maxKeyBound(), true), bounds(sixT.maxKeyBound(), false, minPP, true)), + Pair.create(bounds(fivePP, false, sixT.maxKeyBound(), true), bounds(sixT.maxKeyBound(), false, minPP, false))); + + /* + * Test all cases where the right of the range is minimum + */ + r = r(threeT, minT); + // Completely before + testInclusivity(onePP, twoPP, r, + Pair.create(null, null), + Pair.create(null, null), + Pair.create(null, null), + Pair.create(null, null)); + + // Overlapping + testInclusivity(threePP, minPP, r, + Pair.create(bounds(threeT.maxKeyBound(), false, minPP, true), null), + Pair.create(bounds(threeT.maxKeyBound(), false, minPP, false), null), + Pair.create(bounds(threeT.maxKeyBound(), false, minPP, true), null), + Pair.create(bounds(threeT.maxKeyBound(), false, minPP, false), null)); + + // Completely contained by range, should echo back the same bound + testInclusivity(fourPP, fivePP, r, + Pair.create(bounds(fourPP, true, fivePP, true), null), + Pair.create(bounds(fourPP, true, fivePP, false), null), + Pair.create(bounds(fourPP, false, fivePP, true), null), + Pair.create(bounds(fourPP, false, fivePP, false), null)); + + // Overlap left only + testInclusivity(threePP, fivePP, r, + Pair.create(bounds(threeT.maxKeyBound(), false, fivePP, true), null), + Pair.create(bounds(threeT.maxKeyBound(), false, fivePP, false), null), + Pair.create(bounds(threeT.maxKeyBound(), false, fivePP, true), null), + Pair.create(bounds(threeT.maxKeyBound(), false, fivePP, false), null)); + + // Overlap right only + testInclusivity(fourPP, minPP, r, + Pair.create(bounds(fourPP, true, minPP, true), null), + Pair.create(bounds(fourPP, true, minPP, false), null), + Pair.create(bounds(fourPP, false, minPP, true), null), + Pair.create(bounds(fourPP, false, minPP, false), null)); + + // Contains range + testInclusivity(twoPP, minPP, r, + Pair.create(bounds(threeT.maxKeyBound(), false, minPP, true), null), + Pair.create(bounds(threeT.maxKeyBound(), false, minPP, false), null), + Pair.create(bounds(threeT.maxKeyBound(), false, minPP, true), null), + Pair.create(bounds(threeT.maxKeyBound(), false, minPP, false), null)); + + // Split by range left bound + testInclusivity(twoPP, fivePP, r, + Pair.create(bounds(threeT.maxKeyBound(), false, fivePP, true), null), + Pair.create(bounds(threeT.maxKeyBound(), false, fivePP, false), null), + Pair.create(bounds(threeT.maxKeyBound(), false, fivePP, true), null), + Pair.create(bounds(threeT.maxKeyBound(), false, fivePP, false), null)); + } + + private void testInclusivity(PartitionPosition left, PartitionPosition right, Range range, + Pair, AbstractBounds> expected) + { + testInclusivity(left, right, range, expected, null, null, null); + } + + private void testInclusivity(PartitionPosition left, PartitionPosition right, Range range, + Pair, AbstractBounds> expected1, + Pair, AbstractBounds> expected2, + Pair, AbstractBounds> expected3, + Pair, AbstractBounds> expected4) + { + testInclusivity(left, right, range, new Pair[] { expected1, expected2, expected3, expected4 }); + } + + private void testInclusivity(PartitionPosition left, PartitionPosition right, Range range, + Pair, AbstractBounds>[] expecteds) + { + int i = 0; + for (Boolean leftInclusive : ImmutableList.of(true, false)) + { + for (Boolean rightInclusive : ImmutableList.of(true, false)) + { + Pair, AbstractBounds> expected = expecteds[i++]; + if (expected == null) + continue; + AbstractBounds expectedIntersection = expected.left; + AbstractBounds expectedRemainder = expected.right; + AbstractBounds testBounds = bounds(left, leftInclusive, right, rightInclusive); + Pair, AbstractBounds> interSectionAndRemainder = intersectionAndRemainder(testBounds, range); + AbstractBounds intersection = interSectionAndRemainder.left; + AbstractBounds remainder = interSectionAndRemainder.right; + String message = format("Expected %s intersecting inclusive left %b, inclusive right %b, %s with %s", expected, leftInclusive, rightInclusive, bounds(left, leftInclusive, right, rightInclusive), range); + assertEquals(message, expected, intersectionAndRemainder(bounds(left, leftInclusive, right, rightInclusive), range)); + System.out.println(message.replace("Expected", "Expecting")); + + if (remainder == testBounds) + assertNull(intersection); + if (intersection == testBounds) + assertNull(remainder); + if (Objects.equals(remainder, testBounds) && remainder != testBounds) + fail("Should return existing bounds"); + if (Objects.equals(intersection, testBounds) && intersection != testBounds) + fail("Should return existing bounds"); + + // Need to validate that we roundtrip the actual exact input PartitionPosition and don't lose part of the input bound that might be needed + // Remainder can either be the entire thing because there is no intersection + // Remainder should always preserve the right bound since range is right inclusive, the left bound will always be a `maxKeyBound` from the range + if (remainder != null && remainder != testBounds) + { + assertTrue(remainder.right == expectedRemainder.right); + assertTrue(remainder.right == right); + assertEquals(remainder.inclusiveRight(), expectedRemainder.inclusiveRight()); + assertFalse(remainder.inclusiveLeft()); + assertFalse(remainder.left == left); + // Not strictly necessary, but we do always use the left of the range and create a new key bound + assertEquals(remainder.left, range.right.maxKeyBound()); + } + + // Range is left exclusive so the left should be preserved if it is greater than range left + // otherwise it should be replaced + if (intersection != null && intersection != testBounds) + { + if (intersection.left.getToken().compareTo(range.left) > 0) + { + assertEquals(intersection.inclusiveLeft(), expectedIntersection.inclusiveLeft()); + assertTrue(intersection.inclusiveRight()); + assertTrue(intersection.left == expectedIntersection.left); + assertTrue(intersection.left == left); + } + else + { + // Should be replaced by a KeyBound from the range + assertTrue(intersection.left != expectedIntersection.left); + assertTrue(intersection.left != left); + // Max bound since range is not left inclusive and excluded + assertEquals(intersection.left, range.left.maxKeyBound()); + } + } + } + } + } } diff --git a/test/unit/org/apache/cassandra/dht/SplitterTest.java b/test/unit/org/apache/cassandra/dht/SplitterTest.java index 1de22ff8fc69..560bc8241380 100644 --- a/test/unit/org/apache/cassandra/dht/SplitterTest.java +++ b/test/unit/org/apache/cassandra/dht/SplitterTest.java @@ -46,25 +46,25 @@ public class SplitterTest @Test public void randomSplitTestNoVNodesRandomPartitioner() { - randomSplitTestNoVNodes(new RandomPartitioner()); + randomSplitTestNoVNodes(RandomPartitioner.instance); } @Test public void randomSplitTestNoVNodesMurmur3Partitioner() { - randomSplitTestNoVNodes(new Murmur3Partitioner()); + randomSplitTestNoVNodes(Murmur3Partitioner.instance); } @Test public void randomSplitTestVNodesRandomPartitioner() { - randomSplitTestVNodes(new RandomPartitioner()); + randomSplitTestVNodes(RandomPartitioner.instance); } @Test public void randomSplitTestVNodesMurmur3Partitioner() { - randomSplitTestVNodes(new Murmur3Partitioner()); + randomSplitTestVNodes(Murmur3Partitioner.instance); } // CASSANDRA-18013 @@ -168,7 +168,7 @@ private static boolean assertRangeSizeEqual(List localRa for (int i = 0; i < tokens.size(); i++) { - Token end = i == tokens.size() - 1 ? partitioner.getMaximumToken() : tokens.get(i); + Token end = i == tokens.size() - 1 ? partitioner.getMaximumTokenForSplitting() : tokens.get(i); splits.add(sumOwnedBetween(localRanges, start, end, splitter, splitIndividualRanges)); start = end; } @@ -235,13 +235,13 @@ private static List generateLocalRanges(int numTokens, i @Test public void testSplitMurmur3Partitioner() { - testSplit(new Murmur3Partitioner()); + testSplit(Murmur3Partitioner.instance); } @Test public void testSplitRandomPartitioner() { - testSplit(new RandomPartitioner()); + testSplit(RandomPartitioner.instance); } @SuppressWarnings("unchecked") @@ -250,7 +250,7 @@ private static void testSplit(IPartitioner partitioner) boolean isRandom = partitioner instanceof RandomPartitioner; Splitter splitter = getSplitter(partitioner); BigInteger min = splitter.valueForToken(partitioner.getMinimumToken()); - BigInteger max = splitter.valueForToken(partitioner.getMaximumToken()); + BigInteger max = splitter.valueForToken(partitioner.getMaximumTokenForSplitting()); BigInteger first = isRandom ? RandomPartitioner.ZERO : min; BigInteger last = isRandom ? max.subtract(BigInteger.valueOf(1)) : max; BigInteger midpoint = last.add(first).divide(BigInteger.valueOf(2)); @@ -359,13 +359,13 @@ private static Token token(IPartitioner partitioner, Object n) @Test public void testTokensInRangeRandomPartitioner() { - testTokensInRange(new RandomPartitioner()); + testTokensInRange(RandomPartitioner.instance); } @Test public void testTokensInRangeMurmur3Partitioner() { - testTokensInRange(new Murmur3Partitioner()); + testTokensInRange(Murmur3Partitioner.instance); } private static void testTokensInRange(IPartitioner partitioner) @@ -373,8 +373,8 @@ private static void testTokensInRange(IPartitioner partitioner) Splitter splitter = getSplitter(partitioner); // test full range - Range fullRange = new Range<>(partitioner.getMinimumToken(), partitioner.getMaximumToken()); - BigInteger fullRangeSize = splitter.valueForToken(partitioner.getMaximumToken()).subtract(splitter.valueForToken(partitioner.getMinimumToken())); + Range fullRange = new Range<>(partitioner.getMinimumToken(), partitioner.getMaximumTokenForSplitting()); + BigInteger fullRangeSize = splitter.valueForToken(partitioner.getMaximumTokenForSplitting()).subtract(splitter.valueForToken(partitioner.getMinimumToken())); assertEquals(fullRangeSize, splitter.tokensInRange(fullRange)); fullRange = new Range<>(splitter.tokenForValue(BigInteger.valueOf(-10)), splitter.tokenForValue(BigInteger.valueOf(-10))); assertEquals(fullRangeSize, splitter.tokensInRange(fullRange)); @@ -391,13 +391,13 @@ private static void testTokensInRange(IPartitioner partitioner) @Test public void testElapsedTokensRandomPartitioner() { - testElapsedMultiRange(new RandomPartitioner()); + testElapsedMultiRange(RandomPartitioner.instance); } @Test public void testElapsedTokensMurmur3Partitioner() { - testElapsedMultiRange(new Murmur3Partitioner()); + testElapsedMultiRange(Murmur3Partitioner.instance); } private static void testElapsedMultiRange(IPartitioner partitioner) @@ -413,13 +413,13 @@ private static void testElapsedMultiRange(IPartitioner partitioner) // wrapped range BigInteger min = splitter.valueForToken(partitioner.getMinimumToken()); - BigInteger max = splitter.valueForToken(partitioner.getMaximumToken()); + BigInteger max = splitter.valueForToken(partitioner.getMaximumTokenForSplitting()); Range wrappedRange = new Range<>(splitter.tokenForValue(max.subtract(BigInteger.valueOf(1350))), splitter.tokenForValue(min.add(BigInteger.valueOf(20394)))); testElapsedTokens(partitioner, wrappedRange, true); // full range - Range fullRange = new Range<>(partitioner.getMinimumToken(), partitioner.getMaximumToken()); + Range fullRange = new Range<>(partitioner.getMinimumToken(), partitioner.getMaximumTokenForSplitting()); testElapsedTokens(partitioner, fullRange, false); } @@ -457,13 +457,13 @@ private static void testElapsedTokens(IPartitioner partitioner, Range ran @Test public void testPositionInRangeRandomPartitioner() { - testPositionInRangeMultiRange(new RandomPartitioner()); + testPositionInRangeMultiRange(RandomPartitioner.instance); } @Test public void testPositionInRangeMurmur3Partitioner() { - testPositionInRangeMultiRange(new Murmur3Partitioner()); + testPositionInRangeMultiRange(Murmur3Partitioner.instance); } private static void testPositionInRangeMultiRange(IPartitioner partitioner) @@ -490,15 +490,15 @@ private static void testPositionInRangeMultiRange(IPartitioner partitioner) testPositionInRange(partitioner, splitter, range); // Test wrap-around range - start = splitter.tokenForValue(splitter.valueForToken(partitioner.getMaximumToken()).subtract(BigInteger.valueOf(123456789))); + start = splitter.tokenForValue(splitter.valueForToken(partitioner.getMaximumTokenForSplitting()).subtract(BigInteger.valueOf(123456789))); end = splitter.tokenForValue(splitter.valueForToken(partitioner.getMinimumToken()).add(BigInteger.valueOf(123456789))); range = new Range<>(start, end); testPositionInRange(partitioner, splitter, range); // Test full range - testPositionInRange(partitioner, splitter, new Range<>(partitioner.getMinimumToken(), partitioner.getMaximumToken())); + testPositionInRange(partitioner, splitter, new Range<>(partitioner.getMinimumToken(), partitioner.getMaximumTokenForSplitting())); testPositionInRange(partitioner, splitter, new Range<>(partitioner.getMinimumToken(), partitioner.getMinimumToken())); - testPositionInRange(partitioner, splitter, new Range<>(partitioner.getMaximumToken(), partitioner.getMaximumToken())); + testPositionInRange(partitioner, splitter, new Range<>(partitioner.getMaximumTokenForSplitting(), partitioner.getMaximumTokenForSplitting())); testPositionInRange(partitioner, splitter, new Range<>(splitter.tokenForValue(BigInteger.ONE), splitter.tokenForValue(BigInteger.ONE))); } @@ -508,7 +508,7 @@ private static void testPositionInRange(IPartitioner partitioner, Splitter split //full range case if (range.left.equals(range.right)) { - actualRange = new Range<>(partitioner.getMinimumToken(), partitioner.getMaximumToken()); + actualRange = new Range<>(partitioner.getMinimumToken(), partitioner.getMaximumTokenForSplitting()); } assertEquals(0.0, splitter.positionInRange(actualRange.left, range), 0.01); assertEquals(0.25, splitter.positionInRange(getTokenInPosition(partitioner, actualRange, 0.25), range), 0.01); @@ -523,7 +523,7 @@ private static Token getTokenInPosition(IPartitioner partitioner, Range r { if (range.left.equals(range.right)) { - range = new Range<>(partitioner.getMinimumToken(), partitioner.getMaximumToken()); + range = new Range<>(partitioner.getMinimumToken(), partitioner.getMaximumTokenForSplitting()); } Splitter splitter = getSplitter(partitioner); BigInteger totalTokens = splitter.tokensInRange(range); @@ -535,7 +535,7 @@ private static Token getTokenInPosition(IPartitioner partitioner, Range r private static Token getWrappedToken(IPartitioner partitioner, BigInteger position) { Splitter splitter = getSplitter(partitioner); - BigInteger maxTokenValue = splitter.valueForToken(partitioner.getMaximumToken()); + BigInteger maxTokenValue = splitter.valueForToken(partitioner.getMaximumTokenForSplitting()); BigInteger minTokenValue = splitter.valueForToken(partitioner.getMinimumToken()); if (position.compareTo(maxTokenValue) > 0) { diff --git a/test/unit/org/apache/cassandra/dht/StreamStateStoreTest.java b/test/unit/org/apache/cassandra/dht/StreamStateStoreTest.java index d731385fd318..816366e26fd8 100644 --- a/test/unit/org/apache/cassandra/dht/StreamStateStoreTest.java +++ b/test/unit/org/apache/cassandra/dht/StreamStateStoreTest.java @@ -19,18 +19,18 @@ import java.util.Collections; -import org.apache.cassandra.db.commitlog.CommitLog; -import org.apache.cassandra.locator.RangesAtEndpoint; import org.junit.BeforeClass; import org.junit.Test; import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.db.commitlog.CommitLog; import org.apache.cassandra.locator.InetAddressAndPort; -import org.apache.cassandra.streaming.async.NettyStreamingConnectionFactory; +import org.apache.cassandra.locator.RangesAtEndpoint; import org.apache.cassandra.streaming.PreviewKind; import org.apache.cassandra.streaming.StreamEvent; import org.apache.cassandra.streaming.StreamOperation; import org.apache.cassandra.streaming.StreamSession; +import org.apache.cassandra.streaming.async.NettyStreamingConnectionFactory; import org.apache.cassandra.utils.FBUtilities; import static org.apache.cassandra.net.MessagingService.current_version; @@ -51,7 +51,7 @@ public static void initDD() public void testUpdateAndQueryAvailableRanges() { // let range (0, 100] of keyspace1 be bootstrapped. - IPartitioner p = new Murmur3Partitioner(); + IPartitioner p = Murmur3Partitioner.instance; Token.TokenFactory factory = p.getTokenFactory(); Range range = new Range<>(factory.fromString("0"), factory.fromString("100")); diff --git a/test/unit/org/apache/cassandra/dht/TokenTest.java b/test/unit/org/apache/cassandra/dht/TokenTest.java new file mode 100644 index 000000000000..fa934e2065d1 --- /dev/null +++ b/test/unit/org/apache/cassandra/dht/TokenTest.java @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.dht; + +import org.junit.Test; + +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.io.Serializers; +import org.apache.cassandra.io.util.DataOutputBuffer; +import org.apache.cassandra.net.MessagingService; +import org.apache.cassandra.utils.AccordGenerators; +import org.apache.cassandra.utils.CassandraGenerators; + +import static accord.utils.Property.qt; + +public class TokenTest +{ + static + { + DatabaseDescriptor.clientInitialization(); + } + + @Test + public void serde() + { + DataOutputBuffer output = new DataOutputBuffer(); + qt().check(rs -> { + IPartitioner partitioner = AccordGenerators.partitioner().next(rs); + DatabaseDescriptor.setPartitionerUnsafe(partitioner); + Token token = AccordGenerators.fromQT(CassandraGenerators.token(partitioner)).next(rs); + for (MessagingService.Version version : MessagingService.Version.values()) + Serializers.testSerde(output, Token.compactSerializer, token, version.value); + }); + } +} \ No newline at end of file diff --git a/test/unit/org/apache/cassandra/dht/tokenallocator/TokenAllocationTest.java b/test/unit/org/apache/cassandra/dht/tokenallocator/TokenAllocationTest.java index 6b4ef404656d..7099565e3eb0 100644 --- a/test/unit/org/apache/cassandra/dht/tokenallocator/TokenAllocationTest.java +++ b/test/unit/org/apache/cassandra/dht/tokenallocator/TokenAllocationTest.java @@ -28,16 +28,15 @@ import java.util.Set; import org.apache.commons.math3.stat.descriptive.SummaryStatistics; -import org.junit.AfterClass; import org.junit.Assert; import org.junit.Before; -import org.junit.BeforeClass; import org.junit.Test; import org.apache.cassandra.ServerTestUtils; -import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.CassandraTestBase; +import org.apache.cassandra.CassandraTestBase.DDDaemonInitialization; +import org.apache.cassandra.CassandraTestBase.UseMurmur3Partitioner; import org.apache.cassandra.dht.IPartitioner; -import org.apache.cassandra.dht.Murmur3Partitioner; import org.apache.cassandra.dht.Token; import org.apache.cassandra.distributed.test.log.ClusterMetadataTestHelper; import org.apache.cassandra.exceptions.ConfigurationException; @@ -56,18 +55,12 @@ import static org.junit.Assert.assertEquals; import static org.junit.Assert.fail; -public class TokenAllocationTest +@DDDaemonInitialization +@UseMurmur3Partitioner +public class TokenAllocationTest extends CassandraTestBase { - static IPartitioner oldPartitioner; static Random rand = new Random(1); - @BeforeClass - public static void beforeClass() throws ConfigurationException - { - DatabaseDescriptor.daemonInitialization(); - oldPartitioner = StorageService.instance.setPartitionerUnsafe(Murmur3Partitioner.instance); - } - @Before public void before() throws ConfigurationException { @@ -81,12 +74,6 @@ public void after() throws ConfigurationException ClusterMetadataService.unsetInstance(); } - @AfterClass - public static void afterClass() - { - DatabaseDescriptor.setPartitionerUnsafe(oldPartitioner); - } - private static TokenAllocation createForTest(ClusterMetadata metadata, int replicas, int numTokens) { return TokenAllocation.create(metadata.locator.local().datacenter, metadata, replicas, numTokens); diff --git a/test/unit/org/apache/cassandra/exceptions/RemoteExceptionTest.java b/test/unit/org/apache/cassandra/exceptions/RemoteExceptionTest.java new file mode 100644 index 000000000000..42e211e16236 --- /dev/null +++ b/test/unit/org/apache/cassandra/exceptions/RemoteExceptionTest.java @@ -0,0 +1,112 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.exceptions; + +import java.util.HashMap; +import java.util.Map; + +import org.junit.Test; + +import org.apache.cassandra.io.util.DataInputBuffer; +import org.apache.cassandra.io.util.DataOutputBuffer; +import org.apache.cassandra.net.MessagingService; + +import static com.google.common.base.Throwables.getStackTraceAsString; +import static org.apache.cassandra.exceptions.ExceptionSerializer.getMessageWithOriginatingHost; +import static org.apache.cassandra.exceptions.ExceptionSerializer.nullableRemoteExceptionSerializer; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNull; + +public class RemoteExceptionTest +{ + @Test + public void testRoundtrip() throws Exception + { + testRoundtrip(null); + Throwable root = new Throwable(); + testRoundtrip(root); + Throwable suppressed = new Throwable(); + Throwable causedByRoot = new Throwable(root); + testRoundtrip(causedByRoot); + causedByRoot.addSuppressed(root); + testRoundtrip(causedByRoot); + root.addSuppressed(causedByRoot); + testRoundtrip(root); + root.addSuppressed(suppressed); + testRoundtrip(root); + } + + public void testRoundtrip(Throwable original) throws Exception + { + Throwable normalizedOriginal = normalizeThrowable(original); + + DataOutputBuffer dob = new DataOutputBuffer(); + nullableRemoteExceptionSerializer.serialize(original, dob, MessagingService.current_version); + assertEquals(nullableRemoteExceptionSerializer.serializedSize(original, MessagingService.current_version), dob.toByteArray().length); + DataInputBuffer dib = new DataInputBuffer(dob.toByteArray()); + Throwable test = nullableRemoteExceptionSerializer.deserialize(dib, MessagingService.current_version); + if (original == null) + { + assertNull(test); + } + else + { + String originalString = getStackTraceAsString(normalizedOriginal); + String testString = getStackTraceAsString(test); + assertEquals(originalString, testString); + } + } + + public static Throwable normalizeThrowable(Throwable t) throws Exception + { + return normalizeThrowable(t, true, new HashMap<>()); + } + + private static Throwable normalizeThrowable(Throwable t, boolean isFirstException, Map alreadyNormalized) throws Exception + { + if (t == null) + return null; + + if (alreadyNormalized.containsKey(t)) + return alreadyNormalized.get(t); + + // Classloader, module name, and module version are difficult to get right because STE doesn't + // expose enough parameters to serialize the formatting correctly so settle for something close, but not exact + // Alternatives look fragile across different JVM versions and yield only moderate additional debugability + // when using class loaders and modules + StackTraceElement[] originalStack = t.getStackTrace(); + StackTraceElement[] normalizedStack = new StackTraceElement[originalStack.length]; + for (int i = 0; i < originalStack.length; i++) + { + StackTraceElement originalSTE = originalStack[i]; + normalizedStack[i] = new StackTraceElement(originalSTE.getClassName(), originalSTE.getMethodName(), originalSTE.getFileName(), originalSTE.getLineNumber()); + } + + Throwable normalized; + if (t.getCause() == null) + normalized = t.getClass().getConstructor(String.class).newInstance(getMessageWithOriginatingHost(t, isFirstException)); + else + normalized = t.getClass().getConstructor(String.class, Throwable.class).newInstance(getMessageWithOriginatingHost(t, isFirstException), normalizeThrowable(t.getCause(), false, alreadyNormalized)); + alreadyNormalized.put(t, normalized); + normalized.setStackTrace(normalizedStack); + for (Throwable suppressed : t.getSuppressed()) + normalized.addSuppressed(normalizeThrowable(suppressed, false, alreadyNormalized)); + return normalized; + } +} \ No newline at end of file diff --git a/test/unit/org/apache/cassandra/exceptions/RequestFailureReasonTest.java b/test/unit/org/apache/cassandra/exceptions/RequestFailureReasonTest.java index b2fdcd365d73..4be82d491c73 100644 --- a/test/unit/org/apache/cassandra/exceptions/RequestFailureReasonTest.java +++ b/test/unit/org/apache/cassandra/exceptions/RequestFailureReasonTest.java @@ -20,8 +20,10 @@ import org.junit.Test; +import static org.assertj.core.api.Assertions.assertThatThrownBy; import static org.junit.Assert.assertEquals; + public class RequestFailureReasonTest { private static final RequestFailureReason[] REASONS = RequestFailureReason.values(); @@ -37,7 +39,9 @@ public class RequestFailureReasonTest { 7, "READ_TOO_MANY_INDEXES" }, { 8, "NOT_CMS" }, { 9, "INVALID_ROUTING" }, - { 10, "COORDINATOR_BEHIND" } + { 10, "COORDINATOR_BEHIND" }, + { 11, "RETRY_ON_DIFFERENT_TRANSACTION_SYSTEM" }, + { 503, "INDEX_BUILD_IN_PROGRESS" }, }; @Test @@ -54,4 +58,42 @@ public void testEnumCodesAndNames() assertEquals("Number of RequestFailureReason enum constants has changed. Update the test.", EXPECTED_VALUES.length, REASONS.length); } + + @Test + public void testFromCode() + { + // Test valid codes + for (Object[] expected : EXPECTED_VALUES) + { + int code = (Integer) expected[0]; + String name = (String) expected[1]; + assertEquals(RequestFailureReason.valueOf(name), RequestFailureReason.fromCode(code)); + } + + // Test invalid codes + assertEquals(RequestFailureReason.UNKNOWN, RequestFailureReason.fromCode(200)); + assertEquals(RequestFailureReason.UNKNOWN, RequestFailureReason.fromCode(999)); + assertThatThrownBy(() -> RequestFailureReason.fromCode(-1)).isInstanceOf(IllegalArgumentException.class); + } + + @Test + public void testExceptionSubclassMapping() + { + // Create a subclass of UnknownTableException + class CustomUnknownTableException extends IncompatibleSchemaException + { + public CustomUnknownTableException(String ks) + { + super(ks); + } + } + + // Verify the parent class still maps correctly + assertEquals(RequestFailureReason.INCOMPATIBLE_SCHEMA, + RequestFailureReason.forException(new CustomUnknownTableException("ks"))); + + // Test unmapped exception returns UNKNOWN + assertEquals(RequestFailureReason.UNKNOWN, + RequestFailureReason.forException(new RuntimeException("test"))); + } } diff --git a/test/unit/org/apache/cassandra/gms/GossiperTest.java b/test/unit/org/apache/cassandra/gms/GossiperTest.java index 4362021e57fb..d254850a36d2 100644 --- a/test/unit/org/apache/cassandra/gms/GossiperTest.java +++ b/test/unit/org/apache/cassandra/gms/GossiperTest.java @@ -205,7 +205,7 @@ public void testDuplicatedStateUpdate() throws Exception proposedRemoteState = new EndpointState(proposedRemoteHeartBeat); // Bump the heartbeat version and use the same TOKENS state - proposedRemoteHeartBeat.updateHeartBeat(); + proposedRemoteState.updateHeartBeat(); proposedRemoteState.addApplicationState(ApplicationState.TOKENS, tokensValue); // The following state change should only update heartbeat without updating the TOKENS state diff --git a/test/unit/org/apache/cassandra/gms/SerializationsTest.java b/test/unit/org/apache/cassandra/gms/SerializationsTest.java index c7320b194e6b..28a77144662e 100644 --- a/test/unit/org/apache/cassandra/gms/SerializationsTest.java +++ b/test/unit/org/apache/cassandra/gms/SerializationsTest.java @@ -133,7 +133,7 @@ private static class Statics private static List Digests = new ArrayList(); static { - HeartbeatSt.updateHeartBeat(); + EndpointSt.updateHeartBeat(); EndpointSt.addApplicationState(ApplicationState.LOAD, vv0); EndpointSt.addApplicationState(ApplicationState.STATUS_WITH_PORT, vv1); for (int i = 0; i < 100; i++) diff --git a/test/unit/org/apache/cassandra/gms/VersionedValueTest.java b/test/unit/org/apache/cassandra/gms/VersionedValueTest.java index ac47ee5496ce..af0177e3573d 100644 --- a/test/unit/org/apache/cassandra/gms/VersionedValueTest.java +++ b/test/unit/org/apache/cassandra/gms/VersionedValueTest.java @@ -50,4 +50,4 @@ private static Gen values() // sometimes the text is too big, must not be larger than Short.MAX_VALUE .filter(vv -> TypeSizes.encodedUTF8Length(vv.value) <= Short.MAX_VALUE); } -} \ No newline at end of file +} diff --git a/test/unit/org/apache/cassandra/hints/HintServiceBytemanTest.java b/test/unit/org/apache/cassandra/hints/HintServiceBytemanTest.java index b7f431dd81be..1beaa123a8bf 100644 --- a/test/unit/org/apache/cassandra/hints/HintServiceBytemanTest.java +++ b/test/unit/org/apache/cassandra/hints/HintServiceBytemanTest.java @@ -42,7 +42,7 @@ import org.jboss.byteman.contrib.bmunit.BMRule; import org.jboss.byteman.contrib.bmunit.BMUnitRunner; -import static org.apache.cassandra.hints.HintsTestUtil.MockFailureDetector; +import org.apache.cassandra.utils.MockFailureDetector; import static org.apache.cassandra.hints.HintsTestUtil.sendHintsAndResponses; import static org.junit.Assert.assertEquals; @@ -52,7 +52,7 @@ public class HintServiceBytemanTest private static final String KEYSPACE = "hints_service_test"; private static final String TABLE = "table"; - private final MockFailureDetector failureDetector = new HintsTestUtil.MockFailureDetector(); + private final MockFailureDetector failureDetector = new MockFailureDetector(); private static TableMetadata metadata; @BeforeClass diff --git a/test/unit/org/apache/cassandra/hints/HintsServiceTest.java b/test/unit/org/apache/cassandra/hints/HintsServiceTest.java index dd0eb5a6edde..78aa85c59e0a 100644 --- a/test/unit/org/apache/cassandra/hints/HintsServiceTest.java +++ b/test/unit/org/apache/cassandra/hints/HintsServiceTest.java @@ -20,6 +20,7 @@ import java.util.concurrent.ExecutionException; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; +import java.util.concurrent.atomic.AtomicInteger; import javax.annotation.Nullable; import com.google.common.util.concurrent.Futures; @@ -30,8 +31,16 @@ import org.junit.BeforeClass; import org.junit.Test; +import accord.primitives.Keys; +import accord.primitives.TxnId; import com.datastax.driver.core.utils.MoreFutures; import org.apache.cassandra.SchemaLoader; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.db.ColumnFamilyStore; +import org.apache.cassandra.db.ConsistencyLevel; +import org.apache.cassandra.db.IMutation; +import org.apache.cassandra.db.Mutation; +import org.apache.cassandra.metrics.HintsServiceMetrics; import org.apache.cassandra.metrics.StorageMetrics; import org.apache.cassandra.net.MessagingService; import org.apache.cassandra.net.MockMessagingService; @@ -40,11 +49,27 @@ import org.apache.cassandra.schema.Schema; import org.apache.cassandra.schema.TableMetadata; import org.apache.cassandra.service.StorageService; +import org.apache.cassandra.service.accord.AccordResult; +import org.apache.cassandra.service.accord.AccordTestUtils; +import org.apache.cassandra.service.accord.IAccordService.IAccordResult; +import org.apache.cassandra.service.accord.TimeOnlyRequestBookkeeping.LatencyRequestBookkeeping; +import org.apache.cassandra.service.accord.txn.TxnData; +import org.apache.cassandra.service.accord.txn.TxnResult; +import org.apache.cassandra.service.consensus.migration.ConsensusMigrationMutationHelper; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.transport.Dispatcher; +import org.apache.cassandra.utils.MockFailureDetector; -import static org.apache.cassandra.hints.HintsTestUtil.MockFailureDetector; +import static org.apache.cassandra.Util.spinAssertEquals; +import static org.apache.cassandra.config.CassandraRelevantProperties.HINT_DISPATCH_INTERVAL_MS; import static org.apache.cassandra.hints.HintsTestUtil.sendHintsAndResponses; +import static org.apache.cassandra.hints.HintsTestUtil.sendHintsWithRetryDifferentSystemUUID; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; +import static org.mockito.ArgumentMatchers.notNull; +import static org.mockito.Mockito.spy; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; public class HintsServiceTest { @@ -63,12 +88,15 @@ public static void defineSchema() KeyspaceParams.simple(1), SchemaLoader.standardCFMD(KEYSPACE, TABLE)); metadata = Schema.instance.getTableMetadata(KEYSPACE, TABLE); + HINT_DISPATCH_INTERVAL_MS.setLong(100); + DatabaseDescriptor.setHintsFlushPeriodInMS(100); } @After public void cleanup() { MockMessagingService.cleanup(); + ConsensusMigrationMutationHelper.resetInstanceForTest(); } @Before @@ -173,4 +201,58 @@ public void testPageSeek() throws InterruptedException, ExecutionException assertTrue(dispatchOffset != null); assertTrue(((ChecksummedDataInput.Position) dispatchOffset).sourcePosition > 0); } + + /* + * Make sure that if hints from the batchlog end up needing to be executed without Accord + * that they are turned into + */ + @Test + public void testHintsNeedingRehinting() throws Throwable + { + ColumnFamilyStore cfs = ColumnFamilyStore.getIfExists(metadata.id); + long startWrites = cfs.metric.writeLatency.latency.getCount(); + HintsService.instance = spy(HintsService.instance); + AtomicInteger accordTxnCount = new AtomicInteger(); + ConsensusMigrationMutationHelper.replaceInstanceForTest( + new ConsensusMigrationMutationHelper() + { + int count = 0; + + @Override + public SplitMutation splitMutationIntoAccordAndNormal(T mutation, ClusterMetadata cm) + { + if (count > 2) + return super.splitMutationIntoAccordAndNormal(mutation, cm); + + SplitMutation split; + if (count % 2 == 0) + split = new SplitMutation(mutation, null); + else + split = new SplitMutation<>(null, mutation); + count++; + return split; + } + + @Override + public IAccordResult mutateWithAccordAsync(ClusterMetadata cm, Mutation mutation, @Nullable ConsistencyLevel consistencyLevel, Dispatcher.RequestTime requestTime) + { + accordTxnCount.incrementAndGet(); + TxnId txnId = AccordTestUtils.txnId(42, 43, 44); + AccordResult result = new AccordResult<>(txnId, Keys.EMPTY, new LatencyRequestBookkeeping(null), requestTime.startedAtNanos(), requestTime.startedAtNanos(), true); + result.accept(new TxnData(), null); + return result; + } + }); + sendHintsWithRetryDifferentSystemUUID(metadata); + // Two should be Accord transactions + spinAssertEquals(2, accordTxnCount::get, 10); + Thread.sleep(1000); + // An attempt should be made to write to all replicas + verify(HintsService.instance, times(1)).writeForAllReplicas(notNull()); + // And it should be written locally + spinAssertEquals(startWrites + 1L, cfs.metric.writeLatency.latency::getCount, 10); + + // Hints that are rehinted are treated as succeeding immediately for the ACCORD_HINT_ENDPOINT + assertEquals(3, HintsServiceMetrics.getDelayCount(HintsServiceMetrics.ACCORD_HINT_ENDPOINT)); + } } diff --git a/test/unit/org/apache/cassandra/hints/HintsTestUtil.java b/test/unit/org/apache/cassandra/hints/HintsTestUtil.java index 727404e6e8e5..b3f3250481fa 100644 --- a/test/unit/org/apache/cassandra/hints/HintsTestUtil.java +++ b/test/unit/org/apache/cassandra/hints/HintsTestUtil.java @@ -24,9 +24,6 @@ import org.apache.cassandra.db.DecoratedKey; import org.apache.cassandra.db.partitions.AbstractBTreePartition; import org.apache.cassandra.db.partitions.PartitionUpdate; -import org.apache.cassandra.gms.IFailureDetectionEventListener; -import org.apache.cassandra.gms.IFailureDetector; -import org.apache.cassandra.locator.InetAddressAndPort; import org.apache.cassandra.net.Message; import org.apache.cassandra.net.MockMessagingService; import org.apache.cassandra.net.MockMessagingSpy; @@ -35,12 +32,12 @@ import org.apache.cassandra.service.StorageService; import org.apache.cassandra.utils.Clock; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; import static org.apache.cassandra.Util.dk; import static org.apache.cassandra.net.MockMessagingService.verb; import static org.apache.cassandra.net.Verb.HINT_REQ; import static org.apache.cassandra.net.Verb.HINT_RSP; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; final class HintsTestUtil { @@ -92,43 +89,19 @@ static MockMessagingSpy sendHintsAndResponses(TableMetadata metadata, int noOfHi return spy; } - static class MockFailureDetector implements IFailureDetector + static void sendHintsWithRetryDifferentSystemUUID(TableMetadata metadata) { - boolean isAlive = true; - - public boolean isAlive(InetAddressAndPort ep) - { - return isAlive; - } - - public void interpret(InetAddressAndPort ep) + // create and write three hints, two that should be routed to Accord, and one should need rehinting since + // it doesn't end up routed to Accord + UUID hostId = HintsService.RETRY_ON_DIFFERENT_SYSTEM_UUID; + for (int i = 0; i < 3; i++) { - throw new UnsupportedOperationException(); - } - - public void report(InetAddressAndPort ep) - { - throw new UnsupportedOperationException(); - } - - public void registerFailureDetectionEventListener(IFailureDetectionEventListener listener) - { - throw new UnsupportedOperationException(); - } - - public void unregisterFailureDetectionEventListener(IFailureDetectionEventListener listener) - { - throw new UnsupportedOperationException(); - } - - public void remove(InetAddressAndPort ep) - { - throw new UnsupportedOperationException(); - } - - public void forceConviction(InetAddressAndPort ep) - { - throw new UnsupportedOperationException(); + long now = Clock.Global.currentTimeMillis(); + DecoratedKey dkey = dk(String.valueOf(i)); + PartitionUpdate.SimpleBuilder builder = PartitionUpdate.simpleBuilder(metadata, dkey).timestamp(now); + builder.row("column0").add("val", "value0"); + Hint hint = Hint.create(builder.buildAsMutation(), now); + HintsService.instance.write(hostId, hint); } } } diff --git a/test/unit/org/apache/cassandra/index/CustomIndexTest.java b/test/unit/org/apache/cassandra/index/CustomIndexTest.java index 9f568a877394..483d109ec53c 100644 --- a/test/unit/org/apache/cassandra/index/CustomIndexTest.java +++ b/test/unit/org/apache/cassandra/index/CustomIndexTest.java @@ -20,7 +20,16 @@ */ package org.apache.cassandra.index; -import java.util.*; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Optional; +import java.util.Set; import java.util.concurrent.Callable; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; @@ -33,6 +42,7 @@ import com.google.common.collect.Maps; import com.google.common.collect.Sets; import org.junit.Assume; +import org.junit.BeforeClass; import org.junit.Test; import com.datastax.driver.core.exceptions.QueryValidationException; @@ -43,12 +53,21 @@ import org.apache.cassandra.cql3.Operator; import org.apache.cassandra.cql3.restrictions.IndexRestrictions; import org.apache.cassandra.cql3.restrictions.StatementRestrictions; -import org.apache.cassandra.cql3.statements.schema.IndexTarget; import org.apache.cassandra.cql3.statements.ModificationStatement; -import org.apache.cassandra.db.*; import org.apache.cassandra.db.ColumnFamilyStore.FlushReason; import org.apache.cassandra.db.filter.RowFilter; import org.apache.cassandra.db.lifecycle.LifecycleNewTracker; +import org.apache.cassandra.cql3.statements.schema.IndexTarget; +import org.apache.cassandra.db.CassandraWriteContext; +import org.apache.cassandra.db.ColumnFamilyStore; +import org.apache.cassandra.db.DecoratedKey; +import org.apache.cassandra.db.DeletionTime; +import org.apache.cassandra.db.Keyspace; +import org.apache.cassandra.db.RangeTombstone; +import org.apache.cassandra.db.ReadCommand; +import org.apache.cassandra.db.ReadExecutionController; +import org.apache.cassandra.db.RegularAndStaticColumns; +import org.apache.cassandra.db.WriteContext; import org.apache.cassandra.db.marshal.AbstractType; import org.apache.cassandra.db.marshal.Int32Type; import org.apache.cassandra.db.marshal.UTF8Type; @@ -80,6 +99,20 @@ public class CustomIndexTest extends CQLTester { + @BeforeClass + public static void setUpClass() // overrides CQLTester.setUpClass() + { + // Accord breaks indexBuildingPagesLargePartitions because it introduces blocking OpOrder.Group + // when it sees the schema change and forces a flush of the Accord keyspace topologies table + // which creates a blocking OpOrder.Group. + // The test is explicitly trying to assert none of the created groups are blocking and that is pretty + // fragile as implemented since any background things could create mark a group blocking becuase Keyspace.writeOrder + // is global + CQLTester.daemonInitialization(); + DatabaseDescriptor.setAccordTransactionsEnabled(false); + CQLTester.setUpClass(); + } + @Test public void testInsertsOnCfsBackedIndex() throws Throwable { @@ -1183,7 +1216,7 @@ public void removeRow(Row row) { } @Test - public void testFlushObserver() throws Throwable + public void testFlushObserver() { createTable("CREATE TABLE %s (k int, c int, s int static, v int, PRIMARY KEY (k, c))"); String indexName = "test_index_with_flush_observer"; diff --git a/test/unit/org/apache/cassandra/index/IndexStatusManagerTest.java b/test/unit/org/apache/cassandra/index/IndexStatusManagerTest.java index 947b7a57bc47..39401ac1bc47 100644 --- a/test/unit/org/apache/cassandra/index/IndexStatusManagerTest.java +++ b/test/unit/org/apache/cassandra/index/IndexStatusManagerTest.java @@ -359,7 +359,7 @@ public void shouldThrowWhenNoQueryableEndpoints() .hasMessageStartingWith("Operation failed") .hasMessageContaining("INDEX_NOT_AVAILABLE from /127.0.0.253:7000") .hasMessageContaining("INDEX_NOT_AVAILABLE from /127.0.0.254:7000") - .hasMessageContaining("INDEX_NOT_AVAILABLE from /127.0.0.255:7000"); + .hasMessageContaining("INDEX_BUILD_IN_PROGRESS from /127.0.0.255:7000"); } void runTest(Testcase testcase) diff --git a/test/unit/org/apache/cassandra/index/StubIndex.java b/test/unit/org/apache/cassandra/index/StubIndex.java index cfaff698ec10..696f05062832 100644 --- a/test/unit/org/apache/cassandra/index/StubIndex.java +++ b/test/unit/org/apache/cassandra/index/StubIndex.java @@ -228,9 +228,13 @@ public ReadCommand command() } @Override - public UnfilteredPartitionIterator search(ReadExecutionController executionController) + public UnfilteredPartitionIterator search(ReadExecutionController controller) { - return Util.executeLocally((PartitionRangeReadCommand)command, baseCfs, executionController); + if (command instanceof PartitionRangeReadCommand) + return Util.executeLocally((PartitionRangeReadCommand)command, baseCfs, controller); + if (command instanceof SinglePartitionReadCommand) + return Util.executeLocally((SinglePartitionReadCommand) command, baseCfs, controller); + throw new IllegalArgumentException("Unexpected ReadCommand type: " + command.getClass()); } } } diff --git a/test/unit/org/apache/cassandra/index/accord/CheckpointIntervalArrayIndexTest.java b/test/unit/org/apache/cassandra/index/accord/CheckpointIntervalArrayIndexTest.java new file mode 100644 index 000000000000..70506bb722b9 --- /dev/null +++ b/test/unit/org/apache/cassandra/index/accord/CheckpointIntervalArrayIndexTest.java @@ -0,0 +1,441 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.index.accord; + +import java.io.Closeable; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.time.Duration; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.EnumMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.TreeSet; +import java.util.UUID; +import java.util.function.Consumer; +import java.util.stream.Collectors; + +import com.google.common.collect.ImmutableMap; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import accord.utils.Gen; +import accord.utils.Gens; +import accord.utils.RandomSource; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.db.ClusteringComparator; +import org.apache.cassandra.db.marshal.ByteBufferAccessor; +import org.apache.cassandra.db.marshal.Int32Type; +import org.apache.cassandra.dht.Murmur3Partitioner; +import org.apache.cassandra.index.accord.CheckpointIntervalArrayIndex.Interval; +import org.apache.cassandra.index.accord.IndexDescriptor.IndexComponent; +import org.apache.cassandra.io.sstable.Descriptor; +import org.apache.cassandra.io.sstable.SequenceBasedSSTableId; +import org.apache.cassandra.io.util.File; +import org.apache.cassandra.io.util.FileHandle; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.utils.ByteArrayUtil; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.bytecomparable.ByteSource; +import org.apache.cassandra.utils.bytecomparable.ByteSourceInverse; +import org.assertj.core.api.Assertions; + +import static accord.utils.Property.qt; +import static org.apache.cassandra.utils.ByteBufferUtil.bytes; + +public class CheckpointIntervalArrayIndexTest +{ + private static final Logger logger = LoggerFactory.getLogger(CheckpointIntervalArrayIndexTest.class); + + static + { + DatabaseDescriptor.toolInitialization(); + } + + private static final byte[] EMPTY = new byte[0]; + private static final TreeSet EMPTY_TREE_SET = new TreeSet<>(); + + private static final Gen.IntGen MAX_TOKEN_GEN = Gens.pickInt(1 << 14, + 1 << 16, + 1 << 20, + 1 << 30); + + private enum Pattern { RANDOM, NO_OVERLAP, PARTIAL_OVERLAP } + + private static final Gen PATTERN_GEN = Gens.enums().all(Pattern.class); + + @Rule + public TemporaryFolder folder = new TemporaryFolder(); + private int generation = 0; + + @Test + public void simple() throws IOException + { + int bytesPerKey = Integer.BYTES; + int bytesPerValue = 0; + + List list = new ArrayList<>(10); + list.add(new Interval(bytes(Integer.MIN_VALUE).array(), bytes(Integer.MAX_VALUE).array(), EMPTY)); + for (int i = 0; i < 10; i++) + list.add(new Interval(bytes(i).array(), bytes(i + 1).array(), EMPTY)); + + try (var searcher = index(bytesPerKey, bytesPerValue, list)) + { + Set> expected = Set.of(List.of(-2147483648, 2147483647), + List.of(2, 3), + List.of(3, 4)); + Set> actual = new HashSet<>(); + var stats = searcher.intersects(bytes(2).array(), bytes(4).array(), value -> actual.add(List.of(ByteBuffer.wrap(value.start).getInt(), ByteBuffer.wrap(value.end).getInt()))); + logger.info("Stats: {}", stats); + Assertions.assertThat(actual).isEqualTo(expected); + } + } + + @Test + public void fuzzSmall() + { + var minToken = 0; + int numRecords = 10; + qt().withTimeout(Duration.ofSeconds(60)).check(rs -> fuzz(rs, minToken, MAX_TOKEN_GEN.nextInt(rs), PATTERN_GEN.next(rs), numRecords)); + } + + @Test + public void fuzzMedium() + { + var minToken = 0; + int numRecords = 1_000; + qt().withTimeout(Duration.ofSeconds(60)).check(rs -> fuzz(rs, minToken, MAX_TOKEN_GEN.nextInt(rs), PATTERN_GEN.next(rs), numRecords)); + } + + private void fuzz(RandomSource rs, int minToken, int maxToken, Pattern pattern, int numRecords) throws IOException + { + List intervals = buildIntervals(rs, minToken, maxToken, pattern, numRecords); + List nonContainedRanges = findMissingRanges(intervals); + + try (var searcher = index(Integer.BYTES, Integer.BYTES, intervals)) + { + for (int i = 0, samples = rs.nextInt(Math.min(10, numRecords), Math.min(10, numRecords) * 10); i < samples; i++) + { + SearchContext ctx = rs.decide(.2) ? miss(rs, nonContainedRanges) + : hit(rs, intervals, pattern); + Set actual = new TreeSet<>(); + try + { + var stats = searcher.intersects(ctx.start, ctx.end, interval -> actual.add(new DetailedInterval(interval))); + logger.info("[Pattern={}, size={}, expectedMatches={}, query=[{}, {})] Stats: {}", pattern, intervals.size(), ctx.expected.size(), ctx.a, ctx.b, stats); + } + catch (Throwable t) + { + throw new AssertionError(String.format("Failure searching for [%d, %d) from %s", ctx.a, ctx.b, intervals), t); + } + Assertions.assertThat(actual).describedAs("search(%d, %d) from %s", ctx.a, ctx.b, intervals).isEqualTo(ctx.expected); + } + } + } + + /** + * mutable/shared ctx to avoid allocating in a loop... + */ + private SearchContext searchContext = new SearchContext(); + + private SearchContext miss(RandomSource rs, List nonContainedRanges) + { + var range = rs.pick(nonContainedRanges); + var s = unbc(range.start); + var e = unbc(range.end); + int domain = e - s; + int a, b; + if (domain == 1) + { + // you can not find multiple values within this range! + a = s; + b = e; + } + else + { + a = e == Integer.MAX_VALUE ? rs.nextInt(s, e) : rs.nextInt(s, e) + 1; + b = e == Integer.MAX_VALUE ? rs.nextInt(s, e) : rs.nextInt(s, e) + 1; + for (int i = 0; i < 42 && a == b; i++) + b = e == Integer.MAX_VALUE ? rs.nextInt(s, e) : rs.nextInt(s, e) + 1; + if (a == b) + throw new IllegalStateException("Unable to create missing range: " + range); + if (b < a) + { + var tmp = a; + a = b; + b = tmp; + } + } + searchContext.a = a; + searchContext.b = b; + searchContext.start = bc(a); + searchContext.end = bc(b); + searchContext.expected = EMPTY_TREE_SET; + return searchContext; + } + + private SearchContext hit(RandomSource rs, List intervals, Pattern pattern) + { + int numRecords = intervals.size(); + DetailedInterval first, second; + do + { + var offset = rs.nextInt(0, numRecords); + int endOffset; + switch (pattern) + { + case PARTIAL_OVERLAP: + case RANDOM: + endOffset = offset; + break; + case NO_OVERLAP: + endOffset = offset == numRecords - 1 ? offset : offset + rs.nextInt(1, Math.min(3, numRecords - offset)); + break; + default: + throw new IllegalArgumentException("Unknown pattern: " + pattern); + } + first = intervals.get(offset); + second = intervals.get(endOffset); + } + while (first.compareTo(second) == 0 && first.size() == 1); + int a, b; + a = rs.nextInt(unbc(first.start), unbc(first.end)) + 1; + b = rs.nextInt(unbc(second.start), unbc(second.end)) + 1; + while (a == b) + b = rs.nextInt(unbc(second.start), unbc(second.end)) + 1; + if (b < a) + { + var tmp = b; + b = a; + a = tmp; + } + + searchContext.start = bc(a); + searchContext.end = bc(b); + + searchContext.expected = intervals.stream().filter(i -> i.intersects(searchContext.start, searchContext.end)).collect(Collectors.toCollection(TreeSet::new)); + Assertions.assertThat(searchContext.expected).isNotEmpty(); + return searchContext; + } + + private static List buildIntervals(RandomSource rs, int minToken, int maxToken, Pattern pattern, int numRecords) + { + List intervals = new ArrayList<>(numRecords); + { + var domain = maxToken - minToken + 1; + var delta = domain / numRecords; + var sub_delta = delta / 2; + for (int i = 0; i < numRecords; i++) + { + switch (pattern) + { + case RANDOM: + { + var start = rs.nextInt(minToken, maxToken); + var remaining = maxToken - start; + var end = start + (remaining == 1 ? 1 : rs.nextInt(1, remaining)); + intervals.add(new DetailedInterval(bc(start), bc(end), bytes(i).array())); + } + break; + case NO_OVERLAP: + { + var start = delta * i; + var end = start + sub_delta; + intervals.add(new DetailedInterval(bc(start), bc(end), bytes(i).array())); + } + break; + case PARTIAL_OVERLAP: + { + if (i > 1 && rs.decide(.2)) + { + // overlap + DetailedInterval start, end; + do + { + int numOverlaps = rs.nextInt(1, Math.min(3, intervals.size())); + int offset = rs.nextInt(0, intervals.size() - numOverlaps); + start = intervals.get(offset); + end = intervals.get(offset + numOverlaps); + } + while (start.compareStart(end) == 0 && start.size() == 1); + var a = rs.nextInt(unbc(start.start), unbc(start.end)) + 1; + var b = rs.nextInt(unbc(end.start), unbc(end.end)) + 1; + if (a == b && end.size() == 1) + { + while (a == b) + a = rs.nextInt(unbc(start.start), unbc(start.end)) + 1; + } + else + { + while (a == b) + b = rs.nextInt(unbc(end.start), unbc(end.end)) + 1; + } + if (a > b) + { + var tmp = a; + a = b; + b = tmp; + } + intervals.add(new DetailedInterval(bc(a), bc(b), bytes(i).array())); + intervals.sort(Comparator.naturalOrder()); // so partial can work next time + } + else + { + // no overlap + var start = delta * i; + var end = start + sub_delta; + intervals.add(new DetailedInterval(bc(start), bc(end), bytes(i).array())); + } + } + break; + default: + throw new IllegalArgumentException("Unknown pattern: " + pattern); + } + } + intervals.sort(Comparator.naturalOrder()); + } + return intervals; + } + + private static List findMissingRanges(List intervals) + { + List list = new ArrayList<>(); + list.add(new DetailedInterval(bc(Integer.MIN_VALUE), intervals.get(0).start, bytes(0).array())); + // track current visable coverage + int end = unbc(intervals.get(0).end); + for (var i : intervals) + { + int istar = unbc(i.start); + int iend = unbc(i.end); + if (end >= istar) + { + // current scope includes this range + end = Math.max(end, iend); + } + else + { + // range doesn't intersect, and a new start/end are formed! + list.add(new DetailedInterval(bc(end), bc(istar), bytes(list.size()).array())); + end = iend; + } + } + list.add(new DetailedInterval(bc(end), bc(Integer.MAX_VALUE), bytes(list.size()).array())); + return list; + } + + private static class DetailedInterval extends Interval + { + public DetailedInterval(byte[] start, byte[] end, byte[] value) + { + super(start, end, value); + } + + public DetailedInterval(Interval other) + { + super(other); + } + + public int size() + { + return unbc(end) - unbc(start); + } + + @Override + public String toString() + { + return "[" + unbc(start) + ", " + unbc(end) + ") -> " + ByteBuffer.wrap(value).getInt(); + } + } + + private static byte[] bc(int value) + { + ByteBuffer bb = bytes(value); + var bs = Int32Type.instance.asComparableBytes(ByteBufferAccessor.instance, bb, ByteComparable.Version.OSS50); + return ByteSourceInverse.readBytes(bs); + } + + private static int unbc(byte[] bc) + { + return Int32Type.instance.fromComparableBytes(ByteSource.peekable(ByteSource.fixedLength(bc)), ByteComparable.Version.OSS50).getInt(); + } + + @SuppressWarnings({ "IOResourceOpenedButNotSafelyClosed", "resource" }) + private Searcher index(int bytesPerKey, int bytesPerValue, List sortedIntervals) throws IOException + { + IndexDescriptor descriptor = nextDescriptor(); + + var writer = new CheckpointIntervalArrayIndex.SegmentWriter(descriptor, bytesPerKey, bytesPerValue); + var metas = writer.write(sortedIntervals.toArray(Interval[]::new)); + + // going through the RouteIndexFormat isn't required for this test, but it helps improve coverage there... + Segment segment = new Segment(ImmutableMap.of(new Group(0, TableId.fromUUID(new UUID(0, 0))), new Segment.Metadata(metas, ByteArrayUtil.EMPTY_BYTE_ARRAY, ByteArrayUtil.EMPTY_BYTE_ARRAY))); + RouteIndexFormat.appendSegment(descriptor, segment); + + Map files = new EnumMap<>(IndexComponent.class); + for (IndexComponent c : descriptor.getLiveComponents()) + files.put(c, new FileHandle.Builder(descriptor.fileFor(c)).mmapped(true).complete()); + List segments = RouteIndexFormat.readSegments(files); + files.remove(IndexComponent.SEGMENT).close(); + files.remove(IndexComponent.METADATA).close(); + + var searcher = new CheckpointIntervalArrayIndex.SegmentSearcher(files.get(IndexComponent.CINTIA_SORTED_LIST).sharedCopy(), metas.get(IndexComponent.CINTIA_SORTED_LIST).offset, + files.get(IndexComponent.CINTIA_CHECKPOINTS).sharedCopy(), metas.get(IndexComponent.CINTIA_CHECKPOINTS).offset); + return new Searcher() + { + @Override + public CheckpointIntervalArrayIndex.Stats intersects(byte[] start, byte[] end, Consumer callback) throws IOException + { + return searcher.intersects(start, end, callback); + } + + @Override + public void close() + { + searcher.close(); + for (var fh : files.values()) + fh.close(); + } + }; + } + + private IndexDescriptor nextDescriptor() + { + return IndexDescriptor.create(new Descriptor(new File(folder.getRoot()), "test", "test", new SequenceBasedSSTableId(generation++)), + Murmur3Partitioner.instance, + new ClusteringComparator()); + } + + private static class SearchContext + { + TreeSet expected; + byte[] start, end; + int a, b; + } + + public interface Searcher extends Closeable + { + CheckpointIntervalArrayIndex.Stats intersects(byte[] start, byte[] end, Consumer callback) throws IOException; + } +} \ No newline at end of file diff --git a/test/unit/org/apache/cassandra/index/accord/RouteIndexTest.java b/test/unit/org/apache/cassandra/index/accord/RouteIndexTest.java new file mode 100644 index 000000000000..963ee402c8ee --- /dev/null +++ b/test/unit/org/apache/cassandra/index/accord/RouteIndexTest.java @@ -0,0 +1,772 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.index.accord; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Set; +import java.util.TreeSet; +import java.util.function.Supplier; +import java.util.stream.Collectors; + +import com.google.common.collect.Iterables; +import com.google.common.collect.Sets; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Test; + +import accord.api.Journal; +import accord.api.RoutingKey; +import accord.local.CommandStore; +import accord.local.CommandStores; +import accord.local.CommandStores.RangesForEpoch; +import accord.local.DurableBefore; +import accord.local.Node; +import accord.local.RedundantBefore; +import accord.local.StoreParticipants; +import accord.primitives.Ballot; +import accord.primitives.Deps; +import accord.primitives.FullKeyRoute; +import accord.primitives.PartialDeps; +import accord.primitives.Range; +import accord.primitives.Ranges; +import accord.primitives.Routable.Domain; +import accord.primitives.Route; +import accord.primitives.SaveStatus; +import accord.primitives.Timestamp; +import accord.primitives.Txn; +import accord.primitives.TxnId; +import accord.topology.Shard; +import accord.topology.Topology; +import accord.utils.Gen; +import accord.utils.Gens; +import accord.utils.Property.Command; +import accord.utils.Property.UnitCommand; +import accord.utils.RandomSource; +import org.agrona.collections.Int2ObjectHashMap; +import org.agrona.collections.Long2ObjectHashMap; +import org.agrona.collections.ObjectHashSet; +import org.apache.cassandra.concurrent.ExecutorFactory; +import org.apache.cassandra.concurrent.ForwardingExecutorFactory; +import org.apache.cassandra.concurrent.ForwardingExecutorPlus; +import org.apache.cassandra.concurrent.ImmediateExecutor; +import org.apache.cassandra.concurrent.SequentialExecutorPlus; +import org.apache.cassandra.config.CassandraRelevantProperties; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.config.OptionaldPositiveInt; +import org.apache.cassandra.cql3.CQLTester; +import org.apache.cassandra.db.ColumnFamilyStore; +import org.apache.cassandra.db.Keyspace; +import org.apache.cassandra.db.compaction.CompactionManager; +import org.apache.cassandra.dht.Murmur3Partitioner.LongToken; +import org.apache.cassandra.schema.SchemaConstants; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.service.accord.AccordCommandStore; +import org.apache.cassandra.service.accord.AccordJournal; +import org.apache.cassandra.service.accord.AccordKeyspace; +import org.apache.cassandra.service.accord.AccordService; +import org.apache.cassandra.service.accord.AccordTestUtils; +import org.apache.cassandra.service.accord.AccordTopology; +import org.apache.cassandra.service.accord.IAccordService; +import org.apache.cassandra.service.accord.IAccordService.AccordCompactionInfo; +import org.apache.cassandra.service.accord.TokenRange; +import org.apache.cassandra.service.accord.api.TokenKey; +import org.apache.cassandra.service.consensus.TransactionalMode; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.tcm.membership.NodeId; +import org.apache.cassandra.utils.AccordGenerators; +import org.apache.cassandra.utils.FBUtilities; +import org.apache.cassandra.utils.LazyToString; +import org.apache.cassandra.utils.RTree; +import org.apache.cassandra.utils.RangeTree; +import org.apache.cassandra.utils.concurrent.CountDownLatch; +import org.assertj.core.api.Assertions; +import org.mockito.Mockito; + +import static accord.local.RedundantStatus.SomeStatus.NONE; +import static accord.utils.Property.commands; +import static accord.utils.Property.stateful; +import static accord.utils.SortedArrays.SortedArrayList.ofSorted; +import static org.apache.cassandra.config.DatabaseDescriptor.getPartitioner; +import static org.apache.cassandra.schema.SchemaConstants.ACCORD_KEYSPACE_NAME; + +public class RouteIndexTest extends CQLTester +{ + private static final Node.Id NODE = new Node.Id(42); + private static final int MIN_TOKEN = 0; + private static final int MAX_TOKEN = 1 << 18; + private static final int TOKEN_RANGE_SIZE = MAX_TOKEN - MIN_TOKEN + 1; + private static final int MAX_STORES = 10; + private static final Gen.IntGen NUM_STORES_GEN = Gens.ints().between(1, MAX_STORES); + private static final Gen TOKEN_DISTRIBUTION = Gens.mixedDistribution(MIN_TOKEN, MAX_TOKEN + 1); + private static final Gen RANGE_SIZE_DISTRIBUTION = Gens.mixedDistribution(10, (int) (TOKEN_RANGE_SIZE * .01)); + private static final Gen> DOMAIN_DISTRIBUTION = Gens.mixedDistribution(Domain.values()); + + @BeforeClass + public static void setUpClass() + { + // since this test does frequent truncates, the info table gets updated and forced flushed... which is 90% of the cost of this test... + // this flag disables that flush + CassandraRelevantProperties.UNSAFE_SYSTEM.setBoolean(true); + + DatabaseDescriptor.daemonInitialization(); + DatabaseDescriptor.setAccordTransactionsEnabled(true); + // disable journal compaction so the test can control when it happens + DatabaseDescriptor.getAccord().enable_journal_compaction = false; + DatabaseDescriptor.setIncrementalBackupsEnabled(false); + DatabaseDescriptor.setAutoSnapshot(false); + + CQLTester.prePrepareServer(); + + // Journal will async release segment references and close files... this adds possible race condition issues with truncate + // so make these steps happen inline. + // Simulatored executors for the journal would be ideal, but given the blocking nature only a few executors can be simulated + // without The Simulator, so full simulation is out of scope of this test. + ExecutorFactory.Global.unsafeSet(new ForwardingExecutorFactory(ExecutorFactory.Global.executorFactory()) { + @Override + public SequentialExecutorPlus sequential(String name) + { + if (name.endsWith("-releaser") || name.endsWith("-closer")) + return new ForwardingExecutorPlus(ImmediateExecutor.INSTANCE); + return super.sequential(name); + } + }); + + CQLTester.InMemory.prepareServer(); + } + + private static TableId tableId = null; + + @Before + public void setupTable() + { + if (tableId != null) return; + String name = createTable("CREATE TABLE %s (pk int primary key) WITH " + TransactionalMode.full.asCqlParam()); + tableId = Keyspace.open(KEYSPACE).getColumnFamilyStore(name).metadata().id; + } + + private Command insert(RandomSource rs, State state) + { + Domain domain = state.domainGen.next(rs); + TxnId txnId = state.nextTxnId(domain); + Route route = createRoute(state, rs, domain, rs.nextInt(1, 20)); + int storeId = state.accordService.node().commandStores().select(route.homeKey()).id(); + return new InsertTxn(storeId, txnId, SaveStatus.PreAccepted, route); + } + + private static KeySearch keySearchExisting(RandomSource rs, State state) + { + int storeId = rs.pickUnorderedSet(state.storeToTableToRangesToTxns.keySet()); + var tables = state.storeToTableToRangesToTxns.get(storeId); + TableId tableId = rs.pickUnorderedSet(tables.keySet()); + var ranges = tables.get(tableId); + TokenRange range = selectExistingRange(rs, ranges); + + // have a key, so find a key within the range + long start = range.start().isMin() ? Long.MIN_VALUE : ((LongToken) range.start().token()).token; + long end = range.end().isMax() ? Long.MAX_VALUE : ((LongToken) range.end().token()).token; + long token = 1 + rs.nextLong(start, end); + return new KeySearch(storeId, new TokenKey(tableId, new LongToken(token))); + } + + private static RangeSearch rangeSearchExisting(RandomSource rs, State state) + { + int storeId = rs.pickUnorderedSet(state.storeToTableToRangesToTxns.keySet()); + var tables = state.storeToTableToRangesToTxns.get(storeId); + TableId tableId = rs.pickUnorderedSet(tables.keySet()); + var ranges = tables.get(tableId); + return new RangeSearch(storeId, selectExistingRange(rs, ranges)); + } + + private static Command rangeSearch(RandomSource rs, State state) + { + return new RangeSearch(rs.nextInt(0, state.numStores), state.rangeGen.next(rs)); + } + + private static Command keySearch(RandomSource rs, State state) + { + return new KeySearch(rs.nextInt(0, state.numStores), new TokenKey(rs.pick(state.tables), new LongToken(state.tokenGen.nextInt(rs)))); + } + + @Test + public void test() + { + cfs().disableAutoCompaction(); // let the test control compaction + //TODO (coverage): include with the ability to mark ranges as durable for compaction cleanup + stateful().withExamples(10).withSteps(500).check(commands(() -> State::new, Sut::new) + .destroyState(State::close) + .destroySut(Sut::close) + .addIf(State::mayFlush, CLOSE) + .add(COMPACTOR) + .addIf(State::mayCompact, COMPACT) + .add(PURGE) + .add(RESTART) + .add(this::insert) + .add(RouteIndexTest::rangeSearch) + .add(RouteIndexTest::keySearch) + .addIf(state -> !state.storeToTableToRangesToTxns.isEmpty(), RouteIndexTest::rangeSearchExisting) + .addIf(state -> !state.storeToTableToRangesToTxns.isEmpty(), RouteIndexTest::keySearchExisting) + .build()); + } + + private static class InsertTxn implements UnitCommand + { + private final int storeId; + private final TxnId txnId; + private final SaveStatus saveStatus; + private final StoreParticipants participants; + + private InsertTxn(int storeId, TxnId txnId, SaveStatus saveStatus, Route route) + { + this.storeId = storeId; + this.txnId = txnId; + this.saveStatus = saveStatus; + this.participants = StoreParticipants.all(route); + } + + @Override + public void applyUnit(State state) + { + state.insertTxn(storeId, txnId, participants.route()); + } + + @Override + public void runUnit(Sut sut) + { + sut.insertTxn(storeId, txnId, saveStatus, participants); + } + + @Override + public String toString() + { + return "InsertTxn{" + + "storeId=" + storeId + + ", txnId=" + txnId + + ", saveStatus=" + saveStatus + + ", participants=" + participants + + '}'; + } + } + + private static class KeySearch implements Command> + { + private final int storeId; + private final TokenKey key; + + private KeySearch(int storeId, TokenKey key) + { + this.storeId = storeId; + this.key = key; + } + + @Override + public Set apply(State state) throws Throwable + { + var tables = state.storeToTableToRangesToTxns.get(storeId); + if (tables == null) return Collections.emptySet(); + var ranges = tables.get(key.table()); + if (ranges == null) return Collections.emptySet(); + Set matches = new HashSet<>(); + ranges.searchToken(key, e -> matches.add(e.getValue())); + return matches; + } + + @Override + public Set run(Sut sut) throws Throwable + { + Set result = new ObjectHashSet<>(); + sut.journal.get().rangeSearcher().search(storeId, key, TxnId.NONE, Timestamp.MAX).consume(result::add); + return result; + } + + @Override + public void checkPostconditions(State state, Set expected, + Sut sut, Set actual) + { + Assertions.assertThat(actual).describedAs("Unexpected txns for key %s", key).isEqualTo(expected); + } + + @Override + public String toString() + { + return "KeySearch{" + + "storeId=" + storeId + + ", key=" + key + + '}'; + } + } + + private static class RangeSearch implements Command> + { + private final int storeId; + private final TokenRange range; + + private RangeSearch(int storeId, TokenRange range) + { + this.storeId = storeId; + this.range = range; + } + + @Override + public Set apply(State state) throws Throwable + { + var tables = state.storeToTableToRangesToTxns.get(storeId); + if (tables == null) return Collections.emptySet(); + var ranges = tables.get(range.table()); + if (ranges == null) return Collections.emptySet(); + Set matches = new HashSet<>(); + ranges.search(range, e -> matches.add(e.getValue())); + return matches; + } + + @Override + public Set run(Sut sut) throws Throwable + { + Set result = new ObjectHashSet<>(); + sut.journal.get().rangeSearcher().search(storeId, range, TxnId.NONE, Timestamp.MAX).consume(result::add); + return result; + } + + @Override + public void checkPostconditions(State state, Set expected, + Sut sut, Set actual) + { + Assertions.assertThat(actual).describedAs("Unexpected txns for range %s; missing %s, added %s", range, LazyToString.lazy(() -> Sets.difference(expected, actual).toString()), LazyToString.lazy(() -> Sets.difference(actual, expected).toString())).isEqualTo(expected); + } + + @Override + public String toString() + { + return "RangeSearch{" + + "storeId=" + storeId + + ", range=" + range + + '}'; + } + } + + private static abstract class CassandraCommand implements UnitCommand + { + private final String name; + + protected CassandraCommand(String name) + { + this.name = name; + } + + @Override + public void applyUnit(State state) + { + // no-op + } + + @Override + public String detailed(State state) + { + return name; + } + } + + private static final CassandraCommand CLOSE = new CassandraCommand("Close Current Segment") + { + @Override + public void runUnit(Sut sut) + { + sut.journal.get().closeCurrentSegmentForTestingIfNonEmpty(); + } + }; + + private static final CassandraCommand COMPACTOR = new CassandraCommand("Compactor") + { + @Override + public void runUnit(Sut sut) + { + sut.journal.get().runCompactorForTesting(); + } + }; + + private static final CassandraCommand COMPACT = new CassandraCommand("Compact") + { + @Override + public void runUnit(Sut sut) + { + try + { + sut.cfs.enableAutoCompaction(); + FBUtilities.waitOnFutures(CompactionManager.instance.submitBackground(sut.cfs)); + } + finally + { + sut.cfs.disableAutoCompaction(); + } + } + }; + + private static final CassandraCommand PURGE = new CassandraCommand("Purge") + { + @Override + public void runUnit(Sut sut) + { + sut.journal.get().purge(sut.stores.get(), () -> 0); + } + }; + + private static final UnitCommand RESTART = new UnitCommand() + { + @Override + public void applyUnit(State state) throws Throwable + { + state.restartAccord(); + } + + @Override + public void runUnit(Sut sut) + { + // no-op + } + + @Override + public String detailed(State state) + { + return "Restart Accord"; + } + }; + + private static class State implements AutoCloseable + { + private final Int2ObjectHashMap>>> storeToTableToRoutingKeysToTxns = new Int2ObjectHashMap<>(); + private final Int2ObjectHashMap>> storeToTableToRangesToTxns = new Int2ObjectHashMap<>(); + private final Int2ObjectHashMap storeRangesForEpochs = new Int2ObjectHashMap<>(); + private final RedundantBefore emptyRedundantBefore = RedundantBefore.create(Ranges.of(TokenRange.fullRange(tableId, getPartitioner())), TxnId.NONE, NONE); + + private final int numStores; + private final List tables; + private final Gen.IntGen tokenGen; + private final Gen rangeGen; + private final Gen domainGen; + private final ColumnFamilyStore journalTable; + private AccordService accordService; + private int hlc = 1000; + + public State(RandomSource rs) + { + numStores = NUM_STORES_GEN.nextInt(rs); + DatabaseDescriptor.getAccord().command_store_shard_count = new OptionaldPositiveInt(numStores); + tables = Collections.singletonList(tableId); + tokenGen = TOKEN_DISTRIBUTION.next(rs); + rangeGen = rangeGen(rs, tables); + domainGen = DOMAIN_DISTRIBUTION.next(rs); + journalTable = Keyspace.open(ACCORD_KEYSPACE_NAME).getColumnFamilyStore(AccordKeyspace.JOURNAL); + + for (int i = 0 ; i < numStores ; ++i) + storeRangesForEpochs.put(i, new RangesForEpoch(1, Ranges.of(TokenRange.fullRange(tableId, getPartitioner())))); + + accordService = startAccord(); + accordService.configService().listener.notifyPostCommit(null, ClusterMetadata.current(), false); + accordService.epochReady(ClusterMetadata.current().epoch).awaitUninterruptibly(); + } + + AccordService startAccord() + { + NodeId tcmNodeId = ClusterMetadata.current().myNodeId(); + AccordService as = new AccordService(AccordTopology.tcmIdToAccord(tcmNodeId)); + Topology topology = new Topology(1, Shard.create(TokenRange.fullRange(tableId, getPartitioner()), ofSorted(new Node.Id(1)), ofSorted(new Node.Id(1)))); + as.unsafeStartupWithOverrides(new Journal.TopologyUpdate(storeRangesForEpochs, topology, topology)); + for (CommandStore commandStore : as.node().commandStores().all()) + ((AccordCommandStore)commandStore).unsafeUpsertRedundantBefore(emptyRedundantBefore); + // the reason for the mocking is to speed up compaction. Collecting the info from the stores has been slow and its always empty in this test... so stub it out to speed up the test + AccordService mock = Mockito.spy(as); + Mockito.doReturn(emptyCompactionInfo(tableId, emptyRedundantBefore, storeRangesForEpochs)).when(mock).getCompactionInfo(); + AccordService.unsafeSetNewAccordService(mock); + + AccordService.replayJournal(as); + return as; + } + + TxnId nextTxnId(Domain domain) + { + return new TxnId(1, hlc++, Txn.Kind.Write, domain, NODE); + } + + void insertTxn(int storeId, TxnId txnId, Route route) + { + for (var u : Objects.requireNonNull(route)) + { + switch (u.domain()) + { + case Key: + { + TokenKey key = (TokenKey) u; + var table = key.table(); + var token = key.token().getLongValue(); + storeToTableToRoutingKeysToTxns.computeIfAbsent(storeId, ignore -> new HashMap<>()) + .computeIfAbsent(table, ignore -> new Long2ObjectHashMap<>()) + .computeIfAbsent(token, ignore -> new ArrayList<>()) + .add(txnId); + } + break; + case Range: + { + TokenRange range = (TokenRange) u; + var table = range.table(); + storeToTableToRangesToTxns.computeIfAbsent(storeId, ignore -> new HashMap<>()) + .computeIfAbsent(table, ignore -> rangeTree()) + .add(range, txnId); + } + break; + default: + throw new AssertionError("Unexpected domain: " + u.domain()); + } + } + } + + public boolean mayFlush() + { + return accordService.journal().inMemorySize() > 0; + } + + public boolean mayCompact() + { + return journalTable.getLiveSSTables().size() > 1; + } + + @Override + public String toString() + { + return "State{" + + "numStores=" + numStores + + ", tables=" + tables + + '}'; + } + + @Override + public void close() + { + accordService.shutdown(); + } + + private void restartAccord() + { + accordService.shutdown(); + accordService = startAccord(); + } + } + + public static class Sut implements AutoCloseable + { + private final ColumnFamilyStore cfs; + private final Supplier stores; + private final Supplier journal; + + public Sut(State state) + { + cfs = cfs(); + this.stores = () -> state.accordService.node().commandStores(); + this.journal = () -> state.accordService.journal(); + } + + void insertTxn(int storeId, TxnId txnId, SaveStatus saveStatus, StoreParticipants participants) + { + Txn txn = toTxn(txnId, participants); + AccordGenerators.CommandBuilder builder = new AccordGenerators.CommandBuilder(txnId, txn, txnId, txn.slice(participants.owns().toRanges(), true), PartialDeps.NONE, Ballot.ZERO, Ballot.ZERO, accord.local.Command.WaitingOn.none(txnId.domain(), Deps.NONE)); + var cmd = builder.build(saveStatus); + CountDownLatch latch = CountDownLatch.newCountDownLatch(1); + journal.get().saveCommand(storeId, new Journal.CommandUpdate(null, cmd), () -> latch.decrement()); + latch.awaitThrowUncheckedOnInterrupt(); + } + + private static Txn toTxn(TxnId txnId, StoreParticipants participants) + { + Ranges ranges = Objects.requireNonNull(participants.route()).toRanges(); + return AccordTestUtils.createTxn(txnId.kind(), ranges); + } + + @Override + public void close() throws Exception + { + journal.get().truncateForTesting(); + cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS); + cfs.truncateBlocking(); + } + } + + private static RangeTree rangeTree() + { + return RTree.create(ACCESSOR); + } + + private static final RangeTree.Accessor ACCESSOR = new RangeTree.Accessor<>() + { + @Override + public TokenKey start(TokenRange tokenRange) + { + return tokenRange.start(); + } + + @Override + public TokenKey end(TokenRange tokenRange) + { + return tokenRange.end(); + } + + @Override + public boolean contains(TokenKey start, TokenKey end, TokenKey tokenKey) + { + return TokenRange.create(start, end).contains(tokenKey); + } + + @Override + public boolean intersects(TokenRange tokenRange, TokenKey start, TokenKey end) + { + return tokenRange.compareIntersecting(TokenRange.create(start, end)) == 0; + } + }; + + private static IAccordService.AccordCompactionInfos emptyCompactionInfo(TableId tableId, RedundantBefore redundantBefore, Int2ObjectHashMap storeRangesForEpoch) + { + IAccordService.AccordCompactionInfos compactionInfos = new IAccordService.AccordCompactionInfos(DurableBefore.EMPTY, 0); + for (int i = 0; i < storeRangesForEpoch.size(); i++) + compactionInfos.put(i, new AccordCompactionInfo(i, redundantBefore, storeRangesForEpoch.get(i), tableId)); + return compactionInfos; + } + + private static ColumnFamilyStore cfs() + { + return Keyspace.open(SchemaConstants.ACCORD_KEYSPACE_NAME) + .getColumnFamilyStore(AccordKeyspace.JOURNAL); + } + + private static Gen rangeGen(RandomSource rand, List tables) + { + Gen.IntGen tokenGen = TOKEN_DISTRIBUTION.next(rand); + Gen tableIdGen = Gens.mixedDistribution(tables).next(rand); + switch (rand.nextInt(0, 3)) + { + case 0: // pure random + return rs -> { + int a = tokenGen.nextInt(rs); + int b = tokenGen.nextInt(rs); + while (a == b) + b = tokenGen.nextInt(rs); + if (a > b) + { + int tmp = a; + a = b; + b = tmp; + } + TableId tableId = tableIdGen.next(rs); + return TokenRange.create(new TokenKey(tableId, new LongToken(a)), + new TokenKey(tableId, new LongToken(b))); + }; + case 1: // small range + Gen.IntGen rangeSizeGen = RANGE_SIZE_DISTRIBUTION.next(rand); + return rs -> { + int a = tokenGen.nextInt(rs); + int rangeSize = rangeSizeGen.nextInt(rs); + int b = a + rangeSize; + if (b > MAX_TOKEN) + { + b = a; + a = b - rangeSize; + } + TableId tableId = tableIdGen.next(rs); + return TokenRange.create(new TokenKey(tableId, new LongToken(a)), + new TokenKey(tableId, new LongToken(b))); + }; + case 2: // single element + return rs -> { + int a = tokenGen.nextInt(rs); + int b = a + 1; + TableId tableId = tableIdGen.next(rs); + return TokenRange.create(new TokenKey(tableId, new LongToken(a)), + new TokenKey(tableId, new LongToken(b))); + }; + default: + throw new AssertionError(); + } + } + + private static Route createRoute(State state, RandomSource rs, Domain domain, int numKeys) + { + switch (domain) + { + case Key: + { + TreeSet keys = new TreeSet<>(); + while (keys.size() < numKeys) + { + var table = rs.pick(state.tables); + var token = new LongToken(state.tokenGen.nextInt(rs)); + keys.add(new TokenKey(table, token)); + } + return new FullKeyRoute(keys.first(), keys.toArray(RoutingKey[]::new)); + } + case Range: + { + TreeSet set = new TreeSet<>(Range::compareTo); + while (set.size() < numKeys) + set.add(state.rangeGen.next(rs)); + return Ranges.ofSorted(set.toArray(Range[]::new)).toRoute(set.first().end()); + } + default: + throw new IllegalArgumentException("Unknown domain: " + domain); + } + } + + private static TokenRange selectExistingRange(RandomSource rs, RangeTree ranges) + { + TreeSet distinctRanges = ranges.stream().map(Map.Entry::getKey).collect(Collectors.toCollection(() -> new TreeSet<>(TokenRange::compareTo))); + TokenRange range; + if (distinctRanges.size() == 1) + { + range = Iterables.getFirst(distinctRanges, null); + } + else + { + switch (rs.nextInt(0, 2)) + { + case 0: // perfect match + range = rs.pickOrderedSet(distinctRanges); + break; + case 1: // mutli-match + { + TokenRange a = rs.pickOrderedSet(distinctRanges); + TokenRange b = rs.pickOrderedSet(distinctRanges); + while (a.equals(b)) + b = rs.pickOrderedSet(distinctRanges); + if (b.compareTo(a) < 0) + { + TokenRange tmp = a; + a = b; + b = tmp; + } + range = TokenRange.create(a.start(), b.end()); + } + break; + default: + throw new AssertionError(); + } + } + return range; + } +} \ No newline at end of file diff --git a/test/unit/org/apache/cassandra/index/internal/CassandraIndexTest.java b/test/unit/org/apache/cassandra/index/internal/CassandraIndexTest.java index 647b4d85f82d..3b3ca9479440 100644 --- a/test/unit/org/apache/cassandra/index/internal/CassandraIndexTest.java +++ b/test/unit/org/apache/cassandra/index/internal/CassandraIndexTest.java @@ -42,6 +42,7 @@ import org.apache.cassandra.schema.ColumnMetadata; import org.apache.cassandra.schema.SchemaConstants; import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.service.accord.AccordKeyspace; import org.apache.cassandra.utils.ByteBufferUtil; import org.apache.cassandra.utils.FBUtilities; import org.awaitility.Awaitility; @@ -569,25 +570,25 @@ public void indexCorrectlyMarkedAsBuildAndRemoved() throws Throwable Awaitility.await() .atMost(1, TimeUnit.MINUTES) .pollDelay(1, TimeUnit.SECONDS) - .untilAsserted(() -> assertRows(execute(selectBuiltIndexesQuery), row("system", "PaxosUncommittedIndex", null))); + .untilAsserted(() -> assertRows(execute(selectBuiltIndexesQuery), row("system", "PaxosUncommittedIndex", null), row("system_accord", AccordKeyspace.JOURNAL_INDEX_NAME, null))); String indexName = "build_remove_test_idx"; createTable("CREATE TABLE %s (a int, b int, c int, PRIMARY KEY (a, b))"); createIndex(String.format("CREATE INDEX %s ON %%s(c)", indexName)); // check that there are no other rows in the built indexes table - assertRows(execute(selectBuiltIndexesQuery), row(KEYSPACE, indexName, null), row("system", "PaxosUncommittedIndex", null)); + assertRows(execute(selectBuiltIndexesQuery), row(KEYSPACE, indexName, null), row("system", "PaxosUncommittedIndex", null), row("system_accord", AccordKeyspace.JOURNAL_INDEX_NAME, null)); // rebuild the index and verify the built status table getCurrentColumnFamilyStore().rebuildSecondaryIndex(indexName); waitForIndexQueryable(indexName); // check that there are no other rows in the built indexes table - assertRows(execute(selectBuiltIndexesQuery), row(KEYSPACE, indexName, null), row("system", "PaxosUncommittedIndex", null)); + assertRows(execute(selectBuiltIndexesQuery), row(KEYSPACE, indexName, null), row("system", "PaxosUncommittedIndex", null), row("system_accord", AccordKeyspace.JOURNAL_INDEX_NAME, null)); // check that dropping the index removes it from the built indexes table dropIndex("DROP INDEX %s." + indexName); - assertRows(execute(selectBuiltIndexesQuery), row("system", "PaxosUncommittedIndex", null)); + assertRows(execute(selectBuiltIndexesQuery), row("system", "PaxosUncommittedIndex", null), row("system_accord", AccordKeyspace.JOURNAL_INDEX_NAME, null)); } diff --git a/test/unit/org/apache/cassandra/index/sai/SAITester.java b/test/unit/org/apache/cassandra/index/sai/SAITester.java index e004f0c0ee6c..b0a312c6985c 100644 --- a/test/unit/org/apache/cassandra/index/sai/SAITester.java +++ b/test/unit/org/apache/cassandra/index/sai/SAITester.java @@ -46,6 +46,7 @@ import javax.management.ObjectName; import com.google.common.collect.Sets; +import com.google.common.primitives.Ints; import org.junit.After; import org.junit.Assert; import org.junit.BeforeClass; @@ -311,7 +312,7 @@ public static StorageAttachedIndex createMockIndex(AbstractType cellType) public static IndexTermType createIndexTermType(AbstractType cellType) { - return IndexTermType.create(ColumnMetadata.regularColumn("sai", "internal", "val", cellType), Collections.emptyList(), IndexTarget.Type.SIMPLE); + return IndexTermType.create(ColumnMetadata.regularColumn("sai", "internal", "val", cellType, ColumnMetadata.NO_UNIQUE_ID), Collections.emptyList(), IndexTarget.Type.SIMPLE); } public IndexIdentifier createIndexIdentifier(String indexName) @@ -678,7 +679,9 @@ protected void runInitializationTask() throws Exception protected int getCompactionTasks() { - return CompactionManager.instance.getActiveCompactions() + CompactionManager.instance.getPendingTasks(); + long activeCount = CompactionManager.instance.active.getCompactions().stream().filter(compaction -> compaction.getCompactionInfo().getTableMetadata().keyspace.equals(KEYSPACE)).count(); + int pendingCount = Keyspace.open(KEYSPACE).getColumnFamilyStores().stream().map(columnFamilyStore -> columnFamilyStore.getCompactionStrategyManager().getEstimatedRemainingTasks()).reduce(0, Integer::sum); + return Ints.checkedCast(activeCount + pendingCount); } protected int snapshot(String snapshotName) diff --git a/test/unit/org/apache/cassandra/index/sai/cql/AllowFilteringTest.java b/test/unit/org/apache/cassandra/index/sai/cql/AllowFilteringTest.java index 7a9198a7009a..3ae090518585 100644 --- a/test/unit/org/apache/cassandra/index/sai/cql/AllowFilteringTest.java +++ b/test/unit/org/apache/cassandra/index/sai/cql/AllowFilteringTest.java @@ -21,10 +21,14 @@ import org.junit.Test; import org.apache.cassandra.cql3.restrictions.StatementRestrictions; +import org.apache.cassandra.index.IndexBuildInProgressException; import org.apache.cassandra.index.sai.SAITester; import org.apache.cassandra.index.sai.StorageAttachedIndex; +import org.apache.cassandra.inject.Injections; +import org.apache.cassandra.inject.InvokePointBuilder; import static java.lang.String.format; +import static org.assertj.core.api.Assertions.assertThatThrownBy; import static org.junit.Assert.assertNotNull; /** @@ -391,4 +395,32 @@ private void test(String query, boolean requiresAllowFiltering) throws Throwable assertNotNull(execute(query + " ALLOW FILTERING")); } + private static final Injections.Barrier blockIndexBuild = Injections.newBarrier("block_index_build", 2, false) + .add(InvokePointBuilder.newInvokePoint() + .onClass(StorageAttachedIndex.class) + .onMethod("startInitialBuild")) + .build(); + + @Test + public void testAllowFilteringDuringIndexBuild() throws Throwable + { + createTable("CREATE TABLE %s (k int PRIMARY KEY, v int)"); + Injections.inject(blockIndexBuild); + String idx = createIndexAsync(String.format("CREATE CUSTOM INDEX ON %%s(v) USING '%s'", StorageAttachedIndex.class.getName())); + + String expectedErrorMessage = String.format(IndexBuildInProgressException.INDEX_BUILD_IN_PROGRESS_ERROR, idx); + assertThatThrownBy(() -> execute("SELECT * FROM %s WHERE v=0")) + .hasMessage(expectedErrorMessage) + .isInstanceOf(IndexBuildInProgressException.class); + + assertThatThrownBy(() -> execute("SELECT * FROM %s WHERE v=0 ALLOW FILTERING")) + .hasMessage(expectedErrorMessage) + .isInstanceOf(IndexBuildInProgressException.class); + + blockIndexBuild.countDown(); + blockIndexBuild.disable(); + waitForIndexQueryable(idx); + execute("SELECT * FROM %s WHERE v=0"); + execute("SELECT * FROM %s WHERE v=0 ALLOW FILTERING"); + } } diff --git a/test/unit/org/apache/cassandra/index/sai/cql/ComplexQueryTest.java b/test/unit/org/apache/cassandra/index/sai/cql/ComplexQueryTest.java index 0b4a053d1232..a0fbeab89f79 100644 --- a/test/unit/org/apache/cassandra/index/sai/cql/ComplexQueryTest.java +++ b/test/unit/org/apache/cassandra/index/sai/cql/ComplexQueryTest.java @@ -65,4 +65,48 @@ public void splitRowsWithBooleanLogic() var result = execute("SELECT pk FROM %s WHERE str_val = 'A' AND val = 'A'"); assertRows(result, row(3)); } + + @Test + public void compositeTypeWithMapInsideQuery() + { + createTable(KEYSPACE, "CREATE TABLE %s (" + + "pk1 frozenLongType,I=>ByteType,6=>LexicalUUIDType)'>>," + + "pk2 frozen>>>," + + "ck1 frozen>>>," + + "ck2 tinyint," + + "r1 frozenDecimalType,y=>TimestampType,f=>BooleanType)'>> static," + + "r2 'DynamicCompositeType(P=>ShortType)'," + + "r3 'CompositeType(FrozenType(ListType(DoubleType)),FrozenType(MapType(LongType,DoubleType)),DoubleType)'," + + "r4 frozen>>>," + + "r5 'CompositeType(CompositeType(ShortType,SimpleDateType,BooleanType),CompositeType(FloatType),MapType(ByteType,TimeType))'," + + "r6 set," + + "PRIMARY KEY ((pk1, pk2), ck1, ck2))"); + + + + createIndex("CREATE INDEX ON %s (FULL(ck1)) USING 'SAI'"); + createIndex("CREATE INDEX ON %s (FULL(pk1)) USING 'SAI'"); + createIndex("CREATE INDEX ON %s (FULL(r4)) USING 'SAI'"); + createIndex("CREATE INDEX ON %s (r2) USING 'SAI'"); + createIndex("CREATE INDEX ON %s (r3) USING 'SAI'"); + + + UntypedResultSet withMultipleColumns = execute("SELECT pk1 FROM " + + "%s " + + "WHERE r5 = 0x0010000230bd00000457f0bd31000001000000000700049f647252000000260000000200000001f300000008000001c4e14bba4b00000001260000000800003f2b300d385d00" + + " AND r3 = 0x001c00000002000000083380d171eace676900000008e153bb97fdd5c22e00006d000000030000000897c5493857999fc000000013f08cc4fad0f04d0de51cff28d4ae743d2da1c40000000857108e8c372c868400000013f0cc6bca55f0ee240b27ff12c77a7b7dc3c665000000086c07d25fcdd3403500000013f0745922bdf0ac44c9b5ffd80f025ded9a211d000008200547f5da7a43aa00" + + " AND r2 = 0x8050000255e200 " + + " AND pk2 = ((-1.2651989E-23))" + + " ALLOW FILTERING;"); + + assertRowCount(withMultipleColumns, 0); + + UntypedResultSet withoutSAI = execute("SELECT pk1 FROM " + + "%s " + + " WHERE r5 = 0x001c00000002000000083380d171eace676900000008e153bb97fdd5c22e00006d000000030000000897c5493857999fc000000013f08cc4fad0f04d0de51cff28d4ae743d2da1c40000000857108e8c372c868400000013f0cc6bca55f0ee240b27ff12c77a7b7dc3c665000000086c07d25fcdd3403500000013f0745922bdf0ac44c9b5ffd80f025ded9a211d000008200547f5da7a43aa00" + + " ALLOW FILTERING;"); + + + assertRowCount(withoutSAI, 0); + } } diff --git a/test/unit/org/apache/cassandra/index/sai/cql/CompositePartitionKeyIndexTest.java b/test/unit/org/apache/cassandra/index/sai/cql/CompositePartitionKeyIndexTest.java index 5d3c9e11d75a..7f9f7459157e 100644 --- a/test/unit/org/apache/cassandra/index/sai/cql/CompositePartitionKeyIndexTest.java +++ b/test/unit/org/apache/cassandra/index/sai/cql/CompositePartitionKeyIndexTest.java @@ -22,19 +22,110 @@ import org.junit.Test; import org.apache.cassandra.cql3.restrictions.StatementRestrictions; +import org.apache.cassandra.db.marshal.ByteType; import org.apache.cassandra.db.marshal.FloatType; +import org.apache.cassandra.db.marshal.IntegerType; import org.apache.cassandra.db.marshal.SimpleDateType; import org.apache.cassandra.db.marshal.TimeType; import org.apache.cassandra.db.marshal.UUIDType; +import org.apache.cassandra.distributed.test.cql3.SingleNodeTableWalkTest; import org.apache.cassandra.index.sai.SAITester; +import org.apache.cassandra.utils.ByteBufferUtil; public class CompositePartitionKeyIndexTest extends SAITester { + /** + * Originally discovered by {@link SingleNodeTableWalkTest} with the following seeds: + * 8837255108450816265 + * 1164443107607596330 + * -6614981692374717168 + * 1746205502103206170 + */ + @Test + public void testStaticAndNonStaticKeysOnFlush() throws Throwable + { + createTable("CREATE TABLE %s (pk0 tinyint, pk1 bigint, ck0 blob, s1 text static, s0 set static, v0 smallint, PRIMARY KEY ((pk0, pk1), ck0)) WITH CLUSTERING ORDER BY (ck0 DESC)"); + disableCompaction(KEYSPACE); + createIndex("CREATE INDEX tbl_pk0 ON %s(pk0) USING 'sai'"); + createIndex("CREATE INDEX tbl_pk1 ON %s(pk1) USING 'sai'"); + createIndex("CREATE INDEX tbl_s1 ON %s(s1) USING 'sai'"); + createIndex("CREATE INDEX tbl_v0 ON %s(v0) USING 'sai'"); + + execute("INSERT INTO %s (pk0, pk1, ck0, s1, s0, v0) VALUES (-62, -5815950741950477880, 0x326f, '켅\uF6EB憓ᤃ\uEF32ꝃ窰ŷ', {00000000-0000-4700-aa00-000000000000}, 19310) USING TIMESTAMP 1"); + execute("DELETE FROM %s USING TIMESTAMP 2 WHERE pk0 = 45 AND pk1 = 6014418364385708772 AND ck0 = 0x7c10"); + execute("DELETE FROM %s USING TIMESTAMP 3 WHERE pk0 = -41 AND pk1 = -3934225888295599640"); + execute("INSERT INTO %s (pk0, pk1, ck0, s1, s0, v0) " + + "VALUES (-64, 7973592261481566341, 0x0d, '\uE11B摻', {00000000-0000-4800-8900-000000000000, 00000000-0000-4900-8600-000000000000}, -23873) USING TIMESTAMP 4"); + flush(KEYSPACE); + + execute("UPDATE %s USING TIMESTAMP 5 SET v0=-359, s1='ل≻Ⱆ喡䮠?' WHERE pk0 = -64 AND pk1 = 7973592261481566341 AND ck0 = 0x99d570024de738f37877"); + execute("INSERT INTO %s (pk0, pk1, ck0, v0, s1, s0) " + + "VALUES (-104, -4990846884898776392, 0xf7ac771298eaf1d4, -6977, '凘纖볭菮⏏↶?蜑', null) USING TIMESTAMP 6"); + execute("INSERT INTO %s (pk0, pk1, ck0, s1, s0, v0) " + + "VALUES (-62, -5815950741950477880, 0x9277e744212e1c4b50, '\uF6AD瀛⛕徳倬糽ᢷ' + '雴', {00000000-0000-4700-b100-000000000000, 00000000-0000-4800-9300-000000000000}, 5423) USING TIMESTAMP 7"); + execute("DELETE FROM %s USING TIMESTAMP 8 WHERE pk0 = -64 AND pk1 = 7973592261481566341"); + flush(KEYSPACE); + + execute("DELETE s0, s1, s0 FROM %s USING TIMESTAMP 9 WHERE pk0 = -62 AND pk1 = -5815950741950477880"); + execute("DELETE FROM %s USING TIMESTAMP 10 WHERE pk0 = -41 AND pk1 = -3934225888295599640 AND ck0 = 0xd753dc3a473acaf665"); + execute("INSERT INTO %s (pk0, pk1, ck0, s1, s0, v0) " + + "VALUES (-62, -5815950741950477880, 0xd1e07b568a7188, 'ᑿ鼾戆' + '篐뵡?䰫', {00000000-0000-4500-b000-000000000000}, 17933) USING TIMESTAMP 11"); + execute("UPDATE %s USING TIMESTAMP 12 SET v0=null, s0={00000000-0000-4600-a000-000000000000, 00000000-0000-4d00-8200-000000000000, 00000000-0000-4f00-9200-000000000000} " + + "WHERE pk0 = -41 AND pk1 = -3934225888295599640 AND ck0 = 0x0dab3b038131efa2"); + + assertRowCount(execute("SELECT * FROM %s WHERE pk0 >= ? LIMIT 81", (byte) -104), 5); + execute("DELETE FROM %s USING TIMESTAMP 13 WHERE pk0 = -64 AND pk1 = 7973592261481566341"); + flush(KEYSPACE); + + beforeAndAfterFlush(() -> + assertRows(execute("SELECT pk0, pk1, ck0 FROM %s WHERE pk0 >= ?", (byte) -104), + row((byte) -62, -5815950741950477880L, ByteBufferUtil.hexToBytes("d1e07b568a7188")), + row((byte) -62, -5815950741950477880L, ByteBufferUtil.hexToBytes("9277e744212e1c4b50")), + row((byte) -62, -5815950741950477880L, ByteBufferUtil.hexToBytes("326f")), + row((byte) -104, -4990846884898776392L, ByteBufferUtil.hexToBytes("f7ac771298eaf1d4")), + row((byte) -41, -3934225888295599640L, null))); + } + + /** + * Originally discovered by {@link SingleNodeTableWalkTest} with seed -5732060315438955166 + */ + @Test + public void testIgnoreCellDeletions() throws Throwable + { + createTable("CREATE TABLE %s (pk0 boolean, pk1 varint, ck0 tinyint, ck1 varint, s0 list>> static, " + + " s1 map>, frozen>> static, v0 frozen>, uuid>>, " + + " PRIMARY KEY ((pk0, pk1), ck0, ck1)) WITH CLUSTERING ORDER BY (ck0 DESC, ck1 DESC)"); + disableCompaction(KEYSPACE); + createIndex("CREATE INDEX tbl_pk0 ON %s(pk0) USING 'sai'"); + + execute("INSERT INTO %s (pk0, pk1, ck0, ck1, s0, s1, v0) " + + "VALUES (true, 0, 109, 0, [{2.2352903520430565E260: -29214, 2.605618737869944E274: -13041}], " + + " {{00000000-0000-4400-9f00-000000000000, 00000000-0000-4500-9b00-000000000000, 00000000-0000-4b00-bf00-000000000000}: {'18.112.79.221': '-2306623-03-19', '227.58.183.116': '-3929454-04-25'}}, " + + " {{'⭎憢?', '黣偛紑'}: 00000000-0000-4900-8600-000000000000, {'㛽ꓗ', '剢ꮱ死䰀륬ਐ喑ퟚ', '竖䝏爐뷤曀'}: 00000000-0000-4900-bc00-000000000000}) USING TIMESTAMP 1"); + execute("INSERT INTO %s (pk0, pk1, ck0, ck1, s1, v0) " + + "VALUES (true, 0, 114, 742, {{00000000-0000-4000-9a00-000000000000, 00000000-0000-4700-ba00-000000000000}: {'96.31.70.25': '-912836-06-15', '185.90.18.173': '-5257542-01-31', '223.18.191.245': '-4633145-10-30'}}, " + + " {{'뫥㩎뎠ྭẒ'}: 00000000-0000-4800-8600-000000000000}) USING TIMESTAMP 2"); + + // This will result in the creation of erroneous postings if cell deletions are not accounted for: + execute("DELETE v0, s1, s0 FROM %s USING TIMESTAMP 6 WHERE pk0 = true AND pk1 = 0 AND ck0 = 121 AND ck1 = 1"); + + execute("UPDATE %s USING TIMESTAMP 8 SET s0 += [{4.3056056376102396E-169: 22551, 1.439623561042819E208: 20450}, {-2.7900719406964408E-242: 30147, 8.586565205109037E-211: 28721, 4.603864140847754E20: -12814}], " + + " s1 += {{00000000-0000-4200-b900-000000000000, 00000000-0000-4500-ab00-000000000000}: {'2.67.240.121': '-471656-04-17', '134.186.187.51': '-2056459-04-13'}}, " + + " v0={{'?', '蠥╩徰昰弳펠재', '됢簔Ὕ텇⢌យ稭澣'}: 00000000-0000-4d00-8d00-000000000000} " + + "WHERE pk0 = true AND pk1 = 0 AND ck0 = 37 AND ck1 = 0"); + + beforeAndAfterFlush(() -> + assertRows(execute("SELECT pk0, pk1, ck0, ck1 FROM %s WHERE pk0 = ? LIMIT 4", true), + row(true, IntegerType.instance.fromString("0"), ByteType.instance.fromString("114"), IntegerType.instance.fromString("742")), + row(true, IntegerType.instance.fromString("0"), ByteType.instance.fromString("109"), IntegerType.instance.fromString("0")), + row(true, IntegerType.instance.fromString("0"), ByteType.instance.fromString("37"), IntegerType.instance.fromString("0")))); + } + @Test public void testIntersectionOnMixedPostingsOnDelete() throws Throwable { createTable("CREATE TABLE %s (pk0 boolean, pk1 uuid, ck0 date, ck1 smallint, s0 timeuuid static, v0 bigint, v1 float, PRIMARY KEY ((pk0, pk1), ck0, ck1)) WITH CLUSTERING ORDER BY (ck0 DESC, ck1 ASC)"); - + disableCompaction(KEYSPACE); createIndex("CREATE INDEX tbl_pk0 ON %s(pk0) USING 'sai'"); createIndex("CREATE INDEX tbl_ck0 ON %s(ck0) USING 'sai'"); @@ -54,7 +145,7 @@ public void testIntersectionOnMixedPostingsOnDelete() throws Throwable public void testIntersectionOnMixedPostingsOnUpdate() throws Throwable { createTable("CREATE TABLE %s (pk0 boolean, pk1 uuid, ck0 date, ck1 smallint, s0 timeuuid static, v0 bigint, v1 float, PRIMARY KEY ((pk0, pk1), ck0, ck1)) WITH CLUSTERING ORDER BY (ck0 DESC, ck1 ASC)"); - + disableCompaction(KEYSPACE); createIndex("CREATE INDEX tbl_pk0 ON %s(pk0) USING 'sai'"); createIndex("CREATE INDEX tbl_ck0 ON %s(ck0) USING 'sai'"); @@ -74,6 +165,7 @@ public void testIntersectionOnMixedPostingsOnUpdate() throws Throwable public void testIntersectionWithStaticOverlap() throws Throwable { createTable("CREATE TABLE %s (pk0 int, pk1 int, ck0 int, s1 int static, v0 int, PRIMARY KEY((pk0, pk1), ck0))"); + disableCompaction(KEYSPACE); createIndex("CREATE INDEX ON %s(pk0) USING 'sai'"); execute("UPDATE %s USING TIMESTAMP 1 SET s1 = 0, v0 = 0 WHERE pk0 = 0 AND pk1 = 1 AND ck0 = 0"); @@ -91,6 +183,7 @@ public void testIntersectionWithStaticOverlap() throws Throwable public void testIntersectionWithStaticUpdate() throws Throwable { createTable("CREATE TABLE %s (pk0 time, pk1 varint, ck0 date, s0 boolean static, s1 text static, v0 boolean, PRIMARY KEY ((pk0, pk1), ck0))"); + disableCompaction(KEYSPACE); createIndex("CREATE INDEX tbl_pk0 ON %s(pk0) USING 'sai'"); createIndex("CREATE INDEX tbl_s0 ON %s(s0) USING 'sai'"); @@ -116,6 +209,7 @@ public void testIntersectionWithStaticUpdate() throws Throwable public void testCompositePartitionIndex() throws Throwable { createTable("CREATE TABLE %s (pk1 int, pk2 text, val int, PRIMARY KEY((pk1, pk2)))"); + disableCompaction(KEYSPACE); createIndex("CREATE INDEX ON %s(pk1) USING 'sai'"); createIndex("CREATE INDEX ON %s(pk2) USING 'sai'"); @@ -168,6 +262,7 @@ public void testCompositePartitionIndex() throws Throwable public void testFilterWithIndexForContains() throws Throwable { createTable("CREATE TABLE %s (k1 int, k2 int, v set, PRIMARY KEY ((k1, k2)))"); + disableCompaction(KEYSPACE); createIndex("CREATE INDEX ON %s(k2) USING 'sai'"); execute("INSERT INTO %s (k1, k2, v) VALUES (?, ?, ?)", 0, 0, set(1, 2, 3)); diff --git a/test/unit/org/apache/cassandra/index/sai/cql/IntraPartitionSkippingTest.java b/test/unit/org/apache/cassandra/index/sai/cql/IntraPartitionSkippingTest.java new file mode 100644 index 000000000000..a6537920a915 --- /dev/null +++ b/test/unit/org/apache/cassandra/index/sai/cql/IntraPartitionSkippingTest.java @@ -0,0 +1,318 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.index.sai.cql; + +import org.junit.Ignore; +import org.junit.Test; + +import org.HdrHistogram.Histogram; +import org.apache.cassandra.index.sai.SAITester; + +/** + * Tests for verifying intra-partition and partition-level skipping optimizations + * introduced in CASSANDRA-20191 for SAI. + *

    + * These tests validate that Cassandra can efficiently skip over rows + * within a partition using clustering filters (name and slice), paging, reversed order, + * and sparse matches. + *

    + * Each test documents a scenario where skipping logic is expected to apply along with few where it doesn't skip. + */ +public class IntraPartitionSkippingTest extends SAITester +{ + @Test + public void testNameFilterExactMatch() throws Throwable + { + createTable("CREATE TABLE %S (pk int, ck int, val text, PRIMARY KEY (pk, ck))"); + createIndex("CREATE INDEX ON %s(val) USING 'sai'"); + + for (int ck = 0; ck < 10; ck++) + { + execute("INSERT INTO %s (pk, ck, val) VALUES (?, ?, ?)", 1, ck, "val" + ck); + } + + beforeAndAfterFlush(() -> assertRows(execute("SELECT * FROM %s WHERE pk = 1 AND ck = 5 AND val = 'val5' ALLOW FILTERING"), + row(1, 5,"val5"))); + } + + @Test + public void testSliceFilterRangeMatch() throws Throwable + { + createTable("CREATE TABLE %S (pk int, ck int, val text, PRIMARY KEY (pk, ck))"); + createIndex("CREATE INDEX ON %s(val) USING 'sai'"); + + for (int ck = 0; ck < 100; ck++) + { + execute("INSERT INTO %s (pk, ck, val) VALUES (?, ?, ?)", 1, ck, "val" + ck); + } + + beforeAndAfterFlush(() -> assertRows(execute("SELECT * FROM %s WHERE pk = 1 AND ck > 90 AND val = 'val99' ALLOW FILTERING"), + row(1, 99,"val99"))); + } + + @Test + public void testReversedClustering() throws Throwable + { + createTable("CREATE TABLE %S (pk int, ck int, val text, PRIMARY KEY (pk, ck)) WITH CLUSTERING ORDER BY (ck DESC)"); + createIndex("CREATE INDEX ON %s(val) USING 'sai'"); + + for (int ck = 0; ck < 20; ck++) + { + execute("INSERT INTO %s (pk, ck, val) VALUES (?, ?, ?)", 1, ck, "val" + ck); + } + + beforeAndAfterFlush(() -> assertRows(execute("SELECT * FROM %s WHERE pk = 1 AND ck < 10 AND val = 'val5' ALLOW FILTERING"), + row(1,5,"val5"))); + } + + @Test + public void testSkippingWithPaging() throws Throwable + { + createTable("CREATE TABLE %S (pk int, ck int, val int, PRIMARY KEY (pk, ck))"); + + createIndex("CREATE INDEX ON %s(val) USING 'sai'"); + + for (int ck = 0; ck < 100; ck++) + { + int val = 1000 + ck; + execute("INSERT INTO %s (pk, ck, val) VALUES (?, ?, ?)", 1, ck, val); + } + + beforeAndAfterFlush(() -> assertRowsNet(executeNetWithPaging("SELECT * FROM %s WHERE pk = 1 AND ck > 90 AND val > 1090 ALLOW FILTERING", 5), + row(1, 91, 1091), + row(1, 92, 1092), + row(1, 93, 1093), + row(1, 94, 1094), + row(1, 95, 1095), + row(1, 96, 1096), + row(1, 97, 1097), + row(1, 98, 1098), + row(1, 99, 1099))); + } + + @Test + public void testCompositeClusteringKeySkipping() throws Throwable + { + createTable("CREATE TABLE %S (pk int, ck1 int, ck2 int, val text, PRIMARY KEY (pk, ck1, ck2))"); + createIndex("CREATE INDEX ON %s(val) USING 'sai'"); + + for (int ck1 = 0; ck1 < 10; ck1++) + for (int ck2 = 0; ck2 < 10; ck2++) + execute("INSERT INTO %s (pk, ck1, ck2, val) VALUES (?, ?, ?, ?)", 1, ck1, ck2, "v" + (ck1*10+ck2)); + + + beforeAndAfterFlush(() -> assertRows(execute("SELECT * FROM %s WHERE pk = 1 AND ck1 = 9 AND ck2 = 9 AND val = 'v99' ALLOW FILTERING"), + row(1,9,9,"v99"))); + + } + + @Test + public void testSparseMatch() throws Throwable + { + createTable("CREATE TABLE %S (pk int, ck int, val text, PRIMARY KEY (pk, ck))"); + createIndex("CREATE INDEX ON %s(val) USING 'sai'"); + + for (int ck = 0; ck < 1000; ck++) + { + String value = (ck % 450 == 0) ? "insert" : "skip"; + execute("INSERT INTO %s (pk, ck, val) VALUES (?, ?, ?)", 1, ck, value); + } + + beforeAndAfterFlush(() -> assertRows(execute("SELECT * FROM %s WHERE pk = 1 AND ck > 899 AND val = 'insert' ALLOW FILTERING"), + row(1,900,"insert"))); + + } + + @Test + public void testMultipleNameFilters() throws Throwable + { + createTable("CREATE TABLE %S (pk int, ck int, val text, PRIMARY KEY (pk, ck))"); + createIndex("CREATE INDEX ON %s(val) USING 'sai'"); + + for (int i = 0; i < 20; i++) + execute("INSERT INTO %s (pk, ck, val) VALUES (?, ?, ?)", 1, i, "v5"); + + beforeAndAfterFlush(() -> assertRows(execute("SELECT * FROM %s WHERE pk = 1 AND ck IN (5, 10, 15) AND val = 'v5' ALLOW FILTERING"), + row(1,5,"v5"), row(1,10,"v5"), row(1,15,"v5"))); + + } + + // Multiple partition range scans won't skip + @Test + public void testPartitionRangeSkipping() throws Throwable + { + createTable("CREATE TABLE %S (pk int, ck int, val text, PRIMARY KEY (pk, ck))"); + createIndex("CREATE INDEX ON %s(val) USING 'sai'"); + + for (int pk = 0; pk < 10; pk++) + for (int ck = 0; ck < 5; ck++) + execute("INSERT INTO %s (pk, ck, val) VALUES (?, ?, ?)", pk, ck, "value" + pk); + + beforeAndAfterFlush(() -> assertRows(execute("SELECT * FROM %s WHERE val = 'value9' AND ck > 2 ALLOW FILTERING"), + row(9,3,"value9"), row(9,4,"value9"))); + + } + + @Test + public void testStaticColumns() throws Throwable + { + createTable("CREATE TABLE %S (pk int, ck int, s text static, val text, PRIMARY KEY (pk, ck))"); + createIndex("CREATE INDEX ON %s(val) USING 'sai'"); + + execute("INSERT INTO %s (pk, s) VALUES (?, ?)", 1, "static1"); + + for (int ck = 0; ck < 200; ck++) + { + execute("INSERT INTO %s (pk, ck, val) VALUES (?, ?, ?)", 1, ck, "val" + ck); + } + + + // We will not skip + beforeAndAfterFlush(() -> assertRows(execute("SELECT * FROM %s WHERE pk = 1 AND ck > 100 AND s = 'static1' AND val = 'val101' ALLOW FILTERING"), + row(1,101,"static1","val101"))); + + // we will skip + beforeAndAfterFlush(() -> assertRows(execute("SELECT * FROM %s WHERE pk = 1 AND ck > 100 AND val = 'val101' ALLOW FILTERING"), + row(1,101,"static1","val101"))); + } + + @Test + public void testNextKeyClusteringIndexNamesFilter() throws Throwable + { + createTable("CREATE TABLE %S (" + + "pk int," + + "ck int," + + "v int," + + "PRIMARY KEY (pk, ck))"); + + createIndex("CREATE INDEX ON %s(v) USING 'sai'"); + + int pk = 1; + for (int ck = 0; ck < 10; ck++) + { + int v = ck + 1000; + execute("INSERT INTO %s (pk, ck, v) VALUES (?, ?, ?)", pk, ck, v); + } + + int pk1 = 2; + for (int ck = 0; ck < 100; ck++) + { + execute("INSERT INTO %s (pk, ck, v) VALUES (?, ?, ?)", pk1, ck, ck); + } + + beforeAndAfterFlush(() -> { + assertRows(execute("SELECT * FROM %s WHERE pk = 1 AND ck = 5 AND v > 1004 ALLOW FILTERING"), + row(1, 5, 1005)); + + assertRows(execute("SELECT * FROM %s WHERE pk = 1 AND ck = 5 AND v > 1004 AND v < 20000 ALLOW FILTERING"), + row(1, 5, 1005)); + }); + + + } + + // Performance testing test-cases and can be ingnored. + @Ignore ("performance test case for Index Slice filter.") + @Test + public void testNextKeyPerfClusteringIndexSliceFilter() + { + createTable("CREATE TABLE %S (" + + "pk int, " + + "ck int, " + + "val text, " + + "PRIMARY KEY (pk, ck))"); + + createIndex("CREATE INDEX ON %s(val) USING 'sai'"); + + int pk = 1; + for (int ck = 0; ck < 10000; ck++) + { + execute("INSERT INTO %s (pk, ck, val) VALUES (?, ?, ?)", pk, ck, "hello1"); + } + + int pk1 = 2; + for (int ck = 0; ck < 100; ck++) + { + execute("INSERT INTO %s (pk, ck, val) VALUES (?, ?, ?)", pk1, ck, "hello2"); + } + + Histogram histogram = new Histogram(4); + + + for (int i = 0; i < 10000; i++) + { + long start = System.nanoTime(); + execute("SELECT * FROM %s WHERE pk = 1 AND ck > 9000 AND val = 'hello1' ALLOW FILTERING"); + histogram.recordValue(System.nanoTime() - start); + + if (i % 1000 == 0) + { + System.out.println("50th: " + histogram.getValueAtPercentile(0.5)); + System.out.println("95th: " + histogram.getValueAtPercentile(0.95)); + System.out.println("99th: " + histogram.getValueAtPercentile(0.99)); + } + } + + } + + + @Ignore ("performance test case for Index Names filter.") + @Test + public void testNextKeyPerfClusteringIndexNamesFilter() + { + createTable("CREATE TABLE %S (" + + "pk int," + + "ck int," + + "v int," + + "PRIMARY KEY (pk, ck))"); + + createIndex("CREATE INDEX ON %s(v) USING 'sai'"); + + int pk = 1; + for (int ck = 0; ck < 20000; ck++) + { + int v = ck + 10; + execute("INSERT INTO %s (pk, ck, v) VALUES (?, ?, ?)", pk, ck, v); + } + + int pk1 = 2; + for (int ck = 0; ck < 100; ck++) + { + execute("INSERT INTO %s (pk, ck, v) VALUES (?, ?, ?)", pk1, ck, ck); + } + + Histogram histogram = new Histogram(4); + + for (int i = 0; i < 10000; i++) + { + long start = System.nanoTime(); + execute("SELECT * FROM %s WHERE pk = 1 AND ck = 15000 AND v > 9000 ALLOW FILTERING"); + histogram.recordValue(System.nanoTime() - start); + + if (i % 1000 == 0) + { + System.out.println("50th: " + histogram.getValueAtPercentile(0.5)); + System.out.println("95th: " + histogram.getValueAtPercentile(0.95)); + System.out.println("99th: " + histogram.getValueAtPercentile(0.99)); + } + } + + } + +} diff --git a/test/unit/org/apache/cassandra/index/sai/cql/StaticColumnIndexTest.java b/test/unit/org/apache/cassandra/index/sai/cql/StaticColumnIndexTest.java index faaf74f5c603..428cfe8ca856 100644 --- a/test/unit/org/apache/cassandra/index/sai/cql/StaticColumnIndexTest.java +++ b/test/unit/org/apache/cassandra/index/sai/cql/StaticColumnIndexTest.java @@ -20,6 +20,10 @@ import org.junit.Test; +import org.apache.cassandra.db.marshal.BytesType; +import org.apache.cassandra.db.marshal.TimeType; +import org.apache.cassandra.db.marshal.UUIDType; +import org.apache.cassandra.distributed.test.cql3.SingleNodeTableWalkTest; import org.apache.cassandra.index.sai.SAITester; public class StaticColumnIndexTest extends SAITester @@ -28,6 +32,7 @@ public class StaticColumnIndexTest extends SAITester public void staticIndexReturnsAllRowsInPartition() throws Throwable { createTable("CREATE TABLE %s (pk int, ck int, val1 int static, val2 int, PRIMARY KEY(pk, ck))"); + disableCompaction(KEYSPACE); createIndex("CREATE INDEX ON %s(val1) USING 'sai'"); execute("INSERT INTO %s(pk, ck, val1, val2) VALUES(?, ?, ?, ?)", 1, 1, 2, 1); @@ -42,6 +47,7 @@ public void staticIndexReturnsAllRowsInPartition() throws Throwable public void staticIndexAndNonStaticIndex() throws Throwable { createTable("CREATE TABLE %s (pk int, ck int, val1 int static, val2 int, PRIMARY KEY(pk, ck))"); + disableCompaction(KEYSPACE); createIndex("CREATE INDEX ON %s(val1) USING 'sai'"); createIndex("CREATE INDEX ON %s(val2) USING 'sai'"); @@ -57,6 +63,7 @@ public void staticIndexAndNonStaticIndex() throws Throwable public void staticAndNonStaticRangeIntersection() throws Throwable { createTable("CREATE TABLE %s (pk int, ck int, v1 int, s1 int static, PRIMARY KEY(pk, ck))"); + disableCompaction(KEYSPACE); createIndex("CREATE INDEX ON %s(v1) USING 'sai'"); createIndex("CREATE INDEX ON %s(s1) USING 'sai'"); @@ -71,4 +78,45 @@ public void staticAndNonStaticRangeIntersection() throws Throwable beforeAndAfterFlush(() -> assertRowCount(execute("SELECT * FROM %s WHERE pk = ? AND v1 > ? AND s1 = ?", 0, 2, 100), 3)); } + + /** + * Originally discovered by {@link SingleNodeTableWalkTest} with seed -464866883761188308 + */ + @Test + public void testTupleAndBlobFiltering() throws Throwable + { + String blobTupleType = createType("CREATE TYPE IF NOT EXISTS %s (f0 blob)"); + String boolTinyTextType = createType("CREATE TYPE IF NOT EXISTS %s (f0 boolean, f1 tinyint, f2 text)"); + createTable("CREATE TABLE %s (pk0 time, pk1 uuid, ck0 uuid, ck1 blob, s0 frozen>>> static, " + + " v0 vector, 3>, v1 frozen, vector>>, " + + " v2 vector, 2>, v3 bigint, PRIMARY KEY ((pk0, pk1), ck0, ck1)) WITH CLUSTERING ORDER BY (ck0 DESC, ck1 DESC)"); + disableCompaction(KEYSPACE); + createIndex("CREATE INDEX tbl_pk1 ON %s(pk1) USING 'sai'"); + createIndex("CREATE INDEX tbl_s0 ON %s(s0) USING 'sai'"); + + execute("INSERT INTO %s (pk0, pk1, ck0, ck1, s0, v0, v1, v2, v3) " + + "VALUES ('02:43:47.716011275', 00000000-0000-4200-b200-000000000000, 00000000-0000-4e00-8400-000000000000, 0xf2791941aea8e469, " + + " (12129, {-2.58545975E14}), [[-1781797567, 330686172], [103364202, 2031130152], [-550709009, 492544493]], " + + " {{f0: 0x34839b8bae653b2bdee8}: [-8431172225521461427, 8894719445990427242]}, [{f0: false, f1: 53, f2: '嵆왛孷쏆䊖恣'}, {f0: true, f1: 21, f2: 'ᨚ?榥쯢?ɚ챛ퟡ'}], 9167463065336786821) USING TIMESTAMP 3"); + execute("UPDATE %s USING TIMESTAMP 4 " + + "SET s0=(23307, {-8.214548E-18}), v0=[[672139924, -1253475201], [353181149, -1829076723], [179355765, 379303855]], " + + " v1={{f0: 0x64850696464d}: [-7485547085069825418, 7795885370802556756], {f0: 0x67633db6f091}: [-8484578637223040646, 8216210044102487771]}, " + + " v2=[{f0: true, f1: 68, f2: '䝿ᝧ䶨푥펟겭매郂쀌'}, {f0: true, f1: 98, f2: '髃爫삿챥卛☓읂ີ?'}], v3=-4626482462417652499 * -7377486305688263453 " + + "WHERE pk0 = '03:36:30.876439626' AND pk1 = 00000000-0000-4000-ad00-000000000000 AND ck0 = 00000000-0000-4000-9f00-000000000000 AND ck1 = 0xa06bb301"); + execute("INSERT INTO %s (pk0, pk1, ck0, ck1, s0, v0, v1, v2, v3) " + + "VALUES ('07:08:47.775161332', 00000000-0000-4800-ad00-000000000000, 00000000-0000-4a00-a500-000000000000, 0xfef0d63ff7, (-15283, {-1.132058E24, 2.9319742E-31}), " + + " [[-335960956, 678086816], [-2139882146, 1011627708], [-55338955, -2094185756]], {{f0: 0xd9c3ab}: [-9002034104664383537, -8074261670215737032]}, " + + " [{f0: true, f1: -79, f2: '霠♘칳⦵ঋ幗䶐'}, {f0: true, f1: 7, f2: '䉻ݹ鞞텔㙠'}], 1885613374025825905) USING TIMESTAMP 5"); + execute("DELETE FROM %s USING TIMESTAMP 6 WHERE pk0 = '14:02:14.975449434' AND pk1 = 00000000-0000-4900-9900-000000000000"); + execute("DELETE FROM %s USING TIMESTAMP 7 WHERE pk0 = '12:15:35.151327231' AND pk1 = 00000000-0000-4500-ac00-000000000000"); + execute("DELETE FROM %s USING TIMESTAMP 8 WHERE pk0 = '07:08:47.775161332' AND pk1 = 00000000-0000-4800-ad00-000000000000 AND ck0 = 00000000-0000-4b00-b000-000000000000 AND ck1 = 0xa4121adb08"); + execute("INSERT INTO %s (pk0, pk1, ck0, ck1, s0, v0, v1, v2, v3) " + + "VALUES ('03:36:30.876439626', 00000000-0000-4000-ad00-000000000000, 00000000-0000-4600-b400-000000000000, 0x63f5, (28387, {-1.18764904E-20}), " + + " [[-441895935, 313114446], [-740629531, -678512740], [1429899934, -1259907921]], {{f0: 0x5df1}: [414225888834712632, -5730196176171247108], " + + " {f0: 0x92c1497d7072b81c91}: [-7587541014989351350, -2813091340484612608]}, [{f0: true, f1: 41, f2: '쎺╇⒀왶'}, {f0: true, f1: -84, f2: '턺䋏篷'}], -1473884563651667176 + 128345915915881356) USING TIMESTAMP 9"); + + beforeAndAfterFlush(() -> assertRows(execute("SELECT pk0, pk1, ck0, ck1 FROM %s WHERE s0 = (28387, {-1.18764904E-20}) AND pk1 = 00000000-0000-4000-ad00-000000000000 AND ck1 = 0xa06bb301 LIMIT 307 ALLOW FILTERING"), + row(TimeType.instance.fromString("03:36:30.876439626"), UUIDType.instance.fromString("00000000-0000-4000-ad00-000000000000"), + UUIDType.instance.fromString("00000000-0000-4000-9f00-000000000000"), BytesType.instance.fromString("a06bb301")))); + } } diff --git a/test/unit/org/apache/cassandra/index/sai/cql/VectorTypeTest.java b/test/unit/org/apache/cassandra/index/sai/cql/VectorTypeTest.java index 1a0123067bbb..94c8b39b5155 100644 --- a/test/unit/org/apache/cassandra/index/sai/cql/VectorTypeTest.java +++ b/test/unit/org/apache/cassandra/index/sai/cql/VectorTypeTest.java @@ -551,7 +551,7 @@ private Collection keys(UntypedResultSet result) private Collection keysWithLowerBound(Collection keys, int leftKey, boolean leftInclusive) { return keysInTokenRange(keys, partitioner.getToken(Int32Type.instance.decompose(leftKey)), leftInclusive, - partitioner.getMaximumToken().getToken(), true); + partitioner.getMaximumTokenForSplitting().getToken(), true); } private Collection keysWithUpperBound(Collection keys, int rightKey, boolean rightInclusive) diff --git a/test/unit/org/apache/cassandra/index/sai/disk/v1/InvertedIndexSearcherTest.java b/test/unit/org/apache/cassandra/index/sai/disk/v1/InvertedIndexSearcherTest.java index 93045c80d6d1..93ee3f99cd4f 100644 --- a/test/unit/org/apache/cassandra/index/sai/disk/v1/InvertedIndexSearcherTest.java +++ b/test/unit/org/apache/cassandra/index/sai/disk/v1/InvertedIndexSearcherTest.java @@ -21,7 +21,6 @@ import java.nio.ByteBuffer; import java.util.List; -import org.junit.BeforeClass; import org.junit.Test; import com.carrotsearch.hppc.LongArrayList; @@ -34,18 +33,17 @@ import org.apache.cassandra.index.sai.QueryContext; import org.apache.cassandra.index.sai.SAITester; import org.apache.cassandra.index.sai.StorageAttachedIndex; -import org.apache.cassandra.index.sai.iterators.KeyRangeIterator; -import org.apache.cassandra.index.sai.memory.MemtableTermsIterator; import org.apache.cassandra.index.sai.disk.PrimaryKeyMap; import org.apache.cassandra.index.sai.disk.format.IndexDescriptor; import org.apache.cassandra.index.sai.disk.v1.segment.IndexSegmentSearcher; import org.apache.cassandra.index.sai.disk.v1.segment.LiteralIndexSegmentSearcher; import org.apache.cassandra.index.sai.disk.v1.segment.SegmentMetadata; import org.apache.cassandra.index.sai.disk.v1.trie.LiteralIndexWriter; +import org.apache.cassandra.index.sai.iterators.KeyRangeIterator; +import org.apache.cassandra.index.sai.memory.MemtableTermsIterator; import org.apache.cassandra.index.sai.plan.Expression; import org.apache.cassandra.index.sai.utils.PrimaryKey; import org.apache.cassandra.index.sai.utils.SAIRandomizedTester; -import org.apache.cassandra.service.StorageService; import org.apache.cassandra.utils.Pair; import org.apache.cassandra.utils.bytecomparable.ByteComparable; import org.apache.cassandra.utils.bytecomparable.ByteSource; @@ -92,13 +90,6 @@ public long floor(Token token) }; public static final PrimaryKeyMap.Factory TEST_PRIMARY_KEY_MAP_FACTORY = () -> TEST_PRIMARY_KEY_MAP; - @BeforeClass - public static void setupCQLTester() - { - DatabaseDescriptor.setPartitionerUnsafe(Murmur3Partitioner.instance); - StorageService.instance.setPartitionerUnsafe(Murmur3Partitioner.instance); - } - @Test public void testEqQueriesAgainstStringIndex() throws Exception { @@ -194,7 +185,7 @@ private IndexSegmentSearcher buildIndexAndOpenSearcher(StorageAttachedIndex inde 0, Long.MAX_VALUE, SAITester.TEST_FACTORY.create(DatabaseDescriptor.getPartitioner().getMinimumToken()), - SAITester.TEST_FACTORY.create(DatabaseDescriptor.getPartitioner().getMaximumToken()), + SAITester.TEST_FACTORY.create(DatabaseDescriptor.getPartitioner().getMaximumTokenForSplitting()), wrap(termsEnum.get(0).left), wrap(termsEnum.get(terms - 1).left), indexMetas); diff --git a/test/unit/org/apache/cassandra/index/sai/disk/v1/SegmentFlushTest.java b/test/unit/org/apache/cassandra/index/sai/disk/v1/SegmentFlushTest.java index f0d1eae642f0..e580fffe012a 100644 --- a/test/unit/org/apache/cassandra/index/sai/disk/v1/SegmentFlushTest.java +++ b/test/unit/org/apache/cassandra/index/sai/disk/v1/SegmentFlushTest.java @@ -112,7 +112,7 @@ private void testFlushBetweenRowIds(long sstableRowId1, long sstableRowId2, int Murmur3Partitioner.instance, SAITester.EMPTY_COMPARATOR); - ColumnMetadata column = ColumnMetadata.regularColumn("sai", "internal", "column", UTF8Type.instance); + ColumnMetadata column = ColumnMetadata.regularColumn("sai", "internal", "column", UTF8Type.instance, ColumnMetadata.NO_UNIQUE_ID); StorageAttachedIndex index = SAITester.createMockIndex(column); diff --git a/test/unit/org/apache/cassandra/index/sai/disk/v1/SegmentTest.java b/test/unit/org/apache/cassandra/index/sai/disk/v1/SegmentTest.java index 6a0869fbfd6e..d3d22e2489fd 100644 --- a/test/unit/org/apache/cassandra/index/sai/disk/v1/SegmentTest.java +++ b/test/unit/org/apache/cassandra/index/sai/disk/v1/SegmentTest.java @@ -50,7 +50,7 @@ public static void init() DatabaseDescriptor.setPartitionerUnsafe(Murmur3Partitioner.instance); partitioner = DatabaseDescriptor.getPartitioner(); min = partitioner.getMinimumToken(); - max = partitioner.getMaximumToken(); + max = partitioner.getMaximumTokenForSplitting(); tokens = IntStream.rangeClosed(0, 10).boxed().map(i -> partitioner.getRandomToken()) .distinct().sorted().collect(Collectors.toList()); } diff --git a/test/unit/org/apache/cassandra/index/sai/memory/TrieMemoryIndexTest.java b/test/unit/org/apache/cassandra/index/sai/memory/TrieMemoryIndexTest.java index 0ab4c846f754..963742c02c1c 100644 --- a/test/unit/org/apache/cassandra/index/sai/memory/TrieMemoryIndexTest.java +++ b/test/unit/org/apache/cassandra/index/sai/memory/TrieMemoryIndexTest.java @@ -29,8 +29,10 @@ import java.util.function.IntFunction; import java.util.stream.Collectors; +import org.junit.Ignore; import org.junit.Test; +import org.HdrHistogram.Histogram; import org.apache.cassandra.cql3.Operator; import org.apache.cassandra.cql3.statements.schema.IndexTarget; import org.apache.cassandra.db.Clustering; @@ -243,4 +245,28 @@ private TrieMemoryIndex newTrieMemoryIndex(AbstractType columnType) index = new StorageAttachedIndex(cfs, indexMetadata); return new TrieMemoryIndex(index); } + + @Ignore + @Test + public void testMemtableRangeQueryPerformance() + { + createTable("CREATE TABLE %S (pk int, ck int, val int, PRIMARY KEY (pk, ck))"); + createIndex("CREATE INDEX ON %s(val) USING 'sai'"); + + for (int pk = 0; pk < 20; pk++) + for (int ck = 0; ck < 10000; ck++) + execute("INSERT INTO %s (pk, ck, val) VALUES (?, ?, ?)", pk, ck, ck); + + Histogram histogram = new Histogram(4); + + for (int i = 0; i < 20000; i++) + { + long start = System.nanoTime(); + execute("SELECT * FROM %s WHERE pk = 5 AND val > ? LIMIT 10", 4000); + histogram.recordValue(System.nanoTime() - start); + } + + System.out.println("50th: " + histogram.getValueAtPercentile(0.5)); + System.out.println("99th: " + histogram.getValueAtPercentile(0.99)); + } } diff --git a/test/unit/org/apache/cassandra/index/sai/utils/IndexTermTypeTest.java b/test/unit/org/apache/cassandra/index/sai/utils/IndexTermTypeTest.java index 315adfdac5e8..4421c8c66d24 100644 --- a/test/unit/org/apache/cassandra/index/sai/utils/IndexTermTypeTest.java +++ b/test/unit/org/apache/cassandra/index/sai/utils/IndexTermTypeTest.java @@ -195,7 +195,7 @@ private static IndexTermType indexTermType(AbstractType type, IndexTarget.Typ private static ColumnMetadata column(AbstractType type) { - return ColumnMetadata.regularColumn("ks", "cf", "col", type); + return ColumnMetadata.regularColumn("ks", "cf", "col", type, ColumnMetadata.NO_UNIQUE_ID); } @Test diff --git a/test/unit/org/apache/cassandra/index/sasi/SASIIndexTest.java b/test/unit/org/apache/cassandra/index/sasi/SASIIndexTest.java index 70da2a0ab6d6..464775c26689 100644 --- a/test/unit/org/apache/cassandra/index/sasi/SASIIndexTest.java +++ b/test/unit/org/apache/cassandra/index/sasi/SASIIndexTest.java @@ -1791,7 +1791,7 @@ public int compareCustom(VL left, ValueAccessor accessorL, VR right }; // first let's check that we get 'false' for 'isLiteral' if we don't set the option with special comparator - ColumnMetadata columnA = ColumnMetadata.regularColumn(KS_NAME, CF_NAME, "special-A", stringType); + ColumnMetadata columnA = ColumnMetadata.regularColumn(KS_NAME, CF_NAME, "special-A", stringType, ColumnMetadata.NO_UNIQUE_ID); ColumnIndex indexA = new ColumnIndex(UTF8Type.instance, columnA, IndexMetadata.fromSchemaMetadata("special-index-A", IndexMetadata.Kind.CUSTOM, new HashMap() {{ @@ -1802,7 +1802,7 @@ public int compareCustom(VL left, ValueAccessor accessorL, VR right Assert.assertFalse(indexA.isLiteral()); // now let's double-check that we do get 'true' when we set it - ColumnMetadata columnB = ColumnMetadata.regularColumn(KS_NAME, CF_NAME, "special-B", stringType); + ColumnMetadata columnB = ColumnMetadata.regularColumn(KS_NAME, CF_NAME, "special-B", stringType, ColumnMetadata.NO_UNIQUE_ID); ColumnIndex indexB = new ColumnIndex(UTF8Type.instance, columnB, IndexMetadata.fromSchemaMetadata("special-index-B", IndexMetadata.Kind.CUSTOM, new HashMap() {{ @@ -1814,7 +1814,7 @@ public int compareCustom(VL left, ValueAccessor accessorL, VR right Assert.assertTrue(indexB.isLiteral()); // and finally we should also get a 'true' if it's built-in UTF-8/ASCII comparator - ColumnMetadata columnC = ColumnMetadata.regularColumn(KS_NAME, CF_NAME, "special-C", UTF8Type.instance); + ColumnMetadata columnC = ColumnMetadata.regularColumn(KS_NAME, CF_NAME, "special-C", UTF8Type.instance, ColumnMetadata.NO_UNIQUE_ID); ColumnIndex indexC = new ColumnIndex(UTF8Type.instance, columnC, IndexMetadata.fromSchemaMetadata("special-index-C", IndexMetadata.Kind.CUSTOM, new HashMap() {{ @@ -1824,7 +1824,7 @@ public int compareCustom(VL left, ValueAccessor accessorL, VR right Assert.assertTrue(indexC.isIndexed()); Assert.assertTrue(indexC.isLiteral()); - ColumnMetadata columnD = ColumnMetadata.regularColumn(KS_NAME, CF_NAME, "special-D", AsciiType.instance); + ColumnMetadata columnD = ColumnMetadata.regularColumn(KS_NAME, CF_NAME, "special-D", AsciiType.instance, ColumnMetadata.NO_UNIQUE_ID); ColumnIndex indexD = new ColumnIndex(UTF8Type.instance, columnD, IndexMetadata.fromSchemaMetadata("special-index-D", IndexMetadata.Kind.CUSTOM, new HashMap() {{ @@ -1835,7 +1835,7 @@ public int compareCustom(VL left, ValueAccessor accessorL, VR right Assert.assertTrue(indexD.isLiteral()); // and option should supersedes the comparator type - ColumnMetadata columnE = ColumnMetadata.regularColumn(KS_NAME, CF_NAME, "special-E", UTF8Type.instance); + ColumnMetadata columnE = ColumnMetadata.regularColumn(KS_NAME, CF_NAME, "special-E", UTF8Type.instance, ColumnMetadata.NO_UNIQUE_ID); ColumnIndex indexE = new ColumnIndex(UTF8Type.instance, columnE, IndexMetadata.fromSchemaMetadata("special-index-E", IndexMetadata.Kind.CUSTOM, new HashMap() {{ @@ -1850,7 +1850,8 @@ public int compareCustom(VL left, ValueAccessor accessorL, VR right ColumnMetadata columnF = ColumnMetadata.regularColumn(KS_NAME, CF_NAME, "special-F", - ListType.getInstance(UTF8Type.instance, false)); + ListType.getInstance(UTF8Type.instance, false), + ColumnMetadata.NO_UNIQUE_ID); ColumnIndex indexF = new ColumnIndex(UTF8Type.instance, columnF, IndexMetadata.fromSchemaMetadata("special-index-F", IndexMetadata.Kind.CUSTOM, new HashMap() {{ diff --git a/test/unit/org/apache/cassandra/index/sasi/analyzer/DelimiterAnalyzerTest.java b/test/unit/org/apache/cassandra/index/sasi/analyzer/DelimiterAnalyzerTest.java index 0cb57a607b0f..bebf757ed72e 100644 --- a/test/unit/org/apache/cassandra/index/sasi/analyzer/DelimiterAnalyzerTest.java +++ b/test/unit/org/apache/cassandra/index/sasi/analyzer/DelimiterAnalyzerTest.java @@ -90,14 +90,14 @@ public void testBlankEntries() throws Exception public void ensureIncompatibleInputOnCollectionTypeSkipped() { new DelimiterAnalyzer().validate(Collections.emptyMap(), - ColumnMetadata.regularColumn("a", "b", "c", SetType.getInstance(UTF8Type.instance, true))); + ColumnMetadata.regularColumn("a", "b", "c", SetType.getInstance(UTF8Type.instance, true), ColumnMetadata.NO_UNIQUE_ID)); } @Test(expected = ConfigurationException.class) public void ensureIncompatibleInputSkipped() { new DelimiterAnalyzer().validate(Collections.emptyMap(), - ColumnMetadata.regularColumn("a", "b", "c", Int32Type.instance)); + ColumnMetadata.regularColumn("a", "b", "c", Int32Type.instance, ColumnMetadata.NO_UNIQUE_ID)); } @Test diff --git a/test/unit/org/apache/cassandra/index/sasi/conf/IndexModeTest.java b/test/unit/org/apache/cassandra/index/sasi/conf/IndexModeTest.java index 1cea46952072..848fc5cf1cf5 100644 --- a/test/unit/org/apache/cassandra/index/sasi/conf/IndexModeTest.java +++ b/test/unit/org/apache/cassandra/index/sasi/conf/IndexModeTest.java @@ -65,7 +65,7 @@ public void test_bad_mode_option() @Test public void test_asciiType() { - ColumnMetadata cd = ColumnMetadata.regularColumn(cfm, ByteBufferUtil.bytes("TestColumnMetadata"), AsciiType.instance); + ColumnMetadata cd = ColumnMetadata.regularColumn(cfm, ByteBufferUtil.bytes("TestColumnMetadata"), AsciiType.instance, ColumnMetadata.NO_UNIQUE_ID); IndexMode result = IndexMode.getMode(cd, Collections.singletonMap("something", "nothing")); Assert.assertNull(result.analyzerClass); @@ -78,7 +78,7 @@ public void test_asciiType() @Test public void test_asciiType_notLiteral() { - ColumnMetadata cd = ColumnMetadata.regularColumn(cfm, ByteBufferUtil.bytes("TestColumnMetadata"), AsciiType.instance); + ColumnMetadata cd = ColumnMetadata.regularColumn(cfm, ByteBufferUtil.bytes("TestColumnMetadata"), AsciiType.instance, ColumnMetadata.NO_UNIQUE_ID); IndexMode result = IndexMode.getMode(cd, Collections.singletonMap("is_literal", "false")); Assert.assertNull(result.analyzerClass); @@ -91,7 +91,7 @@ public void test_asciiType_notLiteral() @Test public void test_asciiType_errLiteral() { - ColumnMetadata cd = ColumnMetadata.regularColumn(cfm, ByteBufferUtil.bytes("TestColumnMetadata"), AsciiType.instance); + ColumnMetadata cd = ColumnMetadata.regularColumn(cfm, ByteBufferUtil.bytes("TestColumnMetadata"), AsciiType.instance, ColumnMetadata.NO_UNIQUE_ID); IndexMode result = IndexMode.getMode(cd, Collections.singletonMap("is_literal", "junk")); Assert.assertNull(result.analyzerClass); @@ -104,7 +104,7 @@ public void test_asciiType_errLiteral() @Test public void test_asciiType_analyzed() { - ColumnMetadata cd = ColumnMetadata.regularColumn(cfm, ByteBufferUtil.bytes("TestColumnMetadata"), AsciiType.instance); + ColumnMetadata cd = ColumnMetadata.regularColumn(cfm, ByteBufferUtil.bytes("TestColumnMetadata"), AsciiType.instance, ColumnMetadata.NO_UNIQUE_ID); IndexMode result = IndexMode.getMode(cd, Collections.singletonMap("analyzed", "true")); Assert.assertNull(result.analyzerClass); @@ -117,7 +117,7 @@ public void test_asciiType_analyzed() @Test public void test_utf8Type() { - ColumnMetadata cd = ColumnMetadata.regularColumn(cfm, ByteBufferUtil.bytes("TestColumnMetadata"), UTF8Type.instance); + ColumnMetadata cd = ColumnMetadata.regularColumn(cfm, ByteBufferUtil.bytes("TestColumnMetadata"), UTF8Type.instance, ColumnMetadata.NO_UNIQUE_ID); IndexMode result = IndexMode.getMode(cd, Collections.singletonMap("something", "nothing")); Assert.assertNull(result.analyzerClass); @@ -130,7 +130,7 @@ public void test_utf8Type() @Test public void test_bytesType() { - ColumnMetadata cd = ColumnMetadata.regularColumn(cfm, ByteBufferUtil.bytes("TestColumnMetadata"), BytesType.instance); + ColumnMetadata cd = ColumnMetadata.regularColumn(cfm, ByteBufferUtil.bytes("TestColumnMetadata"), BytesType.instance, ColumnMetadata.NO_UNIQUE_ID); IndexMode result = IndexMode.getMode(cd, Collections.singletonMap("something", "nothing")); Assert.assertNull(result.analyzerClass); @@ -143,7 +143,7 @@ public void test_bytesType() @Test public void test_bytesType_isLiteral() { - ColumnMetadata cd = ColumnMetadata.regularColumn(cfm, ByteBufferUtil.bytes("TestColumnMetadata"), BytesType.instance); + ColumnMetadata cd = ColumnMetadata.regularColumn(cfm, ByteBufferUtil.bytes("TestColumnMetadata"), BytesType.instance, ColumnMetadata.NO_UNIQUE_ID); IndexMode result = IndexMode.getMode(cd, Collections.singletonMap("is_literal", "true")); Assert.assertNull(result.analyzerClass); @@ -156,7 +156,7 @@ public void test_bytesType_isLiteral() @Test public void test_bytesType_errLiteral() { - ColumnMetadata cd = ColumnMetadata.regularColumn(cfm, ByteBufferUtil.bytes("TestColumnMetadata"), BytesType.instance); + ColumnMetadata cd = ColumnMetadata.regularColumn(cfm, ByteBufferUtil.bytes("TestColumnMetadata"), BytesType.instance, ColumnMetadata.NO_UNIQUE_ID); IndexMode result = IndexMode.getMode(cd, Collections.singletonMap("is_literal", "junk")); Assert.assertNull(result.analyzerClass); @@ -169,7 +169,7 @@ public void test_bytesType_errLiteral() @Test public void test_bytesType_analyzed() { - ColumnMetadata cd = ColumnMetadata.regularColumn(cfm, ByteBufferUtil.bytes("TestColumnMetadata"), BytesType.instance); + ColumnMetadata cd = ColumnMetadata.regularColumn(cfm, ByteBufferUtil.bytes("TestColumnMetadata"), BytesType.instance, ColumnMetadata.NO_UNIQUE_ID); IndexMode result = IndexMode.getMode(cd, Collections.singletonMap("analyzed", "true")); Assert.assertNull(result.analyzerClass); @@ -182,7 +182,7 @@ public void test_bytesType_analyzed() @Test public void test_bytesType_analyzer() { - ColumnMetadata cd = ColumnMetadata.regularColumn(cfm, ByteBufferUtil.bytes("TestColumnMetadata"), BytesType.instance); + ColumnMetadata cd = ColumnMetadata.regularColumn(cfm, ByteBufferUtil.bytes("TestColumnMetadata"), BytesType.instance, ColumnMetadata.NO_UNIQUE_ID); IndexMode result = IndexMode.getMode(cd, Collections.singletonMap("analyzer_class", "java.lang.Object")); Assert.assertEquals(Object.class, result.analyzerClass); @@ -195,7 +195,7 @@ public void test_bytesType_analyzer() @Test public void test_bytesType_analyzer_unanalyzed() { - ColumnMetadata cd = ColumnMetadata.regularColumn(cfm, ByteBufferUtil.bytes("TestColumnMetadata"), BytesType.instance); + ColumnMetadata cd = ColumnMetadata.regularColumn(cfm, ByteBufferUtil.bytes("TestColumnMetadata"), BytesType.instance, ColumnMetadata.NO_UNIQUE_ID); IndexMode result = IndexMode.getMode(cd, ImmutableMap.of("analyzer_class", "java.lang.Object", "analyzed", "false")); @@ -210,7 +210,7 @@ public void test_bytesType_analyzer_unanalyzed() @Test public void test_bytesType_maxCompactionFlushMemoryInBytes() { - ColumnMetadata cd = ColumnMetadata.regularColumn(cfm, ByteBufferUtil.bytes("TestColumnMetadata"), BytesType.instance); + ColumnMetadata cd = ColumnMetadata.regularColumn(cfm, ByteBufferUtil.bytes("TestColumnMetadata"), BytesType.instance, ColumnMetadata.NO_UNIQUE_ID); IndexMode result = IndexMode.getMode(cd, Collections.singletonMap("max_compaction_flush_memory_in_mb", "1")); Assert.assertNull(result.analyzerClass); diff --git a/test/unit/org/apache/cassandra/io/FileSystemListeners.java b/test/unit/org/apache/cassandra/io/FileSystemListeners.java new file mode 100644 index 000000000000..2e3f784717d2 --- /dev/null +++ b/test/unit/org/apache/cassandra/io/FileSystemListeners.java @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.io; + +import java.nio.channels.FileChannel; +import java.nio.file.OpenOption; +import java.nio.file.Path; +import java.nio.file.attribute.FileAttribute; +import java.util.Collections; +import java.util.HashMap; +import java.util.IdentityHashMap; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; +import javax.annotation.Nullable; + +import org.apache.cassandra.io.filesystem.ListenableFileSystem; + +/** + * Listeners to aid in debugging and monitoring of disk issues. The class is normally unused as its target is to help + * debug issues, for this reason its likely dead code most of the time... it's here when you need it! + */ +@SuppressWarnings("unused") +public class FileSystemListeners +{ + /** + * Adds a listener to {@link ListenableFileSystem} that will track open/close/delete operations. If a path delete + * is detected before the file is closed, this will throw an exception as this could be due to a possible filesystem leak. + * A filesystem leak can be very dangerous as the allocated disk space keeps growing and growing, but when you look + * at the file system you won't see all the space as taken; the files were deleted but since the file handles are still + * open the kernel will keep the data around. + */ + public static class FileLeak implements ListenableFileSystem.OnPostOpen, ListenableFileSystem.OnPreClose, ListenableFileSystem.OnPreDelete + { + @Nullable + private final IdentityHashMap whereDidChannelsComeFrom; + private final Map> pathToChannels = new HashMap<>(); + + public FileLeak(boolean trackOpenSource) + { + whereDidChannelsComeFrom = trackOpenSource ? new IdentityHashMap<>() : null; + } + + public FileLeak() + { + this(false); + } + + @Override + public synchronized void postOpen(Path path, + Set options, + FileAttribute[] attrs, + FileChannel channel) + { + pathToChannels.computeIfAbsent(path, i -> Collections.newSetFromMap(new IdentityHashMap<>())).add(channel); + if (whereDidChannelsComeFrom != null) + whereDidChannelsComeFrom.put(channel, new Throwable("here")); + } + + @Override + public synchronized void preClose(Path path, FileChannel channel) + { + Set channels = pathToChannels.get(path); + if (channels == null || !channels.remove(channel)) + return; // listener was added after the open? + if (channels.isEmpty()) + pathToChannels.remove(path); + } + + @Override + public synchronized void preDelete(Path path) + { + var cs = pathToChannels.get(path); + if (cs == null) return; + AssertionError e = new AssertionError("File leak (delete before close) detected on path " + path + "; " + cs.size() + " open handels detected"); + if (whereDidChannelsComeFrom != null) + { + var sources = cs.stream().map(whereDidChannelsComeFrom::get).collect(Collectors.toList()); + sources.forEach(e::addSuppressed); + + } + throw e; + } + } +} diff --git a/test/unit/org/apache/cassandra/io/IVersionedSerializers.java b/test/unit/org/apache/cassandra/io/IVersionedSerializers.java index e17e0b7ce2ea..cbf62fb58c07 100644 --- a/test/unit/org/apache/cassandra/io/IVersionedSerializers.java +++ b/test/unit/org/apache/cassandra/io/IVersionedSerializers.java @@ -20,6 +20,8 @@ import java.io.IOException; +import accord.utils.LazyToString; +import accord.utils.ReflectionUtils; import org.apache.cassandra.io.util.DataInputBuffer; import org.apache.cassandra.io.util.DataOutputBuffer; import org.assertj.core.api.Assertions; @@ -34,6 +36,6 @@ public static void testSerde(DataOutputBuffer output, IVersionedSerializer ReflectionUtils.recursiveEquals(read, input).toString())).isEqualTo(input); } } \ No newline at end of file diff --git a/test/unit/org/apache/cassandra/io/Serializers.java b/test/unit/org/apache/cassandra/io/Serializers.java new file mode 100644 index 000000000000..829d51c395f5 --- /dev/null +++ b/test/unit/org/apache/cassandra/io/Serializers.java @@ -0,0 +1,126 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.io; + +import java.io.IOException; +import java.nio.ByteBuffer; + +import accord.utils.LazyToString; +import accord.utils.ReflectionUtils; +import org.apache.cassandra.io.util.DataInputBuffer; +import org.apache.cassandra.io.util.DataOutputBuffer; +import org.assertj.core.api.Assertions; + +public class Serializers +{ + // When using a shard buffer the following is the recommend thing to copy/paste + // @SuppressWarnings({ "resource", "IOResourceOpenedButNotSafelyClosed" }) DataOutputBuffer output = new DataOutputBuffer(); + + public static void testSerde(DataOutputBuffer output, AsymmetricUnversionedSerializer serializer, T input) throws IOException + { + output.clear(); + long expectedSize = serializer.serializedSize(input); + serializer.serialize(input, output); + Assertions.assertThat(output.getLength()).describedAs("The serialized size and bytes written do not match").isEqualTo(expectedSize); + ByteBuffer buffer = output.unsafeGetBufferAndFlip(); + DataInputBuffer in = new DataInputBuffer(buffer, false); + T read = serializer.deserialize(in); + Assertions.assertThat(read).describedAs("The deserialized output does not match the serialized input; difference %s", new LazyToString(() -> ReflectionUtils.recursiveEquals(read, input).toString())).isEqualTo(input); + Assertions.assertThat(buffer.remaining()).describedAs("deserialize did not consume all the serialized input").isEqualTo(0); + buffer.flip(); + serializer.skip(in); + Assertions.assertThat(buffer.remaining()).describedAs("skip did not consume all the serialized input").isEqualTo(0); + } + + public static void testSerde(DataOutputBuffer output, ParameterisedUnversionedSerializer serializer, T input, P p) throws IOException + { + output.clear(); + long expectedSize = serializer.serializedSize(input, p); + serializer.serialize(input, p, output); + Assertions.assertThat(output.getLength()).describedAs("The serialized size and bytes written do not match").isEqualTo(expectedSize); + DataInputBuffer in = new DataInputBuffer(output.unsafeGetBufferAndFlip(), false); + T read = serializer.deserialize(p, in); + Assertions.assertThat(read).describedAs("The deserialized output does not match the serialized input; difference %s", new LazyToString(() -> ReflectionUtils.recursiveEquals(read, input).toString())).isEqualTo(input); + } + + public static void testSerde(DataOutputBuffer output, ParameterisedVersionedSerializer serializer, T input, P p, Version version) throws IOException + { + output.clear(); + long expectedSize = serializer.serializedSize(input, p, version); + serializer.serialize(input, p, output, version); + Assertions.assertThat(output.getLength()).describedAs("The serialized size and bytes written do not match").isEqualTo(expectedSize); + DataInputBuffer in = new DataInputBuffer(output.unsafeGetBufferAndFlip(), false); + T read = serializer.deserialize(p, in, version); + Assertions.assertThat(read).describedAs("The deserialized output does not match the serialized input; difference %s", new LazyToString(() -> ReflectionUtils.recursiveEquals(read, input).toString())).isEqualTo(input); + } + + public static void testSerde(AsymmetricUnversionedSerializer serializer, T input) throws IOException + { + try (DataOutputBuffer output = new DataOutputBuffer(Math.toIntExact(serializer.serializedSize(input)))) + { + testSerde(output, serializer, input); + } + } + + public static void testSerde(DataOutputBuffer output, IVersionedAsymmetricSerializer serializer, T input, int version) throws IOException + { + output.clear(); + long expectedSize = serializer.serializedSize(input, version); + serializer.serialize(input, output, version); + Assertions.assertThat(output.getLength()).describedAs("The serialized size and bytes written do not match").isEqualTo(expectedSize); + DataInputBuffer in = new DataInputBuffer(output.unsafeGetBufferAndFlip(), false); + T read = serializer.deserialize(in, version); + Assertions.assertThat(read).describedAs("The deserialized output does not match the serialized input; difference %s", new LazyToString(() -> ReflectionUtils.recursiveEquals(read, input).toString())).isEqualTo(input); + } + + public static void testSerde(IVersionedAsymmetricSerializer serializer, T input, int version) throws IOException + { + try (DataOutputBuffer output = new DataOutputBuffer(Math.toIntExact(serializer.serializedSize(input, version)))) + { + testSerde(output, serializer, input, version); + } + } + + public static void testSerde(DataOutputBuffer output, AsymmetricVersionedSerializer serializer, T input, Version version) throws IOException + { + output.clear(); + long expectedSize = serializer.serializedSize(input, version); + serializer.serialize(input, output, version); + Assertions.assertThat(output.getLength()).describedAs("The serialized size and bytes written do not match").isEqualTo(expectedSize); + DataInputBuffer in = new DataInputBuffer(output.unsafeGetBufferAndFlip(), false); + T read = serializer.deserialize(in, version); + Assertions.assertThat(read).describedAs("The deserialized output does not match the serialized input; difference %s", new LazyToString(() -> ReflectionUtils.recursiveEquals(read, input).toString())).isEqualTo(input); + } + + public static void testSerde(AsymmetricVersionedSerializer serializer, T input, Version version) throws IOException + { + try (DataOutputBuffer output = new DataOutputBuffer(Math.toIntExact(serializer.serializedSize(input, version)))) + { + testSerde(output, serializer, input, version); + } + } + + public static void testSerde(ParameterisedVersionedSerializer serializer, T input, P param, Version version) throws IOException + { + try (DataOutputBuffer output = new DataOutputBuffer(Math.toIntExact(serializer.serializedSize(input, param, version)))) + { + testSerde(output, serializer, input, param, version); + } + } +} \ No newline at end of file diff --git a/test/unit/org/apache/cassandra/io/filesystem/ListenableFileSystem.java b/test/unit/org/apache/cassandra/io/filesystem/ListenableFileSystem.java index 659d34dfe235..83c2f4d225ad 100644 --- a/test/unit/org/apache/cassandra/io/filesystem/ListenableFileSystem.java +++ b/test/unit/org/apache/cassandra/io/filesystem/ListenableFileSystem.java @@ -20,6 +20,7 @@ import java.io.EOFException; import java.io.File; +import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; @@ -35,6 +36,7 @@ import java.nio.channels.WritableByteChannel; import java.nio.file.FileSystem; import java.nio.file.FileSystems; +import java.nio.file.NoSuchFileException; import java.nio.file.OpenOption; import java.nio.file.Path; import java.nio.file.StandardOpenOption; @@ -75,6 +77,16 @@ public interface OnPostOpen extends Listener void postOpen(Path path, Set options, FileAttribute[] attrs, FileChannel channel) throws IOException; } + public interface OnPreClose extends Listener + { + void preClose(Path path, FileChannel channel) throws IOException; + } + + public interface OnPreDelete extends Listener + { + void preDelete(Path path) throws IOException; + } + public interface OnPreRead extends Listener { void preRead(Path path, FileChannel channel, long position, ByteBuffer dst) throws IOException; @@ -153,6 +165,8 @@ public interface Unsubscribable extends AutoCloseable private final List onPreOpen = new CopyOnWriteArrayList<>(); private final List onPostOpen = new CopyOnWriteArrayList<>(); + private final List onPreClose = new CopyOnWriteArrayList<>(); + private final List onPreDelete = new CopyOnWriteArrayList<>(); private final List onPreTransferTo = new CopyOnWriteArrayList<>(); private final List onPostTransferTo = new CopyOnWriteArrayList<>(); private final List onPreRead = new CopyOnWriteArrayList<>(); @@ -197,6 +211,16 @@ public Unsubscribable listen(Listener listener) onPostOpen.add((OnPostOpen) listener); matches.add(onPostOpen); } + if (listener instanceof OnPreClose) + { + onPreClose.add((OnPreClose) listener); + matches.add(onPreClose); + } + if (listener instanceof OnPreDelete) + { + onPreDelete.add((OnPreDelete) listener); + matches.add(onPreDelete); + } if (listener instanceof OnPreRead) { onPreRead.add((OnPreRead) listener); @@ -298,6 +322,11 @@ public Unsubscribable onPostOpen(PathFilter filter, OnPostOpen callback) }); } + public Unsubscribable onPreClose(OnPreClose callback) + { + return listen(callback); + } + public Unsubscribable onPreRead(OnPreRead callback) { return listen(callback); @@ -544,6 +573,13 @@ protected Path unwrap(Path p) return ListenableFileSystem.this.unwrap(p); } + @Override + public void delete(Path path) throws IOException + { + notifyListeners(onPreDelete, l -> l.preDelete(path)); + super.delete(path); + } + @Override public OutputStream newOutputStream(Path path, OpenOption... options) throws IOException { @@ -796,18 +832,30 @@ else if (mode == MapMode.READ_WRITE) long pos = position; try { - while (local.hasRemaining()) + // the channel could be closed... so always create a new channel to avoid this problem + try (FileChannel channel = provider().newFileChannel(path, Set.of(StandardOpenOption.WRITE))) + { + while (local.hasRemaining()) + { + int wrote = channel.write(local, pos); + if (wrote == -1) + throw new EOFException(); + pos += wrote; + } + } + catch (NoSuchFileException | FileNotFoundException e) { - int wrote = write(local, pos); - if (wrote == -1) - throw new EOFException(); - pos += wrote; + // nothing to see here } } catch (IOException e) { throw new UncheckedIOException(e); } + synchronized (this) + { + mutable.set(null); + } }; MemoryUtil.setAttachment(bb, forcer); if (!mutable.compareAndSet(null, mapped)) @@ -823,8 +871,8 @@ else if (mode == MapMode.READ_WRITE) @Override protected void implCloseChannel() throws IOException { + notifyListeners(onPreClose, l -> l.preClose(path, this)); super.implCloseChannel(); - mutable.set(null); } } diff --git a/test/unit/org/apache/cassandra/io/sstable/CQLSSTableWriterConcurrencyTest.java b/test/unit/org/apache/cassandra/io/sstable/CQLSSTableWriterConcurrencyTest.java index 3d69cdad5a03..54d8cec32c76 100644 --- a/test/unit/org/apache/cassandra/io/sstable/CQLSSTableWriterConcurrencyTest.java +++ b/test/unit/org/apache/cassandra/io/sstable/CQLSSTableWriterConcurrencyTest.java @@ -25,6 +25,7 @@ import java.util.concurrent.Executors; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicReference; import org.junit.Rule; import org.junit.Test; @@ -36,6 +37,7 @@ import org.apache.cassandra.dht.Murmur3Partitioner; import org.apache.cassandra.io.util.File; import org.apache.cassandra.schema.Schema; +import org.assertj.core.description.Description; import static org.assertj.core.api.Assertions.assertThat; @@ -70,6 +72,7 @@ public void testConcurrentSchemaModification() throws InterruptedException, IOEx File[] dataDirs = new File[nThreads]; String baseDataDir = tempFolder.newFolder().getAbsolutePath(); + AtomicReference errors = new AtomicReference<>(""); for (int i = 0; i < nThreads; i++) { tableNames[i] = String.format("table_%02d", i); @@ -113,6 +116,7 @@ public void testConcurrentSchemaModification() throws InterruptedException, IOEx catch (Throwable throwable) { LOGGER.error("Error while processing element number {}", finalI, throwable); + errors.updateAndGet(s -> s + "\n" + throwable.getMessage()); errorCount.incrementAndGet(); } }); @@ -123,6 +127,13 @@ public void testConcurrentSchemaModification() throws InterruptedException, IOEx { LOGGER.warn("Unable to close executor pool after 1 minute"); } - assertThat(errorCount.get()).isEqualTo(0); + int count = errorCount.get(); + assertThat(count).isEqualTo(0).describedAs(new Description() + { + public String value() + { + return String.format("Caught %d errors: %s", count, errors.get()); + } + }); } } diff --git a/test/unit/org/apache/cassandra/io/sstable/CQLSSTableWriterTest.java b/test/unit/org/apache/cassandra/io/sstable/CQLSSTableWriterTest.java index 45de365a6ced..f2a131b716b3 100644 --- a/test/unit/org/apache/cassandra/io/sstable/CQLSSTableWriterTest.java +++ b/test/unit/org/apache/cassandra/io/sstable/CQLSSTableWriterTest.java @@ -20,6 +20,8 @@ import java.io.IOException; import java.nio.ByteBuffer; +import java.nio.file.Files; +import java.nio.file.Path; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; @@ -35,6 +37,7 @@ import java.util.concurrent.atomic.AtomicInteger; import java.util.function.BiPredicate; import java.util.stream.Collectors; +import java.util.stream.Stream; import java.util.stream.StreamSupport; import com.google.common.collect.ImmutableList; @@ -49,11 +52,13 @@ import org.apache.cassandra.Util; import org.apache.cassandra.cql3.QueryProcessor; import org.apache.cassandra.cql3.UntypedResultSet; +import org.apache.cassandra.cql3.constraints.ConstraintViolationException; import org.apache.cassandra.cql3.functions.types.DataType; import org.apache.cassandra.cql3.functions.types.LocalDate; import org.apache.cassandra.cql3.functions.types.TypeCodec; import org.apache.cassandra.cql3.functions.types.UDTValue; import org.apache.cassandra.cql3.functions.types.UserType; +import org.apache.cassandra.db.marshal.FloatType; import org.apache.cassandra.db.marshal.UTF8Type; import org.apache.cassandra.dht.ByteOrderedPartitioner; import org.apache.cassandra.dht.Murmur3Partitioner; @@ -62,8 +67,10 @@ import org.apache.cassandra.exceptions.InvalidRequestException; import org.apache.cassandra.index.sai.disk.format.IndexDescriptor; import org.apache.cassandra.index.sai.utils.IndexIdentifier; +import org.apache.cassandra.io.sstable.format.SSTableFormat; import org.apache.cassandra.io.sstable.format.SSTableReader; import org.apache.cassandra.io.sstable.format.big.BigFormat; +import org.apache.cassandra.io.sstable.format.bti.BtiFormat; import org.apache.cassandra.io.util.File; import org.apache.cassandra.io.util.PathUtils; import org.apache.cassandra.locator.RangesAtEndpoint; @@ -77,8 +84,10 @@ import org.apache.cassandra.utils.FBUtilities; import org.apache.cassandra.utils.JavaDriverUtils; import org.apache.cassandra.utils.OutputHandler; +import org.assertj.core.api.Assertions; import static org.apache.cassandra.utils.Clock.Global.currentTimeMillis; +import static org.assertj.core.api.Assertions.assertThat; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertNotNull; @@ -111,7 +120,20 @@ public void perTestSetup() throws IOException } @Test - public void testUnsortedWriter() throws Exception + public void testUnsortedWriterBig() throws Exception + { + BigFormat format = BigFormat.getInstance(); + testWritingSstableWithFormat(format); + } + + @Test + public void testUnsortedWriterBti() throws Exception + { + SSTableFormat btiFormat = new BtiFormat.BtiFormatFactory().getInstance(Collections.emptyMap()); + testWritingSstableWithFormat(btiFormat); + } + + private void testWritingSstableWithFormat(SSTableFormat format) throws Exception { try (AutoCloseable ignored = Util.switchPartitioner(ByteOrderedPartitioner.instance)) { @@ -124,6 +146,7 @@ public void testUnsortedWriter() throws Exception CQLSSTableWriter writer = CQLSSTableWriter.builder() .inDirectory(dataDir) .forTable(schema) + .withFormat(format) .using(insert).build(); writer.addRow(0, "test1", 24); @@ -133,6 +156,7 @@ public void testUnsortedWriter() throws Exception writer.close(); + validateFilesAreInFormat(format); loadSSTables(dataDir, keyspace, table); if (verifyDataAfterLoading) @@ -151,7 +175,6 @@ public void testUnsortedWriter() throws Exception row = iter.next(); assertEquals(1, row.getInt("k")); assertEquals("test2", row.getString("v1")); - //assertFalse(row.has("v2")); assertEquals(44, row.getInt("v2")); row = iter.next(); @@ -161,12 +184,24 @@ public void testUnsortedWriter() throws Exception row = iter.next(); assertEquals(3, row.getInt("k")); - assertEquals(null, row.getBytes("v1")); // Using getBytes because we know it won't NPE + assertFalse(row.has("v1")); assertEquals(12, row.getInt("v2")); } } } + private void validateFilesAreInFormat(SSTableFormat format) throws IOException + { + try (Stream dataFilePaths = Files.list(dataDir.toPath()).filter(p -> p.toString().endsWith("Data.db"))) + { + dataFilePaths.forEach(dataFilePath -> { + File dataFile = new File(dataFilePath.toFile()); + Descriptor descriptor = Descriptor.fromFile(dataFile); + assertEquals(format, descriptor.version.format); + }); + } + } + @Test public void testForbidCounterUpdates() throws Exception { @@ -1578,6 +1613,83 @@ public void testSkipBuildingIndexesWithSAI() throws Exception assertFalse(indexDescriptor.isPerColumnIndexBuildComplete(new IndexIdentifier(keyspace, table, "idx2"))); } + @Test + public void testWritingVectorData() throws Exception + { + final String schema = "CREATE TABLE " + qualifiedTable + " (" + + " k int," + + " v1 VECTOR," + + " PRIMARY KEY (k)" + + ")"; + + CQLSSTableWriter writer = CQLSSTableWriter.builder() + .inDirectory(dataDir) + .forTable(schema) + .using("INSERT INTO " + keyspace + "." + table + " (k, v1) " + + "VALUES (?, ?)").build(); + + for (int i = 0; i < 100; i++) + { + writer.addRow(i, List.of( (float)i, (float)i, (float)i, (float)i, (float)i)); + } + + writer.close(); + loadSSTables(dataDir, keyspace, table); + + if (verifyDataAfterLoading) + { + UntypedResultSet resultSet = QueryProcessor.executeInternal("SELECT * FROM " + keyspace + "." + table); + + assertEquals(resultSet.size(), 100); + int cnt = 0; + for (UntypedResultSet.Row row : resultSet) + { + assertEquals(cnt, row.getInt("k")); + List vector = row.getVector("v1", FloatType.instance, 5); + assertThat(vector).hasSize(5); + final float floatCount = (float)cnt; + assertThat(vector).allMatch(val -> val == floatCount); + cnt++; + } + } + } + + @Test + public void testConstraintViolation() throws Exception + { + final String schema = "CREATE TABLE " + qualifiedTable + " (" + + " k int," + + " v1 int CHECK v1 < 5 ," + + " PRIMARY KEY (k)" + + ")"; + + CQLSSTableWriter writer = CQLSSTableWriter.builder() + .inDirectory(dataDir) + .forTable(schema) + .using("INSERT INTO " + keyspace + "." + table + " (k, v1) " + + "VALUES (?, ?)").build(); + + writer.addRow(1, 4); + + Assertions.assertThatThrownBy(() -> writer.addRow(2, 11)) + .describedAs("Should throw when adding a row that violates constraints") + .isInstanceOf(ConstraintViolationException.class) + .hasMessageContaining("Column value does not satisfy value constraint for column 'v1'. It should be v1 < 5"); + + writer.close(); + loadSSTables(dataDir, keyspace, table); + + if (verifyDataAfterLoading) + { + UntypedResultSet resultSet = QueryProcessor.executeInternal("SELECT * FROM " + keyspace + "." + table); + + assertEquals(resultSet.size(), 1); + UntypedResultSet.Row row = resultSet.one(); + assertEquals(1, row.getInt("k")); + assertEquals(4, row.getInt("v1")); + } + } + protected static void loadSSTables(File dataDir, final String ks, final String tb) throws ExecutionException, InterruptedException { SSTableLoader loader = new SSTableLoader(dataDir, new SSTableLoader.Client() diff --git a/test/unit/org/apache/cassandra/io/sstable/DescriptorTest.java b/test/unit/org/apache/cassandra/io/sstable/DescriptorTest.java index 32b4813d29e6..88973414e800 100644 --- a/test/unit/org/apache/cassandra/io/sstable/DescriptorTest.java +++ b/test/unit/org/apache/cassandra/io/sstable/DescriptorTest.java @@ -169,6 +169,7 @@ public void testKeyspaceTableParsing() "/path/to/cassandra/data/dir2/dir5/dir6/ks1/tab1-34234234234234234234234234234234/backups/na-1-big-Index.db", "/path/to/cassandra/data/dir2/dir5/dir6/ks1/tab1-34234234234234234234234234234234/nb-3g1m_0nuf_3vj5m2k1125165rxa7-big-Index.db", "/path/to/cassandra/data/dir2/dir5/dir6/ks1/tab1-34234234234234234234234234234234/snapshots/snapshot/nb-3g1m_0nuf_3vj5m2k1125165rxa7-big-Index.db", + "/path/to/cassandra/data/dir2/dir5/dir6/ks1/tab1-34234234234234234234234234234234/snapshots/snapshot-12345-1.2.3_TEST#=/nb-3g1m_0nuf_3vj5m2k1125165rxa7-big-Index.db", "/path/to/cassandra/data/dir2/dir5/dir6/ks1/tab1-34234234234234234234234234234234/backups/nb-3g1m_0nuf_3vj5m2k1125165rxa7-big-Index.db", }; @@ -230,6 +231,7 @@ public void testKeyspaceTableParsing() "/path/to/cassandra/data/dir2/dir5/dir6/backups/backups/na-1-big-Index.db", "/path/to/cassandra/data/dir2/dir5/dir6/backups/backups/nb-1-big-TOC.txt", //"/path/to/cassandra/data/dir2/dir5/dir6/backups/backups/snapshots/snapshots/na-1-big-Index.db", #not supported (CASSANDRA-14013) + "/path/to/cassandra/data/dir2/dir5/dir6/backups/backups/snapshots/snapshot-12345-1.2.3_TEST#=/na-1-big-Index.db", "/path/to/cassandra/data/dir2/dir5/dir6/backups/backups/backups/na-1-big-Index.db", "/path/to/cassandra/data/dir2/dir5/dir6/backups/backups/nb-3g1m_0nuf_3vj5m2k1125165rxa7-big-Index.db", //"/path/to/cassandra/data/dir2/dir5/dir6/backups/backups/snapshots/snapshots/nb-3g1m_0nuf_3vj5m2k1125165rxa7-big-Index.db", #not supported (CASSANDRA-14013) diff --git a/test/unit/org/apache/cassandra/io/sstable/LargePartitionsTest.java b/test/unit/org/apache/cassandra/io/sstable/LargePartitionsTest.java index 17fb0f28ec21..34bc53b98c2c 100644 --- a/test/unit/org/apache/cassandra/io/sstable/LargePartitionsTest.java +++ b/test/unit/org/apache/cassandra/io/sstable/LargePartitionsTest.java @@ -132,10 +132,10 @@ private static void keyCacheMetrics(String title) { CacheMetrics metrics = CacheService.instance.keyCache.getMetrics(); System.out.println("Key cache metrics " + title + ": capacity:" + metrics.capacity.getValue() + - " size:"+metrics.size.getValue()+ + " size:" + metrics.size.getValue() + " entries:" + metrics.entries.getValue() + - " hit-rate:"+metrics.hitRate.getValue() + - " one-min-rate:"+metrics.oneMinuteHitRate.getValue()); + " hit-rate:" + metrics.hitRate.getValue() + + " one-min-rate:" + metrics.oneMinuteHitRate.getValue()); } @Test diff --git a/test/unit/org/apache/cassandra/io/sstable/LegacySSTableTest.java b/test/unit/org/apache/cassandra/io/sstable/LegacySSTableTest.java index aa80f0e25c53..eb7480e309bb 100644 --- a/test/unit/org/apache/cassandra/io/sstable/LegacySSTableTest.java +++ b/test/unit/org/apache/cassandra/io/sstable/LegacySSTableTest.java @@ -34,8 +34,10 @@ import org.junit.After; import org.junit.Assert; import org.junit.BeforeClass; +import org.junit.ClassRule; import org.junit.Ignore; import org.junit.Test; +import org.junit.rules.TemporaryFolder; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -90,6 +92,9 @@ public class LegacySSTableTest { private static final Logger logger = LoggerFactory.getLogger(LegacySSTableTest.class); + @ClassRule + public static TemporaryFolder tempFolder = new TemporaryFolder(); + public static File LEGACY_SSTABLE_ROOT; private static final String LEGACY_TABLES_KEYSPACE = "legacy_tables"; @@ -164,11 +169,11 @@ public void tearDown() /** * Get a descriptor for the legacy sstable at the given version. */ - protected Descriptor getDescriptor(String legacyVersion, String table) throws IOException + protected Descriptor getDescriptor(File dir) throws IOException { - Path file = Files.list(getTableDir(legacyVersion, table).toPath()) + Path file = Files.list(dir.toPath()) .findFirst() - .orElseThrow(() -> new RuntimeException(String.format("No files for verion=%s and table=%s", legacyVersion, table))); + .orElseThrow(() -> new RuntimeException(String.format("No files for path=%s", dir.absolutePath()))); return Descriptor.fromFile(new File(file)); } @@ -494,15 +499,19 @@ private void streamLegacyTables(String legacyVersion) throws Exception streamLegacyTable("legacy_%s_clust", legacyVersion); streamLegacyTable("legacy_%s_clust_counter", legacyVersion); streamLegacyTable("legacy_%s_tuple", legacyVersion); + streamLegacyTable("legacy_%s_clust_be_index_summary", legacyVersion); } private void streamLegacyTable(String tablePattern, String legacyVersion) throws Exception { String table = String.format(tablePattern, legacyVersion); - Descriptor descriptor = getDescriptor(legacyVersion, table); + // streaming can mutate test data (rewrite IndexSummary, so we have to copy them) + File testDataDir = new File(tempFolder.newFolder(LEGACY_TABLES_KEYSPACE, table)); + copySstablesToTestData(legacyVersion, table, testDataDir); + Descriptor descriptor = getDescriptor(testDataDir); if (null != descriptor) { - SSTableReader sstable = SSTableReader.open(null, getDescriptor(legacyVersion, table)); + SSTableReader sstable = SSTableReader.open(null, descriptor); IPartitioner p = sstable.getPartitioner(); List> ranges = new ArrayList<>(); ranges.add(new Range<>(p.getMinimumToken(), p.getToken(ByteBufferUtil.bytes("100")))); @@ -526,6 +535,7 @@ public static void truncateLegacyTables(String legacyVersion) throws Exception Keyspace.open(LEGACY_TABLES_KEYSPACE).getColumnFamilyStore(String.format("legacy_%s_clust", legacyVersion)).truncateBlocking(); Keyspace.open(LEGACY_TABLES_KEYSPACE).getColumnFamilyStore(String.format("legacy_%s_clust_counter", legacyVersion)).truncateBlocking(); Keyspace.open(LEGACY_TABLES_KEYSPACE).getColumnFamilyStore(String.format("legacy_%s_tuple", legacyVersion)).truncateBlocking(); + Keyspace.open(LEGACY_TABLES_KEYSPACE).getColumnFamilyStore(String.format("legacy_%s_clust_be_index_summary", legacyVersion)).truncateBlocking(); CacheService.instance.invalidateCounterCache(); CacheService.instance.invalidateKeyCache(); } @@ -538,6 +548,7 @@ private static void compactLegacyTables(String legacyVersion) throws Exception Keyspace.open(LEGACY_TABLES_KEYSPACE).getColumnFamilyStore(String.format("legacy_%s_clust", legacyVersion)).forceMajorCompaction(); Keyspace.open(LEGACY_TABLES_KEYSPACE).getColumnFamilyStore(String.format("legacy_%s_clust_counter", legacyVersion)).forceMajorCompaction(); Keyspace.open(LEGACY_TABLES_KEYSPACE).getColumnFamilyStore(String.format("legacy_%s_tuple", legacyVersion)).forceMajorCompaction(); + Keyspace.open(LEGACY_TABLES_KEYSPACE).getColumnFamilyStore(String.format("legacy_%s_clust_be_index_summary", legacyVersion)).forceMajorCompaction(); } public static void loadLegacyTables(String legacyVersion) throws Exception @@ -548,6 +559,7 @@ public static void loadLegacyTables(String legacyVersion) throws Exception loadLegacyTable(legacyVersion, "clust"); loadLegacyTable(legacyVersion, "clust_counter"); loadLegacyTable(legacyVersion, "tuple"); + loadLegacyTable(legacyVersion, "clust_be_index_summary"); } private static void verifyCache(String legacyVersion, long startCount) throws InterruptedException, java.util.concurrent.ExecutionException @@ -585,7 +597,8 @@ private static void verifyReads(String legacyVersion) readSimpleCounterTable(legacyVersion, pkValue); } - readClusteringTable(legacyVersion, ck, ckValue, pkValue); + readClusteringTable("legacy_%s_clust", legacyVersion, ck, ckValue, pkValue); + readClusteringTable("legacy_%s_clust_be_index_summary", legacyVersion, ck, ckValue, pkValue); readClusteringCounterTable(legacyVersion, ckValue, pkValue); } } @@ -601,16 +614,16 @@ private static void readClusteringCounterTable(String legacyVersion, String ckVa Assert.assertEquals(1L, rs.one().getLong("val")); } - private static void readClusteringTable(String legacyVersion, int ck, String ckValue, String pkValue) + private static void readClusteringTable(String tableName, String legacyVersion, int ck, String ckValue, String pkValue) { logger.debug("Read legacy_{}_clust", legacyVersion); UntypedResultSet rs; - rs = QueryProcessor.executeInternal(String.format("SELECT val FROM legacy_tables.legacy_%s_clust WHERE pk=? AND ck=?", legacyVersion), pkValue, ckValue); + rs = QueryProcessor.executeInternal(String.format("SELECT val FROM legacy_tables." + tableName + " WHERE pk=? AND ck=?", legacyVersion), pkValue, ckValue); assertLegacyClustRows(1, rs); String ckValue2 = Integer.toString(ck < 10 ? 40 : ck - 1) + longString; String ckValue3 = Integer.toString(ck > 39 ? 10 : ck + 1) + longString; - rs = QueryProcessor.executeInternal(String.format("SELECT val FROM legacy_tables.legacy_%s_clust WHERE pk=? AND ck IN (?, ?, ?)", legacyVersion), pkValue, ckValue, ckValue2, ckValue3); + rs = QueryProcessor.executeInternal(String.format("SELECT val FROM legacy_tables." + tableName + " WHERE pk=? AND ck IN (?, ?, ?)", legacyVersion), pkValue, ckValue, ckValue2, ckValue3); assertLegacyClustRows(3, rs); } @@ -645,7 +658,7 @@ private static void createTables(String legacyVersion) QueryProcessor.executeInternal(String.format("CREATE TABLE legacy_tables.legacy_%s_simple_counter (pk text PRIMARY KEY, val counter)", legacyVersion)); QueryProcessor.executeInternal(String.format("CREATE TABLE legacy_tables.legacy_%s_clust (pk text, ck text, val text, PRIMARY KEY (pk, ck))", legacyVersion)); QueryProcessor.executeInternal(String.format("CREATE TABLE legacy_tables.legacy_%s_clust_counter (pk text, ck text, val counter, PRIMARY KEY (pk, ck))", legacyVersion)); - + QueryProcessor.executeInternal(String.format("CREATE TABLE legacy_tables.legacy_%s_clust_be_index_summary (pk text, ck text, val text, PRIMARY KEY (pk, ck))", legacyVersion)); QueryProcessor.executeInternal(String.format("CREATE TYPE legacy_tables.legacy_%s_tuple_udt (name tuple)", legacyVersion)); @@ -668,6 +681,7 @@ private static void truncateTables(String legacyVersion) QueryProcessor.executeInternal(String.format("TRUNCATE legacy_tables.legacy_%s_simple_counter", legacyVersion)); QueryProcessor.executeInternal(String.format("TRUNCATE legacy_tables.legacy_%s_clust", legacyVersion)); QueryProcessor.executeInternal(String.format("TRUNCATE legacy_tables.legacy_%s_clust_counter", legacyVersion)); + QueryProcessor.executeInternal(String.format("TRUNCATE legacy_tables.legacy_%s_clust_be_index_summary", legacyVersion)); CacheService.instance.invalidateCounterCache(); CacheService.instance.invalidateKeyCache(); } @@ -747,6 +761,13 @@ public void testGenerateSstables() throws Throwable QueryProcessor.executeInternal(String.format("UPDATE legacy_tables.legacy_%s_clust_counter SET val = val + 1 WHERE pk = '%s' AND ck='%s'", format.getLatestVersion(), valPk, valCk + longString)); + + // note: to emulate BE for offsets in Summary you can comment temporary the following line: + // offset = Integer.reverseBytes(offset); + // in org.apache.cassandra.io.sstable.indexsummary.IndexSummary.IndexSummarySerializer.serialize + QueryProcessor.executeInternal(String.format("INSERT INTO legacy_tables.legacy_%s_clust_be_index_summary (pk, ck, val) VALUES ('%s', '%s', '%s')", + format.getLatestVersion(), valPk, valCk + longString, randomString)); + } } @@ -759,6 +780,7 @@ public void testGenerateSstables() throws Throwable copySstablesFromTestData(format.getLatestVersion(), "legacy_%s_clust", ksDir); copySstablesFromTestData(format.getLatestVersion(), "legacy_%s_clust_counter", ksDir); copySstablesFromTestData(format.getLatestVersion(), "legacy_%s_tuple", ksDir); + copySstablesFromTestData(format.getLatestVersion(), "legacy_%s_clust_be_index_summary", ksDir); } public static void copySstablesFromTestData(Version legacyVersion, String tablePattern, File ksDir) throws IOException @@ -774,42 +796,47 @@ public static void copySstablesFromTestData(Version legacyVersion, String tableP for (File srcDir : Keyspace.open(ks).getColumnFamilyStore(table).getDirectories().getCFDirectories()) { - for (File file : srcDir.tryList()) + for (File sourceFile : srcDir.tryList()) { // Sequence IDs represent the C* version used when creating the SSTable, i.e. with #testGenerateSstables() (if not uuid based) String newSeqId = FBUtilities.getReleaseVersionString().split("-")[0].replaceAll("[^0-9]", ""); - File target = new File(cfDir, file.name().replace(legacyVersion + "-1-", legacyVersion + "-" + newSeqId + "-")); - copyFile(cfDir, file, target); + File target = new File(cfDir, sourceFile.name().replace(legacyVersion + "-1-", legacyVersion + "-" + newSeqId + "-")); + copyFile(sourceFile, target); } } } - private static void copySstablesToTestData(String legacyVersion, String table, File cfDir) throws IOException + private static void copySstablesToTestData(String legacyVersion, String table, File targetDir) throws IOException + { + File testDataTableDir = getTestDataTableDir(legacyVersion, table); + Assert.assertTrue("The table directory " + testDataTableDir + " was not found", testDataTableDir.isDirectory()); + for (File sourceTestFile : testDataTableDir.tryList()) + copyFileToDir(sourceTestFile, targetDir); + } + + private static File getTestDataTableDir(File parentDir, String legacyVersion, String table) { - File tableDir = getTableDir(legacyVersion, table); - Assert.assertTrue("The table directory " + tableDir + " was not found", tableDir.isDirectory()); - for (File file : tableDir.tryList()) - copyFile(cfDir, file); + return new File(parentDir, String.format("%s/legacy_tables/%s", legacyVersion, table)); } - private static File getTableDir(String legacyVersion, String table) + private static File getTestDataTableDir(String legacyVersion, String table) { - return new File(LEGACY_SSTABLE_ROOT, String.format("%s/legacy_tables/%s", legacyVersion, table)); + return getTestDataTableDir(LEGACY_SSTABLE_ROOT, legacyVersion, table); } - public static void copyFile(File cfDir, File file) throws IOException + public static void copyFileToDir(File sourceFile, File targetDir) throws IOException { - copyFile(cfDir, file, new File(cfDir, file.name())); + copyFile(sourceFile, new File(targetDir, sourceFile.name())); } - public static void copyFile(File cfDir, File file, File target) throws IOException + public static void copyFile(File sourceFile, File targetFile) throws IOException { byte[] buf = new byte[65536]; - if (file.isFile()) + if (sourceFile.isFile()) { int rd; - try (FileInputStreamPlus is = new FileInputStreamPlus(file); - FileOutputStreamPlus os = new FileOutputStreamPlus(target);) + try (FileInputStreamPlus is = new FileInputStreamPlus(sourceFile); + FileOutputStreamPlus os = new FileOutputStreamPlus(targetFile);) { while ((rd = is.read(buf)) >= 0) os.write(buf, 0, rd); diff --git a/test/unit/org/apache/cassandra/io/sstable/ScrubTest.java b/test/unit/org/apache/cassandra/io/sstable/ScrubTest.java index e7952549035a..323cd01fbb6b 100644 --- a/test/unit/org/apache/cassandra/io/sstable/ScrubTest.java +++ b/test/unit/org/apache/cassandra/io/sstable/ScrubTest.java @@ -478,7 +478,7 @@ public void testScrubOutOfOrder() // This test assumes ByteOrderPartitioner to create out-of-order SSTable IPartitioner oldPartitioner = DatabaseDescriptor.getPartitioner(); - DatabaseDescriptor.setPartitionerUnsafe(new ByteOrderedPartitioner()); + DatabaseDescriptor.setPartitionerUnsafe(ByteOrderedPartitioner.instance); // Create out-of-order SSTable File tempDir = FileUtils.createTempFile("ScrubTest.testScrubOutOfOrder", "").parent(); diff --git a/test/unit/org/apache/cassandra/io/sstable/format/bti/PartitionIndexTest.java b/test/unit/org/apache/cassandra/io/sstable/format/bti/PartitionIndexTest.java index 6ceee331f6e8..36e9f8fb068c 100644 --- a/test/unit/org/apache/cassandra/io/sstable/format/bti/PartitionIndexTest.java +++ b/test/unit/org/apache/cassandra/io/sstable/format/bti/PartitionIndexTest.java @@ -291,7 +291,7 @@ private long eq(List keys, DecoratedKey key) @Test public void testAddEmptyKey() throws Exception { - IPartitioner p = new RandomPartitioner(); + IPartitioner p = RandomPartitioner.instance; File file = FileUtils.createTempFile("ColumnTrieReaderTest", ""); FileHandle.Builder fhBuilder = makeHandle(file); diff --git a/test/unit/org/apache/cassandra/io/sstable/indexsummary/IndexSummaryTest.java b/test/unit/org/apache/cassandra/io/sstable/indexsummary/IndexSummaryTest.java index aea166aa3ea1..079437c6ff15 100644 --- a/test/unit/org/apache/cassandra/io/sstable/indexsummary/IndexSummaryTest.java +++ b/test/unit/org/apache/cassandra/io/sstable/indexsummary/IndexSummaryTest.java @@ -248,7 +248,7 @@ public void testSerialization() throws IOException @Test public void testAddEmptyKey() throws Exception { - IPartitioner p = new RandomPartitioner(); + IPartitioner p = RandomPartitioner.instance; try (IndexSummaryBuilder builder = new IndexSummaryBuilder(1, 1, BASE_SAMPLING_LEVEL)) { builder.maybeAddEntry(p.decorateKey(ByteBufferUtil.EMPTY_BYTE_BUFFER), 0); diff --git a/test/unit/org/apache/cassandra/io/util/ChecksumedDataTest.java b/test/unit/org/apache/cassandra/io/util/ChecksumedDataTest.java new file mode 100644 index 000000000000..5389c79bb7d5 --- /dev/null +++ b/test/unit/org/apache/cassandra/io/util/ChecksumedDataTest.java @@ -0,0 +1,247 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.io.util; + +import java.util.ArrayList; +import java.util.List; +import java.util.function.Supplier; +import java.util.zip.CRC32C; +import java.util.zip.Checksum; + +import javax.annotation.Nullable; + +import org.junit.Test; + +import accord.utils.Gen; +import accord.utils.Gens; +import accord.utils.Property.Command; +import accord.utils.Property.Commands; +import accord.utils.Property.UnitCommand; +import org.apache.cassandra.utils.FailingConsumer; +import org.assertj.core.api.Assertions; + +import static accord.utils.Property.stateful; + +public class ChecksumedDataTest +{ + public static final Supplier CHECKSUM_SUPPLIER = CRC32C::new; + + @Test + public void singleType() + { + DataOutputBuffer out = new DataOutputBuffer(); + stateful().check(new Commands() + { + @Override + public Gen genInitialState() + { + return ignore -> { + out.clear(); + return out; + }; + } + + @Override + public DataOutputBuffer createSut(DataOutputBuffer dataOutputBuffer) + { + return dataOutputBuffer; + } + + @Override + public Gen> commands(DataOutputBuffer dataOutputBuffer) + { + return Gens.oneOf( + rs -> { + boolean b = rs.nextBoolean(); + return new StatelessChecksumCommand<>(out -> out.writeBoolean(b), DataInputPlus::readBoolean, () -> b); + }, + rs -> { + short s = (short) rs.nextInt(Short.MIN_VALUE, Short.MAX_VALUE); + return new StatelessChecksumCommand<>(out -> out.writeShort(s), DataInputPlus::readShort, () -> s); + }, + rs -> { + char c = (char) rs.nextInt(Character.MIN_VALUE, Character.MAX_VALUE); + return new StatelessChecksumCommand<>(out -> out.writeChar(c), DataInputPlus::readChar, () -> c); + }, + rs -> { + int value = rs.nextInt(); + return new StatelessChecksumCommand<>(out -> out.writeInt(value), DataInputPlus::readInt, () -> value); + }, + rs -> { + float value = rs.nextFloat(); + return new StatelessChecksumCommand<>(out -> out.writeFloat(value), DataInputPlus::readFloat, () -> value); + }, + rs -> { + double value = rs.nextDouble(); + return new StatelessChecksumCommand<>(out -> out.writeDouble(value), DataInputPlus::readDouble, () -> value); + } + ); + } + }); + } + + @Test + public void withState() + { + DataOutputBuffer out = new DataOutputBuffer(); + ChecksumedDataOutputPlus checksummedOut = new ChecksumedDataOutputPlus(out, CHECKSUM_SUPPLIER); + checksummedOut.resetChecksum(); + stateful().check(new Commands>>() { + @Override + public Gen genInitialState() + { + return ignore -> { + out.clear(); + checksummedOut.resetChecksum(); + return checksummedOut; + }; + } + + @Override + public List> createSut(ChecksumedDataOutputPlus checksumedDataOutputPlus) + { + return new ArrayList<>(1000); + } + + @Override + public Gen>, ?>> commands(ChecksumedDataOutputPlus checksumedDataOutputPlus) + { + return Gens.oneOf( + rs -> { + boolean b = rs.nextBoolean(); + return new StatefulChecksumCommand<>(out -> out.writeBoolean(b), DataInputPlus::readBoolean, () -> b); + }, + rs -> { + short s = (short) rs.nextInt(Short.MIN_VALUE, Short.MAX_VALUE); + return new StatefulChecksumCommand<>(out -> out.writeShort(s), DataInputPlus::readShort, () -> s); + }, + rs -> { + char c = (char) rs.nextInt(Character.MIN_VALUE, Character.MAX_VALUE); + return new StatefulChecksumCommand<>(out -> out.writeChar(c), DataInputPlus::readChar, () -> c); + }, + rs -> { + int value = rs.nextInt(); + return new StatefulChecksumCommand<>(out -> out.writeInt(value), DataInputPlus::readInt, () -> value); + }, + rs -> { + float value = rs.nextFloat(); + return new StatefulChecksumCommand<>(out -> out.writeFloat(value), DataInputPlus::readFloat, () -> value); + }, + rs -> { + double value = rs.nextDouble(); + return new StatefulChecksumCommand<>(out -> out.writeDouble(value), DataInputPlus::readDouble, () -> value); + } + ); + } + + @Override + public void destroySut(List> sut, @Nullable Throwable t) throws Throwable + { + if (t != null) return; + ChecksumedDataInputPlus in = new ChecksumedDataInputPlus(new DataInputBuffer(out.unsafeGetBufferAndFlip(), false), CHECKSUM_SUPPLIER); + for (StatefulChecksumCommand cmd : sut) + { + Assertions.assertThat(cmd.read.apply(in)).isEqualTo(cmd.expected.get()); + Assertions.assertThat(in.checksum().getValue()).isEqualTo(cmd.checksum); + } + } + }); + } + + public interface FailingFunction + { + O apply(I input) throws Throwable; + } + + private static class StatefulChecksumCommand implements UnitCommand>> + { + private final FailingConsumer update; + private final FailingFunction read; + private final Supplier expected; + private Long checksum = null; + + private StatefulChecksumCommand(FailingConsumer update, FailingFunction read, Supplier expected) + { + this.update = update; + this.read = read; + this.expected = expected; + } + + @Override + public void applyUnit(ChecksumedDataOutputPlus out) throws Throwable + { + update.doAccept(out); + checksum = out.checksum().getValue(); + } + + @Override + public void runUnit(List> sut) + { + sut.add(this); + } + } + + private static class StatelessChecksumCommand implements Command + { + private final FailingConsumer update; + private final FailingFunction read; + private final Supplier expected; + + private StatelessChecksumCommand(FailingConsumer update, + FailingFunction read, + Supplier expected) + { + this.update = update; + this.read = read; + this.expected = expected; + } + + @Override + public Long apply(DataOutputBuffer out) throws Throwable + { + out.clear(); + ChecksumedDataOutputPlus c = new ChecksumedDataOutputPlus(out, CHECKSUM_SUPPLIER); + update.doAccept(c); + return c.checksum().getValue(); + } + + @Override + public Long run(DataOutputBuffer out) throws Throwable + { + out.clear(); + update.doAccept(out); + ChecksumedDataInputPlus i = new ChecksumedDataInputPlus(new DataInputBuffer(out.unsafeGetBufferAndFlip(), false), CHECKSUM_SUPPLIER); + Assertions.assertThat(read.apply(i)).isEqualTo(expected.get()); + return i.checksum().getValue(); + } + + @Override + public void checkPostconditions(DataOutputBuffer dataOutputBuffer, Long expected, + DataOutputBuffer sut, Long actual) + { + Assertions.assertThat(actual).isEqualTo(expected); + } + + @Override + public String detailed(DataOutputBuffer dataOutputBuffer) + { + return expected.get().getClass().getSimpleName(); + } + } +} \ No newline at end of file diff --git a/test/unit/org/apache/cassandra/io/util/DataOutputInputPlusTest.java b/test/unit/org/apache/cassandra/io/util/DataOutputInputPlusTest.java new file mode 100644 index 000000000000..8be57b5faab1 --- /dev/null +++ b/test/unit/org/apache/cassandra/io/util/DataOutputInputPlusTest.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.io.util; + +import org.junit.Test; + +import accord.utils.Gens; +import org.assertj.core.api.Assertions; + +import static accord.utils.Property.qt; + +public class DataOutputInputPlusTest +{ + @Test + public void leastSignificantBytes() + { + @SuppressWarnings({ "resource", "IOResourceOpenedButNotSafelyClosed" }) DataOutputBuffer output = new DataOutputBuffer(); + qt().forAll(Gens.longs().all()).check(expected -> { + output.clear(); + + int expectedSize = numberOfBytes(expected); + output.writeLeastSignificantBytes(expected, expectedSize); + Assertions.assertThat(output.getLength()).describedAs("The serialized size and bytes written do not match").isEqualTo(expectedSize); + @SuppressWarnings({ "resource", "IOResourceOpenedButNotSafelyClosed" }) DataInputBuffer in = new DataInputBuffer(output.unsafeGetBufferAndFlip(), false); + long read = in.readLeastSignificantBytes(expectedSize); + Assertions.assertThat(read).describedAs("The deserialized output does not match the serialized input").isEqualTo(expected); + }); + } + + private static int numberOfBytes(long value) + { + return (64 + 7 - Long.numberOfLeadingZeros(value)) / 8; + } +} diff --git a/test/unit/org/apache/cassandra/io/util/Files.java b/test/unit/org/apache/cassandra/io/util/Files.java new file mode 100644 index 000000000000..4a0ec3285d41 --- /dev/null +++ b/test/unit/org/apache/cassandra/io/util/Files.java @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.io.util; + +import java.nio.file.FileSystem; + +import com.google.common.jimfs.Jimfs; + +public class Files +{ + public static FileSystem newGlobalInMemoryFileSystem() + { + FileSystem fs = Jimfs.newFileSystem(); + File.unsafeSetFilesystem(fs); + return fs; + } +} diff --git a/test/unit/org/apache/cassandra/io/util/MmappedRegionsTest.java b/test/unit/org/apache/cassandra/io/util/MmappedRegionsTest.java index e6b5dd0c0962..af4c6f042d70 100644 --- a/test/unit/org/apache/cassandra/io/util/MmappedRegionsTest.java +++ b/test/unit/org/apache/cassandra/io/util/MmappedRegionsTest.java @@ -114,10 +114,11 @@ public void testEmpty() throws Exception public void testTwoSegments() throws Exception { ByteBuffer buffer = allocateBuffer(2048); + int bufSize = 1024; try (ChannelProxy channel = new ChannelProxy(writeFile("testTwoSegments", buffer)); MmappedRegions regions = MmappedRegions.empty(channel)) { - regions.extend(1024); + regions.extend(1024, bufSize); for (int i = 0; i < 1024; i++) { MmappedRegions.Region region = regions.floor(i); @@ -126,7 +127,7 @@ public void testTwoSegments() throws Exception assertEquals(1024, region.end()); } - regions.extend(2048); + regions.extend(2048, bufSize); for (int i = 0; i < 2048; i++) { MmappedRegions.Region region = regions.floor(i); @@ -149,14 +150,15 @@ public void testTwoSegments() throws Exception public void testSmallSegmentSize() throws Exception { MmappedRegions.MAX_SEGMENT_SIZE = 1024; + int bufSize = 1024; ByteBuffer buffer = allocateBuffer(4096); try (ChannelProxy channel = new ChannelProxy(writeFile("testSmallSegmentSize", buffer)); MmappedRegions regions = MmappedRegions.empty(channel)) { - regions.extend(1024); - regions.extend(2048); - regions.extend(4096); + regions.extend(1024, bufSize); + regions.extend(2048, bufSize); + regions.extend(4096, bufSize); final int SIZE = MmappedRegions.MAX_SEGMENT_SIZE; for (int i = 0; i < buffer.capacity(); i++) @@ -169,17 +171,45 @@ public void testSmallSegmentSize() throws Exception } } + @Test + public void testSizeIsChunkMultiple() throws Exception + { + final int oldMaxSegmentSize = MmappedRegions.MAX_SEGMENT_SIZE; + final int bufSize = 1024; + MmappedRegions.MAX_SEGMENT_SIZE = 2047; + ByteBuffer buffer = allocateBuffer(4096); + try(ChannelProxy channel = new ChannelProxy(writeFile("testSmallSegmentSize", buffer)); + MmappedRegions regions = MmappedRegions.empty(channel)) + { + regions.extend(1024, bufSize); + regions.extend(2048, bufSize); + regions.extend(4096, bufSize); + for (int i = 0; i < buffer.capacity(); i++) + { + MmappedRegions.Region region = regions.floor(i); + assertNotNull(region); + assertEquals(bufSize * (i / bufSize), region.offset()); + assertEquals(bufSize + (bufSize * (i / bufSize)), region.end()); + } + } + finally + { + MmappedRegions.MAX_SEGMENT_SIZE = oldMaxSegmentSize; + } + } + @Test public void testAllocRegions() throws Exception { MmappedRegions.MAX_SEGMENT_SIZE = 1024; ByteBuffer buffer = allocateBuffer(MmappedRegions.MAX_SEGMENT_SIZE * MmappedRegions.REGION_ALLOC_SIZE * 3); + int bufSize = 1024; try (ChannelProxy channel = new ChannelProxy(writeFile("testAllocRegions", buffer)); MmappedRegions regions = MmappedRegions.empty(channel)) { - regions.extend(buffer.capacity()); + regions.extend(buffer.capacity(), bufSize); final int SIZE = MmappedRegions.MAX_SEGMENT_SIZE; for (int i = 0; i < buffer.capacity(); i++) @@ -196,17 +226,18 @@ public void testAllocRegions() throws Exception public void testCopy() throws Exception { ByteBuffer buffer = allocateBuffer(128 * 1024); + int bufSize = 4096; MmappedRegions snapshot; ChannelProxy channelCopy; try (ChannelProxy channel = new ChannelProxy(writeFile("testSnapshot", buffer)); - MmappedRegions regions = MmappedRegions.map(channel, buffer.capacity() / 4)) + MmappedRegions regions = MmappedRegions.map(channel, buffer.capacity() / 4, bufSize)) { // create 3 more segments, one per quater capacity - regions.extend(buffer.capacity() / 2); - regions.extend(3 * buffer.capacity() / 4); - regions.extend(buffer.capacity()); + regions.extend(buffer.capacity() / 2, bufSize); + regions.extend(3 * buffer.capacity() / 4, bufSize); + regions.extend(buffer.capacity(), bufSize); // make a snapshot snapshot = regions.sharedCopy(); @@ -238,6 +269,7 @@ public void testCopy() throws Exception public void testCopyCannotExtend() throws Exception { ByteBuffer buffer = allocateBuffer(128 * 1024); + int bufSize = 1024; MmappedRegions snapshot; ChannelProxy channelCopy; @@ -245,7 +277,7 @@ public void testCopyCannotExtend() throws Exception try (ChannelProxy channel = new ChannelProxy(writeFile("testSnapshotCannotExtend", buffer)); MmappedRegions regions = MmappedRegions.empty(channel)) { - regions.extend(buffer.capacity() / 2); + regions.extend(buffer.capacity() / 2, bufSize); // make a snapshot snapshot = regions.sharedCopy(); @@ -256,7 +288,7 @@ public void testCopyCannotExtend() throws Exception try { - snapshot.extend(buffer.capacity()); + snapshot.extend(buffer.capacity(), bufSize); } finally { @@ -269,12 +301,13 @@ public void testCopyCannotExtend() throws Exception public void testExtendOutOfOrder() throws Exception { ByteBuffer buffer = allocateBuffer(4096); + int bufSize = 1024; try (ChannelProxy channel = new ChannelProxy(writeFile("testExtendOutOfOrder", buffer)); MmappedRegions regions = MmappedRegions.empty(channel)) { - regions.extend(4096); - regions.extend(1024); - regions.extend(2048); + regions.extend(4096, bufSize); + regions.extend(1024, bufSize); + regions.extend(2048, bufSize); for (int i = 0; i < buffer.capacity(); i++) { @@ -290,10 +323,11 @@ public void testExtendOutOfOrder() throws Exception public void testNegativeExtend() throws Exception { ByteBuffer buffer = allocateBuffer(1024); + int bufSize = 1024; try (ChannelProxy channel = new ChannelProxy(writeFile("testNegativeExtend", buffer)); MmappedRegions regions = MmappedRegions.empty(channel)) { - regions.extend(-1); + regions.extend(-1, bufSize); } } @@ -341,8 +375,9 @@ public void testMapForCompressionMetadata() throws Exception public void testIllegalArgForMap1() throws Exception { ByteBuffer buffer = allocateBuffer(1024); + int bufSize = 1024; try (ChannelProxy channel = new ChannelProxy(writeFile("testIllegalArgForMap1", buffer)); - MmappedRegions regions = MmappedRegions.map(channel, 0)) + MmappedRegions regions = MmappedRegions.map(channel, 0, bufSize)) { assertTrue(regions.isEmpty()); } @@ -352,8 +387,9 @@ public void testIllegalArgForMap1() throws Exception public void testIllegalArgForMap2() throws Exception { ByteBuffer buffer = allocateBuffer(1024); + int bufSize = 1024; try (ChannelProxy channel = new ChannelProxy(writeFile("testIllegalArgForMap2", buffer)); - MmappedRegions regions = MmappedRegions.map(channel, -1L)) + MmappedRegions regions = MmappedRegions.map(channel, -1L, bufSize)) { assertTrue(regions.isEmpty()); } @@ -382,6 +418,7 @@ public void testExtendForCompressionMetadata(int maxSegmentSize, int chunkSize, { MmappedRegions.MAX_SEGMENT_SIZE = maxSegmentSize << 10; int size = Arrays.stream(writeSizes).sum() << 10; + int bufSize = 4096; ByteBuffer buffer = allocateBuffer(size); File f = FileUtils.createTempFile("testMapForCompressionMetadata", "1"); @@ -423,10 +460,10 @@ public void testExtendForCompressionMetadata(int maxSegmentSize, int chunkSize, writer.sync(); // verify that calling extend for the same (first iteration) or some previous metadata (further iterations) has no effect - assertFalse(regions.extend(metadata)); + assertFalse(regions.extend(metadata, bufSize)); logger.info("Checking extend on compressed chunk for range={} {}..{} / {}", idx, pos, pos + (writeSizes[idx] << 10), size); - checkExtendOnCompressedChunks(f, writer, regions); + checkExtendOnCompressedChunks(f, writer, regions, bufSize); pos += writeSizes[idx] << 10; idx++; } @@ -434,12 +471,12 @@ public void testExtendForCompressionMetadata(int maxSegmentSize, int chunkSize, } } - private void checkExtendOnCompressedChunks(File f, CompressedSequentialWriter writer, MmappedRegions regions) + private void checkExtendOnCompressedChunks(File f, CompressedSequentialWriter writer, MmappedRegions regions, int bufSize) { int dataOffset; try (CompressionMetadata metadata = writer.open(writer.getLastFlushOffset())) { - regions.extend(metadata); + regions.extend(metadata, bufSize); assertFalse(regions.isEmpty()); dataOffset = 0; while (dataOffset < metadata.dataLength) diff --git a/test/unit/org/apache/cassandra/io/util/ThreadLocalReadAheadBufferTest.java b/test/unit/org/apache/cassandra/io/util/ThreadLocalReadAheadBufferTest.java index 4d43017b2a5a..9bb957e3a5e1 100644 --- a/test/unit/org/apache/cassandra/io/util/ThreadLocalReadAheadBufferTest.java +++ b/test/unit/org/apache/cassandra/io/util/ThreadLocalReadAheadBufferTest.java @@ -41,6 +41,7 @@ import org.quicktheories.WithQuickTheories; import org.quicktheories.core.Gen; +import static java.lang.Math.max; import static org.apache.cassandra.config.CassandraRelevantProperties.JAVA_IO_TMPDIR; public class ThreadLocalReadAheadBufferTest implements WithQuickTheories @@ -48,16 +49,17 @@ public class ThreadLocalReadAheadBufferTest implements WithQuickTheories private static final int numFiles = 5; private static final File[] files = new File[numFiles]; private static final Logger logger = LoggerFactory.getLogger(ThreadLocalReadAheadBufferTest.class); + private static Integer seed; @BeforeClass public static void setup() { - int seed = new Random().nextInt(); + seed = new Random().nextInt(); logger.info("Seed: {}", seed); for (int i = 0; i < numFiles; i++) { - int size = new Random().nextInt((Integer.MAX_VALUE - 1) / 8); + int size = new Random(seed).nextInt((Integer.MAX_VALUE - 1) / 8); files[i] = writeFile(seed, size); } } @@ -81,7 +83,7 @@ public static void cleanup() @Test public void testLastBlockReads() { - qt().forAll(lastBlockReads()) + qt().withFixedSeed(seed).forAll(lastBlockReads()) .checkAssert(this::testReads); } @@ -89,7 +91,7 @@ public void testLastBlockReads() public void testReadsLikeChannelProxy() { - qt().forAll(randomReads()) + qt().withFixedSeed(seed).forAll(reads()) .checkAssert(this::testReads); } @@ -127,7 +129,7 @@ private void testReads(InputData propertyInputs) } } - private Gen lastBlockReads() + private Gen reads() { return arbitrary().pick(List.of(files)) .flatMap((file) -> @@ -137,12 +139,12 @@ private Gen lastBlockReads() } - private Gen randomReads() + private Gen lastBlockReads() { int blockSize = new DataStorageSpec.IntKibibytesBound("256KiB").toBytes(); return arbitrary().pick(List.of(files)) .flatMap((file) -> - lists().of(longs().between(fileSize(file) - blockSize, fileSize(file)).zip(integers().between(1, 100), Pair::create)) + lists().of(longs().between(max(0, fileSize(file) - blockSize), fileSize(file)).zip(integers().between(1, 100), Pair::create)) .ofSizeBetween(5, 10) .map(positionsAndLengths -> new InputData(file, positionsAndLengths))); diff --git a/test/unit/org/apache/cassandra/journal/DescriptorTest.java b/test/unit/org/apache/cassandra/journal/DescriptorTest.java new file mode 100644 index 000000000000..f0f2975be9b2 --- /dev/null +++ b/test/unit/org/apache/cassandra/journal/DescriptorTest.java @@ -0,0 +1,166 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.journal; + +import java.nio.file.FileSystem; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Set; + +import com.google.common.collect.Sets; +import org.junit.Test; + +import accord.utils.Gen; +import accord.utils.Gens; +import org.apache.cassandra.io.util.File; +import org.apache.cassandra.io.util.Files; +import org.apache.cassandra.io.util.PathUtils; +import org.apache.cassandra.utils.Pair; +import org.assertj.core.api.Condition; + +import static accord.utils.Property.qt; +import static org.assertj.core.api.Assertions.assertThat; + +public class DescriptorTest +{ + private static final FileSystem FS = Files.newGlobalInMemoryFileSystem(); + + static + { + PathUtils.setDeletionListener(ignore -> {}); + } + + @Test + public void serde() + { + qt().forAll(descriptors()) + .check(desc -> + assertThat(Descriptor.fromFile(desc.fileFor(Component.DATA))).isEqualTo(desc)); + } + + @Test + public void isTmp() + { + Condition isTmp = new Condition("isTmpFile") + { + @Override + public boolean matches(File value) + { + return Descriptor.isTmpFile(value); + } + }; + qt().forAll(descriptors()).check(desc -> { + for (Component comp : Component.values()) + { + assertThat(desc.tmpFileFor(comp)).is(isTmp); + assertThat(desc.fileFor(comp)).isNot(isTmp); + } + }); + } + + @Test + public void list() + { + qt().withPure(false) + .forAll(children()) + .check(pair -> + assertThat(Descriptor.list(pair.left)).containsExactlyInAnyOrderElementsOf(pair.right)); + } + + @Test + public void order() + { + qt().withPure(false).forAll(children().filter(p -> p.right.size() >= 2)).check(pair -> + { + List list = new ArrayList<>(pair.right); + Collections.sort(list); + + Descriptor last = list.get(0); + for (int i = 1; i < list.size(); i++) + { + Descriptor current = list.get(i); + assertThat(current.directory).isEqualTo(last.directory); + assertThat(current.timestamp).isGreaterThanOrEqualTo(last.timestamp); + if (current.timestamp == last.timestamp) + assertThat(current.generation).isGreaterThanOrEqualTo(last.generation); + if (current.timestamp == last.timestamp + && current.generation == last.generation) + assertThat(current.journalVersion).isGreaterThanOrEqualTo(last.journalVersion); + if (current.timestamp == last.timestamp + && current.generation == last.generation + && current.journalVersion == last.journalVersion) + assertThat(current.userVersion).isGreaterThanOrEqualTo(last.userVersion); + last = current; + } + }); + } + + private static Gen>> children() + { + Gen dirs = dirs(); + return rs -> + { + File dir = dirs.next(rs); + if (dir.exists()) + dir.deleteRecursive(); + if (!dir.createDirectoriesIfNotExists()) + throw new AssertionError("Directory " + dir + " exists"); + int size = rs.nextInt(0, 10); + if (size == 0) + return Pair.create(dir, Collections.emptySet()); + Set uniq = Sets.newHashSetWithExpectedSize(size); + Gen descriptors = descriptors(Gens.constant(dir)); + for (int i = 0; i < size; i++) + { + Descriptor d = descriptors.next(rs); + while (!uniq.add(d)) + d = descriptors.next(rs); + } + for (Descriptor d : uniq) + d.fileFor(Component.DATA).createFileIfNotExists(); + return Pair.create(dir, uniq); + }; + } + + private static Gen descriptors() + { + Gen dir = dirs(); + return descriptors(dir); + } + + private static Gen descriptors(Gen dir) + { + Gen.LongGen longs = Gens.longs().between(0, 10); + Gen.IntGen ints = Gens.ints().between(0, 10); + return rs -> new Descriptor(dir.next(rs), longs.nextLong(rs), ints.next(rs), ints.next(rs), ints.next(rs)); + } + + private static Gen dirs() + { + Gen names = asciiVisible().ofLengthBetween(1, 100); + Gen gen = rs -> new File(FS.getPath('/' + names.next(rs))); + return gen.filter(f -> f.toCanonical().parent() != null); + } + + // TODO: replace with Gens.strings().asciiVisible() + public static Gens.SizeBuilder asciiVisible() + { + return new Gens.SizeBuilder<>(sizes -> Gens.strings().betweenCodePoints(sizes, 33, 127)); + } +} diff --git a/test/unit/org/apache/cassandra/journal/IndexTest.java b/test/unit/org/apache/cassandra/journal/IndexTest.java new file mode 100644 index 000000000000..acde8cc69c53 --- /dev/null +++ b/test/unit/org/apache/cassandra/journal/IndexTest.java @@ -0,0 +1,308 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.journal; + +import java.io.IOException; +import java.nio.file.Files; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Random; +import java.util.TreeMap; +import java.util.stream.Collectors; + +import com.google.common.collect.Maps; +import org.junit.Assert; +import org.junit.Test; + +import accord.utils.Invariants; +import org.agrona.collections.IntHashSet; +import org.apache.cassandra.io.util.File; +import org.apache.cassandra.utils.Generators; +import org.apache.cassandra.utils.Pair; +import org.apache.cassandra.utils.TimeUUID; +import org.quicktheories.core.Gen; +import org.quicktheories.impl.Constraint; + +import static org.apache.cassandra.journal.Index.composeOffsetAndSize; +import static org.apache.cassandra.utils.TimeUUID.Generator.nextTimeUUID; +import static org.assertj.core.api.Assertions.assertThat; +import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; +import static org.quicktheories.QuickTheory.qt; + +public class IndexTest +{ + private static final long[] EMPTY = new long[0]; + + @Test + public void testInMemoryIndexBasics() + { + InMemoryIndex index = InMemoryIndex.create(TimeUUIDKeySupport.INSTANCE); + + TimeUUID key0 = nextTimeUUID(); + TimeUUID key1 = nextTimeUUID(); + TimeUUID key2 = nextTimeUUID(); + TimeUUID key3 = nextTimeUUID(); + TimeUUID key4 = nextTimeUUID(); + + assertArrayEquals(EMPTY, index.lookUp(key0)); + assertArrayEquals(EMPTY, index.lookUp(key1)); + assertArrayEquals(EMPTY, index.lookUp(key2)); + assertArrayEquals(EMPTY, index.lookUp(key3)); + assertArrayEquals(EMPTY, index.lookUp(key4)); + + int val11 = 1100; + int val21 = 2100; + int val22 = 2200; + int val31 = 3100; + int val32 = 3200; + int val33 = 3300; + + index.update(key1, val11, 1); + index.update(key2, val21, 2); + index.update(key2, val22, 3); + index.update(key3, val31, 4); + index.update(key3, val32, 5); + index.update(key3, val33, 6); + + assertArrayEquals(EMPTY, index.lookUp(key0)); + + assertArrayEquals(new long[] { composeOffsetAndSize(val11, 1) }, index.lookUp(key1)); + assertArrayEquals(new long[] { composeOffsetAndSize(val22, 3), composeOffsetAndSize(val21, 2) }, index.lookUp(key2)); + assertArrayEquals(new long[] { composeOffsetAndSize(val33, 6), composeOffsetAndSize(val32, 5), composeOffsetAndSize(val31, 4) }, index.lookUp(key3)); + assertArrayEquals(EMPTY, index.lookUp(key4)); + + assertEquals(key1, index.firstId()); + assertEquals(key3, index.lastId()); + + assertFalse(index.mayContainId(key0)); + assertTrue(index.mayContainId(key1)); + assertTrue(index.mayContainId(key2)); + assertTrue(index.mayContainId(key3)); + assertFalse(index.mayContainId(key4)); + } + + @Test + public void testInMemoryIndexPersists() throws IOException + { + InMemoryIndex inMemory = InMemoryIndex.create(TimeUUIDKeySupport.INSTANCE); + + TimeUUID key0 = nextTimeUUID(); + TimeUUID key1 = nextTimeUUID(); + TimeUUID key2 = nextTimeUUID(); + TimeUUID key3 = nextTimeUUID(); + TimeUUID key4 = nextTimeUUID(); + + int val11 = 1100; + int val21 = 2100; + int val22 = 2200; + int val31 = 3100; + int val32 = 3200; + int val33 = 3300; + + inMemory.update(key1, val11, 1); + inMemory.update(key2, val21, 2); + inMemory.update(key2, val22, 3); + inMemory.update(key3, val31, 4); + inMemory.update(key3, val32, 5); + inMemory.update(key3, val33, 6); + + File directory = new File(Files.createTempDirectory(null)); + directory.deleteOnExit(); + Descriptor descriptor = Descriptor.create(directory, System.currentTimeMillis(), 1); + inMemory.persist(descriptor); + + try (OnDiskIndex onDisk = OnDiskIndex.open(descriptor, TimeUUIDKeySupport.INSTANCE)) + { + assertArrayEquals(EMPTY, onDisk.lookUp(key0)); + assertArrayEquals(new long[] { composeOffsetAndSize(val11, 1) }, onDisk.lookUp(key1)); + assertArrayEquals(new long[] { composeOffsetAndSize(val22, 3), composeOffsetAndSize(val21, 2) }, onDisk.lookUp(key2)); + assertArrayEquals(new long[] { composeOffsetAndSize(val33, 6), composeOffsetAndSize(val32, 5), composeOffsetAndSize(val31, 4) }, onDisk.lookUp(key3)); + assertArrayEquals(EMPTY, onDisk.lookUp(key4)); + + assertEquals(key1, onDisk.firstId()); + assertEquals(key3, onDisk.lastId()); + + assertFalse(onDisk.mayContainId(key0)); + assertTrue(onDisk.mayContainId(key1)); + assertTrue(onDisk.mayContainId(key2)); + assertTrue(onDisk.mayContainId(key3)); + assertFalse(onDisk.mayContainId(key4)); + } + } + + @Test + public void prop() throws IOException + { + Constraint sizeConstraint = Constraint.between(1, 10); + Constraint valueSizeConstraint = Constraint.between(0, 10); + Constraint positionConstraint = Constraint.between(0, Integer.MAX_VALUE); + Gen keyGen = Generators.timeUUID(); + Gen valueGen = rs -> { + long[] array = new long[(int) rs.next(valueSizeConstraint)]; + IntHashSet uniq = new IntHashSet(); + for (int i = 0 ; i < array.length ; ++i) + { + int offset = (int) rs.next(positionConstraint); + while (!uniq.add(offset)) + offset = (int) rs.next(positionConstraint); + array[i] = Index.composeOffsetAndSize(offset, (int) rs.next(positionConstraint)); + } + + Arrays.sort(array); + for (int i = 0 ; i < array.length / 2 ; ++i) + { + int back = array.length - (1 + i); + long v = array[i]; + array[i] = array[back]; + array[back] = v; + } + + return array; + }; + Gen> gen = rs -> { + int size = (int) rs.next(sizeConstraint); + Map map = Maps.newHashMapWithExpectedSize(size); + for (int i = 0; i < size; i++) + { + TimeUUID key = keyGen.generate(rs); + while (map.containsKey(key)) + key = keyGen.generate(rs); + long[] value = valueGen.generate(rs); + map.put(key, value); + } + return map; + }; + gen = gen.describedAs(map -> { + StringBuilder sb = new StringBuilder(); + for (Map.Entry entry : map.entrySet()) + sb.append('\n').append(entry.getKey()).append('\t').append(Arrays.toString(entry.getValue())); + return sb.toString(); + }); + File directory = new File(Files.createTempDirectory(null)); + directory.deleteOnExit(); + qt().withFixedSeed(185124544959375L).forAll(gen).checkAssert(map -> test(directory, map)); + } + + private static void test(File directory, Map map) + { + InMemoryIndex inMemory = InMemoryIndex.create(TimeUUIDKeySupport.INSTANCE); + for (Map.Entry e : map.entrySet()) + { + TimeUUID key = e.getKey(); + assertThat(inMemory.lookUp(key)).isEmpty(); + + long[] value = e.getValue(); + if (value.length == 0) + continue; + for (long i : value) + inMemory.update(key, Index.readOffset(i), Index.readSize(i)); + for (int i = 1 ; i < value.length ; ++i) + Invariants.require(value[i - 1] > value[i]); + } + assertIndex(map, inMemory); + + Descriptor descriptor = Descriptor.create(directory, System.nanoTime(), 1); + inMemory.persist(descriptor); + + try (OnDiskIndex onDisk = OnDiskIndex.open(descriptor, TimeUUIDKeySupport.INSTANCE)) + { + assertIndex(map, onDisk); + + List> sortedEntries = new ArrayList<>(); + for (Map.Entry entry : new TreeMap<>(map).entrySet()) + { + for (long l : entry.getValue()) + sortedEntries.add(Pair.create(entry.getKey(), l)); + } + + OnDiskIndex.IndexReader iter = onDisk.reader(); + Iterator> expectedIter = sortedEntries.iterator(); + while (iter.hasNext()) + { + Pair expected = expectedIter.next(); + Assert.assertEquals(iter.next(), expected.left); + Assert.assertEquals(iter.recordSize(), Index.readSize(expected.right)); + Assert.assertEquals(iter.offset(), Index.readOffset(expected.right)); + } + } + } + + private static void assertIndex(Map expected, Index actual) + { + List keys = expected.entrySet() + .stream() + .filter(e -> e.getValue().length != 0) + .map(Map.Entry::getKey) + .sorted() + .collect(Collectors.toList()); + + if (keys.isEmpty()) + { + assertThat(actual.firstId()).describedAs("Index %s had wrong firstId", actual).isNull(); + assertThat(actual.lastId()).describedAs("Index %s had wrong lastId", actual).isNull(); + } + else + { + assertThat(actual.firstId()).describedAs("Index %s had wrong firstId", actual).isEqualTo(keys.get(0)); + assertThat(actual.lastId()).describedAs("Index %s had wrong lastId", actual).isEqualTo(keys.get(keys.size() - 1)); + } + + for (Map.Entry e : expected.entrySet()) + { + TimeUUID key = e.getKey(); + long[] value = e.getValue(); + long[] read = actual.lookUp(key); + + if (!Arrays.equals(value, read)) + actual.lookUp(key); + + if (value.length == 0) + { + assertThat(read).describedAs("Index %s returned wrong values for %s", actual, key).isEmpty(); + } + else + { + assertThat(read).describedAs("Index %s returned wrong values for %s", actual, key).isEqualTo(value); + assertThat(actual.mayContainId(key)).describedAs("Index %s expected %s to exist", key, actual).isTrue(); + } + } + } + + @Test + public void testHelperMethods() + { + Random r = new Random(); + for (int i = 0; i < 1000000; i++) + { + long record = 0; + int size = Math.abs(r.nextInt()); + record = Index.writeSize(record, size); + int offset = Math.abs(r.nextInt()); + record = Index.writeOffset(record, offset); + assertEquals(size, Index.readSize(record)); + assertEquals(offset, Index.readOffset(record)); + assertEquals(record, composeOffsetAndSize(offset, size)); + } + } +} \ No newline at end of file diff --git a/test/unit/org/apache/cassandra/journal/JournalTest.java b/test/unit/org/apache/cassandra/journal/JournalTest.java new file mode 100644 index 000000000000..de6848b289a9 --- /dev/null +++ b/test/unit/org/apache/cassandra/journal/JournalTest.java @@ -0,0 +1,103 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.journal; + +import java.io.IOException; +import java.nio.file.Files; + +import org.junit.BeforeClass; +import org.junit.Test; + +import org.apache.cassandra.ServerTestUtils; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.io.util.File; +import org.apache.cassandra.utils.TimeUUID; + +import static org.apache.cassandra.utils.TimeUUID.Generator.nextTimeUUID; +import static org.junit.Assert.assertEquals; + +public class JournalTest +{ + @BeforeClass + public static void setUp() + { + DatabaseDescriptor.daemonInitialization(); + ServerTestUtils.prepareServer(); + } + + @Test + public void testSimpleReadWrite() throws IOException + { + File directory = new File(Files.createTempDirectory("JournalTest")); + directory.deleteRecursiveOnExit(); + + Journal journal = + new Journal<>("TestJournal", directory, TestParams.INSTANCE, TimeUUIDKeySupport.INSTANCE, LongSerializer.INSTANCE, SegmentCompactor.noop()); + + journal.start(); + + TimeUUID id1 = nextTimeUUID(); + TimeUUID id2 = nextTimeUUID(); + TimeUUID id3 = nextTimeUUID(); + TimeUUID id4 = nextTimeUUID(); + + journal.blockingWrite(id1, 1L); + journal.blockingWrite(id2, 2L); + journal.blockingWrite(id3, 3L); + journal.blockingWrite(id4, 4L); + + assertEquals(1L, (long) journal.readLast(id1)); + assertEquals(2L, (long) journal.readLast(id2)); + assertEquals(3L, (long) journal.readLast(id3)); + assertEquals(4L, (long) journal.readLast(id4)); + + journal.shutdown(); + + journal = new Journal<>("TestJournal", directory, TestParams.INSTANCE, TimeUUIDKeySupport.INSTANCE, LongSerializer.INSTANCE, SegmentCompactor.noop()); + journal.start(); + + assertEquals(1L, (long) journal.readLast(id1)); + assertEquals(2L, (long) journal.readLast(id2)); + assertEquals(3L, (long) journal.readLast(id3)); + assertEquals(4L, (long) journal.readLast(id4)); + + journal.shutdown(); + } + + static class LongSerializer implements ValueSerializer + { + static final LongSerializer INSTANCE = new LongSerializer(); + + public int serializedSize(TimeUUID key, Long value, int userVersion) + { + return Long.BYTES; + } + + public void serialize(TimeUUID key, Long value, DataOutputPlus out, int userVersion) throws IOException + { + out.writeLong(value); + } + + public Long deserialize(TimeUUID key, DataInputPlus in, int userVersion) throws IOException + { + return in.readLong(); + } + } +} diff --git a/test/unit/org/apache/cassandra/journal/MetadataTest.java b/test/unit/org/apache/cassandra/journal/MetadataTest.java new file mode 100644 index 000000000000..2ad9c14edad4 --- /dev/null +++ b/test/unit/org/apache/cassandra/journal/MetadataTest.java @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.journal; + +import java.io.IOException; +import java.nio.ByteBuffer; + +import org.junit.Test; + +import org.apache.cassandra.io.util.DataInputBuffer; +import org.apache.cassandra.io.util.DataOutputBuffer; + +import static org.junit.Assert.assertEquals; + +public class MetadataTest +{ + @Test + public void testUpdate() + { + Metadata metadata = Metadata.create(); + + metadata.update(); + metadata.update(); + metadata.update(); + metadata.update(); + + assertEquals(4, metadata.totalCount()); + } + + @Test + public void testWriteRead() throws IOException + { + Metadata metadata = Metadata.create(); + + metadata.update(); + metadata.update(); + metadata.update(); + metadata.update(); + + try (DataOutputBuffer out = DataOutputBuffer.scratchBuffer.get()) + { + metadata.write(out); + ByteBuffer serialized = out.buffer(); + + try (DataInputBuffer in = new DataInputBuffer(serialized, false)) + { + Metadata deserialized = Metadata.read(in); + assertEquals(4, deserialized.totalCount()); + } + } + } +} diff --git a/test/unit/org/apache/cassandra/journal/SegmentTest.java b/test/unit/org/apache/cassandra/journal/SegmentTest.java new file mode 100644 index 000000000000..3c1e7fee8a06 --- /dev/null +++ b/test/unit/org/apache/cassandra/journal/SegmentTest.java @@ -0,0 +1,194 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.journal; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.file.Files; + +import org.junit.Test; + +import org.apache.cassandra.concurrent.ImmediateExecutor; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.io.util.File; +import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.TimeUUID; +import org.apache.cassandra.utils.concurrent.OpOrder; + +import static org.apache.cassandra.utils.TimeUUID.Generator.nextTimeUUID; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +public class SegmentTest +{ + @Test + public void testWriteReadActiveSegment() throws IOException + { + TimeUUID id1 = nextTimeUUID(); + TimeUUID id2 = nextTimeUUID(); + TimeUUID id3 = nextTimeUUID(); + TimeUUID id4 = nextTimeUUID(); + + ByteBuffer record1 = ByteBufferUtil.bytes("sample record 1"); + ByteBuffer record2 = ByteBufferUtil.bytes("sample record 2"); + ByteBuffer record3 = ByteBufferUtil.bytes("sample record 3"); + ByteBuffer record4 = ByteBufferUtil.bytes("sample record 4"); + + File directory = new File(Files.createTempDirectory(null)); + directory.deleteRecursiveOnExit(); + + Descriptor descriptor = Descriptor.create(directory, System.currentTimeMillis(), 1); + + ActiveSegment segment = ActiveSegment.create(descriptor, params(), TimeUUIDKeySupport.INSTANCE); + + segment.allocate(record1.remaining()).write(id1, record1); + segment.allocate(record2.remaining()).write(id2, record2); + segment.allocate(record3.remaining()).write(id3, record3); + segment.allocate(record4.remaining()).write(id4, record4); + + // read all 4 entries by id and compare with originals + EntrySerializer.EntryHolder holder = new EntrySerializer.EntryHolder<>(); + + segment.readLast(id1, holder); + assertEquals(id1, holder.key); + assertEquals(record1, holder.value); + + segment.readLast(id2, holder); + assertEquals(id2, holder.key); + assertEquals(record2, holder.value); + + segment.readLast(id3, holder); + assertEquals(id3, holder.key); + assertEquals(record3, holder.value); + + segment.readLast(id4, holder); + assertEquals(id4, holder.key); + assertEquals(record4, holder.value); + } + + @Test + public void testReadClosedSegmentByID() throws IOException + { + DatabaseDescriptor.daemonInitialization(); + + TimeUUID id1 = nextTimeUUID(); + TimeUUID id2 = nextTimeUUID(); + TimeUUID id3 = nextTimeUUID(); + TimeUUID id4 = nextTimeUUID(); + + ByteBuffer record1 = ByteBufferUtil.bytes("sample record 1"); + ByteBuffer record2 = ByteBufferUtil.bytes("sample record 2"); + ByteBuffer record3 = ByteBufferUtil.bytes("sample record 3"); + ByteBuffer record4 = ByteBufferUtil.bytes("sample record 4"); + + File directory = new File(Files.createTempDirectory(null)); + directory.deleteRecursiveOnExit(); + + Descriptor descriptor = Descriptor.create(directory, System.currentTimeMillis(), 1); + + ActiveSegment activeSegment = ActiveSegment.create(descriptor, params(), TimeUUIDKeySupport.INSTANCE); + + activeSegment.allocate(record1.remaining()).write(id1, record1); + activeSegment.allocate(record2.remaining()).write(id2, record2); + activeSegment.allocate(record3.remaining()).write(id3, record3); + activeSegment.allocate(record4.remaining()).write(id4, record4); + + activeSegment.close(null); + + StaticSegment staticSegment = StaticSegment.open(descriptor, TimeUUIDKeySupport.INSTANCE); + + // read all 4 entries by id and compare with originals + EntrySerializer.EntryHolder holder = new EntrySerializer.EntryHolder<>(); + + staticSegment.readLast(id1, holder); + assertEquals(id1, holder.key); + assertEquals(record1, holder.value); + + staticSegment.readLast(id2, holder); + assertEquals(id2, holder.key); + assertEquals(record2, holder.value); + + staticSegment.readLast(id3, holder); + assertEquals(id3, holder.key); + assertEquals(record3, holder.value); + + staticSegment.readLast(id4, holder); + assertEquals(id4, holder.key); + assertEquals(record4, holder.value); + } + + @Test + public void testReadClosedSegmentSequentially() throws IOException + { + TimeUUID id1 = nextTimeUUID(); + TimeUUID id2 = nextTimeUUID(); + TimeUUID id3 = nextTimeUUID(); + TimeUUID id4 = nextTimeUUID(); + + ByteBuffer record1 = ByteBufferUtil.bytes("sample record 1"); + ByteBuffer record2 = ByteBufferUtil.bytes("sample record 2"); + ByteBuffer record3 = ByteBufferUtil.bytes("sample record 3"); + ByteBuffer record4 = ByteBufferUtil.bytes("sample record 4"); + + File directory = new File(Files.createTempDirectory(null)); + directory.deleteRecursiveOnExit(); + + Descriptor descriptor = Descriptor.create(directory, System.currentTimeMillis(), 1); + + ActiveSegment activeSegment = ActiveSegment.create(descriptor, params(), TimeUUIDKeySupport.INSTANCE); + + activeSegment.allocate(record1.remaining()).write(id1, record1); + activeSegment.allocate(record2.remaining()).write(id2, record2); + activeSegment.allocate(record3.remaining()).write(id3, record3); + activeSegment.allocate(record4.remaining()).write(id4, record4); + + Segment.Tidier tidier = (Segment.Tidier)activeSegment.selfRef().tidier(); + tidier.executor = ImmediateExecutor.INSTANCE; + OpOrder opOrder = new OpOrder(); + tidier.await = opOrder.newBarrier(); + tidier.await.issue(); + activeSegment.close(null); + + StaticSegment.SequentialReader reader = StaticSegment.sequentialReader(descriptor, TimeUUIDKeySupport.INSTANCE, 0); + + // read all 4 entries sequentially and compare with originals + assertTrue(reader.advance()); + assertEquals(id1, reader.key()); + assertEquals(record1, reader.record()); + + assertTrue(reader.advance()); + assertEquals(id2, reader.key()); + assertEquals(record2, reader.record()); + + assertTrue(reader.advance()); + assertEquals(id3, reader.key()); + assertEquals(record3, reader.record()); + + assertTrue(reader.advance()); + assertEquals(id4, reader.key()); + assertEquals(record4, reader.record()); + + assertFalse(reader.advance()); + } + + private static Params params() + { + return TestParams.INSTANCE; + } +} diff --git a/test/unit/org/apache/cassandra/journal/TestParams.java b/test/unit/org/apache/cassandra/journal/TestParams.java new file mode 100644 index 000000000000..72c64b0e518e --- /dev/null +++ b/test/unit/org/apache/cassandra/journal/TestParams.java @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.journal; + +import java.util.concurrent.TimeUnit; + +import org.apache.cassandra.service.accord.serializers.Version; + +public class TestParams implements Params +{ + public static final TestParams INSTANCE = new TestParams(); + + @Override + public int segmentSize() + { + return 32 << 20; + } + + @Override + public FailurePolicy failurePolicy() + { + return FailurePolicy.STOP; + } + + @Override + public FlushMode flushMode() + { + return FlushMode.GROUP; + } + + @Override + public boolean enableCompaction() + { + return false; + } + + @Override + public long compactionPeriod(TimeUnit units) + { + return units.convert(60, TimeUnit.SECONDS); + } + + @Override + public long flushPeriod(TimeUnit units) + { + return units.convert(1, TimeUnit.SECONDS); + } + + @Override + public long periodicBlockPeriod(TimeUnit units) + { + return units.convert(2, TimeUnit.SECONDS); + } + + @Override + public int userVersion() + { + return Version.LATEST.version; + } +} diff --git a/test/unit/org/apache/cassandra/journal/TimeUUIDKeySupport.java b/test/unit/org/apache/cassandra/journal/TimeUUIDKeySupport.java new file mode 100644 index 000000000000..5694a29e7ab4 --- /dev/null +++ b/test/unit/org/apache/cassandra/journal/TimeUUIDKeySupport.java @@ -0,0 +1,100 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.journal; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.zip.Checksum; + +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.utils.TimeUUID; + +import static org.apache.cassandra.utils.FBUtilities.updateChecksumLong; + +class TimeUUIDKeySupport implements KeySupport +{ + static final TimeUUIDKeySupport INSTANCE = new TimeUUIDKeySupport(); + + @Override + public int serializedSize(int userVersion) + { + return 16; + } + + @Override + public void serialize(TimeUUID key, DataOutputPlus out, int userVersion) throws IOException + { + out.writeLong(key.uuidTimestamp()); + out.writeLong(key.lsb()); + } + + @Override + public void serialize(TimeUUID key, ByteBuffer out, int userVersion) throws IOException + { + out.putLong(key.uuidTimestamp()); + out.putLong(key.lsb()); + } + + @Override + public TimeUUID deserialize(DataInputPlus in, int userVersion) throws IOException + { + long uuidTimestamp = in.readLong(); + long lsb = in.readLong(); + return new TimeUUID(uuidTimestamp, lsb); + } + + @Override + public TimeUUID deserialize(ByteBuffer buffer, int position, int userVersion) + { + long uuidTimestamp = buffer.getLong(position); + long lsb = buffer.getLong(position + 8); + return new TimeUUID(uuidTimestamp, lsb); + } + + @Override + public TimeUUID deserialize(ByteBuffer buffer, int userVersion) + { + long uuidTimestamp = buffer.getLong(); + long lsb = buffer.getLong(); + return new TimeUUID(uuidTimestamp, lsb); + } + + @Override + public void updateChecksum(Checksum crc, TimeUUID key, int userVersion) + { + updateChecksumLong(crc, key.uuidTimestamp()); + updateChecksumLong(crc, key.lsb()); + } + + @Override + public int compareWithKeyAt(TimeUUID key, ByteBuffer buffer, int position, int userVersion) + { + long uuidTimestamp = buffer.getLong(position); + long lsb = buffer.getLong(position + 8); + return key.uuidTimestamp() != uuidTimestamp + ? Long.compare(key.uuidTimestamp(), uuidTimestamp) + : Long.compare(key.lsb(), lsb); + } + + @Override + public int compare(TimeUUID o1, TimeUUID o2) + { + return o1.compareTo(o2); + } +} diff --git a/test/unit/org/apache/cassandra/locator/AssureSufficientLiveNodesTest.java b/test/unit/org/apache/cassandra/locator/AssureSufficientLiveNodesTest.java index 9e9ff605df99..008c3854e268 100644 --- a/test/unit/org/apache/cassandra/locator/AssureSufficientLiveNodesTest.java +++ b/test/unit/org/apache/cassandra/locator/AssureSufficientLiveNodesTest.java @@ -32,23 +32,22 @@ import com.google.common.collect.ImmutableList; import com.google.common.util.concurrent.Uninterruptibles; +import org.apache.cassandra.schema.*; import org.junit.BeforeClass; import org.junit.Test; import org.junit.runner.RunWith; +import org.apache.cassandra.CassandraTestBase; +import org.apache.cassandra.CassandraTestBase.PrepareServerNoRegister; +import org.apache.cassandra.CassandraTestBase.UseMurmur3Partitioner; import org.apache.cassandra.SchemaLoader; -import org.apache.cassandra.ServerTestUtils; -import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.db.Keyspace; import org.apache.cassandra.dht.Murmur3Partitioner; import org.apache.cassandra.dht.Token; import org.apache.cassandra.distributed.test.log.ClusterMetadataTestHelper; import org.apache.cassandra.exceptions.UnavailableException; -import org.apache.cassandra.schema.KeyspaceMetadata; -import org.apache.cassandra.schema.KeyspaceParams; -import org.apache.cassandra.schema.SchemaTestUtil; -import org.apache.cassandra.schema.Tables; import org.apache.cassandra.service.reads.NeverSpeculativeRetryPolicy; +import org.apache.cassandra.service.reads.ReadCoordinator; import org.apache.cassandra.utils.FBUtilities; import org.jboss.byteman.contrib.bmunit.BMRule; import org.jboss.byteman.contrib.bmunit.BMUnitRunner; @@ -69,7 +68,9 @@ targetClass = "FailureDetector", targetMethod = "isAlive", action = "return true;") -public class AssureSufficientLiveNodesTest +@PrepareServerNoRegister +@UseMurmur3Partitioner +public class AssureSufficientLiveNodesTest extends CassandraTestBase { private static final AtomicInteger testIdGen = new AtomicInteger(0); private static final Supplier keyspaceNameGen = () -> "race_" + testIdGen.getAndIncrement(); @@ -78,13 +79,11 @@ public class AssureSufficientLiveNodesTest private static final String DC3 = "datacenter3"; private static final int RACE_TEST_LOOPS = 100; private static final Token tk = new Murmur3Partitioner.LongToken(0); + private static final TableId TABLE_ID = TableId.generate(); @BeforeClass public static void setUpClass() throws Throwable { - ServerTestUtils.daemonInitialization(); - DatabaseDescriptor.setPartitionerUnsafe(Murmur3Partitioner.instance); - ServerTestUtils.prepareServerNoRegister(); // Register peers with expected DC for NetworkTopologyStrategy. List instances = ImmutableList.of( // datacenter 1 @@ -139,7 +138,7 @@ public void addDatacenterShouldNotCausesUnavailableWithEachQuorumTest() throws T // alter to KeyspaceParams.nts(DC1, 3, DC2, 3), // test - keyspace -> ReplicaPlans.forRead(keyspace, tk, null, EACH_QUORUM, NeverSpeculativeRetryPolicy.INSTANCE) + keyspace -> ReplicaPlans.forRead(keyspace, TABLE_ID, tk, null, EACH_QUORUM, NeverSpeculativeRetryPolicy.INSTANCE, ReadCoordinator.DEFAULT) ); } @@ -172,7 +171,7 @@ public void addDatacenterShouldNotCausesUnavailableWithQuorumTest() throws Throw // alter to KeyspaceParams.nts(DC1, 3, DC2, 3), // test - keyspace -> ReplicaPlans.forRead(keyspace, tk, null, QUORUM, NeverSpeculativeRetryPolicy.INSTANCE) + keyspace -> ReplicaPlans.forRead(keyspace, TABLE_ID, tk, null, QUORUM, NeverSpeculativeRetryPolicy.INSTANCE, ReadCoordinator.DEFAULT) ); raceOfReplicationStrategyTest( // init. The # of live endpoints is 3 = 2 + 1 @@ -180,7 +179,7 @@ public void addDatacenterShouldNotCausesUnavailableWithQuorumTest() throws Throw // alter to. (3 + 3) / 2 + 1 > 3 KeyspaceParams.nts(DC1, 2, DC2, 1, DC3, 3), // test - keyspace -> ReplicaPlans.forRead(keyspace, tk, null, QUORUM, NeverSpeculativeRetryPolicy.INSTANCE) + keyspace -> ReplicaPlans.forRead(keyspace, TABLE_ID, tk, null, QUORUM, NeverSpeculativeRetryPolicy.INSTANCE, ReadCoordinator.DEFAULT) ); } @@ -204,7 +203,7 @@ public void raceOnRemoveDatacenterNotCausesUnavailable() throws Throwable // alter to KeyspaceParams.nts(DC1, 3), // test - keyspace -> ReplicaPlans.forRead(keyspace, tk, null, EACH_QUORUM, NeverSpeculativeRetryPolicy.INSTANCE) + keyspace -> ReplicaPlans.forRead(keyspace, TABLE_ID, tk, null, EACH_QUORUM, NeverSpeculativeRetryPolicy.INSTANCE, ReadCoordinator.DEFAULT) ); } @@ -228,7 +227,7 @@ public void increaseReplicationFactorShouldNotCausesUnavailableTest() throws Thr // alter to KeyspaceParams.nts(DC1, 3), // test - keyspace -> ReplicaPlans.forRead(keyspace, tk, null, LOCAL_QUORUM, NeverSpeculativeRetryPolicy.INSTANCE) + keyspace -> ReplicaPlans.forRead(keyspace, TABLE_ID, tk, null, LOCAL_QUORUM, NeverSpeculativeRetryPolicy.INSTANCE, ReadCoordinator.DEFAULT) ); } diff --git a/test/unit/org/apache/cassandra/locator/MetaStrategyTest.java b/test/unit/org/apache/cassandra/locator/MetaStrategyTest.java index c6a96fa71eb2..35ab27d426b9 100644 --- a/test/unit/org/apache/cassandra/locator/MetaStrategyTest.java +++ b/test/unit/org/apache/cassandra/locator/MetaStrategyTest.java @@ -32,6 +32,9 @@ import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.dht.Murmur3Partitioner; import org.apache.cassandra.schema.DistributedSchema; +import org.apache.cassandra.service.accord.AccordFastPath; +import org.apache.cassandra.service.accord.AccordStaleReplicas; +import org.apache.cassandra.service.consensus.migration.ConsensusMigrationState; import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.tcm.Epoch; import org.apache.cassandra.tcm.membership.Directory; @@ -87,9 +90,12 @@ public static ClusterMetadata metadata(NodeConfiguration... configurations) directory, tokenMap, DataPlacements.EMPTY, + AccordFastPath.EMPTY, LockedRanges.EMPTY, InProgressSequences.EMPTY, - ImmutableMap.of()); + ConsensusMigrationState.EMPTY, + ImmutableMap.of(), + AccordStaleReplicas.EMPTY); } @Test @@ -155,4 +161,4 @@ public static Location location(String dc, String rack) { return new Location(dc, rack); } -} \ No newline at end of file +} diff --git a/test/unit/org/apache/cassandra/locator/NetworkTopologyStrategyTest.java b/test/unit/org/apache/cassandra/locator/NetworkTopologyStrategyTest.java index fc235526fcef..e59c847f7746 100644 --- a/test/unit/org/apache/cassandra/locator/NetworkTopologyStrategyTest.java +++ b/test/unit/org/apache/cassandra/locator/NetworkTopologyStrategyTest.java @@ -20,29 +20,42 @@ import java.io.IOException; import java.net.UnknownHostException; -import java.util.*; +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import java.util.Random; +import java.util.Set; import java.util.stream.Collectors; import com.google.common.collect.HashMultimap; import com.google.common.collect.ImmutableMap; import com.google.common.collect.Lists; import com.google.common.collect.Multimap; - -import org.junit.*; +import org.junit.After; +import org.junit.Assert; +import org.junit.Rule; +import org.junit.Test; import org.junit.rules.ExpectedException; - import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.cassandra.CassandraTestBase; +import org.apache.cassandra.CassandraTestBase.DisableMBeanRegistration; +import org.apache.cassandra.CassandraTestBase.PrepareServerNoRegister; import org.apache.cassandra.ServerTestUtils; import org.apache.cassandra.Util; -import org.apache.cassandra.config.DatabaseDescriptor; -import org.apache.cassandra.dht.*; +import org.apache.cassandra.dht.Murmur3Partitioner; import org.apache.cassandra.dht.Murmur3Partitioner.LongToken; import org.apache.cassandra.dht.OrderPreservingPartitioner.StringToken; +import org.apache.cassandra.dht.Range; +import org.apache.cassandra.dht.Token; import org.apache.cassandra.distributed.test.log.ClusterMetadataTestHelper; import org.apache.cassandra.exceptions.ConfigurationException; -import org.apache.cassandra.tcm.ClusterMetadataService; import org.apache.cassandra.service.ClientWarn; import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.tcm.RegistrationStatus; @@ -59,20 +72,13 @@ import static org.apache.cassandra.locator.SimpleLocationProvider.LOCATION; import static org.junit.Assert.assertTrue; -public class NetworkTopologyStrategyTest +@PrepareServerNoRegister +@DisableMBeanRegistration +public class NetworkTopologyStrategyTest extends CassandraTestBase { private static final String KEYSPACE = "ks1"; private static final Logger logger = LoggerFactory.getLogger(NetworkTopologyStrategyTest.class); - @BeforeClass - public static void setupDD() - { - DatabaseDescriptor.daemonInitialization(); - DatabaseDescriptor.setPartitionerUnsafe(OrderPreservingPartitioner.instance); - DatabaseDescriptor.setTransientReplicationEnabledUnsafe(true); - ClusterMetadataService.setInstance(ClusterMetadataTestHelper.instanceForTest()); - } - @After public void teardown() { @@ -80,6 +86,7 @@ public void teardown() } @Test + @UseOrderPreservingPartitioner public void testProperties() throws IOException, ConfigurationException { createDummyTokens(true); @@ -104,6 +111,7 @@ public void testProperties() throws IOException, ConfigurationException } @Test + @UseOrderPreservingPartitioner public void testPropertiesWithEmptyDC() throws IOException, ConfigurationException { createDummyTokens(false); @@ -126,6 +134,7 @@ public void testPropertiesWithEmptyDC() throws IOException, ConfigurationExcepti } @Test + @UseOrderPreservingPartitioner public void testLargeCluster() throws UnknownHostException, ConfigurationException { int[] dcRacks = new int[]{2, 4, 8}; @@ -201,47 +210,45 @@ public void tokenFactory(String token, byte[] bytes, Location location) throws U } @Test + @UseMurmur3Partitioner public void testCalculateEndpoints() throws UnknownHostException { final int NODES = 100; final int VNODES = 64; final int RUNS = 10; - try (WithPartitioner m3p = new WithPartitioner(Murmur3Partitioner.instance)) + Map datacenters = ImmutableMap.of("rf1", 1, "rf3", 3, "rf5_1", 5, "rf5_2", 5, "rf5_3", 5); + List nodes = new ArrayList<>(NODES); + for (byte i = 0; i < NODES; ++i) + nodes.add(InetAddressAndPort.getByAddress(new byte[]{ 127, 0, 0, i })); + for (int run = 0; run < RUNS; ++run) { - Map datacenters = ImmutableMap.of("rf1", 1, "rf3", 3, "rf5_1", 5, "rf5_2", 5, "rf5_3", 5); - List nodes = new ArrayList<>(NODES); - for (byte i = 0; i < NODES; ++i) - nodes.add(InetAddressAndPort.getByAddress(new byte[]{ 127, 0, 0, i })); - for (int run = 0; run < RUNS; ++run) - { - ServerTestUtils.resetCMS(); - Random rand = new Random(run); - Locator locator = generateLocator(datacenters, nodes, rand); + ServerTestUtils.resetCMS(); + Random rand = new Random(run); + Locator locator = generateLocator(datacenters, nodes, rand); - for (int i = 0; i < NODES; ++i) // Nodes + for (int i = 0; i < NODES; ++i) // Nodes + { + Set tokens = new HashSet<>(); + while (tokens.size() < VNODES) // tokens/vnodes per node { - Set tokens = new HashSet<>(); - while (tokens.size() < VNODES) // tokens/vnodes per node - { - tokens.add(Murmur3Partitioner.instance.getRandomToken(rand)); - } - // Here we fake the registration status because we want all the nodes to be registered in cluster - // metadata using the locations we setup in generateLocator. This registration occurs as a part of - // the addEndpoint call here and behaves as expected for all nodes _except_ the one with the address - // which matches the local broadcast address (i.e. 127.0.0.1, which is #2 in the list of nodes). - // The location we want this to be registered with is {DC: rf5_1, rack: 3}, but while - // RegistrationStatus.instance indicates that the node is yet to be registered, the Locator will - // correctly return the initialization location obtained from - // DatabaseDescriptor::getInitialLocationProvider, which ultimately resolves to - // SimpleLocationProvider (because test/conf/cassandra.yaml specifies use of SimpleSnitch) and so - // we register that one node with the location {DC: datacenter1, rack: rack1}. - // This is purely an artefact of the contrived testing setup and in more realistic scenarios, - // including the majority of tests, isn't an issue. - RegistrationStatus.instance.onRegistration(); - ClusterMetadataTestHelper.addEndpoint(nodes.get(i), tokens, locator.location(nodes.get(i))); + tokens.add(Murmur3Partitioner.instance.getRandomToken(rand)); } - testEquivalence(ClusterMetadata.current(), locator, datacenters, rand); + // Here we fake the registration status because we want all the nodes to be registered in cluster + // metadata using the locations we setup in generateLocator. This registration occurs as a part of + // the addEndpoint call here and behaves as expected for all nodes _except_ the one with the address + // which matches the local broadcast address (i.e. 127.0.0.1, which is #2 in the list of nodes). + // The location we want this to be registered with is {DC: rf5_1, rack: 3}, but while + // RegistrationStatus.instance indicates that the node is yet to be registered, the Locator will + // correctly return the initialization location obtained from + // DatabaseDescriptor::getInitialLocationProvider, which ultimately resolves to + // SimpleLocationProvider (because test/conf/cassandra.yaml specifies use of SimpleSnitch) and so + // we register that one node with the location {DC: datacenter1, rack: rack1}. + // This is purely an artefact of the contrived testing setup and in more realistic scenarios, + // including the majority of tests, isn't an issue. + RegistrationStatus.instance.onRegistration(); + ClusterMetadataTestHelper.addEndpoint(nodes.get(i), tokens, locator.location(nodes.get(i))); } + testEquivalence(ClusterMetadata.current(), locator, datacenters, rand); } } @@ -438,35 +445,32 @@ private static Range range(long l, long r) } @Test + @UseMurmur3Partitioner public void testTransientReplica() throws Exception { - try (WithPartitioner m3p = new WithPartitioner(Murmur3Partitioner.instance)) - { - List endpoints = Lists.newArrayList(InetAddressAndPort.getByName("127.0.0.1"), - InetAddressAndPort.getByName("127.0.0.2"), - InetAddressAndPort.getByName("127.0.0.3"), - InetAddressAndPort.getByName("127.0.0.4")); - - ClusterMetadataTestHelper.addEndpoint(endpoints.get(0), tk(100), LOCATION); - ClusterMetadataTestHelper.addEndpoint(endpoints.get(1), tk(200), LOCATION); - ClusterMetadataTestHelper.addEndpoint(endpoints.get(2), tk(300), LOCATION); - ClusterMetadataTestHelper.addEndpoint(endpoints.get(3), tk(400), LOCATION); - - Map configOptions = new HashMap<>(); - configOptions.put(LOCATION.datacenter, "3/1"); - NetworkTopologyStrategy strategy = new NetworkTopologyStrategy(KEYSPACE, configOptions); - - Util.assertRCEquals(EndpointsForRange.of(fullReplica(endpoints.get(0), range(400, 100)), - fullReplica(endpoints.get(1), range(400, 100)), - transientReplica(endpoints.get(2), range(400, 100))), - strategy.calculateNaturalReplicas(tk(99), ClusterMetadata.current())); - - - Util.assertRCEquals(EndpointsForRange.of(fullReplica(endpoints.get(1), range(100, 200)), - fullReplica(endpoints.get(2), range(100, 200)), - transientReplica(endpoints.get(3), range(100, 200))), - strategy.calculateNaturalReplicas(tk(101), ClusterMetadata.current())); - } + List endpoints = Lists.newArrayList(InetAddressAndPort.getByName("127.0.0.1"), + InetAddressAndPort.getByName("127.0.0.2"), + InetAddressAndPort.getByName("127.0.0.3"), + InetAddressAndPort.getByName("127.0.0.4")); + + ClusterMetadataTestHelper.addEndpoint(endpoints.get(0), tk(100), LOCATION); + ClusterMetadataTestHelper.addEndpoint(endpoints.get(1), tk(200), LOCATION); + ClusterMetadataTestHelper.addEndpoint(endpoints.get(2), tk(300), LOCATION); + ClusterMetadataTestHelper.addEndpoint(endpoints.get(3), tk(400), LOCATION); + + Map configOptions = new HashMap<>(); + configOptions.put(LOCATION.datacenter, "3/1"); + NetworkTopologyStrategy strategy = new NetworkTopologyStrategy(KEYSPACE, configOptions); + Util.assertRCEquals(EndpointsForRange.of(fullReplica(endpoints.get(0), range(400, 100)), + fullReplica(endpoints.get(1), range(400, 100)), + transientReplica(endpoints.get(2), range(400, 100))), + strategy.calculateNaturalReplicas(tk(99), ClusterMetadata.current())); + + + Util.assertRCEquals(EndpointsForRange.of(fullReplica(endpoints.get(1), range(100, 200)), + fullReplica(endpoints.get(2), range(100, 200)), + transientReplica(endpoints.get(3), range(100, 200))), + strategy.calculateNaturalReplicas(tk(101), ClusterMetadata.current())); } @Rule @@ -486,6 +490,7 @@ public void shouldRejectReplicationFactorOption() throws ConfigurationException } @Test + @UseOrderPreservingPartitioner public void shouldWarnOnHigherReplicationFactorThanNodesInDC() { HashMap configOptions = new HashMap<>(); diff --git a/test/unit/org/apache/cassandra/locator/NodeProximityEndpointCompareTest.java b/test/unit/org/apache/cassandra/locator/NodeProximityEndpointCompareTest.java new file mode 100644 index 000000000000..472104697883 --- /dev/null +++ b/test/unit/org/apache/cassandra/locator/NodeProximityEndpointCompareTest.java @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.locator; + +import java.lang.reflect.Constructor; +import java.lang.reflect.InvocationTargetException; +import java.lang.reflect.Modifier; + +import org.assertj.core.api.Assertions; +import org.junit.Test; +import org.reflections.Reflections; +import org.reflections.scanners.Scanners; +import org.reflections.util.ConfigurationBuilder; + +import org.apache.cassandra.config.DatabaseDescriptor; + +public class NodeProximityEndpointCompareTest +{ + static + { + DatabaseDescriptor.clientInitialization(); + } + + @Test + public void allSupportEndpoint() throws InvocationTargetException, InstantiationException, IllegalAccessException + { + Reflections reflections = new Reflections(new ConfigurationBuilder() + .forPackage("org.apache.cassandra") + .setScanners(Scanners.SubTypes) + .setExpandSuperTypes(true)); + + for (Class klass : reflections.getSubTypesOf(NodeProximity.class)) + { + if (Modifier.isAbstract(klass.getModifiers()) + || Modifier.isPrivate(klass.getModifiers()) // private can not be created normally, so these are scoped to tests and can be ignored + || klass.isAnonymousClass()) + continue; + Constructor declaredConstructor; + try + { + declaredConstructor = klass.getDeclaredConstructor(); + } + catch (NoSuchMethodException e) + { + // DynamicEndpointSnitch or test snitch... we can not create this normally + continue; + } + if (Modifier.isPrivate(declaredConstructor.getModifiers())) + continue; + NodeProximity proximity = declaredConstructor.newInstance(); + Assertions.assertThat(proximity.supportCompareByEndpoint()) + .describedAs("Snitch %s does not support compare by endpoint!", proximity.getClass()) + .isTrue(); + } + } +} \ No newline at end of file diff --git a/test/unit/org/apache/cassandra/locator/PropertyFileSnitchTest.java b/test/unit/org/apache/cassandra/locator/PropertyFileSnitchTest.java index 35bdb98d8d0d..d2bbd3a0805d 100644 --- a/test/unit/org/apache/cassandra/locator/PropertyFileSnitchTest.java +++ b/test/unit/org/apache/cassandra/locator/PropertyFileSnitchTest.java @@ -31,10 +31,11 @@ import org.junit.After; import org.junit.Before; -import org.junit.BeforeClass; import org.junit.Test; -import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.CassandraTestBase; +import org.apache.cassandra.CassandraTestBase.DDDaemonInitialization; +import org.apache.cassandra.CassandraTestBase.UseRandomPartitioner; import org.apache.cassandra.exceptions.ConfigurationException; import org.apache.cassandra.tcm.ClusterMetadataService; import org.apache.cassandra.tcm.StubClusterMetadataService; @@ -48,18 +49,14 @@ /** * Unit tests for {@link PropertyFileSnitch}. */ -public class PropertyFileSnitchTest +@DDDaemonInitialization +@UseRandomPartitioner +public class PropertyFileSnitchTest extends CassandraTestBase { private Path effectiveFile; private Path backupFile; private InetAddressAndPort localAddress; - @BeforeClass - public static void setupDD() - { - DatabaseDescriptor.daemonInitialization(); - } - @Before public void setup() throws ConfigurationException, IOException { diff --git a/test/unit/org/apache/cassandra/locator/SimpleStrategyTest.java b/test/unit/org/apache/cassandra/locator/SimpleStrategyTest.java index 874e50d17a56..625e2e65ef85 100644 --- a/test/unit/org/apache/cassandra/locator/SimpleStrategyTest.java +++ b/test/unit/org/apache/cassandra/locator/SimpleStrategyTest.java @@ -28,38 +28,47 @@ import com.google.common.collect.HashMultimap; import com.google.common.collect.Lists; import com.google.common.collect.Multimap; -import org.junit.*; +import org.junit.Before; +import org.junit.Rule; +import org.junit.Test; import org.junit.rules.ExpectedException; +import org.apache.cassandra.CassandraTestBase; +import org.apache.cassandra.CassandraTestBase.DisableMBeanRegistration; +import org.apache.cassandra.CassandraTestBase.PrepareServerNoRegister; import org.apache.cassandra.SchemaLoader; -import org.apache.cassandra.ServerTestUtils; import org.apache.cassandra.Util; import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.db.Keyspace; -import org.apache.cassandra.dht.*; +import org.apache.cassandra.dht.Murmur3Partitioner; +import org.apache.cassandra.dht.OrderPreservingPartitioner; import org.apache.cassandra.dht.OrderPreservingPartitioner.StringToken; import org.apache.cassandra.dht.RandomPartitioner.BigIntegerToken; +import org.apache.cassandra.dht.Range; +import org.apache.cassandra.dht.Token; import org.apache.cassandra.distributed.test.log.ClusterMetadataTestHelper; import org.apache.cassandra.exceptions.ConfigurationException; -import org.apache.cassandra.schema.ReplicationParams; -import org.apache.cassandra.schema.SchemaConstants; -import org.apache.cassandra.tcm.transformations.Register; import org.apache.cassandra.schema.KeyspaceMetadata; import org.apache.cassandra.schema.KeyspaceParams; +import org.apache.cassandra.schema.ReplicationParams; import org.apache.cassandra.schema.Schema; +import org.apache.cassandra.schema.SchemaConstants; import org.apache.cassandra.service.ClientWarn; import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.tcm.membership.Location; import org.apache.cassandra.tcm.membership.NodeVersion; +import org.apache.cassandra.tcm.transformations.Register; import org.apache.cassandra.utils.ByteBufferUtil; -import static org.apache.cassandra.config.CassandraRelevantProperties.ORG_APACHE_CASSANDRA_DISABLE_MBEAN_REGISTRATION; import static org.apache.cassandra.ServerTestUtils.recreateCMS; +import static org.apache.cassandra.config.CassandraRelevantProperties.ORG_APACHE_CASSANDRA_DISABLE_MBEAN_REGISTRATION; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; -public class SimpleStrategyTest +@PrepareServerNoRegister +@DisableMBeanRegistration +public class SimpleStrategyTest extends CassandraTestBase { public static final String KEYSPACE1 = "SimpleStrategyTest"; public static final String MULTIDC = "MultiDCSimpleStrategyTest"; @@ -69,16 +78,9 @@ public class SimpleStrategyTest ORG_APACHE_CASSANDRA_DISABLE_MBEAN_REGISTRATION.setBoolean(true); } - @BeforeClass - public static void defineSchema() - { - DatabaseDescriptor.daemonInitialization(); - } - - public static void withPartitioner(IPartitioner partitioner) + @Before + public void defineSchema() { - DatabaseDescriptor.setPartitionerUnsafe(partitioner); - ServerTestUtils.prepareServerNoRegister(); recreateCMS(); SchemaLoader.createKeyspace(KEYSPACE1, KeyspaceParams.simple(1)); SchemaLoader.createKeyspace(MULTIDC, KeyspaceParams.simple(3)); @@ -92,9 +94,10 @@ public void tryValidKeyspace() } @Test + @UseRandomPartitioner public void testBigIntegerEndpoints() throws UnknownHostException { - withPartitioner(RandomPartitioner.instance); + defineSchema(); List endpointTokens = new ArrayList<>(); List keyTokens = new ArrayList<>(); for (int i = 0; i < 5; i++) { @@ -105,24 +108,23 @@ public void testBigIntegerEndpoints() throws UnknownHostException } @Test + @UseOrderPreservingPartitioner public void testStringEndpoints() throws UnknownHostException { - IPartitioner partitioner = OrderPreservingPartitioner.instance; - withPartitioner(partitioner); + defineSchema(); List endpointTokens = new ArrayList(); List keyTokens = new ArrayList(); for (int i = 0; i < 5; i++) { endpointTokens.add(new StringToken(String.valueOf((char)('a' + i * 2)))); - keyTokens.add(partitioner.getToken(ByteBufferUtil.bytes(String.valueOf((char) ('a' + i * 2 + 1))))); + keyTokens.add(OrderPreservingPartitioner.instance.getToken(ByteBufferUtil.bytes(String.valueOf((char) ('a' + i * 2 + 1))))); } verifyGetNaturalEndpoints(endpointTokens.toArray(new Token[0]), keyTokens.toArray(new Token[0])); } @Test + @UseMurmur3Partitioner public void testMultiDCSimpleStrategyEndpoints() throws UnknownHostException { - withPartitioner(Murmur3Partitioner.instance); - // Topology taken directly from the topology_test.test_size_estimates_multidc dtest that regressed Multimap dc1 = HashMultimap.create(); dc1.put(InetAddressAndPort.getByName("127.0.0.1"), new Murmur3Partitioner.LongToken(-6639341390736545756L)); @@ -186,9 +188,10 @@ private void verifyGetNaturalEndpoints(Token[] endpointTokens, Token[] keyTokens } @Test + @UseRandomPartitioner public void testGetEndpointsDuringBootstrap() throws UnknownHostException, ExecutionException, InterruptedException { - withPartitioner(RandomPartitioner.instance); + defineSchema(); // the token difference will be RING_SIZE * 2. final int RING_SIZE = 10; @@ -264,10 +267,9 @@ private static Range range(long l, long r) } @Test + @UseMurmur3Partitioner public void transientReplica() throws Exception { - withPartitioner(Murmur3Partitioner.instance); - List endpoints = Lists.newArrayList(InetAddressAndPort.getByName("127.0.0.1"), InetAddressAndPort.getByName("127.0.0.2"), InetAddressAndPort.getByName("127.0.0.3"), @@ -305,9 +307,10 @@ public void transientReplica() throws Exception public ExpectedException expectedEx = ExpectedException.none(); @Test + @UseMurmur3Partitioner public void testSimpleStrategyThrowsConfigurationException() throws ConfigurationException, UnknownHostException { - withPartitioner(Murmur3Partitioner.instance); + defineSchema(); expectedEx.expect(ConfigurationException.class); expectedEx.expectMessage("SimpleStrategy requires a replication_factor strategy option."); @@ -327,9 +330,10 @@ public void testSimpleStrategyThrowsConfigurationException() throws Configuratio } @Test + @UseMurmur3Partitioner public void shouldReturnNoEndpointsForEmptyRing() { - withPartitioner(Murmur3Partitioner.instance); + defineSchema(); HashMap configOptions = new HashMap<>(); configOptions.put("replication_factor", "1"); @@ -341,9 +345,10 @@ public void shouldReturnNoEndpointsForEmptyRing() } @Test + @UseMurmur3Partitioner public void shouldWarnOnHigherReplicationFactorThanNodes() { - withPartitioner(Murmur3Partitioner.instance); + defineSchema(); HashMap configOptions = new HashMap<>(); configOptions.put("replication_factor", "2"); diff --git a/test/unit/org/apache/cassandra/net/ConnectionTest.java b/test/unit/org/apache/cassandra/net/ConnectionTest.java index 42c137cc5f63..233d067d1215 100644 --- a/test/unit/org/apache/cassandra/net/ConnectionTest.java +++ b/test/unit/org/apache/cassandra/net/ConnectionTest.java @@ -57,9 +57,10 @@ import io.netty.channel.ChannelPromise; import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.config.EncryptionOptions; +import org.apache.cassandra.config.EncryptionOptions.ServerEncryptionOptions.Builder; import org.apache.cassandra.db.commitlog.CommitLog; import org.apache.cassandra.distributed.test.log.ClusterMetadataTestHelper; -import org.apache.cassandra.exceptions.RequestFailureReason; +import org.apache.cassandra.exceptions.RequestFailure; import org.apache.cassandra.exceptions.UnknownColumnException; import org.apache.cassandra.io.IVersionedAsymmetricSerializer; import org.apache.cassandra.io.IVersionedSerializer; @@ -72,13 +73,13 @@ import static java.util.concurrent.TimeUnit.MILLISECONDS; import static java.util.concurrent.TimeUnit.MINUTES; import static java.util.concurrent.TimeUnit.SECONDS; -import static org.apache.cassandra.config.EncryptionOptions.ClientAuth.NOT_REQUIRED; +import static org.apache.cassandra.config.EncryptionOptions.ClientEncryptionOptions.ClientAuth.NOT_REQUIRED; import static org.apache.cassandra.net.MessagingService.VERSION_40; import static org.apache.cassandra.net.NoPayload.noPayload; import static org.apache.cassandra.net.MessagingService.current_version; import static org.apache.cassandra.net.ConnectionType.LARGE_MESSAGES; import static org.apache.cassandra.net.ConnectionType.SMALL_MESSAGES; -import static org.apache.cassandra.net.ConnectionUtils.*; +import static org.apache.cassandra.net.ConnectionUtils.check; import static org.apache.cassandra.net.OutboundConnectionSettings.Framing.LZ4; import static org.apache.cassandra.net.OutboundConnections.LARGE_MESSAGE_THRESHOLD; import static org.apache.cassandra.utils.Clock.Global.nanoTime; @@ -176,17 +177,18 @@ Settings override(Settings settings) } } - static final EncryptionOptions.ServerEncryptionOptions encryptionOptions = - new EncryptionOptions.ServerEncryptionOptions() + static final EncryptionOptions.Builder encryptionOptionsBuilder = + new Builder() .withLegacySslStoragePort(true) - .withOptional(true) .withInternodeEncryption(EncryptionOptions.ServerEncryptionOptions.InternodeEncryption.all) + .withOptional(true) .withKeyStore(TlsTestUtils.SERVER_KEYSTORE_PATH) .withKeyStorePassword(TlsTestUtils.SERVER_KEYSTORE_PASSWORD) .withTrustStore(TlsTestUtils.SERVER_TRUSTSTORE_PATH) .withTrustStorePassword(TlsTestUtils.SERVER_TRUSTSTORE_PASSWORD) .withRequireClientAuth(NOT_REQUIRED) .withCipherSuites("TLS_RSA_WITH_AES_128_CBC_SHA"); + static final EncryptionOptions.ServerEncryptionOptions encryptionOptions = encryptionOptionsBuilder.build(); static final List> MODIFIERS = ImmutableList.of( settings -> settings.outbound(outbound -> outbound.withEncryption(encryptionOptions)) @@ -388,7 +390,7 @@ public long serializedSize(Object o, int version) MessagingService.instance().callbacks.addWithExpiration(new RequestCallback() { @Override - public void onFailure(InetAddressAndPort from, RequestFailureReason failureReason) + public void onFailure(InetAddressAndPort from, RequestFailure failure) { done.countDown(); } diff --git a/test/unit/org/apache/cassandra/net/HandshakeTest.java b/test/unit/org/apache/cassandra/net/HandshakeTest.java index c84643497d5d..2806bac86d18 100644 --- a/test/unit/org/apache/cassandra/net/HandshakeTest.java +++ b/test/unit/org/apache/cassandra/net/HandshakeTest.java @@ -39,6 +39,7 @@ import io.netty.util.concurrent.Future; import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.config.EncryptionOptions.ServerEncryptionOptions; +import org.apache.cassandra.config.EncryptionOptions.ServerEncryptionOptions.Builder; import org.apache.cassandra.config.ParameterizedClass; import org.apache.cassandra.db.commitlog.CommitLog; import org.apache.cassandra.distributed.test.log.ClusterMetadataTestHelper; @@ -49,9 +50,9 @@ import org.apache.cassandra.transport.TlsTestUtils; import org.apache.cassandra.utils.concurrent.AsyncPromise; +import static org.apache.cassandra.config.EncryptionOptions.ClientEncryptionOptions.ClientAuth.NOT_REQUIRED; +import static org.apache.cassandra.config.EncryptionOptions.ClientEncryptionOptions.ClientAuth.REQUIRED; import static org.apache.cassandra.net.ConnectionType.SMALL_MESSAGES; -import static org.apache.cassandra.config.EncryptionOptions.ClientAuth.NOT_REQUIRED; -import static org.apache.cassandra.config.EncryptionOptions.ClientAuth.REQUIRED; import static org.apache.cassandra.net.MessagingService.current_version; import static org.apache.cassandra.net.MessagingService.minimum_version; import static org.apache.cassandra.net.OutboundConnectionInitiator.Result; @@ -279,26 +280,28 @@ public void testOutboundConnectionDoesntFallbackWhenErrorIsNotSSLRelated() throw private ServerEncryptionOptions getServerEncryptionOptions(SslFallbackConnectionType sslConnectionType, boolean optional) { - ServerEncryptionOptions serverEncryptionOptions = new ServerEncryptionOptions().withOptional(optional) - .withKeyStore(TlsTestUtils.SERVER_KEYSTORE_PATH) - .withKeyStorePassword(TlsTestUtils.SERVER_KEYSTORE_PASSWORD) - .withOutboundKeystore(TlsTestUtils.SERVER_OUTBOUND_KEYSTORE_PATH) - .withOutboundKeystorePassword(TlsTestUtils.SERVER_OUTBOUND_KEYSTORE_PASSWORD) - .withTrustStore(TlsTestUtils.SERVER_TRUSTSTORE_PATH) - .withTrustStorePassword(TlsTestUtils.SERVER_TRUSTSTORE_PASSWORD) - .withSslContextFactory((new ParameterizedClass(DefaultSslContextFactory.class.getName(), - new HashMap<>()))); + Builder serverEncryptionOptionsBuilder = new Builder(); + + serverEncryptionOptionsBuilder.withOutboundKeystore(TlsTestUtils.SERVER_OUTBOUND_KEYSTORE_PATH) + .withOutboundKeystorePassword(TlsTestUtils.SERVER_OUTBOUND_KEYSTORE_PASSWORD) + .withOptional(optional) + .withKeyStore(TlsTestUtils.SERVER_KEYSTORE_PATH) + .withKeyStorePassword(TlsTestUtils.SERVER_KEYSTORE_PASSWORD) + .withTrustStore(TlsTestUtils.SERVER_TRUSTSTORE_PATH).withTrustStorePassword(TlsTestUtils.SERVER_TRUSTSTORE_PASSWORD) + .withSslContextFactory((new ParameterizedClass(DefaultSslContextFactory.class.getName(), + new HashMap<>()))); + if (sslConnectionType == SslFallbackConnectionType.MTLS) { - serverEncryptionOptions = serverEncryptionOptions.withInternodeEncryption(ServerEncryptionOptions.InternodeEncryption.all) - .withRequireClientAuth(REQUIRED); + serverEncryptionOptionsBuilder.withInternodeEncryption(ServerEncryptionOptions.InternodeEncryption.all) + .withRequireClientAuth(REQUIRED); } else if (sslConnectionType == SslFallbackConnectionType.SSL) { - serverEncryptionOptions = serverEncryptionOptions.withInternodeEncryption(ServerEncryptionOptions.InternodeEncryption.all) - .withRequireClientAuth(NOT_REQUIRED); + serverEncryptionOptionsBuilder.withInternodeEncryption(ServerEncryptionOptions.InternodeEncryption.all) + .withRequireClientAuth(NOT_REQUIRED); } - return serverEncryptionOptions; + return serverEncryptionOptionsBuilder.build(); } private InboundSockets getInboundSocket(ServerEncryptionOptions serverEncryptionOptions) diff --git a/test/unit/org/apache/cassandra/net/MessageDeliveryTest.java b/test/unit/org/apache/cassandra/net/MessageDeliveryTest.java index 59d7106506d8..73b773e92dd1 100644 --- a/test/unit/org/apache/cassandra/net/MessageDeliveryTest.java +++ b/test/unit/org/apache/cassandra/net/MessageDeliveryTest.java @@ -24,6 +24,8 @@ import java.util.Map; import java.util.concurrent.ExecutionException; import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; import java.util.concurrent.atomic.AtomicInteger; import com.google.common.collect.Iterators; @@ -35,26 +37,26 @@ import org.apache.cassandra.concurrent.SimulatedExecutorFactory; import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.dht.Murmur3Partitioner; -import org.apache.cassandra.exceptions.RequestFailureReason; +import org.apache.cassandra.exceptions.RequestFailure; import org.apache.cassandra.locator.InetAddressAndPort; import org.apache.cassandra.net.MessageDelivery.FailedResponseException; -import org.apache.cassandra.net.MessageDelivery.MaxRetriesException; +import org.apache.cassandra.net.MessageDelivery.GivingUpException; import org.apache.cassandra.net.SimulatedMessageDelivery.Action; import org.apache.cassandra.net.SimulatedMessageDelivery.SimulatedMessageReceiver; +import org.apache.cassandra.service.RetryStrategy; +import org.apache.cassandra.service.TimeoutStrategy.LatencySourceFactory; import org.apache.cassandra.tcm.ClusterMetadataService; import org.apache.cassandra.tcm.StubClusterMetadataService; -import org.apache.cassandra.utils.Backoff; import org.mockito.Mockito; import static accord.utils.Property.qt; +import static org.apache.cassandra.net.MessageDelivery.RetryErrorMessage; +import static org.apache.cassandra.net.MessageDelivery.RetryPredicate; import static org.assertj.core.api.Assertions.assertThat; public class MessageDeliveryTest { private static final InetAddressAndPort ID1 = InetAddressAndPort.getByNameUnchecked("127.0.0.1"); - private static final MessageDelivery.RetryErrorMessage RETRY_ERROR_MESSAGE = (i1, i2, i3, i4) -> null; - private static final MessageDelivery.RetryPredicate ALWAYS_RETRY = (i1, i2, i3) -> true; - private static final MessageDelivery.RetryPredicate ALWAYS_REJECT = (i1, i2, i3) -> false; static { @@ -73,19 +75,19 @@ public void sendWithRetryFailsAfterMaxAttempts() MessageDelivery messaging = simulatedMessages(rs, scheduler, failures, (i1, i2, i3) -> Action.DROP); int expectedRetries = 3; - Backoff backoff = new Backoff.ExponentialBackoff(expectedRetries, 200, 1000, rs.fork()::nextDouble); + RetryStrategy backoff = RetryStrategy.parse("200ms*2^attempts <= 1000ms,retries=" + expectedRetries, LatencySourceFactory.none()); Future> result = messaging.sendWithRetries(backoff, scheduler::schedule, Verb.ECHO_REQ, NoPayload.noPayload, Iterators.cycle(ID1), - ALWAYS_RETRY, - RETRY_ERROR_MESSAGE); + RetryPredicate.ALWAYS_RETRY, + RetryErrorMessage.EMPTY); assertThat(result).isNotDone(); factory.processAll(); assertThat(result).isDone(); - assertThat(getMaxRetriesException(result).attempts).isEqualTo(expectedRetries); + assertThat(getMaxRetriesException(result).attempts).isEqualTo(expectedRetries + 1); }); } @@ -98,21 +100,19 @@ public void sendWithRetryFirstAttempt() ScheduledExecutorPlus scheduler = factory.scheduled("ignored"); MessageDelivery messaging = simulatedMessages(rs, scheduler, failures, (i1, i2, i3) -> Action.DELIVER); - Backoff backoff = Mockito.mock(Backoff.class); + RetryStrategy backoff = Mockito.mock(RetryStrategy.class); Future> result = messaging.sendWithRetries(backoff, scheduler::schedule, Verb.ECHO_REQ, NoPayload.noPayload, Iterators.cycle(ID1), - ALWAYS_RETRY, - RETRY_ERROR_MESSAGE); + RetryPredicate.ALWAYS_RETRY, + RetryErrorMessage.EMPTY); assertThat(result).isNotDone(); factory.processAll(); assertThat(result).isDone(); assertThat(result.get().header.verb).isEqualTo(Verb.ECHO_RSP); - Mockito.verify(backoff, Mockito.never()).mayRetry(Mockito.anyInt()); - Mockito.verify(backoff, Mockito.never()).computeWaitTime(Mockito.anyInt()); - Mockito.verify(backoff, Mockito.never()).unit(); + Mockito.verify(backoff, Mockito.never()).computeWait(Mockito.anyInt(), Mockito.any()); }); } @@ -129,21 +129,20 @@ public void sendWithRetry() AtomicInteger attempts = new AtomicInteger(0); MessageDelivery messaging = simulatedMessages(rs, scheduler, failures, (i1, i2, i3) -> attempts.incrementAndGet() >= (expectedAttempts + 1) ? Action.DELIVER : Action.DROP); - Backoff backoff = Mockito.spy(new Backoff.ExponentialBackoff(maxAttempts, 200, 1000, rs.fork()::nextDouble)); + RetryStrategy backoff = RetryStrategy.parse("200ms*2^attempts <= 1000ms,retries=" + (maxAttempts-1), LatencySourceFactory.none()); + backoff = Mockito.spy(backoff); Future> result = messaging.sendWithRetries(backoff, scheduler::schedule, Verb.ECHO_REQ, NoPayload.noPayload, Iterators.cycle(ID1), - ALWAYS_RETRY, - RETRY_ERROR_MESSAGE); + RetryPredicate.ALWAYS_RETRY, + RetryErrorMessage.EMPTY); assertThat(result).isNotDone(); factory.processAll(); assertThat(result).isDone(); assertThat(result.get().header.verb).isEqualTo(Verb.ECHO_RSP); - Mockito.verify(backoff, Mockito.times(expectedAttempts)).mayRetry(Mockito.anyInt()); - Mockito.verify(backoff, Mockito.times(expectedAttempts)).computeWaitTime(Mockito.anyInt()); - Mockito.verify(backoff, Mockito.times(expectedAttempts)).unit(); + Mockito.verify(backoff, Mockito.times(expectedAttempts)).computeWait(Mockito.anyInt(), Mockito.any()); }); } @@ -157,46 +156,31 @@ public void sendWithRetryDontAllowRetry() MessageDelivery messaging = simulatedMessages(rs, scheduler, failures, (i1, i2, i3) -> Action.DROP); - Backoff backoff = Mockito.spy(new Backoff.ExponentialBackoff(3, 200, 1000, rs.fork()::nextDouble)); + RetryStrategy backoff = RetryStrategy.parse("0 <= 200ms*2^attempts <= 1000ms,retries=3", LatencySourceFactory.none()); + backoff = Mockito.spy(backoff); Future> result = messaging.sendWithRetries(backoff, scheduler::schedule, Verb.ECHO_REQ, NoPayload.noPayload, Iterators.cycle(ID1), - ALWAYS_REJECT, - RETRY_ERROR_MESSAGE); + RetryPredicate.NEVER_RETRY, + RetryErrorMessage.EMPTY); assertThat(result).isNotDone(); factory.processAll(); assertThat(result).isDone(); FailedResponseException e = getFailedResponseException(result); assertThat(e.from).isEqualTo(ID1); - assertThat(e.failure).isEqualTo(RequestFailureReason.TIMEOUT); - Mockito.verify(backoff, Mockito.times(1)).mayRetry(Mockito.anyInt()); - Mockito.verify(backoff, Mockito.never()).computeWaitTime(Mockito.anyInt()); - Mockito.verify(backoff, Mockito.never()).unit(); + assertThat(e.failure).isEqualTo(RequestFailure.TIMEOUT); + Mockito.verify(backoff, Mockito.atMostOnce()).computeWait(Mockito.anyInt(), Mockito.any()); }); } - private static MessageDelivery simulatedMessages(RandomSource rs, ScheduledExecutorPlus scheduler, List failures, SimulatedMessageDelivery.ActionSupplier actionSupplier) - { - Map receivers = new HashMap<>(); - SimulatedMessageDelivery messaging = new SimulatedMessageDelivery(ID1, - actionSupplier, - SimulatedMessageDelivery.randomDelay(rs), - (to, message) -> scheduler.execute(() -> receivers.get(to).recieve(message)), - (i1, i2, i3) -> {}, - scheduler::schedule, - failures::add); - receivers.put(ID1, messaging.receiver(m -> messaging.respond(NoPayload.noPayload, m))); - return messaging; - } - - private static FailedResponseException getFailedResponseException(Future> result) throws InterruptedException + public static FailedResponseException getFailedResponseException(Future> result) throws InterruptedException { FailedResponseException ex; try { - result.get(); + result.get(1, TimeUnit.MINUTES); Assert.fail("Should have failed"); throw new AssertionError("Not Reachable"); } @@ -204,12 +188,30 @@ private static FailedResponseException getFailedResponseException(Future> result) throws InterruptedException + private static MessageDelivery simulatedMessages(RandomSource rs, ScheduledExecutorPlus scheduler, List failures, SimulatedMessageDelivery.ActionSupplier actionSupplier) + { + Map receivers = new HashMap<>(); + SimulatedMessageDelivery messaging = new SimulatedMessageDelivery(ID1, + actionSupplier, + SimulatedMessageDelivery.randomDelay(rs), + (to, message) -> scheduler.execute(() -> receivers.get(to).recieve(message)), + (i1, i2, i3) -> {}, + scheduler::schedule, + failures::add); + receivers.put(ID1, messaging.receiver(m -> messaging.respond(NoPayload.noPayload, m))); + return messaging; + } + + private static GivingUpException getMaxRetriesException(Future> result) throws InterruptedException { - MaxRetriesException ex; + GivingUpException ex; try { result.get(); @@ -218,7 +220,7 @@ private static MaxRetriesException getMaxRetriesException(Future> } catch (ExecutionException e) { - ex = (MaxRetriesException) e.getCause(); + ex = (GivingUpException) e.getCause(); } return ex; } diff --git a/test/unit/org/apache/cassandra/net/MessageTest.java b/test/unit/org/apache/cassandra/net/MessageTest.java index 8e89973aa75e..15d062e1a70c 100644 --- a/test/unit/org/apache/cassandra/net/MessageTest.java +++ b/test/unit/org/apache/cassandra/net/MessageTest.java @@ -19,8 +19,8 @@ import java.io.IOException; import java.nio.ByteBuffer; -import java.nio.charset.CharacterCodingException; import java.nio.charset.StandardCharsets; +import java.util.concurrent.ExecutionException; import java.util.concurrent.TimeUnit; import java.util.stream.Stream; @@ -30,6 +30,7 @@ import org.apache.cassandra.ServerTestUtils; import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.exceptions.RequestFailure; import org.apache.cassandra.exceptions.RequestFailureReason; import org.apache.cassandra.io.IVersionedSerializer; import org.apache.cassandra.io.util.DataInputBuffer; @@ -37,6 +38,7 @@ import org.apache.cassandra.io.util.DataOutputBuffer; import org.apache.cassandra.io.util.DataOutputPlus; import org.apache.cassandra.locator.InetAddressAndPort; +import org.apache.cassandra.net.MessagingService.Version; import org.apache.cassandra.tcm.Epoch; import org.apache.cassandra.tracing.Tracing; import org.apache.cassandra.tracing.Tracing.TraceType; @@ -44,16 +46,22 @@ import org.apache.cassandra.utils.FreeRunningClock; import org.apache.cassandra.utils.TimeUUID; +import static com.google.common.base.Throwables.getStackTraceAsString; +import static org.apache.cassandra.exceptions.RemoteExceptionTest.normalizeThrowable; import static org.apache.cassandra.net.Message.serializer; import static org.apache.cassandra.net.MessagingService.VERSION_40; +import static org.apache.cassandra.net.MessagingService.VERSION_51; import static org.apache.cassandra.net.NoPayload.noPayload; import static org.apache.cassandra.net.ParamType.RESPOND_TO; import static org.apache.cassandra.net.ParamType.TRACE_SESSION; import static org.apache.cassandra.net.ParamType.TRACE_TYPE; import static org.apache.cassandra.utils.MonotonicClock.Global.approxTime; import static org.apache.cassandra.utils.TimeUUID.Generator.nextTimeUUID; - -import static org.junit.Assert.*; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertSame; +import static org.junit.Assert.assertTrue; public class MessageTest { @@ -165,7 +173,7 @@ public void testBuilder() } @Test - public void testCycleNoPayload() throws IOException + public void testCycleNoPayload() throws Exception { Message msg = Message.builder(Verb._TEST_1, noPayload) @@ -190,15 +198,20 @@ public void testCycleWithPayload() throws Exception } @Test - public void testFailureResponse() throws IOException + public void testFailureResponse() throws Exception { long expiresAt = approxTime.now(); - Message msg = Message.failureResponse(1, expiresAt, RequestFailureReason.INCOMPATIBLE_SCHEMA); + ExecutionException cause = new ExecutionException("test", new NullPointerException()); + Throwable root = new Throwable(cause); + Throwable suppressed = new Throwable(); + root.addSuppressed(suppressed); + Message msg = Message.failureResponse(1, expiresAt, new RequestFailure(RequestFailureReason.INCOMPATIBLE_SCHEMA, root)); assertEquals(1, msg.id()); assertEquals(Verb.FAILURE_RSP, msg.verb()); assertEquals(expiresAt, msg.expiresAtNanos()); - assertEquals(RequestFailureReason.INCOMPATIBLE_SCHEMA, msg.payload); + assertEquals(RequestFailureReason.INCOMPATIBLE_SCHEMA, msg.payload.reason); + assertEquals(getStackTraceAsString(root), getStackTraceAsString(msg.payload.failure)); assertTrue(msg.isFailureResponse()); testCycle(msg); @@ -218,19 +231,26 @@ public void testBuilderNotAddTraceHeaderWithNoTraceSession() } @Test - public void testCustomParams() throws CharacterCodingException, IOException + public void testCustomParams() throws IOException + { + for (Version version : MessagingService.Version.values()) + if (version.value >= VERSION_40) + testCustomParams(version.value); + } + + private void testCustomParams(int version) throws IOException { long id = 1; InetAddressAndPort from = FBUtilities.getLocalAddressAndPort(); Message msg = - Message.builder(Verb._TEST_1, noPayload) - .withEpoch(Epoch.EMPTY) - .withId(1) - .from(from) - .withCustomParam("custom1", "custom1value".getBytes(StandardCharsets.UTF_8)) - .withCustomParam("custom2", "custom2value".getBytes(StandardCharsets.UTF_8)) - .build(); + Message.builder(Verb._TEST_1, noPayload) + .withEpoch(Epoch.EMPTY) + .withId(1) + .from(from) + .withCustomParam("custom1", "custom1value".getBytes(StandardCharsets.UTF_8)) + .withCustomParam("custom2", "custom2value".getBytes(StandardCharsets.UTF_8)) + .build(); assertEquals(id, msg.id()); assertEquals(from, msg.from()); @@ -239,9 +259,10 @@ public void testCustomParams() throws CharacterCodingException, IOException assertEquals("custom2value", new String(msg.header.customParams().get("custom2"), StandardCharsets.UTF_8)); DataOutputBuffer out = DataOutputBuffer.scratchBuffer.get(); - Message.serializer.serialize(msg, out, VERSION_40); + out.clear(); + Message.serializer.serialize(msg, out, version); DataInputBuffer in = new DataInputBuffer(out.buffer(), true); - msg = Message.serializer.deserialize(in, from, VERSION_40); + msg = Message.serializer.deserialize(in, from, version); assertEquals(id, msg.id()); assertEquals(from, msg.from()); @@ -265,13 +286,13 @@ private void testAddTraceHeaderWithType(TraceType traceType) } } - private void testCycle(Message msg) throws IOException + private void testCycle(Message msg) throws Exception { testCycle(msg, VERSION_40); } // serialize (using both variants, all in one or header then rest), verify serialized size, deserialize, compare to the original - private void testCycle(Message msg, int version) throws IOException + private void testCycle(Message msg, int version) throws Exception { try (DataOutputBuffer out = new DataOutputBuffer()) { @@ -283,7 +304,7 @@ private void testCycle(Message msg, int version) throws IOException { Message msgOut = serializer.deserialize(in, msg.from(), version); assertEquals(0, in.available()); - assertMessagesEqual(msg, msgOut); + assertMessagesEqual(msg, msgOut, version); } // extract header first, then deserialize the rest of the message and compare outcomes @@ -293,12 +314,12 @@ private void testCycle(Message msg, int version) throws IOException Message.Header headerOut = serializer.extractHeader(buffer, msg.from(), approxTime.now(), version); Message msgOut = serializer.deserialize(in, headerOut, version); assertEquals(0, in.available()); - assertMessagesEqual(msg, msgOut); + assertMessagesEqual(msg, msgOut, version); } } } - private static void assertMessagesEqual(Message msg1, Message msg2) + private static void assertMessagesEqual(Message msg1, Message msg2, int version) throws Exception { assertEquals(msg1.id(), msg2.id()); assertEquals(msg1.verb(), msg2.verb()); @@ -316,6 +337,19 @@ private static void assertMessagesEqual(Message msg1, Message msg2) assertTrue(payload2 == noPayload || payload2 == null); else if (null == payload2) assertSame(payload1, noPayload); + else if (msg1.verb() == Verb.FAILURE_RSP) + { + RequestFailure reason1 = (RequestFailure)msg1.payload; + RequestFailure reason2 = (RequestFailure)msg2.payload; + assertEquals(reason1.reason, reason2.reason); + if (version >= VERSION_51) + { + if (reason1.failure == null) + assertNull(reason2.failure); + else + assertEquals(getStackTraceAsString(normalizeThrowable(reason1.failure)), getStackTraceAsString(reason2.failure)); + } + } else assertEquals(payload1, payload2); } diff --git a/test/unit/org/apache/cassandra/net/MessagingServiceTest.java b/test/unit/org/apache/cassandra/net/MessagingServiceTest.java index 95c72f4bc0b3..45746d5e6e06 100644 --- a/test/unit/org/apache/cassandra/net/MessagingServiceTest.java +++ b/test/unit/org/apache/cassandra/net/MessagingServiceTest.java @@ -50,6 +50,7 @@ import org.apache.cassandra.auth.IInternodeAuthenticator; import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.config.EncryptionOptions.ServerEncryptionOptions; +import org.apache.cassandra.config.EncryptionOptions.ServerEncryptionOptions.Builder; import org.apache.cassandra.db.commitlog.CommitLog; import org.apache.cassandra.distributed.test.log.ClusterMetadataTestHelper; import org.apache.cassandra.exceptions.ConfigurationException; @@ -257,8 +258,7 @@ private static void addDCLatency(long sentAt, long nowTime) public void testFailedOutboundInternodeAuth() throws Exception { // Listen on serverside for connections - ServerEncryptionOptions serverEncryptionOptions = new ServerEncryptionOptions() - .withInternodeEncryption(ServerEncryptionOptions.InternodeEncryption.none); + ServerEncryptionOptions serverEncryptionOptions = new Builder().withInternodeEncryption(ServerEncryptionOptions.InternodeEncryption.none).build(); DatabaseDescriptor.setInternodeAuthenticator(REJECT_OUTBOUND_AUTHENTICATOR); InetAddress listenAddress = FBUtilities.getJustLocalAddress(); @@ -293,8 +293,7 @@ public void testFailedOutboundInternodeAuth() throws Exception @Test public void testFailedInboundInternodeAuth() throws IOException, InterruptedException { - ServerEncryptionOptions serverEncryptionOptions = new ServerEncryptionOptions() - .withInternodeEncryption(ServerEncryptionOptions.InternodeEncryption.none); + ServerEncryptionOptions serverEncryptionOptions = new Builder().withInternodeEncryption(ServerEncryptionOptions.InternodeEncryption.none).build(); DatabaseDescriptor.setInternodeAuthenticator(ALLOW_NOTHING_AUTHENTICATOR); InetAddress listenAddress = FBUtilities.getJustLocalAddress(); @@ -348,56 +347,54 @@ public void testFailedInboundInternodeAuth() throws IOException, InterruptedExce @Test public void listenPlainConnection() throws InterruptedException { - ServerEncryptionOptions serverEncryptionOptions = new ServerEncryptionOptions() - .withInternodeEncryption(ServerEncryptionOptions.InternodeEncryption.none); + ServerEncryptionOptions serverEncryptionOptions = new Builder().withInternodeEncryption(ServerEncryptionOptions.InternodeEncryption.none).build(); listen(serverEncryptionOptions, false); } @Test public void listenPlainConnectionWithBroadcastAddr() throws InterruptedException { - ServerEncryptionOptions serverEncryptionOptions = new ServerEncryptionOptions() - .withInternodeEncryption(ServerEncryptionOptions.InternodeEncryption.none); + ServerEncryptionOptions serverEncryptionOptions = new Builder().withInternodeEncryption(ServerEncryptionOptions.InternodeEncryption.none).build(); listen(serverEncryptionOptions, true); } @Test public void listenRequiredSecureConnection() throws InterruptedException { - ServerEncryptionOptions serverEncryptionOptions = new ServerEncryptionOptions() - .withOptional(false) - .withInternodeEncryption(ServerEncryptionOptions.InternodeEncryption.all) - .withLegacySslStoragePort(false); + ServerEncryptionOptions serverEncryptionOptions = new Builder().withInternodeEncryption(ServerEncryptionOptions.InternodeEncryption.all) + .withLegacySslStoragePort(false) + .withOptional(false) + .build(); listen(serverEncryptionOptions, false); } @Test public void listenRequiredSecureConnectionWithBroadcastAddr() throws InterruptedException { - ServerEncryptionOptions serverEncryptionOptions = new ServerEncryptionOptions() - .withOptional(false) - .withInternodeEncryption(ServerEncryptionOptions.InternodeEncryption.all) - .withLegacySslStoragePort(false); + ServerEncryptionOptions serverEncryptionOptions = new Builder().withInternodeEncryption(ServerEncryptionOptions.InternodeEncryption.all) + .withLegacySslStoragePort(false) + .withOptional(false) + .build(); listen(serverEncryptionOptions, true); } @Test public void listenRequiredSecureConnectionWithLegacyPort() throws InterruptedException { - ServerEncryptionOptions serverEncryptionOptions = new ServerEncryptionOptions() - .withInternodeEncryption(ServerEncryptionOptions.InternodeEncryption.all) - .withOptional(false) - .withLegacySslStoragePort(true); + ServerEncryptionOptions serverEncryptionOptions = new Builder().withInternodeEncryption(ServerEncryptionOptions.InternodeEncryption.all) + .withLegacySslStoragePort(true) + .withOptional(false) + .build(); listen(serverEncryptionOptions, false); } @Test public void listenRequiredSecureConnectionWithBroadcastAddrAndLegacyPort() throws InterruptedException { - ServerEncryptionOptions serverEncryptionOptions = new ServerEncryptionOptions() - .withInternodeEncryption(ServerEncryptionOptions.InternodeEncryption.all) - .withOptional(false) - .withLegacySslStoragePort(true); + ServerEncryptionOptions serverEncryptionOptions = new Builder().withInternodeEncryption(ServerEncryptionOptions.InternodeEncryption.all) + .withLegacySslStoragePort(true) + .withOptional(false) + .build(); listen(serverEncryptionOptions, true); } @@ -406,8 +403,7 @@ public void listenOptionalSecureConnection() throws InterruptedException { for (int i = 0; i < 500; i++) // test used to be flaky, so run in a loop to make sure stable (see CASSANDRA-17033) { - ServerEncryptionOptions serverEncryptionOptions = new ServerEncryptionOptions() - .withOptional(true); + ServerEncryptionOptions serverEncryptionOptions = new Builder().withOptional(true).build(); listen(serverEncryptionOptions, false); } } @@ -415,8 +411,7 @@ public void listenOptionalSecureConnection() throws InterruptedException @Test public void listenOptionalSecureConnectionWithBroadcastAddr() throws InterruptedException { - ServerEncryptionOptions serverEncryptionOptions = new ServerEncryptionOptions() - .withOptional(true); + ServerEncryptionOptions serverEncryptionOptions = new Builder().withOptional(true).build(); listen(serverEncryptionOptions, true); } diff --git a/test/unit/org/apache/cassandra/net/SimulatedMessageDelivery.java b/test/unit/org/apache/cassandra/net/SimulatedMessageDelivery.java index f0f34888a29c..2f5014abd8a8 100644 --- a/test/unit/org/apache/cassandra/net/SimulatedMessageDelivery.java +++ b/test/unit/org/apache/cassandra/net/SimulatedMessageDelivery.java @@ -31,7 +31,8 @@ import accord.utils.Gens; import accord.utils.RandomSource; -import org.apache.cassandra.exceptions.RequestFailureReason; + +import org.apache.cassandra.exceptions.RequestFailure; import org.apache.cassandra.locator.InetAddressAndPort; import org.apache.cassandra.utils.concurrent.AsyncPromise; import org.apache.cassandra.utils.concurrent.Future; @@ -184,7 +185,7 @@ public void onResponse(Message msg) } @Override - public void onFailure(InetAddressAndPort from, RequestFailureReason failure) + public void onFailure(InetAddressAndPort from, RequestFailure failure) { promise.tryFailure(new MessagingService.FailureResponseException(from, failure)); } @@ -237,7 +238,7 @@ private void maybeEnqueue(Message message, InetAddressAndPort to if (action == Action.FAILURE) onDropped.onDrop(action, to, message); if (callback != null) - scheduler.schedule(() -> callback.onFailure(to, RequestFailureReason.UNKNOWN), + scheduler.schedule(() -> callback.onFailure(to, RequestFailure.UNKNOWN), message.verb().expiresAfterNanos(), TimeUnit.NANOSECONDS); return; default: @@ -252,7 +253,7 @@ private void maybeEnqueue(Message message, InetAddressAndPort to assert ctx == cb; try { - ctx.onFailure(to, RequestFailureReason.TIMEOUT); + ctx.onFailure(to, RequestFailure.TIMEOUT); } catch (Throwable t) { @@ -302,7 +303,7 @@ public void recieve(Message msg) try { if (msg.isFailureResponse()) - callback.onFailure(msg.from(), (RequestFailureReason) msg.payload); + callback.onFailure(msg.from(), (RequestFailure) msg.payload); else callback.onResponse(msg); } catch (Throwable t) @@ -364,7 +365,7 @@ public void onResponse(Message msg) callback.onResponse(msg); } - public void onFailure(InetAddressAndPort from, RequestFailureReason failure) + public void onFailure(InetAddressAndPort from, RequestFailure failure) { if (callback.invokeOnFailure()) callback.onFailure(from, failure); } diff --git a/test/unit/org/apache/cassandra/repair/FailingRepairFuzzTest.java b/test/unit/org/apache/cassandra/repair/FailingRepairFuzzTest.java index 2d1438e0302a..ccce173d4486 100644 --- a/test/unit/org/apache/cassandra/repair/FailingRepairFuzzTest.java +++ b/test/unit/org/apache/cassandra/repair/FailingRepairFuzzTest.java @@ -34,6 +34,7 @@ import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.config.RetrySpec; import org.apache.cassandra.locator.InetAddressAndPort; +import org.apache.cassandra.repair.messages.RepairOption; import org.apache.cassandra.repair.state.Completable; import org.apache.cassandra.streaming.StreamEventHandler; import org.apache.cassandra.streaming.StreamState; @@ -64,7 +65,9 @@ public void failingRepair() { Cluster.Node coordinator = coordinatorGen.next(rs); - RepairCoordinator repair = coordinator.repair(KEYSPACE, repairOption(rs, coordinator, KEYSPACE, TABLES), false); + // exclude accord repair as this test breaks validation/sync; which accord doesn't have + RepairOption options = repairOption(rs, coordinator, KEYSPACE, TABLES); + RepairCoordinator repair = coordinator.repair(KEYSPACE, options, false); repair.run(); InetAddressAndPort failingAddress = pickParticipant(rs, coordinator, repair); Cluster.Node failingNode = cluster.nodes.get(failingAddress); diff --git a/test/unit/org/apache/cassandra/repair/FuzzTestBase.java b/test/unit/org/apache/cassandra/repair/FuzzTestBase.java index 044711a617b6..72148701de69 100644 --- a/test/unit/org/apache/cassandra/repair/FuzzTestBase.java +++ b/test/unit/org/apache/cassandra/repair/FuzzTestBase.java @@ -22,7 +22,7 @@ import java.net.InetSocketAddress; import java.nio.ByteBuffer; import java.nio.charset.StandardCharsets; -import java.sql.Timestamp; +import java.time.Duration; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; @@ -44,15 +44,14 @@ import java.util.function.BiFunction; import java.util.function.Consumer; import java.util.function.Function; -import java.util.function.LongSupplier; import java.util.function.Supplier; import javax.annotation.Nullable; +import com.google.common.collect.ImmutableList; import com.google.common.collect.Iterables; import com.google.common.collect.Maps; import com.google.common.collect.Sets; -import org.apache.cassandra.config.UnitConfigOverride; import org.junit.BeforeClass; import accord.utils.DefaultRandom; @@ -73,6 +72,7 @@ import org.apache.cassandra.concurrent.Stage; import org.apache.cassandra.config.Config; import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.config.UnitConfigOverride; import org.apache.cassandra.cql3.CQLTester; import org.apache.cassandra.db.ColumnFamilyStore; import org.apache.cassandra.db.Digest; @@ -85,7 +85,6 @@ import org.apache.cassandra.dht.Range; import org.apache.cassandra.dht.Token; import org.apache.cassandra.distributed.test.log.ClusterMetadataTestHelper; -import org.apache.cassandra.exceptions.RequestFailureReason; import org.apache.cassandra.gms.ApplicationState; import org.apache.cassandra.gms.EndpointState; import org.apache.cassandra.gms.HeartBeatState; @@ -99,12 +98,14 @@ import org.apache.cassandra.locator.LocalStrategy; import org.apache.cassandra.locator.Locator; import org.apache.cassandra.locator.RangesAtEndpoint; -import org.apache.cassandra.net.ConnectionType; import org.apache.cassandra.net.IVerbHandler; import org.apache.cassandra.net.Message; import org.apache.cassandra.net.MessageDelivery; import org.apache.cassandra.net.MessagingService; -import org.apache.cassandra.net.RequestCallback; +import org.apache.cassandra.net.SimulatedMessageDelivery; +import org.apache.cassandra.net.SimulatedMessageDelivery.SimulatedMessageReceiver; +import org.apache.cassandra.repair.RepairGenerators.PreviewType; +import org.apache.cassandra.repair.RepairGenerators.RepairType; import org.apache.cassandra.repair.messages.RepairMessage; import org.apache.cassandra.repair.messages.RepairOption; import org.apache.cassandra.repair.messages.ValidationResponse; @@ -123,6 +124,7 @@ import org.apache.cassandra.schema.Tables; import org.apache.cassandra.service.ActiveRepairService; import org.apache.cassandra.service.StorageService; +import org.apache.cassandra.service.accord.AccordService; import org.apache.cassandra.service.paxos.cleanup.PaxosCleanupComplete; import org.apache.cassandra.service.paxos.cleanup.PaxosCleanupHistory; import org.apache.cassandra.service.paxos.cleanup.PaxosCleanupRequest; @@ -150,24 +152,23 @@ import org.apache.cassandra.utils.MerkleTrees; import org.apache.cassandra.utils.NoSpamLogger; import org.apache.cassandra.utils.TimeUUID; -import org.apache.cassandra.utils.concurrent.AsyncPromise; -import org.apache.cassandra.utils.concurrent.Future; import org.apache.cassandra.utils.concurrent.ImmediateFuture; import org.apache.cassandra.utils.progress.ProgressEventType; import org.assertj.core.api.Assertions; import org.mockito.Mockito; import org.quicktheories.impl.JavaRandom; +import static org.apache.cassandra.config.CassandraRelevantProperties.ACCORD_REPAIR_RANGE_STEP_UPDATE_INTERVAL; import static org.apache.cassandra.config.CassandraRelevantProperties.CLOCK_GLOBAL; import static org.apache.cassandra.config.CassandraRelevantProperties.ORG_APACHE_CASSANDRA_DISABLE_MBEAN_REGISTRATION; public abstract class FuzzTestBase extends CQLTester.InMemory { private static final int MISMATCH_NUM_PARTITIONS = 1; - private static final Gen IDENTIFIER_GEN = fromQT(Generators.IDENTIFIER_GEN); - private static final Gen KEYSPACE_NAME_GEN = fromQT(CassandraGenerators.KEYSPACE_NAME_GEN); - private static final Gen TABLE_ID_GEN = fromQT(CassandraGenerators.TABLE_ID_GEN); - private static final Gen ADDRESS_W_PORT = fromQT(CassandraGenerators.INET_ADDRESS_AND_PORT_GEN); + private static final Gen IDENTIFIER_GEN = Generators.toGen(Generators.IDENTIFIER_GEN); + private static final Gen KEYSPACE_NAME_GEN = Generators.toGen(CassandraGenerators.KEYSPACE_NAME_GEN); + private static final Gen TABLE_ID_GEN = Generators.toGen(CassandraGenerators.TABLE_ID_GEN); + private static final Gen ADDRESS_W_PORT = Generators.toGen(CassandraGenerators.INET_ADDRESS_AND_PORT_GEN); private static boolean SETUP_SCHEMA = false; static String KEYSPACE; @@ -177,6 +178,7 @@ public abstract class FuzzTestBase extends CQLTester.InMemory public static void setUpClass() { ORG_APACHE_CASSANDRA_DISABLE_MBEAN_REGISTRATION.setBoolean(true); + ACCORD_REPAIR_RANGE_STEP_UPDATE_INTERVAL.setInt(1); CLOCK_GLOBAL.setString(ClockAccess.class.getName()); // when running in CI an external actor will replace the test configs based off the test type (such as trie, cdc, etc.), this could then have failing tests // that do not repo with the same seed! To fix that, go to UnitConfigOverride and update the config type to match the one that failed in CI, this should then @@ -244,16 +246,16 @@ private boolean shouldMock() } @Override - public Thread startThread(String name, Runnable runnable, InfiniteLoopExecutor.Daemon daemon) + public Thread startThread(String name, Runnable runnable, SystemThreadTag systemTag, SimulatorThreadTag simulatorTag) { if (shouldMock()) return new Thread(); - return delegate.startThread(name, runnable, daemon); + return delegate.startThread(name, runnable, systemTag, simulatorTag); } @Override - public Interruptible infiniteLoop(String name, Interruptible.Task task, InfiniteLoopExecutor.SimulatorSafe simulatorSafe, InfiniteLoopExecutor.Daemon daemon, InfiniteLoopExecutor.Interrupts interrupts) + public Interruptible infiniteLoop(String name, Interruptible.Task task, InfiniteLoopExecutor.SimulatorSafe simulatorSafe, SystemThreadTag systemTag, InfiniteLoopExecutor.Interrupts interrupts) { - return delegate.infiniteLoop(name, task, simulatorSafe, daemon, interrupts); + return delegate.infiniteLoop(name, task, simulatorSafe, systemTag, interrupts); } @Override @@ -288,8 +290,12 @@ public ExecutorBuilder configurePooled(String name, int // so don't want to deal with unlucky histories... DatabaseDescriptor.setRepairRpcTimeout(TimeUnit.DAYS.toMillis(1)); + // make sure accord is enabled as accord has custom repair steps + DatabaseDescriptor.setAccordTransactionsEnabled(true); InMemory.setUpClass(); + + MessagingService.instance().listen(); } public static void setupSchema() @@ -464,13 +470,12 @@ InetAddressAndPort pickParticipant(RandomSource rs, Cluster.Node coordinator, Re { if (repair.state.isComplete()) throw new IllegalStateException("Repair is completed! " + repair.state.getResult()); - List participaents = new ArrayList<>(repair.state.getNeighborsAndRanges().participants.size() + 1); - if (rs.nextBoolean()) participaents.add(coordinator.broadcastAddressAndPort()); - participaents.addAll(repair.state.getNeighborsAndRanges().participants); - participaents.sort(Comparator.naturalOrder()); + List participants = new ArrayList<>(repair.state.getNeighborsAndRanges().participants.size() + 1); + if (rs.nextBoolean()) participants.add(coordinator.broadcastAddressAndPort()); + participants.addAll(repair.state.getNeighborsAndRanges().participants); + participants.sort(Comparator.naturalOrder()); - InetAddressAndPort selected = rs.pick(participaents); - return selected; + return participants.get(rs.nextInt(participants.size())); } static void addMismatch(RandomSource rs, ColumnFamilyStore cfs, Validator validator) @@ -488,7 +493,7 @@ static void addMismatch(RandomSource rs, ColumnFamilyStore cfs, Validator valida Set allTokens = new HashSet<>(); for (Range range : validator.desc.ranges) { - Gen gen = fromQT(CassandraGenerators.tokensInRange(range)); + Gen gen = Generators.toGen(CassandraGenerators.tokensInRange(range)); Set tokens = new LinkedHashSet<>(); for (int i = 0, size = rs.nextInt(1, 10); i < size; i++) { @@ -532,12 +537,6 @@ private static void findCorrectRange(MerkleTrees trees, Token token, Consumer tableNames) { return repairOption(rs, coordinator, ks, Gens.lists(Gens.pick(tableNames)).ofSizeBetween(1, tableNames.size()), Gens.enums().all(RepairType.class), Gens.enums().all(PreviewType.class), Gens.enums().all(RepairParallelism.class)); @@ -555,52 +554,13 @@ static RepairOption previewOption(RandomSource rs, Cluster.Node coordinator, Str private static RepairOption repairOption(RandomSource rs, Cluster.Node coordinator, String ks, Gen> tablesGen, Gen repairTypeGen, Gen previewTypeGen, Gen repairParallelismGen) { - List args = new ArrayList<>(); - args.add(ks); - args.addAll(tablesGen.next(rs)); - args.add("-pr"); - RepairType type = repairTypeGen.next(rs); - switch (type) - { - case IR: - // default - break; - case FULL: - args.add("--full"); - break; - default: - throw new AssertionError("Unsupported repair type: " + type); - } - PreviewType previewType = previewTypeGen.next(rs); - switch (previewType) - { - case NONE: - break; - case REPAIRED: - args.add("--validate"); - break; - case UNREPAIRED: - args.add("--preview"); - break; - default: - throw new AssertionError("Unsupported preview type: " + previewType); - } - RepairParallelism parallelism = repairParallelismGen.next(rs); - switch (parallelism) - { - case SEQUENTIAL: - args.add("--sequential"); - break; - case PARALLEL: - // default - break; - case DATACENTER_AWARE: - args.add("--dc-parallel"); - break; - default: - throw new AssertionError("Unknown parallelism: " + parallelism); - } - if (rs.nextBoolean()) args.add("--optimise-streams"); + List args = new RepairGenerators.Builder(tablesGen.map(l -> ImmutableList.builderWithExpectedSize(l.size() + 1).add(ks).addAll(l).build())) + .withType(repairTypeGen) + .withPreviewType(previewTypeGen) + .withParallelism(repairParallelismGen) + .withRanges(i -> RepairGenerators.PRIMARY_RANGE) + .build() + .next(rs); RepairOption options = RepairOption.parse(Repair.parseOptionMap(() -> "test", args), DatabaseDescriptor.getPartitioner()); if (options.getRanges().isEmpty()) { @@ -684,15 +644,13 @@ static class Cluster private final List listeners = new ArrayList<>(); private final RandomSource rs; private BiFunction, Set> allowedMessageFaults = (a, b) -> Collections.emptySet(); - - private final Map networkLatencies = new HashMap<>(); private final Map> networkDrops = new HashMap<>(); Cluster(RandomSource rs) { ClockAccess.includeThreadAsOwner(); this.rs = rs; - globalExecutor = new SimulatedExecutorFactory(rs, fromQT(Generators.TIMESTAMP_GEN.map(Timestamp::getTime)).mapToLong(TimeUnit.MILLISECONDS::toNanos).next(rs)); + globalExecutor = new SimulatedExecutorFactory(rs); orderedExecutor = globalExecutor.configureSequential("ignore").build(); unorderedScheduled = globalExecutor.scheduled("ignored"); @@ -713,8 +671,8 @@ static class Cluster int numNodes = rs.nextInt(3, 10); List dcs = Gens.lists(IDENTIFIER_GEN).unique().ofSizeBetween(1, Math.min(10, numNodes)).next(rs); Map nodes = Maps.newHashMapWithExpectedSize(numNodes); - Gen tokenGen = fromQT(CassandraGenerators.token(DatabaseDescriptor.getPartitioner())); - Gen hostIdGen = fromQT(Generators.UUID_RANDOM_GEN); + Gen tokenGen = Generators.toGen(CassandraGenerators.token(DatabaseDescriptor.getPartitioner())); + Gen hostIdGen = Generators.toGen(Generators.UUID_RANDOM_GEN); Set tokens = new HashSet<>(); Set hostIds = new HashSet<>(); for (int i = 0; i < numNodes; i++) @@ -758,6 +716,10 @@ static class Cluster ClusterMetadataTestHelper.register(inst.broadcastAddressAndPort()); ClusterMetadataTestHelper.join(inst.broadcastAddressAndPort(), inst.tokens()); } + List addresses = new ArrayList<>(nodes.keySet()); + addresses.sort(Comparator.naturalOrder()); + AccordService.unsafeSetNoop(); + setupSchema(); } @@ -802,214 +764,43 @@ public void processAll() } } - private class CallbackContext + private SimulatedMessageDelivery.Action action(InetAddressAndPort self, Message msg, InetAddressAndPort to) { - final RequestCallback callback; - - private CallbackContext(RequestCallback callback) - { - this.callback = Objects.requireNonNull(callback); - } - - public void onResponse(Message msg) - { - callback.onResponse(msg); - } - - public void onFailure(InetAddressAndPort from, RequestFailureReason failureReason) - { - if (callback.invokeOnFailure()) callback.onFailure(from, failureReason); - } + boolean toSelf = self.equals(to); + Node node = nodes.get(to); + Set allowedFaults = allowedMessageFaults.apply(node, msg); + if (allowedFaults.contains(Faults.DROP) && !toSelf && networkDrops(self, to)) return SimulatedMessageDelivery.Action.DROP_PARTITIONED; + return SimulatedMessageDelivery.Action.DELIVER; } - private static class CallbackKey + private boolean networkDrops(InetAddressAndPort self, InetAddressAndPort to) { - private final long id; - private final InetAddressAndPort peer; - - private CallbackKey(long id, InetAddressAndPort peer) - { - this.id = id; - this.peer = peer; - } - - @Override - public boolean equals(Object o) - { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - CallbackKey that = (CallbackKey) o; - return id == that.id && peer.equals(that.peer); - } - - @Override - public int hashCode() - { - return Objects.hash(id, peer); - } - - @Override - public String toString() - { - return "CallbackKey{" + - "id=" + id + - ", peer=" + peer + - '}'; - } + return networkDrops.computeIfAbsent(new Connection(self, to), ignore -> Gens.bools().biasedRepeatingRuns(rs.nextInt(1, 11) / 100.0D, rs.nextInt(3, 15)).asSupplier(rs)).get(); } - private class Messaging implements MessageDelivery + private class Messaging extends SimulatedMessageDelivery { - final InetAddressAndPort broadcastAddressAndPort; - final Map callbacks = new HashMap<>(); - private Messaging(InetAddressAndPort broadcastAddressAndPort) { - this.broadcastAddressAndPort = broadcastAddressAndPort; - } - - @Override - public void send(Message message, InetAddressAndPort to) - { - message = message.withFrom(broadcastAddressAndPort); - maybeEnqueue(message, to, null); - } - - @Override - public void sendWithCallback(Message message, InetAddressAndPort to, RequestCallback cb) - { - message = message.withFrom(broadcastAddressAndPort); - maybeEnqueue(message, to, cb); - } - - @Override - public void sendWithCallback(Message message, InetAddressAndPort to, RequestCallback cb, ConnectionType specifyConnection) - { - message = message.withFrom(broadcastAddressAndPort); - maybeEnqueue(message, to, cb); - } - - private void maybeEnqueue(Message message, InetAddressAndPort to, @Nullable RequestCallback callback) - { - CallbackContext cb; - if (callback != null) - { - CallbackKey key = new CallbackKey(message.id(), to); - if (callbacks.containsKey(key)) - throw new AssertionError("Message id " + message.id() + " to " + to + " already has a callback"); - cb = new CallbackContext(callback); - callbacks.put(key, cb); - } - else - { - cb = null; - } - boolean toSelf = this.broadcastAddressAndPort.equals(to); - Node node = nodes.get(to); - Set allowedFaults = allowedMessageFaults.apply(node, message); - if (allowedFaults.isEmpty()) - { - // enqueue so stack overflow doesn't happen with the inlining - unorderedScheduled.submit(() -> node.handle(message)); - } - else - { - Runnable enqueue = () -> { - if (!allowedFaults.contains(Faults.DELAY)) - { - unorderedScheduled.submit(() -> node.handle(message)); - } - else - { - if (toSelf) unorderedScheduled.submit(() -> node.handle(message)); - else - unorderedScheduled.schedule(() -> node.handle(message), networkJitterNanos(to), TimeUnit.NANOSECONDS); - } - }; - - if (!allowedFaults.contains(Faults.DROP)) enqueue.run(); - else - { - if (!toSelf && networkDrops(to)) - { -// logger.warn("Dropped message {}", message); - // drop - } - else - { - enqueue.run(); - } - } - - if (cb != null) - { - unorderedScheduled.schedule(() -> { - CallbackContext ctx = callbacks.remove(new CallbackKey(message.id(), to)); - if (ctx != null) - { - assert ctx == cb; - try - { - ctx.onFailure(to, RequestFailureReason.TIMEOUT); - } - catch (Throwable t) - { - failures.add(t); - } - } - }, message.verb().expiresAfterNanos(), TimeUnit.NANOSECONDS); - } - } - } - - private long networkJitterNanos(InetAddressAndPort to) - { - return networkLatencies.computeIfAbsent(new Connection(broadcastAddressAndPort, to), ignore -> { - long min = TimeUnit.MICROSECONDS.toNanos(500); - long maxSmall = TimeUnit.MILLISECONDS.toNanos(5); - long max = TimeUnit.SECONDS.toNanos(5); - LongSupplier small = () -> rs.nextLong(min, maxSmall); - LongSupplier large = () -> rs.nextLong(maxSmall, max); - return Gens.bools().biasedRepeatingRuns(rs.nextInt(1, 11) / 100.0D, rs.nextInt(3, 15)).mapToLong(b -> b ? large.getAsLong() : small.getAsLong()).asLongSupplier(rs); - }).getAsLong(); - } - - private boolean networkDrops(InetAddressAndPort to) - { - return networkDrops.computeIfAbsent(new Connection(broadcastAddressAndPort, to), ignore -> Gens.bools().biasedRepeatingRuns(rs.nextInt(1, 11) / 100.0D, rs.nextInt(3, 15)).asSupplier(rs)).get(); - } - - @Override - public Future> sendWithResult(Message message, InetAddressAndPort to) - { - AsyncPromise> promise = new AsyncPromise<>(); - sendWithCallback(message, to, new RequestCallback() - { - @Override - public void onResponse(Message msg) - { - promise.trySuccess(msg); - } - - @Override - public void onFailure(InetAddressAndPort from, RequestFailureReason failureReason) - { - promise.tryFailure(new MessagingService.FailureResponseException(from, failureReason)); - } - - @Override - public boolean invokeOnFailure() - { - return true; - } - }); - return promise; - } - - @Override - public void respond(V response, Message message) - { - send(message.responseWith(response), message.respondTo()); + super(broadcastAddressAndPort, + Cluster.this::action, + new NetworkDelaySupplier() + { + private final NetworkDelaySupplier delegate = SimulatedMessageDelivery.randomDelay(rs); + @Nullable + @Override + public Duration jitter(Message msg, InetAddressAndPort to) + { + Set allowedFaults = allowedMessageFaults.apply(nodes.get(to), msg); + if (!allowedFaults.contains(Faults.DELAY) || broadcastAddressAndPort.equals(to)) + return null; + return delegate.jitter(msg, to); + } + }, + (to, msg) -> unorderedScheduled.submit(() -> nodes.get(to).handle(msg)), + (action, to, msg) -> logger.warn("{} message {}", action, msg), + unorderedScheduled::schedule, + failures::add); } } @@ -1059,7 +850,7 @@ class Node implements SharedContext final InetAddressAndPort addressAndPort; final Collection tokens; final ActiveRepairService activeRepairService; - final IVerbHandler verbHandler; + final SimulatedMessageReceiver receiver; final Messaging messaging; final IValidationManager validationManager; private FailingBiConsumer doValidation = DEFAULT_VALIDATION; @@ -1093,7 +884,7 @@ private Node(UUID hostId, InetAddressAndPort addressAndPort, Collection t validator.fail(e); } }); - this.verbHandler = new IVerbHandler<>() + this.receiver = messaging.receiver(new IVerbHandler<>() { private final RepairMessageVerbHandler repairVerbHandler = new RepairMessageVerbHandler(Node.this); private final IVerbHandler paxosStartPrepareCleanup = PaxosStartPrepareCleanup.createVerbHandler(Node.this); @@ -1125,7 +916,7 @@ public void doVerb(Message message) throws IOException repairVerbHandler.doVerb(message); } } - }; + }); activeRepairService.start(); } @@ -1165,38 +956,7 @@ void handle(Message msg) } for (MessageListener l : listeners) l.preHandle(this, msg); - if (msg.verb().isResponse()) - { - // handle callbacks - CallbackKey key = new CallbackKey(msg.id(), msg.from()); - if (messaging.callbacks.containsKey(key)) - { - CallbackContext callback = messaging.callbacks.remove(key); - if (callback == null) - return; - try - { - if (msg.isFailureResponse()) - callback.onFailure(msg.from(), (RequestFailureReason) msg.payload); - else callback.onResponse(msg); - } - catch (Throwable t) - { - failures.add(t); - } - } - } - else - { - try - { - verbHandler.doVerb(msg); - } - catch (Throwable e) - { - failures.add(e); - } - } + receiver.recieve(msg); } public UUID hostId() @@ -1351,6 +1111,12 @@ public PaxosRepairState paxosRepairState() return paxosRepairState; } + @Override + public Supplier timeUUID() + { + return Generators.toGen(Generators.timeUUID()).asSupplier(rs); + } + public String toString() { return "Node{" + @@ -1377,14 +1143,6 @@ private Message serde(Message msg) } } - private static Gen fromQT(org.quicktheories.core.Gen qt) - { - return rs -> { - JavaRandom r = new JavaRandom(rs.asJdkRandom()); - return qt.generate(r); - }; - } - public static class HackStrat extends LocalStrategy { public HackStrat(String keyspaceName, Map configOptions) @@ -1450,7 +1208,10 @@ private void checkAccess() if (("org.apache.cassandra.service.paxos.Paxos".equals(next.getClassName()) && "newBallot".equals(next.getMethodName())) || ("org.apache.cassandra.service.paxos.uncommitted.PaxosBallotTracker".equals(next.getClassName()) && "updateLowBound".equals(next.getMethodName()))) return Access.MAIN_THREAD_ONLY; - if (next.getClassName().startsWith("org.apache.cassandra.db.") + if (next.getClassName().startsWith("org.apache.cassandra.accord.") + || next.getClassName().startsWith("org.apache.cassandra.journal.") + || next.getClassName().startsWith("org.apache.cassandra.service.accord.") + || next.getClassName().startsWith("org.apache.cassandra.db.") || next.getClassName().startsWith("org.apache.cassandra.gms.") || next.getClassName().startsWith("org.apache.cassandra.cql3.") || next.getClassName().startsWith("org.apache.cassandra.metrics.") diff --git a/test/unit/org/apache/cassandra/repair/HappyPathFuzzTest.java b/test/unit/org/apache/cassandra/repair/HappyPathFuzzTest.java index b6e34fbcf09c..f8e570b5c103 100644 --- a/test/unit/org/apache/cassandra/repair/HappyPathFuzzTest.java +++ b/test/unit/org/apache/cassandra/repair/HappyPathFuzzTest.java @@ -19,12 +19,18 @@ package org.apache.cassandra.repair; import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; import java.util.List; +import java.util.Map; +import java.util.concurrent.TimeUnit; +import java.util.stream.LongStream; import org.junit.Test; import accord.utils.Gen; import accord.utils.Gens; +import org.agrona.collections.LongArrayList; import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.config.RetrySpec; import org.apache.cassandra.utils.Closeable; @@ -38,6 +44,8 @@ public void happyPath() { // disable all retries, no delays/drops are possible DatabaseDescriptor.getRepairRetrySpec().maxAttempts = RetrySpec.MaxAttempt.DISABLED; + Map repairTypeRuntimes = new HashMap<>(); + long realStartNanos = System.nanoTime(); qt().withPure(false).withExamples(10).check(rs -> { Cluster cluster = new Cluster(rs); Gen coordinatorGen = Gens.pick(cluster.nodes.keySet()).map(cluster.nodes::get); @@ -47,6 +55,7 @@ public void happyPath() { Cluster.Node coordinator = coordinatorGen.next(rs); + long nowNanos = System.nanoTime(); RepairCoordinator repair = coordinator.repair(KEYSPACE, repairOption(rs, coordinator, KEYSPACE, TABLES)); repair.run(); boolean shouldSync = rs.nextBoolean(); @@ -54,9 +63,29 @@ public void happyPath() closeables.add(cluster.nodes.get(pickParticipant(rs, coordinator, repair)).doValidation((cfs, validator) -> addMismatch(rs, cfs, validator))); runAndAssertSuccess(cluster, example, shouldSync, repair); + repairTypeRuntimes.computeIfAbsent(repair.state.getType(), ignore -> new LongArrayList()).addLong(System.nanoTime() - nowNanos); closeables.forEach(Closeable::close); closeables.clear(); } }); + long realDurationNanos = System.nanoTime() - realStartNanos; + long repairDurationsNanos = 0; + StringBuilder sb = new StringBuilder(); + for (Map.Entry e : repairTypeRuntimes.entrySet()) + { + sb.append(e.getKey()); + long[] times = e.getValue().toLongArray(); + repairDurationsNanos += LongStream.of(times).sum(); + Arrays.sort(times); + long min = times[0]; + long median = times[times.length / 2]; + long max = times[times.length - 1]; + sb.append(": min=").append(TimeUnit.NANOSECONDS.toMillis(min)) + .append(", median=").append(TimeUnit.NANOSECONDS.toMillis(median)) + .append(", max=").append(TimeUnit.NANOSECONDS.toMillis(max)) + .append(", count=").append(times.length) + .append('\n'); + } + logger.info("Repair runtimes (in millis):\nTest Duration {}\nRepair Duration {}\n{}", TimeUnit.NANOSECONDS.toMillis(realDurationNanos), TimeUnit.NANOSECONDS.toMillis(repairDurationsNanos), sb); } } diff --git a/test/unit/org/apache/cassandra/repair/LocalSyncTaskTest.java b/test/unit/org/apache/cassandra/repair/LocalSyncTaskTest.java index 95f630dc0571..c03fe2aac70e 100644 --- a/test/unit/org/apache/cassandra/repair/LocalSyncTaskTest.java +++ b/test/unit/org/apache/cassandra/repair/LocalSyncTaskTest.java @@ -171,7 +171,7 @@ private static void assertNumInOut(StreamPlan plan, int expectedIncoming, int ex StreamCoordinator coordinator = plan.getCoordinator(); StreamSession session = Iterables.getOnlyElement(coordinator.getAllStreamSessions()); assertEquals(expectedIncoming, session.getNumRequests()); - assertEquals(expectedOutgoing, session.getNumTransfers()); + assertEquals(expectedOutgoing, session.getNumKeyspaceTransfers()); } @Test diff --git a/test/unit/org/apache/cassandra/repair/RepairGenerators.java b/test/unit/org/apache/cassandra/repair/RepairGenerators.java new file mode 100644 index 000000000000..2175a37789d9 --- /dev/null +++ b/test/unit/org/apache/cassandra/repair/RepairGenerators.java @@ -0,0 +1,201 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.repair; + +import java.util.ArrayList; +import java.util.List; + +import accord.utils.Gen; +import accord.utils.Gens; + +public class RepairGenerators +{ + public static final List LOCAL_RANGE = List.of(); + public static final List PRIMARY_RANGE = List.of("-pr"); // repair calls this partition range, but StorageService calls this primary + + public enum RepairType + { + FULL("--full"), + IR(""); + + public final String arg; + + RepairType(String s) + { + this.arg = s; + } + } + + public enum PreviewType + { + NONE(""), + REPAIRED("--validate"), + UNREPAIRED("--preview"); + + public final String arg; + + PreviewType(String s) + { + this.arg = s; + } + } + + public static boolean isPreview(List args) + { + return args.stream().anyMatch(s -> PreviewType.REPAIRED.arg.equals(s) + || PreviewType.UNREPAIRED.arg.equals(s)); + } + + public static PreviewType previewType(List args) + { + for (String s : args) + { + if (PreviewType.REPAIRED.arg.equals(s)) + return PreviewType.REPAIRED; + if (PreviewType.UNREPAIRED.arg.equals(s)) + return PreviewType.UNREPAIRED; + } + return PreviewType.NONE; + } + + public static boolean isFull(List args) + { + return args.stream().anyMatch(s -> RepairType.FULL.arg.equals(s)); + } + + public static boolean isIncremental(List args) + { + return !isFull(args); + } + + + public static class Builder + { + final Gen> tablesGen; + Gen typeGen = Gens.enums().all(RepairType.class); + Gen previewTypeGen = Gens.enums().all(PreviewType.class); + Gen> ranges = Gens.pick(List.of(), PRIMARY_RANGE); + Gen optimizeStreamsGen = Gens.bools().all(); + Gen parallelismGen = Gens.enums().all(RepairParallelism.class); + Gen skipPaxosGen = i -> false; + Gen skipAccordGen = i -> false; + + public Builder(Gen> tablesGen) + { + this.tablesGen = tablesGen; + } + + public Builder withType(Gen typeGen) + { + this.typeGen = typeGen; + return this; + } + + public Builder withPreviewType(Gen previewTypeGen) + { + this.previewTypeGen = previewTypeGen; + return this; + } + + public Builder withRanges(Gen> ranges) + { + this.ranges = ranges; + return this; + } + + public Builder withOptimizeStreams(Gen optimizeStreamsGen) + { + this.optimizeStreamsGen = optimizeStreamsGen; + return this; + } + + public Builder withParallelism(Gen parallelismGen) + { + this.parallelismGen = parallelismGen; + return this; + } + + public Builder withSkipPaxosGen(Gen skipPaxosGen) + { + this.skipPaxosGen = skipPaxosGen; + return this; + } + + public Builder withSkipAccordGen(Gen skipAccordGen) + { + this.skipAccordGen = skipAccordGen; + return this; + } + + public Gen> build() + { + return rs -> { + RepairType type = typeGen.next(rs); + PreviewType previewType = previewTypeGen.next(rs); + List args = new ArrayList<>(); + args.addAll(tablesGen.next(rs)); + args.addAll(ranges.next(rs)); + if (skipPaxosGen.next(rs)) + args.add("--skip-paxos"); + if (skipAccordGen.next(rs)) + args.add("--skip-accord"); + switch (type) + { + case IR: + // default + break; + case FULL: + args.add(type.arg); + break; + default: + throw new AssertionError("Unsupported repair type: " + type); + } + switch (previewType) + { + case NONE: + break; + case REPAIRED: + case UNREPAIRED: + args.add(previewType.arg); + break; + default: + throw new AssertionError("Unsupported preview type: " + previewType); + } + RepairParallelism parallelism = parallelismGen.next(rs); + switch (parallelism) + { + case SEQUENTIAL: + args.add("--sequential"); + break; + case PARALLEL: + // default + break; + case DATACENTER_AWARE: + args.add("--dc-parallel"); + break; + default: + throw new AssertionError("Unknown parallelism: " + parallelism); + } + if (optimizeStreamsGen.next(rs)) + args.add("--optimise-streams"); + return args; + }; + } + } +} diff --git a/test/unit/org/apache/cassandra/repair/RepairJobTest.java b/test/unit/org/apache/cassandra/repair/RepairJobTest.java index ea32bd750b88..bea5cef88a86 100644 --- a/test/unit/org/apache/cassandra/repair/RepairJobTest.java +++ b/test/unit/org/apache/cassandra/repair/RepairJobTest.java @@ -37,12 +37,6 @@ import com.google.common.collect.ImmutableMap; import com.google.common.util.concurrent.ListenableFuture; - -import org.apache.cassandra.repair.messages.SyncResponse; -import org.apache.cassandra.repair.messages.ValidationResponse; -import org.apache.cassandra.config.DatabaseDescriptor; -import org.apache.cassandra.service.paxos.cleanup.PaxosRepairState; -import org.assertj.core.api.Assertions; import org.junit.After; import org.junit.Before; import org.junit.BeforeClass; @@ -50,6 +44,7 @@ import org.apache.cassandra.SchemaLoader; import org.apache.cassandra.concurrent.ExecutorPlus; +import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.db.ConsistencyLevel; import org.apache.cassandra.db.Keyspace; import org.apache.cassandra.dht.ByteOrderedPartitioner; @@ -64,11 +59,14 @@ import org.apache.cassandra.net.Verb; import org.apache.cassandra.repair.messages.RepairMessage; import org.apache.cassandra.repair.messages.SyncRequest; +import org.apache.cassandra.repair.messages.SyncResponse; +import org.apache.cassandra.repair.messages.ValidationResponse; import org.apache.cassandra.schema.KeyspaceParams; import org.apache.cassandra.service.ActiveRepairService; import org.apache.cassandra.service.paxos.Paxos; import org.apache.cassandra.service.paxos.cleanup.PaxosCleanupRequest; import org.apache.cassandra.service.paxos.cleanup.PaxosCleanupResponse; +import org.apache.cassandra.service.paxos.cleanup.PaxosRepairState; import org.apache.cassandra.streaming.PreviewKind; import org.apache.cassandra.utils.ByteBufferUtil; import org.apache.cassandra.utils.FBUtilities; @@ -78,8 +76,11 @@ import org.apache.cassandra.utils.Throwables; import org.apache.cassandra.utils.TimeUUID; import org.apache.cassandra.utils.asserts.SyncTaskListAssert; +import org.assertj.core.api.Assertions; import static java.util.Collections.emptySet; +import static org.apache.cassandra.net.Verb.PAXOS2_CLEANUP_REQ; +import static org.apache.cassandra.net.Verb.PAXOS2_CLEANUP_START_PREPARE_REQ; import static org.apache.cassandra.repair.RepairParallelism.SEQUENTIAL; import static org.apache.cassandra.streaming.PreviewKind.NONE; import static org.apache.cassandra.utils.TimeUUID.Generator.nextTimeUUID; @@ -87,8 +88,6 @@ import static org.assertj.core.api.Assertions.assertThat; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; -import static org.apache.cassandra.net.Verb.PAXOS2_CLEANUP_START_PREPARE_REQ; -import static org.apache.cassandra.net.Verb.PAXOS2_CLEANUP_REQ; import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; @@ -107,7 +106,7 @@ public class RepairJobTest private static final Range RANGE_3 = range(4, 5); private static final RepairJobDesc JOB_DESC = new RepairJobDesc(nextTimeUUID(), nextTimeUUID(), KEYSPACE, CF, Collections.emptyList()); private static final List> FULL_RANGE = Collections.singletonList(new Range<>(MURMUR3_PARTITIONER.getMinimumToken(), - MURMUR3_PARTITIONER.getMaximumToken())); + MURMUR3_PARTITIONER.getMaximumTokenForSplitting())); private static InetAddressAndPort addr1; private static InetAddressAndPort addr2; private static InetAddressAndPort addr3; @@ -123,14 +122,14 @@ private static class MeasureableRepairSession extends RepairSession { private final List> syncCompleteCallbacks = new ArrayList<>(); - public MeasureableRepairSession(TimeUUID parentRepairSession, CommonRange commonRange, String keyspace, + public MeasureableRepairSession(TimeUUID parentRepairSession, CommonRange commonRange, boolean excludedDeadNodes, String keyspace, RepairParallelism parallelismDegree, boolean isIncremental, boolean pullRepair, - PreviewKind previewKind, boolean optimiseStreams, boolean repairPaxos, boolean paxosOnly, - boolean dontPurgeTombstones, String... cfnames) + PreviewKind previewKind, boolean optimiseStreams, boolean repairData, boolean repairPaxos, + boolean dontPurgeTombstones, boolean repairAccord, String... cfnames) { super(SharedContext.Global.instance, new Scheduler.NoopScheduler(), - parentRepairSession, commonRange, keyspace, parallelismDegree, isIncremental, pullRepair, - previewKind, optimiseStreams, repairPaxos, paxosOnly, dontPurgeTombstones, cfnames); + parentRepairSession, commonRange, excludedDeadNodes, keyspace, parallelismDegree, isIncremental, pullRepair, + previewKind, optimiseStreams, repairData, repairPaxos, dontPurgeTombstones, repairAccord, cfnames); } @Override @@ -194,9 +193,9 @@ public void setup() ActiveRepairService.UNREPAIRED_SSTABLE, false, PreviewKind.NONE); this.session = new MeasureableRepairSession(parentRepairSession, - new CommonRange(neighbors, emptySet(), FULL_RANGE), + new CommonRange(neighbors, emptySet(), FULL_RANGE), false, KEYSPACE, SEQUENTIAL, false, false, - NONE, false, true, false, false, CF); + NONE, false, true, true, false, true, CF); this.job = new RepairJob(session, CF); this.sessionJobDesc = new RepairJobDesc(session.state.parentRepairSession, session.getId(), @@ -330,7 +329,7 @@ public void testValidationFailure() throws InterruptedException, TimeoutExceptio interceptRepairMessages(mockTrees, new ArrayList<>()); - try + try { job.run(); job.get(TEST_TIMEOUT_S, TimeUnit.SECONDS); diff --git a/test/unit/org/apache/cassandra/repair/RepairSessionTest.java b/test/unit/org/apache/cassandra/repair/RepairSessionTest.java index 470a2efc538e..c59be70c7ba1 100644 --- a/test/unit/org/apache/cassandra/repair/RepairSessionTest.java +++ b/test/unit/org/apache/cassandra/repair/RepairSessionTest.java @@ -65,10 +65,9 @@ public void testConviction() throws Exception Set endpoints = Sets.newHashSet(remote); RepairSession session = new RepairSession(SharedContext.Global.instance, new Scheduler.NoopScheduler(), parentSessionId, new CommonRange(endpoints, Collections.emptySet(), Arrays.asList(repairRange)), - "Keyspace1", RepairParallelism.SEQUENTIAL, - false, false, - PreviewKind.NONE, false, false, false, false, - "Standard1"); + false, "Keyspace1", RepairParallelism.SEQUENTIAL, + false, false, PreviewKind.NONE, false, + false, false, false, false, "Standard1"); // perform convict session.convict(remote, Double.MAX_VALUE); diff --git a/test/unit/org/apache/cassandra/repair/ValidationTaskTest.java b/test/unit/org/apache/cassandra/repair/ValidationTaskTest.java index e7f325de52ff..0bb93cad37d6 100644 --- a/test/unit/org/apache/cassandra/repair/ValidationTaskTest.java +++ b/test/unit/org/apache/cassandra/repair/ValidationTaskTest.java @@ -67,7 +67,7 @@ public void shouldReleaseTreesOnAbort() throws Exception IPartitioner partitioner = Murmur3Partitioner.instance; MerkleTrees trees = new MerkleTrees(partitioner); - trees.addMerkleTree(128, new Range<>(partitioner.getMinimumToken(), partitioner.getMaximumToken())); + trees.addMerkleTree(128, new Range<>(partitioner.getMinimumToken(), partitioner.getMaximumTokenForSplitting())); task.treesReceived(trees); assertEquals(1, trees.size()); diff --git a/test/unit/org/apache/cassandra/repair/autorepair/AutoRepairConfigRepairTypeTest.java b/test/unit/org/apache/cassandra/repair/autorepair/AutoRepairConfigRepairTypeTest.java new file mode 100644 index 000000000000..ec36bb5cfd0c --- /dev/null +++ b/test/unit/org/apache/cassandra/repair/autorepair/AutoRepairConfigRepairTypeTest.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.repair.autorepair; + +import org.junit.Assert; +import org.junit.Test; + +/** + * Unit tests for {@link org.apache.cassandra.repair.autorepair.AutoRepairConfig.RepairType} + */ +public class AutoRepairConfigRepairTypeTest +{ + @Test + public void testRepairTypeParsing() + { + Assert.assertEquals(AutoRepairConfig.RepairType.FULL, AutoRepairConfig.RepairType.parse("FULL")); + Assert.assertEquals(AutoRepairConfig.RepairType.FULL, AutoRepairConfig.RepairType.parse("FuLl")); + Assert.assertEquals(AutoRepairConfig.RepairType.FULL, AutoRepairConfig.RepairType.parse("full")); + Assert.assertEquals(AutoRepairConfig.RepairType.INCREMENTAL, AutoRepairConfig.RepairType.parse("INCREMENTAL")); + Assert.assertEquals(AutoRepairConfig.RepairType.INCREMENTAL, AutoRepairConfig.RepairType.parse("incremental")); + Assert.assertEquals(AutoRepairConfig.RepairType.INCREMENTAL, AutoRepairConfig.RepairType.parse("inCRemenTal")); + Assert.assertEquals(AutoRepairConfig.RepairType.PREVIEW_REPAIRED, AutoRepairConfig.RepairType.parse("PREVIEW_REPAIRED")); + Assert.assertEquals(AutoRepairConfig.RepairType.PREVIEW_REPAIRED, AutoRepairConfig.RepairType.parse("preview_repaired")); + Assert.assertEquals(AutoRepairConfig.RepairType.PREVIEW_REPAIRED, AutoRepairConfig.RepairType.parse("Preview_Repaired")); + } + + @Test(expected = NullPointerException.class) + public void testNullRepairTypeParsing() + { + AutoRepairConfig.RepairType.parse(null); + } + + @Test(expected = IllegalArgumentException.class) + public void testEmptyRepairTypeParsing() + { + AutoRepairConfig.RepairType.parse(""); + } + + @Test(expected = IllegalArgumentException.class) + public void testInvalidRepairTypeParsing() + { + AutoRepairConfig.RepairType.parse("very_FULL"); + } +} diff --git a/test/unit/org/apache/cassandra/repair/autorepair/AutoRepairConfigTest.java b/test/unit/org/apache/cassandra/repair/autorepair/AutoRepairConfigTest.java new file mode 100644 index 000000000000..93ef906a287d --- /dev/null +++ b/test/unit/org/apache/cassandra/repair/autorepair/AutoRepairConfigTest.java @@ -0,0 +1,509 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.repair.autorepair; + +import java.util.HashSet; +import java.util.Map; +import java.util.Collections; +import java.util.Set; + +import com.google.common.collect.ImmutableSet; + +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.config.DurationSpec; +import org.apache.cassandra.config.ParameterizedClass; +import org.apache.cassandra.cql3.CQLTester; +import org.apache.cassandra.exceptions.ConfigurationException; +import org.apache.cassandra.repair.autorepair.AutoRepairConfig.Options; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; + +/** + * Unit tests for {@link org.apache.cassandra.repair.autorepair.AutoRepairConfig} + */ +@RunWith(Parameterized.class) +public class AutoRepairConfigTest extends CQLTester +{ + private AutoRepairConfig config; + + private final Set testSet = ImmutableSet.of("dc1"); + + @Parameterized.Parameter + public AutoRepairConfig.RepairType repairType; + + @Parameterized.Parameters + public static Object[] repairTypes() + { + return AutoRepairConfig.RepairType.values(); + } + + @Before + public void setUp() + { + config = new AutoRepairConfig(true); + AutoRepair.SLEEP_IF_REPAIR_FINISHES_QUICKLY = new DurationSpec.IntSecondsBound("0s"); + } + + @Test + public void autoRepairConfigDefaultsAreNotNull() + { + AutoRepairConfig config = new AutoRepairConfig(); + assertNotNull(config.global_settings); + } + + @Test + public void autoRepairConfigRepairTypesAreNotNull() + { + AutoRepairConfig config = new AutoRepairConfig(); + for (AutoRepairConfig.RepairType repairType : AutoRepairConfig.RepairType.values()) + { + assertNotNull(config.getOptions(repairType)); + } + } + + @Test + public void testIsAutoRepairEnabledReturnsTrueWhenRepairIsEnabled() + { + config.global_settings.enabled = true; + + assertTrue(config.isAutoRepairEnabled(repairType)); + } + + @Test + public void testRepairMinDuration() + { + config = new AutoRepairConfig(false); + + config.setRepairTaskMinDuration("3s"); + assertEquals(3L, config.getRepairTaskMinDuration().toSeconds()); + } + + @Test + public void testIsAutoRepairEnabledReturnsTrueWhenRepairIsDisabledGlobally() + { + config = new AutoRepairConfig(false); + config.global_settings.enabled = true; + assertFalse(config.isAutoRepairEnabled(repairType)); + } + + @Test + public void testIsAutoRepairEnabledReturnsTrueWhenRepairIsDisabledForRepairType() + { + config.global_settings.enabled = true; + config.setAutoRepairEnabled(repairType, false); + assertFalse(config.getOptions(repairType).enabled); + } + + @Test + public void testSetAutoRepairEnabledNoMVOrCDC() + { + DatabaseDescriptor.setCDCEnabled(false); + DatabaseDescriptor.setMaterializedViewsEnabled(false); + config.setAutoRepairEnabled(repairType, true); + + assertTrue(config.getOptions(repairType).enabled); + } + + @Test + public void testSetRepairByKeyspace() + { + // Should default to true. + assertTrue(config.getRepairByKeyspace(repairType)); + + config.setRepairByKeyspace(repairType, false); + + assertFalse(config.getOptions(repairType).repair_by_keyspace); + } + + @Test + public void testGetRepairByKeyspace() + { + config.global_settings.repair_by_keyspace = true; + + boolean result = config.getRepairByKeyspace(repairType); + + assertTrue(result); + } + + @Test + public void testSetRepairThreads() + { + config.setRepairThreads(repairType, 5); + + assertEquals(5, config.getOptions(repairType).number_of_repair_threads.intValue()); + } + + @Test + public void testGetRepairThreads() + { + config.global_settings.number_of_repair_threads = 5; + + int result = config.getRepairThreads(repairType); + + assertEquals(5, result); + } + + @Test + public void testGetRepairMinFrequencyInHours() + { + config.global_settings.min_repair_interval = new DurationSpec.IntSecondsBound("5s"); + + DurationSpec.IntSecondsBound result = config.getRepairMinInterval(repairType); + + assertEquals(5, result.toSeconds()); + } + + @Test + public void testSetRepairMinFrequencyInHours() + { + config.setRepairMinInterval(repairType, "5s"); + + assertEquals(5, config.getOptions(repairType).min_repair_interval.toSeconds()); + } + + @Test + public void testGetAutoRepairHistoryClearDeleteHostsBufferInSec() + { + config.history_clear_delete_hosts_buffer_interval = new DurationSpec.IntSecondsBound("5s"); + + int result = config.getAutoRepairHistoryClearDeleteHostsBufferInterval().toSeconds(); + + assertEquals(5, result); + } + + @Test + public void testSetAutoRepairHistoryClearDeleteHostsBufferInSec() + { + config.setAutoRepairHistoryClearDeleteHostsBufferInterval("5s"); + + assertEquals(new DurationSpec.IntSecondsBound("5s"), config.history_clear_delete_hosts_buffer_interval); + } + + @Test + public void testGetRepairSSTableCountHigherThreshold() + { + config.global_settings.sstable_upper_threshold = 5; + + int result = config.getRepairSSTableCountHigherThreshold(repairType); + + assertEquals(5, result); + } + + @Test + public void testSetRepairSSTableCountHigherThreshold() + { + config.setRepairSSTableCountHigherThreshold(repairType, 5); + + assertEquals(5, config.getOptions(repairType).sstable_upper_threshold.intValue()); + } + + @Test + public void testGetAutoRepairTableMaxRepairTimeInSec() + { + config.global_settings.table_max_repair_time = new DurationSpec.IntSecondsBound("5s"); + + DurationSpec.IntSecondsBound result = config.getAutoRepairTableMaxRepairTime(repairType); + + assertEquals(5, result.toSeconds()); + } + + @Test + public void testSetAutoRepairTableMaxRepairTimeInSec() + { + config.setAutoRepairTableMaxRepairTime(repairType, "5s"); + + assertEquals(5, config.getOptions(repairType).table_max_repair_time.toSeconds()); +} + + @Test + public void testGetIgnoreDCs() + { + config.global_settings.ignore_dcs = testSet; + + Set result = config.getIgnoreDCs(repairType); + + assertEquals(testSet, result); + } + + @Test + public void testSetIgnoreDCs() + { + config.setIgnoreDCs(repairType, testSet); + + assertEquals(config.getOptions(repairType).ignore_dcs, testSet); + } + + @Test + public void testGetRepairPrimaryTokenRangeOnly() + { + config.global_settings.repair_primary_token_range_only = true; + + boolean result = config.getRepairPrimaryTokenRangeOnly(repairType); + + assertTrue(result); + } + + @Test + public void testSetRepairPrimaryTokenRangeOnly() + { + config.setRepairPrimaryTokenRangeOnly(repairType, true); + + assertTrue(config.getOptions(repairType).repair_primary_token_range_only); + } + + @Test + public void testGetParallelRepairPercentageInGroup() + { + config.global_settings.parallel_repair_percentage = 5; + + int result = config.getParallelRepairPercentage(repairType); + + assertEquals(5, result); + } + + @Test + public void testSetParallelRepairPercentageInGroup() + { + config.setParallelRepairPercentage(repairType, 5); + + assertEquals(5, config.getOptions(repairType).parallel_repair_percentage.intValue()); + } + + @Test + public void testGetParallelRepairCountInGroup() + { + config.global_settings.parallel_repair_count = 5; + + int result = config.getParallelRepairCount(repairType); + + assertEquals(5, result); + } + + @Test + public void testSetParallelRepairCountInGroup() + { + config.setParallelRepairCount(repairType, 5); + + assertEquals(5, config.getOptions(repairType).parallel_repair_count.intValue()); + } + + @Test + public void testGetAllowParallelReplicaRepair() + { + // should default to false + assertFalse(config.global_settings.allow_parallel_replica_repair); + assertFalse(config.getAllowParallelReplicaRepair(repairType)); + + // setting global to true should also cause repair type config to inherit. + config.global_settings.allow_parallel_replica_repair = true; + assertTrue(config.getAllowParallelReplicaRepair(repairType)); + + } + + @Test + public void testSetAllowParallelReplicaRepair() + { + // should default to false + assertFalse(config.getAllowParallelReplicaRepair(repairType)); + + // setting explicitly for repair type should update it + config.setAllowParallelReplicaRepair(repairType, true); + assertTrue(config.getAllowParallelReplicaRepair(repairType)); + } + + @Test + public void testGetAllowParallelReplicaRepairAcrossSchedules() + { + // should default to true + assertTrue(config.global_settings.allow_parallel_replica_repair_across_schedules); + assertTrue(config.getAllowParallelReplicaRepairAcrossSchedules(repairType)); + + // setting global to true should also cause repair type config to inherit. + config.global_settings.allow_parallel_replica_repair_across_schedules = false; + assertFalse(config.getAllowParallelReplicaRepairAcrossSchedules(repairType)); + + } + + @Test + public void testSetAllowParallelReplicaRepairAcrossSchedules() + { + // should default to true + assertTrue(config.getAllowParallelReplicaRepairAcrossSchedules(repairType)); + + // setting explicitly for repair type should update it + config.setAllowParallelReplicaRepairAcrossSchedules(repairType, false); + assertFalse(config.getAllowParallelReplicaRepairAcrossSchedules(repairType)); + } + + @Test + public void testGetMaterializedViewRepairEnabled() + { + config.global_settings.materialized_view_repair_enabled = true; + + boolean result = config.getMaterializedViewRepairEnabled(repairType); + + assertTrue(result); + } + + @Test + public void testSetMVRepairEnabled() + { + config.setMaterializedViewRepairEnabled(repairType, true); + + assertTrue(config.getOptions(repairType).materialized_view_repair_enabled); + } + + @Test + public void testSetForceRepairNewNode() + { + config.setForceRepairNewNode(repairType, true); + + assertTrue(config.getOptions(repairType).force_repair_new_node); + } + + @Test + public void testGetForceRepairNewNode() + { + config.global_settings.force_repair_new_node = true; + + boolean result = config.getForceRepairNewNode(repairType); + + assertTrue(result); + } + + @Test + public void testIsAutoRepairSchedulingEnabledDefault() + { + config = new AutoRepairConfig(); + + boolean result = config.isAutoRepairSchedulingEnabled(); + + assertFalse(result); + } + + @Test + public void testIsAutoRepairSchedulingEnabledTrue() + { + boolean result = config.isAutoRepairSchedulingEnabled(); + + assertTrue(result); + } + + @Test + public void testGetDefaultOptionsMVRepairIsEnabledByDefault() + { + Options defaultOptions = Options.getDefaultOptions(); + + assertFalse(defaultOptions.materialized_view_repair_enabled); + } + + @Test + public void testGetDefaultOptionsTokenRangeSplitter() + { + Options defaultOptions = Options.getDefaultOptions(); + + ParameterizedClass expectedDefault = new ParameterizedClass(RepairTokenRangeSplitter.class.getName(), Collections.emptyMap()); + + assertEquals(expectedDefault, defaultOptions.token_range_splitter); + assertEquals(RepairTokenRangeSplitter.class.getName(), AutoRepairConfig.newAutoRepairTokenRangeSplitter(repairType, defaultOptions.token_range_splitter).getClass().getName()); + } + + @Test(expected = ConfigurationException.class) + public void testInvalidTokenRangeSplitter() + { + AutoRepairConfig.newAutoRepairTokenRangeSplitter(repairType, new ParameterizedClass("invalid-class", Collections.emptyMap())); + } + + @Test + public void testSetInitialSchedulerDelay() + { + config.setInitialSchedulerDelay(repairType, "5s"); + + assertEquals(5, config.getOptions(repairType).initial_scheduler_delay.toSeconds()); + } + + @Test + public void testGetInitialSchedulerDelay() + { + config.global_settings.initial_scheduler_delay = new DurationSpec.IntSecondsBound("5s"); + + int result = config.getInitialSchedulerDelay(repairType).toSeconds(); + + assertEquals(5, result); + } + + @Test + public void testSetRepairSessionTimeout() + { + config.setRepairSessionTimeout(repairType, "1h"); + + assertEquals(3600, config.getOptions(repairType).repair_session_timeout.toSeconds()); + } + + @Test + public void testDefaultOptions() + { + Map defaultOptions = Options.getDefaultOptionsMap(); + Options options = defaultOptions.get(repairType); + assertFalse(options.enabled); + assertTrue(options.repair_by_keyspace); + assertEquals(Integer.valueOf(1), options.number_of_repair_threads); + assertEquals(Integer.valueOf(3), options.parallel_repair_count); + assertEquals(Integer.valueOf(3), options.parallel_repair_percentage); + assertEquals(Integer.valueOf(50000), options.sstable_upper_threshold); + assertEquals(new HashSet<>(), options.ignore_dcs); + assertTrue(options.repair_primary_token_range_only); + assertFalse(options.force_repair_new_node); + assertEquals(new DurationSpec.IntSecondsBound("6h"), options.table_max_repair_time); + assertFalse(options.materialized_view_repair_enabled); + assertEquals(new ParameterizedClass(RepairTokenRangeSplitter.class.getName(), Collections.emptyMap()), options.token_range_splitter); + assertEquals(new DurationSpec.IntSecondsBound("5m"), options.initial_scheduler_delay); + assertEquals(new DurationSpec.IntSecondsBound("3h"), options.repair_session_timeout); + assertEquals(new DurationSpec.IntSecondsBound("24h"), options.min_repair_interval); + } + + @Test + public void testGlobalOptions() + { + AutoRepairConfig config = new AutoRepairConfig(); + assertFalse(config.global_settings.enabled); + assertTrue(config.global_settings.repair_by_keyspace); + assertEquals(Integer.valueOf(1), config.global_settings.number_of_repair_threads); + assertEquals(Integer.valueOf(3), config.global_settings.parallel_repair_count); + assertEquals(Integer.valueOf(3), config.global_settings.parallel_repair_percentage); + assertEquals(Integer.valueOf(50000), config.global_settings.sstable_upper_threshold); + assertEquals(new HashSet<>(), config.global_settings.ignore_dcs); + assertTrue(config.global_settings.repair_primary_token_range_only); + assertFalse(config.global_settings.force_repair_new_node); + assertEquals(new DurationSpec.IntSecondsBound("6h"), config.global_settings.table_max_repair_time); + assertFalse(config.global_settings.materialized_view_repair_enabled); + assertEquals(new ParameterizedClass(RepairTokenRangeSplitter.class.getName(), Collections.emptyMap()), config.global_settings.token_range_splitter); + assertEquals(new DurationSpec.IntSecondsBound("5m"), config.global_settings.initial_scheduler_delay); + assertEquals(new DurationSpec.IntSecondsBound("3h"), config.global_settings.repair_session_timeout); + assertEquals(new DurationSpec.IntSecondsBound("24h"), config.global_settings.min_repair_interval); + } +} diff --git a/test/unit/org/apache/cassandra/repair/autorepair/AutoRepairKeyspaceTest.java b/test/unit/org/apache/cassandra/repair/autorepair/AutoRepairKeyspaceTest.java new file mode 100644 index 000000000000..ac9dac8236e8 --- /dev/null +++ b/test/unit/org/apache/cassandra/repair/autorepair/AutoRepairKeyspaceTest.java @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.repair.autorepair; + +import java.util.HashSet; +import java.util.Iterator; +import java.util.Set; + +import org.apache.cassandra.schema.KeyspaceMetadata; +import org.apache.cassandra.schema.SystemDistributedKeyspace; +import org.apache.cassandra.schema.TableMetadata; +import org.junit.Assert; +import org.junit.BeforeClass; +import org.junit.Test; + +import org.apache.cassandra.config.DatabaseDescriptor; + +/** + * Unit tests for {@link org.apache.cassandra.schema.SystemDistributedKeyspace} + */ +public class AutoRepairKeyspaceTest +{ + @BeforeClass + public static void setupDatabaseDescriptor() + { + DatabaseDescriptor.daemonInitialization(); + } + + @Test + public void testEnsureAutoRepairTablesArePresent() + { + KeyspaceMetadata keyspaceMetadata = SystemDistributedKeyspace.metadata(); + Iterator iter = keyspaceMetadata.tables.iterator(); + Set actualDistributedTablesIter = new HashSet<>(); + while (iter.hasNext()) + { + actualDistributedTablesIter.add(iter.next().name); + } + + Assert.assertTrue(actualDistributedTablesIter.contains(SystemDistributedKeyspace.AUTO_REPAIR_HISTORY)); + Assert.assertTrue(actualDistributedTablesIter.contains(SystemDistributedKeyspace.AUTO_REPAIR_PRIORITY)); + } +} diff --git a/test/unit/org/apache/cassandra/repair/autorepair/AutoRepairMetricsTest.java b/test/unit/org/apache/cassandra/repair/autorepair/AutoRepairMetricsTest.java new file mode 100644 index 000000000000..d0b053a58778 --- /dev/null +++ b/test/unit/org/apache/cassandra/repair/autorepair/AutoRepairMetricsTest.java @@ -0,0 +1,92 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.repair.autorepair; + +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Test; + +import org.apache.cassandra.cql3.CQLTester; +import org.apache.cassandra.metrics.AutoRepairMetrics; +import org.apache.cassandra.metrics.AutoRepairMetricsManager; +import org.apache.cassandra.repair.autorepair.AutoRepairConfig.RepairType; +import org.apache.cassandra.repair.autorepair.AutoRepairUtils.RepairTurn; +import org.apache.cassandra.service.AutoRepairService; +import org.apache.cassandra.service.StorageService; + +import static org.apache.cassandra.Util.setAutoRepairEnabled; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +public class AutoRepairMetricsTest extends CQLTester +{ + + private AutoRepairMetrics metrics; + + @BeforeClass + public static void setupClass() throws Exception + { + setAutoRepairEnabled(true); + requireNetwork(); + AutoRepairUtils.setup(); + StorageService.instance.doAutoRepairSetup(); + + // Set min repair interval to an hour. + AutoRepairConfig config = AutoRepairService.instance.getAutoRepairConfig(); + config.setRepairMinInterval(RepairType.FULL, "1h"); + } + + @Before + public void setup() + { + metrics = AutoRepairMetricsManager.getMetrics(RepairType.FULL); + } + + @Test + public void testShouldRecordRepairStartLagAndResetOnMyTurn() + { + // record a last finish repair time of one day. + long oneDayAgo = AutoRepair.instance.currentTimeMs() - 86_400_000; + metrics.recordRepairStartLag(oneDayAgo); + + // expect a recorded lag time of approximately 1 day (last repair finish time) - 1 hour (min repair interval) + long expectedLag = 86400 - 3600; + long recordedLag = metrics.repairStartLagSec.getValue(); + assertTrue(String.format("Expected at last 23h of lag (%d) but got (%d)", expectedLag, recordedLag), + recordedLag >= expectedLag); + // Given timing, allow at most 5 seconds of skew. + assertTrue(String.format("Expected 23h of lag (%d) but got a larger value (%d)", expectedLag, recordedLag), + recordedLag <= expectedLag + 5); + + // expect lag time to be restarted when recording a turn. + metrics.recordTurn(RepairTurn.MY_TURN); + assertEquals(0, metrics.repairStartLagSec.getValue().intValue()); + } + + @Test + public void testShouldRecordRepairStartLagOfZeroWhenFinishTimeIsWithinMinRepairInterval() + { + // record a last finish repair time of one 30 minutes + long thirtyMinutesAgo = AutoRepair.instance.currentTimeMs() - 1_800_000; + metrics.recordRepairStartLag(thirtyMinutesAgo); + + // expect 0 lag because last repair finish time was less than min repair interval + assertEquals(0, metrics.repairStartLagSec.getValue().intValue()); + } +} diff --git a/test/unit/org/apache/cassandra/repair/autorepair/AutoRepairParameterizedTest.java b/test/unit/org/apache/cassandra/repair/autorepair/AutoRepairParameterizedTest.java new file mode 100644 index 000000000000..8e6a559d3343 --- /dev/null +++ b/test/unit/org/apache/cassandra/repair/autorepair/AutoRepairParameterizedTest.java @@ -0,0 +1,903 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.repair.autorepair; + +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.UUID; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicReference; + +import com.google.common.collect.Sets; + +import org.apache.cassandra.config.DurationSpec; +import org.apache.cassandra.cql3.statements.schema.TableAttributes; +import org.apache.cassandra.repair.RepairCoordinator; +import org.apache.cassandra.exceptions.ConfigurationException; +import org.apache.cassandra.locator.LocalStrategy; +import org.apache.cassandra.repair.RepairParallelism; +import org.apache.cassandra.repair.messages.RepairOption; +import org.apache.cassandra.schema.SystemDistributedKeyspace; +import org.apache.cassandra.service.StorageService; + +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +import org.apache.cassandra.concurrent.ScheduledExecutorPlus; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.cql3.CQLTester; +import org.apache.cassandra.cql3.QueryProcessor; +import org.apache.cassandra.db.ColumnFamilyStore; +import org.apache.cassandra.db.Keyspace; +import org.apache.cassandra.io.sstable.format.SSTableReader; +import org.apache.cassandra.metrics.AutoRepairMetricsManager; +import org.apache.cassandra.metrics.AutoRepairMetrics; +import org.apache.cassandra.schema.SchemaConstants; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.service.AutoRepairService; +import org.apache.cassandra.streaming.PreviewKind; +import org.apache.cassandra.utils.FBUtilities; +import org.apache.cassandra.utils.progress.ProgressEvent; +import org.apache.cassandra.utils.progress.ProgressEventType; +import org.apache.cassandra.utils.progress.ProgressListener; +import org.mockito.Mock; +import org.mockito.Mockito; +import org.mockito.MockitoAnnotations; +import org.mockito.invocation.InvocationOnMock; + +import static org.apache.cassandra.Util.setAutoRepairEnabled; +import static org.apache.cassandra.config.CassandraRelevantProperties.SYSTEM_DISTRIBUTED_DEFAULT_RF; +import static org.apache.cassandra.metrics.CassandraMetricsRegistry.Metrics; +import static org.apache.cassandra.repair.autorepair.AutoRepairUtils.RepairTurn.NOT_MY_TURN; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNotEquals; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; +import static org.mockito.ArgumentMatchers.anyBoolean; +import static org.mockito.Mockito.doAnswer; +import static org.mockito.Mockito.spy; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +/** + * Unit tests for {@link org.apache.cassandra.repair.autorepair.AutoRepair} + */ +@RunWith(Parameterized.class) +public class AutoRepairParameterizedTest extends CQLTester +{ + private static final String KEYSPACE = "ks"; + private static final String TABLE = "tbl"; + private static final String TABLE_DISABLED_AUTO_REPAIR = "tbl_disabled_auto_repair"; + private static final String MV = "mv"; + private static TableMetadata cfm; + private static TableMetadata cfmDisabledAutoRepair; + private static Keyspace keyspace; + private static int timeFuncCalls; + @Mock + ScheduledExecutorPlus mockExecutor; + @Mock + AutoRepairState autoRepairState; + @Mock + RepairCoordinator repairRunnable; + + // Expected number of repairs to be executed. + private static int expectedRepairAssignments; + + @Parameterized.Parameter() + public AutoRepairConfig.RepairType repairType; + + @Parameterized.Parameters(name = "repairType={0}") + public static Collection repairTypes() + { + return Arrays.asList(AutoRepairConfig.RepairType.values()); + } + + @BeforeClass + public static void setupClass() throws Exception + { + SYSTEM_DISTRIBUTED_DEFAULT_RF.setInt(1); + AutoRepair.SLEEP_IF_REPAIR_FINISHES_QUICKLY = new DurationSpec.IntSecondsBound("0s"); + setAutoRepairEnabled(true); + requireNetwork(); + AutoRepairUtils.setup(); + StorageService.instance.doAutoRepairSetup(); + DatabaseDescriptor.setCDCEnabled(false); + + // Calculate the expected number of keyspaces to be repaired, this should be all system keyspaces that are + // distributed, plus 1 for the table we created (ks.tbl). + int expectedKeyspacesGoingThroughRepair = 0; + for (Keyspace keyspace : Keyspace.all()) + { + // skip LocalStrategy keyspaces as these aren't repaired. + if (keyspace.getReplicationStrategy() instanceof LocalStrategy) + { + continue; + } + // skip system_traces keyspaces + if (keyspace.getName().equalsIgnoreCase(SchemaConstants.TRACE_KEYSPACE_NAME)) + { + continue; + } + + expectedKeyspacesGoingThroughRepair += 1; + } + // Since the splitter will unwrap a full token range, we expect twice as many repairs. + expectedRepairAssignments = expectedKeyspacesGoingThroughRepair * 2; + } + + @Before + public void setup() + { + SYSTEM_DISTRIBUTED_DEFAULT_RF.setInt(1); + QueryProcessor.executeInternal(String.format("CREATE KEYSPACE %s WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '1'}", KEYSPACE)); + QueryProcessor.executeInternal(String.format("CREATE TABLE %s.%s (k text, s text static, i int, v text, primary key(k,i))", KEYSPACE, TABLE)); + QueryProcessor.executeInternal(String.format("CREATE TABLE %s.%s (k text, s text static, i int, v text, primary key(k,i)) WITH auto_repair = {'full_enabled': 'false', 'incremental_enabled': 'false', 'preview_repaired_enabled': 'false', 'priority': '0'}", KEYSPACE, TABLE_DISABLED_AUTO_REPAIR)); + + QueryProcessor.executeInternal(String.format("CREATE MATERIALIZED VIEW %s.%s AS SELECT i, k from %s.%s " + + "WHERE k IS NOT null AND i IS NOT null PRIMARY KEY (i, k)", KEYSPACE, MV, KEYSPACE, TABLE)); + + DatabaseDescriptor.setCDCOnRepairEnabled(false); + DatabaseDescriptor.setMaterializedViewsOnRepairEnabled(false); + //noinspection resource + MockitoAnnotations.openMocks(this); + + Keyspace.open(KEYSPACE).getColumnFamilyStore(TABLE).truncateBlocking(); + Keyspace.open(KEYSPACE).getColumnFamilyStore(TABLE).disableAutoCompaction(); + + Keyspace.open(KEYSPACE).getColumnFamilyStore(MV).truncateBlocking(); + Keyspace.open(KEYSPACE).getColumnFamilyStore(MV).disableAutoCompaction(); + + Keyspace.open(SchemaConstants.DISTRIBUTED_KEYSPACE_NAME).getColumnFamilyStore(SystemDistributedKeyspace.AUTO_REPAIR_PRIORITY).truncateBlocking(); + Keyspace.open(SchemaConstants.DISTRIBUTED_KEYSPACE_NAME).getColumnFamilyStore(SystemDistributedKeyspace.AUTO_REPAIR_HISTORY).truncateBlocking(); + + AutoRepair.instance.isSetupDone = false; + AutoRepair.instance.setup(); + executeCQL(); + + timeFuncCalls = 0; + AutoRepair.timeFunc = System::currentTimeMillis; + AutoRepair.sleepFunc = (Long startTime, TimeUnit unit) -> {}; + resetCounters(); + resetConfig(); + + AutoRepair.shuffleFunc = java.util.Collections::shuffle; + + keyspace = Keyspace.open(KEYSPACE); + cfm = Keyspace.open(KEYSPACE).getColumnFamilyStore(TABLE).metadata(); + cfmDisabledAutoRepair = Keyspace.open(KEYSPACE).getColumnFamilyStore(TABLE_DISABLED_AUTO_REPAIR).metadata(); + DatabaseDescriptor.setCDCOnRepairEnabled(false); + } + + @After + public void tearDown() + { + System.clearProperty("cassandra.streaming.requires_view_build_during_repair"); + } + + private void resetCounters() + { + AutoRepairMetrics metrics = AutoRepairMetricsManager.getMetrics(repairType); + Metrics.removeMatching((name, metric) -> name.startsWith("repairTurn")); + metrics.repairTurnMyTurn = Metrics.counter(String.format("repairTurnMyTurn-%s", repairType)); + metrics.repairTurnMyTurnForceRepair = Metrics.counter(String.format("repairTurnMyTurnForceRepair-%s", repairType)); + metrics.repairTurnMyTurnDueToPriority = Metrics.counter(String.format("repairTurnMyTurnDueToPriority-%s", repairType)); + } + + private void resetConfig() + { + // prepare a fresh default config + AutoRepairConfig defaultConfig = new AutoRepairConfig(true); + for (AutoRepairConfig.RepairType repairType : AutoRepairConfig.RepairType.values()) + { + defaultConfig.setAutoRepairEnabled(repairType, true); + defaultConfig.setMaterializedViewRepairEnabled(repairType, false); + } + + // reset the AutoRepairService config to default + AutoRepairConfig config = AutoRepairService.instance.getAutoRepairConfig(); + config.repair_type_overrides = defaultConfig.repair_type_overrides; + config.global_settings = defaultConfig.global_settings; + config.history_clear_delete_hosts_buffer_interval = defaultConfig.history_clear_delete_hosts_buffer_interval; + config.repair_task_min_duration = new DurationSpec.LongSecondsBound("0s"); + } + + private void executeCQL() + { + QueryProcessor.executeInternal("INSERT INTO ks.tbl (k, s) VALUES ('k', 's')"); + QueryProcessor.executeInternal("SELECT s FROM ks.tbl WHERE k='k'"); + Keyspace.open(SchemaConstants.DISTRIBUTED_KEYSPACE_NAME) + .getColumnFamilyStore(SystemDistributedKeyspace.AUTO_REPAIR_PRIORITY) + .forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS); + } + + @Test + public void testRepairTurn() + { + UUID myId = StorageService.instance.getHostIdForEndpoint(FBUtilities.getBroadcastAddressAndPort()); + Assert.assertNotEquals("Expected my turn for the repair", NOT_MY_TURN, AutoRepairUtils.myTurnToRunRepair(repairType, myId)); + } + + @Test + public void testRepair() + { + AutoRepairService.instance.getAutoRepairConfig().setRepairMinInterval(repairType, "0s"); + AutoRepair.instance.repair(repairType); + assertEquals(0, AutoRepair.instance.repairStates.get(repairType).getTotalMVTablesConsideredForRepair()); + assertEquals(0, AutoRepairMetricsManager.getMetrics(repairType).totalMVTablesConsideredForRepair.getValue().intValue()); + long lastRepairTime = AutoRepair.instance.repairStates.get(repairType).getLastRepairTime(); + //if repair was done then lastRepairTime should be non-zero + Assert.assertTrue(String.format("Expected lastRepairTime > 0, actual value lastRepairTime %d", + lastRepairTime), lastRepairTime > 0); + // repair start lag sec should be reset on a successful repair + assertEquals(0, AutoRepairMetricsManager.getMetrics(repairType).repairStartLagSec.getValue().intValue()); + } + + @Test + public void testTooFrequentRepairs() + { + AutoRepairConfig config = AutoRepairService.instance.getAutoRepairConfig(); + //in the first round let repair run + config.setRepairMinInterval(repairType, "0s"); + AutoRepair.instance.repair(repairType); + long lastRepairTime1 = AutoRepair.instance.repairStates.get(repairType).getLastRepairTime(); + int consideredTables = AutoRepair.instance.repairStates.get(repairType).getTotalTablesConsideredForRepair(); + Assert.assertNotEquals(String.format("Expected total repaired tables > 0, actual value %s ", consideredTables), + 0, consideredTables); + + //if repair was done in last 24 hours then it should not trigger another repair + config.setRepairMinInterval(repairType, "24h"); + AutoRepair.instance.repair(repairType); + long lastRepairTime2 = AutoRepair.instance.repairStates.get(repairType).getLastRepairTime(); + Assert.assertEquals(String.format("Expected repair time to be same, actual value lastRepairTime1 %d, " + + "lastRepairTime2 %d", lastRepairTime1, lastRepairTime2), lastRepairTime1, lastRepairTime2); + assertEquals(0, AutoRepair.instance.repairStates.get(repairType).getTotalMVTablesConsideredForRepair()); + assertEquals(0, AutoRepairMetricsManager.getMetrics(repairType).totalMVTablesConsideredForRepair.getValue().intValue()); + } + + @Test + public void testNonFrequentRepairs() + { + Integer prevMetricsCount = AutoRepairMetricsManager.getMetrics(repairType).totalMVTablesConsideredForRepair.getValue(); + AutoRepairState state = AutoRepair.instance.repairStates.get(repairType); + long prevCount = state.getTotalMVTablesConsideredForRepair(); + AutoRepairService.instance.getAutoRepairConfig().setRepairMinInterval(repairType, "0s"); + AutoRepair.instance.repair(repairType); + long lastRepairTime1 = AutoRepair.instance.repairStates.get(repairType).getLastRepairTime(); + Assert.assertTrue(String.format("Expected lastRepairTime1 > 0, actual value lastRepairTime1 %d", + lastRepairTime1), lastRepairTime1 > 0); + UUID myId = StorageService.instance.getHostIdForEndpoint(FBUtilities.getBroadcastAddressAndPort()); + Assert.assertNotEquals("Expected my turn for the repair", + NOT_MY_TURN, AutoRepairUtils.myTurnToRunRepair(repairType, myId)); + AutoRepair.instance.repair(repairType); + long lastRepairTime2 = AutoRepair.instance.repairStates.get(repairType).getLastRepairTime(); + Assert.assertNotSame(String.format("Expected repair time to be same, actual value lastRepairTime1 %d, " + + "lastRepairTime2 %d", lastRepairTime1, lastRepairTime2), lastRepairTime1, lastRepairTime2); + assertEquals(prevCount, state.getTotalMVTablesConsideredForRepair()); + assertEquals(prevMetricsCount, AutoRepairMetricsManager.getMetrics(repairType).totalMVTablesConsideredForRepair.getValue()); + } + + @Test + public void testGetPriorityHosts() + { + Integer prevMetricsCount = AutoRepairMetricsManager.getMetrics(repairType).totalMVTablesConsideredForRepair.getValue(); + AutoRepairState state = AutoRepair.instance.repairStates.get(repairType); + long prevCount = state.getTotalMVTablesConsideredForRepair(); + AutoRepairService.instance.getAutoRepairConfig().setRepairMinInterval(repairType, "0s"); + Assert.assertEquals(String.format("Priority host count is not same, actual value %d, expected value %d", + AutoRepairUtils.getPriorityHosts(repairType).size(), 0), 0, AutoRepairUtils.getPriorityHosts(repairType).size()); + UUID myId = StorageService.instance.getHostIdForEndpoint(FBUtilities.getBroadcastAddressAndPort()); + Assert.assertNotEquals("Expected my turn for the repair", NOT_MY_TURN, AutoRepairUtils.myTurnToRunRepair(repairType, myId)); + AutoRepair.instance.repair(repairType); + AutoRepairUtils.addPriorityHosts(repairType, Sets.newHashSet(FBUtilities.getBroadcastAddressAndPort())); + AutoRepair.instance.repair(repairType); + Assert.assertEquals(String.format("Priority host count is not same actual value %d, expected value %d", + AutoRepairUtils.getPriorityHosts(repairType).size(), 0), 0, AutoRepairUtils.getPriorityHosts(repairType).size()); + assertEquals(prevCount, state.getTotalMVTablesConsideredForRepair()); + assertEquals(prevMetricsCount, AutoRepairMetricsManager.getMetrics(repairType).totalMVTablesConsideredForRepair.getValue()); + } + + @Test + public void testCheckAutoRepairStartStop() throws Throwable + { + Integer prevMetricsCount = AutoRepairMetricsManager.getMetrics(repairType).totalMVTablesConsideredForRepair.getValue(); + AutoRepairState state = AutoRepair.instance.repairStates.get(repairType); + AutoRepairConfig config = AutoRepairService.instance.getAutoRepairConfig(); + long prevCount = state.getTotalMVTablesConsideredForRepair(); + config.setRepairMinInterval(repairType, "0s"); + config.setAutoRepairEnabled(repairType, false); + long lastRepairTime1 = AutoRepair.instance.repairStates.get(repairType).getLastRepairTime(); + AutoRepair.instance.repair(repairType); + long lastRepairTime2 = AutoRepair.instance.repairStates.get(repairType).getLastRepairTime(); + //Since repair has not happened, both the last repair times should be same + Assert.assertEquals(String.format("Expected lastRepairTime1 %d, and lastRepairTime2 %d to be same", + lastRepairTime1, lastRepairTime2), lastRepairTime1, lastRepairTime2); + + config.setAutoRepairEnabled(repairType, true); + AutoRepair.instance.repair(repairType); + //since repair is done now, so lastRepairTime1/lastRepairTime2 and lastRepairTime3 should not be same + long lastRepairTime3 = AutoRepair.instance.repairStates.get(repairType).getLastRepairTime(); + Assert.assertNotSame(String.format("Expected lastRepairTime1 %d, and lastRepairTime3 %d to be not same", + lastRepairTime1, lastRepairTime2), lastRepairTime1, lastRepairTime3); + assertEquals(prevCount, state.getTotalMVTablesConsideredForRepair()); + assertEquals(prevMetricsCount, AutoRepairMetricsManager.getMetrics(repairType).totalMVTablesConsideredForRepair.getValue()); + } + + @Test + public void testRepairPrimaryRangesByDefault() + { + Assert.assertTrue("Expected primary range repair only", + AutoRepairService.instance.getAutoRepairConfig().getRepairPrimaryTokenRangeOnly(repairType)); + } + + @Test + public void testGetAllMVs() + { + AutoRepairConfig config = AutoRepairService.instance.getAutoRepairConfig(); + config.setMaterializedViewRepairEnabled(repairType, false); + assertFalse(config.getMaterializedViewRepairEnabled(repairType)); + assertEquals(0, AutoRepairUtils.getAllMVs(repairType, keyspace, cfm).size()); + + config.setMaterializedViewRepairEnabled(repairType, true); + + assertTrue(config.getMaterializedViewRepairEnabled(repairType)); + assertEquals(Collections.singletonList(MV), AutoRepairUtils.getAllMVs(repairType, keyspace, cfm)); + config.setMaterializedViewRepairEnabled(repairType, false); + } + + + @Test + public void testMVRepair() + { + AutoRepairConfig config = AutoRepairService.instance.getAutoRepairConfig(); + config.setMaterializedViewRepairEnabled(repairType, true); + config.setRepairMinInterval(repairType, "0s"); + AutoRepair.instance.repairStates.get(repairType).setLastRepairTime(System.currentTimeMillis()); + AutoRepair.instance.repair(repairType); + assertEquals(1, AutoRepair.instance.repairStates.get(repairType).getTotalMVTablesConsideredForRepair()); + assertEquals(1, AutoRepairMetricsManager.getMetrics(repairType).totalMVTablesConsideredForRepair.getValue().intValue()); + + config.setMaterializedViewRepairEnabled(repairType, false); + AutoRepair.instance.repairStates.get(repairType).setLastRepairTime(System.currentTimeMillis()); + AutoRepair.instance.repair(repairType); + assertEquals(0, AutoRepair.instance.repairStates.get(repairType).getTotalMVTablesConsideredForRepair()); + assertEquals(0, AutoRepairMetricsManager.getMetrics(repairType).totalMVTablesConsideredForRepair.getValue().intValue()); + + config.setMaterializedViewRepairEnabled(repairType, true); + AutoRepair.instance.repairStates.get(repairType).setLastRepairTime(System.currentTimeMillis()); + AutoRepair.instance.repair(repairType); + assertEquals(1, AutoRepair.instance.repairStates.get(repairType).getTotalMVTablesConsideredForRepair()); + assertEquals(1, AutoRepairMetricsManager.getMetrics(repairType).totalMVTablesConsideredForRepair.getValue().intValue()); + } + + @Test + public void testSkipRepairSSTableCountHigherThreshold() + { + AutoRepairConfig config = AutoRepairService.instance.getAutoRepairConfig(); + AutoRepairState state = AutoRepair.instance.repairStates.get(repairType); + ColumnFamilyStore cfsBaseTable = Keyspace.open(KEYSPACE).getColumnFamilyStore(TABLE); + ColumnFamilyStore cfsMVTable = Keyspace.open(KEYSPACE).getColumnFamilyStore(MV); + Set preBaseTable = cfsBaseTable.getLiveSSTables(); + Set preMVTable = cfsBaseTable.getLiveSSTables(); + config.setRepairMinInterval(repairType, "0s"); + + for (int i = 0; i < 10; i++) + { + QueryProcessor.executeInternal(String.format("INSERT INTO %s.%s (k, i, v) VALUES('k1', %d, 'v1')", KEYSPACE, TABLE, i)); + cfsBaseTable.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS); + cfsMVTable.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS); + } + + Set postBaseTable = cfsBaseTable.getLiveSSTables(); + Set diffBaseTable = new HashSet<>(postBaseTable); + diffBaseTable.removeAll(preBaseTable); + assert diffBaseTable.size() == 10; + + Set postMVTable = cfsBaseTable.getLiveSSTables(); + Set diffMVTable = new HashSet<>(postMVTable); + diffMVTable.removeAll(preMVTable); + assert diffMVTable.size() == 10; + + int beforeCount = config.getRepairSSTableCountHigherThreshold(repairType); + config.setMaterializedViewRepairEnabled(repairType, true); + config.setRepairSSTableCountHigherThreshold(repairType, 9); + assertEquals(0, state.getSkippedTokenRangesCount()); + assertEquals(0, AutoRepairMetricsManager.getMetrics(repairType).skippedTokenRangesCount.getValue().intValue()); + state.setLastRepairTime(0); + AutoRepair.instance.repair(repairType); + assertEquals(0, state.getTotalMVTablesConsideredForRepair()); + assertEquals(0, AutoRepairMetricsManager.getMetrics(repairType).totalMVTablesConsideredForRepair.getValue().intValue()); + // skipping both the tables - one table is due to its repair has been disabled, and another one due to high sstable count + assertEquals(0, state.getSkippedTokenRangesCount()); + assertEquals(0, AutoRepairMetricsManager.getMetrics(repairType).skippedTokenRangesCount.getValue().intValue()); + assertEquals(2, state.getSkippedTablesCount()); + assertEquals(2, AutoRepairMetricsManager.getMetrics(repairType).skippedTablesCount.getValue().intValue()); + + // set it to higher value, and this time, the tables should not be skipped + config.setRepairSSTableCountHigherThreshold(repairType, beforeCount); + state.setLastRepairTime(0); + state.setSkippedTablesCount(0); + state.setTotalMVTablesConsideredForRepair(0); + AutoRepair.instance.repair(repairType); + assertEquals(1, state.getTotalMVTablesConsideredForRepair()); + assertEquals(1, AutoRepairMetricsManager.getMetrics(repairType).totalMVTablesConsideredForRepair.getValue().intValue()); + assertEquals(0, state.getSkippedTokenRangesCount()); + assertEquals(0, AutoRepairMetricsManager.getMetrics(repairType).skippedTokenRangesCount.getValue().intValue()); + assertEquals(1, state.getSkippedTablesCount()); + assertEquals(1, AutoRepairMetricsManager.getMetrics(repairType).skippedTablesCount.getValue().intValue()); + } + + @Test + public void testGetRepairState() + { + assertEquals(0, AutoRepair.instance.repairStates.get(repairType).getRepairKeyspaceCount()); + + AutoRepairState state = AutoRepair.instance.getRepairState(repairType); + state.setRepairKeyspaceCount(100); + + assertEquals(100L, AutoRepair.instance.getRepairState(repairType).getRepairKeyspaceCount()); + } + + @Test + public void testMetrics() + { + AutoRepairConfig config = AutoRepairService.instance.getAutoRepairConfig(); + config.setMaterializedViewRepairEnabled(repairType, true); + config.setRepairMinInterval(repairType, "0s"); + config.setRepairRetryBackoff(repairType, "0s"); + config.setAutoRepairTableMaxRepairTime(repairType, "0s"); + AutoRepair.timeFunc = () -> { + timeFuncCalls++; + return timeFuncCalls * 1000L; + }; + AutoRepair.instance.repairStates.get(repairType).setLastRepairTime(1000L); + + AutoRepair.instance.repair(repairType); + + assertEquals(1, AutoRepairMetricsManager.getMetrics(repairType).totalMVTablesConsideredForRepair.getValue().intValue()); + assertTrue(AutoRepairMetricsManager.getMetrics(repairType).nodeRepairTimeInSec.getValue() > 0); + assertTrue(AutoRepairMetricsManager.getMetrics(repairType).clusterRepairTimeInSec.getValue() > 0); + assertEquals(1, AutoRepairMetricsManager.getMetrics(repairType).repairTurnMyTurn.getCount()); + assertTrue(AutoRepairMetricsManager.getMetrics(repairType).skippedTokenRangesCount.getValue() > 0); + assertEquals(0, AutoRepairMetricsManager.getMetrics(repairType).longestUnrepairedSec.getValue().intValue()); + + config.setAutoRepairTableMaxRepairTime(repairType, String.valueOf(Integer.MAX_VALUE-1) + 's'); + AutoRepair.instance.repairStates.put(repairType, autoRepairState); + when(autoRepairState.getRepairRunnable(Mockito.any(), Mockito.any(), Mockito.any(), anyBoolean())) + .thenReturn(repairRunnable); + doAnswer(invocation -> { + invocation.getArgument(0, ProgressListener.class).progress("test", new ProgressEvent(ProgressEventType.COMPLETE, 0, 0)); + return null; + }).when(repairRunnable).addProgressListener(Mockito.any()); + when(autoRepairState.getFailedTokenRangesCount()).thenReturn(10); + when(autoRepairState.getSucceededTokenRangesCount()).thenReturn(11); + when(autoRepairState.getLongestUnrepairedSec()).thenReturn(10); + + AutoRepair.instance.repair(repairType); + assertEquals(0, AutoRepairMetricsManager.getMetrics(repairType).skippedTokenRangesCount.getValue().intValue()); + assertTrue(AutoRepairMetricsManager.getMetrics(repairType).failedTokenRangesCount.getValue() > 0); + assertTrue(AutoRepairMetricsManager.getMetrics(repairType).succeededTokenRangesCount.getValue() > 0); + assertTrue(AutoRepairMetricsManager.getMetrics(repairType).longestUnrepairedSec.getValue() > 0); + } + + @Test + public void testRepairWaitsForRepairToFinishBeforeSchedullingNewSession() + { + AutoRepairConfig config = AutoRepairService.instance.getAutoRepairConfig(); + AutoRepair.instance.repairStates.put(repairType, autoRepairState); + when(autoRepairState.getLastRepairTime()).thenReturn((long) 0); + AtomicInteger getRepairRunnableCalls = new AtomicInteger(); + AtomicReference prevListener = new AtomicReference<>(); + doAnswer(invocation -> { + if (getRepairRunnableCalls.getAndIncrement() > 0) + { + // progress listener from previous repair should be signalled before starting new repair + assertTrue(prevListener.get().condition.isSignalled()); + } + getRepairRunnableCalls.incrementAndGet(); + return repairRunnable; + }).when(autoRepairState).getRepairRunnable(Mockito.any(), Mockito.any(), Mockito.any(), anyBoolean()); + doAnswer(invocation -> { + // sending out a COMPLETE event with a 10ms delay + Executors.newScheduledThreadPool(1).schedule(() -> { + invocation.getArgument(0, AutoRepair.RepairProgressListener.class).progress("test", new ProgressEvent(ProgressEventType.COMPLETE, 0, 0)); + }, 10, TimeUnit.MILLISECONDS); + return null; + }).when(repairRunnable).addProgressListener(Mockito.any()); + + AutoRepair.instance.repair(repairType); + AutoRepair.instance.repair(repairType); + AutoRepair.instance.repair(repairType); + } + + @Test + public void testDisabledAutoRepairForATableThroughTableLevelConfiguration() + { + Assert.assertTrue(cfm.params.autoRepair.repairEnabled(AutoRepairConfig.RepairType.FULL)); + Assert.assertTrue(cfm.params.autoRepair.repairEnabled(AutoRepairConfig.RepairType.INCREMENTAL)); + Assert.assertFalse(cfmDisabledAutoRepair.params.autoRepair.repairEnabled(AutoRepairConfig.RepairType.FULL)); + Assert.assertFalse(cfmDisabledAutoRepair.params.autoRepair.repairEnabled(AutoRepairConfig.RepairType.INCREMENTAL)); + + AutoRepairConfig config = AutoRepairService.instance.getAutoRepairConfig(); + config.setRepairMinInterval(repairType, "0s"); + int disabledTablesRepairCountBefore = AutoRepair.instance.repairStates.get(repairType).getTotalDisabledTablesRepairCount(); + AutoRepair.instance.repair(repairType); + int consideredTables = AutoRepair.instance.repairStates.get(repairType).getTotalTablesConsideredForRepair(); + Assert.assertNotSame(String.format("Expected total repaired tables > 0, actual value %s ", consideredTables), + 0, consideredTables); + int disabledTablesRepairCountAfter = AutoRepair.instance.repairStates.get(repairType).getTotalDisabledTablesRepairCount(); + Assert.assertTrue(String.format("A table %s should be skipped from auto repair, expected value: %d, actual value %d ", TABLE_DISABLED_AUTO_REPAIR, disabledTablesRepairCountBefore + 1, disabledTablesRepairCountAfter), + disabledTablesRepairCountBefore < disabledTablesRepairCountAfter); + } + + @Test + public void testTableAttribute() + { + assertTrue(TableAttributes.validKeywords().contains("auto_repair")); + } + + @Test + public void testDefaultAutomatedRepair() + { + for (AutoRepairConfig.RepairType repairType : AutoRepairConfig.RepairType.values()) + { + Assert.assertTrue(String.format("expected repair type %s to be enabled on table %s", repairType, cfm.name), + cfm.params.autoRepair.repairEnabled(repairType)); + Assert.assertFalse(String.format("expected repair type %s to be disabled on table %s", repairType, cfmDisabledAutoRepair.name), + cfmDisabledAutoRepair.params.autoRepair.repairEnabled(repairType)); + } + } + + @Test + public void testRepairShufflesKeyspacesAndTables() + { + AtomicInteger shuffleKeyspacesCall = new AtomicInteger(); + AtomicInteger shuffleTablesCall = new AtomicInteger(); + AtomicInteger keyspaceCount = new AtomicInteger(); + AutoRepair.shuffleFunc = (List list) -> { + // check whether was invoked for keyspaces or tables + if (list.contains(KEYSPACE)) + { + shuffleKeyspacesCall.getAndIncrement(); + keyspaceCount.set(list.size()); + } + else + // presume list not containing a keyspace is for tables. + shuffleTablesCall.getAndIncrement(); + }; + + AutoRepairConfig config = AutoRepairService.instance.getAutoRepairConfig(); + config.setRepairMinInterval(repairType, "0s"); + AutoRepair.instance.repair(repairType); + + // Expect a single invocation for keyspaces + assertEquals(1, shuffleKeyspacesCall.get()); + // Expect an invocation for tables for each keyspace + assertNotEquals(0, keyspaceCount.get()); + assertEquals(keyspaceCount.get(), shuffleTablesCall.get()); + } + + @Test + public void testRepairTakesLastRepairTimeFromDB() + { + AutoRepairConfig config = AutoRepairService.instance.getAutoRepairConfig(); + config.setMaterializedViewRepairEnabled(repairType, true); + long lastRepairTime = System.currentTimeMillis() - 1000; + AutoRepairUtils.insertNewRepairHistory(repairType, 0, lastRepairTime); + AutoRepair.instance.repairStates.get(repairType).setLastRepairTime(0); + config.setRepairMinInterval(repairType, "1h"); + + AutoRepair.instance.repair(repairType); + + // repair scheduler should not attempt to run repair as last repair time in DB is current time - 1s + assertEquals(0, AutoRepair.instance.repairStates.get(repairType).getTotalTablesConsideredForRepair()); + // repair scheduler should load the repair time from the DB + assertEquals(lastRepairTime, AutoRepair.instance.repairStates.get(repairType).getLastRepairTime()); + } + + @Test + public void testRepairMaxRetries() + { + when(autoRepairState.getRepairRunnable(Mockito.any(), Mockito.any(), Mockito.any(), anyBoolean())).thenReturn(repairRunnable); + doAnswer(invocation -> { + invocation.getArgument(0, ProgressListener.class).progress("test", new ProgressEvent(ProgressEventType.ERROR, 0, 0)); + return null; + }).when(repairRunnable).addProgressListener(Mockito.any()); + AutoRepairConfig config = AutoRepairService.instance.getAutoRepairConfig(); + AtomicInteger sleepCalls = new AtomicInteger(); + AutoRepair.sleepFunc = (Long duration, TimeUnit unit) -> { + sleepCalls.getAndIncrement(); + assertEquals(TimeUnit.SECONDS, unit); + assertEquals(config.getRepairRetryBackoff(repairType).toSeconds(), (long) duration); + }; + config.setRepairMinInterval(repairType, "0s"); + AutoRepair.instance.repairStates.put(repairType, autoRepairState); + + AutoRepair.instance.repair(repairType); + + // Expect configured retries for each keyspace expected to be repaired + assertEquals(config.getRepairMaxRetries(repairType)*expectedRepairAssignments, sleepCalls.get()); + verify(autoRepairState, times(1)).setSucceededTokenRangesCount(0); + verify(autoRepairState, times(1)).setSkippedTokenRangesCount(0); + verify(autoRepairState, times(1)).setFailedTokenRangesCount(expectedRepairAssignments); + } + + @Test + public void testRepairSuccessAfterRetry() + { + when(autoRepairState.getRepairRunnable(Mockito.any(), Mockito.any(), Mockito.any(), anyBoolean())).thenReturn(repairRunnable); + + AutoRepairConfig config = AutoRepairService.instance.getAutoRepairConfig(); + AtomicInteger sleepCalls = new AtomicInteger(); + AutoRepair.sleepFunc = (Long duration, TimeUnit unit) -> { + sleepCalls.getAndIncrement(); + assertEquals(TimeUnit.SECONDS, unit); + assertEquals(config.getRepairRetryBackoff(repairType).toSeconds(), (long) duration); + }; + doAnswer(invocation -> { + if (sleepCalls.get() == 0) + { + invocation.getArgument(0, ProgressListener.class).progress("test", new ProgressEvent(ProgressEventType.ERROR, 0, 0)); + } + else + { + invocation.getArgument(0, ProgressListener.class).progress("test", new ProgressEvent(ProgressEventType.COMPLETE, 0, 0)); + } + + return null; + }).when(repairRunnable).addProgressListener(Mockito.any()); + config.setRepairMinInterval(repairType, "0s"); + config.setRepairMaxRetries(repairType, 1); + AutoRepair.instance.repairStates.put(repairType, autoRepairState); + AutoRepair.instance.repair(repairType); + + assertEquals(1, sleepCalls.get()); + verify(autoRepairState, times(1)).setSucceededTokenRangesCount(expectedRepairAssignments); + verify(autoRepairState, times(1)).setSkippedTokenRangesCount(0); + verify(autoRepairState, times(1)).setFailedTokenRangesCount(0); + } + + @Test + public void testRepairDoesNotThrowsForIRWithMVReplayButMVRepairDisabled() + { + AutoRepair.instance.setup(); + DatabaseDescriptor.setMaterializedViewsOnRepairEnabled(true); + AutoRepairService.instance.getAutoRepairConfig().setMaterializedViewRepairEnabled(repairType, false); + + if (repairType == AutoRepairConfig.RepairType.INCREMENTAL) + { + try + { + AutoRepair.instance.repair(repairType); + } + catch (ConfigurationException ignored) + { + fail("ConfigurationException not expected"); + } + } + else + { + AutoRepair.instance.repair(repairType); + } + } + + @Test + public void testRepairThrowsForIRWithMVReplay() + { + AutoRepair.instance.setup(); + DatabaseDescriptor.setMaterializedViewsOnRepairEnabled(true); + AutoRepairService.instance.getAutoRepairConfig().setMaterializedViewRepairEnabled(repairType, true); + + if (repairType == AutoRepairConfig.RepairType.INCREMENTAL) + { + try + { + AutoRepair.instance.repair(repairType); + fail("Expected ConfigurationException"); + } + catch (ConfigurationException ignored) + { + } + } + else + { + AutoRepair.instance.repair(repairType); + } + } + + @Test + public void testRepairThrowsForIRWithCDCReplay() + { + AutoRepair.instance.setup(); + DatabaseDescriptor.setCDCEnabled(true); + DatabaseDescriptor.setCDCOnRepairEnabled(true); + + if (repairType == AutoRepairConfig.RepairType.INCREMENTAL) + { + try + { + AutoRepair.instance.repair(repairType); + fail("Expected ConfigurationException"); + } + catch (ConfigurationException ignored) + { + } + } + else + { + AutoRepair.instance.repair(repairType); + } + } + + @Test + public void testSoakAfterImmediateRepair() + { + when(autoRepairState.getRepairRunnable(Mockito.any(), Mockito.any(), Mockito.any(), anyBoolean())).thenReturn(repairRunnable); + doAnswer(invocation -> { + invocation.getArgument(0, ProgressListener.class).progress("test", new ProgressEvent(ProgressEventType.COMPLETE, 0, 0)); + return null; + }).when(repairRunnable).addProgressListener(Mockito.any()); + AutoRepairConfig config = AutoRepairService.instance.getAutoRepairConfig(); + config.repair_task_min_duration = new DurationSpec.LongSecondsBound("10s"); + AtomicInteger sleepCalls = new AtomicInteger(); + AutoRepair.sleepFunc = (Long duration, TimeUnit unit) -> { + sleepCalls.getAndIncrement(); + assertEquals(TimeUnit.MILLISECONDS, unit); + assertTrue(config.getRepairTaskMinDuration().toMilliseconds() >= duration); + config.repair_task_min_duration = new DurationSpec.LongSecondsBound("0s"); + }; + config.setRepairMinInterval(repairType, "0s"); + AutoRepair.instance.repairStates.put(repairType, autoRepairState); + + AutoRepair.instance.repair(repairType); + + assertEquals(1, sleepCalls.get()); + verify(autoRepairState, times(1)).setSucceededTokenRangesCount(expectedRepairAssignments); + verify(autoRepairState, times(1)).setSkippedTokenRangesCount(0); + verify(autoRepairState, times(1)).setFailedTokenRangesCount(0); + } + + @Test + public void testNoSoakAfterRepair() + { + when(autoRepairState.getRepairRunnable(Mockito.any(), Mockito.any(), Mockito.any(), anyBoolean())).thenReturn(repairRunnable); + doAnswer(invocation -> { + invocation.getArgument(0, ProgressListener.class).progress("test", new ProgressEvent(ProgressEventType.COMPLETE, 0, 0)); + return null; + }).when(repairRunnable).addProgressListener(Mockito.any()); + AutoRepairConfig config = AutoRepairService.instance.getAutoRepairConfig(); + config.repair_task_min_duration = new DurationSpec.LongSecondsBound("0s"); + AutoRepair.sleepFunc = (Long duration, TimeUnit unit) -> { + fail("Should not sleep after repair"); + }; + config.setRepairMinInterval(repairType, "0s"); + AutoRepair.instance.repairStates.put(repairType, autoRepairState); + + AutoRepair.instance.repair(repairType); + + verify(autoRepairState, times(1)).setSucceededTokenRangesCount(expectedRepairAssignments); + verify(autoRepairState, times(1)).setSkippedTokenRangesCount(0); + verify(autoRepairState, times(1)).setFailedTokenRangesCount(0); + } + + @Test + public void testSchedulerIgnoresErrorsFromUnrelatedRepairRunables() + { + RepairOption options = new RepairOption(RepairParallelism.PARALLEL, true, repairType == AutoRepairConfig.RepairType.INCREMENTAL, false, + AutoRepairService.instance.getAutoRepairConfig().getRepairThreads(repairType), Collections.emptySet(), + false, false, PreviewKind.NONE, false, true, true, false, false, false); + AutoRepairState repairState = AutoRepair.instance.repairStates.get(repairType); + AutoRepairState spyState = spy(repairState); + AtomicReference failingListener = new AtomicReference<>(); + AtomicInteger repairRunableCalls = new AtomicInteger(); + doAnswer((InvocationOnMock inv ) -> { + RepairCoordinator runnable = spy(repairState.getRepairRunnable(inv.getArgument(0), inv.getArgument(1), inv.getArgument(2), + inv.getArgument(3))); + if (repairRunableCalls.getAndIncrement() == 0) + { + // this will be used for first repair job + doAnswer(invocation -> { + // repair runnable for the first repair job will immediately fail + failingListener.set(invocation.getArgument(0, AutoRepair.RepairProgressListener.class)); + invocation.getArgument(0, ProgressListener.class).progress("test", new ProgressEvent(ProgressEventType.ERROR, 0, 0)); + return null; + }).when(runnable).addProgressListener(Mockito.any()); + } + else + { + // this will be used for subsequent repair jobs + doAnswer(invocation -> { + if (repairRunableCalls.get() > 0) + { + // repair runnable for the subsequent repair jobs will immediately complete + invocation.getArgument(0, ProgressListener.class).progress("test", new ProgressEvent(ProgressEventType.COMPLETE, 0, 0)); + + } + // repair runnable for the first repair job will continue firing ERROR events + failingListener.get().progress("test", new ProgressEvent(ProgressEventType.ERROR, 0, 0)); + return null; + }).when(runnable).addProgressListener(Mockito.any()); + } + return runnable; + }).when(spyState).getRepairRunnable(Mockito.any(), Mockito.any(), Mockito.any(), anyBoolean()); + when(spyState.getLastRepairTime()).thenReturn((long) 0); + AutoRepairService.instance.getAutoRepairConfig().setRepairMaxRetries(repairType, 0); + AutoRepair.instance.repairStates.put(repairType, spyState); + + AutoRepair.instance.repair(repairType); + + assertEquals(1, (int) AutoRepairMetricsManager.getMetrics(repairType).failedTokenRangesCount.getValue()); + // only the first repair job should have failed despite it continuously firing ERROR events + verify(spyState, times(1)).setFailedTokenRangesCount(1); + } + + @Test + public void testProgressError() + { + AutoRepair.RepairProgressListener listener = new AutoRepair.RepairProgressListener(repairType); + + listener.progress("test", new ProgressEvent(ProgressEventType.ERROR, 0, 0, "test")); + + assertFalse(listener.success); + assertTrue(listener.condition.isSignalled()); + } + + @Test + public void testProgressProgress() + { + AutoRepair.RepairProgressListener listener = new AutoRepair.RepairProgressListener(repairType); + + listener.progress("test", new ProgressEvent(ProgressEventType.PROGRESS, 0, 0, "test")); + + assertFalse(listener.success); + assertFalse(listener.condition.isSignalled()); + } + + @Test + public void testProgresComplete() + { + AutoRepair.RepairProgressListener listener = new AutoRepair.RepairProgressListener(repairType); + + listener.progress("test", new ProgressEvent(ProgressEventType.COMPLETE, 0, 0, "test")); + + assertTrue(listener.success); + assertTrue(listener.condition.isSignalled()); + } + + @Test + public void testAwait() throws Exception + { + AutoRepair.RepairProgressListener listener = new AutoRepair.RepairProgressListener(repairType); + listener.progress("test", new ProgressEvent(ProgressEventType.COMPLETE, 0, 0, "test")); + + listener.await(new DurationSpec.IntSecondsBound("12h")); + } +} diff --git a/test/unit/org/apache/cassandra/repair/autorepair/AutoRepairShutdownTest.java b/test/unit/org/apache/cassandra/repair/autorepair/AutoRepairShutdownTest.java new file mode 100644 index 000000000000..c7de17e0e455 --- /dev/null +++ b/test/unit/org/apache/cassandra/repair/autorepair/AutoRepairShutdownTest.java @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.repair.autorepair; + +import org.junit.BeforeClass; +import org.junit.Test; + +import org.apache.cassandra.cql3.CQLTester; +import org.apache.cassandra.repair.autorepair.AutoRepairConfig.RepairType; + +import static org.apache.cassandra.Util.setAutoRepairEnabled; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + +/** + * Unit tests to validate the executor shutdown inside {@link AutoRepair} + */ +public class AutoRepairShutdownTest extends CQLTester +{ + @BeforeClass + public static void setupClass() throws Exception + { + setAutoRepairEnabled(true); + requireNetwork(); + } + + @Test + public void testAutoRepairShutdown() throws Exception + { + AutoRepair.instance.setup(); + + for (RepairType type : RepairType.values()) + { + assertFalse("RepairRunnableExecutor should not have been shut down", AutoRepair.instance.getRepairRunnableExecutors().get(type).isShutdown()); + assertFalse("RepairExecutor should not have been shut down", AutoRepair.instance.getRepairExecutors().get(type).isShutdown()); + } + assertFalse("AutoRepair should not be marked as shut down", AutoRepair.instance.isShutDown); + + AutoRepair.instance.shutdownBlocking(); + + for (RepairType type : RepairType.values()) + { + assertTrue("RepairRunnableExecutor should be shut down", AutoRepair.instance.getRepairRunnableExecutors().get(type).isShutdown()); + assertTrue("RepairExecutor should be shut down", AutoRepair.instance.getRepairExecutors().get(type).isShutdown()); + } + assertTrue("AutoRepair should be marked as shut down", AutoRepair.instance.isShutDown); + + try + { + AutoRepair.instance.shutdownBlocking(); + fail("A second call to shutdown should have thrown an exception"); + } + catch (IllegalStateException e) + { + // expected + } + } +} diff --git a/test/unit/org/apache/cassandra/repair/autorepair/AutoRepairStateFactoryTest.java b/test/unit/org/apache/cassandra/repair/autorepair/AutoRepairStateFactoryTest.java new file mode 100644 index 000000000000..97e80364eed7 --- /dev/null +++ b/test/unit/org/apache/cassandra/repair/autorepair/AutoRepairStateFactoryTest.java @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.repair.autorepair; + +import org.junit.Test; + +import org.apache.cassandra.repair.autorepair.AutoRepairConfig.RepairType; + +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertTrue; + +/** + * Unit tests for {@link org.apache.cassandra.repair.autorepair.AutoRepairConfig.RepairType} + */ +public class AutoRepairStateFactoryTest +{ + @Test + public void testGetRepairState() + { + AutoRepairState state = RepairType.getAutoRepairState(RepairType.FULL); + + assertTrue(state instanceof FullRepairState); + + state = RepairType.getAutoRepairState(RepairType.INCREMENTAL); + + assertTrue(state instanceof IncrementalRepairState); + + state = RepairType.getAutoRepairState(RepairType.PREVIEW_REPAIRED); + + assertTrue(state instanceof PreviewRepairedState); + } + + @Test + public void testGetRepairStateSupportsAllRepairTypes() + { + for (RepairType repairType : RepairType.values()) + { + try + { + AutoRepairState state = RepairType.getAutoRepairState(repairType); + assertNotNull(state); + } catch (IllegalArgumentException e) + { + assertNull(e); + } + } + } +} diff --git a/test/unit/org/apache/cassandra/repair/autorepair/AutoRepairStateTest.java b/test/unit/org/apache/cassandra/repair/autorepair/AutoRepairStateTest.java new file mode 100644 index 000000000000..422ebdff5192 --- /dev/null +++ b/test/unit/org/apache/cassandra/repair/autorepair/AutoRepairStateTest.java @@ -0,0 +1,319 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.repair.autorepair; + +import java.util.Arrays; +import java.util.Collection; +import java.util.UUID; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableSet; +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +import org.apache.cassandra.config.DurationSpec; +import org.apache.cassandra.cql3.CQLTester; +import org.apache.cassandra.repair.autorepair.AutoRepairConfig.RepairType; +import org.apache.cassandra.repair.autorepair.AutoRepairUtils.AutoRepairHistory; +import org.apache.cassandra.service.AutoRepairService; +import org.apache.cassandra.utils.progress.ProgressEvent; +import org.mockito.Mock; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertTrue; +import static org.mockito.MockitoAnnotations.initMocks; + +/** + * Unit tests for {@link org.apache.cassandra.repair.autorepair.AutoRepairState} + */ +@RunWith(Parameterized.class) +public class AutoRepairStateTest extends CQLTester +{ + private static final String testTable = "test"; + + @Parameterized.Parameter + public RepairType repairType; + + @Mock + ProgressEvent progressEvent; + + @Parameterized.Parameters + public static Collection repairTypes() + { + return Arrays.asList(RepairType.values()); + } + + @Before + public void setUp() + { + AutoRepair.SLEEP_IF_REPAIR_FINISHES_QUICKLY = new DurationSpec.IntSecondsBound("0s"); + initMocks(this); + createTable(String.format("CREATE TABLE IF NOT EXISTS %s.%s (pk int PRIMARY KEY, v int)", KEYSPACE, testTable)); + } + + @Test + public void testGetRepairRunnable() + { + AutoRepairState state = RepairType.getAutoRepairState(repairType); + AutoRepairService.setup(); + + Runnable runnable = state.getRepairRunnable(KEYSPACE, ImmutableList.of(testTable), ImmutableSet.of(), false); + + assertNotNull(runnable); + } + + @Test + public void testGetLastRepairTime() + { + AutoRepairState state = RepairType.getAutoRepairState(repairType); + state.lastRepairTimeInMs = 1; + + assertEquals(1, state.getLastRepairTime()); + } + + @Test + public void testSetTotalTablesConsideredForRepair() + { + AutoRepairState state = RepairType.getAutoRepairState(repairType); + + state.setTotalTablesConsideredForRepair(1); + + assertEquals(1, state.totalTablesConsideredForRepair); + } + + @Test + public void testGetTotalTablesConsideredForRepair() + { + AutoRepairState state = RepairType.getAutoRepairState(repairType); + state.totalTablesConsideredForRepair = 1; + + assertEquals(1, state.getTotalTablesConsideredForRepair()); + } + + @Test + public void testSetLastRepairTimeInMs() + { + AutoRepairState state = RepairType.getAutoRepairState(repairType); + + state.setLastRepairTime(1); + + assertEquals(1, state.lastRepairTimeInMs); + } + + @Test + public void testGetClusterRepairTimeInSec() + { + AutoRepairState state = RepairType.getAutoRepairState(repairType); + state.clusterRepairTimeInSec = 1; + + assertEquals(1, state.getClusterRepairTimeInSec()); + } + + @Test + public void testGetNodeRepairTimeInSec() + { + AutoRepairState state = RepairType.getAutoRepairState(repairType); + state.nodeRepairTimeInSec = 1; + + assertEquals(1, state.getNodeRepairTimeInSec()); + } + + @Test + public void testSetRepairInProgress() + { + AutoRepairState state = RepairType.getAutoRepairState(repairType); + + state.setRepairInProgress(true); + + assertTrue(state.repairInProgress); + } + + @Test + public void testIsRepairInProgress() + { + AutoRepairState state = RepairType.getAutoRepairState(repairType); + state.repairInProgress = true; + + assertTrue(state.isRepairInProgress()); + } + + @Test + public void testSetSkippedTokenRangesCount() + { + AutoRepairState state = RepairType.getAutoRepairState(repairType); + + state.setSkippedTokenRangesCount(1); + + assertEquals(1, state.skippedTokenRangesCount); + } + + @Test + public void testGetSkippedTokenRangesCount() + { + AutoRepairState state = RepairType.getAutoRepairState(repairType); + state.skippedTokenRangesCount = 1; + + assertEquals(1, state.getSkippedTokenRangesCount()); + } + + @Test + public void testGetLongestUnrepairedSecNull() + { + AutoRepairState state = RepairType.getAutoRepairState(repairType); + state.longestUnrepairedNode = null; + + try + { + assertEquals(0, state.getLongestUnrepairedSec()); + } + catch (Exception e) + { + assertNull(e); + } + } + + @Test + public void testGetLongestUnrepairedSec() + { + AutoRepairState state = RepairType.getAutoRepairState(repairType); + state.longestUnrepairedNode = new AutoRepairHistory(UUID.randomUUID(), "", 0, 1000, + null, 0, false); + AutoRepairState.timeFunc = () -> 2000L; + + try + { + assertEquals(1, state.getLongestUnrepairedSec()); + } + catch (Exception e) + { + assertNull(e); + } + } + + @Test + public void testSetTotalMVTablesConsideredForRepair() + { + AutoRepairState state = RepairType.getAutoRepairState(repairType); + + state.setTotalMVTablesConsideredForRepair(1); + + assertEquals(1, state.totalMVTablesConsideredForRepair); + } + + @Test + public void testGetTotalMVTablesConsideredForRepair() + { + AutoRepairState state = RepairType.getAutoRepairState(repairType); + state.totalMVTablesConsideredForRepair = 1; + + assertEquals(1, state.getTotalMVTablesConsideredForRepair()); + } + + @Test + public void testSetNodeRepairTimeInSec() + { + AutoRepairState state = RepairType.getAutoRepairState(repairType); + + state.setNodeRepairTimeInSec(1); + + assertEquals(1, state.nodeRepairTimeInSec); + } + + @Test + public void testSetClusterRepairTimeInSec() + { + AutoRepairState state = RepairType.getAutoRepairState(repairType); + + state.setClusterRepairTimeInSec(1); + + assertEquals(1, state.clusterRepairTimeInSec); + } + + @Test + public void testSetRepairKeyspaceCount() + { + AutoRepairState state = RepairType.getAutoRepairState(repairType); + + state.setRepairKeyspaceCount(1); + + assertEquals(1, state.repairKeyspaceCount); + } + + @Test + public void testGetRepairKeyspaceCount() + { + AutoRepairState state = RepairType.getAutoRepairState(repairType); + state.repairKeyspaceCount = 1; + + assertEquals(1, state.getRepairKeyspaceCount()); + } + + @Test + public void testSetLongestUnrepairedNode() + { + AutoRepairState state = RepairType.getAutoRepairState(repairType); + AutoRepairHistory history = new AutoRepairHistory(UUID.randomUUID(), "", 0, 0, null, 0, false); + + state.setLongestUnrepairedNode(history); + + assertEquals(history, state.longestUnrepairedNode); + } + + @Test + public void testSetSucceededTokenRangesCount() + { + AutoRepairState state = RepairType.getAutoRepairState(repairType); + + state.setSucceededTokenRangesCount(1); + + assertEquals(1, state.succeededTokenRangesCount); + } + + @Test + public void testGetSucceededTokenRangesCount() + { + AutoRepairState state = RepairType.getAutoRepairState(repairType); + state.succeededTokenRangesCount = 1; + + assertEquals(1, state.getSucceededTokenRangesCount()); + } + + @Test + public void testSetFailedTokenRangesCount() + { + AutoRepairState state = RepairType.getAutoRepairState(repairType); + + state.setFailedTokenRangesCount(1); + + assertEquals(1, state.failedTokenRangesCount); + } + + @Test + public void testGetFailedTokenRangesCount() + { + AutoRepairState state = RepairType.getAutoRepairState(repairType); + state.failedTokenRangesCount = 1; + + assertEquals(1, state.getFailedTokenRangesCount()); + } +} diff --git a/test/unit/org/apache/cassandra/repair/autorepair/AutoRepairTablePropertyTest.java b/test/unit/org/apache/cassandra/repair/autorepair/AutoRepairTablePropertyTest.java new file mode 100644 index 000000000000..1bf8e52f9849 --- /dev/null +++ b/test/unit/org/apache/cassandra/repair/autorepair/AutoRepairTablePropertyTest.java @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.repair.autorepair; + +import java.util.Map; + +import org.junit.Test; + +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.cql3.CQLTester; +import org.apache.cassandra.db.ColumnFamilyStore; +import org.apache.cassandra.db.Keyspace; +import org.apache.cassandra.db.SimpleBuilders; +import org.apache.cassandra.db.rows.ColumnData; +import org.apache.cassandra.db.rows.Row; +import org.apache.cassandra.schema.ColumnMetadata; +import org.apache.cassandra.schema.SchemaConstants; +import org.apache.cassandra.schema.SchemaKeyspace; +import org.apache.cassandra.schema.SchemaKeyspaceTables; +import org.apache.cassandra.utils.ByteBufferUtil; + +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertNull; + +/** + * Unit tests that verifies "auto_repair" is not included in Schema mutation + * {@link org.apache.cassandra.schema.SchemaKeyspace} if AutoRepair is disabled + */ +public class AutoRepairTablePropertyTest extends CQLTester +{ + @Test + public void testSchedulerDisabledNoColumnReturned() + { + helperTestTableProperty(false); + } + + @Test + public void testSchedulerEnabledShouldReturnColumnReturned() + { + helperTestTableProperty(true); + } + + public void helperTestTableProperty(boolean autoRepairOn) + { + DatabaseDescriptor.getAutoRepairConfig().setAutoRepairSchedulingEnabled(autoRepairOn); + DatabaseDescriptor.setAccordTransactionsEnabled(false); + + Map systemSchemaTables = Map.of(SchemaKeyspaceTables.TABLES, "table_name", SchemaKeyspaceTables.VIEWS, "view_name"); + for (Map.Entry systemSchema : systemSchemaTables.entrySet()) + { + ColumnFamilyStore tables = Keyspace.open(SchemaConstants.SCHEMA_KEYSPACE_NAME).getColumnFamilyStore(systemSchema.getKey()); + SimpleBuilders.RowBuilder builder = new SimpleBuilders.RowBuilder(tables.metadata(), systemSchema.getValue()); + SchemaKeyspace.addTableParamsToRowBuilder(tables.metadata().params, builder, false); + Row row = builder.build(); + ColumnMetadata autoRepair = tables.metadata().getColumn(ByteBufferUtil.bytes("auto_repair")); + ColumnData data = row.getCell(autoRepair); + if (autoRepairOn) + { + assertNotNull(data); + } + else + { + // if AutoRepair is not enabled, the column should not be returned + // as part of the system_schema.tables mutation + assertNull(data); + } + } + } +} diff --git a/test/unit/org/apache/cassandra/repair/autorepair/AutoRepairTest.java b/test/unit/org/apache/cassandra/repair/autorepair/AutoRepairTest.java new file mode 100644 index 000000000000..1eceb386ee25 --- /dev/null +++ b/test/unit/org/apache/cassandra/repair/autorepair/AutoRepairTest.java @@ -0,0 +1,164 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.repair.autorepair; + +import java.util.HashMap; +import java.util.Map; + +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Test; +import org.junit.Assert; + +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.config.DurationSpec; +import org.apache.cassandra.cql3.CQLTester; +import org.apache.cassandra.db.Keyspace; +import org.apache.cassandra.exceptions.ConfigurationException; +import org.apache.cassandra.schema.KeyspaceMetadata; +import org.apache.cassandra.schema.KeyspaceParams; +import org.apache.cassandra.schema.ReplicationParams; +import org.apache.cassandra.repair.autorepair.AutoRepairConfig.RepairType; +import org.apache.cassandra.schema.SchemaTestUtil; +import org.apache.cassandra.service.AutoRepairService; + +import static org.apache.cassandra.Util.setAutoRepairEnabled; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +/** + * Unit tests for {@link org.apache.cassandra.repair.autorepair.AutoRepair} + */ +public class AutoRepairTest extends CQLTester +{ + @BeforeClass + public static void setupClass() throws Exception + { + setAutoRepairEnabled(true); + requireNetwork(); + } + + @Before + public void setup() + { + AutoRepair.SLEEP_IF_REPAIR_FINISHES_QUICKLY = new DurationSpec.IntSecondsBound("0s"); + DatabaseDescriptor.setCDCOnRepairEnabled(false); + DatabaseDescriptor.setMaterializedViewsOnRepairEnabled(false); + DatabaseDescriptor.getAutoRepairConfig().setAutoRepairEnabled(RepairType.FULL, true); + DatabaseDescriptor.getAutoRepairConfig().setAutoRepairEnabled(RepairType.INCREMENTAL, true); + AutoRepairService.setup(); + } + + @Test + public void testSetup() + { + AutoRepair.instance.setup(); + assertEquals(RepairType.values().length, AutoRepair.instance.repairExecutors.size()); + for (RepairType repairType : AutoRepair.instance.repairExecutors.keySet()) + { + int expectedTasks = AutoRepair.instance.repairExecutors.get(repairType).getPendingTaskCount() + + AutoRepair.instance.repairExecutors.get(repairType).getActiveTaskCount(); + assertTrue(String.format("Expected > 0 task in queue for %s but was %s", repairType, expectedTasks), + expectedTasks > 0); + } + } + + @Test + public void testSafeGuardSetupCall() + { + // only one should be setup, and rest should be ignored + AutoRepair.instance.setup(); + AutoRepair.instance.setup(); + AutoRepair.instance.setup(); + + assertEquals(RepairType.values().length, AutoRepair.instance.repairExecutors.size()); + for (RepairType repairType : AutoRepair.instance.repairExecutors.keySet()) + { + int expectedTasks = AutoRepair.instance.repairExecutors.get(repairType).getPendingTaskCount() + + AutoRepair.instance.repairExecutors.get(repairType).getActiveTaskCount(); + assertTrue(String.format("Expected > 0 task in queue for %s but was %s", repairType, expectedTasks), + expectedTasks > 0); + } + } + + @Test(expected = ConfigurationException.class) + public void testSetupFailsWhenIREnabledWithCDCReplay() + { + DatabaseDescriptor.getAutoRepairConfig().setAutoRepairEnabled(RepairType.INCREMENTAL, true); + DatabaseDescriptor.setMaterializedViewsOnRepairEnabled(true); + DatabaseDescriptor.setCDCEnabled(true); + DatabaseDescriptor.setCDCOnRepairEnabled(true); + AutoRepair.instance.isSetupDone = false; + AutoRepair.instance.setup(); + } + + @Test + public void testNoFailureIfMVRepairOnButConfigIsOff() + { + DatabaseDescriptor.getAutoRepairConfig().setAutoRepairEnabled(RepairType.INCREMENTAL, true); + DatabaseDescriptor.getAutoRepairConfig().setMaterializedViewRepairEnabled(RepairType.INCREMENTAL, false); + DatabaseDescriptor.setCDCOnRepairEnabled(false); + DatabaseDescriptor.setMaterializedViewsOnRepairEnabled(true); + AutoRepair.instance.setup(); + } + + @Test(expected = ConfigurationException.class) + public void testSetupFailsWhenIREnabledWithMVReplay() + { + DatabaseDescriptor.getAutoRepairConfig().setAutoRepairEnabled(RepairType.INCREMENTAL, true); + DatabaseDescriptor.getAutoRepairConfig().setMaterializedViewRepairEnabled(RepairType.INCREMENTAL, true); + DatabaseDescriptor.setCDCOnRepairEnabled(false); + DatabaseDescriptor.setMaterializedViewsOnRepairEnabled(true); + AutoRepair.instance.isSetupDone = false; + AutoRepair.instance.setup(); + } + + @Test + public void testCheckNTSreplicationNodeInsideOutsideDC() + { + String ksname1 = "ks_nts1"; + String ksname2 = "ks_nts2"; + Map configOptions1 = new HashMap<>(); + configOptions1.put("datacenter1", "3"); + configOptions1.put(ReplicationParams.CLASS, "NetworkTopologyStrategy"); + SchemaTestUtil.addOrUpdateKeyspace(KeyspaceMetadata.create(ksname1, KeyspaceParams.create(false, configOptions1)), false); + Map configOptions2 = new HashMap<>(); + configOptions2.put("datacenter2", "3"); + configOptions2.put(ReplicationParams.CLASS, "NetworkTopologyStrategy"); + SchemaTestUtil.addOrUpdateKeyspace(KeyspaceMetadata.create(ksname2, KeyspaceParams.create(false, configOptions2)), false); + + for (Keyspace ks : Keyspace.all()) + { + if (ks.getName().equals(ksname1)) + { + // case 1 : + // node reside in "datacenter1" + // keyspace has replica in "datacenter1" + Assert.assertTrue(AutoRepairUtils.shouldConsiderKeyspace(ks)); + } + else if (ks.getName().equals(ksname2)) + { + // case 2 : + // node reside in "datacenter1" + // keyspace has replica in "datacenter2" + Assert.assertFalse(AutoRepairUtils.shouldConsiderKeyspace(ks)); + } + } + } +} diff --git a/test/unit/org/apache/cassandra/repair/autorepair/AutoRepairUtilsTest.java b/test/unit/org/apache/cassandra/repair/autorepair/AutoRepairUtilsTest.java new file mode 100644 index 000000000000..d9723ea193dd --- /dev/null +++ b/test/unit/org/apache/cassandra/repair/autorepair/AutoRepairUtilsTest.java @@ -0,0 +1,491 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.repair.autorepair; + +import java.util.List; +import java.util.Set; +import java.util.TreeSet; +import java.util.UUID; + +import com.google.common.collect.ImmutableSet; +import org.apache.cassandra.schema.SystemDistributedKeyspace; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Test; + +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.config.DurationSpec; +import org.apache.cassandra.cql3.UntypedResultSet; +import org.apache.cassandra.db.Keyspace; +import org.apache.cassandra.db.marshal.UUIDType; +import org.apache.cassandra.locator.InetAddressAndPort; +import org.apache.cassandra.repair.autorepair.AutoRepairConfig.RepairType; +import org.apache.cassandra.repair.autorepair.AutoRepairUtils.AutoRepairHistory; +import org.apache.cassandra.repair.autorepair.AutoRepairUtils.CurrentRepairStatus; + +import org.apache.cassandra.cql3.CQLTester; +import org.apache.cassandra.cql3.QueryProcessor; +import org.apache.cassandra.schema.SchemaConstants; +import org.apache.cassandra.service.StorageService; +import org.apache.cassandra.tcm.membership.NodeAddresses; +import org.apache.cassandra.utils.FBUtilities; + +import static org.apache.cassandra.Util.setAutoRepairEnabled; +import static org.apache.cassandra.config.CassandraRelevantProperties.SYSTEM_DISTRIBUTED_DEFAULT_RF; +import static org.apache.cassandra.repair.autorepair.AutoRepairUtils.COL_DELETE_HOSTS; +import static org.apache.cassandra.repair.autorepair.AutoRepairUtils.COL_FORCE_REPAIR; +import static org.apache.cassandra.repair.autorepair.AutoRepairUtils.COL_REPAIR_FINISH_TS; +import static org.apache.cassandra.repair.autorepair.AutoRepairUtils.COL_REPAIR_PRIORITY; +import static org.apache.cassandra.repair.autorepair.AutoRepairUtils.COL_REPAIR_START_TS; +import static org.apache.cassandra.repair.autorepair.AutoRepairUtils.COL_REPAIR_TURN; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertTrue; + +/** + * Unit tests for {@link org.apache.cassandra.repair.autorepair.AutoRepairUtils} + */ +public class AutoRepairUtilsTest extends CQLTester +{ + static final RepairType repairType = RepairType.INCREMENTAL; + static UUID hostId; + + static InetAddressAndPort localEndpoint; + + @BeforeClass + public static void setupClass() throws Exception + { + SYSTEM_DISTRIBUTED_DEFAULT_RF.setInt(1); + setAutoRepairEnabled(true); + requireNetwork(); + localEndpoint = FBUtilities.getBroadcastAddressAndPort(); + hostId = StorageService.instance.getHostIdForEndpoint(localEndpoint); + StorageService.instance.doAutoRepairSetup(); + } + + @Before + public void setup() + { + SYSTEM_DISTRIBUTED_DEFAULT_RF.setInt(1); + QueryProcessor.executeInternal(String.format("CREATE KEYSPACE %s WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '1'}", "ks")); + QueryProcessor.executeInternal(String.format("CREATE TABLE %s.%s (k text, s text static, i int, v text, primary key(k,i))", "ks", "tbl")); + + AutoRepair.SLEEP_IF_REPAIR_FINISHES_QUICKLY = new DurationSpec.IntSecondsBound("0s"); + QueryProcessor.executeInternal(String.format( + "TRUNCATE %s.%s", + SchemaConstants.DISTRIBUTED_KEYSPACE_NAME, SystemDistributedKeyspace.AUTO_REPAIR_HISTORY)); + QueryProcessor.executeInternal(String.format( + "TRUNCATE %s.%s", + SchemaConstants.DISTRIBUTED_KEYSPACE_NAME, SystemDistributedKeyspace.AUTO_REPAIR_PRIORITY)); + } + + @Test + public void testSetForceRepair() + { + QueryProcessor.executeInternal(String.format( + "INSERT INTO %s.%s (repair_type, host_id, force_repair) VALUES ('%s', %s, false)", + SchemaConstants.DISTRIBUTED_KEYSPACE_NAME, SystemDistributedKeyspace.AUTO_REPAIR_HISTORY, + repairType.toString(), hostId)); + + AutoRepairUtils.setForceRepair(repairType, ImmutableSet.of(localEndpoint)); + + UntypedResultSet result = QueryProcessor.executeInternal(String.format( + "SELECT force_repair FROM %s.%s WHERE repair_type = '%s' AND host_id = %s", + SchemaConstants.DISTRIBUTED_KEYSPACE_NAME, SystemDistributedKeyspace.AUTO_REPAIR_HISTORY, + repairType.toString(), hostId)); + assertNotNull(result); + assertEquals(1, result.size()); + assertTrue(result.one().getBoolean(COL_FORCE_REPAIR)); + } + + @Test + public void testSetForceRepairNewNode() + { + AutoRepairUtils.setForceRepairNewNode(repairType); + + UntypedResultSet result = QueryProcessor.executeInternal(String.format( + "SELECT force_repair FROM %s.%s WHERE repair_type = '%s' AND host_id = %s", + SchemaConstants.DISTRIBUTED_KEYSPACE_NAME, SystemDistributedKeyspace.AUTO_REPAIR_HISTORY, + repairType.toString(), hostId)); + assertNotNull(result); + assertEquals(1, result.size()); + assertTrue(result.one().getBoolean(COL_FORCE_REPAIR)); + } + + @Test + public void testClearDeleteHosts() + { + QueryProcessor.executeInternal(String.format( + "INSERT INTO %s.%s (repair_type, host_id, delete_hosts, delete_hosts_update_time) VALUES ('%s', %s, { %s }, toTimestamp(now()))", + SchemaConstants.DISTRIBUTED_KEYSPACE_NAME, SystemDistributedKeyspace.AUTO_REPAIR_HISTORY, + repairType.toString(), hostId, hostId)); + + AutoRepairUtils.clearDeleteHosts(repairType, hostId); + + UntypedResultSet result = QueryProcessor.executeInternal(String.format( + "SELECT delete_hosts FROM %s.%s WHERE repair_type = '%s' AND host_id = %s", + SchemaConstants.DISTRIBUTED_KEYSPACE_NAME, SystemDistributedKeyspace.AUTO_REPAIR_HISTORY, + repairType.toString(), hostId)); + assertNotNull(result); + assertEquals(1, result.size()); + Set deleteHosts = result.one().getSet(COL_DELETE_HOSTS, UUIDType.instance); + assertNull(deleteHosts); + } + + @Test + public void testGetAutoRepairHistoryForLocalGroup() + { + QueryProcessor.executeInternal(String.format( + "INSERT INTO %s.%s (repair_type, host_id, force_repair) VALUES ('%s', %s, false)", + SchemaConstants.DISTRIBUTED_KEYSPACE_NAME, SystemDistributedKeyspace.AUTO_REPAIR_HISTORY, + repairType.toString(), hostId)); + + List history = AutoRepairUtils.getAutoRepairHistory(repairType); + assertNotNull(history); + assertEquals(1, history.size()); + assertEquals(hostId, history.get(0).hostId); + } + + @Test + public void testGetAutoRepairHistoryForLocalGroup_empty_history() + { + List history = AutoRepairUtils.getAutoRepairHistory(repairType); + + assertNull(history); + } + + @Test + public void testGetCurrentRepairStatus() + { + UUID forceRepair = UUID.randomUUID(); + UUID regularRepair = UUID.randomUUID(); + QueryProcessor.executeInternal(String.format( + "INSERT INTO %s.%s (repair_type, host_id) VALUES ('%s', %s)", + SchemaConstants.DISTRIBUTED_KEYSPACE_NAME, SystemDistributedKeyspace.AUTO_REPAIR_HISTORY, + repairType.toString(), hostId)); + QueryProcessor.executeInternal(String.format( + "INSERT INTO %s.%s (repair_type, host_id, force_repair, repair_start_ts) VALUES ('%s', %s, true, toTimestamp(now()))", + SchemaConstants.DISTRIBUTED_KEYSPACE_NAME, SystemDistributedKeyspace.AUTO_REPAIR_HISTORY, + repairType.toString(), forceRepair)); + QueryProcessor.executeInternal(String.format( + "INSERT INTO %s.%s (repair_type, host_id, repair_start_ts) VALUES ('%s', %s, toTimestamp(now()))", + SchemaConstants.DISTRIBUTED_KEYSPACE_NAME, SystemDistributedKeyspace.AUTO_REPAIR_HISTORY, + repairType.toString(), regularRepair)); + QueryProcessor.executeInternal(String.format( + "INSERT INTO %s.%s (repair_type, repair_priority) VALUES ('%s', { %s })", + SchemaConstants.DISTRIBUTED_KEYSPACE_NAME, SystemDistributedKeyspace.AUTO_REPAIR_PRIORITY, + repairType.toString(), regularRepair)); + + CurrentRepairStatus status = AutoRepairUtils.getCurrentRepairStatus(repairType, AutoRepairUtils.getAutoRepairHistory(repairType), hostId); + + assertNotNull(status); + assertEquals(1, status.historiesWithoutOnGoingRepair.size()); + assertEquals(hostId, status.historiesWithoutOnGoingRepair.get(0).hostId); + assertEquals(1, status.hostIdsWithOnGoingRepair.size()); + assertTrue(status.hostIdsWithOnGoingRepair.contains(regularRepair)); + assertEquals(1, status.hostIdsWithOnGoingForceRepair.size()); + assertTrue(status.hostIdsWithOnGoingForceRepair.contains(forceRepair)); + assertEquals(1, status.priority.size()); + assertTrue(status.priority.contains(regularRepair)); + assertEquals(hostId, status.myRepairHistory.hostId); + } + + @Test + public void testGetHostIdsInCurrentRing() + { + TreeSet hosts = AutoRepairUtils.getHostIdsInCurrentRing(repairType); + + assertNotNull(hosts); + assertEquals(1, hosts.size()); + assertTrue(hosts.contains(hostId)); + } + + @Test + public void testGetHostIdsInCurrentRing_multiple_nodes() + { + InetAddressAndPort ignoredEndpoint = localEndpoint.withPort(localEndpoint.getPort() + 1); + InetAddressAndPort deadEndpoint = localEndpoint.withPort(localEndpoint.getPort() + 2); + DatabaseDescriptor.getAutoRepairConfig().setIgnoreDCs(repairType, ImmutableSet.of("dc2")); + + TreeSet hosts = AutoRepairUtils.getHostIdsInCurrentRing(repairType, ImmutableSet.of(new NodeAddresses(localEndpoint), new NodeAddresses(ignoredEndpoint), new NodeAddresses(deadEndpoint))); + + assertNotNull(hosts); + assertEquals(1, hosts.size()); + assertTrue(hosts.contains(hostId)); + } + + @Test + public void testGetHostWithLongestUnrepairTime() + { + UUID otherHostId = UUID.randomUUID(); + QueryProcessor.executeInternal(String.format( + "INSERT INTO %s.%s (repair_type, host_id) VALUES ('%s', %s)", + SchemaConstants.DISTRIBUTED_KEYSPACE_NAME, SystemDistributedKeyspace.AUTO_REPAIR_HISTORY, + repairType.toString(), hostId)); + QueryProcessor.executeInternal(String.format( + "INSERT INTO %s.%s (repair_type, host_id, repair_finish_ts) VALUES ('%s', %s, toTimestamp(now()))", + SchemaConstants.DISTRIBUTED_KEYSPACE_NAME, SystemDistributedKeyspace.AUTO_REPAIR_HISTORY, + repairType.toString(), otherHostId)); + + AutoRepairHistory history = AutoRepairUtils.getHostWithLongestUnrepairTime(repairType); + + assertEquals(hostId, history.hostId); + } + + @Test + public void testGetMaxNumberOfNodeRunAutoRepairInGroup_0_group_size() + { + DatabaseDescriptor.getAutoRepairConfig().setParallelRepairCount(repairType, 2); + + int count = AutoRepairUtils.getMaxNumberOfNodeRunAutoRepair(repairType, 0); + + assertEquals(2, count); + } + + @Test + public void testGetMaxNumberOfNodeRunAutoRepairInGroup_percentage() + { + DatabaseDescriptor.getAutoRepairConfig().setParallelRepairCount(repairType, 2); + DatabaseDescriptor.getAutoRepairConfig().setParallelRepairPercentage(repairType, 50); + + + int count = AutoRepairUtils.getMaxNumberOfNodeRunAutoRepair(repairType, 10); + + assertEquals(5, count); + } + + @Test + public void testDeleteAutoRepairHistory() + { + QueryProcessor.executeInternal(String.format( + "INSERT INTO %s.%s (repair_type, host_id) VALUES ('%s', %s)", + SchemaConstants.DISTRIBUTED_KEYSPACE_NAME, SystemDistributedKeyspace.AUTO_REPAIR_HISTORY, + repairType.toString(), hostId)); + + AutoRepairUtils.deleteAutoRepairHistory(repairType, hostId); + + UntypedResultSet result = QueryProcessor.executeInternal(String.format( + "SELECT * FROM %s.%s WHERE repair_type = '%s' AND host_id = %s", + SchemaConstants.DISTRIBUTED_KEYSPACE_NAME, SystemDistributedKeyspace.AUTO_REPAIR_HISTORY, + repairType.toString(), hostId)); + assertNotNull(result); + assertEquals(0, result.size()); + } + + @Test + public void testUpdateStartAutoRepairHistory() + { + QueryProcessor.executeInternal(String.format( + "INSERT INTO %s.%s (repair_type, host_id) VALUES ('%s', %s)", + SchemaConstants.DISTRIBUTED_KEYSPACE_NAME, SystemDistributedKeyspace.AUTO_REPAIR_HISTORY, + repairType.toString(), hostId)); + + AutoRepairUtils.updateStartAutoRepairHistory(repairType, hostId, 123, AutoRepairUtils.RepairTurn.MY_TURN); + + UntypedResultSet result = QueryProcessor.executeInternal(String.format( + "SELECT repair_start_ts, repair_turn FROM %s.%s WHERE repair_type = '%s' AND host_id = %s", + SchemaConstants.DISTRIBUTED_KEYSPACE_NAME, SystemDistributedKeyspace.AUTO_REPAIR_HISTORY, + repairType.toString(), hostId)); + assertNotNull(result); + assertEquals(1, result.size()); + UntypedResultSet.Row row = result.one(); + assertEquals(123, row.getLong(COL_REPAIR_START_TS, 0)); + assertEquals(AutoRepairUtils.RepairTurn.MY_TURN.toString(), row.getString(COL_REPAIR_TURN)); + } + + @Test + public void testUpdateFinishAutoRepairHistory() + { + QueryProcessor.executeInternal(String.format( + "INSERT INTO %s.%s (repair_type, host_id) VALUES ('%s', %s)", + SchemaConstants.DISTRIBUTED_KEYSPACE_NAME, SystemDistributedKeyspace.AUTO_REPAIR_HISTORY, + repairType.toString(), hostId)); + + AutoRepairUtils.updateFinishAutoRepairHistory(repairType, hostId, 123); + + UntypedResultSet result = QueryProcessor.executeInternal(String.format( + "SELECT repair_finish_ts FROM %s.%s WHERE repair_type = '%s' AND host_id = %s", + SchemaConstants.DISTRIBUTED_KEYSPACE_NAME, SystemDistributedKeyspace.AUTO_REPAIR_HISTORY, + repairType.toString(), hostId)); + assertNotNull(result); + assertEquals(1, result.size()); + assertEquals(123, result.one().getLong(COL_REPAIR_FINISH_TS, 0)); + } + + @Test + public void testAddHostIdToDeleteHosts() + { + UUID otherHostId = UUID.randomUUID(); + QueryProcessor.executeInternal(String.format( + "INSERT INTO %s.%s (repair_type, host_id) VALUES ('%s', %s)", + SchemaConstants.DISTRIBUTED_KEYSPACE_NAME, SystemDistributedKeyspace.AUTO_REPAIR_HISTORY, + repairType.toString(), otherHostId)); + + AutoRepairUtils.addHostIdToDeleteHosts(repairType, hostId, otherHostId); + + UntypedResultSet result = QueryProcessor.executeInternal(String.format( + "SELECT * FROM %s.%s WHERE repair_type = '%s' AND host_id = %s", + SchemaConstants.DISTRIBUTED_KEYSPACE_NAME, SystemDistributedKeyspace.AUTO_REPAIR_HISTORY, + repairType.toString(), otherHostId)); + assertNotNull(result); + assertEquals(1, result.size()); + Set deleteHosts = result.one().getSet(COL_DELETE_HOSTS, UUIDType.instance); + assertNotNull(deleteHosts); + assertEquals(1, deleteHosts.size()); + assertTrue(deleteHosts.contains(hostId)); + } + + @Test + public void testAddPriorityHost() + { + AutoRepairUtils.addPriorityHosts(repairType, ImmutableSet.of(localEndpoint)); + + UntypedResultSet result = QueryProcessor.executeInternal(String.format( + "SELECT * FROM %s.%s WHERE repair_type = '%s'", + SchemaConstants.DISTRIBUTED_KEYSPACE_NAME, SystemDistributedKeyspace.AUTO_REPAIR_PRIORITY, + repairType.toString())); + assertNotNull(result); + assertEquals(1, result.size()); + Set repairPriority = result.one().getSet(COL_REPAIR_PRIORITY, UUIDType.instance); + assertNotNull(repairPriority); + assertEquals(1, repairPriority.size()); + assertTrue(repairPriority.contains(hostId)); + } + + @Test + public void testRemovePriorityStatus() + { + QueryProcessor.executeInternal(String.format( + "INSERT INTO %s.%s (repair_type, repair_priority) VALUES ('%s', { %s })", + SchemaConstants.DISTRIBUTED_KEYSPACE_NAME, SystemDistributedKeyspace.AUTO_REPAIR_PRIORITY, + repairType.toString(), hostId)); + + AutoRepairUtils.removePriorityStatus(repairType, hostId); + + UntypedResultSet result = QueryProcessor.executeInternal(String.format( + "SELECT * FROM %s.%s WHERE repair_type = '%s'", + SchemaConstants.DISTRIBUTED_KEYSPACE_NAME, SystemDistributedKeyspace.AUTO_REPAIR_PRIORITY, + repairType.toString())); + assertNotNull(result); + assertEquals(1, result.size()); + Set repairPriority = result.one().getSet(COL_REPAIR_PRIORITY, UUIDType.instance); + assertNull(repairPriority); + } + + @Test + public void testGetPriorityHosts() + { + QueryProcessor.executeInternal(String.format( + "INSERT INTO %s.%s (repair_type, repair_priority) VALUES ('%s', { %s })", + SchemaConstants.DISTRIBUTED_KEYSPACE_NAME, SystemDistributedKeyspace.AUTO_REPAIR_PRIORITY, + repairType.toString(), hostId)); + + Set hosts = AutoRepairUtils.getPriorityHosts(repairType); + + assertNotNull(hosts); + assertEquals(1, hosts.size()); + assertTrue(hosts.contains(localEndpoint)); + } + + @Test + public void testCheckNodeContainsKeyspaceReplica() + { + Keyspace ks = Keyspace.open("ks"); + + assertTrue(AutoRepairUtils.shouldConsiderKeyspace(ks)); + } + + @Test + public void testTableMaxRepairTimeExceeded() + { + DatabaseDescriptor.getAutoRepairConfig().setAutoRepairTableMaxRepairTime(repairType, "0s"); + + assertTrue(AutoRepairUtils.tableMaxRepairTimeExceeded(repairType, 0)); + } + + @Test + public void testKeyspaceMaxRepairTimeExceeded() + { + DatabaseDescriptor.getAutoRepairConfig().setAutoRepairTableMaxRepairTime(repairType, "0s"); + + assertTrue(AutoRepairUtils.keyspaceMaxRepairTimeExceeded(repairType, 0, 1)); + } + + @Test + public void testGetLastRepairFinishTime() + { + AutoRepairHistory history = new AutoRepairHistory(UUID.randomUUID(), "", 0, 0, null, 0, false); + + assertEquals(0, history.getLastRepairFinishTime()); + + history.lastRepairFinishTime = 100; + + assertEquals(100, history.getLastRepairFinishTime()); + } + + @Test + public void testMyTurnToRunRepairShouldReturnMyTurnWhenRepairOngoing() + { + UUID myID = UUID.randomUUID(); + UUID otherID = UUID.randomUUID(); + DatabaseDescriptor.getAutoRepairConfig().setParallelRepairCount(repairType, 5); + long currentMillis = System.currentTimeMillis(); + // finish time less than start time means that repair is ongoing + AutoRepairUtils.insertNewRepairHistory(repairType, myID, currentMillis, currentMillis - 100); + // finish time is larger than start time means that repair for other node is finished + AutoRepairUtils.insertNewRepairHistory(repairType, otherID, currentMillis, currentMillis + 100); + + assertEquals(AutoRepairUtils.RepairTurn.MY_TURN, AutoRepairUtils.myTurnToRunRepair(repairType, myID)); + } + + @Test + public void testLocalStrategyAndNetworkKeyspace() + { + assertFalse(AutoRepairUtils.shouldConsiderKeyspace(Keyspace.open("system"))); + assertTrue(AutoRepairUtils.shouldConsiderKeyspace(Keyspace.open(KEYSPACE))); + } + + @Test + public void testGetLastRepairTimeForNode() + { + UUID myID = UUID.randomUUID(); + UUID otherID = UUID.randomUUID(); + long currentMillis = System.currentTimeMillis(); + AutoRepairUtils.insertNewRepairHistory(repairType, myID, currentMillis, currentMillis - 100); + AutoRepairUtils.insertNewRepairHistory(repairType, otherID, currentMillis, currentMillis + 100); + + assertEquals(currentMillis - 100, AutoRepairUtils.getLastRepairTimeForNode(repairType, myID)); + } + + @Test + public void testGetLastRepairTimeForNodeWhenHistoryIsEmpty() + { + UUID myID = UUID.randomUUID(); + + assertEquals(0, AutoRepairUtils.getLastRepairTimeForNode(repairType, myID)); + } + + @Test + public void testSkipSystemTraces() + { + assertFalse(AutoRepairUtils.shouldConsiderKeyspace(Keyspace.open(SchemaConstants.TRACE_KEYSPACE_NAME))); + } +} diff --git a/test/unit/org/apache/cassandra/repair/autorepair/FixedSplitTokenRangeSplitterHelper.java b/test/unit/org/apache/cassandra/repair/autorepair/FixedSplitTokenRangeSplitterHelper.java new file mode 100644 index 000000000000..dac4a167d556 --- /dev/null +++ b/test/unit/org/apache/cassandra/repair/autorepair/FixedSplitTokenRangeSplitterHelper.java @@ -0,0 +1,201 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.repair.autorepair; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; +import java.util.Set; +import java.util.TreeSet; + +import org.apache.cassandra.ServerTestUtils; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.cql3.QueryProcessor; +import org.apache.cassandra.dht.BootStrapper; +import org.apache.cassandra.dht.Range; +import org.apache.cassandra.dht.Token; +import org.apache.cassandra.index.sai.disk.format.Version; +import org.apache.cassandra.service.AutoRepairService; +import org.apache.cassandra.service.StorageService; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.tcm.compatibility.TokenRingUtils; +import org.apache.cassandra.utils.FBUtilities; + +import static org.apache.cassandra.config.CassandraRelevantProperties.SYSTEM_DISTRIBUTED_DEFAULT_RF; +import static org.apache.cassandra.cql3.CQLTester.Fuzzed.setupSeed; +import static org.apache.cassandra.cql3.CQLTester.Fuzzed.updateConfigs; +import static org.apache.cassandra.repair.autorepair.FixedSplitTokenRangeSplitter.DEFAULT_NUMBER_OF_SUBRANGES; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; + +/** + * Helper class for {@link FixedSplitTokenRangeSplitterNoVNodesTest} and {@link FixedSplitTokenRangeSplitterVNodesTest} + */ +public class FixedSplitTokenRangeSplitterHelper +{ + private static final String TABLE1 = "tbl1"; + private static final String TABLE2 = "tbl2"; + private static final String TABLE3 = "tbl3"; + public static final String KEYSPACE = "ks"; + + public static void setupClass(int numTokens) throws Exception + { + setupSeed(); + updateConfigs(); + DatabaseDescriptor.setPartitioner("org.apache.cassandra.dht.Murmur3Partitioner"); + ServerTestUtils.prepareServerNoRegister(); + + Set tokens = BootStrapper.getRandomTokens(ClusterMetadata.current(), numTokens); + ServerTestUtils.registerLocal(tokens); + // Ensure that the on-disk format statics are loaded before the test run + Version.LATEST.onDiskFormat(); + StorageService.instance.doAutoRepairSetup(); + + SYSTEM_DISTRIBUTED_DEFAULT_RF.setInt(1); + QueryProcessor.executeInternal(String.format("CREATE KEYSPACE %s WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '1'}", FixedSplitTokenRangeSplitterHelper.KEYSPACE)); + } + + public static void testTokenRangesSplitByTable(int numTokens, int numberOfSubRanges, AutoRepairConfig.RepairType repairType) + { + int numberOfSplits = calcSplits(numTokens, numberOfSubRanges); + AutoRepairService.instance.getAutoRepairConfig().setRepairByKeyspace(repairType, false); + Collection> tokens = TokenRingUtils.getPrimaryRangesForEndpoint(KEYSPACE, FBUtilities.getBroadcastAddressAndPort()); + assertEquals(numTokens, tokens.size()); + List tables = Arrays.asList(TABLE1, TABLE2, TABLE3); + List> expectedToken = new ArrayList<>(); + for (int i = 0; i < tables.size(); i++) + { + for (Range range : tokens) + { + expectedToken.addAll(AutoRepairUtils.split(range, numberOfSplits)); + } + } + + List plan = PrioritizedRepairPlan.buildSingleKeyspacePlan(repairType, KEYSPACE, TABLE1, TABLE2, TABLE3); + + Iterator keyspaceAssignments = new FixedSplitTokenRangeSplitter(repairType, Collections.singletonMap(FixedSplitTokenRangeSplitter.NUMBER_OF_SUBRANGES, Integer.toString(numberOfSubRanges))) + .getRepairAssignments(true, plan); + + // should be only 1 entry for the keyspace. + assertTrue(keyspaceAssignments.hasNext()); + KeyspaceRepairAssignments keyspace = keyspaceAssignments.next(); + assertFalse(keyspaceAssignments.hasNext()); + + List assignments = keyspace.getRepairAssignments(); + assertEquals(numTokens * numberOfSplits * tables.size(), assignments.size()); + assertEquals(expectedToken.size(), assignments.size()); + + int assignmentsPerTable = numTokens * numberOfSplits; + for (int i = 0; i < tables.size(); i++) + { + List assignmentForATable = new ArrayList<>(); + List> expectedTokensForATable = new ArrayList<>(); + for (int j = 0; j < assignmentsPerTable; j++) + { + assertEquals(Collections.singletonList(tables.get(i)), assignments.get(i * assignmentsPerTable + j).getTableNames()); + assignmentForATable.add(assignments.get(i * assignmentsPerTable + j)); + expectedTokensForATable.add(expectedToken.get(i * assignmentsPerTable + j)); + } + compare(numTokens, numberOfSplits, expectedTokensForATable, assignmentForATable); + } + } + + public static void testTokenRangesSplitByKeyspace(int numTokens, int numberOfSubRanges, AutoRepairConfig.RepairType repairType) + { + int numberOfSplits = calcSplits(numTokens, numberOfSubRanges); + AutoRepairService.instance.getAutoRepairConfig().setRepairByKeyspace(repairType, true); + Collection> tokens = TokenRingUtils.getPrimaryRangesForEndpoint(KEYSPACE, FBUtilities.getBroadcastAddressAndPort()); + assertEquals(numTokens, tokens.size()); + List> expectedToken = new ArrayList<>(); + for (Range range : tokens) + { + expectedToken.addAll(AutoRepairUtils.split(range, numberOfSplits)); + } + + List plan = PrioritizedRepairPlan.buildSingleKeyspacePlan(repairType, KEYSPACE, TABLE1, TABLE2, TABLE3); + + Iterator keyspaceAssignments = new FixedSplitTokenRangeSplitter(repairType, Collections.singletonMap(FixedSplitTokenRangeSplitter.NUMBER_OF_SUBRANGES, Integer.toString(numberOfSubRanges))) + .getRepairAssignments(true, plan); + + // should be only 1 entry for the keyspace. + assertTrue(keyspaceAssignments.hasNext()); + KeyspaceRepairAssignments keyspace = keyspaceAssignments.next(); + assertFalse(keyspaceAssignments.hasNext()); + + List assignments = keyspace.getRepairAssignments(); + assertNotNull(assignments); + + assertEquals(numTokens * numberOfSplits, assignments.size()); + assertEquals(expectedToken.size(), assignments.size()); + + compare(numTokens, numberOfSplits, expectedToken, assignments); + } + + public static void testTokenRangesWithDefaultSplit(int numTokens, AutoRepairConfig.RepairType repairType) + { + int numberOfSplits = calcSplits(numTokens, DEFAULT_NUMBER_OF_SUBRANGES); + Collection> tokens = TokenRingUtils.getPrimaryRangesForEndpoint(KEYSPACE, FBUtilities.getBroadcastAddressAndPort()); + assertEquals(numTokens, tokens.size()); + List> expectedToken = new ArrayList<>(); + for (Range range : tokens) + { + expectedToken.addAll(AutoRepairUtils.split(range, numberOfSplits)); + } + + List plan = PrioritizedRepairPlan.buildSingleKeyspacePlan(repairType, KEYSPACE, TABLE1); + + Iterator keyspaceAssignments = new FixedSplitTokenRangeSplitter(repairType, Collections.emptyMap()).getRepairAssignments(true, plan); + + // should be only 1 entry for the keyspace. + assertTrue(keyspaceAssignments.hasNext()); + KeyspaceRepairAssignments keyspace = keyspaceAssignments.next(); + assertFalse(keyspaceAssignments.hasNext()); + + List assignments = keyspace.getRepairAssignments(); + assertNotNull(assignments); + + // should be 3 entries for the table which covers each token range. + assertEquals(numTokens * numberOfSplits, assignments.size()); + + compare(numTokens, numberOfSplits, expectedToken, assignments); + } + + private static void compare(int numTokens, int numberOfSplits, List> expectedToken, List assignments) + { + assertEquals(expectedToken.size(), assignments.size()); + Set> a = new TreeSet<>(); + Set> b = new TreeSet<>(); + for (int i = 0; i < numTokens * numberOfSplits; i++) + { + a.add(expectedToken.get(i)); + b.add(assignments.get(i).getTokenRange()); + } + assertEquals(a, b); + } + + private static int calcSplits(int numTokens, int subRange) + { + return Math.max(1, subRange / numTokens); + } +} diff --git a/test/unit/org/apache/cassandra/repair/autorepair/FixedSplitTokenRangeSplitterNoVNodesTest.java b/test/unit/org/apache/cassandra/repair/autorepair/FixedSplitTokenRangeSplitterNoVNodesTest.java new file mode 100644 index 000000000000..a30f3aa76246 --- /dev/null +++ b/test/unit/org/apache/cassandra/repair/autorepair/FixedSplitTokenRangeSplitterNoVNodesTest.java @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.repair.autorepair; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.List; + +import org.junit.BeforeClass; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +/** + * Unit tests for a setup that does not have v-nodes {@link FixedSplitTokenRangeSplitter} + */ +@RunWith(Parameterized.class) +public class FixedSplitTokenRangeSplitterNoVNodesTest +{ + private static final int numTokens = 1; + + @Parameterized.Parameter(0) + public AutoRepairConfig.RepairType repairType; + + @Parameterized.Parameter(1) + public int numberOfSubRanges; + + @Parameterized.Parameters(name = "repairType={0}, numberOfSubRanges={1}") + public static Collection parameters() + { + List params = new ArrayList<>(); + for (AutoRepairConfig.RepairType type : AutoRepairConfig.RepairType.values()) + { + for (int subRange : Arrays.asList(1, 2, 4, 8, 16, 32, 64, 128, 256)) + { + params.add(new Object[]{ type, subRange }); + } + } + return params; + } + + @BeforeClass + public static void setupClass() throws Exception + { + FixedSplitTokenRangeSplitterHelper.setupClass(numTokens); + } + + @Test + public void testTokenRangesSplitByTable() + { + FixedSplitTokenRangeSplitterHelper.testTokenRangesSplitByTable(numTokens, numberOfSubRanges, repairType); + } + + @Test + public void testTokenRangesSplitByKeyspace() + { + FixedSplitTokenRangeSplitterHelper.testTokenRangesSplitByKeyspace(numTokens, numberOfSubRanges, repairType); + } + + @Test + public void testTokenRangesWithDefaultSplit() + { + FixedSplitTokenRangeSplitterHelper.testTokenRangesWithDefaultSplit(numTokens, repairType); + } +} diff --git a/test/unit/org/apache/cassandra/repair/autorepair/FixedSplitTokenRangeSplitterVNodesTest.java b/test/unit/org/apache/cassandra/repair/autorepair/FixedSplitTokenRangeSplitterVNodesTest.java new file mode 100644 index 000000000000..6839748d1f01 --- /dev/null +++ b/test/unit/org/apache/cassandra/repair/autorepair/FixedSplitTokenRangeSplitterVNodesTest.java @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.repair.autorepair; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.List; + +import org.junit.BeforeClass; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +/** + * Unit tests for a setup that has v-nodes {@link FixedSplitTokenRangeSplitter} + */ +@RunWith(Parameterized.class) +public class FixedSplitTokenRangeSplitterVNodesTest +{ + private static final int numTokens = 16; + + @Parameterized.Parameter(0) + public AutoRepairConfig.RepairType repairType; + + @Parameterized.Parameter(1) + public int numberOfSubRanges; + + @Parameterized.Parameters(name = "repairType={0}, numberOfSubRanges={1}") + public static Collection parameters() + { + List params = new ArrayList<>(); + for (AutoRepairConfig.RepairType type : AutoRepairConfig.RepairType.values()) + { + for (int subRange : Arrays.asList(1, 2, 4, 8, 16, 32, 64, 128, 256)) + { + params.add(new Object[]{ type, subRange }); + } + } + return params; + } + + @BeforeClass + public static void setupClass() throws Exception + { + FixedSplitTokenRangeSplitterHelper.setupClass(numTokens); + } + + @Test + public void testTokenRangesSplitByTable() + { + FixedSplitTokenRangeSplitterHelper.testTokenRangesSplitByTable(numTokens, numberOfSubRanges, repairType); + } + + @Test + public void testTokenRangesSplitByKeyspace() + { + FixedSplitTokenRangeSplitterHelper.testTokenRangesSplitByKeyspace(numTokens, numberOfSubRanges, repairType); + } + + @Test + public void testTokenRangesWithDefaultSplit() + { + FixedSplitTokenRangeSplitterHelper.testTokenRangesWithDefaultSplit(numTokens, repairType); + } +} diff --git a/test/unit/org/apache/cassandra/repair/autorepair/PrioritizedRepairPlanTest.java b/test/unit/org/apache/cassandra/repair/autorepair/PrioritizedRepairPlanTest.java new file mode 100644 index 000000000000..38f9c8538846 --- /dev/null +++ b/test/unit/org/apache/cassandra/repair/autorepair/PrioritizedRepairPlanTest.java @@ -0,0 +1,164 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.repair.autorepair; + +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import com.google.common.collect.Lists; +import org.junit.Test; + +import org.apache.cassandra.cql3.CQLTester; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +/** + * Unit tests for {@link org.apache.cassandra.repair.autorepair.PrioritizedRepairPlan} + */ +public class PrioritizedRepairPlanTest extends CQLTester +{ + @Test + public void testBuildWithDifferentPriorities() + { + // Test reordering assignments with different priorities + String table1 = createTable("CREATE TABLE %s (k INT PRIMARY KEY, v INT) WITH auto_repair = {'full_enabled': 'true', 'priority': '2'}"); + String table2 = createTable("CREATE TABLE %s (k INT PRIMARY KEY, v INT) WITH auto_repair = {'full_enabled': 'true', 'priority': '3'}"); + String table3 = createTable("CREATE TABLE %s (k INT PRIMARY KEY, v INT) WITH auto_repair = {'full_enabled': 'true', 'priority': '1'}"); + + List prioritizedRepairPlans = PrioritizedRepairPlan.buildSingleKeyspacePlan(AutoRepairConfig.RepairType.FULL, KEYSPACE, table1, table2, table3); + assertEquals(3, prioritizedRepairPlans.size()); + + // Verify the order is by descending priority and matches the expected tables + assertEquals(3, prioritizedRepairPlans.get(0).getPriority()); + assertEquals(table2, prioritizedRepairPlans.get(0).getKeyspaceRepairPlans().get(0).getTableNames().get(0)); + + assertEquals(2, prioritizedRepairPlans.get(1).getPriority()); + assertEquals(table1, prioritizedRepairPlans.get(1).getKeyspaceRepairPlans().get(0).getTableNames().get(0)); + + assertEquals(1, prioritizedRepairPlans.get(2).getPriority()); + assertEquals(table3, prioritizedRepairPlans.get(2).getKeyspaceRepairPlans().get(0).getTableNames().get(0)); + } + + @Test + public void testBuildWithSamePriority() + { + // Test reordering assignments with the same priority + String table1 = createTable("CREATE TABLE %s (k INT PRIMARY KEY, v INT) WITH auto_repair = {'full_enabled': 'true', 'priority': '2'}"); + String table2 = createTable("CREATE TABLE %s (k INT PRIMARY KEY, v INT) WITH auto_repair = {'full_enabled': 'true', 'priority': '2'}"); + String table3 = createTable("CREATE TABLE %s (k INT PRIMARY KEY, v INT) WITH auto_repair = {'full_enabled': 'true', 'priority': '2'}"); + + // Expect only 1 plan since all tables share the same priority + List prioritizedRepairPlans = PrioritizedRepairPlan.buildSingleKeyspacePlan(AutoRepairConfig.RepairType.FULL, KEYSPACE, table1, table2, table3); + assertEquals(1, prioritizedRepairPlans.size()); + + // Verify all tables present in the plan + assertEquals(1, prioritizedRepairPlans.get(0).getKeyspaceRepairPlans().size()); + KeyspaceRepairPlan keyspaceRepairPlan = prioritizedRepairPlans.get(0).getKeyspaceRepairPlans().get(0); + + List tableNames = keyspaceRepairPlan.getTableNames(); + assertEquals(3, tableNames.size()); + assertEquals(table1, tableNames.get(0)); + assertEquals(table2, tableNames.get(1)); + assertEquals(table3, tableNames.get(2)); + } + + @Test + public void testBuildWithMixedPriorities() + { + String ks1 = createKeyspace("CREATE KEYSPACE %s WITH replication={ 'class' : 'SimpleStrategy', 'replication_factor' : 1 }"); + String table1 = createTable(ks1, "CREATE TABLE %s (k INT PRIMARY KEY, v INT) WITH auto_repair = {'full_enabled': 'true', 'priority': '2'}"); + String table2 = createTable(ks1, "CREATE TABLE %s (k INT PRIMARY KEY, v INT) WITH auto_repair = {'full_enabled': 'true', 'priority': '3'}"); + String table3 = createTable(ks1, "CREATE TABLE %s (k INT PRIMARY KEY, v INT) WITH auto_repair = {'full_enabled': 'true', 'priority': '2'}"); + String table4 = createTable(ks1, "CREATE TABLE %s (k INT PRIMARY KEY, v INT) WITH auto_repair = {'full_enabled': 'true', 'priority': '1'}"); + // No priority table should be bucketed at priority 0 + String table5 = createTable(ks1,"CREATE TABLE %s (k INT PRIMARY KEY, v INT)"); + + // Create a new keyspace to ensure its tables get grouped with appropriate priority bucket + String ks2 = createKeyspace("CREATE KEYSPACE %s WITH replication={ 'class' : 'SimpleStrategy', 'replication_factor' : 1 }"); + String table6 = createTable(ks2,"CREATE TABLE %s (k INT PRIMARY KEY, v INT)"); + String table7 = createTable(ks2,"CREATE TABLE %s (k INT PRIMARY KEY, v INT) WITH auto_repair = {'full_enabled': 'true', 'priority': '1'}"); + + Map> keyspaceToTableMap = new HashMap<>(); + keyspaceToTableMap.put(ks1, Lists.newArrayList(table1, table2, table3, table4, table5)); + keyspaceToTableMap.put(ks2, Lists.newArrayList(table6, table7)); + + // Expect 4 plans + List prioritizedRepairPlans = PrioritizedRepairPlan.build(keyspaceToTableMap, AutoRepairConfig.RepairType.FULL, java.util.Collections::sort); + assertEquals(4, prioritizedRepairPlans.size()); + + // Verify the order is by descending priority and matches the expected tables + assertEquals(3, prioritizedRepairPlans.get(0).getPriority()); + assertEquals(1, prioritizedRepairPlans.get(0).getKeyspaceRepairPlans().size()); + assertEquals(ks1, prioritizedRepairPlans.get(0).getKeyspaceRepairPlans().get(0).getKeyspaceName()); + assertEquals(table2, prioritizedRepairPlans.get(0).getKeyspaceRepairPlans().get(0).getTableNames().get(0)); + + assertEquals(2, prioritizedRepairPlans.get(1).getPriority()); + assertEquals(1, prioritizedRepairPlans.get(1).getKeyspaceRepairPlans().size()); + + assertEquals(ks1, prioritizedRepairPlans.get(1).getKeyspaceRepairPlans().get(0).getKeyspaceName()); + assertEquals(table1, prioritizedRepairPlans.get(1).getKeyspaceRepairPlans().get(0).getTableNames().get(0)); + assertEquals(table3, prioritizedRepairPlans.get(1).getKeyspaceRepairPlans().get(0).getTableNames().get(1)); + + assertEquals(1, prioritizedRepairPlans.get(2).getPriority()); + // 2 keyspaces should be present at priority 1 + assertEquals(2, prioritizedRepairPlans.get(2).getKeyspaceRepairPlans().size()); + // ks1.table4 expected in first plan + assertEquals(ks1, prioritizedRepairPlans.get(2).getKeyspaceRepairPlans().get(0).getKeyspaceName()); + assertEquals(table4, prioritizedRepairPlans.get(2).getKeyspaceRepairPlans().get(0).getTableNames().get(0)); + // ks2.table7 expected in second plan + assertEquals(ks2, prioritizedRepairPlans.get(2).getKeyspaceRepairPlans().get(1).getKeyspaceName()); + assertEquals(table7, prioritizedRepairPlans.get(2).getKeyspaceRepairPlans().get(1).getTableNames().get(0)); + + // Tables without priority should get bucketed at priority 0 + assertEquals(0, prioritizedRepairPlans.get(3).getPriority()); + // 2 keyspaces expected + assertEquals(2, prioritizedRepairPlans.get(3).getKeyspaceRepairPlans().size()); + // ks1.table5 expected in first plan + assertEquals(ks1, prioritizedRepairPlans.get(3).getKeyspaceRepairPlans().get(0).getKeyspaceName()); + assertEquals(table5, prioritizedRepairPlans.get(3).getKeyspaceRepairPlans().get(0).getTableNames().get(0)); + // ks2.table6 expected in second plan + assertEquals(ks2, prioritizedRepairPlans.get(3).getKeyspaceRepairPlans().get(1).getKeyspaceName()); + assertEquals(table6, prioritizedRepairPlans.get(3).getKeyspaceRepairPlans().get(1).getTableNames().get(0)); + } + + @Test + public void testBuildWithEmptyTableList() + { + // Test with an empty table list (should remain empty) + List prioritizedRepairPlans = PrioritizedRepairPlan.buildSingleKeyspacePlan(AutoRepairConfig.RepairType.FULL, KEYSPACE); + assertTrue(prioritizedRepairPlans.isEmpty()); + } + + @Test + public void testBuildWithOneTable() + { + // Test with a single element (should remain unchanged) + String table1 = createTable("CREATE TABLE %s (k INT PRIMARY KEY, v INT) WITH auto_repair = {'full_enabled': 'true', 'priority': '5'}"); + + // Expect only 1 plans + List prioritizedRepairPlans = PrioritizedRepairPlan.buildSingleKeyspacePlan(AutoRepairConfig.RepairType.FULL, KEYSPACE, table1); + assertEquals(1, prioritizedRepairPlans.size()); + + // Verify the order is by descending priority and matches the expected tables + assertEquals(5, prioritizedRepairPlans.get(0).getPriority()); + assertEquals(table1, prioritizedRepairPlans.get(0).getKeyspaceRepairPlans().get(0).getTableNames().get(0)); + } +} diff --git a/test/unit/org/apache/cassandra/repair/autorepair/RepairTokenRangeSplitterTest.java b/test/unit/org/apache/cassandra/repair/autorepair/RepairTokenRangeSplitterTest.java new file mode 100644 index 000000000000..79fef533f18f --- /dev/null +++ b/test/unit/org/apache/cassandra/repair/autorepair/RepairTokenRangeSplitterTest.java @@ -0,0 +1,465 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.repair.autorepair; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; + +import org.junit.Assert; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +import org.apache.cassandra.auth.AuthKeyspace; +import org.apache.cassandra.config.DataStorageSpec.LongMebibytesBound; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.cql3.CQLTester; +import org.apache.cassandra.db.ColumnFamilyStore; +import org.apache.cassandra.dht.Range; +import org.apache.cassandra.dht.Token; +import org.apache.cassandra.io.sstable.format.SSTableReader; +import org.apache.cassandra.io.sstable.format.big.BigFormat; +import org.apache.cassandra.io.sstable.format.bti.BtiFormat; +import org.apache.cassandra.repair.autorepair.AutoRepairConfig.RepairType; +import org.apache.cassandra.repair.autorepair.RepairTokenRangeSplitter.FilteredRepairAssignments; +import org.apache.cassandra.repair.autorepair.RepairTokenRangeSplitter.SizeEstimate; +import org.apache.cassandra.repair.autorepair.RepairTokenRangeSplitter.SizedRepairAssignment; +import org.apache.cassandra.service.AutoRepairService; +import org.apache.cassandra.utils.concurrent.Refs; + +import static org.apache.cassandra.repair.autorepair.RepairTokenRangeSplitter.MAX_BYTES_PER_SCHEDULE; +import static org.apache.cassandra.repair.autorepair.RepairTokenRangeSplitter.BYTES_PER_ASSIGNMENT; +import static org.apache.cassandra.repair.autorepair.RepairTokenRangeSplitter.MAX_TABLES_PER_ASSIGNMENT; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; + +/** + * Unit tests for {@link org.apache.cassandra.repair.autorepair.RepairTokenRangeSplitter} + */ +@RunWith(Parameterized.class) +public class RepairTokenRangeSplitterTest extends CQLTester +{ + private RepairTokenRangeSplitter repairRangeSplitter; + private String tableName; + private static Range FULL_RANGE; + + @Parameterized.Parameter() + public String sstableFormat; + + @Parameterized.Parameters(name = "sstableFormat={0}") + public static Collection sstableFormats() + { + return List.of(BtiFormat.NAME, BigFormat.NAME); + } + + @BeforeClass + public static void setUpClass() + { + CQLTester.setUpClass(); + AutoRepairService.setup(); + FULL_RANGE = new Range<>(DatabaseDescriptor.getPartitioner().getMinimumToken(), DatabaseDescriptor.getPartitioner().getMaximumTokenForSplitting()); + } + + @Before + public void setUp() + { + AutoRepairService.instance.getAutoRepairConfig().setRepairByKeyspace(RepairType.FULL, true); + DatabaseDescriptor.setSelectedSSTableFormat(DatabaseDescriptor.getSSTableFormats().get(sstableFormat)); + repairRangeSplitter = new RepairTokenRangeSplitter(RepairType.FULL, Collections.emptyMap()); + tableName = createTable("CREATE TABLE %s (k INT PRIMARY KEY, v INT)"); + // ensure correct format is selected. + if (sstableFormat.equalsIgnoreCase(BigFormat.NAME)) + { + assertTrue(BigFormat.isSelected()); + } + else + { + assertTrue(BtiFormat.isSelected()); + } + } + + @Test + public void testSizePartitionCount() + { + insertAndFlushTable(tableName, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10); + try (Refs sstables = RepairTokenRangeSplitter.getSSTableReaderRefs(RepairType.FULL, KEYSPACE, tableName, FULL_RANGE)) + { + assertEquals(10, sstables.iterator().next().getEstimatedPartitionSize().count()); + SizeEstimate sizes = RepairTokenRangeSplitter.getSizesForRangeOfSSTables(RepairType.FULL, KEYSPACE, tableName, FULL_RANGE, sstables); + assertEquals(10, sizes.partitions); + } + } + + @Test + public void testSizePartitionCountSplit() + { + int partitionCount = 100_000; + int[] values = new int[partitionCount]; + for (int i = 0; i < values.length; i++) + values[i] = i + 1; + insertAndFlushTable(tableName, values); + Iterator> range = AutoRepairUtils.split(FULL_RANGE, 2).iterator(); + Range tokenRange1 = range.next(); + Range tokenRange2 = range.next(); + Assert.assertFalse(range.hasNext()); + + try (Refs sstables1 = RepairTokenRangeSplitter.getSSTableReaderRefs(RepairType.FULL, KEYSPACE, tableName, tokenRange1); + Refs sstables2 = RepairTokenRangeSplitter.getSSTableReaderRefs(RepairType.FULL, KEYSPACE, tableName, tokenRange2)) + { + SizeEstimate sizes1 = RepairTokenRangeSplitter.getSizesForRangeOfSSTables(RepairType.FULL, KEYSPACE, tableName, tokenRange1, sstables1); + SizeEstimate sizes2 = RepairTokenRangeSplitter.getSizesForRangeOfSSTables(RepairType.FULL, KEYSPACE, tableName, tokenRange2, sstables2); + + // +-5% because including entire compression blocks covering token range, HLL merge and the applying of range size approx ratio causes estimation errors + long allowableDelta = (long) (partitionCount * .05); + long estimatedPartitionDelta = Math.abs(partitionCount - (sizes1.partitions + sizes2.partitions)); + assertTrue("Partition count delta was +/-" + estimatedPartitionDelta + " but expected +/- " + allowableDelta, estimatedPartitionDelta <= allowableDelta); + } + } + + @Test + public void testGetRepairAssignmentsForTable_NoSSTables() + { + // Should return 1 assignment if there are no SSTables + List assignments = repairRangeSplitter.getRepairAssignmentsForTable(CQLTester.KEYSPACE, tableName, FULL_RANGE); + assertEquals(1, assignments.size()); + } + + @Test + public void testGetRepairAssignmentsForTable_Single() + { + insertAndFlushSingleTable(); + List assignments = repairRangeSplitter.getRepairAssignmentsForTable(CQLTester.KEYSPACE, tableName, FULL_RANGE); + assertEquals(1, assignments.size()); + } + + @Test + public void testGetRepairAssignmentsForTable_BatchingTables() + { + repairRangeSplitter = new RepairTokenRangeSplitter(RepairType.FULL, Collections.singletonMap(MAX_TABLES_PER_ASSIGNMENT, "2")); + + List tableNames = createAndInsertTables(3); + List assignments = repairRangeSplitter.getRepairAssignmentsForKeyspace(RepairType.FULL, KEYSPACE, tableNames, FULL_RANGE); + + // We expect two assignments, one with table1 and table2 batched, and one with table3 + assertEquals(2, assignments.size()); + assertEquals(2, assignments.get(0).getTableNames().size()); + assertEquals(1, assignments.get(1).getTableNames().size()); + } + + @Test + public void testGetRepairAssignmentsForTable_BatchSize() + { + repairRangeSplitter = new RepairTokenRangeSplitter(RepairType.FULL, Collections.singletonMap(MAX_TABLES_PER_ASSIGNMENT, "2")); + + List tableNames = createAndInsertTables(2); + List assignments = repairRangeSplitter.getRepairAssignmentsForKeyspace(RepairType.FULL, KEYSPACE, tableNames, FULL_RANGE); + + // We expect one assignment, with two tables batched + assertEquals(1, assignments.size()); + assertEquals(2, assignments.get(0).getTableNames().size()); + } + + @Test + public void testGetRepairAssignmentsForTable_NoBatching() + { + repairRangeSplitter = new RepairTokenRangeSplitter(RepairType.FULL, Collections.singletonMap(MAX_TABLES_PER_ASSIGNMENT, "1")); + + List tableNames = createAndInsertTables(3); + List assignments = repairRangeSplitter.getRepairAssignmentsForKeyspace(RepairType.FULL, KEYSPACE, tableNames, FULL_RANGE); + + assertEquals(3, assignments.size()); + } + + @Test + public void testGetRepairAssignmentsForTable_AllBatched() + { + repairRangeSplitter = new RepairTokenRangeSplitter(RepairType.FULL, Collections.singletonMap(MAX_TABLES_PER_ASSIGNMENT, "100")); + + List tableNames = createAndInsertTables(5); + List assignments = repairRangeSplitter.getRepairAssignmentsForKeyspace(RepairType.FULL, KEYSPACE, tableNames, FULL_RANGE); + + assertEquals(1, assignments.size()); + } + + @Test(expected = IllegalStateException.class) + public void testMergeEmptyAssignments() + { + // Test when the list of assignments is empty + List emptyAssignments = Collections.emptyList(); + RepairTokenRangeSplitter.merge(emptyAssignments); + } + + @Test + public void testMergeSingleAssignment() + { + // Test when there is only one assignment in the list + String keyspaceName = "testKeyspace"; + List tableNames = Arrays.asList("table1", "table2"); + + SizedRepairAssignment assignment = new SizedRepairAssignment(FULL_RANGE, keyspaceName, tableNames); + List assignments = Collections.singletonList(assignment); + + SizedRepairAssignment result = RepairTokenRangeSplitter.merge(assignments); + + assertEquals(FULL_RANGE, result.getTokenRange()); + assertEquals(keyspaceName, result.getKeyspaceName()); + assertEquals(new HashSet<>(tableNames), new HashSet<>(result.getTableNames())); + } + + @Test + public void testMergeMultipleAssignmentsWithSameTokenRangeAndKeyspace() + { + // Test merging multiple assignments with the same token range and keyspace + String keyspaceName = "testKeyspace"; + List tableNames1 = Arrays.asList("table1", "table2"); + List tableNames2 = Arrays.asList("table2", "table3"); + + SizedRepairAssignment assignment1 = new SizedRepairAssignment(FULL_RANGE, keyspaceName, tableNames1); + SizedRepairAssignment assignment2 = new SizedRepairAssignment(FULL_RANGE, keyspaceName, tableNames2); + List assignments = Arrays.asList(assignment1, assignment2); + + SizedRepairAssignment result = RepairTokenRangeSplitter.merge(assignments); + + assertEquals(FULL_RANGE, result.getTokenRange()); + assertEquals(keyspaceName, result.getKeyspaceName()); + assertEquals(new HashSet<>(Arrays.asList("table1", "table2", "table3")), new HashSet<>(result.getTableNames())); + } + + @Test(expected = IllegalStateException.class) + public void testMergeDifferentTokenRange() + { + // Test merging assignments with different token ranges + Iterator> range = AutoRepairUtils.split(FULL_RANGE, 2).iterator(); // Split the full range into two ranges ie (0-100, 100-200 + Range tokenRange1 = range.next(); + Range tokenRange2 = range.next(); + Assert.assertFalse(range.hasNext()); + + String keyspaceName = "testKeyspace"; + List tableNames = Arrays.asList("table1", "table2"); + + SizedRepairAssignment assignment1 = new SizedRepairAssignment(tokenRange1, keyspaceName, tableNames); + SizedRepairAssignment assignment2 = new SizedRepairAssignment(tokenRange2, keyspaceName, tableNames); + List assignments = Arrays.asList(assignment1, assignment2); + + RepairTokenRangeSplitter.merge(assignments); // Should throw IllegalStateException + } + + @Test(expected = IllegalStateException.class) + public void testMergeDifferentKeyspaceName() + { + // Test merging assignments with different keyspace names + List tableNames = Arrays.asList("table1", "table2"); + + SizedRepairAssignment assignment1 = new SizedRepairAssignment(FULL_RANGE, "keyspace1", tableNames); + SizedRepairAssignment assignment2 = new SizedRepairAssignment(FULL_RANGE, "keyspace2", tableNames); + List assignments = Arrays.asList(assignment1, assignment2); + + RepairTokenRangeSplitter.merge(assignments); // Should throw IllegalStateException + } + + @Test + public void testMergeWithDuplicateTables() + { + // Test merging assignments with duplicate table names + String keyspaceName = "testKeyspace"; + List tableNames1 = Arrays.asList("table1", "table2"); + List tableNames2 = Arrays.asList("table2", "table3"); + + SizedRepairAssignment assignment1 = new SizedRepairAssignment(FULL_RANGE, keyspaceName, tableNames1); + SizedRepairAssignment assignment2 = new SizedRepairAssignment(FULL_RANGE, keyspaceName, tableNames2); + List assignments = Arrays.asList(assignment1, assignment2); + + RepairAssignment result = RepairTokenRangeSplitter.merge(assignments); + + // The merged result should contain all unique table names + assertEquals(new HashSet<>(Arrays.asList("table1", "table2", "table3")), new HashSet<>(result.getTableNames())); + } + + @Test + public void testGetRepairAssignmentsSplitsBySubrangeSizeAndFilterLimitsByMaxBytesPerSchedule() + { + // Ensures that getRepairAssignments splits by BYTES_PER_ASSIGNMENT and filterRepairAssignments limits by MAX_BYTES_PER_SCHEDULE. + repairRangeSplitter = new RepairTokenRangeSplitter(RepairType.INCREMENTAL, Collections.emptyMap()); + repairRangeSplitter.setParameter(BYTES_PER_ASSIGNMENT, "50GiB"); + repairRangeSplitter.setParameter(MAX_BYTES_PER_SCHEDULE, "100GiB"); + + // Given a size estimate of 1024GiB, we should expect 21 splits (50GiB*21 = 1050GiB < 1024GiB) + SizeEstimate sizeEstimate = sizeEstimateByBytes(new LongMebibytesBound("1024GiB")); + + List assignments = repairRangeSplitter.getRepairAssignments(Collections.singletonList(sizeEstimate)); + + // Should be 21 assignments, each being ~48.76 GiB + assertEquals(21, assignments.size()); + long expectedBytes = 52357696560L; + for (int i = 0; i < assignments.size(); i++) + { + SizedRepairAssignment assignment = assignments.get(i); + assertEquals("Did not get expected value for assignment " + i, 52357696560L, assignment.getEstimatedBytes()); + } + + // When filtering we should only get 2 assignments back (48.76 * 2 < 100GiB) + FilteredRepairAssignments filteredRepairAssignments = repairRangeSplitter.filterRepairAssignments(0, KEYSPACE, assignments, 0); + List finalRepairAssignments = filteredRepairAssignments.repairAssignments; + assertEquals(2, finalRepairAssignments.size()); + assertEquals(expectedBytes * 2, filteredRepairAssignments.newBytesSoFar); + } + + @Test + public void testTokenRangesRepairByKeyspace() + { + AutoRepairService.instance.getAutoRepairConfig().setRepairByKeyspace(RepairType.FULL, true); + + final KeyspaceRepairPlan repairPlan = new KeyspaceRepairPlan("system_auth", new ArrayList<>(AuthKeyspace.TABLE_NAMES)); + final PrioritizedRepairPlan prioritizedRepairPlan = new PrioritizedRepairPlan(0, List.of(repairPlan)); + + Iterator keyspaceAssignments = repairRangeSplitter.getRepairAssignments(true, List.of(prioritizedRepairPlan)); + + // should be only 1 entry for the keyspace. + assertTrue(keyspaceAssignments.hasNext()); + KeyspaceRepairAssignments keyspace = keyspaceAssignments.next(); + assertFalse(keyspaceAssignments.hasNext()); + + List assignments = keyspace.getRepairAssignments(); + assertNotNull(assignments); + + // Should only be two assignments (since single node encompasses the whole range, should get 2 primary ranges) + // to account for the range wrapping the ring. + assertEquals(2, assignments.size()); + + for (RepairAssignment assignment : assignments) + { + assertEquals(AuthKeyspace.TABLE_NAMES.size(), assignment.getTableNames().size()); + } + } + + @Test + public void testTokenRangesRepairByKeyspaceFalse() + { + AutoRepairService.instance.getAutoRepairConfig().setRepairByKeyspace(RepairType.FULL, false); + + final KeyspaceRepairPlan repairPlan = new KeyspaceRepairPlan("system_auth", new ArrayList<>(AuthKeyspace.TABLE_NAMES)); + final PrioritizedRepairPlan prioritizedRepairPlan = new PrioritizedRepairPlan(0, List.of(repairPlan)); + + Iterator keyspaceAssignments = repairRangeSplitter.getRepairAssignments(true, List.of(prioritizedRepairPlan)); + + // should be only 1 entry for the keyspace. + assertTrue(keyspaceAssignments.hasNext()); + KeyspaceRepairAssignments keyspace = keyspaceAssignments.next(); + assertFalse(keyspaceAssignments.hasNext()); + + List assignments = keyspace.getRepairAssignments(); + assertNotNull(assignments); + + // Should be two ranges * X system_auth table names assignments + assertEquals(2 * AuthKeyspace.TABLE_NAMES.size(), assignments.size()); + + // each assignment should only include one table. + for (RepairAssignment assignment : assignments) + { + assertEquals(1, assignment.getTableNames().size()); + } + } + + @Test(expected = IllegalArgumentException.class) + public void testSetParameterShouldNotAllowUnknownParameter() + { + repairRangeSplitter.setParameter("unknown", "x"); + } + + @Test(expected = IllegalArgumentException.class) + public void testSetParameterShouldNotAllowSettingBytesPerAssignmentGreaterThanMaxBytesPerSchedule() + { + repairRangeSplitter.setParameter(MAX_BYTES_PER_SCHEDULE, "500GiB"); + repairRangeSplitter.setParameter(BYTES_PER_ASSIGNMENT, "600GiB"); + } + + @Test(expected = IllegalArgumentException.class) + public void testSetParameterShouldNotAllowSettingMaxBytesPerScheduleLessThanBytesPerAssignment() + { + repairRangeSplitter.setParameter(BYTES_PER_ASSIGNMENT, "100MiB"); + repairRangeSplitter.setParameter(MAX_BYTES_PER_SCHEDULE, "50MiB"); + } + + @Test + public void testGetParameters() + { + repairRangeSplitter.setParameter(BYTES_PER_ASSIGNMENT, "100MiB"); + repairRangeSplitter.setParameter(MAX_TABLES_PER_ASSIGNMENT, "5"); + + Map parameters = repairRangeSplitter.getParameters(); + // Each parameter should be present. + assertEquals(RepairTokenRangeSplitter.PARAMETERS.size(), parameters.size()); + // The parameters we explicitly set should be set exactly as we set them. + assertEquals("100MiB", parameters.get(BYTES_PER_ASSIGNMENT)); + assertEquals("5", parameters.get(MAX_TABLES_PER_ASSIGNMENT)); + } + + private SizeEstimate sizeEstimateByBytes(LongMebibytesBound totalSize) + { + return sizeEstimateByBytes(totalSize, totalSize); + } + + private SizeEstimate sizeEstimateByBytes(LongMebibytesBound sizeInRange, LongMebibytesBound totalSize) + { + return new SizeEstimate(RepairType.INCREMENTAL, KEYSPACE, "table1", FULL_RANGE, 1, sizeInRange.toBytes(), totalSize.toBytes()); + } + + private void insertAndFlushSingleTable() + { + execute("INSERT INTO %s (k, v) values (?, ?)", 1, 1); + flush(); + } + + private List createAndInsertTables(int count) + { + List tableNames = new ArrayList<>(); + for (int i = 0; i < count; i++) + { + String tableName = createTable("CREATE TABLE %s (k INT PRIMARY KEY, v INT)"); + tableNames.add(tableName); + insertAndFlushTable(tableName); + } + return tableNames; + } + + private void insertAndFlushTable(String tableName) + { + insertAndFlushTable(tableName, 1); + } + + private void insertAndFlushTable(String tableName, int... vals) + { + for (int i : vals) + { + executeFormattedQuery("INSERT INTO " + KEYSPACE + '.' + tableName + " (k, v) values (?, ?)", i, i); + } + ColumnFamilyStore cfs = ColumnFamilyStore.getIfExists(KEYSPACE, tableName); + cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS); + } +} diff --git a/test/unit/org/apache/cassandra/repair/autorepair/SSTableRepairedAtTest.java b/test/unit/org/apache/cassandra/repair/autorepair/SSTableRepairedAtTest.java new file mode 100644 index 000000000000..14677490ca2d --- /dev/null +++ b/test/unit/org/apache/cassandra/repair/autorepair/SSTableRepairedAtTest.java @@ -0,0 +1,175 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.repair.autorepair; + +import java.net.UnknownHostException; +import java.util.Arrays; +import java.util.List; +import java.util.stream.Collectors; + +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Test; + +import org.apache.cassandra.SchemaLoader; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.cql3.CQLTester; +import org.apache.cassandra.cql3.QueryProcessor; +import org.apache.cassandra.db.ColumnFamilyStore; +import org.apache.cassandra.db.Keyspace; +import org.apache.cassandra.exceptions.ConfigurationException; +import org.apache.cassandra.io.sstable.format.SSTableReader; +import org.apache.cassandra.service.StorageService; + +import static org.apache.cassandra.config.CassandraRelevantProperties.SYSTEM_DISTRIBUTED_DEFAULT_RF; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + + +/** + * Unit tests to cover AutoRepair functionality inside {@link org.apache.cassandra.service.StorageService} + */ +public class SSTableRepairedAtTest extends CQLTester +{ + public static final String TEST_KEYSPACE = "test_keyspace"; + public static ColumnFamilyStore table1; + public static ColumnFamilyStore table2; + + @BeforeClass + public static void setUp() throws ConfigurationException, UnknownHostException + { + requireNetwork(); + AutoRepairUtils.setup(); + StorageService.instance.doAutoRepairSetup(); + DatabaseDescriptor.setCDCEnabled(false); + } + + @Before + public void clearData() + { + SYSTEM_DISTRIBUTED_DEFAULT_RF.setInt(1); + QueryProcessor.executeInternal(String.format("CREATE KEYSPACE %s WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '1'}", TEST_KEYSPACE)); + QueryProcessor.executeInternal(String.format("CREATE TABLE %s.%s (key text, val text, primary key(key))", TEST_KEYSPACE, "table1")); + QueryProcessor.executeInternal(String.format("CREATE TABLE %s.%s (key text, val text, primary key(key))", TEST_KEYSPACE, "table2")); + + Keyspace.open(TEST_KEYSPACE).getColumnFamilyStore("table1").truncateBlocking(); + Keyspace.open(TEST_KEYSPACE).getColumnFamilyStore("table2").truncateBlocking(); + + table1 = Keyspace.open(TEST_KEYSPACE).getColumnFamilyStore("table1"); + assert table1 != null; + table2 = Keyspace.open(TEST_KEYSPACE).getColumnFamilyStore("table2"); + assert table2 != null; + } + + @Test + public void testGetTablesForKeyspace() + { + List result = StorageService.instance.getTablesForKeyspace(TEST_KEYSPACE); + + assertEquals(Arrays.asList(table1.name, table2.name), result.stream().sorted().collect(Collectors.toList())); + } + + @Test + public void testGetTablesForKeyspaceNotFound() + { + String missingKeyspace = "MISSING_KEYSPACE"; + try + { + StorageService.instance.getTablesForKeyspace(missingKeyspace); + fail("Expected an AssertionError to be thrown"); + } + catch (AssertionError e) + { + assertEquals("Unknown keyspace " + missingKeyspace, e.getMessage()); + } + } + + @Test + public void testMutateSSTableRepairedStateTableNotFound() + { + try + { + StorageService.instance.mutateSSTableRepairedState(true, false, TEST_KEYSPACE, List.of("MISSING_TABLE")); + fail("Expected an InvalidRequestException to be thrown"); + } + catch (RuntimeException e) + { + assertEquals("Table MISSING_TABLE does not exist in keyspace " + TEST_KEYSPACE, e.getMessage()); + // Test passed + } + } + + @Test + public void testMutateSSTableRepairedStateTablePreview() + { + SchemaLoader.insertData(TEST_KEYSPACE, table1.name, 0, 1); + table1.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS); + assertEquals(1, table1.getLiveSSTables().size()); + + List result = StorageService.instance.mutateSSTableRepairedState(true, true, TEST_KEYSPACE, Arrays.asList(table1.name)); + + assertEquals(1, result.size()); + table1.getLiveSSTables().forEach(sstable -> { + assertFalse(sstable.isRepaired()); + assertTrue(result.contains(sstable.descriptor.baseFile().name())); + }); + } + + @Test + public void testMutateSSTableRepairedStateTableRepaired() + { + SchemaLoader.insertData(TEST_KEYSPACE, table1.name, 0, 1); + table1.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS); + SchemaLoader.insertData(TEST_KEYSPACE, table1.name, 0, 1); + table1.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS); + assertEquals(2, table1.getLiveSSTables().size()); + table1.getLiveSSTables().forEach(sstable -> { + assertFalse(sstable.isRepaired()); + }); + + List result = StorageService.instance.mutateSSTableRepairedState(true, false, TEST_KEYSPACE, Arrays.asList(table1.name)); + + assertEquals(2, result.size()); + table1.getLiveSSTables().forEach(sstable -> { + assertTrue(sstable.isRepaired()); + assertTrue(result.contains(sstable.descriptor.baseFile().name())); + }); + } + + @Test + public void testMutateSSTableRepairedStateTableUnrepaired() throws Exception + { + SchemaLoader.insertData(TEST_KEYSPACE, table1.name, 0, 1); + table1.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS); + SchemaLoader.insertData(TEST_KEYSPACE, table1.name, 0, 1); + table1.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS); + table1.getCompactionStrategyManager().mutateRepaired(table1.getLiveSSTables(), 1, null, false); + assertEquals(2, table1.getLiveSSTables().stream().filter(SSTableReader::isRepaired).count()); + + List result = StorageService.instance.mutateSSTableRepairedState(false, false, TEST_KEYSPACE, Arrays.asList(table1.name)); + + assertEquals(2, result.size()); + table1.getLiveSSTables().forEach(sstable -> { + assertFalse(sstable.isRepaired()); + assertTrue(result.contains(sstable.descriptor.baseFile().name())); + }); + } +} diff --git a/test/unit/org/apache/cassandra/repair/messages/RepairMessageSerializationsTest.java b/test/unit/org/apache/cassandra/repair/messages/RepairMessageSerializationsTest.java index 1657ceff4870..750c6144553d 100644 --- a/test/unit/org/apache/cassandra/repair/messages/RepairMessageSerializationsTest.java +++ b/test/unit/org/apache/cassandra/repair/messages/RepairMessageSerializationsTest.java @@ -25,17 +25,16 @@ import java.util.UUID; import com.google.common.collect.Lists; -import org.junit.AfterClass; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.dht.*; import org.junit.Assert; import org.junit.BeforeClass; import org.junit.Test; -import org.apache.cassandra.config.DatabaseDescriptor; -import org.apache.cassandra.dht.IPartitioner; -import org.apache.cassandra.dht.Murmur3Partitioner; +import org.apache.cassandra.CassandraTestBase; +import org.apache.cassandra.CassandraTestBase.DDDaemonInitialization; +import org.apache.cassandra.CassandraTestBase.UseMurmur3Partitioner; import org.apache.cassandra.dht.Murmur3Partitioner.LongToken; -import org.apache.cassandra.dht.Range; -import org.apache.cassandra.dht.Token; import org.apache.cassandra.distributed.test.log.ClusterMetadataTestHelper; import org.apache.cassandra.io.IVersionedSerializer; import org.apache.cassandra.io.util.DataInputBuffer; @@ -52,37 +51,30 @@ import org.apache.cassandra.streaming.PreviewKind; import org.apache.cassandra.repair.RepairJobDesc; import org.apache.cassandra.schema.TableId; -import org.apache.cassandra.service.StorageService; import org.apache.cassandra.streaming.SessionSummary; import org.apache.cassandra.streaming.StreamSummary; import org.apache.cassandra.utils.MerkleTrees; +import static java.util.Collections.emptyList; import static org.apache.cassandra.utils.TimeUUID.Generator.nextTimeUUID; -public class RepairMessageSerializationsTest +@UseMurmur3Partitioner +@DDDaemonInitialization +public class RepairMessageSerializationsTest extends CassandraTestBase { private static final int PROTOCOL_VERSION = MessagingService.current_version; private static final int GC_BEFORE = 1000000; - private static IPartitioner originalPartitioner; @BeforeClass public static void before() { - DatabaseDescriptor.daemonInitialization(); - originalPartitioner = StorageService.instance.setPartitionerUnsafe(Murmur3Partitioner.instance); ClusterMetadataTestHelper.setInstanceForTest(); SchemaTestUtil.addOrUpdateKeyspace(KeyspaceMetadata.create("serializationsTestKeyspace", KeyspaceParams.simple(3))); SchemaTestUtil.announceNewTable(TableMetadata.minimal("serializationsTestKeyspace", "repairMessages")); } - @AfterClass - public static void after() - { - DatabaseDescriptor.setPartitionerUnsafe(originalPartitioner); - } - @Test public void validationRequestMessage() throws IOException { @@ -123,7 +115,19 @@ private T serializeRoundTrip(T msg, IVersionedSerializ buf.flip(); DataInputPlus in = new DataInputBuffer(buf, false); - T deserialized = serializer.deserialize(in, PROTOCOL_VERSION); + + T deserialized = null; + + if (serializer instanceof IPartitionerDependentSerializer) + { + IPartitionerDependentSerializer pds = (IPartitionerDependentSerializer) serializer; + deserialized = pds.deserialize(in, DatabaseDescriptor.getPartitioner(), PROTOCOL_VERSION); + } + else + { + deserialized = serializer.deserialize(in, PROTOCOL_VERSION); + } + Assert.assertEquals(msg, deserialized); Assert.assertEquals(msg.hashCode(), deserialized.hashCode()); return deserialized; @@ -175,8 +179,8 @@ public void syncCompleteMessage() throws IOException InetAddressAndPort dst = InetAddressAndPort.getByName("127.0.0.3"); List summaries = new ArrayList<>(); summaries.add(new SessionSummary(src, dst, - Lists.newArrayList(new StreamSummary(TableId.fromUUID(UUID.randomUUID()), 5, 100)), - Lists.newArrayList(new StreamSummary(TableId.fromUUID(UUID.randomUUID()), 500, 10)) + Lists.newArrayList(new StreamSummary(TableId.fromUUID(UUID.randomUUID()), emptyList(), 5, 100)), + Lists.newArrayList(new StreamSummary(TableId.fromUUID(UUID.randomUUID()), emptyList(), 500, 10)) )); SyncResponse msg = new SyncResponse(buildRepairJobDesc(), new SyncNodePair(src, dst), true, summaries); serializeRoundTrip(msg, SyncResponse.serializer); diff --git a/test/unit/org/apache/cassandra/repair/messages/RepairMessageTest.java b/test/unit/org/apache/cassandra/repair/messages/RepairMessageTest.java index fb3ce470f581..b7fa71c95943 100644 --- a/test/unit/org/apache/cassandra/repair/messages/RepairMessageTest.java +++ b/test/unit/org/apache/cassandra/repair/messages/RepairMessageTest.java @@ -23,7 +23,7 @@ import org.apache.cassandra.concurrent.ScheduledExecutorPlus; import org.apache.cassandra.config.DatabaseDescriptor; -import org.apache.cassandra.exceptions.RequestFailureReason; +import org.apache.cassandra.exceptions.RequestFailure; import org.apache.cassandra.gms.IGossiper; import org.apache.cassandra.locator.InetAddressAndPort; import org.apache.cassandra.metrics.RepairMetrics; @@ -32,9 +32,11 @@ import org.apache.cassandra.net.RequestCallback; import org.apache.cassandra.net.Verb; import org.apache.cassandra.repair.SharedContext; +import org.apache.cassandra.service.RetryStrategy; +import org.apache.cassandra.service.TimeoutStrategy.LatencySourceFactory; +import org.apache.cassandra.service.WaitStrategy; import org.apache.cassandra.tcm.ClusterMetadataService; import org.apache.cassandra.tcm.StubClusterMetadataService; -import org.apache.cassandra.utils.Backoff; import org.apache.cassandra.utils.FBUtilities; import org.apache.cassandra.utils.TimeUUID; import org.assertj.core.api.Assertions; @@ -53,7 +55,7 @@ public class RepairMessageTest private static final Answer REJECT_ALL = ignore -> { throw new UnsupportedOperationException(); }; - private static final int[] attempts = {1, 2, 10}; + private static final int[] retries = { 1, 2, 10 }; // Tests may use verb / message pairs that do not make sense... that is due to the fact that the message sending logic does not validate this and delegates such validation to messaging, which is mocked within the class... // By using messages with simpler state it makes the test easier to read, even though the verb -> message mapping is incorrect. private static final Verb VERB = Verb.PREPARE_MSG; @@ -87,7 +89,7 @@ public void noRetries() public void noRetriesRequestFailed() { test(NO_RETRY_ATTEMPTS, ((ignore, callback) -> { - callback.onFailure(ADDRESS, RequestFailureReason.UNKNOWN); + callback.onFailure(ADDRESS, RequestFailure.UNKNOWN); assertNoRetries(); })); } @@ -104,18 +106,18 @@ public void retryWithSuccess() @Test public void retryWithTimeout() { - test((maxAttempts, callback) -> { - callback.onFailure(ADDRESS, RequestFailureReason.TIMEOUT); - assertMetrics(maxAttempts, true, false); + test((maxRetries, callback) -> { + callback.onFailure(ADDRESS, RequestFailure.TIMEOUT); + assertMetrics(maxRetries, true, false); }); } @Test public void retryWithFailure() { - test((maxAttempts, callback) -> { - callback.onFailure(ADDRESS, RequestFailureReason.UNKNOWN); - assertMetrics(maxAttempts, false, true); + test((maxRetries, callback) -> { + callback.onFailure(ADDRESS, RequestFailure.UNKNOWN); + assertMetrics(maxRetries, false, true); }); } @@ -124,9 +126,9 @@ private void assertNoRetries() assertMetrics(0, false, false); } - private void assertMetrics(long attempts, boolean timeout, boolean failure) + private void assertMetrics(long retries, boolean timeout, boolean failure) { - if (attempts == 0) + if (retries == 0) { assertThat(RepairMetrics.retries).isEmpty(); assertThat(RepairMetrics.retriesByVerb.get(VERB)).isEmpty(); @@ -137,8 +139,8 @@ private void assertMetrics(long attempts, boolean timeout, boolean failure) } else { - assertThat(RepairMetrics.retries).hasCount(1).hasMax(attempts); - assertThat(RepairMetrics.retriesByVerb.get(VERB)).hasCount(1).hasMax(attempts); + assertThat(RepairMetrics.retries).hasCount(1).hasMax(retries); + assertThat(RepairMetrics.retriesByVerb.get(VERB)).hasCount(1).hasMax(retries); assertThat(RepairMetrics.retryTimeout).hasCount(timeout ? 1 : 0); assertThat(RepairMetrics.retryTimeoutByVerb.get(VERB)).hasCount(timeout ? 1 : 0); assertThat(RepairMetrics.retryFailure).hasCount(failure ? 1 : 0); @@ -146,9 +148,9 @@ private void assertMetrics(long attempts, boolean timeout, boolean failure) } } - private static Backoff backoff(int maxAttempts) + private static WaitStrategy backoff(int maxRetries) { - return new Backoff.ExponentialBackoff(maxAttempts, 100, 1000, () -> .5); + return RetryStrategy.parse("0 <= 100ms * 2^attempts <= 1000ms,retries=" + maxRetries, LatencySourceFactory.none()); } private static SharedContext ctx() @@ -194,22 +196,22 @@ private interface TestCase private void test(TestCase fn) { - test(attempts, fn); + test(retries, fn); } - private void test(int[] attempts, TestCase fn) + private void test(int[] retries, TestCase fn) { SharedContext ctx = ctx(); MessageDelivery messaging = ctx.messaging(); - for (int maxAttempts : attempts) + for (int maxRetries : retries) { before(); - sendMessageWithRetries(ctx, backoff(maxAttempts), always(), PAYLOAD, VERB, ADDRESS, RepairMessage.NOOP_CALLBACK); - for (int i = 0; i < maxAttempts; i++) - callback(messaging).onFailure(ADDRESS, RequestFailureReason.TIMEOUT); - fn.test(maxAttempts, callback(messaging)); + sendMessageWithRetries(ctx, backoff(maxRetries), always(), PAYLOAD, VERB, ADDRESS, RepairMessage.NOOP_CALLBACK); + for (int i = 0; i < maxRetries; i++) + callback(messaging).onFailure(ADDRESS, RequestFailure.TIMEOUT); + fn.test(maxRetries, callback(messaging)); Mockito.verifyNoInteractions(messaging); } } diff --git a/test/unit/org/apache/cassandra/schema/FastPathSchemaTest.java b/test/unit/org/apache/cassandra/schema/FastPathSchemaTest.java new file mode 100644 index 000000000000..e4c603079821 --- /dev/null +++ b/test/unit/org/apache/cassandra/schema/FastPathSchemaTest.java @@ -0,0 +1,111 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.schema; + +import java.util.Arrays; + +import org.junit.Assert; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Test; + +import org.apache.cassandra.ServerTestUtils; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.cql3.QueryProcessor; +import org.apache.cassandra.db.ConsistencyLevel; +import org.apache.cassandra.exceptions.ConfigurationException; +import org.apache.cassandra.service.accord.fastpath.FastPathStrategy; +import org.apache.cassandra.service.accord.fastpath.ParameterizedFastPathStrategy; + +import static java.lang.String.format; + +public class FastPathSchemaTest +{ + private static String KEYSPACE = "ks"; + private static int ksCount = 0; + + @BeforeClass + public static void setupClass() + { + DatabaseDescriptor.daemonInitialization(); + ServerTestUtils.prepareServer(); + SchemaTestUtil.addOrUpdateKeyspace(KeyspaceMetadata.create(KEYSPACE, KeyspaceParams.simple(1), Tables.of())); + } + + @Before + public void setup() + { + KEYSPACE = format("ks_%s", ksCount++); + } + + + private static void process(String fmt, Object... objects) + { + QueryProcessor.process(format(fmt, objects), ConsistencyLevel.ANY); + } + + @Test + public void keyspaceInheriting() + { + process("CREATE KEYSPACE %s with replication={'class':'SimpleStrategy', 'replication_factor':1} AND fast_path='simple'", KEYSPACE); + KeyspaceMetadata ksm = Schema.instance.getKeyspaceMetadata(KEYSPACE); + Assert.assertSame(FastPathStrategy.simple(), ksm.params.fastPath); + + process("CREATE TABLE %s.tbl (k int primary key, v int) WITH transactional_mode='full'", KEYSPACE); + TableMetadata tbm = Schema.instance.getTableMetadata(KEYSPACE, "tbl"); + Assert.assertSame(FastPathStrategy.inheritKeyspace(), tbm.params.fastPath); + } + + @Test + public void keyspaceModification() + { + process("CREATE KEYSPACE %s with replication={'class':'SimpleStrategy', 'replication_factor':1} AND fast_path='simple'", KEYSPACE); + KeyspaceMetadata ksm = Schema.instance.getKeyspaceMetadata(KEYSPACE); + Assert.assertSame(FastPathStrategy.simple(), ksm.params.fastPath); + process("ALTER KEYSPACE %s with fast_path={'size':2, 'dcs':'dc1,dc2'}", KEYSPACE); + + ksm = Schema.instance.getKeyspaceMetadata(KEYSPACE); + Assert.assertSame(FastPathStrategy.Kind.PARAMETERIZED, ksm.params.fastPath.kind()); + ParameterizedFastPathStrategy strategy = (ParameterizedFastPathStrategy) ksm.params.fastPath; + Assert.assertEquals(2, strategy.size); + Assert.assertEquals(Arrays.asList("dc1", "dc2"), strategy.dcStrings()); + } + + @Test(expected = ConfigurationException.class) + public void keyspaceInheritingFailure() + { + process("CREATE KEYSPACE %s with replication={'class':'SimpleStrategy', 'replication_factor':1} AND fast_path='keyspace'", KEYSPACE); + } + + @Test + public void tableModification() + { + process("CREATE KEYSPACE %s with replication={'class':'SimpleStrategy', 'replication_factor':1} AND fast_path='simple'", KEYSPACE); + KeyspaceMetadata ksm = Schema.instance.getKeyspaceMetadata(KEYSPACE); + Assert.assertSame(FastPathStrategy.simple(), ksm.params.fastPath); + + process("CREATE TABLE %s.tbl (k int primary key, v int) WITH transactional_mode='full'", KEYSPACE); + TableMetadata tbm = Schema.instance.getTableMetadata(KEYSPACE, "tbl"); + Assert.assertSame(FastPathStrategy.inheritKeyspace(), tbm.params.fastPath); + + process("ALTER TABLE %s.tbl WITH fast_path='simple'", KEYSPACE); + tbm = Schema.instance.getTableMetadata(KEYSPACE, "tbl"); + Assert.assertSame(FastPathStrategy.simple(), tbm.params.fastPath); + } +} diff --git a/test/unit/org/apache/cassandra/schema/SchemaChangeDuringRangeMovementTest.java b/test/unit/org/apache/cassandra/schema/SchemaChangeDuringRangeMovementTest.java index e6cdb1be8248..d54b2e5e31d5 100644 --- a/test/unit/org/apache/cassandra/schema/SchemaChangeDuringRangeMovementTest.java +++ b/test/unit/org/apache/cassandra/schema/SchemaChangeDuringRangeMovementTest.java @@ -20,30 +20,21 @@ import org.junit.Test; -import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.cql3.CQLTester; -import org.apache.cassandra.dht.Range; import org.apache.cassandra.exceptions.InvalidRequestException; import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.tcm.ClusterMetadataService; -import org.apache.cassandra.tcm.Transformation; -import org.apache.cassandra.tcm.sequences.LockedRanges; import org.apache.cassandra.tcm.transformations.AlterSchema; import org.apache.cassandra.triggers.TriggersTest; +import static org.apache.cassandra.tcm.sequences.SequencesUtils.ClearLockedRanges; +import static org.apache.cassandra.tcm.sequences.SequencesUtils.LockRanges; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; public class SchemaChangeDuringRangeMovementTest extends CQLTester { - // at the moment, the detail of the specific LockedRanges doesn't matter, transformations - // which are rejected in the presence of locking are rejected whatever is actually locked - private static final LockedRanges.AffectedRanges toLock = - LockedRanges.AffectedRanges.singleton(ReplicationParams.simple(3), - new Range<>(DatabaseDescriptor.getPartitioner().getMinimumToken(), - DatabaseDescriptor.getPartitioner().getRandomToken())); - @Test public void testAlwaysPermittedChanges() throws Throwable { @@ -216,39 +207,4 @@ private void withAndWithoutLockedRanges(TestActions actions) throws Throwable metadata = ClusterMetadataService.instance().commit(new ClearLockedRanges()); assertTrue(metadata.lockedRanges.locked.isEmpty()); } - - - // Custom transforms to lock/unlock an arbitrary set of ranges to - // avoid having to actually initiate some range movement - private static class LockRanges implements Transformation - { - @Override - public Kind kind() - { - return Kind.CUSTOM; - } - - @Override - public Result execute(ClusterMetadata metadata) - { - LockedRanges newLocked = metadata.lockedRanges.lock(LockedRanges.keyFor(metadata.epoch), toLock); - return Transformation.success(metadata.transformer().with(newLocked), toLock); - } - } - - private static class ClearLockedRanges implements Transformation - { - @Override - public Kind kind() - { - return Kind.CUSTOM; - } - - @Override - public Result execute(ClusterMetadata metadata) - { - LockedRanges newLocked = LockedRanges.EMPTY; - return Transformation.success(metadata.transformer().with(newLocked), LockedRanges.AffectedRanges.EMPTY); - } - } } diff --git a/test/unit/org/apache/cassandra/schema/SchemaChangesTest.java b/test/unit/org/apache/cassandra/schema/SchemaChangesTest.java index 7bf5a5bd05dc..9d966c11e44c 100644 --- a/test/unit/org/apache/cassandra/schema/SchemaChangesTest.java +++ b/test/unit/org/apache/cassandra/schema/SchemaChangesTest.java @@ -113,11 +113,11 @@ public void testTableMetadataBuilder() throws ConfigurationException assertNull(table.getColumn(ByteBuffer.wrap(new byte[]{ 5 }))); // add one. - ColumnMetadata addIndexDef = ColumnMetadata.regularColumn(table, ByteBuffer.wrap(new byte[] { 5 }), BytesType.instance); + ColumnMetadata addIndexDef = ColumnMetadata.regularColumn(table, ByteBuffer.wrap(new byte[] { 5 }), BytesType.instance, ColumnMetadata.NO_UNIQUE_ID); builder.addColumn(addIndexDef); // remove one. - ColumnMetadata removeIndexDef = ColumnMetadata.regularColumn(table, ByteBuffer.wrap(new byte[] { 0 }), BytesType.instance); + ColumnMetadata removeIndexDef = ColumnMetadata.regularColumn(table, ByteBuffer.wrap(new byte[] { 0 }), BytesType.instance, ColumnMetadata.NO_UNIQUE_ID); builder.removeRegularOrStaticColumn(removeIndexDef.name); TableMetadata table2 = builder.build(); diff --git a/test/unit/org/apache/cassandra/schema/TableIdTest.java b/test/unit/org/apache/cassandra/schema/TableIdTest.java new file mode 100644 index 000000000000..2ad67a3518dd --- /dev/null +++ b/test/unit/org/apache/cassandra/schema/TableIdTest.java @@ -0,0 +1,80 @@ +package org.apache.cassandra.schema; + +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.junit.Test; + +import accord.utils.LazyToString; +import accord.utils.ReflectionUtils; +import org.apache.cassandra.io.util.DataInputBuffer; +import org.apache.cassandra.io.util.DataOutputBuffer; +import org.apache.cassandra.utils.CassandraGenerators; +import org.apache.cassandra.utils.Generators; +import org.assertj.core.api.Assertions; + +import static accord.utils.Property.qt; + +public class TableIdTest +{ + @Test + public void serialize() + { + DataOutputBuffer output = new DataOutputBuffer(); + qt().forAll(Generators.toGen(CassandraGenerators.TABLE_ID_GEN)).check(input -> { + output.clear(); + input.serialize(output); + Assertions.assertThat(output.getLength()).describedAs("The serialized size and bytes written do not match").isEqualTo(input.serializedSize()); + + DataInputBuffer in = new DataInputBuffer(output.unsafeGetBufferAndFlip(), false); + TableId read = TableId.deserialize(in); + Assertions.assertThat(read).describedAs("The deserialized output does not match the serialized input; difference %s", new LazyToString(() -> ReflectionUtils.recursiveEquals(read, input).toString())).isEqualTo(input); + }); + } + + @Test + public void serializeCompact() + { + DataOutputBuffer output = new DataOutputBuffer(); + qt().forAll(Generators.toGen(CassandraGenerators.TABLE_ID_GEN)).check(input -> { + output.clear(); + input.serializeCompact(output); + Assertions.assertThat(output.getLength()).describedAs("The serialized size and bytes written do not match").isEqualTo(input.serializedCompactSize()); + + DataInputBuffer in = new DataInputBuffer(output.unsafeGetBufferAndFlip(), false); + TableId read = TableId.deserializeCompact(in); + Assertions.assertThat(read).describedAs("The deserialized output does not match the serialized input; difference %s", new LazyToString(() -> ReflectionUtils.recursiveEquals(read, input).toString())).isEqualTo(input); + }); + } + + @Test + public void serializeCompactComparable() + { + DataOutputBuffer output = new DataOutputBuffer(); + // Seed = 3447758086368915686 + qt().withSeed(3848293537190683248L).forAll(Generators.toGen(CassandraGenerators.TABLE_ID_GEN)).check(input -> { + output.clear(); + input.serializeCompactComparable(output); + Assertions.assertThat(output.getLength()).describedAs("The serialized size and bytes written do not match").isEqualTo(input.serializedCompactComparableSize()); + + DataInputBuffer in = new DataInputBuffer(output.unsafeGetBufferAndFlip(), false); + TableId read = TableId.deserializeCompactComparable(in); + Assertions.assertThat(read).describedAs("The deserialized output does not match the serialized input; difference %s", new LazyToString(() -> ReflectionUtils.recursiveEquals(read, input).toString())).isEqualTo(input); + }); + } + } \ No newline at end of file diff --git a/test/unit/org/apache/cassandra/schema/TableParamsTest.java b/test/unit/org/apache/cassandra/schema/TableParamsTest.java new file mode 100644 index 000000000000..9146264278f7 --- /dev/null +++ b/test/unit/org/apache/cassandra/schema/TableParamsTest.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.schema; + +import org.junit.Test; + +import accord.utils.Gen; +import org.apache.cassandra.io.util.DataOutputBuffer; +import org.apache.cassandra.tcm.membership.NodeVersion; +import org.apache.cassandra.tcm.serialization.AsymmetricMetadataSerializers; +import org.apache.cassandra.utils.CassandraGenerators.TableParamsBuilder; +import org.apache.cassandra.utils.Generators; + +import static accord.utils.Property.qt; + + +public class TableParamsTest +{ + @Test + public void serdeLatest() + { + DataOutputBuffer output = new DataOutputBuffer(); + qt().forAll(tableParams()).check(params -> { + AsymmetricMetadataSerializers.testSerde(output, TableParams.serializer, params, NodeVersion.CURRENT_METADATA_VERSION); + }); + } + + private static Gen tableParams() + { + return Generators.toGen(new TableParamsBuilder() + .withKnownMemtables() + .withTransactionalMode() + .withFastPathStrategy() + .build()); + } +} \ No newline at end of file diff --git a/test/unit/org/apache/cassandra/schema/TransactionalConfigSchemaTest.java b/test/unit/org/apache/cassandra/schema/TransactionalConfigSchemaTest.java new file mode 100644 index 000000000000..b9876e0eb8eb --- /dev/null +++ b/test/unit/org/apache/cassandra/schema/TransactionalConfigSchemaTest.java @@ -0,0 +1,106 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.schema; + +import org.junit.Assert; +import org.junit.BeforeClass; +import org.junit.Test; + +import org.apache.cassandra.ServerTestUtils; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.cql3.QueryProcessor; +import org.apache.cassandra.db.ConsistencyLevel; +import org.apache.cassandra.service.consensus.TransactionalMode; +import org.apache.cassandra.service.consensus.migration.TransactionalMigrationFromMode; + +import static java.lang.String.format; + +public class TransactionalConfigSchemaTest +{ + private static final String KEYSPACE = "ks"; + @BeforeClass + public static void setup() + { + DatabaseDescriptor.daemonInitialization(); + ServerTestUtils.prepareServer(); + SchemaTestUtil.addOrUpdateKeyspace(KeyspaceMetadata.create(KEYSPACE, KeyspaceParams.simple(1), Tables.of())); + } + + private static void process(String fmt, Object... objects) + { + QueryProcessor.process(format(fmt, objects), ConsistencyLevel.ANY); + } + + private static void assertTransactionalMode(String table, TransactionalMode mode, TransactionalMigrationFromMode migration) + { + TableMetadata metadata = Schema.instance.getTableMetadata(KEYSPACE, table); + Assert.assertEquals(mode, metadata.params.transactionalMode); + Assert.assertEquals(migration, metadata.params.transactionalMigrationFrom); + } + + // if a table is created with an accord transactional mode, it skips having to migrate + @Test + public void newTableSkipsMigration() + { + String table = "new_table"; + process("CREATE TABLE ks.%s (k int primary key, v int) WITH transactional_mode='%s'", table, TransactionalMode.full); + assertTransactionalMode(table, TransactionalMode.full, TransactionalMigrationFromMode.none); + } + + // if an existing table is set to an accord transactional mode, it should be set to migrating + @Test + public void existingTableMigration() + { + String table = "existing_table"; + process("CREATE TABLE ks.%s (k int primary key, v int)", table); + assertTransactionalMode(table, TransactionalMode.off, TransactionalMigrationFromMode.none); + + process("ALTER TABLE ks.%s WITH transactional_mode='%s'", table, TransactionalMode.full); + assertTransactionalMode(table, TransactionalMode.full, TransactionalMigrationFromMode.off); + } + + // changing transactional mode with an incomplete migration should fail, unless the migration mode is explicitly updated + @Test + public void incompleteMigrationFailure() + { + String table = "incomplete_table"; + process("CREATE TABLE ks.%s (k int primary key, v int)", table); + process("ALTER TABLE ks.%s WITH transactional_mode='%s'", table, TransactionalMode.full); + assertTransactionalMode(table, TransactionalMode.full, TransactionalMigrationFromMode.off); + + process("ALTER TABLE ks.%s WITH transactional_mode='%s'", table, TransactionalMode.off); + assertTransactionalMode(table, TransactionalMode.off, TransactionalMigrationFromMode.full); + + // explicitly setting the migration mode should work + process("ALTER TABLE ks.%s WITH transactional_mode='%s' AND transactional_migration_from='%s'", + table, TransactionalMode.off, TransactionalMigrationFromMode.none); + assertTransactionalMode(table, TransactionalMode.off, TransactionalMigrationFromMode.none); + } + + @Test + public void alterCanSkipMigration() + { + String table = "alter_skips_migration_table"; + process("CREATE TABLE ks.%s (k int primary key, v int)", table); + assertTransactionalMode(table, TransactionalMode.off, TransactionalMigrationFromMode.none); + + process("ALTER TABLE ks.%s WITH transactional_mode='%s' AND transactional_migration_from='%s'", table, TransactionalMode.full, TransactionalMigrationFromMode.none); + assertTransactionalMode(table, TransactionalMode.full, TransactionalMigrationFromMode.none); + } +} diff --git a/test/unit/org/apache/cassandra/schema/ValidationTest.java b/test/unit/org/apache/cassandra/schema/ValidationTest.java index 8eb1247c5b0c..e9edbf729dbd 100644 --- a/test/unit/org/apache/cassandra/schema/ValidationTest.java +++ b/test/unit/org/apache/cassandra/schema/ValidationTest.java @@ -18,17 +18,35 @@ */ package org.apache.cassandra.schema; -import java.util.*; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; -import org.apache.cassandra.db.marshal.*; +import org.apache.cassandra.config.DatabaseDescriptor; + +import org.junit.BeforeClass; import org.junit.Test; +import org.apache.cassandra.db.marshal.AbstractType; + +import org.apache.cassandra.CassandraTestBase; +import org.apache.cassandra.CassandraTestBase.DDDaemonInitialization; + import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; -public class ValidationTest +@DDDaemonInitialization +public class ValidationTest extends CassandraTestBase { + @BeforeClass + public static void beforeClass() + { + DatabaseDescriptor.daemonInitialization(); + } + @Test public void testIsNameValidPositive() { @@ -47,35 +65,35 @@ public void testIsNameValidNegative() assertFalse(SchemaConstants.isValidName("!")); } - private static Set primitiveTypes = - new HashSet<>(Arrays.asList(new String[] { "ascii", "bigint", "blob", "boolean", "date", - "duration", "decimal", "double", "float", - "inet", "int", "smallint", "text", "time", - "timestamp", "timeuuid", "tinyint", "uuid", - "varchar", "varint" })); + private static final Set primitiveTypes = + new HashSet<>(Arrays.asList("ascii", "bigint", "blob", "boolean", "date", + "duration", "decimal", "double", "float", + "inet", "int", "smallint", "text", "time", + "timestamp", "timeuuid", "tinyint", "uuid", + "varchar", "varint")); @Test public void typeCompatibilityTest() { Map> compatibilityMap = new HashMap<>(); - compatibilityMap.put("bigint", new HashSet<>(Arrays.asList(new String[] {"timestamp"}))); - compatibilityMap.put("blob", new HashSet<>(Arrays.asList(new String[] {"ascii", "bigint", "boolean", "date", "decimal", "double", "duration", - "float", "inet", "int", "smallint", "text", "time", "timestamp", - "timeuuid", "tinyint", "uuid", "varchar", "varint"}))); - compatibilityMap.put("date", new HashSet<>(Arrays.asList(new String[] {"int"}))); - compatibilityMap.put("time", new HashSet<>(Arrays.asList(new String[] {"bigint"}))); - compatibilityMap.put("text", new HashSet<>(Arrays.asList(new String[] {"ascii", "varchar"}))); - compatibilityMap.put("timestamp", new HashSet<>(Arrays.asList(new String[] {"bigint"}))); - compatibilityMap.put("varchar", new HashSet<>(Arrays.asList(new String[] {"ascii", "text"}))); - compatibilityMap.put("varint", new HashSet<>(Arrays.asList(new String[] {"bigint", "int", "timestamp"}))); - compatibilityMap.put("uuid", new HashSet<>(Arrays.asList(new String[] {"timeuuid"}))); + compatibilityMap.put("bigint", new HashSet<>(Arrays.asList("timestamp"))); + compatibilityMap.put("blob", new HashSet<>(Arrays.asList("ascii", "bigint", "boolean", "date", "decimal", "double", "duration", + "float", "inet", "int", "smallint", "text", "time", "timestamp", + "timeuuid", "tinyint", "uuid", "varchar", "varint"))); + compatibilityMap.put("date", new HashSet<>(Arrays.asList("int"))); + compatibilityMap.put("time", new HashSet<>(Arrays.asList("bigint"))); + compatibilityMap.put("text", new HashSet<>(Arrays.asList("ascii", "varchar"))); + compatibilityMap.put("timestamp", new HashSet<>(Arrays.asList("bigint"))); + compatibilityMap.put("varchar", new HashSet<>(Arrays.asList("ascii", "text"))); + compatibilityMap.put("varint", new HashSet<>(Arrays.asList("bigint", "int", "timestamp"))); + compatibilityMap.put("uuid", new HashSet<>(Arrays.asList("timeuuid"))); for (String sourceTypeString: primitiveTypes) { - AbstractType sourceType = CQLTypeParser.parse("KEYSPACE", sourceTypeString, Types.none()); + AbstractType sourceType = CQLTypeParser.parse("KEYSPACE", sourceTypeString, Types.none()); for (String destinationTypeString: primitiveTypes) { - AbstractType destinationType = CQLTypeParser.parse("KEYSPACE", destinationTypeString, Types.none()); + AbstractType destinationType = CQLTypeParser.parse("KEYSPACE", destinationTypeString, Types.none()); if (compatibilityMap.get(destinationTypeString) != null && compatibilityMap.get(destinationTypeString).contains(sourceTypeString) || @@ -94,19 +112,19 @@ public void typeCompatibilityTest() } @Test - public void clusteringColumnTypeCompatibilityTest() throws Throwable + public void clusteringColumnTypeCompatibilityTest() { Map> compatibilityMap = new HashMap<>(); - compatibilityMap.put("blob", new HashSet<>(Arrays.asList(new String[] {"ascii", "text", "varchar"}))); - compatibilityMap.put("text", new HashSet<>(Arrays.asList(new String[] {"ascii", "varchar"}))); - compatibilityMap.put("varchar", new HashSet<>(Arrays.asList(new String[] {"ascii", "text" }))); + compatibilityMap.put("blob", new HashSet<>(Arrays.asList("ascii", "text", "varchar"))); + compatibilityMap.put("text", new HashSet<>(Arrays.asList("ascii", "varchar"))); + compatibilityMap.put("varchar", new HashSet<>(Arrays.asList("ascii", "text"))); for (String sourceTypeString: primitiveTypes) { - AbstractType sourceType = CQLTypeParser.parse("KEYSPACE", sourceTypeString, Types.none()); + AbstractType sourceType = CQLTypeParser.parse("KEYSPACE", sourceTypeString, Types.none()); for (String destinationTypeString: primitiveTypes) { - AbstractType destinationType = CQLTypeParser.parse("KEYSPACE", destinationTypeString, Types.none()); + AbstractType destinationType = CQLTypeParser.parse("KEYSPACE", destinationTypeString, Types.none()); if (compatibilityMap.get(destinationTypeString) != null && compatibilityMap.get(destinationTypeString).contains(sourceTypeString) || diff --git a/test/unit/org/apache/cassandra/security/DefaultSslContextFactoryTest.java b/test/unit/org/apache/cassandra/security/DefaultSslContextFactoryTest.java index fa3eb7c845ee..5dc6d7e39bdc 100644 --- a/test/unit/org/apache/cassandra/security/DefaultSslContextFactoryTest.java +++ b/test/unit/org/apache/cassandra/security/DefaultSslContextFactoryTest.java @@ -33,17 +33,17 @@ import io.netty.handler.ssl.SslContext; import io.netty.handler.ssl.SslProvider; import org.apache.cassandra.config.EncryptionOptions; +import org.apache.cassandra.config.EncryptionOptions.ServerEncryptionOptions.Builder; import org.apache.cassandra.distributed.shared.WithProperties; import org.apache.cassandra.transport.TlsTestUtils; import static org.apache.cassandra.config.CassandraRelevantProperties.DISABLE_TCACTIVE_OPENSSL; - -import static org.apache.cassandra.config.EncryptionOptions.ClientAuth.NOT_REQUIRED; -import static org.apache.cassandra.config.EncryptionOptions.ClientAuth.REQUIRED; +import static org.apache.cassandra.config.EncryptionOptions.ClientEncryptionOptions.ClientAuth.NOT_REQUIRED; +import static org.apache.cassandra.config.EncryptionOptions.ClientEncryptionOptions.ClientAuth.REQUIRED; public class DefaultSslContextFactoryTest { - private Map commonConfig = new HashMap<>(); + private Map commonConfig = new HashMap<>(); @Before public void setup() @@ -54,7 +54,7 @@ public void setup() commonConfig.put("cipher_suites", Arrays.asList("TLS_RSA_WITH_AES_128_CBC_SHA")); } - private void addKeystoreOptions(Map config) + private void addKeystoreOptions(Map config) { config.put("keystore", TlsTestUtils.SERVER_KEYSTORE_PATH); config.put("keystore_password", TlsTestUtils.SERVER_KEYSTORE_PASSWORD); @@ -69,14 +69,18 @@ private void addOutboundKeystoreOptions(Map config) @Test public void getSslContextOpenSSL() throws IOException { - EncryptionOptions.ServerEncryptionOptions options = new EncryptionOptions.ServerEncryptionOptions().withTrustStore(TlsTestUtils.SERVER_TRUSTSTORE_PATH) - .withTrustStorePassword(TlsTestUtils.SERVER_TRUSTSTORE_PASSWORD) - .withKeyStore(TlsTestUtils.SERVER_KEYSTORE_PATH) - .withKeyStorePassword(TlsTestUtils.SERVER_KEYSTORE_PASSWORD) - .withOutboundKeystore(TlsTestUtils.SERVER_OUTBOUND_KEYSTORE_PATH) - .withOutboundKeystorePassword(TlsTestUtils.SERVER_OUTBOUND_KEYSTORE_PASSWORD) - .withRequireClientAuth(NOT_REQUIRED) - .withCipherSuites("TLS_RSA_WITH_AES_128_CBC_SHA"); + EncryptionOptions.ServerEncryptionOptions.Builder builder = new Builder(); + EncryptionOptions.ServerEncryptionOptions options = builder + .withOutboundKeystore(TlsTestUtils.SERVER_OUTBOUND_KEYSTORE_PATH) + .withOutboundKeystorePassword(TlsTestUtils.SERVER_OUTBOUND_KEYSTORE_PASSWORD) + .withTrustStore(TlsTestUtils.SERVER_TRUSTSTORE_PATH) + .withTrustStorePassword(TlsTestUtils.SERVER_TRUSTSTORE_PASSWORD) + .withKeyStore(TlsTestUtils.SERVER_KEYSTORE_PATH) + .withKeyStorePassword(TlsTestUtils.SERVER_KEYSTORE_PASSWORD) + .withRequireClientAuth(NOT_REQUIRED) + .withCipherSuites("TLS_RSA_WITH_AES_128_CBC_SHA") + .build(); + SslContext sslContext = SSLFactory.getOrCreateSslContext(options, REQUIRED, ISslContextFactory.SocketType.CLIENT, "test"); Assert.assertNotNull(sslContext); if (OpenSsl.isAvailable()) @@ -88,7 +92,7 @@ public void getSslContextOpenSSL() throws IOException @Test(expected = IOException.class) public void buildTrustManagerFactoryWithInvalidTruststoreFile() throws IOException { - Map config = new HashMap<>(); + Map config = new HashMap<>(); config.putAll(commonConfig); config.put("truststore", "/this/is/probably/not/a/file/on/your/test/machine"); @@ -100,7 +104,7 @@ public void buildTrustManagerFactoryWithInvalidTruststoreFile() throws IOExcepti @Test(expected = IOException.class) public void buildTrustManagerFactoryWithBadPassword() throws IOException { - Map config = new HashMap<>(); + Map config = new HashMap<>(); config.putAll(commonConfig); config.put("truststore_password", "HomeOfBadPasswords"); @@ -112,7 +116,7 @@ public void buildTrustManagerFactoryWithBadPassword() throws IOException @Test public void buildTrustManagerFactoryHappyPath() throws IOException { - Map config = new HashMap<>(); + Map config = new HashMap<>(); config.putAll(commonConfig); DefaultSslContextFactory defaultSslContextFactoryImpl = new DefaultSslContextFactory(config); @@ -124,7 +128,7 @@ public void buildTrustManagerFactoryHappyPath() throws IOException @Test(expected = IOException.class) public void buildKeyManagerFactoryWithInvalidKeystoreFile() throws IOException { - Map config = new HashMap<>(); + Map config = new HashMap<>(); config.putAll(commonConfig); config.put("keystore", "/this/is/probably/not/a/file/on/your/test/machine"); config.put("keystore_password", "ThisWontMatter"); @@ -137,7 +141,7 @@ public void buildKeyManagerFactoryWithInvalidKeystoreFile() throws IOException @Test(expected = IOException.class) public void buildKeyManagerFactoryWithBadPassword() throws IOException { - Map config = new HashMap<>(); + Map config = new HashMap<>(); config.putAll(commonConfig); addKeystoreOptions(config); config.put("keystore_password", "HomeOfBadPasswords"); @@ -149,7 +153,7 @@ public void buildKeyManagerFactoryWithBadPassword() throws IOException @Test public void buildKeyManagerFactoryHappyPath() throws IOException { - Map config = new HashMap<>(); + Map config = new HashMap<>(); config.putAll(commonConfig); DefaultSslContextFactory defaultSslContextFactoryImpl = new DefaultSslContextFactory(config); @@ -222,12 +226,13 @@ public void buildOutboundKeyManagerFactoryHappyPath() throws IOException } @Test - public void testDisableOpenSslForInJvmDtests() { + public void testDisableOpenSslForInJvmDtests() + { // The configuration name below is hard-coded intentionally to make sure we don't break the contract without // changing the documentation appropriately try (WithProperties properties = new WithProperties().set(DISABLE_TCACTIVE_OPENSSL, true)) { - Map config = new HashMap<>(); + Map config = new HashMap<>(); config.putAll(commonConfig); DefaultSslContextFactory defaultSslContextFactoryImpl = new DefaultSslContextFactory(config); diff --git a/test/unit/org/apache/cassandra/security/DummySslContextFactoryImpl.java b/test/unit/org/apache/cassandra/security/DummySslContextFactoryImpl.java index ca4f4e86f06e..12649cfb0cc6 100644 --- a/test/unit/org/apache/cassandra/security/DummySslContextFactoryImpl.java +++ b/test/unit/org/apache/cassandra/security/DummySslContextFactoryImpl.java @@ -44,7 +44,7 @@ public SSLContext createJSSESslContext(boolean verifyPeerCertificate) throws SSL } @Override - public SSLContext createJSSESslContext(EncryptionOptions.ClientAuth clientAuth) throws SSLException + public SSLContext createJSSESslContext(EncryptionOptions.ClientEncryptionOptions.ClientAuth clientAuth) throws SSLException { return null; } @@ -56,7 +56,7 @@ public SslContext createNettySslContext(boolean verifyPeerCertificate, SocketTyp } @Override - public SslContext createNettySslContext(EncryptionOptions.ClientAuth clientAuth, SocketType socketType, + public SslContext createNettySslContext(EncryptionOptions.ClientEncryptionOptions.ClientAuth clientAuth, SocketType socketType, CipherSuiteFilter cipherFilter) throws SSLException { return null; diff --git a/test/unit/org/apache/cassandra/security/FileBasedSslContextFactoryTest.java b/test/unit/org/apache/cassandra/security/FileBasedSslContextFactoryTest.java index d6d936ba0e6c..cc6c05af5d25 100644 --- a/test/unit/org/apache/cassandra/security/FileBasedSslContextFactoryTest.java +++ b/test/unit/org/apache/cassandra/security/FileBasedSslContextFactoryTest.java @@ -36,11 +36,12 @@ import static org.apache.cassandra.config.CassandraRelevantProperties.CASSANDRA_CONFIG; -import static org.apache.cassandra.config.EncryptionOptions.ClientAuth.NOT_REQUIRED; +import static org.apache.cassandra.config.EncryptionOptions.ClientEncryptionOptions.ClientAuth.NOT_REQUIRED; public class FileBasedSslContextFactoryTest { private EncryptionOptions.ServerEncryptionOptions encryptionOptions; + private EncryptionOptions.ServerEncryptionOptions.Builder encryptionOptionsBuilder; static WithProperties properties; @@ -60,7 +61,10 @@ public static void tearDownDatabaseDescriptor() @Before public void setup() { - encryptionOptions = new EncryptionOptions.ServerEncryptionOptions() + encryptionOptionsBuilder = new EncryptionOptions.ServerEncryptionOptions.Builder(); + encryptionOptions = encryptionOptionsBuilder + .withOutboundKeystore(TlsTestUtils.SERVER_OUTBOUND_KEYSTORE_PATH) + .withOutboundKeystorePassword(TlsTestUtils.SERVER_OUTBOUND_KEYSTORE_PASSWORD) .withSslContextFactory(new ParameterizedClass(TestFileBasedSSLContextFactory.class.getName(), new HashMap<>())) .withTrustStore(TlsTestUtils.SERVER_TRUSTSTORE_PATH) @@ -69,8 +73,7 @@ public void setup() .withCipherSuites("TLS_RSA_WITH_AES_128_CBC_SHA") .withKeyStore(TlsTestUtils.SERVER_KEYSTORE_PATH) .withKeyStorePassword(TlsTestUtils.SERVER_KEYSTORE_PASSWORD) - .withOutboundKeystore(TlsTestUtils.SERVER_OUTBOUND_KEYSTORE_PATH) - .withOutboundKeystorePassword(TlsTestUtils.SERVER_OUTBOUND_KEYSTORE_PASSWORD); + .build(); } @Test @@ -95,11 +98,12 @@ public void testHappyPath() throws SSLException @Test public void testEmptyKeystorePasswords() throws SSLException { - EncryptionOptions.ServerEncryptionOptions localEncryptionOptions = encryptionOptions + EncryptionOptions.ServerEncryptionOptions localEncryptionOptions = encryptionOptionsBuilder + .withOutboundKeystorePassword("") + .withOutboundKeystore("test/conf/cassandra_ssl_test_nopassword.keystore") .withKeyStorePassword("") .withKeyStore("test/conf/cassandra_ssl_test_nopassword.keystore") - .withOutboundKeystorePassword("") - .withOutboundKeystore("test/conf/cassandra_ssl_test_nopassword.keystore"); + .build(); Assert.assertEquals("org.apache.cassandra.security.FileBasedSslContextFactoryTest$TestFileBasedSSLContextFactory", localEncryptionOptions.ssl_context_factory.class_name); @@ -118,13 +122,14 @@ public void testKeystorePasswordFile() throws SSLException { // Here we only override password configuration and specify password_file configuration since keystore paths // are already loaded in the `encryptionOptions` - EncryptionOptions.ServerEncryptionOptions localEncryptionOptions = encryptionOptions - .withKeyStorePassword(null) - .withKeyStorePasswordFile(TlsTestUtils.SERVER_KEYSTORE_PASSWORD_FILE) + EncryptionOptions.ServerEncryptionOptions localEncryptionOptions = encryptionOptionsBuilder .withOutboundKeystorePassword(null) .withOutboundKeystorePasswordFile(TlsTestUtils.SERVER_OUTBOUND_KEYSTORE_PASSWORD_FILE) + .withKeyStorePassword(null) + .withKeyStorePasswordFile(TlsTestUtils.SERVER_KEYSTORE_PASSWORD_FILE) .withTrustStorePassword(null) - .withTrustStorePasswordFile(TlsTestUtils.SERVER_TRUSTSTORE_PASSWORD_FILE); + .withTrustStorePasswordFile(TlsTestUtils.SERVER_TRUSTSTORE_PASSWORD_FILE) + .build(); Assert.assertEquals("org.apache.cassandra.security.FileBasedSslContextFactoryTest$TestFileBasedSSLContextFactory", localEncryptionOptions.ssl_context_factory.class_name); @@ -144,13 +149,14 @@ public void testBadKeystorePasswordFile() throws SSLException { // Here we only override password configuration and specify password_file configuration since keystore paths // are already loaded in the `encryptionOptions` - encryptionOptions - .withKeyStorePassword(null) - .withKeyStorePasswordFile("/path/to/non-existance-password-file") + encryptionOptionsBuilder .withOutboundKeystorePassword(null) .withOutboundKeystorePasswordFile("/path/to/non-existance-password-file") + .withKeyStorePassword(null) + .withKeyStorePasswordFile("/path/to/non-existance-password-file") .withTrustStorePassword(null) - .withTrustStorePasswordFile("/path/to/non-existance-password-file"); + .withTrustStorePasswordFile("/path/to/non-existance-password-file") + .build(); } /** @@ -159,7 +165,9 @@ public void testBadKeystorePasswordFile() throws SSLException @Test(expected = IllegalArgumentException.class) public void testNullKeystorePasswordDisallowed() throws SSLException { - EncryptionOptions.ServerEncryptionOptions localEncryptionOptions = encryptionOptions.withKeyStorePassword(null); + EncryptionOptions.ServerEncryptionOptions localEncryptionOptions = encryptionOptionsBuilder + .withKeyStorePassword(null) + .build(); Assert.assertEquals("org.apache.cassandra.security.FileBasedSslContextFactoryTest$TestFileBasedSSLContextFactory", localEncryptionOptions.ssl_context_factory.class_name); @@ -187,7 +195,9 @@ public void testNullKeystorePasswordDisallowed() throws SSLException @Test public void testOnlyEmptyOutboundKeystorePassword() throws SSLException { - EncryptionOptions.ServerEncryptionOptions localEncryptionOptions = encryptionOptions.withOutboundKeystorePassword(null); + EncryptionOptions.ServerEncryptionOptions localEncryptionOptions = encryptionOptionsBuilder + .withOutboundKeystorePassword(null) + .build(); Assert.assertEquals("org.apache.cassandra.security.FileBasedSslContextFactoryTest$TestFileBasedSSLContextFactory", localEncryptionOptions.ssl_context_factory.class_name); @@ -203,7 +213,9 @@ public void testOnlyEmptyOutboundKeystorePassword() throws SSLException @Test public void testEmptyTruststorePassword() throws SSLException { - EncryptionOptions.ServerEncryptionOptions localEncryptionOptions = encryptionOptions.withTrustStorePassword(null); + EncryptionOptions.ServerEncryptionOptions localEncryptionOptions = encryptionOptionsBuilder + .withTrustStorePassword(null) + .build(); Assert.assertEquals("org.apache.cassandra.security.FileBasedSslContextFactoryTest$TestFileBasedSSLContextFactory", localEncryptionOptions.ssl_context_factory.class_name); Assert.assertNotNull("keystore_password must not be null", localEncryptionOptions.keystore_password); diff --git a/test/unit/org/apache/cassandra/security/PEMBasedSslContextFactoryTest.java b/test/unit/org/apache/cassandra/security/PEMBasedSslContextFactoryTest.java index 9781e9d08a71..18d323286d3e 100644 --- a/test/unit/org/apache/cassandra/security/PEMBasedSslContextFactoryTest.java +++ b/test/unit/org/apache/cassandra/security/PEMBasedSslContextFactoryTest.java @@ -39,8 +39,8 @@ import org.apache.cassandra.transport.TlsTestUtils; import static org.apache.cassandra.config.CassandraRelevantProperties.DISABLE_TCACTIVE_OPENSSL; -import static org.apache.cassandra.config.EncryptionOptions.ClientAuth.NOT_REQUIRED; -import static org.apache.cassandra.config.EncryptionOptions.ClientAuth.REQUIRED; +import static org.apache.cassandra.config.EncryptionOptions.ClientEncryptionOptions.ClientAuth.NOT_REQUIRED; +import static org.apache.cassandra.config.EncryptionOptions.ClientEncryptionOptions.ClientAuth.REQUIRED; import static org.apache.cassandra.security.PEMBasedSslContextFactory.ConfigKey.ENCODED_CERTIFICATES; import static org.apache.cassandra.security.PEMBasedSslContextFactory.ConfigKey.ENCODED_KEY; import static org.apache.cassandra.security.PEMBasedSslContextFactory.ConfigKey.KEY_PASSWORD; @@ -215,12 +215,14 @@ public void getSslContextOpenSSL() throws IOException { ParameterizedClass sslContextFactory = new ParameterizedClass(PEMBasedSslContextFactory.class.getSimpleName() , new HashMap<>()); - EncryptionOptions options = new EncryptionOptions().withTrustStore(TlsTestUtils.SERVER_TRUSTSTORE_PEM_PATH) - .withKeyStore(TlsTestUtils.SERVER_KEYSTORE_PATH_PEM) - .withKeyStorePassword(TlsTestUtils.SERVER_KEYSTORE_PASSWORD) - .withRequireClientAuth(NOT_REQUIRED) - .withCipherSuites("TLS_RSA_WITH_AES_128_CBC_SHA") - .withSslContextFactory(sslContextFactory); + EncryptionOptions.ClientEncryptionOptions options = new EncryptionOptions.ClientEncryptionOptions.Builder() + .withTrustStore(TlsTestUtils.SERVER_TRUSTSTORE_PEM_PATH) + .withKeyStore(TlsTestUtils.SERVER_KEYSTORE_PATH_PEM) + .withKeyStorePassword(TlsTestUtils.SERVER_KEYSTORE_PASSWORD) + .withRequireClientAuth(NOT_REQUIRED) + .withCipherSuites("TLS_RSA_WITH_AES_128_CBC_SHA") + .withSslContextFactory(sslContextFactory) + .build(); SslContext sslContext = SSLFactory.getOrCreateSslContext(options, REQUIRED, ISslContextFactory.SocketType.SERVER, "test"); Assert.assertNotNull(sslContext); if (OpenSsl.isAvailable()) @@ -234,14 +236,16 @@ public void getSslContextOpenSSLOutboundKeystore() throws IOException { ParameterizedClass sslContextFactory = new ParameterizedClass(PEMBasedSslContextFactory.class.getSimpleName() , new HashMap<>()); - EncryptionOptions.ServerEncryptionOptions options = new EncryptionOptions.ServerEncryptionOptions().withTrustStore(TlsTestUtils.SERVER_TRUSTSTORE_PEM_PATH) - .withKeyStore(TlsTestUtils.SERVER_KEYSTORE_PATH_PEM) - .withKeyStorePassword(TlsTestUtils.SERVER_KEYSTORE_PASSWORD) - .withOutboundKeystore(TlsTestUtils.SERVER_KEYSTORE_PATH_PEM) - .withOutboundKeystorePassword(TlsTestUtils.SERVER_KEYSTORE_PASSWORD) - .withRequireClientAuth(NOT_REQUIRED) - .withCipherSuites("TLS_RSA_WITH_AES_128_CBC_SHA") - .withSslContextFactory(sslContextFactory); + EncryptionOptions.ServerEncryptionOptions options = + new EncryptionOptions.ServerEncryptionOptions.Builder().withOutboundKeystore(TlsTestUtils.SERVER_KEYSTORE_PATH_PEM) + .withOutboundKeystorePassword(TlsTestUtils.SERVER_KEYSTORE_PASSWORD) + .withTrustStore(TlsTestUtils.SERVER_TRUSTSTORE_PEM_PATH) + .withKeyStore(TlsTestUtils.SERVER_KEYSTORE_PATH_PEM) + .withKeyStorePassword(TlsTestUtils.SERVER_KEYSTORE_PASSWORD) + .withRequireClientAuth(NOT_REQUIRED) + .withCipherSuites("TLS_RSA_WITH_AES_128_CBC_SHA") + .withSslContextFactory(sslContextFactory) + .build(); SslContext sslContext = SSLFactory.getOrCreateSslContext(options, REQUIRED, ISslContextFactory.SocketType.CLIENT, "test"); Assert.assertNotNull(sslContext); if (OpenSsl.isAvailable()) diff --git a/test/unit/org/apache/cassandra/security/SSLFactoryTest.java b/test/unit/org/apache/cassandra/security/SSLFactoryTest.java index ba46588686bc..7ac95173376c 100644 --- a/test/unit/org/apache/cassandra/security/SSLFactoryTest.java +++ b/test/unit/org/apache/cassandra/security/SSLFactoryTest.java @@ -1,21 +1,21 @@ /* -* Licensed to the Apache Software Foundation (ASF) under one -* or more contributor license agreements. See the NOTICE file -* distributed with this work for additional information -* regarding copyright ownership. The ASF licenses this file -* to you under the Apache License, Version 2.0 (the -* "License"); you may not use this file except in compliance -* with the License. You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, -* software distributed under the License is distributed on an -* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -* KIND, either express or implied. See the License for the -* specific language governing permissions and limitations -* under the License. -*/ + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ package org.apache.cassandra.security; import java.io.FileInputStream; @@ -46,18 +46,20 @@ import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.config.EncryptionOptions; import org.apache.cassandra.config.EncryptionOptions.ServerEncryptionOptions; +import org.apache.cassandra.config.EncryptionOptions.ServerEncryptionOptions.Builder; import org.apache.cassandra.config.ParameterizedClass; import org.apache.cassandra.io.util.File; import org.apache.cassandra.transport.TlsTestUtils; -import static org.apache.cassandra.config.EncryptionOptions.ClientAuth.NOT_REQUIRED; -import static org.apache.cassandra.config.EncryptionOptions.ClientAuth.REQUIRED; +import static org.apache.cassandra.config.EncryptionOptions.ClientEncryptionOptions.ClientAuth.NOT_REQUIRED; +import static org.apache.cassandra.config.EncryptionOptions.ClientEncryptionOptions.ClientAuth.REQUIRED; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; public class SSLFactoryTest { static final SelfSignedCertificate ssc; + static { DatabaseDescriptor.daemonInitialization(); @@ -77,33 +79,41 @@ public class SSLFactoryTest public void setup() { SSLFactory.clearSslContextCache(); - encryptionOptions = new ServerEncryptionOptions() - .withTrustStore(TlsTestUtils.SERVER_TRUSTSTORE_PATH) - .withTrustStorePassword(TlsTestUtils.SERVER_TRUSTSTORE_PASSWORD) - .withRequireClientAuth(NOT_REQUIRED) - .withCipherSuites("TLS_RSA_WITH_AES_128_CBC_SHA") - .withSslContextFactory(new ParameterizedClass(TestFileBasedSSLContextFactory.class.getName(), - new HashMap<>())); + encryptionOptions = new Builder().withTrustStore(TlsTestUtils.SERVER_TRUSTSTORE_PATH) + .withTrustStorePassword(TlsTestUtils.SERVER_TRUSTSTORE_PASSWORD) + .withRequireClientAuth(NOT_REQUIRED) + .withCipherSuites("TLS_RSA_WITH_AES_128_CBC_SHA") + .withSslContextFactory(new ParameterizedClass(TestFileBasedSSLContextFactory.class.getName(), + new HashMap<>())) + .build(); } - private ServerEncryptionOptions addKeystoreOptions(ServerEncryptionOptions options) + private Builder addKeystoreOptions(ServerEncryptionOptions options) { - return options.withKeyStore(TlsTestUtils.SERVER_KEYSTORE_PATH) - .withKeyStorePassword(TlsTestUtils.SERVER_KEYSTORE_PASSWORD) - .withOutboundKeystore(TlsTestUtils.SERVER_OUTBOUND_KEYSTORE_PATH) - .withOutboundKeystorePassword(TlsTestUtils.SERVER_OUTBOUND_KEYSTORE_PASSWORD); + EncryptionOptions.ServerEncryptionOptions.Builder builder = new EncryptionOptions.ServerEncryptionOptions.Builder(options); + + builder.withOutboundKeystorePassword(TlsTestUtils.SERVER_OUTBOUND_KEYSTORE_PASSWORD) + .withOutboundKeystore(TlsTestUtils.SERVER_OUTBOUND_KEYSTORE_PATH) + .withKeyStorePassword(TlsTestUtils.SERVER_KEYSTORE_PASSWORD) + .withKeyStore(TlsTestUtils.SERVER_KEYSTORE_PATH); + + return builder; } - private ServerEncryptionOptions addPEMKeystoreOptions(ServerEncryptionOptions options) + private Builder addPEMKeystoreOptions(ServerEncryptionOptions options) { ParameterizedClass sslContextFactoryClass = new ParameterizedClass("org.apache.cassandra.security.PEMBasedSslContextFactory", new HashMap<>()); - return options.withSslContextFactory(sslContextFactoryClass) - .withKeyStore(TlsTestUtils.SERVER_KEYSTORE_PATH_PEM) - .withKeyStorePassword(TlsTestUtils.SERVER_KEYSTORE_PASSWORD) - .withOutboundKeystore(TlsTestUtils.SERVER_KEYSTORE_PATH_PEM) - .withOutboundKeystorePassword(TlsTestUtils.SERVER_KEYSTORE_PASSWORD) - .withTrustStore(TlsTestUtils.SERVER_TRUSTSTORE_PEM_PATH); + EncryptionOptions.ServerEncryptionOptions.Builder builder = new EncryptionOptions.ServerEncryptionOptions.Builder(options); + + builder.withOutboundKeystore(TlsTestUtils.SERVER_KEYSTORE_PATH_PEM) + .withOutboundKeystorePassword(TlsTestUtils.SERVER_KEYSTORE_PASSWORD) + .withSslContextFactory(sslContextFactoryClass) + .withKeyStore(TlsTestUtils.SERVER_KEYSTORE_PATH_PEM) + .withKeyStorePassword(TlsTestUtils.SERVER_KEYSTORE_PASSWORD) + .withTrustStore(TlsTestUtils.SERVER_TRUSTSTORE_PEM_PATH); + + return builder; } @Test @@ -111,9 +121,13 @@ public void testSslContextReload_HappyPath() throws IOException, InterruptedExce { try { - ServerEncryptionOptions options = addKeystoreOptions(encryptionOptions) - .withInternodeEncryption(ServerEncryptionOptions.InternodeEncryption.all); - ServerEncryptionOptions legacyOptions = options.withOptional(false).withInternodeEncryption(ServerEncryptionOptions.InternodeEncryption.all); + Builder optionsBuilder = addKeystoreOptions(encryptionOptions) + .withInternodeEncryption(ServerEncryptionOptions.InternodeEncryption.all); + ServerEncryptionOptions options = optionsBuilder.build(); + ServerEncryptionOptions legacyOptions = optionsBuilder + .withInternodeEncryption(ServerEncryptionOptions.InternodeEncryption.all) + .withOptional(false) + .build(); options.sslContextFactoryInstance.initHotReloading(); legacyOptions.sslContextFactoryInstance.initHotReloading(); @@ -146,8 +160,9 @@ public void testSslContextReload_HappyPath() throws IOException, InterruptedExce public void testServerSocketShouldUseKeystore() throws IOException, CertificateException, KeyStoreException, NoSuchAlgorithmException, NoSuchFieldException, IllegalAccessException, ClassNotFoundException, NoSuchMethodException, InvocationTargetException { ServerEncryptionOptions options = addKeystoreOptions(encryptionOptions) - .withOutboundKeystore("dummyKeystore") - .withOutboundKeystorePassword("dummyPassword"); + .withOutboundKeystore("dummyKeystore") + .withOutboundKeystorePassword("dummyPassword") + .build(); // Server socket type should create a keystore with keystore & keystore password final OpenSslServerContext context = (OpenSslServerContext) SSLFactory.createNettySslContext(options, REQUIRED, ISslContextFactory.SocketType.SERVER); @@ -163,8 +178,9 @@ public void testServerSocketShouldUseKeystore() throws IOException, CertificateE public void testClientSocketShouldUseOutboundKeystore() throws IOException, CertificateException, KeyStoreException, NoSuchAlgorithmException, NoSuchFieldException, ClassNotFoundException, InvocationTargetException, IllegalAccessException, NoSuchMethodException { ServerEncryptionOptions options = addKeystoreOptions(encryptionOptions) - .withKeyStore("dummyKeystore") - .withKeyStorePassword("dummyPassword"); + .withKeyStore("dummyKeystore") + .withKeyStorePassword("dummyPassword") + .build(); // Client socket type should create a keystore with outbound Keystore & outbound password final OpenSslClientContext context = (OpenSslClientContext) SSLFactory.createNettySslContext(options, REQUIRED, ISslContextFactory.SocketType.CLIENT); @@ -181,10 +197,14 @@ public void testPEMSslContextReload_HappyPath() throws IOException { try { - ServerEncryptionOptions options = addPEMKeystoreOptions(encryptionOptions) - .withInternodeEncryption(ServerEncryptionOptions.InternodeEncryption.dc); + Builder optionsBuilder = addPEMKeystoreOptions(encryptionOptions); + ServerEncryptionOptions options = optionsBuilder.withInternodeEncryption(ServerEncryptionOptions.InternodeEncryption.dc) + .build(); // emulate InboundSockets and share the cert but with different options, no extra hot reloading init - ServerEncryptionOptions legacyOptions = options.withOptional(false).withInternodeEncryption(ServerEncryptionOptions.InternodeEncryption.all); + ServerEncryptionOptions legacyOptions = optionsBuilder + .withInternodeEncryption(ServerEncryptionOptions.InternodeEncryption.all) + .withOptional(false) + .build(); options.sslContextFactoryInstance.initHotReloading(); legacyOptions.sslContextFactoryInstance.initHotReloading(); @@ -217,8 +237,9 @@ public void testPEMSslContextReload_HappyPath() throws IOException public void testSslFactorySslInit_BadPassword_ThrowsException() throws IOException { ServerEncryptionOptions options = addKeystoreOptions(encryptionOptions) - .withKeyStorePassword("bad password") - .withInternodeEncryption(ServerEncryptionOptions.InternodeEncryption.all); + .withInternodeEncryption(ServerEncryptionOptions.InternodeEncryption.all) + .withKeyStorePassword("bad password") + .build(); SSLFactory.validateSslContext("testSslFactorySslInit_BadPassword_ThrowsException", options, NOT_REQUIRED, true); } @@ -228,13 +249,17 @@ public void testSslFactoryHotReload_BadPassword_DoesNotClearExistingSslContext() { try { - ServerEncryptionOptions options = addKeystoreOptions(encryptionOptions); + Builder optionsBuilder = addKeystoreOptions(encryptionOptions); + ServerEncryptionOptions options = optionsBuilder.build(); // emulate InboundSockets and share the cert but with different options, no extra hot reloading init - ServerEncryptionOptions legacyOptions = options.withOptional(false).withInternodeEncryption(ServerEncryptionOptions.InternodeEncryption.all); + ServerEncryptionOptions legacyOptions = optionsBuilder + .withInternodeEncryption(ServerEncryptionOptions.InternodeEncryption.all) + .withOptional(false) + .build(); File testKeystoreFile = new File(options.keystore + ".test"); FileUtils.copyFile(new File(options.keystore).toJavaIOFile(), testKeystoreFile.toJavaIOFile()); - options = options.withKeyStore(testKeystoreFile.path()); + options = new Builder(options).withKeyStore(testKeystoreFile.path()).build(); SSLFactory.initHotReloading(options, options, true); // deliberately not initializing with legacyOptions to match InboundSockets.addBindings @@ -261,11 +286,12 @@ public void testSslFactoryHotReload_CorruptOrNonExistentFile_DoesNotClearExistin { try { - ServerEncryptionOptions options = addKeystoreOptions(encryptionOptions); + Builder optionsBuilder = addKeystoreOptions(encryptionOptions); + ServerEncryptionOptions options = optionsBuilder.build(); File testKeystoreFile = new File(options.keystore + ".test"); FileUtils.copyFile(new File(options.keystore).toJavaIOFile(), testKeystoreFile.toJavaIOFile()); - options = options.withKeyStore(testKeystoreFile.path()); + options = optionsBuilder.withKeyStore(testKeystoreFile.path()).build(); SSLFactory.initHotReloading(options, options, true); @@ -294,8 +320,10 @@ public void testSslFactoryHotReload_CorruptOrNonExistentFile_DoesNotClearExistin @Test public void getSslContext_ParamChanges() throws IOException { - ServerEncryptionOptions options = addKeystoreOptions(encryptionOptions) - .withCipherSuites("TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256"); + Builder optionsBuilder = addKeystoreOptions(encryptionOptions); + ServerEncryptionOptions options = optionsBuilder + .withCipherSuites("TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256") + .build(); SslContext ctx1 = SSLFactory.getOrCreateSslContext(options, REQUIRED, ISslContextFactory.SocketType.SERVER, "test"); @@ -303,7 +331,7 @@ public void getSslContext_ParamChanges() throws IOException Assert.assertTrue(ctx1.isServer()); Assert.assertEquals(ctx1.cipherSuites(), options.cipher_suites); - options = options.withCipherSuites("TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256"); + options = optionsBuilder.withCipherSuites("TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256").build(); SslContext ctx2 = SSLFactory.getOrCreateSslContext(options, REQUIRED, ISslContextFactory.SocketType.CLIENT, "test"); @@ -313,30 +341,33 @@ public void getSslContext_ParamChanges() throws IOException } @Test - public void testCacheKeyEqualityForCustomSslContextFactory() { + public void testCacheKeyEqualityForCustomSslContextFactory() + { - Map parameters1 = new HashMap<>(); + Map parameters1 = new HashMap<>(); parameters1.put("key1", "value1"); parameters1.put("key2", "value2"); - EncryptionOptions encryptionOptions1 = - new EncryptionOptions() + EncryptionOptions.ClientEncryptionOptions encryptionOptions1 = + new EncryptionOptions.ClientEncryptionOptions.Builder() .withSslContextFactory(new ParameterizedClass(DummySslContextFactoryImpl.class.getName(), parameters1)) .withProtocol("TLSv1.1") .withRequireClientAuth(REQUIRED) - .withRequireEndpointVerification(false); + .withRequireEndpointVerification(false) + .build(); SSLFactory.CacheKey cacheKey1 = new SSLFactory.CacheKey(encryptionOptions1, ISslContextFactory.SocketType.SERVER, "test" ); - Map parameters2 = new HashMap<>(); + Map parameters2 = new HashMap<>(); parameters2.put("key1", "value1"); parameters2.put("key2", "value2"); - EncryptionOptions encryptionOptions2 = - new EncryptionOptions() + EncryptionOptions.ClientEncryptionOptions encryptionOptions2 = + new EncryptionOptions.ClientEncryptionOptions.Builder() .withSslContextFactory(new ParameterizedClass(DummySslContextFactoryImpl.class.getName(), parameters2)) .withProtocol("TLSv1.1") .withRequireClientAuth(REQUIRED) - .withRequireEndpointVerification(false); + .withRequireEndpointVerification(false) + .build(); SSLFactory.CacheKey cacheKey2 = new SSLFactory.CacheKey(encryptionOptions2, ISslContextFactory.SocketType.SERVER, "test" ); @@ -345,26 +376,29 @@ public void testCacheKeyEqualityForCustomSslContextFactory() { } @Test - public void testCacheKeyInequalityForCustomSslContextFactory() { + public void testCacheKeyInequalityForCustomSslContextFactory() + { - Map parameters1 = new HashMap<>(); + Map parameters1 = new HashMap<>(); parameters1.put("key1", "value11"); parameters1.put("key2", "value12"); - EncryptionOptions encryptionOptions1 = - new EncryptionOptions() + EncryptionOptions.ClientEncryptionOptions encryptionOptions1 = + new EncryptionOptions.ClientEncryptionOptions.Builder() .withSslContextFactory(new ParameterizedClass(DummySslContextFactoryImpl.class.getName(), parameters1)) - .withProtocol("TLSv1.1"); + .withProtocol("TLSv1.1") + .build(); SSLFactory.CacheKey cacheKey1 = new SSLFactory.CacheKey(encryptionOptions1, ISslContextFactory.SocketType.SERVER, "test" ); - Map parameters2 = new HashMap<>(); + Map parameters2 = new HashMap<>(); parameters2.put("key1", "value21"); parameters2.put("key2", "value22"); - EncryptionOptions encryptionOptions2 = - new EncryptionOptions() + EncryptionOptions.ClientEncryptionOptions encryptionOptions2 = + new EncryptionOptions.ClientEncryptionOptions.Builder() .withSslContextFactory(new ParameterizedClass(DummySslContextFactoryImpl.class.getName(), parameters2)) - .withProtocol("TLSv1.1"); + .withProtocol("TLSv1.1") + .build(); SSLFactory.CacheKey cacheKey2 = new SSLFactory.CacheKey(encryptionOptions2, ISslContextFactory.SocketType.SERVER, "test" ); @@ -372,7 +406,8 @@ public void testCacheKeyInequalityForCustomSslContextFactory() { Assert.assertNotEquals(cacheKey1, cacheKey2); } - public static class TestFileBasedSSLContextFactory extends FileBasedSslContextFactory { + public static class TestFileBasedSSLContextFactory extends FileBasedSslContextFactory + { public TestFileBasedSSLContextFactory(Map parameters) { super(parameters); diff --git a/test/unit/org/apache/cassandra/serializers/SerializationUtils.java b/test/unit/org/apache/cassandra/serializers/SerializationUtils.java index b88b56f003de..da37cb2a5a6d 100644 --- a/test/unit/org/apache/cassandra/serializers/SerializationUtils.java +++ b/test/unit/org/apache/cassandra/serializers/SerializationUtils.java @@ -49,11 +49,6 @@ public static T cycleSerialization(T src, IVersionedSerializer serializer } } - public static T cycleSerialization(T src, IVersionedSerializer serializer) - { - return cycleSerialization(src, serializer, MessagingService.current_version); - } - public static void assertSerializationCycle(T src, IVersionedSerializer serializer, int version) { T dst = cycleSerialization(src, serializer, version); diff --git a/test/unit/org/apache/cassandra/service/AbstractFilesystemOwnershipCheckTest.java b/test/unit/org/apache/cassandra/service/AbstractFilesystemOwnershipCheckTest.java index cae7de060b62..7c0d78dbd543 100644 --- a/test/unit/org/apache/cassandra/service/AbstractFilesystemOwnershipCheckTest.java +++ b/test/unit/org/apache/cassandra/service/AbstractFilesystemOwnershipCheckTest.java @@ -33,14 +33,15 @@ import org.junit.Ignore; import org.junit.Test; -import org.apache.cassandra.config.CassandraRelevantProperties; import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.config.StartupChecksOptions; import org.apache.cassandra.distributed.shared.WithProperties; import org.apache.cassandra.exceptions.StartupException; import org.apache.cassandra.io.util.File; -import static org.apache.cassandra.service.FileSystemOwnershipCheck.DEFAULT_FS_OWNERSHIP_FILENAME; +import static org.apache.cassandra.config.CassandraRelevantProperties.FILE_SYSTEM_CHECK_ENABLE; +import static org.apache.cassandra.config.CassandraRelevantProperties.FILE_SYSTEM_CHECK_OWNERSHIP_FILENAME; +import static org.apache.cassandra.config.CassandraRelevantProperties.FILE_SYSTEM_CHECK_OWNERSHIP_TOKEN; import static org.apache.cassandra.service.FileSystemOwnershipCheck.ERROR_PREFIX; import static org.apache.cassandra.service.FileSystemOwnershipCheck.INCONSISTENT_FILES_FOUND; import static org.apache.cassandra.service.FileSystemOwnershipCheck.INVALID_FILE_COUNT; @@ -69,15 +70,20 @@ public abstract class AbstractFilesystemOwnershipCheckTest static WithProperties properties; + protected static String ownershipCheckFileName() + { + return FILE_SYSTEM_CHECK_OWNERSHIP_FILENAME.getDefaultValue(); + } + protected void setup() { cleanTempDir(); tempDir = new File(com.google.common.io.Files.createTempDir()); token = makeRandomString(10); properties = new WithProperties(); - System.clearProperty(CassandraRelevantProperties.FILE_SYSTEM_CHECK_OWNERSHIP_FILENAME.getKey()); - System.clearProperty(CassandraRelevantProperties.FILE_SYSTEM_CHECK_OWNERSHIP_TOKEN.getKey()); - System.clearProperty(CassandraRelevantProperties.FILE_SYSTEM_CHECK_ENABLE.getKey()); + System.clearProperty(FILE_SYSTEM_CHECK_OWNERSHIP_FILENAME.getKey()); + System.clearProperty(FILE_SYSTEM_CHECK_OWNERSHIP_TOKEN.getKey()); + System.clearProperty(FILE_SYSTEM_CHECK_ENABLE.getKey()); } static File writeFile(File dir, String filename, Properties props) throws IOException @@ -118,7 +124,7 @@ private static Properties makeProperties(int version, int volumeCount, String to private static File writeFile(File dir, int volumeCount, String token) throws IOException { - return AbstractFilesystemOwnershipCheckTest.writeFile(dir, DEFAULT_FS_OWNERSHIP_FILENAME, 1, volumeCount, token); + return AbstractFilesystemOwnershipCheckTest.writeFile(dir, ownershipCheckFileName(), 1, volumeCount, token); } private static File writeFile(File dir, final String filename, int version, int volumeCount, String token) @@ -201,7 +207,7 @@ public void skipCheckDisabledIfSystemPropertyIsEmpty() throws Exception { // no exceptions thrown from the supplier because the check is skipped options.disable(check_filesystem_ownership); - System.clearProperty(CassandraRelevantProperties.FILE_SYSTEM_CHECK_ENABLE.getKey()); + System.clearProperty(FILE_SYSTEM_CHECK_ENABLE.getKey()); AbstractFilesystemOwnershipCheckTest.checker(() -> { throw new RuntimeException("FAIL"); }).execute(options); } @@ -210,23 +216,23 @@ public void skipCheckDisabledIfSystemPropertyIsFalseButOptionsEnabled() throws E { // no exceptions thrown from the supplier because the check is skipped options.enable(check_filesystem_ownership); - CassandraRelevantProperties.FILE_SYSTEM_CHECK_ENABLE.setBoolean(false); + FILE_SYSTEM_CHECK_ENABLE.setBoolean(false); AbstractFilesystemOwnershipCheckTest.checker(() -> { throw new RuntimeException("FAIL"); }).execute(options); } @Test public void checkEnabledButClusterPropertyIsEmpty() { - CassandraRelevantProperties.FILE_SYSTEM_CHECK_OWNERSHIP_TOKEN.setString(""); - AbstractFilesystemOwnershipCheckTest.executeAndFail(AbstractFilesystemOwnershipCheckTest.checker(tempDir), options, MISSING_PROPERTY, CassandraRelevantProperties.FILE_SYSTEM_CHECK_OWNERSHIP_TOKEN.getKey()); + FILE_SYSTEM_CHECK_OWNERSHIP_TOKEN.setString(""); + AbstractFilesystemOwnershipCheckTest.executeAndFail(checker(tempDir), options, MISSING_PROPERTY, FILE_SYSTEM_CHECK_OWNERSHIP_TOKEN.getKey()); } @Test public void checkEnabledButClusterPropertyIsUnset() { Assume.assumeFalse(options.getConfig(check_filesystem_ownership).containsKey("ownership_token")); - CassandraRelevantProperties.FILE_SYSTEM_CHECK_OWNERSHIP_TOKEN.clearValue(); // checkstyle: suppress nearby 'clearValueSystemPropertyUsage' - AbstractFilesystemOwnershipCheckTest.executeAndFail(AbstractFilesystemOwnershipCheckTest.checker(tempDir), options, MISSING_PROPERTY, CassandraRelevantProperties.FILE_SYSTEM_CHECK_OWNERSHIP_TOKEN.getKey()); + FILE_SYSTEM_CHECK_OWNERSHIP_TOKEN.clearValue(); // checkstyle: suppress nearby 'clearValueSystemPropertyUsage' + AbstractFilesystemOwnershipCheckTest.executeAndFail(checker(tempDir), options, MISSING_PROPERTY, FILE_SYSTEM_CHECK_OWNERSHIP_TOKEN.getKey()); } // tests for presence/absence of files in dirs @@ -319,7 +325,7 @@ public void propsFileUnreadable() throws Exception public void propsFileIllegalContent() throws Exception { File leafDir = AbstractFilesystemOwnershipCheckTest.mkdirs(tempDir, "cassandra/data"); - File propsFile = new File(leafDir, DEFAULT_FS_OWNERSHIP_FILENAME); //checkstyle: permit this instantiation + File propsFile = new File(leafDir, ownershipCheckFileName()); //checkstyle: permit this instantiation assertTrue(propsFile.createFileIfNotExists()); try (OutputStream os = Files.newOutputStream(propsFile.toPath())) { @@ -360,9 +366,9 @@ public void overrideFilename() throws Exception { File leafDir = AbstractFilesystemOwnershipCheckTest.mkdirs(tempDir, "cassandra/data"); writeFile(leafDir.parent(), "other_file", AbstractFilesystemOwnershipCheckTest.makeProperties(1, 1, token)); - AbstractFilesystemOwnershipCheckTest.executeAndFail(AbstractFilesystemOwnershipCheckTest.checker(leafDir), options, NO_OWNERSHIP_FILE, quote(leafDir.absolutePath())); - CassandraRelevantProperties.FILE_SYSTEM_CHECK_OWNERSHIP_FILENAME.setString("other_file"); - AbstractFilesystemOwnershipCheckTest.checker(leafDir).execute(options); + AbstractFilesystemOwnershipCheckTest.executeAndFail(checker(leafDir), options, NO_OWNERSHIP_FILE, quote(leafDir.absolutePath())); + FILE_SYSTEM_CHECK_OWNERSHIP_FILENAME.setString("other_file"); + checker(leafDir).execute(options); } // check consistency between discovered files @@ -404,11 +410,11 @@ public void differentExpectedCountsFoundInTrees() throws Exception public void emptyPropertiesFile() throws Exception { File leafDir = AbstractFilesystemOwnershipCheckTest.mkdirs(tempDir, "cassandra/data"); - writeFile(leafDir.parent(), DEFAULT_FS_OWNERSHIP_FILENAME, new Properties()); - AbstractFilesystemOwnershipCheckTest.executeAndFail(AbstractFilesystemOwnershipCheckTest.checker(leafDir), + writeFile(leafDir.parent(), ownershipCheckFileName(), new Properties()); + AbstractFilesystemOwnershipCheckTest.executeAndFail(checker(leafDir), options, String.format(INVALID_PROPERTY_VALUE, VERSION), - leafDir.parent().toPath().resolve(DEFAULT_FS_OWNERSHIP_FILENAME)); + leafDir.parent().toPath().resolve(ownershipCheckFileName())); } @Test @@ -418,11 +424,11 @@ public void missingVersionProp() throws Exception p.setProperty(VOLUME_COUNT, "1"); p.setProperty(TOKEN, "foo"); File leafDir = AbstractFilesystemOwnershipCheckTest.mkdirs(tempDir, "cassandra/data"); - writeFile(leafDir.parent(), DEFAULT_FS_OWNERSHIP_FILENAME, p); - AbstractFilesystemOwnershipCheckTest.executeAndFail(AbstractFilesystemOwnershipCheckTest.checker(leafDir), + writeFile(leafDir.parent(), ownershipCheckFileName(), p); + AbstractFilesystemOwnershipCheckTest.executeAndFail(checker(leafDir), options, String.format(INVALID_PROPERTY_VALUE, VERSION), - leafDir.parent().toPath().resolve(DEFAULT_FS_OWNERSHIP_FILENAME)); + leafDir.parent().toPath().resolve(ownershipCheckFileName())); } @Test @@ -431,11 +437,11 @@ public void nonNumericVersionProp() throws Exception Properties p = new Properties(); p.setProperty(VERSION, "abc"); File leafDir = AbstractFilesystemOwnershipCheckTest.mkdirs(tempDir, "cassandra/data"); - writeFile(leafDir.parent(), DEFAULT_FS_OWNERSHIP_FILENAME, p); - AbstractFilesystemOwnershipCheckTest.executeAndFail(AbstractFilesystemOwnershipCheckTest.checker(leafDir), + writeFile(leafDir.parent(), ownershipCheckFileName(), p); + AbstractFilesystemOwnershipCheckTest.executeAndFail(checker(leafDir), options, String.format(INVALID_PROPERTY_VALUE, VERSION), - leafDir.parent().toPath().resolve(DEFAULT_FS_OWNERSHIP_FILENAME)); + leafDir.parent().toPath().resolve(ownershipCheckFileName())); } @Test @@ -444,11 +450,11 @@ public void unsupportedVersionProp() throws Exception Properties p = new Properties(); p.setProperty(VERSION, "99"); File leafDir = AbstractFilesystemOwnershipCheckTest.mkdirs(tempDir, "cassandra/data"); - writeFile(leafDir.parent(), DEFAULT_FS_OWNERSHIP_FILENAME, p); - AbstractFilesystemOwnershipCheckTest.executeAndFail(AbstractFilesystemOwnershipCheckTest.checker(leafDir), + writeFile(leafDir.parent(), ownershipCheckFileName(), p); + AbstractFilesystemOwnershipCheckTest.executeAndFail(checker(leafDir), options, String.format(UNSUPPORTED_VERSION, "99"), - leafDir.parent().toPath().resolve(DEFAULT_FS_OWNERSHIP_FILENAME)); + leafDir.parent().toPath().resolve(ownershipCheckFileName())); } @Test @@ -458,11 +464,11 @@ public void missingVolumeCountProp() throws Exception p.setProperty(VERSION, "1"); p.setProperty(TOKEN, token); File leafDir = AbstractFilesystemOwnershipCheckTest.mkdirs(tempDir, "cassandra/data"); - writeFile(leafDir.parent(), DEFAULT_FS_OWNERSHIP_FILENAME, p); - AbstractFilesystemOwnershipCheckTest.executeAndFail(AbstractFilesystemOwnershipCheckTest.checker(leafDir), + writeFile(leafDir.parent(), ownershipCheckFileName(), p); + AbstractFilesystemOwnershipCheckTest.executeAndFail(checker(leafDir), options, String.format(INVALID_PROPERTY_VALUE, VOLUME_COUNT), - leafDir.parent().toPath().resolve(DEFAULT_FS_OWNERSHIP_FILENAME)); + leafDir.parent().toPath().resolve(ownershipCheckFileName())); } @Test @@ -473,11 +479,11 @@ public void nonNumericVolumeCountProp() throws Exception p.setProperty(VOLUME_COUNT, "bar"); p.setProperty(TOKEN, token); File leafDir = AbstractFilesystemOwnershipCheckTest.mkdirs(tempDir, "cassandra/data"); - writeFile(leafDir.parent(), DEFAULT_FS_OWNERSHIP_FILENAME, p); - AbstractFilesystemOwnershipCheckTest.executeAndFail(AbstractFilesystemOwnershipCheckTest.checker(leafDir), + writeFile(leafDir.parent(), ownershipCheckFileName(), p); + AbstractFilesystemOwnershipCheckTest.executeAndFail(checker(leafDir), options, String.format(INVALID_PROPERTY_VALUE, VOLUME_COUNT), - leafDir.parent().toPath().resolve(DEFAULT_FS_OWNERSHIP_FILENAME)); + leafDir.parent().toPath().resolve(ownershipCheckFileName())); } @Test @@ -487,11 +493,11 @@ public void missingTokenProp() throws Exception p.setProperty(VERSION, "1"); p.setProperty(VOLUME_COUNT, "1"); File leafDir = AbstractFilesystemOwnershipCheckTest.mkdirs(tempDir, "cassandra/data"); - writeFile(leafDir.parent(), DEFAULT_FS_OWNERSHIP_FILENAME, p); - AbstractFilesystemOwnershipCheckTest.executeAndFail(AbstractFilesystemOwnershipCheckTest.checker(leafDir), + writeFile(leafDir.parent(), ownershipCheckFileName(), p); + AbstractFilesystemOwnershipCheckTest.executeAndFail(checker(leafDir), options, String.format(INVALID_PROPERTY_VALUE, TOKEN), - leafDir.parent().toPath().resolve(DEFAULT_FS_OWNERSHIP_FILENAME)); + leafDir.parent().toPath().resolve(ownershipCheckFileName())); } @Test @@ -502,7 +508,7 @@ public void emptyTokenProp() throws Exception AbstractFilesystemOwnershipCheckTest.executeAndFail(AbstractFilesystemOwnershipCheckTest.checker(leafDir), options, String.format(INVALID_PROPERTY_VALUE, TOKEN), - leafDir.parent().toPath().resolve(DEFAULT_FS_OWNERSHIP_FILENAME)); + leafDir.parent().toPath().resolve(ownershipCheckFileName())); } @Test @@ -514,7 +520,7 @@ public void mismatchingTokenProp() throws Exception AbstractFilesystemOwnershipCheckTest.executeAndFail(AbstractFilesystemOwnershipCheckTest.checker(leafDir), options, MISMATCHING_TOKEN, - leafDir.parent().toPath().resolve(DEFAULT_FS_OWNERSHIP_FILENAME)); + leafDir.parent().toPath().resolve(ownershipCheckFileName())); } // Validate volume_count prop values match number of files found diff --git a/test/unit/org/apache/cassandra/service/ActiveRepairServiceTest.java b/test/unit/org/apache/cassandra/service/ActiveRepairServiceTest.java index c59163ae0bc3..ee4fa293a25e 100644 --- a/test/unit/org/apache/cassandra/service/ActiveRepairServiceTest.java +++ b/test/unit/org/apache/cassandra/service/ActiveRepairServiceTest.java @@ -1,21 +1,21 @@ /* -* Licensed to the Apache Software Foundation (ASF) under one -* or more contributor license agreements. See the NOTICE file -* distributed with this work for additional information -* regarding copyright ownership. The ASF licenses this file -* to you under the Apache License, Version 2.0 (the -* "License"); you may not use this file except in compliance -* with the License. You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, -* software distributed under the License is distributed on an -* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -* KIND, either express or implied. See the License for the -* specific language governing permissions and limitations -* under the License. -*/ + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ package org.apache.cassandra.service; import java.net.UnknownHostException; @@ -61,6 +61,8 @@ import org.apache.cassandra.locator.Replica; import org.apache.cassandra.repair.messages.RepairOption; import org.apache.cassandra.schema.KeyspaceParams; +import org.apache.cassandra.service.disk.usage.DiskUsageMonitor; +import org.apache.cassandra.service.snapshot.SnapshotManager; import org.apache.cassandra.service.snapshot.TableSnapshot; import org.apache.cassandra.streaming.PreviewKind; import org.apache.cassandra.tcm.ClusterMetadata; @@ -72,8 +74,9 @@ import org.apache.cassandra.utils.TimeUUID; import org.apache.cassandra.utils.concurrent.Condition; import org.apache.cassandra.utils.concurrent.Refs; +import org.mockito.Mock; -import static org.apache.cassandra.ServerTestUtils.*; +import static org.apache.cassandra.ServerTestUtils.resetCMS; import static org.apache.cassandra.config.CassandraRelevantProperties.ORG_APACHE_CASSANDRA_DISABLE_MBEAN_REGISTRATION; import static org.apache.cassandra.repair.messages.RepairOption.DATACENTERS_KEY; import static org.apache.cassandra.repair.messages.RepairOption.FORCE_REPAIR_KEY; @@ -81,12 +84,15 @@ import static org.apache.cassandra.repair.messages.RepairOption.INCREMENTAL_KEY; import static org.apache.cassandra.repair.messages.RepairOption.RANGES_KEY; import static org.apache.cassandra.service.ActiveRepairService.UNREPAIRED_SSTABLE; +import static org.apache.cassandra.service.ActiveRepairService.instance; import static org.apache.cassandra.utils.TimeUUID.Generator.nextTimeUUID; import static org.apache.cassandra.utils.concurrent.Condition.newOneTimeCondition; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertNull; import static org.junit.Assert.fail; +import static org.mockito.Mockito.when; +import static org.mockito.MockitoAnnotations.initMocks; public class ActiveRepairServiceTest { @@ -98,6 +104,8 @@ public class ActiveRepairServiceTest public String cfname; public ColumnFamilyStore store; public static InetAddressAndPort LOCAL, REMOTE; + @Mock + public DiskUsageMonitor diskUsageMonitor; @BeforeClass public static void defineSchema() throws ConfigurationException, UnknownHostException @@ -122,6 +130,7 @@ public void prepare() throws Exception NodeId remote = Register.register(new NodeAddresses(REMOTE)); UnsafeJoin.unsafeJoin(local, Collections.singleton(DatabaseDescriptor.getPartitioner().getRandomToken())); UnsafeJoin.unsafeJoin(remote, Collections.singleton(DatabaseDescriptor.getPartitioner().getMinimumToken())); + initMocks(this); } @Test @@ -220,12 +229,12 @@ public void testGetNeighborsTimesTwoInSpecifiedHosts() throws Throwable } expected.remove(FBUtilities.getBroadcastAddressAndPort()); - Collection hosts = Arrays.asList(FBUtilities.getBroadcastAddressAndPort().getHostAddressAndPort(),expected.get(0).getHostAddressAndPort()); + Collection hosts = Arrays.asList(FBUtilities.getBroadcastAddressAndPort().getHostAddressAndPort(), expected.get(0).getHostAddressAndPort()); Iterable> ranges = StorageService.instance.getLocalReplicas(KEYSPACE5).ranges(); assertEquals(expected.get(0), ActiveRepairService.instance().getNeighbors(KEYSPACE5, ranges, - ranges.iterator().next(), - null, hosts).endpoints().iterator().next()); + ranges.iterator().next(), + null, hosts).endpoints().iterator().next()); } @Test(expected = IllegalArgumentException.class) @@ -238,7 +247,6 @@ public void testGetNeighborsSpecifiedHostsWithNoLocalHost() throws Throwable ActiveRepairService.instance().getNeighbors(KEYSPACE5, ranges, ranges.iterator().next(), null, hosts); } - @Test public void testParentRepairStatus() throws Throwable { @@ -256,7 +264,6 @@ public void testParentRepairStatus() throws Throwable List failed = StorageService.instance.getParentRepairStatus(3); assertNotNull(failed); assertEquals(ActiveRepairService.ParentRepairStatus.FAILED, ActiveRepairService.ParentRepairStatus.valueOf(failed.get(0))); - } Set addTokens(int max) throws Throwable @@ -300,6 +307,26 @@ public void testSnapshotAddSSTables() throws Exception } } + @Test + public void testForcedSnapshot() throws Throwable + { + ColumnFamilyStore store = prepareColumnFamilyStore(); + TimeUUID prsId = nextTimeUUID(); + Collection> ranges = Collections.singleton(new Range<>(store.getPartitioner().getMinimumToken(), store.getPartitioner().getMinimumToken())); + ActiveRepairService.instance().registerParentRepairSession(prsId, FBUtilities.getBroadcastAddressAndPort(), Collections.singletonList(store), + ranges, true, System.currentTimeMillis(), false, PreviewKind.NONE); + + // snapshot twice, would not be possible before CASSANDRA-20490 + store.getRepairManager().snapshot(prsId.toString(), ranges, true); + store.getRepairManager().snapshot(prsId.toString(), ranges, true); + + List snapshots = SnapshotManager.instance.getSnapshots(p -> p.getKeyspaceName().equals(store.getKeyspaceName()) && p.getTableName().equals(store.getTableName())); + Assert.assertEquals(1, snapshots.size()); + TableSnapshot snapshot = snapshots.get(0); + Assert.assertTrue(snapshot.isEphemeral()); + Assert.assertEquals(prsId.toString(), snapshot.getTag()); + } + private ColumnFamilyStore prepareColumnFamilyStore() { Keyspace keyspace = Keyspace.open(KEYSPACE5); @@ -307,6 +334,7 @@ private ColumnFamilyStore prepareColumnFamilyStore() store.truncateBlocking(); store.disableAutoCompaction(); createSSTables(store, 10); + SnapshotManager.instance.clearAllSnapshots(); return store; } @@ -331,10 +359,10 @@ private static RepairOption opts(String... params) { assert params.length % 2 == 0 : "unbalanced key value pairs"; Map opt = new HashMap<>(); - for (int i=0; i<(params.length >> 1); i++) + for (int i = 0; i < (params.length >> 1); i++) { int idx = i << 1; - opt.put(params[idx], params[idx+1]); + opt.put(params[idx], params[idx + 1]); } return RepairOption.parse(opt, DatabaseDescriptor.getPartitioner()); } @@ -354,19 +382,19 @@ public void repairedAt() throws Exception Assert.assertNotEquals(UNREPAIRED_SSTABLE, ActiveRepairService.instance().getRepairedAt(opts(INCREMENTAL_KEY, b2s(true)), false)); // subrange incremental repair Assert.assertNotEquals(UNREPAIRED_SSTABLE, ActiveRepairService.instance().getRepairedAt(opts(INCREMENTAL_KEY, b2s(true), - RANGES_KEY, "1:2"), false)); + RANGES_KEY, "1:2"), false)); // hosts incremental repair Assert.assertEquals(UNREPAIRED_SSTABLE, ActiveRepairService.instance().getRepairedAt(opts(INCREMENTAL_KEY, b2s(true), - HOSTS_KEY, "127.0.0.1"), false)); + HOSTS_KEY, "127.0.0.1"), false)); // dc incremental repair Assert.assertEquals(UNREPAIRED_SSTABLE, ActiveRepairService.instance().getRepairedAt(opts(INCREMENTAL_KEY, b2s(true), - DATACENTERS_KEY, "DC2"), false)); + DATACENTERS_KEY, "DC2"), false)); // forced incremental repair Assert.assertNotEquals(UNREPAIRED_SSTABLE, ActiveRepairService.instance().getRepairedAt(opts(INCREMENTAL_KEY, b2s(true), - FORCE_REPAIR_KEY, b2s(true)), false)); + FORCE_REPAIR_KEY, b2s(true)), false)); Assert.assertEquals(UNREPAIRED_SSTABLE, ActiveRepairService.instance().getRepairedAt(opts(INCREMENTAL_KEY, b2s(true), - FORCE_REPAIR_KEY, b2s(true)), true)); + FORCE_REPAIR_KEY, b2s(true)), true)); // full repair Assert.assertEquals(UNREPAIRED_SSTABLE, ActiveRepairService.instance().getRepairedAt(opts(INCREMENTAL_KEY, b2s(false)), false)); @@ -412,7 +440,8 @@ public void testRejectWhenPoolFullStrategy() throws InterruptedException // Submission is unblocked Thread.sleep(250); - validationExecutor.submit(() -> {}); + validationExecutor.submit(() -> { + }); } finally { @@ -449,8 +478,8 @@ public void testQueueWhenPoolFullStrategy() throws InterruptedException allSubmitted.await(TASK_SECONDS + 1, TimeUnit.SECONDS); // Give the tasks we expect to execute immediately chance to be scheduled - Util.spinAssertEquals(2 , ((ExecutorPlus) validationExecutor)::getActiveTaskCount, 1); - Util.spinAssertEquals(3 , ((ExecutorPlus) validationExecutor)::getPendingTaskCount, 1); + Util.spinAssertEquals(2, ((ExecutorPlus) validationExecutor)::getActiveTaskCount, 1); + Util.spinAssertEquals(3, ((ExecutorPlus) validationExecutor)::getPendingTaskCount, 1); // verify that we've reached a steady state with 2 threads actively processing and 3 queued tasks Assert.assertEquals(2, ((ExecutorPlus) validationExecutor).getActiveTaskCount()); @@ -489,7 +518,9 @@ public void testRepairSessionSpaceInMiB() activeRepairService.setRepairSessionSpaceInMiB(0); fail("Should have received an IllegalArgumentException for depth of 0"); } - catch (IllegalArgumentException ignored) { } + catch (IllegalArgumentException ignored) + { + } Assert.assertEquals(10, activeRepairService.getRepairSessionSpaceInMiB()); } @@ -499,6 +530,40 @@ public void testRepairSessionSpaceInMiB() } } + public void testVerifyDiskHeadroomThresholdFullRepair() + { + Assert.assertTrue(ActiveRepairService.verifyDiskHeadroomThreshold(TimeUUID.maxAtUnixMillis(0), PreviewKind.NONE, false)); + } + + @Test + public void testVerifyDiskHeadroomThresholdDiskFull() + { + DiskUsageMonitor.instance = diskUsageMonitor; + when(diskUsageMonitor.getDiskUsage()).thenReturn(1.0); + DatabaseDescriptor.setIncrementalRepairDiskHeadroomRejectRatio(1.0); + + Assert.assertFalse(ActiveRepairService.verifyDiskHeadroomThreshold(TimeUUID.maxAtUnixMillis(0), PreviewKind.NONE, true)); + } + + @Test + public void testVerifyDiskHeadroomThresholdSufficientDisk() + { + DiskUsageMonitor.instance = diskUsageMonitor; + when(diskUsageMonitor.getDiskUsage()).thenReturn(0.0); + DatabaseDescriptor.setIncrementalRepairDiskHeadroomRejectRatio(0.0); + + Assert.assertTrue(ActiveRepairService.verifyDiskHeadroomThreshold(TimeUUID.maxAtUnixMillis(0), PreviewKind.NONE, true)); + } + + @Test(expected = RuntimeException.class) + public void testPrepareForRepairThrowsExceptionForInsufficientDisk() + { + DiskUsageMonitor.instance = diskUsageMonitor; + when(diskUsageMonitor.getDiskUsage()).thenReturn(1.5); + + instance().prepareForRepair(TimeUUID.maxAtUnixMillis(0), null, null, opts(INCREMENTAL_KEY, b2s(true)), false, null); + } + private static class Task implements Runnable { private final Condition blocked; diff --git a/test/unit/org/apache/cassandra/service/AlterTopologyArgParsingTest.java b/test/unit/org/apache/cassandra/service/AlterTopologyArgParsingTest.java new file mode 100644 index 000000000000..59a7bb58d904 --- /dev/null +++ b/test/unit/org/apache/cassandra/service/AlterTopologyArgParsingTest.java @@ -0,0 +1,149 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service; + +import java.util.Map; + +import org.junit.Before; +import org.junit.Test; + +import org.apache.cassandra.locator.InetAddressAndPort; +import org.apache.cassandra.tcm.membership.Directory; +import org.apache.cassandra.tcm.membership.Location; +import org.apache.cassandra.tcm.membership.MembershipUtils; +import org.apache.cassandra.tcm.membership.NodeAddresses; +import org.apache.cassandra.tcm.membership.NodeId; +import org.apache.cassandra.tcm.transformations.AlterTopology; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.fail; + +public class AlterTopologyArgParsingTest +{ + Location loc = new Location("test_dc", "test_rack"); + NodeId id = new NodeId(1); + Directory dir; + + @Before + public void setup() + { + dir = new Directory(); + } + + @Test + public void testSingleChangeByInt() + { + String arg = "1=test_dc:test_rack"; + Map parsed = AlterTopology.parseArgs(arg, dir); + assertEquals(1, parsed.size()); + assertEquals(parsed.get(id), loc); + } + + @Test + public void testSingleChangeByUUID() + { + String arg = String.format("%s=test_dc:test_rack", id.toUUID().toString()); + Map parsed = AlterTopology.parseArgs(arg, dir); + assertEquals(1, parsed.size()); + assertEquals(parsed.get(id), loc); + } + + @Test + public void testSingleChangeByEndpoint() + { + InetAddressAndPort ep = MembershipUtils.endpoint(1); + dir = dir.with(new NodeAddresses(ep), loc); // this will associate NodeId(1) with ep + String arg = String.format("%s=test_dc:test_rack", ep.getHostAddressAndPort()); + Map parsed = AlterTopology.parseArgs(arg, dir); + assertEquals(1, parsed.size()); + assertEquals(parsed.get(id), loc); + } + + @Test + public void testSingleChangeByEndpointAddress() + { + InetAddressAndPort ep = MembershipUtils.endpoint(1); + dir = dir.with(new NodeAddresses(ep), loc); // this will associate NodeId(1) with ep + String arg = String.format("%s=test_dc:test_rack", ep.getHostAddress(false)); + Map parsed = AlterTopology.parseArgs(arg, dir); + assertEquals(1, parsed.size()); + assertEquals(parsed.get(id), loc); + } + + @Test + public void testInvalidArg() + { + String[] args = new String[]{ "invalid", "1=", "=dc:rack", "1=dc", "1=dc:" }; + for (String invalid : args) + { + try + { + AlterTopology.parseArgs(invalid, dir); + fail("Expected exception"); + } + catch (IllegalArgumentException e) + { + } + } + } + + @Test + public void testMultipleChanges() + { + NodeId otherId = new NodeId(2); + InetAddressAndPort ep = MembershipUtils.endpoint(1); + dir = dir.with(new NodeAddresses(ep), loc); // this will associate NodeId(1) with ep + String arg = String.format("%s=dc1:rack1,%s=dc2:rack2,3=dc3:rack3,", + ep.getHostAddress(true), + otherId.toUUID().toString()); + Map parsed = AlterTopology.parseArgs(arg, dir); + assertEquals(3, parsed.size()); + assertEquals(parsed.get(id).datacenter, "dc1"); + assertEquals(parsed.get(id).rack, "rack1"); + assertEquals(parsed.get(otherId).datacenter, "dc2"); + assertEquals(parsed.get(otherId).rack, "rack2"); + assertEquals(parsed.get(new NodeId(3)).datacenter, "dc3"); + assertEquals(parsed.get(new NodeId(3)).rack, "rack3"); + } + + @Test + public void testMultipleChangesForSameNode() + { + InetAddressAndPort ep = MembershipUtils.endpoint(1); + dir = dir.with(new NodeAddresses(ep), loc); // this will associate NodeId(1) with ep + String epString = ep.getHostAddress(true); + String idString = id.toUUID().toString(); + assertIllegalArgument(String.format("%1$s=dc1:rack1,%1$s=dc2:rack2", id.id())); + assertIllegalArgument(String.format("%s=dc1:rack1,%s=dc2:rack2", id.id(), idString)); + assertIllegalArgument(String.format("%s=dc1:rack1,%s=dc2:rack2", id.id(), epString)); + assertIllegalArgument(String.format("%1$s=dc1:rack1,%1$s=dc2:rack2", epString)); + assertIllegalArgument(String.format("%1$s=dc1:rack1,%1$s=dc2:rack2", idString)); + assertIllegalArgument(String.format("%s=dc1:rack1,%s=dc2:rack2,%s=dc3:rack3", id.id(), idString, epString)); + } + + private void assertIllegalArgument(String arg) + { + try + { + AlterTopology.parseArgs(arg, dir); + fail("Expected exception"); + } + catch (IllegalArgumentException e) {} + } +} diff --git a/test/unit/org/apache/cassandra/service/AutoRepairServiceBasicTest.java b/test/unit/org/apache/cassandra/service/AutoRepairServiceBasicTest.java new file mode 100644 index 000000000000..07b8bcc69ec3 --- /dev/null +++ b/test/unit/org/apache/cassandra/service/AutoRepairServiceBasicTest.java @@ -0,0 +1,151 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service; + +import org.junit.Before; +import org.junit.Test; + +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.cql3.CQLTester; +import org.apache.cassandra.exceptions.ConfigurationException; +import org.apache.cassandra.repair.autorepair.AutoRepairConfig; + +import static org.junit.Assert.assertEquals; + +/** + * Unit tests for {@link org.apache.cassandra.service.AutoRepairService} + */ +public class AutoRepairServiceBasicTest extends CQLTester +{ + private static AutoRepairService autoRepairService; + private static AutoRepairConfig config; + + @Before + public void setUp() + { + DatabaseDescriptor.setCDCOnRepairEnabled(false); + DatabaseDescriptor.setMaterializedViewsOnRepairEnabled(false); + DatabaseDescriptor.setMaterializedViewsEnabled(false); + DatabaseDescriptor.setCDCEnabled(false); + config = new AutoRepairConfig(); + autoRepairService = new AutoRepairService(); + autoRepairService.config = config; + } + + @Test + public void testSetup() + { + AutoRepairService.instance.config = null; + + AutoRepairService.setup(); + + assertEquals(DatabaseDescriptor.getAutoRepairConfig(), AutoRepairService.instance.config); + } + + @Test + public void testGetAutoRepairConfigReturnsConfig() + { + assertEquals(config, autoRepairService.getAutoRepairConfig()); + } + + @Test + public void testsetAutoRepairHistoryClearDeleteHostsBufferInSecV2() + { + autoRepairService.setAutoRepairHistoryClearDeleteHostsBufferDuration("100s"); + + assertEquals(100, config.getAutoRepairHistoryClearDeleteHostsBufferInterval().toSeconds()); + } + + @Test + public void testsetAutoRepairMaxRetriesCount() + { + autoRepairService.setAutoRepairMaxRetriesCount(AutoRepairConfig.RepairType.INCREMENTAL.name(), 101); + + assertEquals(101, config.getRepairMaxRetries(AutoRepairConfig.RepairType.INCREMENTAL)); + } + + @Test + public void testsetAutoRepairRetryBackoffInSec() + { + autoRepairService.setAutoRepairRetryBackoff(AutoRepairConfig.RepairType.INCREMENTAL.name(), "102s"); + + assertEquals(102, config.getRepairRetryBackoff(AutoRepairConfig.RepairType.INCREMENTAL).toSeconds()); + } + + @Test(expected = ConfigurationException.class) + public void testSetAutoRepairEnabledThrowsWithSchedulerDisabled() + { + autoRepairService.config = new AutoRepairConfig(false); + + autoRepairService.setAutoRepairEnabled(AutoRepairConfig.RepairType.INCREMENTAL.name(), true); + } + + @Test + public void testSetAutoRepairEnabledDoesNotThrowForIRWithMVReplayButMVRepairDisabled() + { + autoRepairService.config = new AutoRepairConfig(true); + autoRepairService.config.setMaterializedViewRepairEnabled(AutoRepairConfig.RepairType.INCREMENTAL, false); + DatabaseDescriptor.setMaterializedViewsOnRepairEnabled(true); + autoRepairService.setAutoRepairEnabled(AutoRepairConfig.RepairType.INCREMENTAL.name(), true); + } + + @Test(expected = ConfigurationException.class) + public void testSetAutoRepairEnabledThrowsForIRWithMVReplay() + { + autoRepairService.config = new AutoRepairConfig(true); + autoRepairService.config.setMaterializedViewRepairEnabled(AutoRepairConfig.RepairType.INCREMENTAL, true); + DatabaseDescriptor.setMaterializedViewsOnRepairEnabled(true); + autoRepairService.setAutoRepairEnabled(AutoRepairConfig.RepairType.INCREMENTAL.name(), true); + } + + @Test + public void testSetAutoRepairEnabledDoesNotThrowForIRWithMVReplayDisabled() + { + autoRepairService.config = new AutoRepairConfig(true); + DatabaseDescriptor.setMaterializedViewsEnabled(true); + DatabaseDescriptor.setMaterializedViewsOnRepairEnabled(false); + autoRepairService.setAutoRepairEnabled(AutoRepairConfig.RepairType.INCREMENTAL.name(), true); + } + + @Test + public void testSetAutoRepairEnabledDoesNotThrowForIRWithCDCReplayButCDCDisabled() + { + autoRepairService.config = new AutoRepairConfig(true); + DatabaseDescriptor.setCDCOnRepairEnabled(true); + DatabaseDescriptor.setCDCEnabled(false); + autoRepairService.setAutoRepairEnabled(AutoRepairConfig.RepairType.INCREMENTAL.name(), true); + } + + @Test(expected = ConfigurationException.class) + public void testSetAutoRepairEnabledThrowsForIRWithCDCReplay() + { + autoRepairService.config = new AutoRepairConfig(true); + DatabaseDescriptor.setCDCOnRepairEnabled(true); + DatabaseDescriptor.setCDCEnabled(true); + autoRepairService.setAutoRepairEnabled(AutoRepairConfig.RepairType.INCREMENTAL.name(), true); + } + + @Test + public void testSetAutoRepairEnabledDoesNotThrowForIRWithCDCReplayDisabled() + { + autoRepairService.config = new AutoRepairConfig(true); + DatabaseDescriptor.setCDCEnabled(true); + autoRepairService.setAutoRepairEnabled(AutoRepairConfig.RepairType.INCREMENTAL.name(), true); + } +} diff --git a/test/unit/org/apache/cassandra/service/AutoRepairServiceRepairTypeTest.java b/test/unit/org/apache/cassandra/service/AutoRepairServiceRepairTypeTest.java new file mode 100644 index 000000000000..9c2af3e1c793 --- /dev/null +++ b/test/unit/org/apache/cassandra/service/AutoRepairServiceRepairTypeTest.java @@ -0,0 +1,88 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service; + +import com.google.common.collect.ImmutableSet; + +import org.apache.cassandra.cql3.CQLTester; +import org.apache.cassandra.repair.autorepair.AutoRepairConfig; +import org.apache.cassandra.repair.autorepair.AutoRepairUtils; + +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +import java.util.Arrays; +import java.util.Collection; +import java.util.Set; +import java.util.UUID; + +import static org.apache.cassandra.Util.setAutoRepairEnabled; +import static org.apache.cassandra.config.CassandraRelevantProperties.SYSTEM_DISTRIBUTED_DEFAULT_RF; +import static org.junit.Assert.assertEquals; + +/** + * Unit tests covering different repair types for {@link org.apache.cassandra.service.AutoRepairService} + */ +@RunWith(Parameterized.class) +public class AutoRepairServiceRepairTypeTest extends CQLTester +{ + @Parameterized.Parameter() + public AutoRepairConfig.RepairType repairType; + + private final UUID host1 = UUID.fromString("00000000-0000-0000-0000-000000000001"); + private final UUID host2 = UUID.fromString("00000000-0000-0000-0000-000000000002"); + + private AutoRepairService instance; + + @Parameterized.Parameters(name = "repairType={0}") + public static Collection repairTypes() + { + return Arrays.asList(AutoRepairConfig.RepairType.values()); + } + + @BeforeClass + public static void setupClass() throws Exception + { + SYSTEM_DISTRIBUTED_DEFAULT_RF.setInt(1); + setAutoRepairEnabled(true); + requireNetwork(); + } + + @Before + public void setUpTest() + { + AutoRepairUtils.setup(); + instance = new AutoRepairService(); + } + + @Test + public void testGetOnGoingRepairHostIdsTest() + { + long now = System.currentTimeMillis(); + AutoRepairUtils.insertNewRepairHistory(repairType, host1, now, now - 1000000); + AutoRepairUtils.insertNewRepairHistory(repairType, host2, now, now - 1000000); + + Set hosts = instance.getOnGoingRepairHostIds(repairType.name()); + + assertEquals(ImmutableSet.of(host1.toString(), host2.toString()), hosts); + } +} diff --git a/test/unit/org/apache/cassandra/service/AutoRepairServiceSetterTest.java b/test/unit/org/apache/cassandra/service/AutoRepairServiceSetterTest.java new file mode 100644 index 000000000000..db87e995f558 --- /dev/null +++ b/test/unit/org/apache/cassandra/service/AutoRepairServiceSetterTest.java @@ -0,0 +1,153 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service; + +import com.google.common.collect.ImmutableSet; + +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.cql3.CQLTester; +import org.apache.cassandra.cql3.QueryProcessor; +import org.apache.cassandra.cql3.UntypedResultSet; +import org.apache.cassandra.locator.InetAddressAndPort; +import org.apache.cassandra.repair.autorepair.AutoRepairConfig; +import org.apache.cassandra.repair.autorepair.AutoRepairUtils; +import org.apache.cassandra.schema.SchemaConstants; +import org.apache.cassandra.schema.SystemDistributedKeyspace; + +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +import java.util.Arrays; +import java.util.Collection; +import java.util.Set; +import java.util.UUID; +import java.util.function.BiConsumer; +import java.util.function.Function; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import static org.apache.cassandra.Util.setAutoRepairEnabled; +import static org.assertj.core.api.Assertions.assertThat; + +/** + * Unit tests for (updating parameters through JMX) {@link org.apache.cassandra.service.AutoRepairService} + */ +@RunWith(Parameterized.class) +public class AutoRepairServiceSetterTest extends CQLTester +{ + private static final AutoRepairConfig config = new AutoRepairConfig(true); + + @Parameterized.Parameter + public AutoRepairConfig.RepairType repairTypeStr; + + @Parameterized.Parameter(1) + public T arg; + + @Parameterized.Parameter(2) + public BiConsumer setter; + + @Parameterized.Parameter(3) + public Function getter; + + @Parameterized.Parameters(name = "{index}: repairType={0}, arg={1}") + public static Collection testCases() + { + DatabaseDescriptor.setConfig(DatabaseDescriptor.loadConfig()); + return Stream.of( + forEachRepairType(true, AutoRepairService.instance::setAutoRepairEnabled, config::isAutoRepairEnabled), + forEachRepairType(100, AutoRepairService.instance::setRepairThreads, config::getRepairThreads), + forEachRepairType(400, AutoRepairService.instance::setRepairSSTableCountHigherThreshold, config::getRepairSSTableCountHigherThreshold), + forEachRepairType(ImmutableSet.of("dc1", "dc2"), AutoRepairService.instance::setIgnoreDCs, config::getIgnoreDCs), + forEachRepairType(true, AutoRepairService.instance::setPrimaryTokenRangeOnly, config::getRepairPrimaryTokenRangeOnly), + forEachRepairType(600, AutoRepairService.instance::setParallelRepairPercentage, config::getParallelRepairPercentage), + forEachRepairType(700, AutoRepairService.instance::setParallelRepairCount, config::getParallelRepairCount), + forEachRepairType(true, AutoRepairService.instance::setMVRepairEnabled, config::getMaterializedViewRepairEnabled), + forEachRepairType(InetAddressAndPort.getLocalHost().getHostAddressAndPort(), (repairType, commaSeparatedHostSet) -> AutoRepairService.instance.setRepairPriorityForHosts(repairType, (String) commaSeparatedHostSet), AutoRepairUtils::getPriorityHosts), + forEachRepairType(InetAddressAndPort.getLocalHost().getHostAddressAndPort(), (repairType, commaSeparatedHostSet) -> AutoRepairService.instance.setForceRepairForHosts(repairType, (String) commaSeparatedHostSet), AutoRepairServiceSetterTest::isLocalHostForceRepair) + ).flatMap(Function.identity()).collect(Collectors.toList()); + } + + private static Set isLocalHostForceRepair(AutoRepairConfig.RepairType type) + { + UUID hostId = StorageService.instance.getHostIdForEndpoint(InetAddressAndPort.getLocalHost()); + UntypedResultSet resultSet = QueryProcessor.executeInternal(String.format( + "SELECT force_repair FROM %s.%s WHERE host_id = %s and repair_type = '%s'", + SchemaConstants.DISTRIBUTED_KEYSPACE_NAME, SystemDistributedKeyspace.AUTO_REPAIR_HISTORY, hostId, type)); + + if (!resultSet.isEmpty() && resultSet.one().getBoolean("force_repair")) + { + return ImmutableSet.of(InetAddressAndPort.getLocalHost()); + } + return ImmutableSet.of(); + } + + private static Stream forEachRepairType(T arg, BiConsumer setter, Function getter) + { + Object[][] testCases = new Object[AutoRepairConfig.RepairType.values().length][4]; + for (AutoRepairConfig.RepairType repairType : AutoRepairConfig.RepairType.values()) + { + testCases[repairType.ordinal()] = new Object[]{ repairType, arg, setter, getter }; + } + + return Arrays.stream(testCases); + } + + @BeforeClass + public static void setup() throws Exception + { + DatabaseDescriptor.daemonInitialization(); + setAutoRepairEnabled(true); + requireNetwork(); + DatabaseDescriptor.setMaterializedViewsEnabled(false); + DatabaseDescriptor.setCDCEnabled(false); + AutoRepairUtils.setup(); + AutoRepairService.instance.config = config; + } + + @Before + public void prepare() + { + QueryProcessor.executeInternal(String.format( + "TRUNCATE %s.%s", + SchemaConstants.DISTRIBUTED_KEYSPACE_NAME, SystemDistributedKeyspace.AUTO_REPAIR_HISTORY)); + QueryProcessor.executeInternal(String.format( + "TRUNCATE %s.%s", + SchemaConstants.DISTRIBUTED_KEYSPACE_NAME, SystemDistributedKeyspace.AUTO_REPAIR_PRIORITY)); + } + + @Test + public void testSettersTest() + { + DatabaseDescriptor.setMaterializedViewsOnRepairEnabled(false); + DatabaseDescriptor.setCDCOnRepairEnabled(false); + setter.accept(repairTypeStr.name(), arg); + T actualConfig = getter.apply(repairTypeStr); + if (actualConfig instanceof Set) + // When performing a setRepairPriorityForHosts or setForceRepairForHosts, a comma-separated list of + // ip addresses is provided as input. The configuration is expected to return a Set of Strings that + // represent the configured IP addresses. This especial handling allows verification of this special + // case where one of the entries in the Set must match the configured input. + assertThat(actualConfig).satisfiesAnyOf(entry -> assertThat(entry.toString()).contains(arg.toString())); + else + assertThat(actualConfig).isEqualTo(arg); + } +} diff --git a/test/unit/org/apache/cassandra/service/BootstrapTransientTest.java b/test/unit/org/apache/cassandra/service/BootstrapTransientTest.java index d6f4aa9a09d5..b0207565c074 100644 --- a/test/unit/org/apache/cassandra/service/BootstrapTransientTest.java +++ b/test/unit/org/apache/cassandra/service/BootstrapTransientTest.java @@ -30,6 +30,9 @@ import org.junit.BeforeClass; import org.junit.Test; +import org.apache.cassandra.CassandraTestBase; +import org.apache.cassandra.CassandraTestBase.DDDaemonInitialization; +import org.apache.cassandra.CassandraTestBase.UseOrderPreservingPartitioner; import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.dht.OrderPreservingPartitioner; import org.apache.cassandra.dht.Range; @@ -63,7 +66,9 @@ * is used to calculate the endpoints to fetch from and check they are alive for both RangeRelocator (move) and * bootstrap (RangeRelocator). */ -public class BootstrapTransientTest +@DDDaemonInitialization +@UseOrderPreservingPartitioner +public class BootstrapTransientTest extends CassandraTestBase { static final String KEYSPACE = "TestKeyspace"; static InetAddressAndPort address02; diff --git a/test/unit/org/apache/cassandra/service/ClientWarningsTest.java b/test/unit/org/apache/cassandra/service/ClientWarningsTest.java index cf6d6ed40032..fb9f594801b3 100644 --- a/test/unit/org/apache/cassandra/service/ClientWarningsTest.java +++ b/test/unit/org/apache/cassandra/service/ClientWarningsTest.java @@ -70,7 +70,7 @@ public void testUnloggedBatch() throws Exception createTable("CREATE TABLE %s (pk int PRIMARY KEY, v text)"); // v4 and higher - try (SimpleClient client = new SimpleClient(nativeAddr.getHostAddress(), nativePort, version, true, new EncryptionOptions())) + try (SimpleClient client = new SimpleClient(nativeAddr.getHostAddress(), nativePort, version, true, new EncryptionOptions.ClientEncryptionOptions())) { client.connect(false); @@ -90,7 +90,7 @@ public void testLargeBatch() throws Exception createTable("CREATE TABLE %s (pk int PRIMARY KEY, v text)"); // v4 and higher - try (SimpleClient client = new SimpleClient(nativeAddr.getHostAddress(), nativePort, version, true, new EncryptionOptions())) + try (SimpleClient client = new SimpleClient(nativeAddr.getHostAddress(), nativePort, version, true, new EncryptionOptions.ClientEncryptionOptions())) { client.connect(false); @@ -112,7 +112,7 @@ public void testTombstoneWarning() throws Exception final int iterations = 10000; createTable("CREATE TABLE %s (pk int, ck int, v int, PRIMARY KEY (pk, ck))"); - try (SimpleClient client = new SimpleClient(nativeAddr.getHostAddress(), nativePort, version, true, new EncryptionOptions())) + try (SimpleClient client = new SimpleClient(nativeAddr.getHostAddress(), nativePort, version, true, new EncryptionOptions.ClientEncryptionOptions())) { client.connect(false); diff --git a/test/unit/org/apache/cassandra/service/DiskErrorsHandlerTest.java b/test/unit/org/apache/cassandra/service/DiskErrorsHandlerTest.java index 6465164fe0ee..c8638754af66 100644 --- a/test/unit/org/apache/cassandra/service/DiskErrorsHandlerTest.java +++ b/test/unit/org/apache/cassandra/service/DiskErrorsHandlerTest.java @@ -21,11 +21,13 @@ import org.junit.Test; import org.apache.cassandra.distributed.shared.WithProperties; +import org.apache.cassandra.exceptions.ConfigurationException; import org.apache.cassandra.io.FSError; import org.apache.cassandra.io.sstable.CorruptSSTableException; import static org.apache.cassandra.config.CassandraRelevantProperties.CUSTOM_DISK_ERROR_HANDLER; import static org.apache.cassandra.service.DiskErrorsHandlerService.get; +import static org.assertj.core.api.Assertions.assertThatThrownBy; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertSame; import static org.junit.Assert.assertTrue; @@ -35,17 +37,16 @@ public class DiskErrorsHandlerTest @Test public void testSetting() throws Throwable { + DiskErrorsHandler handlerA; + DiskErrorsHandler handlerB; try (WithProperties ignore = new WithProperties().set(CUSTOM_DISK_ERROR_HANDLER, HandlerA.class.getName())) { DiskErrorsHandlerService.configure(); - - assertSame(HandlerA.class, get().getClass()); - - assertTrue(HandlerA.initialized); - assertFalse(HandlerA.closed); - assertFalse(HandlerB.initialized); - assertFalse(HandlerB.closed); + handlerA = get(); + assertSame(HandlerA.class, handlerA.getClass()); + assertInitialized(HandlerA.class, handlerA); + assertNotClosed(HandlerA.class, handlerA); } try (WithProperties ignore = new WithProperties().set(CUSTOM_DISK_ERROR_HANDLER, @@ -53,96 +54,73 @@ public void testSetting() throws Throwable { DiskErrorsHandlerService.configure(); - assertTrue(HandlerA.initialized); - assertTrue(HandlerA.closed); + handlerB = get(); + assertSame(HandlerB.class, handlerB.getClass()); - assertTrue(HandlerB.initialized); - assertFalse(HandlerB.closed); + assertInitialized(HandlerA.class, handlerA); + assertClosed(HandlerA.class, handlerA); - assertSame(HandlerB.class, get().getClass()); + assertInitialized(HandlerB.class, handlerB); + assertNotClosed(HandlerB.class, handlerB); - get().close(); + handlerB.close(); - assertTrue(HandlerB.closed); + assertClosed(HandlerB.class, handlerB); } } @Test public void testFailures() { - // failed closing + DiskErrorsHandler handlerC; try (WithProperties ignore = new WithProperties().set(CUSTOM_DISK_ERROR_HANDLER, HandlerC.class.getName())) { DiskErrorsHandlerService.configure(); - assertTrue(HandlerC.initialized); - assertSame(HandlerC.class, get().getClass()); + handlerC = get(); + assertInitialized(HandlerC.class, handlerC); } - // this will call close() on C handler + DiskErrorsHandler handlerA; + // this will call _not_ close() on C handler try (WithProperties ignore = new WithProperties().set(CUSTOM_DISK_ERROR_HANDLER, - HandlerE.class.getName())) + HandlerA.class.getName())) { DiskErrorsHandlerService.configure(); - assertTrue(HandlerE.initialized); - assertSame(HandlerE.class, get().getClass()); + handlerA = get(); + assertInitialized(HandlerA.class, handlerA); + assertNotClosed(HandlerC.class, handlerC); } try (WithProperties ignore = new WithProperties().set(CUSTOM_DISK_ERROR_HANDLER, HandlerD.class.getName())) { - DiskErrorsHandlerService.configure(); - // still handler E as handler D failed to init - assertSame(HandlerE.class, get().getClass()); - } - } + assertThatThrownBy(DiskErrorsHandlerService::configure) + .isInstanceOf(ConfigurationException.class); - public static class HandlerA extends DummyErrorHandler - { - public static boolean initialized = false; - public static boolean closed = false; - - @Override - public void init() - { - initialized = true; + assertSame(HandlerA.class, get().getClass()); + // still handler A as handler D failed to init + assertInitialized(HandlerA.class, handlerA); + assertNotClosed(HandlerA.class, handlerA); } - @Override - public void close() throws Exception + // what if a user tries to set no-op handler or handler which can not be constructed (constructor is private) + try (WithProperties ignore = new WithProperties().set(CUSTOM_DISK_ERROR_HANDLER, + DiskErrorsHandler.NoOpDiskErrorHandler.class.getName())) { - closed = true; + assertThatThrownBy(DiskErrorsHandlerService::configure) + .isInstanceOf(ConfigurationException.class) + .hasMessageContaining("Default constructor for disk error handler class " + + '\'' + DiskErrorsHandler.NoOpDiskErrorHandler.class.getName() + "' is inaccessible."); } } - public static class HandlerB extends DummyErrorHandler - { - public static boolean initialized = false; - public static boolean closed = false; - - @Override - public void init() - { - initialized = true; - } + public static class HandlerA extends DummyErrorHandler {} - @Override - public void close() throws Exception - { - closed = true; - } - } + public static class HandlerB extends DummyErrorHandler {} public static class HandlerC extends DummyErrorHandler { - public static boolean initialized = false; - - @Override - public void init() - { - initialized = true; - } - @Override public void close() throws Exception { @@ -152,25 +130,35 @@ public void close() throws Exception public static class HandlerD extends DummyErrorHandler { - public static boolean closed = false; - @Override public void init() { throw new RuntimeException("failed to init"); } + } - @Override - public void close() throws Exception - { - closed = true; - } + public void assertClosed(Class handlerClass, DiskErrorsHandler diskErrorsHandler) + { + assertSame(handlerClass, diskErrorsHandler.getClass()); + assertTrue(((DummyErrorHandler) diskErrorsHandler).closed); + } + + public void assertNotClosed(Class handlerClass, DiskErrorsHandler diskErrorsHandler) + { + assertSame(handlerClass, diskErrorsHandler.getClass()); + assertFalse(((DummyErrorHandler) diskErrorsHandler).closed); } - public static class HandlerE extends DummyErrorHandler + public void assertInitialized(Class handlerClass, DiskErrorsHandler diskErrorsHandler) { - public static boolean initialized = false; - public static boolean closed = false; + assertSame(handlerClass, diskErrorsHandler.getClass()); + assertTrue(((DummyErrorHandler) diskErrorsHandler).initialized); + } + + private static abstract class DummyErrorHandler implements DiskErrorsHandler + { + public boolean initialized = false; + public boolean closed = false; @Override public void init() @@ -183,10 +171,7 @@ public void close() throws Exception { closed = true; } - } - private static abstract class DummyErrorHandler implements DiskErrorsHandler - { @Override public void handleCorruptSSTable(CorruptSSTableException e) { diff --git a/test/unit/org/apache/cassandra/service/NativeTransportServiceTest.java b/test/unit/org/apache/cassandra/service/NativeTransportServiceTest.java index 645d5b8f8bca..960289940062 100644 --- a/test/unit/org/apache/cassandra/service/NativeTransportServiceTest.java +++ b/test/unit/org/apache/cassandra/service/NativeTransportServiceTest.java @@ -36,7 +36,7 @@ public class NativeTransportServiceTest { - static EncryptionOptions defaultOptions; + static EncryptionOptions.ClientEncryptionOptions defaultOptions; @BeforeClass public static void setupDD() @@ -48,7 +48,7 @@ public static void setupDD() @After public void resetConfig() { - DatabaseDescriptor.updateNativeProtocolEncryptionOptions(update -> new EncryptionOptions(defaultOptions).applyConfig()); + DatabaseDescriptor.updateNativeProtocolEncryptionOptions(update -> new EncryptionOptions.ClientEncryptionOptions.Builder(defaultOptions).build()); } @Test @@ -127,8 +127,11 @@ public void testPlainDefaultPort() public void testSSLOnly() { // default ssl settings: client encryption enabled and default native transport port used for ssl only - DatabaseDescriptor.updateNativeProtocolEncryptionOptions(options -> options.withEnabled(true) - .withOptional(false)); + DatabaseDescriptor.updateNativeProtocolEncryptionOptions(options -> + new EncryptionOptions.ClientEncryptionOptions.Builder(options) + .withEnabled(true) + .withOptional(false) + .build()); withService((NativeTransportService service) -> { @@ -144,8 +147,11 @@ public void testSSLOnly() public void testSSLOptional() { // default ssl settings: client encryption enabled and default native transport port used for optional ssl - DatabaseDescriptor.updateNativeProtocolEncryptionOptions(options -> options.withEnabled(true) - .withOptional(true)); + DatabaseDescriptor.updateNativeProtocolEncryptionOptions(options -> + new EncryptionOptions.ClientEncryptionOptions.Builder(options) + .withEnabled(true) + .withOptional(true) + .build()); withService((NativeTransportService service) -> { diff --git a/test/unit/org/apache/cassandra/service/ProtocolBetaVersionTest.java b/test/unit/org/apache/cassandra/service/ProtocolBetaVersionTest.java index a5b32bfa1fa7..22324f178b03 100644 --- a/test/unit/org/apache/cassandra/service/ProtocolBetaVersionTest.java +++ b/test/unit/org/apache/cassandra/service/ProtocolBetaVersionTest.java @@ -68,7 +68,7 @@ public void testProtocolBetaVersion() throws Exception createTable("CREATE TABLE %s (pk int PRIMARY KEY, v int)"); assertTrue(betaVersion.isBeta()); // change to another beta version or remove test if no beta version - try (SimpleClient client = new SimpleClient(nativeAddr.getHostAddress(), nativePort, betaVersion, true, new EncryptionOptions())) + try (SimpleClient client = new SimpleClient(nativeAddr.getHostAddress(), nativePort, betaVersion, true, new EncryptionOptions.ClientEncryptionOptions())) { client.connect(false); for (int i = 0; i < 10; i++) @@ -103,7 +103,7 @@ public void unforcedProtocolVersionTest() throws Exception } assertTrue(betaVersion.isBeta()); // change to another beta version or remove test if no beta version - try (SimpleClient client = new SimpleClient(nativeAddr.getHostAddress(), nativePort, betaVersion, false, new EncryptionOptions())) + try (SimpleClient client = new SimpleClient(nativeAddr.getHostAddress(), nativePort, betaVersion, false, new EncryptionOptions.ClientEncryptionOptions())) { client.connect(false); fail("Exception should have been thrown"); diff --git a/test/unit/org/apache/cassandra/service/RemoveTest.java b/test/unit/org/apache/cassandra/service/RemoveTest.java index b2c664ebf204..9e85c6a65619 100644 --- a/test/unit/org/apache/cassandra/service/RemoveTest.java +++ b/test/unit/org/apache/cassandra/service/RemoveTest.java @@ -24,15 +24,16 @@ import java.util.List; import java.util.UUID; -import org.junit.AfterClass; import org.junit.Before; import org.junit.BeforeClass; import org.junit.Test; +import org.apache.cassandra.CassandraTestBase; +import org.apache.cassandra.CassandraTestBase.PrepareServerNoRegister; +import org.apache.cassandra.CassandraTestBase.UseRandomPartitioner; import org.apache.cassandra.ServerTestUtils; import org.apache.cassandra.Util; import org.apache.cassandra.config.DatabaseDescriptor; -import org.apache.cassandra.db.commitlog.CommitLog; import org.apache.cassandra.dht.IPartitioner; import org.apache.cassandra.dht.RandomPartitioner; import org.apache.cassandra.dht.Token; @@ -43,19 +44,14 @@ import org.apache.cassandra.net.MessagingService; import org.apache.cassandra.tcm.membership.NodeId; -import static org.apache.cassandra.tcm.membership.MembershipUtils.*; +import static org.apache.cassandra.tcm.membership.MembershipUtils.endpoint; -public class RemoveTest +@UseRandomPartitioner +@PrepareServerNoRegister +public class RemoveTest extends CassandraTestBase { - static - { - DatabaseDescriptor.daemonInitialization(); - CommitLog.instance.start(); - } - static final IPartitioner partitioner = RandomPartitioner.instance; StorageService ss = StorageService.instance; - static IPartitioner oldPartitioner; ArrayList endpointTokens = new ArrayList(); ArrayList keyTokens = new ArrayList(); List hosts = new ArrayList<>(); @@ -66,17 +62,9 @@ public class RemoveTest @BeforeClass public static void setupClass() throws ConfigurationException { - oldPartitioner = StorageService.instance.setPartitionerUnsafe(partitioner); - ServerTestUtils.prepareServerNoRegister(); MessagingService.instance().listen(); } - @AfterClass - public static void tearDownClass() - { - StorageService.instance.setPartitionerUnsafe(oldPartitioner); - } - @Before public void setup() throws IOException, ConfigurationException { diff --git a/test/unit/org/apache/cassandra/service/RetryStrategyTest.java b/test/unit/org/apache/cassandra/service/RetryStrategyTest.java new file mode 100644 index 000000000000..26c3633a5b89 --- /dev/null +++ b/test/unit/org/apache/cassandra/service/RetryStrategyTest.java @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service; + +import java.util.function.IntFunction; + +import org.junit.Test; + +import accord.utils.Gen; +import accord.utils.RandomTestRunner; + +public class RetryStrategyTest +{ + @Test + public void fuzzParser() + { + Gen> expressionGen = random -> i -> { + switch (i) + { + case 0: return String.format("p%d * %d", random.nextInt(100), random.nextInt(0, Integer.MAX_VALUE)); + case 1: return String.format("p%d * %d * attempts", random.nextInt(100), random.nextInt(0, Integer.MAX_VALUE)); + case 2: return String.format("p%d * attempts", random.nextInt(100)); + case 3: return String.format("p%d * %d ^ attempts", random.nextInt(100), random.nextInt(0, Integer.MAX_VALUE)); + case 4: return String.format("%dms", random.nextInt(0, Integer.MAX_VALUE)); + default: + throw new IllegalArgumentException(Integer.toString(i)); + } + }; + + RandomTestRunner.test().check(rs -> { + IntFunction expression = expressionGen.next(rs); + for (int i = 0; i < 10_000; i++) + { + RetryStrategy.parse(String.format("%dms <= %s", rs.nextInt(0, Integer.MAX_VALUE), expression.apply(rs.nextInt(5))), + new TestLatencySourceFactory()); + RetryStrategy.parse(String.format("%dms <= %s <= %dms", + rs.nextInt(0, Integer.MAX_VALUE), + expression.apply(rs.nextInt(4)), + rs.nextInt(0, Integer.MAX_VALUE)), + new TestLatencySourceFactory()); + RetryStrategy.parse(String.format("%dms <= %s ... %s <= %dms", + rs.nextInt(0, Integer.MAX_VALUE), + expression.apply(rs.nextInt(4)), + expression.apply(rs.nextInt(4)), + rs.nextInt(0, Integer.MAX_VALUE)), + new TestLatencySourceFactory()); + } + }); + } + + private static class TestLatencySourceFactory implements TimeoutStrategy.LatencySourceFactory + { + + @Override + public TimeoutStrategy.LatencySource source(String params) + { + return percentile -> 10; + } + } +} \ No newline at end of file diff --git a/test/unit/org/apache/cassandra/service/SerializationsTest.java b/test/unit/org/apache/cassandra/service/SerializationsTest.java index 20431fc335c6..00864f5a3434 100644 --- a/test/unit/org/apache/cassandra/service/SerializationsTest.java +++ b/test/unit/org/apache/cassandra/service/SerializationsTest.java @@ -23,12 +23,8 @@ import java.util.Arrays; import java.util.Collections; import java.util.List; -import java.util.UUID; import com.google.common.collect.Lists; - -import org.apache.cassandra.distributed.test.log.ClusterMetadataTestHelper; -import org.apache.cassandra.io.util.FileInputStreamPlus; import org.junit.AfterClass; import org.junit.BeforeClass; import org.junit.Test; @@ -41,13 +37,19 @@ import org.apache.cassandra.dht.RandomPartitioner; import org.apache.cassandra.dht.Range; import org.apache.cassandra.dht.Token; +import org.apache.cassandra.distributed.test.log.ClusterMetadataTestHelper; import org.apache.cassandra.io.IVersionedSerializer; import org.apache.cassandra.io.util.DataOutputStreamPlus; +import org.apache.cassandra.io.util.FileInputStreamPlus; import org.apache.cassandra.locator.InetAddressAndPort; -import org.apache.cassandra.repair.SyncNodePair; import org.apache.cassandra.repair.RepairJobDesc; +import org.apache.cassandra.repair.SyncNodePair; import org.apache.cassandra.repair.Validator; -import org.apache.cassandra.repair.messages.*; +import org.apache.cassandra.repair.messages.RepairMessage; +import org.apache.cassandra.repair.messages.SyncRequest; +import org.apache.cassandra.repair.messages.SyncResponse; +import org.apache.cassandra.repair.messages.ValidationRequest; +import org.apache.cassandra.repair.messages.ValidationResponse; import org.apache.cassandra.repair.state.ValidationState; import org.apache.cassandra.schema.KeyspaceMetadata; import org.apache.cassandra.schema.KeyspaceParams; @@ -57,14 +59,18 @@ import org.apache.cassandra.streaming.PreviewKind; import org.apache.cassandra.streaming.SessionSummary; import org.apache.cassandra.streaming.StreamSummary; +import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.utils.Clock; import org.apache.cassandra.utils.FBUtilities; import org.apache.cassandra.utils.MerkleTrees; import org.apache.cassandra.utils.TimeUUID; +import static java.util.Collections.emptyList; + public class SerializationsTest extends AbstractSerializationsTester { private static PartitionerSwitcher partitionerSwitcher; + private static TableId TABLE_ID; private static TimeUUID RANDOM_UUID; private static Range FULL_RANGE; private static RepairJobDesc DESC; @@ -79,6 +85,7 @@ public static void defineSchema() throws Exception ClusterMetadataTestHelper.setInstanceForTest(); SchemaTestUtil.addOrUpdateKeyspace(KeyspaceMetadata.create("Keyspace1", KeyspaceParams.simple(3))); SchemaTestUtil.announceNewTable(TableMetadata.minimal("Keyspace1", "Standard1")); + TABLE_ID = ClusterMetadata.current().schema.getKeyspaceMetadata("Keyspace1").getTableOrViewNullable("Standard1").id(); RANDOM_UUID = TimeUUID.fromString("743325d0-4c4b-11ec-8a88-2d67081686db"); FULL_RANGE = new Range<>(Util.testPartitioner().getMinimumToken(), Util.testPartitioner().getMinimumToken()); DESC = new RepairJobDesc(RANDOM_UUID, RANDOM_UUID, "Keyspace1", "Standard1", Arrays.asList(FULL_RANGE)); @@ -218,12 +225,12 @@ private void testSyncCompleteWrite() throws IOException // sync success List summaries = new ArrayList<>(); summaries.add(new SessionSummary(src, dest, - Lists.newArrayList(new StreamSummary(TableId.fromUUID(UUID.randomUUID()), 5, 100)), - Lists.newArrayList(new StreamSummary(TableId.fromUUID(UUID.randomUUID()), 500, 10)) + Lists.newArrayList(new StreamSummary(TABLE_ID, emptyList(), 5, 100)), + Lists.newArrayList(new StreamSummary(TABLE_ID, emptyList(), 500, 10)) )); SyncResponse success = new SyncResponse(DESC, src, dest, true, summaries); // sync fail - SyncResponse fail = new SyncResponse(DESC, src, dest, false, Collections.emptyList()); + SyncResponse fail = new SyncResponse(DESC, src, dest, false, emptyList()); testRepairMessageWrite("service.SyncComplete.bin", SyncResponse.serializer, success, fail); } diff --git a/test/unit/org/apache/cassandra/service/StorageServiceTest.java b/test/unit/org/apache/cassandra/service/StorageServiceTest.java index 0b7ef7b1c471..8742cdac2395 100644 --- a/test/unit/org/apache/cassandra/service/StorageServiceTest.java +++ b/test/unit/org/apache/cassandra/service/StorageServiceTest.java @@ -68,6 +68,7 @@ public static void setUpClass() throws Exception ServerTestUtils.prepareServerNoRegister(); DatabaseDescriptor.daemonInitialization(); DatabaseDescriptor.setTransientReplicationEnabledUnsafe(true); + DatabaseDescriptor.setAccordTransactionsEnabled(false); ClusterMetadataService.instance().commit(new Register(NodeAddresses.current(), SimpleLocationProvider.LOCATION, diff --git a/test/unit/org/apache/cassandra/service/WriteResponseHandlerTest.java b/test/unit/org/apache/cassandra/service/WriteResponseHandlerTest.java index 03785f3c305b..63562b35eb80 100644 --- a/test/unit/org/apache/cassandra/service/WriteResponseHandlerTest.java +++ b/test/unit/org/apache/cassandra/service/WriteResponseHandlerTest.java @@ -35,8 +35,8 @@ import org.apache.cassandra.db.Keyspace; import org.apache.cassandra.db.WriteType; import org.apache.cassandra.dht.Murmur3Partitioner; -import org.apache.cassandra.exceptions.RequestFailureReason; import org.apache.cassandra.distributed.test.log.ClusterMetadataTestHelper; +import org.apache.cassandra.exceptions.RequestFailure; import org.apache.cassandra.locator.EndpointsForToken; import org.apache.cassandra.locator.InetAddressAndPort; import org.apache.cassandra.locator.NodeProximity; @@ -238,8 +238,8 @@ public void failedIdealCLIncrementsStatForExplicitOnFailure() //Fail in remote DC - awr.onFailure(targets.get(3).endpoint(), RequestFailureReason.TIMEOUT); - awr.onFailure(targets.get(4).endpoint(), RequestFailureReason.TIMEOUT); + awr.onFailure(targets.get(3).endpoint(), RequestFailure.TIMEOUT); + awr.onFailure(targets.get(4).endpoint(), RequestFailure.TIMEOUT); awr.onResponse(createDummyMessage(5)); assertEquals(startingCountForWriteFailedIdealCL + 1, ks.metric.writeFailedIdealCL.getCount()); @@ -281,14 +281,14 @@ public void failedIdealCLDoesNotIncrementsStatOnExplicitQueryFailure() //Fail in local DC - awr.onFailure(targets.get(0).endpoint(), RequestFailureReason.TIMEOUT); - awr.onFailure(targets.get(1).endpoint(), RequestFailureReason.TIMEOUT); + awr.onFailure(targets.get(0).endpoint(), RequestFailure.TIMEOUT); + awr.onFailure(targets.get(1).endpoint(), RequestFailure.TIMEOUT); awr.onResponse(createDummyMessage(2)); //Fail in remote DC - awr.onFailure(targets.get(3).endpoint(), RequestFailureReason.TIMEOUT); - awr.onFailure(targets.get(4).endpoint(), RequestFailureReason.TIMEOUT); + awr.onFailure(targets.get(3).endpoint(), RequestFailure.TIMEOUT); + awr.onFailure(targets.get(4).endpoint(), RequestFailure.TIMEOUT); awr.onResponse(createDummyMessage(5)); assertEquals(startingCountForWriteFailedIdealCL, ks.metric.writeFailedIdealCL.getCount()); diff --git a/test/unit/org/apache/cassandra/service/accord/AccordAgentTest.java b/test/unit/org/apache/cassandra/service/accord/AccordAgentTest.java new file mode 100644 index 000000000000..bc5f51cbcc8e --- /dev/null +++ b/test/unit/org/apache/cassandra/service/accord/AccordAgentTest.java @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import java.util.Arrays; +import java.util.concurrent.TimeUnit; + +import org.junit.Test; + +import accord.local.Node; +import accord.utils.RandomTestRunner; +import accord.utils.SortedArrays.SortedArrayList; +import org.apache.cassandra.service.accord.api.AccordAgent; + +import static java.util.concurrent.TimeUnit.SECONDS; +import static org.junit.Assert.assertTrue; + +public class AccordAgentTest +{ + @Test + public void testNonClashingStartTimes() + { + RandomTestRunner.test().check(rnd -> { + SortedArrayList nodes; { + Node.Id[] ids = new Node.Id[rnd.nextInt(4, 16)]; + for (int i = 0 ; i < ids.length ; ++i) + ids[i] = new Node.Id(i); + nodes = new SortedArrayList<>(ids); + } + + long[] startTimes = new long[nodes.size()]; + long oneSecond = SECONDS.toMicros(1); + long targetDelta = oneSecond / nodes.size(); + for (int i = 0 ; i < 10000 ; ++i) + { + long startTime = rnd.nextLong(1, TimeUnit.DAYS.toMicros(100L)); + for (int j = 0 ; j < startTimes.length ; ++j) + { + long nonClashingStartTime = AccordAgent.nonClashingStartTime(startTime, nodes, nodes.get(j), oneSecond, rnd); + assertTrue(nonClashingStartTime >= startTime); + startTimes[j] = nonClashingStartTime; + } + + Arrays.sort(startTimes); + for (int j = 1 ; j < startTimes.length ; ++j) + { + long actualDelta = startTimes[j] - startTimes[j - 1]; + assertTrue(Math.abs(targetDelta - actualDelta) <= startTimes.length); + } + } + }); + } + +} diff --git a/test/unit/org/apache/cassandra/service/accord/AccordCacheEntryTest.java b/test/unit/org/apache/cassandra/service/accord/AccordCacheEntryTest.java new file mode 100644 index 000000000000..0cd0d4e03121 --- /dev/null +++ b/test/unit/org/apache/cassandra/service/accord/AccordCacheEntryTest.java @@ -0,0 +1,117 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.service.accord; + +import org.junit.Assert; +import org.junit.Test; + +import org.apache.cassandra.service.accord.AccordCacheEntry.Status; +import org.apache.cassandra.service.accord.AccordCache.Type; + +public class AccordCacheEntryTest +{ + static class CacheEntry extends AccordCacheEntry + { + public CacheEntry(String key, Type.Instance instance) + { + super(key, instance); + } + + public CacheEntry(String key) + { + this(key, null); + } + } + + private static void assertIllegalState(Runnable runnable) + { + try + { + runnable.run(); + Assert.fail("Expected IllegalStateException"); + } + catch (IllegalStateException ise) + { + // expected + } + } + + @Test + public void loadSuccessTest() + { + CacheEntry state = new CacheEntry("K"); + + Assert.assertEquals(Status.UNINITIALIZED, state.status()); + assertIllegalState(state::getExclusive); + assertIllegalState(() -> state.setExclusive("VVVV")); + assertIllegalState(state::loading); + + state.readyToLoad(); + state.testLoad(); + Assert.assertEquals(Status.LOADING, state.status()); + + state.testLoaded("V"); + Assert.assertEquals(Status.LOADED, state.status()); + Assert.assertEquals("V", state.getExclusive()); + + assertIllegalState(state::testLoad); + assertIllegalState(() -> state.loaded(null)); + assertIllegalState(state::loading); + } + + @Test + public void loadNullTest() + { + CacheEntry state = new CacheEntry("K"); + Assert.assertEquals(Status.UNINITIALIZED, state.status()); + + assertIllegalState(state::getExclusive); + assertIllegalState(() -> state.setExclusive("VVVV")); + assertIllegalState(state::loading); + + state.readyToLoad(); + state.testLoad(); + Assert.assertEquals(Status.LOADING, state.status()); + + // TODO (expected): this is sort of a pointless test now - remove it? + state.testLoaded(null); + Assert.assertEquals(Status.LOADED, state.status()); + Assert.assertNull(state.getExclusive()); + + assertIllegalState(state::testLoad); + assertIllegalState(state::failedToLoad); + assertIllegalState(state::loading); + } + + @Test + public void loadFailureTest() + { + CacheEntry state = new CacheEntry("K"); + + Assert.assertEquals(Status.UNINITIALIZED, state.status()); + assertIllegalState(state::getExclusive); + assertIllegalState(() -> state.setExclusive("VVVV")); + assertIllegalState(state::loading); + + state.readyToLoad(); + state.testLoad(); + state.failedToLoad(); + Assert.assertEquals(Status.FAILED_TO_LOAD, state.status()); + assertIllegalState(state::getExclusive); + } +} diff --git a/test/unit/org/apache/cassandra/service/accord/AccordCacheTest.java b/test/unit/org/apache/cassandra/service/accord/AccordCacheTest.java new file mode 100644 index 000000000000..d7a89017d6aa --- /dev/null +++ b/test/unit/org/apache/cassandra/service/accord/AccordCacheTest.java @@ -0,0 +1,505 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.service.accord; + +import java.util.UUID; +import java.util.function.Function; + +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +import accord.utils.async.Cancellable; +import org.apache.cassandra.cache.CacheSize; +import org.apache.cassandra.concurrent.ExecutorPlus; +import org.apache.cassandra.concurrent.ManualExecutor; +import org.apache.cassandra.metrics.AccordCacheMetrics; +import org.apache.cassandra.metrics.CacheAccessMetrics; +import org.apache.cassandra.service.accord.AccordCacheEntry.OnSaved; +import org.apache.cassandra.service.accord.AccordCacheEntry.Status; + +import static org.apache.cassandra.service.accord.AccordTestUtils.testLoad; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatExceptionOfType; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +public class AccordCacheTest +{ + private static final long DEFAULT_NODE_SIZE = nodeSize(0); + private AccordCacheMetrics cacheMetrics; + + private static abstract class TestSafeState implements AccordSafeState + { + protected boolean invalidated = false; + protected final AccordCacheEntry global; + private T original = null; + + public TestSafeState(AccordCacheEntry global) + { + this.global = global; + } + + public AccordCacheEntry global() + { + return global; + } + + @Override + public T key() + { + return global.key(); + } + + @Override + public T current() + { + return global.getExclusive(); + } + + @Override + public void set(T update) + { + global.setExclusive(update); + } + + @Override + public T original() + { + return original; + } + + @Override + public void preExecute() + { + original = global.getExclusive(); + } + + @Override + public Cancellable saving() + { + return global.saving(); + } + + @Override + public Throwable failure() + { + return global.failure(); + } + + @Override + public void invalidate() + { + invalidated = true; + } + + @Override + public boolean invalidated() + { + return invalidated; + } + } + + private static class SafeString extends TestSafeState + { + public SafeString(AccordCacheEntry global) + { + super(global); + } + } + + private static class SafeInt extends TestSafeState + { + public SafeInt(AccordCacheEntry global) + { + super(global); + } + } + + private static long emptyNodeSize() + { + return AccordCacheEntry.EMPTY_SIZE; + } + + private static long nodeSize(long itemSize) + { + return itemSize + emptyNodeSize(); + } + + private static void assertCacheState(AccordCache cache, int referenced, int total, long bytes) + { + Assert.assertEquals(referenced, cache.numReferencedEntries()); + Assert.assertEquals(total, cache.size()); + Assert.assertEquals(bytes, cache.weightedSize()); + } + + private void assertCacheMetrics(CacheAccessMetrics metrics, int hits, int misses, int requests, int sizes) + { + Assert.assertEquals(hits, metrics.hits.getCount()); + Assert.assertEquals(misses, metrics.misses.getCount()); + Assert.assertEquals(requests, metrics.requests.getCount()); + if (metrics instanceof AccordCacheMetrics) + { + AccordCacheMetrics ascMetrics = (AccordCacheMetrics) metrics; + Assert.assertEquals(sizes, ascMetrics.objectSize.getCount()); + assertThat(ascMetrics.objectSize.getSnapshot().getMax()).isGreaterThanOrEqualTo(DEFAULT_NODE_SIZE); + } + } + + @Before + public void before() + { + String type = String.format("%s-%s", AccordCommandStores.ACCORD_STATE_CACHE, UUID.randomUUID()); + cacheMetrics = new AccordCacheMetrics(type); + } + + @Test + public void testAcquisitionAndRelease() + { + ManualExecutor executor = new ManualExecutor(); + AccordCache cache = new AccordCache(wrap(executor), OnSaved.immediate(), 500, cacheMetrics); + AccordCache.Type type = + cache.newType(String.class, (s, k) -> k, (s, k, c, o) -> null, Function.identity(), (s, k, v) -> true, String::length, SafeString::new); + AccordCache.Type.Instance instance = type.newInstance(null); + assertCacheState(cache, 0, 0, 0); + + SafeString safeString1 = instance.acquire("1"); + assertCacheState(cache, 1, 1, emptyNodeSize()); + testLoad(executor, instance, safeString1, "1"); + Assert.assertTrue(!cache.evictionQueue().iterator().hasNext()); + + instance.release(safeString1, null); + assertCacheState(cache, 0, 1, nodeSize(1)); + Assert.assertSame(safeString1.global, cache.head()); + Assert.assertSame(safeString1.global, cache.tail()); + + SafeString safeString2 = instance.acquire("2"); + assertCacheState(cache, 1, 2, DEFAULT_NODE_SIZE + nodeSize(1)); + testLoad(executor, instance, safeString2, "2"); + instance.release(safeString2, null); + assertCacheState(cache, 0, 2, nodeSize(1) + nodeSize(1)); + + Assert.assertSame(safeString1.global, cache.head()); + Assert.assertSame(safeString2.global, cache.tail()); + + assertCacheMetrics(cache.metrics, 0, 2, 2, 2); + assertCacheMetrics(type.typeMetrics, 0, 2, 2, 2); + } + + @Test + public void testCachingMetricsWithTwoInstances() + { + ManualExecutor executor = new ManualExecutor(); + AccordCache cache = new AccordCache(wrap(executor), OnSaved.immediate(), 500, cacheMetrics); + AccordCache.Type stringType = + cache.newType(String.class, (s, k) -> k, (s, k, c, o) -> null, Function.identity(), (s, k, v) -> true, String::length, SafeString::new); + AccordCache.Type.Instance stringInstance = stringType.newInstance(null); + AccordCache.Type intType = + cache.newType(Integer.class, (s, k) -> k, (s, k, c, o) -> null, Function.identity(), (s, k, v) -> true, ignore -> Integer.BYTES, SafeInt::new); + assertCacheState(cache, 0, 0, 0); + AccordCache.Type.Instance intInstance = intType.newInstance(null); + + SafeString safeString1 = stringInstance.acquire("1"); + testLoad(executor, stringInstance, safeString1, "1"); + stringInstance.release(safeString1, null); + SafeString safeString2 = stringInstance.acquire("2"); + testLoad(executor, stringInstance, safeString2, "2"); + stringInstance.release(safeString2, null); + + SafeInt safeInt1 = intInstance.acquire(3); + testLoad(executor, intInstance, safeInt1, 3); + intInstance.release(safeInt1, null); + SafeInt safeInt2 = intInstance.acquire(4); + testLoad(executor, intInstance, safeInt2, 4); + intInstance.release(safeInt2, null); + SafeInt safeInt3 = intInstance.acquire(5); + testLoad(executor, intInstance, safeInt3, 5); + intInstance.release(safeInt3, null); + + assertCacheState(cache, 0, 5, nodeSize(Integer.BYTES) * 3 + nodeSize(1) * 2); + assertThat(stringType.size()).isEqualTo(2); + assertThat(stringType.weightedSize()).isEqualTo(nodeSize(1) * 2); + assertThat(stringType.capacity()).isEqualTo(cache.capacity()); + assertThat(intType.size()).isEqualTo(3); + assertThat(intType.weightedSize()).isEqualTo(nodeSize(Integer.BYTES) * 3); + assertThat(intType.capacity()).isEqualTo(cache.capacity()); + + assertThatExceptionOfType(UnsupportedOperationException.class).isThrownBy(() -> stringType.setCapacity(123)); + assertThatExceptionOfType(UnsupportedOperationException.class).isThrownBy(() -> intType.setCapacity(123)); + } + + @Test + public void testRotation() + { + ManualExecutor executor = new ManualExecutor(); + AccordCache cache = new AccordCache(wrap(executor), OnSaved.immediate(), DEFAULT_NODE_SIZE * 5, cacheMetrics); + AccordCache.Type type = + cache.newType(String.class, (s, k) -> k, (s, k, c, o) -> null, Function.identity(), (s, k, v) -> true, String::length, SafeString::new); + assertCacheState(cache, 0, 0, 0); + AccordCache.Type.Instance instance = type.newInstance(null); + + SafeString[] items = new SafeString[3]; + for (int i=0; i<3; i++) + { + SafeString safeString = instance.acquire(Integer.toString(i)); + items[i] = safeString; + Assert.assertNotNull(safeString); + testLoad(executor, instance, safeString, Integer.toString(i)); + Assert.assertTrue(instance.isReferenced(safeString.key())); + instance.release(safeString, null); + } + + Assert.assertSame(items[0].global, cache.head()); + Assert.assertSame(items[2].global, cache.tail()); + assertCacheState(cache, 0, 3, nodeSize(1) * 3); + assertCacheMetrics(cache.metrics, 0, 3, 3, 3); + assertCacheMetrics(type.typeMetrics, 0, 3, 3, 3); + + SafeString safeString = instance.acquire("1"); + Assert.assertEquals(Status.LOADED, safeString.global.status()); + + assertCacheState(cache, 1, 3, nodeSize(1) * 3); + assertCacheMetrics(cache.metrics, 1, 3, 4, 3); + assertCacheMetrics(type.typeMetrics, 1, 3, 4, 3); + + // releasing item should return it to the tail + instance.release(safeString, null); + assertCacheState(cache, 0, 3, nodeSize(1) * 3); + Assert.assertSame(items[0].global, cache.head()); + Assert.assertSame(items[1].global, cache.tail()); + } + + @Test + public void testEvictionOnAcquire() + { + ManualExecutor executor = new ManualExecutor(); + AccordCache cache = new AccordCache(wrap(executor), OnSaved.immediate(), nodeSize(1) * 5, cacheMetrics); + AccordCache.Type type = + cache.newType(String.class, (s, k) -> k, (s, k, c, o) -> null, Function.identity(), (s, k, v) -> true, String::length, SafeString::new); + AccordCache.Type.Instance instance = type.newInstance(null); + assertCacheState(cache, 0, 0, 0); + + SafeString[] items = new SafeString[5]; + for (int i=0; i<5; i++) + { + SafeString safeString = instance.acquire(Integer.toString(i)); + items[i] = safeString; + testLoad(executor, instance, safeString, Integer.toString(i)); + Assert.assertTrue(instance.isReferenced(safeString.key())); + instance.release(safeString, null); + } + + assertCacheState(cache, 0, 5, nodeSize(1) * 5); + Assert.assertSame(items[0].global, cache.head()); + Assert.assertSame(items[4].global, cache.tail()); + assertCacheMetrics(cache.metrics, 0, 5, 5, 5); + assertCacheMetrics(type.typeMetrics, 0, 5, 5, 5); + + SafeString safeString = instance.acquire("5"); + Assert.assertTrue(instance.isReferenced(safeString.key())); + + // since it's not loaded, only the node size is counted here + assertCacheState(cache, 1, 5, nodeSize(1) * 4 + nodeSize(0)); + Assert.assertSame(items[1].global, cache.head()); + Assert.assertSame(items[4].global, cache.tail()); + Assert.assertFalse(instance.keyIsCached("0", SafeString.class)); + Assert.assertFalse(instance.keyIsReferenced("0", SafeString.class)); + assertCacheMetrics(cache.metrics, 0, 6, 6, 5); + assertCacheMetrics(type.typeMetrics, 0, 6, 6, 5); + + testLoad(executor, instance, safeString, "5"); + instance.release(safeString, null); + assertCacheState(cache, 0, 5, nodeSize(1) * 5); + Assert.assertSame(items[1].global, cache.head()); + Assert.assertSame(safeString.global, cache.tail()); + assertCacheMetrics(cache.metrics, 0, 6, 6, 6); + assertCacheMetrics(type.typeMetrics, 0, 6, 6, 6); + } + + @Test + public void testEvictionOnRelease() + { + ManualExecutor executor = new ManualExecutor(); + AccordCache cache = new AccordCache(wrap(executor), OnSaved.immediate(), nodeSize(1) * 4, cacheMetrics); + AccordCache.Type type = + cache.newType(String.class, (s, k) -> k, (s, k, c, o) -> null, Function.identity(), (s, k, v) -> true, String::length, SafeString::new); + AccordCache.Type.Instance instance = type.newInstance(null); + assertCacheState(cache, 0, 0, 0); + + SafeString[] items = new SafeString[5]; + for (int i=0; i<5; i++) + { + SafeString safeString = instance.acquire(Integer.toString(i)); + items[i] = safeString; + testLoad(executor, instance, safeString, Integer.toString(i)); + Assert.assertTrue(instance.isReferenced(safeString.key())); + } + + assertCacheState(cache, 5, 5, nodeSize(1) * 5); + assertCacheMetrics(cache.metrics, 0, 5, 5, 5); + assertCacheMetrics(type.typeMetrics, 0, 5, 5, 5); + Assert.assertNull(cache.head()); + Assert.assertNull(cache.tail()); + + instance.release(items[2], null); + assertCacheState(cache, 4, 4, nodeSize(1) * 4); + assertCacheMetrics(cache.metrics, 0, 5, 5, 5); + assertCacheMetrics(type.typeMetrics, 0, 5, 5, 5); + Assert.assertNull(cache.head()); + Assert.assertNull(cache.tail()); + + instance.release(items[4], null); + assertCacheState(cache, 3, 4, nodeSize(1) * 4); + assertCacheMetrics(cache.metrics, 0, 5, 5, 5); + assertCacheMetrics(type.typeMetrics, 0, 5, 5, 5); + Assert.assertSame(items[4].global, cache.head()); + Assert.assertSame(items[4].global, cache.tail()); + } + + @Test + public void testMultiAcquireRelease() + { + ManualExecutor executor = new ManualExecutor(); + AccordCache cache = new AccordCache(wrap(executor), OnSaved.immediate(), DEFAULT_NODE_SIZE * 4, cacheMetrics); + AccordCache.Type type = + cache.newType(String.class, (s, k) -> k, (s, k, c, o) -> null, Function.identity(), (s, k, v) -> true, String::length, SafeString::new); + AccordCache.Type.Instance instance = type.newInstance(null); + assertCacheState(cache, 0, 0, 0); + + SafeString safeString1 = instance.acquire("0"); + testLoad(executor, instance, safeString1, "0"); + Assert.assertEquals(Status.LOADED, safeString1.global.status()); + assertCacheMetrics(cache.metrics, 0, 1, 1, 1); + assertCacheMetrics(type.typeMetrics, 0, 1, 1, 1); + + Assert.assertEquals(1, instance.references("0", SafeString.class)); + assertCacheState(cache, 1, 1, nodeSize(1)); + + SafeString safeString2 = instance.acquire("0"); + Assert.assertEquals("0", safeString2.current()); + Assert.assertEquals(Status.LOADED, safeString1.global.status()); + Assert.assertEquals(2, instance.references("0", SafeString.class)); + assertCacheState(cache, 1, 1, nodeSize(1)); + assertCacheMetrics(cache.metrics, 1, 1, 2, 1); + assertCacheMetrics(type.typeMetrics, 1, 1, 2, 1); + + instance.release(safeString1, null); + assertCacheState(cache, 1, 1, nodeSize(1)); + instance.release(safeString2, null); + assertCacheState(cache, 0, 1, nodeSize(1)); + } + + @Test + public void evictionBlockedOnSaving() + { + ManualExecutor executor = new ManualExecutor(); + AccordCache cache = new AccordCache(wrap(executor), OnSaved.immediate(), nodeSize(1) * 3 + nodeSize(3), cacheMetrics); + AccordCache.Type type = + cache.newType(String.class, (s, k) -> k, (s, k, c, o) -> null, Function.identity(), (s, k, v) -> true, String::length, SafeString::new); + AccordCache.Type.Instance instance = type.newInstance(null); + assertCacheState(cache, 0, 0, 0); + + SafeString item = instance.acquire(Integer.toString(0)); + testLoad(executor, instance, item, Integer.toString(0)); + item.set("0*"); + Assert.assertTrue(instance.isReferenced(item.key())); + instance.release(item, null); + + for (int i=1; i<4; i++) + { + item = instance.acquire(Integer.toString(i)); + testLoad(executor, instance, item, Integer.toString(i)); + Assert.assertTrue(instance.isReferenced(item.key())); + instance.release(item, null); + } + + assertCacheState(cache, 0, 4, nodeSize(1) * 3 + nodeSize(2)); + assertCacheMetrics(cache.metrics, 0, 4, 4, 5); + assertCacheMetrics(type.typeMetrics, 0, 4, 4, 5); + + // force cache eviction + instance.acquire(Integer.toString(0)); + cache.setCapacity(0); + + // all should have been evicted except 0 + assertCacheState(cache, 1, 1, nodeSize(2)); + + Assert.assertTrue(instance.keyIsCached("0", SafeString.class)); + Assert.assertFalse(instance.keyIsCached("1", SafeString.class)); + Assert.assertFalse(instance.keyIsCached("2", SafeString.class)); + Assert.assertFalse(instance.keyIsCached("3", SafeString.class)); + } + + @Test + public void testUpdates() + { + ManualExecutor executor = new ManualExecutor(); + AccordCache cache = new AccordCache(wrap(executor), OnSaved.immediate(), 500, cacheMetrics); + AccordCache.Type type = + cache.newType(String.class, (s, k) -> k, (s, k, c, o) -> null, Function.identity(), (s, k, v) -> true, String::length, SafeString::new); + AccordCache.Type.Instance instance = type.newInstance(null); + assertCacheState(cache, 0, 0, 0); + + SafeString safeString = instance.acquire("1"); + testLoad(executor, instance, safeString, "1"); + assertCacheState(cache, 1, 1, nodeSize(1)); + Assert.assertNull(cache.head()); + Assert.assertNull(cache.tail()); + + Assert.assertTrue(instance.isReferenced(safeString.key())); + assertCacheState(cache, 1, 1, nodeSize(1)); + + safeString.set("11"); + instance.release(safeString, null); + assertCacheState(cache, 0, 1, nodeSize(2)); + Assert.assertSame(safeString.global, cache.head()); + Assert.assertSame(safeString.global, cache.tail()); + + assertCacheMetrics(cache.metrics, 0, 1, 1, 2); + assertCacheMetrics(type.typeMetrics, 0, 1, 1, 2); + } + + private CacheSize mockCacheSize(long capacity, long size, int entries) + { + CacheSize cacheSize = mock(CacheSize.class); + when(cacheSize.capacity()).thenReturn(capacity); + when(cacheSize.weightedSize()).thenReturn(size); + when(cacheSize.size()).thenReturn(entries); + return cacheSize; + } + + @Test + public void testAccorStateCacheMetrics() + { + CacheAccessMetrics stringInstance1 = cacheMetrics.forInstance(String.class); + CacheAccessMetrics stringInstance1Dup = cacheMetrics.forInstance(String.class); + CacheAccessMetrics stringInstance2 = cacheMetrics.forInstance(String.class); + CacheAccessMetrics integerInstance1 = cacheMetrics.forInstance(Integer.class); + CacheAccessMetrics integerInstance2 = cacheMetrics.forInstance(Integer.class); + + assertThat(stringInstance1).isSameAs(stringInstance1Dup); + assertThat(stringInstance1).isSameAs(stringInstance2); + assertThat(integerInstance1).isSameAs(integerInstance2); + assertThat(stringInstance1).isNotSameAs(integerInstance1); + } + + private static Function wrap(ExecutorPlus executor) + { + return r -> AccordExecutor.wrap(executor.submit(r)); + } +} diff --git a/test/unit/org/apache/cassandra/service/accord/AccordCommandStoreTest.java b/test/unit/org/apache/cassandra/service/accord/AccordCommandStoreTest.java new file mode 100644 index 000000000000..6faca72d34dc --- /dev/null +++ b/test/unit/org/apache/cassandra/service/accord/AccordCommandStoreTest.java @@ -0,0 +1,184 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import java.util.NavigableMap; +import java.util.TreeMap; +import java.util.concurrent.atomic.AtomicLong; + +import org.junit.Assert; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import accord.api.Key; +import accord.api.Result; +import accord.local.Command; +import accord.local.StoreParticipants; +import accord.local.cfk.CommandsForKey; +import accord.primitives.Ballot; +import accord.primitives.PartialDeps; +import accord.primitives.PartialTxn; +import accord.primitives.Range; +import accord.primitives.Ranges; +import accord.primitives.Routable; +import accord.primitives.Route; +import accord.primitives.RoutingKeys; +import accord.primitives.SaveStatus; +import accord.primitives.Timestamp; +import accord.primitives.Txn; +import accord.primitives.TxnId; +import accord.primitives.Writes; +import accord.utils.ImmutableBitSet; +import accord.utils.SimpleBitSet; +import org.apache.cassandra.SchemaLoader; +import org.apache.cassandra.cql3.QueryProcessor; +import org.apache.cassandra.db.ColumnFamilyStore; +import org.apache.cassandra.db.Keyspace; +import org.apache.cassandra.schema.KeyspaceParams; +import org.apache.cassandra.schema.Schema; +import org.apache.cassandra.schema.SchemaConstants; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.service.StorageService; +import org.apache.cassandra.service.accord.AccordKeyspace.CommandsForKeyAccessor; +import org.apache.cassandra.service.accord.api.TokenKey; +import org.apache.cassandra.service.accord.api.PartitionKey; +import org.apache.cassandra.service.accord.serializers.CommandsForKeySerializerTest.TestSafeCommandStore; +import org.apache.cassandra.service.accord.serializers.ResultSerializers; +import org.apache.cassandra.service.consensus.TransactionalMode; +import org.apache.cassandra.utils.Pair; + +import static accord.primitives.Status.Durability.Majority; +import static com.google.common.collect.Iterables.getOnlyElement; +import static org.apache.cassandra.cql3.statements.schema.CreateTableStatement.parse; +import static org.apache.cassandra.service.accord.AccordTestUtils.Commands.preaccepted; +import static org.apache.cassandra.service.accord.AccordTestUtils.ballot; +import static org.apache.cassandra.service.accord.AccordTestUtils.createAccordCommandStore; +import static org.apache.cassandra.service.accord.AccordTestUtils.createPartialTxn; +import static org.apache.cassandra.service.accord.AccordTestUtils.loaded; +import static org.apache.cassandra.service.accord.AccordTestUtils.timestamp; +import static org.apache.cassandra.service.accord.AccordTestUtils.txnId; + +public class AccordCommandStoreTest +{ + private static final Logger logger = LoggerFactory.getLogger(AccordCommandStoreTest.class); + + @BeforeClass + public static void beforeClass() throws Throwable + { + SchemaLoader.prepareServer(); + SchemaLoader.createKeyspace("ks", KeyspaceParams.simple(1), + parse("CREATE TABLE tbl (k int, c int, v int, primary key (k, c)) WITH transactional_mode='full'", "ks")); + TableMetadata tbl = Schema.instance.getTableMetadata("ks", "tbl"); + Assert.assertEquals(TransactionalMode.full, tbl.params.transactionalMode); + StorageService.instance.initServer(); + } + + @Before + public void setUp() throws Exception + { + Keyspace.open(SchemaConstants.ACCORD_KEYSPACE_NAME).getColumnFamilyStores().forEach(ColumnFamilyStore::truncateBlocking); + } + + @Test + public void commandLoadSave() throws Throwable + { + AtomicLong clock = new AtomicLong(0); + PartialTxn depTxn = createPartialTxn(0); + Key key = (Key) depTxn.keys().get(0); + Range range = key.toUnseekable().asRange(); + AccordCommandStore commandStore = createAccordCommandStore(clock::incrementAndGet, "ks", "tbl"); + + QueryProcessor.executeInternal("INSERT INTO ks.tbl (k, c, v) VALUES (0, 0, 1)"); + TableId tableId = Schema.instance.getTableMetadata("ks", "tbl").id; + TxnId oldTxnId1 = txnId(1, clock.incrementAndGet(), 1, Txn.Kind.Write, Routable.Domain.Range); + TxnId oldTxnId2 = txnId(1, clock.incrementAndGet(), 1, Txn.Kind.Write, Routable.Domain.Range); + TxnId txnId = txnId(1, clock.incrementAndGet(), 1, Txn.Kind.Write, Routable.Domain.Range); + + PartialDeps dependencies; + try (PartialDeps.Builder builder = PartialDeps.builder(Ranges.of(range), true)) + { + builder.add(range, oldTxnId1); + builder.add(range, oldTxnId2); + dependencies = builder.build(); + } + + PartialTxn txn = createPartialTxn(0); + Route route = RoutingKeys.of(key.toUnseekable()).toRoute(key.toUnseekable()); + Ballot promised = ballot(1, clock.incrementAndGet(), 1); + Ballot accepted = ballot(1, clock.incrementAndGet(), 1); + Timestamp executeAt = timestamp(1, clock.incrementAndGet(), 1); + SimpleBitSet waitingOnApply = new SimpleBitSet(3); + waitingOnApply.set(1); + Command.WaitingOn waitingOn = new Command.WaitingOn(dependencies.keyDeps.keys(), dependencies.rangeDeps, new ImmutableBitSet(waitingOnApply), new ImmutableBitSet(2)); + Pair result = AccordTestUtils.processTxnResult(commandStore, txnId, txn, executeAt); + + Command expected = Command.Executed.executed(txnId, SaveStatus.Applied, Majority, StoreParticipants.all(route), + promised, executeAt, txn, dependencies, accepted, + waitingOn, result.left, ResultSerializers.APPLIED); + AccordSafeCommand safeCommand = new AccordSafeCommand(loaded(txnId, null)); + safeCommand.set(expected); + + AccordTestUtils.appendCommandsBlocking(commandStore, null, expected); + + logger.info("E: {}", expected); + Command actual = commandStore.loadCommand(txnId); + logger.info("A: {}", actual); + + Assert.assertEquals(expected, actual); + } + + @Test + public void commandsForKeyLoadSave() + { + AtomicLong clock = new AtomicLong(0); + AccordCommandStore commandStore = createAccordCommandStore(clock::incrementAndGet, "ks", "tbl"); + + PartialTxn txn = createPartialTxn(1); + TokenKey key = ((PartitionKey) getOnlyElement(txn.keys())).toUnseekable(); + TxnId txnId1 = txnId(1, clock.incrementAndGet(), 1); + TxnId txnId2 = txnId(1, clock.incrementAndGet(), 1); + + Command command1 = preaccepted(txnId1, txn, timestamp(1, clock.incrementAndGet(), 1)); + Command command2 = preaccepted(txnId2, txn, timestamp(1, clock.incrementAndGet(), 1)); + + AccordSafeCommandsForKey cfk = new AccordSafeCommandsForKey(loaded(key, null)); + cfk.initialize(); + + cfk.set(cfk.current().update(new TestSafeCommandStore(command1.txnId()), command1).cfk()); + cfk.set(cfk.current().update(new TestSafeCommandStore(command1.txnId()), command2).cfk()); + + CommandsForKeyAccessor.systemTableUpdater(commandStore.id(), (TokenKey)cfk.key(), cfk.current(), null, commandStore.nextSystemTimestampMicros()).run(); + logger.info("E: {}", cfk); + CommandsForKey actual = CommandsForKeyAccessor.load(commandStore.id(), key); + logger.info("A: {}", actual); + + Assert.assertEquals(cfk.current(), actual); + } + + private static > NavigableMap toNavigableMap(V safeState) + { + TreeMap map = new TreeMap<>(); + map.put(safeState.key(), safeState); + return map; + } +} diff --git a/test/unit/org/apache/cassandra/service/accord/AccordCommandTest.java b/test/unit/org/apache/cassandra/service/accord/AccordCommandTest.java new file mode 100644 index 000000000000..af94147517e0 --- /dev/null +++ b/test/unit/org/apache/cassandra/service/accord/AccordCommandTest.java @@ -0,0 +1,229 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import java.util.concurrent.atomic.AtomicLong; + +import org.junit.Assert; +import org.junit.BeforeClass; +import org.junit.Test; + +import accord.api.Key; +import accord.api.RoutingKey; +import accord.local.StoreParticipants; +import accord.local.cfk.CommandsForKey; +import accord.local.Command; +import accord.local.KeyHistory; +import accord.local.Node; +import accord.local.PreLoadContext; +import accord.local.SafeCommand; +import accord.local.SafeCommandStore; +import accord.primitives.KeyDeps; +import accord.primitives.Status; +import accord.messages.Accept; +import accord.messages.Commit; +import accord.messages.PreAccept; +import accord.primitives.Ballot; +import accord.primitives.FullRoute; +import accord.primitives.Keys; +import accord.primitives.PartialDeps; +import accord.primitives.PartialTxn; +import accord.primitives.Route; +import accord.primitives.Timestamp; +import accord.primitives.Txn; +import accord.primitives.TxnId; +import org.apache.cassandra.SchemaLoader; +import org.apache.cassandra.schema.KeyspaceParams; +import org.apache.cassandra.schema.Schema; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.service.StorageService; +import org.apache.cassandra.service.accord.api.PartitionKey; +import org.apache.cassandra.utils.ByteBufferUtil; + +import static accord.api.ProtocolModifiers.Toggles.filterDuplicateDependenciesFromAcceptReply; +import static accord.messages.Accept.Kind.SLOW; +import static accord.utils.async.AsyncChains.getUninterruptibly; +import static org.apache.cassandra.cql3.statements.schema.CreateTableStatement.parse; +import static org.apache.cassandra.service.accord.AccordTestUtils.createAccordCommandStore; +import static org.apache.cassandra.service.accord.AccordTestUtils.createWriteTxn; +import static org.apache.cassandra.service.accord.AccordTestUtils.fullRange; +import static org.apache.cassandra.service.accord.AccordTestUtils.timestamp; +import static org.apache.cassandra.service.accord.AccordTestUtils.txnId; + +public class AccordCommandTest +{ + + static final AtomicLong clock = new AtomicLong(0); + private static final Node.Id ID1 = new Node.Id(1); + private static final Node.Id ID2 = new Node.Id(2); + private static final Node.Id ID3 = new Node.Id(3); + + @BeforeClass + public static void beforeClass() throws Throwable + { + SchemaLoader.prepareServer(); + SchemaLoader.createKeyspace("ks", KeyspaceParams.simple(1), + parse("CREATE TABLE tbl (k int, c int, v int, primary key (k, c)) WITH transactional_mode='full'", "ks")); + StorageService.instance.initServer(); + } + + private static PartitionKey key(int k) + { + TableMetadata metadata = Schema.instance.getTableMetadata("ks", "tbl"); + return new PartitionKey(metadata.id, metadata.partitioner.decorateKey(ByteBufferUtil.bytes(k))); + } + + /** + * disable cache and make sure correct values are coming in and out of the accord table + */ + @Test + public void basicCycleTest() throws Throwable + { + AccordCommandStore commandStore = createAccordCommandStore(clock::incrementAndGet, "ks", "tbl"); + getUninterruptibly(commandStore.execute(PreLoadContext.empty(), unused -> commandStore.executor().cacheUnsafe().setCapacity(0))); + + TxnId txnId = txnId(1, clock.incrementAndGet(), 1); + Txn txn = createWriteTxn(1); + Key key = (Key)txn.keys().get(0); + RoutingKey homeKey = key.toUnseekable(); + FullRoute fullRoute = txn.keys().toRoute(homeKey); + Route route = fullRoute.slice(fullRange(txn)); + PartialTxn partialTxn = txn.intersecting(route, true); + PreAccept preAccept = PreAccept.SerializerSupport.create(txnId, route, 1, 1, 1, partialTxn, null, false, fullRoute); + + // Check preaccept + getUninterruptibly(commandStore.execute(preAccept, safeStore -> { + SafeCommand safeCommand = safeStore.get(txnId, StoreParticipants.all(route)); + Command before = safeCommand.current(); + PreAccept.PreAcceptReply reply = preAccept.apply(safeStore); + Command after = safeCommand.current(); + + Assert.assertTrue(reply.isOk()); + PreAccept.PreAcceptOk ok = (PreAccept.PreAcceptOk) reply; + Assert.assertEquals(txnId, ok.witnessedAt); + Assert.assertTrue(ok.deps.isEmpty()); + + AccordTestUtils.appendCommandsBlocking(commandStore, before, after); + })); + + getUninterruptibly(commandStore.execute(preAccept, safeStore -> { + Command before = safeStore.ifInitialised(txnId).current(); + SafeCommand safeCommand = safeStore.get(txnId, StoreParticipants.all(route)); + Assert.assertEquals(txnId, before.executeAt()); + Assert.assertEquals(Status.PreAccepted, before.status()); + Assert.assertTrue(before.partialDeps() == null || before.partialDeps().isEmpty()); + + CommandsForKey cfk = safeStore.get(key(1).toUnseekable()).current(); + Assert.assertTrue(cfk.indexOf(txnId) >= 0); + Command after = safeCommand.current(); + AccordTestUtils.appendCommandsBlocking(commandStore, before, after); + })); + + // check accept + TxnId txnId2 = txnId(1, clock.incrementAndGet(), 1); + Timestamp executeAt = timestamp(1, clock.incrementAndGet(), 1); + PartialDeps deps; + try (PartialDeps.Builder builder = PartialDeps.builder(route, true)) + { + builder.add(key.toUnseekable(), txnId2); + deps = builder.build(); + } + Accept accept = Accept.SerializerSupport.create(txnId, route, 1, 1, SLOW, Ballot.ZERO, executeAt, deps, false); + + getUninterruptibly(commandStore.execute(accept, safeStore -> { + Command before = safeStore.ifInitialised(txnId).current(); + Accept.AcceptReply reply = accept.apply(safeStore); + Assert.assertTrue(reply.isOk()); + Assert.assertEquals(filterDuplicateDependenciesFromAcceptReply() ? KeyDeps.NONE : deps.keyDeps, reply.deps.keyDeps); + Command after = safeStore.ifInitialised(txnId).current(); + AccordTestUtils.appendCommandsBlocking(commandStore, before, after); + })); + + getUninterruptibly(commandStore.execute(accept, safeStore -> { + Command before = safeStore.ifInitialised(txnId).current(); + Assert.assertEquals(executeAt, before.executeAt()); + Assert.assertEquals(Status.AcceptedSlow, before.status()); + Assert.assertEquals(deps, before.partialDeps()); + + CommandsForKey cfk = safeStore.get(key(1).toUnseekable()).current(); + Assert.assertTrue(cfk.indexOf(txnId) >= 0); + Command after = safeStore.ifInitialised(txnId).current(); + AccordTestUtils.appendCommandsBlocking(commandStore, before, after); + })); + + // check commit + Commit commit = Commit.SerializerSupport.create(txnId, route, 1, 1, Commit.Kind.StableWithTxnAndDeps, Ballot.ZERO, executeAt, partialTxn, deps, fullRoute); + getUninterruptibly(commandStore.execute(commit, commit::apply)); + + getUninterruptibly(commandStore.execute(PreLoadContext.contextFor(txnId, Keys.of(key).toParticipants(), KeyHistory.SYNC), safeStore -> { + Command before = safeStore.ifInitialised(txnId).current(); + Assert.assertEquals(commit.executeAt, before.executeAt()); + Assert.assertTrue(before.hasBeen(Status.Committed)); + Assert.assertEquals(commit.partialDeps, before.partialDeps()); + + CommandsForKey cfk = safeStore.get(key(1).toUnseekable()).current(); + Assert.assertTrue(cfk.indexOf(txnId) >= 0); + Command after = safeStore.ifInitialised(txnId).current(); + AccordTestUtils.appendCommandsBlocking(commandStore, before, after); + })); + } + + @Test + public void computeDeps() throws Throwable + { + AccordCommandStore commandStore = createAccordCommandStore(clock::incrementAndGet, "ks", "tbl"); + getUninterruptibly(commandStore.execute(PreLoadContext.empty(), unused -> commandStore.executor().cacheUnsafe().setCapacity(0))); + + TxnId txnId1 = txnId(1, clock.incrementAndGet(), 1); + Txn txn = createWriteTxn(2); + Key key = (Key)txn.keys().get(0); + RoutingKey homeKey = key.toUnseekable(); + FullRoute fullRoute = txn.keys().toRoute(homeKey); + Route route = fullRoute.slice(fullRange(txn)); + PartialTxn partialTxn = txn.intersecting(route, true); + PreAccept preAccept1 = PreAccept.SerializerSupport.create(txnId1, route, 1, 1, 1, partialTxn, null, false, fullRoute); + + getUninterruptibly(commandStore.execute(preAccept1, safeStore -> { + persistDiff(commandStore, safeStore, txnId1, route, () -> { + preAccept1.apply(safeStore); + }); + })); + + // second preaccept should identify txnId1 as a dependency + TxnId txnId2 = txnId(1, clock.incrementAndGet(), 1); + PreAccept preAccept2 = PreAccept.SerializerSupport.create(txnId2, route, 1, 1, 1, partialTxn, null, false, fullRoute); + getUninterruptibly(commandStore.execute(preAccept2, safeStore -> { + persistDiff(commandStore, safeStore, txnId2, route, () -> { + PreAccept.PreAcceptReply reply = preAccept2.apply(safeStore); + Assert.assertTrue(reply.isOk()); + PreAccept.PreAcceptOk ok = (PreAccept.PreAcceptOk) reply; + Assert.assertTrue(ok.deps.contains(txnId1)); + }); + })); + } + + private static void persistDiff(AccordCommandStore commandStore, SafeCommandStore safeStore, TxnId txnId, Route route, Runnable runnable) + { + SafeCommand safeCommand = safeStore.get(txnId, StoreParticipants.all(route)); + Command before = safeCommand.current(); + runnable.run(); + Command after = safeCommand.current(); + AccordTestUtils.appendCommandsBlocking(commandStore, before, after); + } +} diff --git a/test/unit/org/apache/cassandra/service/accord/AccordConfigurationServiceTest.java b/test/unit/org/apache/cassandra/service/accord/AccordConfigurationServiceTest.java new file mode 100644 index 000000000000..77d6e687d238 --- /dev/null +++ b/test/unit/org/apache/cassandra/service/accord/AccordConfigurationServiceTest.java @@ -0,0 +1,295 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import java.net.UnknownHostException; +import java.nio.file.Files; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; +import java.util.Optional; +import java.util.UUID; + +import org.junit.BeforeClass; +import org.junit.Test; + +import accord.api.Journal; +import accord.impl.AbstractConfigurationServiceTest; +import accord.local.Node.Id; +import accord.topology.Topology; +import accord.utils.SortedArrays.SortedArrayList; +import accord.utils.async.AsyncResult; +import org.agrona.collections.Int2ObjectHashMap; +import org.apache.cassandra.SchemaLoader; +import org.apache.cassandra.ServerTestUtils; +import org.apache.cassandra.concurrent.ScheduledExecutors; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.db.ColumnFamilyStore; +import org.apache.cassandra.db.Keyspace; +import org.apache.cassandra.db.marshal.Int32Type; +import org.apache.cassandra.dht.Murmur3Partitioner; +import org.apache.cassandra.dht.Range; +import org.apache.cassandra.dht.Token; +import org.apache.cassandra.io.util.File; +import org.apache.cassandra.journal.TestParams; +import org.apache.cassandra.locator.InetAddressAndPort; +import org.apache.cassandra.locator.Replica; +import org.apache.cassandra.net.ConnectionType; +import org.apache.cassandra.net.Message; +import org.apache.cassandra.net.MessageDelivery; +import org.apache.cassandra.net.RequestCallback; +import org.apache.cassandra.schema.DistributedSchema; +import org.apache.cassandra.schema.KeyspaceMetadata; +import org.apache.cassandra.schema.KeyspaceParams; +import org.apache.cassandra.schema.Schema; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.schema.Tables; +import org.apache.cassandra.service.accord.journal.AccordTopologyUpdate; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.tcm.ValidatingClusterMetadataService; +import org.apache.cassandra.tcm.membership.Location; +import org.apache.cassandra.tcm.membership.NodeAddresses; +import org.apache.cassandra.tcm.membership.NodeId; +import org.apache.cassandra.tcm.membership.NodeVersion; +import org.apache.cassandra.tcm.ownership.DataPlacement; +import org.apache.cassandra.tcm.serialization.Version; +import org.apache.cassandra.utils.MockFailureDetector; +import org.apache.cassandra.utils.concurrent.Future; + +import static accord.impl.AbstractConfigurationServiceTest.TestListener; +import static org.apache.cassandra.cql3.statements.schema.CreateTableStatement.parse; + +public class AccordConfigurationServiceTest +{ + private static final Id ID1 = new Id(1); + private static final Id ID2 = new Id(2); + private static final Id ID3 = new Id(3); + private static final SortedArrayList ID_LIST = new SortedArrayList<>(new Id[] { ID1, ID2, ID3 }); + private static final String KEYSPACE_NAME = "test_ks"; + private static final TableId TBL_ID = TableId.fromUUID(new UUID(0, 1)); + + private static EndpointMapping mappingForEpoch(long epoch) + { + try + { + EndpointMapping.Builder builder = EndpointMapping.builder(epoch); + builder.add(InetAddressAndPort.getByName("127.0.0.1"), ID1); + builder.add(InetAddressAndPort.getByName("127.0.0.2"), ID2); + builder.add(InetAddressAndPort.getByName("127.0.0.3"), ID3); + return builder.build(); + } + catch (UnknownHostException e) + { + throw new RuntimeException(e); + } + } + + private static class Messaging implements MessageDelivery + { + static class Request + { + final Message message; + final InetAddressAndPort to; + final RequestCallback callback; + + public Request(Message message, InetAddressAndPort to, RequestCallback callback) + { + this.message = message; + this.to = to; + this.callback = callback; + } + } + + final List requests = new ArrayList<>(); + + @Override + public void send(Message message, InetAddressAndPort to) + { + requests.add(new Request(message, to, null)); + } + + @Override + public void sendWithCallback(Message message, InetAddressAndPort to, RequestCallback cb) + { + requests.add(new Request(message, to, cb)); + } + + @Override + public void sendWithCallback(Message message, InetAddressAndPort to, RequestCallback cb, ConnectionType specifyConnection) + { + throw new UnsupportedOperationException(); + } + + @Override + public Future> sendWithResult(Message message, InetAddressAndPort to) + { + throw new UnsupportedOperationException(); + } + + @Override + public void respond(V response, Message message) + { + throw new UnsupportedOperationException(); + } + } + + @BeforeClass + public static void beforeClass() throws Throwable + { + ServerTestUtils.daemonInitialization(); + DatabaseDescriptor.setPartitionerUnsafe(Murmur3Partitioner.instance); + + SchemaLoader.prepareServer(); + SchemaLoader.createKeyspace("ks", KeyspaceParams.simple(1), + parse("CREATE TABLE tbl (k int, c int, v int, primary key (k, c)) WITH transactional_mode='full'", "ks")); + } + + @Test + public void loadTest() throws Throwable + { + ValidatingClusterMetadataService cms = ValidatingClusterMetadataService.createAndRegister(Version.MIN_ACCORD_VERSION); + + AccordJournal journal = null; + try + { + journal = initJournal(); + AccordConfigurationService service = new AccordConfigurationService(ID1, new Messaging(), new MockFailureDetector(), ScheduledExecutors.scheduledTasks); + AccordJournal journal_ = journal; + TestListener listener = new TestListener(service, true) + { + @Override + public AsyncResult onTopologyUpdate(Topology topology, boolean isLoad, boolean startSync) + { + // Fake journal save + journal_.saveTopology(new Journal.TopologyUpdate(new Int2ObjectHashMap<>(), topology, topology), () -> {}); + return super.onTopologyUpdate(topology, isLoad, startSync); + } + }; + service.registerListener(listener); + service.start(); + + Topology topology1 = createTopology(cms); + service.updateMapping(mappingForEpoch(cms.metadata().epoch.getEpoch() + 1)); + service.reportTopology(topology1); + service.receiveRemoteSyncComplete(ID1, 1); + service.receiveRemoteSyncComplete(ID2, 1); + service.receiveRemoteSyncComplete(ID3, 1); + + Topology topology2 = createTopology(cms); + service.reportTopology(topology2); + service.receiveRemoteSyncComplete(ID1, 2); + + Topology topology3 = createTopology(cms); + service.reportTopology(topology3); + + AccordConfigurationService loaded = new AccordConfigurationService(ID1, new Messaging(), new MockFailureDetector(), ScheduledExecutors.scheduledTasks); + loaded.updateMapping(mappingForEpoch(cms.metadata().epoch.getEpoch() + 1)); + listener = new AbstractConfigurationServiceTest.TestListener(loaded, true); + loaded.registerListener(listener); + Iterator iter = journal.replayTopologies(); + // Simulate journal replay + while (iter.hasNext()) + loaded.reportTopology(iter.next().global); + loaded.start(); + + listener.assertTopologiesFor(1L, 2L, 3L); + listener.assertTopologyForEpoch(1, topology1); + listener.assertTopologyForEpoch(2, topology2); + listener.assertTopologyForEpoch(3, topology3); + } + finally + { + if (journal != null) + journal.shutdown(); + } + } + + private static AccordJournal initJournal() throws Throwable + { + File directory = new File(Files.createTempDirectory("config_service_test")); + directory.deleteRecursiveOnExit(); + Keyspace ks = Schema.instance.getKeyspaceInstance("system_accord"); + ColumnFamilyStore cfs = ks.getColumnFamilyStore(AccordKeyspace.JOURNAL); + AccordJournal journal = new AccordJournal(new TestParams(), directory, cfs); + journal.start(null); + journal.unsafeSetStarted(); + return journal; + } + private static Topology createTopology(ValidatingClusterMetadataService cms) + { + ClusterMetadata previous = cms.metadata(); + ClusterMetadata.Transformer next = previous.transformer(); + maybeCreateTable(previous, next); + + ClusterMetadata metadata = next.build().metadata; + cms.setMetadata(metadata); + return AccordTopology.createAccordTopology(metadata); + } + + private static void maybeCreateTable(ClusterMetadata previous, ClusterMetadata.Transformer next) + { + Optional ks = previous.schema.getKeyspaces().get(KEYSPACE_NAME); + if (ks.isPresent()) return; + // lets create it + TableMetadata table = TableMetadata.builder(KEYSPACE_NAME, "tbl") + .id(TBL_ID) + .kind(TableMetadata.Kind.REGULAR) + .partitioner(Murmur3Partitioner.instance) + .addPartitionKeyColumn("pk", Int32Type.instance) + .build(); + KeyspaceMetadata keyspace = KeyspaceMetadata.create(KEYSPACE_NAME, KeyspaceParams.simple(ID_LIST.size())) + .withSwapped(Tables.builder().add(table).build()); + + next.with(new DistributedSchema(previous.schema.getKeyspaces().with(keyspace))); + + for (Id node : ID_LIST) + { + // not forcing the cms node id to match as they do when this logic was first added... + next.register(new NodeAddresses(getAddress(node)), + new Location("dc1", "rack1"), + NodeVersion.CURRENT); + + next.proposeToken(new NodeId(node.id), Collections.singleton(new Murmur3Partitioner.LongToken(node.id))); + } + + DataPlacement.Builder replication = DataPlacement.builder(); + Range fullRange = new Range<>(Murmur3Partitioner.MINIMUM, Murmur3Partitioner.MINIMUM); + for (int i = 0; i < ID_LIST.size(); i++) + { + InetAddressAndPort address = getAddress(ID_LIST.get(i)); + Replica replica = new Replica(address, fullRange, true); + replication.withReadReplica(next.epoch(), replica).withWriteReplica(next.epoch(), replica); + } + next.with(previous.placements.unbuild().with(keyspace.params.replication, replication.build()).build()); + } + + private static InetAddressAndPort getAddress(Id node) + { + try + { + return InetAddressAndPort.getByAddress(new byte[]{127, 0, 0, (byte) node.id}); + } + catch (UnknownHostException e) + { + throw new RuntimeException(e); + } + } +} diff --git a/test/unit/org/apache/cassandra/service/accord/AccordFastPathCoordinatorTest.java b/test/unit/org/apache/cassandra/service/accord/AccordFastPathCoordinatorTest.java new file mode 100644 index 000000000000..f9ba3b1cfc25 --- /dev/null +++ b/test/unit/org/apache/cassandra/service/accord/AccordFastPathCoordinatorTest.java @@ -0,0 +1,253 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import java.util.ArrayList; +import java.util.List; +import java.util.Objects; +import java.util.concurrent.TimeUnit; + +import org.junit.Assert; +import org.junit.BeforeClass; +import org.junit.Test; + +import accord.local.Node; +import accord.topology.Shard; +import accord.topology.Topology; +import com.google.common.collect.Iterables; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.dht.IPartitioner; +import org.apache.cassandra.dht.Murmur3Partitioner; +import org.apache.cassandra.schema.*; +import org.apache.cassandra.service.accord.AccordFastPath.Status; +import org.apache.cassandra.tcm.ClusterMetadata; + +import static org.apache.cassandra.service.accord.AccordTestUtils.*; + +public class AccordFastPathCoordinatorTest +{ + private static final IPartitioner partitioner = Murmur3Partitioner.instance; + private static ClusterMetadata EMPTY; + + + public static final TableId TABLE_1 = TableId.fromString("00000000-0000-0000-0000-000000000001"); + + @BeforeClass + public static void beforeClass() throws Exception + { + DatabaseDescriptor.daemonInitialization(); + DatabaseDescriptor.setPartitionerUnsafe(partitioner); + EMPTY = new ClusterMetadata(partitioner); + } + + private static class CapturedUpdate + { + final Node.Id node; + final Status status; + + public CapturedUpdate(Node.Id node, Status status) + { + this.node = node; + this.status = status; + } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + CapturedUpdate that = (CapturedUpdate) o; + return Objects.equals(node, that.node) && status == that.status; + } + + @Override + public int hashCode() + { + return Objects.hash(node, status); + } + + @Override + public String toString() + { + return "CapturedUpdate{" + + "node=" + node + + ", status=" + status + + '}'; + } + } + + private static CapturedUpdate update(Node.Id node, Status status) + { + return new CapturedUpdate(node, status); + } + + private static class InstrumentedFastPathCoordinator extends AccordFastPathCoordinator + { + private ClusterMetadata currentMetadata = EMPTY; + private List capturedUpdates = new ArrayList<>(); + + public InstrumentedFastPathCoordinator(Node.Id localId) + { + super(localId); + } + + public InstrumentedFastPathCoordinator currentMetadata(ClusterMetadata currentMetadata) + { + this.currentMetadata = currentMetadata; + return this; + } + + @Override + ClusterMetadata currentMetadata() + { + return currentMetadata; + } + + @Override + void registerAsListener() + { + + } + + @Override + void updateFastPath(Node.Id node, Status status, long updateTimeMillis, long updateDelayMillis) + { + capturedUpdates.add(new CapturedUpdate(node, status)); + + } + + @Override + long getAccordFastPathUpdateDelayMillis() + { + return TimeUnit.SECONDS.toMillis(5); + } + } + + @Test + public void simpleAlive() + { + Topology topology = new Topology(1, + Shard.create(AccordTopology.minRange(TABLE_1, token(0)), idList(0, 1, 2), idSet(0, 1, 2)), + Shard.create(AccordTopology.maxRange(TABLE_1, token(0)), idList(3, 4, 5), idSet(3, 4, 5))); + + InstrumentedFastPathCoordinator coordinator = new InstrumentedFastPathCoordinator(id(0)); + coordinator.updatePeers(topology); + + // setup existing fast path state + coordinator.currentMetadata(EMPTY.transformer() + .withFastPathStatusSince(id(1), Status.UNAVAILABLE, 1, 1) + .withFastPathStatusSince(id(3), Status.UNAVAILABLE, 1, 1).build().metadata); + + Assert.assertTrue(coordinator.capturedUpdates.isEmpty()); + + + // peer isn't marked unavailable, shouldn't update + coordinator.onAlive(id(2)); + Assert.assertTrue(coordinator.capturedUpdates.isEmpty()); + + // node isn't a peer, shouldn't update + coordinator.onAlive(id(3)); + Assert.assertTrue(coordinator.capturedUpdates.isEmpty()); + + // node is a peer, should issue update + coordinator.onAlive(id(1)); + Assert.assertEquals(update(id(1), Status.NORMAL), Iterables.getOnlyElement(coordinator.capturedUpdates)); + } + + @Test + public void simpleDead() + { + Topology topology = new Topology(1, + Shard.create(AccordTopology.minRange(TABLE_1, token(0)), idList(0, 1, 2), idSet(0, 1, 2)), + Shard.create(AccordTopology.maxRange(TABLE_1, token(0)), idList(3, 4, 5), idSet(3, 4, 5))); + InstrumentedFastPathCoordinator coordinator = new InstrumentedFastPathCoordinator(id(0)); + coordinator.updatePeers(topology); + Assert.assertTrue(coordinator.capturedUpdates.isEmpty()); + + // not a peer, shouldn't update + coordinator.onDead(id(3)); + Assert.assertTrue(coordinator.capturedUpdates.isEmpty()); + + // is a peer, should update + coordinator.onDead(id(1)); + Assert.assertEquals(update(id(1), Status.UNAVAILABLE), Iterables.getOnlyElement(coordinator.capturedUpdates)); + } + + /** + * We shouldn't be scheduling updates if there aren't any accord tables + */ + @Test + public void noTableTest() + { + InstrumentedFastPathCoordinator coordinator = new InstrumentedFastPathCoordinator(id(0)); + coordinator.start(); + Assert.assertTrue(coordinator.capturedUpdates.isEmpty()); + + coordinator.onDead(id(1)); + Assert.assertTrue(coordinator.capturedUpdates.isEmpty()); + } + + /** + * node should mark itself as shutdown on shutdown + */ + @Test + public void selfShutdownTest() + { + InstrumentedFastPathCoordinator coordinator = new InstrumentedFastPathCoordinator(id(0)); + Assert.assertTrue(coordinator.capturedUpdates.isEmpty()); + + coordinator.onShutdown(); + Assert.assertEquals(update(id(0), Status.SHUTDOWN), Iterables.getOnlyElement(coordinator.capturedUpdates)); + } + + /** + * If a node finds itself marked shutdown on startup, it should mark itself normal + */ + @Test + public void startupTest() + { + InstrumentedFastPathCoordinator coordinator = new InstrumentedFastPathCoordinator(id(0)); + coordinator.currentMetadata(EMPTY.transformer().withFastPathStatusSince(id(0), Status.SHUTDOWN, 1, 1).build().metadata); + Assert.assertTrue(coordinator.capturedUpdates.isEmpty()); + coordinator.start(); + Assert.assertEquals(update(id(0), Status.NORMAL), Iterables.getOnlyElement(coordinator.capturedUpdates)); + } + + /** + * if a peer is marked as shutdown, other nodes should ignore FD signals until it marks itself alive again + */ + @Test + public void peerShutdownTest() + { + Topology topology = new Topology(1, + Shard.create(AccordTopology.minRange(TABLE_1, token(0)), idList(0, 1, 2), idSet(0, 1, 2)), + Shard.create(AccordTopology.maxRange(TABLE_1, token(0)), idList(3, 4, 5), idSet(3, 4, 5))); + InstrumentedFastPathCoordinator coordinator = new InstrumentedFastPathCoordinator(id(0)); + coordinator.currentMetadata(EMPTY.transformer().withFastPathStatusSince(id(1), Status.SHUTDOWN, 1, 1).build().metadata); + coordinator.updatePeers(topology); + Assert.assertTrue(coordinator.capturedUpdates.isEmpty()); + coordinator.start(); + + Assert.assertTrue(coordinator.isPeer(id(1))); + coordinator.onAlive(id(1)); + Assert.assertTrue(coordinator.capturedUpdates.isEmpty()); + coordinator.onDead(id(1)); + Assert.assertTrue(coordinator.capturedUpdates.isEmpty()); + } +} diff --git a/test/unit/org/apache/cassandra/service/accord/AccordJournalOrderTest.java b/test/unit/org/apache/cassandra/service/accord/AccordJournalOrderTest.java new file mode 100644 index 000000000000..03666a777b94 --- /dev/null +++ b/test/unit/org/apache/cassandra/service/accord/AccordJournalOrderTest.java @@ -0,0 +1,109 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import java.util.HashMap; +import java.util.Map; +import java.util.Random; + +import org.junit.Assert; +import org.junit.BeforeClass; +import org.junit.Test; + +import accord.api.Journal; +import accord.local.Command; +import accord.local.StoreParticipants; +import accord.primitives.Ballot; +import accord.primitives.Participants; +import accord.primitives.RoutingKeys; +import accord.primitives.SaveStatus; +import accord.primitives.Status; +import accord.primitives.TxnId; +import accord.utils.AccordGens; +import accord.utils.RandomSource; +import org.apache.cassandra.SchemaLoader; +import org.apache.cassandra.ServerTestUtils; +import org.apache.cassandra.config.CassandraRelevantProperties; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.dht.ByteOrderedPartitioner; +import org.apache.cassandra.io.util.File; +import org.apache.cassandra.journal.TestParams; +import org.apache.cassandra.schema.KeyspaceParams; +import org.apache.cassandra.schema.Schema; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.service.StorageService; +import org.apache.cassandra.service.accord.api.TokenKey; +import org.apache.cassandra.service.consensus.TransactionalMode; +import org.apache.cassandra.utils.StorageCompatibilityMode; + +import static org.apache.cassandra.cql3.statements.schema.CreateTableStatement.parse; + +public class AccordJournalOrderTest +{ + @BeforeClass + public static void beforeClass() throws Throwable + { + CassandraRelevantProperties.JUNIT_STORAGE_COMPATIBILITY_MODE.setEnum(StorageCompatibilityMode.NONE); + SchemaLoader.prepareServer(); + SchemaLoader.createKeyspace("ks", KeyspaceParams.simple(1), + parse("CREATE TABLE tbl (k int, c int, v int, primary key (k, c)) WITH transactional_mode='full'", "ks")); + TableMetadata tbl = Schema.instance.getTableMetadata("ks", "tbl"); + Assert.assertEquals(TransactionalMode.full, tbl.params.transactionalMode); + StorageService.instance.initServer(); + } + + @Test + public void simpleKeyTest() + { + if (new File(DatabaseDescriptor.getAccordJournalDirectory()).exists()) + ServerTestUtils.cleanupDirectory(DatabaseDescriptor.getAccordJournalDirectory()); + AccordJournal accordJournal = new AccordJournal(TestParams.INSTANCE); + accordJournal.start(null); + RandomSource randomSource = RandomSource.wrap(new Random(0)); + TxnId id1 = AccordGens.txnIds().next(randomSource); + TxnId id2 = AccordGens.txnIds().next(randomSource); + + Map res = new HashMap<>(); + for (int i = 0; i < 10_000; i++) + { + TxnId txnId = randomSource.nextBoolean() ? id1 : id2; + JournalKey key = new JournalKey(txnId, JournalKey.Type.COMMAND_DIFF, randomSource.nextInt(5)); + res.compute(key, (k, prev) -> prev == null ? 1 : prev + 1); + Participants participants = RoutingKeys.of(new TokenKey(TableId.generate(), new ByteOrderedPartitioner.BytesToken(new byte[1]))); + Command command = Command.NotDefined.notDefined(txnId, SaveStatus.NotDefined, Status.Durability.NotDurable, StoreParticipants.create(null, participants, null, null, participants, participants), Ballot.ZERO); + accordJournal.saveCommand(key.commandStoreId, + new Journal.CommandUpdate(null, command), + () -> {}); + } + + Runnable check = () -> { + for (JournalKey key : res.keySet()) + { + AccordJournal.Builder diffs = accordJournal.load(key.commandStoreId, key.id); + Assert.assertEquals(String.format("%d != %d for key %s", diffs.count(), res.get(key).intValue(), key), + diffs.count(), res.get(key).intValue()); + } + }; + + check.run(); + accordJournal.closeCurrentSegmentForTestingIfNonEmpty(); + check.run(); + } +} \ No newline at end of file diff --git a/test/unit/org/apache/cassandra/service/accord/AccordJournalTest.java b/test/unit/org/apache/cassandra/service/accord/AccordJournalTest.java new file mode 100644 index 000000000000..58f4e1b1c86a --- /dev/null +++ b/test/unit/org/apache/cassandra/service/accord/AccordJournalTest.java @@ -0,0 +1,145 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.service.accord; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.nio.ByteBuffer; +import java.nio.file.Files; +import java.util.ArrayList; +import java.util.List; + +import org.junit.BeforeClass; +import org.junit.Test; + +import accord.primitives.TxnId; +import accord.utils.AccordGens; +import accord.utils.Gen; +import accord.utils.Gens; +import org.apache.cassandra.ServerTestUtils; +import org.apache.cassandra.config.CassandraRelevantProperties; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.db.Keyspace; +import org.apache.cassandra.dht.Murmur3Partitioner; +import org.apache.cassandra.io.util.DataInputBuffer; +import org.apache.cassandra.io.util.DataOutputBuffer; +import org.apache.cassandra.io.util.File; +import org.apache.cassandra.service.StorageService; +import org.apache.cassandra.utils.AsymmetricOrdering; +import org.apache.cassandra.utils.FBUtilities; +import org.apache.cassandra.utils.FBUtilities.Order; +import org.apache.cassandra.utils.StorageCompatibilityMode; +import org.checkerframework.checker.nullness.qual.Nullable; + +import static accord.utils.Property.qt; +import static org.assertj.core.api.Assertions.assertThat; + +public class AccordJournalTest +{ + @BeforeClass + public static void setCompatibilityMode() throws IOException + { + CassandraRelevantProperties.TEST_STORAGE_COMPATIBILITY_MODE.setEnum(StorageCompatibilityMode.NONE); + + ServerTestUtils.daemonInitialization(); + StorageService.instance.registerMBeans(); + StorageService.instance.setPartitionerUnsafe(Murmur3Partitioner.instance); + ServerTestUtils.prepareServerNoRegister(); + + File directory = new File(Files.createTempDirectory(null)); + directory.deleteRecursiveOnExit(); + DatabaseDescriptor.setAccordJournalDirectory(directory.path()); + StorageService.instance.initServer(); + Keyspace.setInitialized(); + } + + @Test + public void keySerde() + { + DataOutputBuffer buffer = new DataOutputBuffer(); + qt().forAll(keyGen()).check(key -> + { + buffer.clear(); + int expectedSize = JournalKey.SUPPORT.serializedSize(1); + JournalKey.SUPPORT.serialize(key, buffer, 1); + assertThat(buffer.getLength()).isEqualTo(expectedSize); + try (DataInputBuffer input = new DataInputBuffer(buffer.unsafeGetBufferAndFlip(), false)) + { + JournalKey read = JournalKey.SUPPORT.deserialize(input, 1); + assertThat(read).isEqualTo(key); + } + }); + } + + @Test + public void compareKeys() + { + qt().forAll(Gens.lists(keyGen()).ofSizeBetween(2, 100)).check(keys -> + { + keys.sort(JournalKey.SUPPORT); + + List buffers = new ArrayList<>(keys.size()); + for (JournalKey k : keys) buffers.add(toBuffer(k)); + + for (int i = 0; i < keys.size(); i++) + { + JournalKey outerKey = keys.get(i); + for (int j = 0; j < keys.size(); j++) + { + JournalKey innerKey = keys.get(j); + ByteBuffer innerBuffer = buffers.get(j); + Order expected = FBUtilities.compare(outerKey, innerKey, JournalKey.SUPPORT); + Order actual = FBUtilities.compare(outerKey, innerBuffer, new AsymmetricOrdering() + { + @Override + public int compareAsymmetric(JournalKey left, ByteBuffer right) + { + return JournalKey.SUPPORT.compareWithKeyAt(left, right, 0, 1); + } + + @Override + public int compare(@Nullable JournalKey left, @Nullable JournalKey right) + { + throw new UnsupportedOperationException(); + } + }); + assertThat(actual).isEqualTo(expected); + } + } + }); + } + + private static ByteBuffer toBuffer(JournalKey k) + { + try (DataOutputBuffer buffer = new DataOutputBuffer(JournalKey.SUPPORT.serializedSize(1))) + { + JournalKey.SUPPORT.serialize(k, buffer, 1); + return buffer.unsafeGetBufferAndFlip(); + } + catch (IOException e) + { + throw new UncheckedIOException(e); + } + } + + private Gen keyGen() + { + Gen txnIdGen = AccordGens.txnIds(); + return rs -> new JournalKey(txnIdGen.next(rs), JournalKey.Type.COMMAND_DIFF, 0); + } +} diff --git a/test/unit/org/apache/cassandra/service/accord/AccordKeyspaceTest.java b/test/unit/org/apache/cassandra/service/accord/AccordKeyspaceTest.java new file mode 100644 index 000000000000..f609b2ad7315 --- /dev/null +++ b/test/unit/org/apache/cassandra/service/accord/AccordKeyspaceTest.java @@ -0,0 +1,296 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.List; +import java.util.SortedSet; +import java.util.TreeMap; +import java.util.TreeSet; +import java.util.concurrent.atomic.AtomicLong; +import java.util.stream.Collectors; + +import com.google.common.collect.Iterables; +import org.junit.Test; + +import accord.api.RoutingKey; +import accord.local.Command; +import accord.local.Node; +import accord.local.StoreParticipants; +import accord.primitives.Ballot; +import accord.primitives.Deps; +import accord.primitives.FullRoute; +import accord.primitives.KeyDeps; +import accord.primitives.Keys; +import accord.primitives.PartialTxn; +import accord.primitives.RangeDeps; +import accord.primitives.Ranges; +import accord.primitives.Routable; +import accord.primitives.SaveStatus; +import accord.primitives.Status; +import accord.primitives.Timestamp; +import accord.primitives.Txn; +import accord.primitives.TxnId; +import org.apache.cassandra.config.CassandraRelevantProperties; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.cql3.CQLTester; +import org.apache.cassandra.db.Keyspace; +import org.apache.cassandra.db.marshal.Int32Type; +import org.apache.cassandra.dht.IPartitioner; +import org.apache.cassandra.dht.LocalPartitioner; +import org.apache.cassandra.dht.ReversedLongLocalPartitioner; +import org.apache.cassandra.exceptions.InvalidRequestException; +import org.apache.cassandra.schema.MemtableParams; +import org.apache.cassandra.schema.Schema; +import org.apache.cassandra.schema.SchemaProvider; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.service.accord.AccordKeyspace.CommandsForKeyAccessor; +import org.apache.cassandra.service.accord.api.TokenKey; +import org.apache.cassandra.utils.CassandraGenerators; +import org.assertj.core.api.Assertions; +import org.mockito.Mockito; +import org.mockito.stubbing.Answer; + +import static accord.local.Command.Committed.committed; +import static accord.utils.Property.qt; +import static org.apache.cassandra.config.DatabaseDescriptor.getPartitioner; +import static org.apache.cassandra.config.DatabaseDescriptor.setSelectedSSTableFormat; +import static org.apache.cassandra.db.ColumnFamilyStore.FlushReason.UNIT_TESTS; +import static org.apache.cassandra.distributed.test.log.ClusterMetadataTestHelper.setMemtable; +import static org.apache.cassandra.schema.SchemaConstants.ACCORD_KEYSPACE_NAME; +import static org.apache.cassandra.service.accord.AccordKeyspace.CommandsForKeyAccessor.findAllKeysBetween; +import static org.apache.cassandra.service.accord.AccordKeyspace.CommandsForKeyAccessor.makeSystemTableKeyBytes; +import static org.apache.cassandra.service.accord.AccordTestUtils.createTxn; +import static org.apache.cassandra.utils.AbstractTypeGenerators.getTypeSupport; +import static org.apache.cassandra.utils.AccordGenerators.fromQT; + +public class AccordKeyspaceTest extends CQLTester.InMemory +{ + static + { + // since this test does frequent truncates, the info table gets updated and forced flushed... which is 90% of the cost of this test... + // this flag disables that flush + CassandraRelevantProperties.UNSAFE_SYSTEM.setBoolean(true); + } + + @Test + public void serde() + { + AtomicLong now = new AtomicLong(); + + String tableName = createTable("CREATE TABLE %s (k int, c int, v int, PRIMARY KEY (k, c)) WITH transactional_mode = 'full'"); + TableId tableId = Schema.instance.getTableMetadata(KEYSPACE, tableName).id; + Ranges scope = Ranges.of(TokenRange.create(TokenKey.min(tableId, getPartitioner()), TokenKey.max(tableId, getPartitioner()))); + + AccordCommandStore store = AccordTestUtils.createAccordCommandStore(now::incrementAndGet, KEYSPACE, tableName); + + TxnId id = new TxnId(Timestamp.fromValues(1, 42, new Node.Id(1)), Txn.Kind.Read, Routable.Domain.Key); + + Txn txn = createTxn(wrapInTxn(String.format("SELECT * FROM %s.%s WHERE k=? LIMIT 1", KEYSPACE, tableName)), Collections.singletonList(42)); + + PartialTxn partialTxn = txn.slice(scope, true); + RoutingKey routingKey = partialTxn.keys().get(0).asKey().toUnseekable(); + FullRoute route = partialTxn.keys().toRoute(routingKey); + StoreParticipants participants = StoreParticipants.all(route); + Deps deps = new Deps(KeyDeps.none(((Keys) txn.keys()).toParticipants()), RangeDeps.NONE); + + Command.WaitingOn waitingOn = null; + + Command.Committed committed = committed(id, SaveStatus.Committed, Status.Durability.NotDurable, + participants, Ballot.ZERO, id, partialTxn, deps.intersecting(scope), + Ballot.ZERO, waitingOn); + AccordSafeCommand safeCommand = new AccordSafeCommand(AccordTestUtils.loaded(id, null)); + safeCommand.set(committed); + + AccordTestUtils.appendCommandsBlocking(store, null, committed); + + Command loaded = store.loadCommand(id); + Assertions.assertThat(loaded).isEqualTo(committed); + } + + @Test + public void findOverlappingKeys() + { + var tableIdGen = fromQT(CassandraGenerators.TABLE_ID_GEN); + var partitionGen = fromQT(CassandraGenerators.partitioners().assuming(IPartitioner::accordSupported)); + + var sstableFormats = DatabaseDescriptor.getSSTableFormats(); + List sstableFormatNames = new ArrayList<>(sstableFormats.keySet()); + sstableFormatNames.sort(Comparator.naturalOrder()); + + List memtableFormats = MemtableParams.knownDefinitions().stream() + .filter(name -> !name.startsWith("test_") && !name.equals("default")) + .sorted() + .collect(Collectors.toList()); + + qt().check(rs -> { + AccordKeyspace.unsafeClear(); + // control SSTable format + setSelectedSSTableFormat(sstableFormats.get(rs.pick(sstableFormatNames))); + // control memtable format + setMemtable(ACCORD_KEYSPACE_NAME, "commands_for_key", rs.pick(memtableFormats)); + + // define the tables w/ partitioners for the test + // this uses the ability to override the SchemaProvider for the keyspace and only defines the single API call expected: getTablePartitioner + TreeMap tables = new TreeMap<>(); + int numStores = rs.nextInt(1, 3); + int numTables = numStores == 1 ? 1 : rs.nextInt(1, numStores); + for (int i = 0; i < numTables; i++) + { + var tableId = tableIdGen.next(rs); + while (tables.containsKey(tableId)) + tableId = tableIdGen.next(rs); + tables.put(tableId, partitionGen.next(rs)); + } + + TreeMap storeTableIds = new TreeMap<>(); + for (int i = 0; i < numStores; i++) + { + int tableIdx = rs.nextInt(tables.size()); + TableId tableId = Iterables.get(tables.keySet(), tableIdx); + storeTableIds.put(i, tableId); + } + SchemaProvider schema = Mockito.mock(SchemaProvider.class); + Mockito.when(schema.getTablePartitioner(Mockito.any())).thenAnswer((Answer) invocationOnMock -> tables.get(invocationOnMock.getArgument(0))); + + // The model of the DB + TreeMap> storesToKeys = new TreeMap<>(); + // write to the table and the model + for (int i = 0, numKeys = rs.nextInt(10, 20); i < numKeys; i++) + { + int store = rs.nextInt(0, numStores); + var keys = storesToKeys.computeIfAbsent(store, ignore -> new TreeSet<>()); + TokenKey pk = null; + // LocalPartitioner may have a type with a very small domain (boolean, vector, etc.), so need to bound the attempts + // else this will loop forever... + for (int attempt = 0; attempt < 10; attempt++) + { + TableId tableId = storeTableIds.get(store); + IPartitioner partitioner = tables.get(tableId); + ByteBuffer data; + if (partitioner instanceof ReversedLongLocalPartitioner) + data = fromQT(CassandraGenerators.reversedLongLocalKeys()).next(rs); + else if (partitioner instanceof LocalPartitioner) + data = fromQT(getTypeSupport(partitioner.getTokenValidator()).bytesGen()).next(rs); + else + data = Int32Type.instance.decompose(rs.nextInt()); + TokenKey key = new TokenKey(tableId, tables.get(tableId).decorateKey(data).getToken()); + if (keys.add(key)) + { + pk = key; + break; + } + } + if (pk != null) + { + try + { + // using Mutation directly (what we do in Accord) can break when user data is too large; leading to data loss + // The memtable will allow the write, but it will be dropped when writing to the SSTable... + execute("INSERT INTO system_accord.commands_for_key (key) VALUES (?)", + makeSystemTableKeyBytes(store, pk)); + } + catch (IllegalArgumentException | InvalidRequestException e) + { + // Sometimes the types are too large (LocalPartitioner) so the mutation gets rejected... just ignore those cases + // Length 69912 > max length 65535 + String msg = e.getMessage(); + if (msg != null) + { + if ((msg.startsWith("Length ") && msg.endsWith("> max length 65535")) // Clustering was rejected + || (msg.startsWith("Key length of ") && msg.endsWith(" is longer than maximum of 65535"))) // Partition was rejected + { + // failed to add + keys.remove(pk); + continue; + } + } + throw e; + } + } + } + + // read from the table and validate it matches the model + for (int read = 0; read < 2; read++) // read=0 is memtable, read=1 is sstable + { + { + // Make sure no data was lost + // An issue was found that system mutations bypass checks so make their way to the Memtable, but when we flush to SSTable + // they get filtered out, causing data loss... This check is here to make sure that the data is present (test covers Memtable + SStable) + // in the storage before checking if the filtering logic is correct + TreeMap> expectedCqlStoresToKeys = new TreeMap<>(); + for (var e : storesToKeys.entrySet()) + { + int store = e.getKey(); + SortedSet keys = e.getValue(); + if (keys.isEmpty()) + continue; + expectedCqlStoresToKeys.put(store, new TreeSet<>(keys.stream().map(key -> makeSystemTableKeyBytes(store, key)).collect(Collectors.toList()))); + } + + // make sure no data loss... when this test was written sstable had all the rows but the sstable didn't... this + // is mostly a santity check to detect that case early + var resultSet = execute("SELECT key FROM system_accord.commands_for_key ALLOW FILTERING"); + TreeMap> cqlStoresToKeys = new TreeMap<>(); + for (var row : resultSet) + { + ByteBuffer bb = row.getBytes("key"); + int storeId = CommandsForKeyAccessor.getCommandStoreId(bb); + cqlStoresToKeys.computeIfAbsent(storeId, ignore -> new TreeSet<>()).add(bb); + } + Assertions.assertThat(cqlStoresToKeys).isEqualTo(expectedCqlStoresToKeys); + } + + for (int i = 0, queries = rs.nextInt(1, 5); i < queries; i++) + { + int store = rs.pickOrderedSet(storesToKeys.navigableKeySet()); + var keysForStore = new ArrayList<>(storesToKeys.get(store)); + if (keysForStore.isEmpty()) + continue; + + int offset; + int offsetEnd; + if (keysForStore.size() == 1) + { + offset = 0; + offsetEnd = 1; + } + else + { + offset = rs.nextInt(0, keysForStore.size()); + offsetEnd = rs.nextInt(offset, keysForStore.size()) + 1; + } + List expected = keysForStore.subList(offset, offsetEnd); + TokenKey start = expected.get(0); + TokenKey end = expected.get(expected.size() - 1); + + List actual = new ArrayList<>(); + findAllKeysBetween(store, storeTableIds.get(store), start.token().getPartitioner(), start, true, end, true, actual::add); + Assertions.assertThat(actual).isEqualTo(expected); + } + + if (read == 0) + Keyspace.open(ACCORD_KEYSPACE_NAME).getColumnFamilyStore("commands_for_key").forceBlockingFlush(UNIT_TESTS); + } + }); + } +} \ No newline at end of file diff --git a/test/unit/org/apache/cassandra/service/accord/AccordMessageSinkTest.java b/test/unit/org/apache/cassandra/service/accord/AccordMessageSinkTest.java new file mode 100644 index 000000000000..62196013a9bf --- /dev/null +++ b/test/unit/org/apache/cassandra/service/accord/AccordMessageSinkTest.java @@ -0,0 +1,139 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import org.junit.BeforeClass; +import org.junit.Test; + +import accord.api.TopologySorter; +import accord.api.TopologySorter.StaticSorter; +import accord.impl.RequestCallbacks; +import accord.messages.ReadData; +import accord.messages.ReadData.CommitOrReadNack; +import accord.topology.TopologyUtils; +import org.apache.cassandra.service.accord.AccordFetchCoordinator.AccordFetchRequest; +import org.apache.cassandra.service.accord.api.AccordAgent; +import org.apache.cassandra.service.accord.api.AccordTimeService; +import org.mockito.ArgumentCaptor; +import org.mockito.Mockito; + +import accord.Utils; +import accord.impl.AbstractFetchCoordinator; +import accord.impl.IntKey; +import accord.local.Node; +import accord.messages.ReadTxnData; +import accord.messages.Reply; +import accord.messages.Request; +import accord.primitives.Keys; +import accord.primitives.PartialDeps; +import accord.primitives.PartialTxn; +import accord.primitives.Ranges; +import accord.primitives.Routable; +import accord.primitives.Timestamp; +import accord.primitives.Txn; +import accord.primitives.TxnId; +import accord.topology.Topologies; +import accord.topology.Topology; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.dht.Murmur3Partitioner; +import org.apache.cassandra.net.Message; +import org.apache.cassandra.net.MessageDelivery; +import org.apache.cassandra.tcm.ClusterMetadataService; + +public class AccordMessageSinkTest +{ + private static final Node.Id node = new Node.Id(1); + private static final AccordEndpointMapper mapping = SimpleAccordEndpointMapper.INSTANCE; + private static final Topology topology = TopologyUtils.initialTopology(new Node.Id[] { node}, Ranges.of(IntKey.range(0, 100)), 1); + private static final Topologies topologies = new Topologies.Single((TopologySorter) (StaticSorter)(a, b, ignore) -> 0, topology); + + private static final MessageDelivery messaging = Mockito.mock(MessageDelivery.class); + private static AccordMessageSink sink; + + @BeforeClass + public static void setup() + { + DatabaseDescriptor.clientInitialization(); + DatabaseDescriptor.setPartitionerUnsafe(Murmur3Partitioner.instance); + ClusterMetadataService.initializeForClients(); + sink = new AccordMessageSink(Mockito.mock(AccordAgent.class), messaging, mapping, new RequestCallbacks(new AccordTimeService())); + } + + @Test + public void bootstrapRead() + { + long epoch = 42; + Txn txn = Utils.readTxn(Keys.of(IntKey.key(42))); + TxnId id = nextTxnId(epoch, txn); + Ranges ranges = Ranges.of(IntKey.range(40, 50)); + PartialTxn partialTxn = txn.slice(ranges, true); + Request request = new AccordFetchRequest(epoch, id, ranges, PartialDeps.NONE, partialTxn); + + checkRequestReplies(request, + new AbstractFetchCoordinator.FetchResponse(null, null, id), + CommitOrReadNack.Insufficient); + + } + + @Test + public void txnRead() + { + TxnId txnId = nextTxnId(42, Txn.Kind.Read, Routable.Domain.Key); + Request request = new ReadTxnData(node, topologies, txnId, topology.ranges(), txnId.epoch()); + checkRequestReplies(request, + new ReadData.ReadOk(null, null, 0), + CommitOrReadNack.Insufficient); + } + + private static void checkRequestReplies(Request request, Reply... replies) + { + Message requestMessage = send(request); + for (Reply reply : replies) + { + Mockito.clearInvocations(messaging); + try + { + sink.reply(node, requestMessage, reply); + } + catch (Throwable t) + { + throw new AssertionError(String.format("Expected reply type %s (type=%s) to be allowed", reply.getClass().getCanonicalName(), reply.type()), t); + } + } + } + + private static Message send(Request request) + { + Mockito.clearInvocations(messaging); + ArgumentCaptor> captor = ArgumentCaptor.forClass(Message.class); + Mockito.doNothing().when(messaging).send(captor.capture(), Mockito.any()); + sink.send(node, request); + return captor.getValue(); + } + + private static TxnId nextTxnId(long epoch, Txn txn) + { + return nextTxnId(epoch, txn.kind(), txn.keys().domain()); + } + + private static TxnId nextTxnId(long epoch, Txn.Kind rw, Routable.Domain domain) + { + return new TxnId(Timestamp.fromValues(epoch, System.nanoTime(), node), rw, domain); + } +} \ No newline at end of file diff --git a/test/unit/org/apache/cassandra/service/accord/AccordReadRepairTest.java b/test/unit/org/apache/cassandra/service/accord/AccordReadRepairTest.java new file mode 100644 index 000000000000..e050d43a373b --- /dev/null +++ b/test/unit/org/apache/cassandra/service/accord/AccordReadRepairTest.java @@ -0,0 +1,151 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import java.io.IOException; + +import com.google.common.base.Function; +import com.google.common.collect.ImmutableList; +import org.junit.BeforeClass; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.cassandra.distributed.Cluster; +import org.apache.cassandra.distributed.api.ConsistencyLevel; +import org.apache.cassandra.distributed.shared.Metrics; +import org.apache.cassandra.distributed.test.accord.AccordTestBase; +import org.apache.cassandra.net.Verb; +import org.apache.cassandra.service.consensus.TransactionalMode; + +import static org.apache.cassandra.Util.spinAssertEquals; +import static org.apache.cassandra.distributed.shared.AssertUtils.assertRows; +import static org.apache.cassandra.distributed.util.QueryResultUtil.assertThat; +import static org.apache.cassandra.net.Verb.HINT_REQ; +import static org.apache.cassandra.net.Verb.MUTATION_REQ; +import static org.apache.cassandra.net.Verb.READ_REPAIR_REQ; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +public class AccordReadRepairTest extends AccordTestBase +{ + private static final Logger logger = LoggerFactory.getLogger(AccordReadRepairTest.class); + + @Override + protected Logger logger() + { + return logger; + } + + @BeforeClass + public static void setupClass() throws IOException + { + AccordTestBase.setupCluster(builder -> builder, 2); + SHARED_CLUSTER.setMessageSink(new MessageCountingSink(SHARED_CLUSTER)); + } + + /* + * SERIAL read and CAS create Accord transactions which will then invoke Cassandra coordination to perform the read + * and proxy any read repairs that are generated. + */ + @Test + public void testSerialReadRepair() throws Exception + { + testReadRepair(cluster -> cluster.coordinator(1).execute("SELECT * FROM " + qualifiedAccordTableName + " WHERE k = 1 AND c = 1;", ConsistencyLevel.SERIAL), + new Object[][] {{1, 1, 1, 1}}, + TransactionalMode.test_unsafe_writes, + 0, 2, 1, 0); + } + + @Test + public void testCASFailedConditionReadRepair() throws Exception + { + // Even if the condition fails to apply the data checked when applying the condition should be repaired + testReadRepair(cluster -> cluster.coordinator(1).execute("INSERT INTO " + qualifiedAccordTableName + " (k, c, v1) VALUES (1, 1, 99) IF NOT EXISTS;", ConsistencyLevel.SERIAL), + new Object[][] {{false, 1, 1, 1, 1}}, + TransactionalMode.test_unsafe_writes, + 2, 0, 1, 0); + } + + @Test + public void testCASReadRepair() throws Exception + { + // If the condition applies the read repair should preserve the existing timestamp + testReadRepair(cluster -> cluster.coordinator(1).execute("UPDATE " + qualifiedAccordTableName + " SET v2 = 99 WHERE k = 1 and c = 1 IF EXISTS;", ConsistencyLevel.SERIAL), + new Object[][] {{Boolean.TRUE}}, + TransactionalMode.test_unsafe_writes, + 2, 0, 1, 0); + } + + /* + * non-SERIAL consistency levels are coordinated by C* and then if a partition needs to be repaired an Accord transaction + * is created for each partition repair to proxy the repair mutations safely. + */ + @Test + public void testNonSerialReadRepair() throws Exception + { + for (ConsistencyLevel cl : ImmutableList.of(ConsistencyLevel.QUORUM)) + testReadRepair(cluster -> cluster.coordinator(1).execute("SELECT * FROM " + qualifiedAccordTableName + " WHERE k = 1 AND c = 1;", cl), + new Object[][] {{1, 1, 1, 1}}, + TransactionalMode.test_unsafe_writes, + 0, 2, 0, 1); + } + + @Test + public void testNonSerialRangeReadRepair() throws Exception + { + for (ConsistencyLevel cl : ImmutableList.of(ConsistencyLevel.QUORUM)) + testReadRepair(cluster -> cluster.coordinator(1).execute("SELECT * FROM " + qualifiedAccordTableName + " WHERE TOKEN(k) > " + Long.MIN_VALUE + " AND TOKEN(k) < " + Long.MAX_VALUE, cl), + new Object[][] {{1, 1, 1, 1}}, + TransactionalMode.test_unsafe_writes, + 0, 2, 0, 1); + } + + void testReadRepair(Function accordTxn, Object[][] expected, TransactionalMode transactionalMode, int expectedInteropApply, int expectedRegularApply, int expectedReadRepairFromAccord, int expectedReadRepairViaAccord) throws Exception + { + test("CREATE TABLE " + qualifiedAccordTableName + " (k int, c int, v1 int, v2 int, PRIMARY KEY ((k), c)) WITH transactional_mode='" + transactionalMode + "';", + cluster -> { + Metrics metrics = cluster.get(1).metrics(); + String repairedFromAccordName = metrics.getNames().stream().filter(name -> name.contains("RepairedBlockingFromAccord")).findFirst().get(); + String repairedViaAccordName = metrics.getNames().stream().filter(name -> name.contains("RepairedBlockingViaAccord")).findFirst().get(); + long startingRepairedFromAccord = metrics.getCounter(repairedFromAccordName); + long startingRepairedViaAccord = metrics.getCounter(repairedViaAccordName); + + cluster.filters().verbs(READ_REPAIR_REQ.id, MUTATION_REQ.id, HINT_REQ.id).drop().on(); + cluster.get(1).executeInternal("INSERT INTO " + qualifiedAccordTableName + " (k, c, v1, v2) VALUES (1, 1, 1, 1) USING TIMESTAMP 42;"); + assertThat(cluster.get(2).executeInternalWithResult("SELECT * FROM " + qualifiedAccordTableName + " WHERE k = 1 AND c = 1;")).isEmpty(); + // Should perform read repair + Object[][] result = accordTxn.apply(cluster); + assertRows(result, expected); + // Side effect of the read repair should be visible now + assertThat(cluster.get(2).executeInternalWithResult("SELECT k, c, v1, WRITETIME(v1) FROM " + qualifiedAccordTableName + " WHERE k = 1 AND c = 1;")) + .isEqualTo(1, 1, 1, 42L); + assertEquals(expectedInteropApply, messageCount(Verb.ACCORD_INTEROP_APPLY_REQ)); + // Regular apply is async so need to spin + spinAssertEquals(expectedRegularApply, () -> messageCount(Verb.ACCORD_APPLY_REQ)); + assertEquals(1, messageCount(Verb.ACCORD_INTEROP_READ_REPAIR_REQ)); + long readRepairRspCount = messageCount(Verb.ACCORD_INTEROP_READ_REPAIR_RSP); + assertTrue("Should be 1-2 read repair responses depending on whether insufficient occurred", readRepairRspCount >= 1 && readRepairRspCount <= 2); + long repairedFromAccord = metrics.getCounter(repairedFromAccordName) - startingRepairedFromAccord; + assertEquals(expectedReadRepairFromAccord, repairedFromAccord); + long repairedViaAccord = metrics.getCounter(repairedViaAccordName) - startingRepairedViaAccord; + assertEquals(expectedReadRepairViaAccord, repairedViaAccord); + }); + } +} diff --git a/test/unit/org/apache/cassandra/service/accord/AccordSerializersTest.java b/test/unit/org/apache/cassandra/service/accord/AccordSerializersTest.java new file mode 100644 index 000000000000..44908a0fa115 --- /dev/null +++ b/test/unit/org/apache/cassandra/service/accord/AccordSerializersTest.java @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import org.junit.Test; + +import org.apache.cassandra.io.Serializers; +import org.apache.cassandra.io.util.DataOutputBuffer; +import org.apache.cassandra.utils.CassandraGenerators; +import org.apache.cassandra.utils.Generators; + +import static accord.utils.Property.qt; + +public class AccordSerializersTest +{ + @Test + public void clustering() + { + DataOutputBuffer output = new DataOutputBuffer(); + qt().forAll(Generators.toGen(CassandraGenerators.CLUSTERING_GEN)).check(clustering -> { + Serializers.testSerde(output, AccordSerializers.clusteringSerializer, clustering); + }); + } +} \ No newline at end of file diff --git a/test/unit/org/apache/cassandra/service/accord/AccordStaleReplicasTest.java b/test/unit/org/apache/cassandra/service/accord/AccordStaleReplicasTest.java new file mode 100644 index 000000000000..870c989f4788 --- /dev/null +++ b/test/unit/org/apache/cassandra/service/accord/AccordStaleReplicasTest.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import java.util.HashSet; +import java.util.Set; + +import org.junit.Test; + +import accord.local.Node; +import accord.utils.AccordGens; +import accord.utils.Gen; +import accord.utils.Gens; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.io.util.DataOutputBuffer; +import org.apache.cassandra.tcm.Epoch; +import org.apache.cassandra.tcm.serialization.AsymmetricMetadataSerializers; +import org.apache.cassandra.tcm.serialization.Version; + +import static accord.utils.Property.qt; + +public class AccordStaleReplicasTest +{ + static + { + DatabaseDescriptor.toolInitialization(); + } + + @Test + public void serde() + { + try (DataOutputBuffer buffer = new DataOutputBuffer()) + { + Gen> nodesGen = Gens.lists(AccordGens.nodes()).unique().ofSizeBetween(0, 9).map(nodes -> new HashSet<>(nodes)); + Gen epochGen = AccordGens.epochs().map(Epoch::create); + + qt().check(rs -> { + Epoch epoch = epochGen.next(rs); + Set nodes = nodesGen.next(rs); + AsymmetricMetadataSerializers.testSerde(buffer, AccordStaleReplicas.serializer, new AccordStaleReplicas(nodes, epoch), Version.MIN_ACCORD_VERSION); + }); + } + } +} diff --git a/test/unit/org/apache/cassandra/service/accord/AccordSyncPropagatorTest.java b/test/unit/org/apache/cassandra/service/accord/AccordSyncPropagatorTest.java new file mode 100644 index 000000000000..948dc80afbe7 --- /dev/null +++ b/test/unit/org/apache/cassandra/service/accord/AccordSyncPropagatorTest.java @@ -0,0 +1,521 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import java.net.InetAddress; +import java.net.UnknownHostException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.NavigableSet; +import java.util.Objects; +import java.util.Set; +import java.util.TreeSet; +import java.util.concurrent.TimeUnit; +import java.util.stream.Collectors; + +import com.google.common.collect.ImmutableBiMap; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Sets; +import org.junit.BeforeClass; +import org.junit.Test; + +import accord.api.Agent; +import accord.impl.AbstractConfigurationService; +import accord.impl.TestAgent; +import accord.impl.basic.Pending; +import accord.impl.basic.PendingQueue; +import accord.impl.basic.MonitoredPendingQueue; +import accord.impl.basic.RandomDelayQueue; +import accord.impl.basic.SimulatedDelayedExecutorService; +import accord.local.Node; +import accord.primitives.Range; +import accord.primitives.Ranges; +import accord.topology.Topology; +import accord.utils.AccordGens; +import accord.utils.Gen; +import accord.utils.Gens; +import accord.utils.RandomSource; +import org.apache.cassandra.concurrent.AdaptingScheduledExecutorPlus; +import org.apache.cassandra.concurrent.ScheduledExecutorPlus; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.exceptions.RequestFailure; +import org.apache.cassandra.gms.EndpointState; +import org.apache.cassandra.gms.Gossiper; +import org.apache.cassandra.gms.HeartBeatState; +import org.apache.cassandra.gms.IFailureDetectionEventListener; +import org.apache.cassandra.gms.IFailureDetector; +import org.apache.cassandra.locator.InetAddressAndPort; +import org.apache.cassandra.net.ConnectionType; +import org.apache.cassandra.net.Message; +import org.apache.cassandra.net.MessageDelivery; +import org.apache.cassandra.net.RequestCallback; +import org.apache.cassandra.tcm.ValidatingClusterMetadataService; +import org.apache.cassandra.tcm.serialization.Version; +import org.apache.cassandra.utils.AccordGenerators; +import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.concurrent.Future; +import org.assertj.core.api.Assertions; + +import static accord.utils.Property.qt; +import static org.apache.cassandra.simulator.RandomSource.Choices.choose; + +public class AccordSyncPropagatorTest +{ + @BeforeClass + public static void setup() throws NoSuchFieldException, IllegalAccessException + { + DatabaseDescriptor.daemonInitialization(); + ValidatingClusterMetadataService.createAndRegister(Version.MIN_ACCORD_VERSION); + } + + @Test + public void burnTest() + { + Gen rangesGen = AccordGenerators.ranges().filter(r -> !r.isEmpty()); + Gen> nodesGen = Gens.lists(AccordGens.nodes()).unique().ofSizeBetween(1, 40); + qt().withExamples(100).check(rs -> { + // when gossip and cluster metadata don't know an endpoint, retries are avoided (node removed) + // so when instances are created here they are added to gossip to trick the membership check... + Gossiper.instance.clearUnsafe(); + + List nodes = nodesGen.next(rs); + Set nodesAsSet = ImmutableSet.copyOf(nodes); + + List failures = new ArrayList<>(); + RandomDelayQueue delayQueue = new RandomDelayQueue.Factory(rs).get(); + PendingQueue queue = new MonitoredPendingQueue(failures, delayQueue); + Agent agent = new TestAgent.RethrowAgent(); + SimulatedDelayedExecutorService globalExecutor = new SimulatedDelayedExecutorService(queue, agent, null); + ScheduledExecutorPlus scheduler = new AdaptingScheduledExecutorPlus(globalExecutor); + + Cluster cluster = new Cluster(nodes, rs, scheduler); + + long epochOffset = rs.nextLong(1, 1024); + int numEpochs = rs.nextInt(1, 10); + Map allRanges = new HashMap<>(); + Pending.Global.setNoActiveOrigin(); + for (int i = 0; i < numEpochs; i++) + { + long epoch = epochOffset + i; + Ranges ranges = rangesGen.next(rs); + allRanges.put(epoch, ranges); + scheduler.schedule(() -> { + for (Node.Id nodeId : nodes) + cluster.node(nodeId).propagator.reportSyncComplete(epoch, nodes, nodeId); + + for (int j = 0, attempts = rs.nextInt(1, 4); j < attempts; j++) + { + for (Range range : ranges) + { + Cluster.Instace inst = cluster.node(choose(rs, nodes)); + scheduler.schedule(() -> { + Ranges subrange = Ranges.of(range); + inst.propagator.reportClosed(epoch, nodes, subrange); + scheduler.schedule(() -> inst.propagator.reportRetired(epoch, nodes, subrange), 1, TimeUnit.MINUTES); + }, rs.nextInt(30, 300), TimeUnit.SECONDS); + } + } + }, rs.nextInt(30, 300), TimeUnit.SECONDS); + } + Pending.Global.clearActiveOrigin(); + + while (queue.size() > 0) + { + Runnable next = (Runnable) queue.poll(); + if (next == null) + break; + Pending.Global.setActiveOrigin((Pending)next); + next.run(); + Pending.Global.clearActiveOrigin(); + if (!failures.isEmpty()) + { + RuntimeException e = new RuntimeException("Failures detected"); + failures.forEach(e::addSuppressed); + throw e; + } + } + if (hasPending(cluster)) + throw new AssertionError("Unable to make progress: pending syncs on \n" + cluster.instances.values().stream().filter(i -> i.propagator.hasPending()).map(i -> i.propagator.toString()).collect(Collectors.joining("\n"))); + + for (Cluster.Instace inst : cluster.instances.values()) + { + Cluster.ConfigService cs = inst.configurationService; + assertSetsEqual(cs.completedEpochs, allRanges.keySet(), "completedEpochs %s", inst.id); + assertSetsEqual(cs.syncCompletes.keySet(), allRanges.keySet(), "syncCompletes %s", inst.id); + for (Map.Entry> e : cs.syncCompletes.entrySet()) + assertSetsEqual(e.getValue(), nodesAsSet, "syncCompletes values on %s", inst.id); + + assertMapEquals(cs.closed, allRanges, "Unexpected state for closed on %s", inst.id); + assertMapEquals(cs.redundant, allRanges, "Unexpected state for redundant on %s", inst.id); + } + }); + } + + private static void assertSetsEqual(Set actual, Set expected, String msg, Object... args) + { + Set notExpected = Sets.difference(actual, expected); + Assertions.assertThat(notExpected).describedAs("Unexpected values detected; " + msg, args).isEmpty(); + Set missing = Sets.difference(expected, actual); + Assertions.assertThat(missing).describedAs("Missing values detected; " + msg, args).isEmpty(); + } + + private static void assertMapEquals(Map actual, Map expected, String msg, Object... args) + { + assertSetsEqual(actual.keySet(), expected.keySet(), msg, args); + List errors = new ArrayList<>(); + for (Map.Entry e : actual.entrySet()) + { + V value = e.getValue(); + V other = expected.get(e.getKey()); + if (!Objects.equals(value, other)) + errors.add(String.format("Missmatch at key %s: expected %s but given %s", e.getKey(), other, value)); + } + if (!errors.isEmpty()) + throw new AssertionError(String.join("\n", errors)); + } + + private static boolean hasPending(Cluster cluster) + { + return cluster.instances.values().stream().anyMatch(i -> i.propagator.hasPending()); + } + + private static class Cluster implements AccordEndpointMapper + { + private final ImmutableBiMap nodeToAddress; + private final ImmutableMap instances; + private final RandomSource rs; + private final ScheduledExecutorPlus scheduler; + + private Cluster(List nodes, + RandomSource rs, + ScheduledExecutorPlus scheduler) + { + this.rs = rs; + this.scheduler = scheduler; + ImmutableBiMap.Builder nodeToAddress = ImmutableBiMap.builder(); + ImmutableMap.Builder instances = ImmutableMap.builder(); + for (Node.Id id : nodes) + { + InetAddressAndPort address = addressFromInt(id.id); + nodeToAddress.put(id, address); + ConfigService cs = new ConfigService(id); + Sink sink = new Sink(id); + IFailureDetector fd = new FailureDetector(address); + instances.put(id, new Instace(id, address, cs, sink, fd, cs, new AccordSyncPropagator(id, Cluster.this, sink, fd, scheduler, cs))); + Gossiper.instance.endpointStateMap.put(address, new EndpointState(HeartBeatState.empty())); + } + this.nodeToAddress = nodeToAddress.build(); + this.instances = instances.build(); + } + + private InetAddressAndPort addressFromInt(int value) + { + byte[] array = ByteBufferUtil.bytes(value).array(); + try + { + InetAddress address = InetAddress.getByAddress(array); + return InetAddressAndPort.getByAddressOverrideDefaults(address, 1); + } + catch (UnknownHostException e) + { + throw new AssertionError(e); + } + } + + public Cluster.Instace node(Node.Id id) + { + Instace instace = instances.get(id); + if (instace == null) + throw new NullPointerException("Unknown id: " + id); + return instace; + } + + public Cluster.Instace node(InetAddressAndPort address) + { + return node(mappedId(address)); + } + + @Override + public Node.Id mappedIdOrNull(InetAddressAndPort endpoint) + { + return nodeToAddress.inverse().get(endpoint); + } + + @Override + public InetAddressAndPort mappedEndpointOrNull(Node.Id id) + { + return nodeToAddress.get(id); + } + + private enum Action + { + DELIVER, TIMEOUT, ERROR + } + + private class Sink implements MessageDelivery + { + private final Node.Id from; + private final Map> callbacks = new HashMap<>(); + private final Map> nodeActions = new HashMap<>(); + + private Sink(Node.Id from) + { + this.from = from; + } + + @Override + public void send(Message message, InetAddressAndPort to) + { + throw new UnsupportedOperationException(); + } + + @Override + public void sendWithCallback(Message message, InetAddressAndPort to, RequestCallback cb) + { + Action action = action(to); + switch (action) + { + case ERROR: + cb.onFailure(to, RequestFailure.UNKNOWN); + return; + case TIMEOUT: + cb.onFailure(to, RequestFailure.TIMEOUT); + return; + case DELIVER: + break; + default: + throw new IllegalStateException("Unknown action: " + action); + } + callbacks.put(message.id(), cb); + scheduler.schedule(() -> AccordService.receive(this, node(to).configurationService, (Message) message.withFrom(mappedEndpoint(from))), 500, TimeUnit.MILLISECONDS); + scheduler.schedule(() -> { + RequestCallback removed = callbacks.remove(message.id()); + if (removed != null) + removed.onFailure(to, RequestFailure.TIMEOUT); + }, 1, TimeUnit.MINUTES); + } + + @Override + public void sendWithCallback(Message message, InetAddressAndPort to, RequestCallback cb, ConnectionType specifyConnection) + { + throw new UnsupportedOperationException(); + } + + @Override + public Future> sendWithResult(Message message, InetAddressAndPort to) + { + throw new UnsupportedOperationException(); + } + + @Override + public void respond(V response, Message message) + { + Action action = action(message.respondTo()); + switch (action) + { + case ERROR: + case TIMEOUT: + return; + case DELIVER: + break; + default: + throw new IllegalStateException("Unknown action: " + action); + } + + RequestCallback cb = node(message.respondTo()).messagingService.callbacks.remove(message.id()); + if (cb != null) + cb.onResponse(message.responseWith(response)); + } + + private Action action(InetAddressAndPort to) + { + return nodeActions.computeIfAbsent(to, ignore -> Gens.enums().allWithWeights(Action.class, 81, 10, 1)).next(rs); + } + } + + private class FailureDetector implements IFailureDetector + { + private final InetAddressAndPort self; + private final Map> nodeRuns = new HashMap<>(); + + private FailureDetector(InetAddressAndPort self) + { + this.self = self; + } + + @Override + public boolean isAlive(InetAddressAndPort ep) + { + if (self.equals(ep)) return true; + + return !nodeRuns.computeIfAbsent(ep, ignore -> Gens.bools().biasedRepeatingRuns(.01, rs.nextInt(3, 15))).next(rs); + } + + @Override + public void interpret(InetAddressAndPort ep) + { + throw new UnsupportedOperationException(); + } + + @Override + public void report(InetAddressAndPort ep) + { + throw new UnsupportedOperationException(); + } + + @Override + public void remove(InetAddressAndPort ep) + { + throw new UnsupportedOperationException(); + } + + @Override + public void forceConviction(InetAddressAndPort ep) + { + throw new UnsupportedOperationException(); + } + + @Override + public void registerFailureDetectionEventListener(IFailureDetectionEventListener listener) + { + throw new UnsupportedOperationException(); + } + + @Override + public void unregisterFailureDetectionEventListener(IFailureDetectionEventListener listener) + { + throw new UnsupportedOperationException(); + } + } + + private class ConfigService extends AbstractConfigurationService.Minimal implements AccordSyncPropagator.Listener + { + private final Map> syncCompletes = new HashMap<>(); + private final Map> endpointAcks = new HashMap<>(); + private final NavigableSet completedEpochs = Collections.synchronizedNavigableSet(new TreeSet<>()); + private final Map closed = new HashMap<>(); + private final Map redundant = new HashMap<>(); + + private ConfigService(Node.Id node) + { + super(node); + } + + @Override + protected void receiveRemoteSyncCompletePreListenerNotify(Node.Id node, long epoch) + { + syncCompletes.computeIfAbsent(epoch, ignore -> new HashSet<>()).add(node); + } + + @Override + public void fetchTopologyForEpoch(long epoch) + { + // TODO + } + + @Override + protected void localSyncComplete(Topology topology, boolean startSync) + { + Set notify = topology.nodes().stream().filter(i -> !localId.equals(i)).collect(Collectors.toSet()); + instances.get(localId).propagator.reportSyncComplete(topology.epoch(), notify, localId); + } + + @Override + public void reportEpochClosed(Ranges ranges, long epoch) + { + Topology topology = getTopologyForEpoch(epoch); + instances.get(localId).propagator.reportClosed(epoch, topology.nodes(), ranges); + } + + @Override + public void reportEpochRetired(Ranges ranges, long epoch) + { + Topology topology = getTopologyForEpoch(epoch); + instances.get(localId).propagator.reportRetired(epoch, topology.nodes(), ranges); + } + + @Override + public void reportEpochRemoved(long epoch) + { + } + + @Override + public void onEndpointAck(Node.Id id, long epoch) + { + endpointAcks.computeIfAbsent(epoch, ignore -> new HashSet<>()).add(id); + } + + @Override + public void onComplete(long epoch) + { + completedEpochs.add(epoch); + // TODO why do we see multiple calls? +// if (!completedEpochs.add(epoch)) +// throw new IllegalStateException("Completed epoch " + epoch + " multiple times"); + } + + @Override + public synchronized void receiveClosed(Ranges ranges, long epoch) + { + super.receiveClosed(ranges, epoch); + closed.merge(epoch, ranges, Ranges::with); + } + + @Override + public synchronized void receiveRetired(Ranges ranges, long epoch) + { + super.receiveRetired(ranges, epoch); + redundant.merge(epoch, ranges, Ranges::with); + } + } + + public class Instace + { + private final Node.Id id; + private final InetAddressAndPort address; + private final ConfigService configurationService; + private final Sink messagingService; + private final IFailureDetector failureDetector; + private final AccordSyncPropagator.Listener listener; + private final AccordSyncPropagator propagator; + + private Instace(Node.Id id, + InetAddressAndPort address, + ConfigService configurationService, + Sink messagingService, + IFailureDetector failureDetector, + AccordSyncPropagator.Listener listener, + AccordSyncPropagator propagator) + { + this.id = id; + this.address = address; + this.configurationService = configurationService; + this.messagingService = messagingService; + this.failureDetector = failureDetector; + this.listener = listener; + this.propagator = propagator; + } + } + } +} diff --git a/test/unit/org/apache/cassandra/service/accord/AccordTaskTest.java b/test/unit/org/apache/cassandra/service/accord/AccordTaskTest.java new file mode 100644 index 000000000000..9c6dc1360c95 --- /dev/null +++ b/test/unit/org/apache/cassandra/service/accord/AccordTaskTest.java @@ -0,0 +1,491 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import java.time.Duration; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicLong; +import java.util.concurrent.locks.LockSupport; +import java.util.function.BiConsumer; +import java.util.function.Consumer; + +import accord.local.StoreParticipants; +import accord.primitives.Participants; +import accord.primitives.Route; + +import com.google.common.collect.Iterables; +import com.google.common.collect.Maps; + +import org.junit.Assert; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import accord.api.RoutingKey; +import accord.local.cfk.SafeCommandsForKey; +import accord.local.CheckedCommands; +import accord.local.Command; +import accord.local.PreLoadContext; +import accord.local.SafeCommand; +import accord.local.SafeCommandStore; +import accord.primitives.SaveStatus; +import accord.primitives.Ballot; +import accord.primitives.FullRoute; +import accord.primitives.Keys; +import accord.primitives.PartialDeps; +import accord.primitives.PartialTxn; +import accord.primitives.Ranges; +import accord.primitives.Timestamp; +import accord.primitives.Txn; +import accord.primitives.TxnId; +import accord.utils.Gen; +import accord.utils.Gens; +import accord.utils.RandomSource; +import org.apache.cassandra.SchemaLoader; +import org.apache.cassandra.cql3.QueryProcessor; +import org.apache.cassandra.db.ReadExecutionController; +import org.apache.cassandra.db.SinglePartitionReadCommand; +import org.apache.cassandra.db.transform.FilteredPartitions; +import org.apache.cassandra.schema.KeyspaceParams; +import org.apache.cassandra.schema.Schema; +import org.apache.cassandra.schema.SchemaConstants; +import org.apache.cassandra.service.StorageService; +import org.apache.cassandra.service.accord.AccordCommandStore.ExclusiveCaches; +import org.apache.cassandra.service.accord.AccordExecutor.ExclusiveGlobalCaches; +import org.apache.cassandra.service.accord.AccordKeyspace.CommandsForKeyAccessor; +import org.apache.cassandra.service.accord.api.TokenKey; +import org.apache.cassandra.service.accord.api.PartitionKey; +import org.apache.cassandra.utils.AssertionUtils; +import org.apache.cassandra.utils.FBUtilities; +import org.apache.cassandra.utils.concurrent.Condition; +import org.assertj.core.api.Assertions; +import org.awaitility.Awaitility; +import org.mockito.Mockito; + +import static accord.local.KeyHistory.SYNC; +import static accord.local.PreLoadContext.contextFor; +import static accord.utils.Property.qt; +import static accord.utils.async.AsyncChains.getUninterruptibly; +import static org.apache.cassandra.cql3.statements.schema.CreateTableStatement.parse; +import static org.apache.cassandra.service.accord.AccordTestUtils.createAccordCommandStore; +import static org.apache.cassandra.service.accord.AccordTestUtils.createPartialTxn; +import static org.apache.cassandra.service.accord.AccordTestUtils.keys; +import static org.apache.cassandra.service.accord.AccordTestUtils.loaded; +import static org.apache.cassandra.service.accord.AccordTestUtils.txnId; + +public class AccordTaskTest +{ + private static final Logger logger = LoggerFactory.getLogger(AccordTaskTest.class); + private static final AtomicLong clock = new AtomicLong(0); + + @BeforeClass + public static void beforeClass() throws Throwable + { + SchemaLoader.prepareServer(); + SchemaLoader.createKeyspace("ks", KeyspaceParams.simple(1), + parse("CREATE TABLE tbl (k int, c int, v int, primary key (k, c)) WITH transactional_mode='full'", "ks")); + StorageService.instance.initServer(); + } + + @Before + public void before() + { + QueryProcessor.executeInternal(String.format("TRUNCATE %s.%s", SchemaConstants.ACCORD_KEYSPACE_NAME, AccordKeyspace.COMMANDS_FOR_KEY)); + } + + /** + * Commands which were not previously on disk and were only accessed via `ifPresent`, and therefore, + * not initialized, should not be saved at the end of the operation + */ + @Test + public void optionalCommandTest() throws Throwable + { + AccordCommandStore commandStore = createAccordCommandStore(clock::incrementAndGet, "ks", "tbl"); + TxnId txnId = txnId(1, clock.incrementAndGet(), 1); + + getUninterruptibly(commandStore.execute(txnId, instance -> { + // TODO review: This change to `ifInitialized` was done in a lot of places and it doesn't preserve this property + // I fixed this reference to point to `ifLoadedAndInitialised` and but didn't update other places + Assert.assertNull(instance.ifInitialised(txnId)); + Assert.assertNull(instance.ifLoadedAndInitialised(txnId)); + })); + } + + @Test + public void touchUnknownTxn() throws Throwable + { + AccordCommandStore commandStore = createAccordCommandStore(clock::incrementAndGet, "ks", "tbl"); + TxnId txnId = txnId(1, clock.incrementAndGet(), 1); + + getUninterruptibly(commandStore.execute(txnId, safe -> { + StoreParticipants participants = StoreParticipants.empty(txnId); + SafeCommand command = safe.get(txnId, participants); + Assert.assertNotNull(command); + })); + } + + @Test + public void optionalCommandsForKeyTest() throws Throwable + { + AccordCommandStore commandStore = createAccordCommandStore(clock::incrementAndGet, "ks", "tbl"); + Txn txn = AccordTestUtils.createWriteTxn((int)clock.incrementAndGet()); + TokenKey key = ((PartitionKey) Iterables.getOnlyElement(txn.keys())).toUnseekable(); + + getUninterruptibly(commandStore.execute(contextFor(key), instance -> { + SafeCommandsForKey cfk = instance.ifLoadedAndInitialised(key); + Assert.assertNull(cfk); + })); + + long nowInSeconds = FBUtilities.nowInSeconds(); + SinglePartitionReadCommand command = CommandsForKeyAccessor.makeRead(commandStore.id(), key, (int) nowInSeconds); + try(ReadExecutionController controller = command.executionController(); + FilteredPartitions partitions = FilteredPartitions.filter(command.executeLocally(controller), nowInSeconds)) + { + Assert.assertFalse(partitions.hasNext()); + } + } + + private static Command createStableAndPersist(AccordCommandStore commandStore, TxnId txnId, Timestamp executeAt) + { + Command command = AccordTestUtils.Commands.stable(txnId, createPartialTxn(0), executeAt); + AccordSafeCommand safeCommand = new AccordSafeCommand(loaded(txnId, null)); + safeCommand.set(command); + + appendDiffToLog(commandStore).accept(null, command); + return command; + } + + private static Command createStableAndPersist(AccordCommandStore commandStore, TxnId txnId) + { + return createStableAndPersist(commandStore, txnId, txnId); + } + + private static Command createStableUsingFastLifeCycle(AccordCommandStore commandStore, TxnId txnId) + { + return createStableUsingFastLifeCycle(commandStore, txnId, txnId); + } + + private static Command createStableUsingFastLifeCycle(AccordCommandStore commandStore, TxnId txnId, Timestamp executeAt) + { + PartialTxn partialTxn = createPartialTxn(0); + RoutingKey routingKey = partialTxn.keys().get(0).asKey().toUnseekable(); + FullRoute route = partialTxn.keys().toRoute(routingKey); + Ranges ranges = AccordTestUtils.fullRange(partialTxn.keys()); + route.slice(ranges); + PartialDeps deps = PartialDeps.builder(ranges, true).build(); + + try + { + Command command = getUninterruptibly(commandStore.submit(contextFor(txnId, route, SYNC), safe -> { + CheckedCommands.preaccept(safe, txnId, partialTxn, route, appendDiffToLog(commandStore)); + CheckedCommands.commit(safe, SaveStatus.Stable, Ballot.ZERO, txnId, route, partialTxn, executeAt, deps, appendDiffToLog(commandStore)); + return safe.ifInitialised(txnId).current(); + }).beginAsResult()); + + // clear cache + commandStore.executeBlocking(() -> { + try (ExclusiveGlobalCaches cache = commandStore.executor().lockCaches();) + { + long cacheSize = cache.global.capacity(); + cache.global.setCapacity(0); + cache.global.setCapacity(cacheSize); + } + }); + + while (commandStore.executor().hasTasks()) + LockSupport.parkNanos(TimeUnit.MILLISECONDS.toNanos(100)); + + return command; + } + catch (ExecutionException e) + { + throw new AssertionError(e); + } + } + + private static Command createStableUsingSlowLifeCycle(AccordCommandStore commandStore, TxnId txnId) + { + return createStableUsingSlowLifeCycle(commandStore, txnId, txnId); + } + + private static BiConsumer appendDiffToLog(AccordCommandStore commandStore) + { + return (before, after) -> { + Condition condition = Condition.newOneTimeCondition(); + commandStore.appendToLog(before, after, condition::signal); + condition.awaitUninterruptibly(); + }; + } + + private static Command createStableUsingSlowLifeCycle(AccordCommandStore commandStore, TxnId txnId, Timestamp executeAt) + { + PartialTxn partialTxn = createPartialTxn(0); + RoutingKey routingKey = partialTxn.keys().get(0).asKey().toUnseekable(); + FullRoute route = partialTxn.keys().toRoute(routingKey); + Ranges ranges = AccordTestUtils.fullRange(partialTxn.keys()); + Route partialRoute = route.slice(ranges); + PartialDeps deps = PartialDeps.builder(ranges, true).build(); + + try + { + Command command = getUninterruptibly(commandStore.submit(contextFor(txnId, route, SYNC), safe -> { + CheckedCommands.preaccept(safe, txnId, partialTxn, route, appendDiffToLog(commandStore)); + CheckedCommands.accept(safe, txnId, Ballot.ZERO, partialRoute, executeAt, deps, appendDiffToLog(commandStore)); + CheckedCommands.commit(safe, SaveStatus.Committed, Ballot.ZERO, txnId, route, partialTxn, executeAt, deps, appendDiffToLog(commandStore)); + CheckedCommands.commit(safe, SaveStatus.Stable, Ballot.ZERO, txnId, route, partialTxn, executeAt, deps, appendDiffToLog(commandStore)); + return safe.ifInitialised(txnId).current(); + }).beginAsResult()); + + // clear cache + commandStore.executeBlocking(() -> { + try (ExclusiveGlobalCaches cache = commandStore.executor().lockCaches();) + { + long cacheSize = cache.global.capacity(); + cache.global.setCapacity(0); + cache.global.setCapacity(cacheSize); + } + }); + + while (commandStore.executor().hasTasks()) + LockSupport.parkNanos(TimeUnit.MILLISECONDS.toNanos(100)); + + return command; + } + catch (ExecutionException e) + { + throw new AssertionError(e); + } + } + + @Test + public void loadFail() + { + AtomicLong clock = new AtomicLong(0); + // all txn use the same key; 0 + Keys keys = keys(Schema.instance.getTableMetadata("ks", "tbl"), 0); + AccordCommandStore commandStore = createAccordCommandStore(clock::incrementAndGet, "ks", "tbl"); + commandStore.executeBlocking(() -> commandStore.executor().cacheUnsafe().setCapacity(0)); + Gen txnIdGen = rs -> txnId(1, clock.incrementAndGet(), 1); + + qt().withSeed(3447647345436261108L).withPure(false) + .withExamples(50) + .forAll(Gens.random(), Gens.lists(txnIdGen).ofSizeBetween(1, 2)) + .check((rs, ids) -> { + before(); // truncate tables + + Participants participants = keys.toParticipants(); + assertNoReferences(commandStore, ids, participants); + createCommand(commandStore, rs, ids); + awaitDone(commandStore, ids, participants); + assertNoReferences(commandStore, ids, participants); + + PreLoadContext ctx = contextFor(ids.get(0), ids.size() == 1 ? null : ids.get(1), participants, SYNC); + Consumer consumer = Mockito.mock(Consumer.class); + + Map failed = selectFailedTxn(rs, ids); + try (ExclusiveGlobalCaches caches = commandStore.executor().lockCaches()) + { + caches.commands.unsafeSetLoadFunction((s, txnId) -> + { + logger.info("Attempting to load {}; expected to fail? {}", txnId, failed.get(txnId)); + if (!failed.get(txnId)) + return commandStore.loadCommand(txnId); + throw new NullPointerException("txn_id " + txnId); + }); + } + AccordTask o1 = AccordTask.create(commandStore, ctx, consumer); + AssertionUtils.assertThatThrownBy(() -> getUninterruptibly(o1.chain())) + .hasRootCause() + .isInstanceOf(NullPointerException.class) + .hasNoSuppressedExceptions(); + + Mockito.verifyNoInteractions(consumer); + + assertNoReferences(commandStore, ids, participants); + // the first failed load causes the whole operation to fail, so some ids may still be pending + // to make sure the next operation does not see a PENDING that will fail, wait for all loads to complete + awaitDone(commandStore, ids, participants); + + // can we recover? + try (ExclusiveGlobalCaches caches = commandStore.executor().lockCaches()) + { + caches.commands.unsafeSetLoadFunction((s, txnId) -> { + Command cmd = commandStore.loadCommand(txnId); + return cmd; + }); + } + AccordTask o2 = AccordTask.create(commandStore, ctx, store -> { + ids.forEach(id -> { + store.ifInitialised(id).readyToExecute(store); + }); + }); + getUninterruptibly(o2.chain()); + awaitDone(commandStore, ids, participants); + assertNoReferences(commandStore, ids, participants); + + }); + } + + @Test + public void consumerFails() + { + AtomicLong clock = new AtomicLong(0); + // all txn use the same key; 0 + Keys keys = keys(Schema.instance.getTableMetadata("ks", "tbl"), 0); + AccordCommandStore commandStore = createAccordCommandStore(clock::incrementAndGet, "ks", "tbl"); + Gen txnIdGen = rs -> txnId(1, clock.incrementAndGet(), 1); + + AtomicInteger counter = new AtomicInteger(); + qt().withPure(false).withExamples(100).forAll(Gens.random(), Gens.lists(txnIdGen).ofSizeBetween(1, 10)).check((rs, ids) -> { + logger.info("Test #{}", counter.incrementAndGet()); + before(); // truncate tables + + Participants participants = keys.toParticipants(); + assertNoReferences(commandStore, ids, participants); + createCommand(commandStore, rs, ids); + + PreLoadContext ctx = contextFor(ids.get(0), ids.size() == 1 ? null : ids.get(1), participants, SYNC); + + Consumer consumer = Mockito.mock(Consumer.class); + String errorMsg = "txn_ids " + ids; + Mockito.doThrow(new NullPointerException(errorMsg)).when(consumer).accept(Mockito.any()); + + AccordTask operation = AccordTask.create(commandStore, ctx, consumer); + + AssertionUtils.assertThatThrownBy(() -> getUninterruptibly(operation.chain())) + .hasRootCause() + .isInstanceOf(NullPointerException.class) + .hasMessage(errorMsg) + .hasNoSuppressedExceptions(); + + assertNoReferences(commandStore, ids, participants); + }); + } + + private static void createCommand(AccordCommandStore commandStore, RandomSource rs, List ids) + { + // to simulate CommandsForKey not being found, use createCommittedAndPersist periodically as it does not update + switch (rs.nextInt(3)) + { + case 0: + logger.info("createStableAndPersist(): {}", ids); + ids.forEach(id -> createStableAndPersist(commandStore, id)); + break; + case 1: + logger.info("createStableUsingFastLifeCycle(): {}", ids); + ids.forEach(id -> createStableUsingFastLifeCycle(commandStore, id)); + break; + case 2: + logger.info("createStableUsingSlowLifeCycle(): {}", ids); + ids.forEach(id -> createStableUsingSlowLifeCycle(commandStore, id)); + } + } + + private static Map selectFailedTxn(RandomSource rs, List ids) + { + ids = new ArrayList<>(ids); + Map failed = Maps.newHashMapWithExpectedSize(ids.size()); + int failedCount = Math.max(1, rs.nextInt(ids.size())); + Collections.shuffle(ids, rs.asJdkRandom()); + for (int i = 0 ; i < failedCount ; ++i) + failed.put(ids.get(i), true); + for (int i = failedCount ; i < ids.size() ; ++i) + failed.put(ids.get(i), false); + return failed; + } + + private static void assertNoReferences(AccordCommandStore commandStore, List ids, Participants keys) + { + AssertionError error = null; + try (ExclusiveCaches caches = commandStore.lockCaches()) + { + assertNoReferences(caches.commands(), ids); + } + catch (AssertionError e) + { + error = e; + } + try (ExclusiveCaches caches = commandStore.lockCaches()) + { + assertNoReferences(caches.commandsForKeys(), keys); + } + catch (AssertionError e) + { + if (error == null) error = e; + else error.addSuppressed(e); + } + if (error != null) throw error; + } + + private static void assertNoReferences(AccordCache.Type.Instance cache, Iterable keys) + { + AssertionError error = null; + for (T key : keys) + { + AccordCacheEntry node = cache.getUnsafe(key); + if (node == null) continue; + try + { + if (node.references() > 0) + throw new IllegalStateException(); + Assertions.assertThat(node.references()) + .describedAs("Key %s found referenced in cache", key) + .isEqualTo(0); + } + catch (AssertionError e) + { + if (error == null) + { + error = e; + } + else + { + error.addSuppressed(e); + } + } + } + if (error != null) throw error; + } + + private static void awaitDone(AccordCommandStore commandStore, List ids, Participants keys) + { + awaitDone(commandStore.cachesUnsafe().commands(), ids); + awaitDone(commandStore.cachesUnsafe().commandsForKeys(), keys); + } + + private static void awaitDone(AccordCache.Type.Instance cache, Iterable keys) + { + for (T key : keys) + { + AccordCacheEntry node = cache.getUnsafe(key); + if (node == null) continue; + Awaitility.await("For node " + node.key() + " to complete") + .atMost(Duration.ofMinutes(1)) + .until(node::isComplete); + } + } +} diff --git a/test/unit/org/apache/cassandra/service/accord/AccordTestUtils.java b/test/unit/org/apache/cassandra/service/accord/AccordTestUtils.java new file mode 100644 index 000000000000..397bc8fc28fd --- /dev/null +++ b/test/unit/org/apache/cassandra/service/accord/AccordTestUtils.java @@ -0,0 +1,463 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.Set; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicReference; +import java.util.function.Function; +import java.util.function.LongSupplier; +import java.util.function.ToLongFunction; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +import com.google.common.collect.Sets; +import org.junit.Assert; + +import accord.api.Data; +import accord.api.Journal; +import accord.api.ProgressLog.NoOpProgressLog; +import accord.api.RemoteListeners.NoOpRemoteListeners; +import accord.api.Result; +import accord.api.RoutingKey; +import accord.api.Timeouts; +import accord.impl.DefaultLocalListeners; +import accord.impl.DefaultLocalListeners.NotifySink.NoOpNotifySink; +import accord.local.Command; +import accord.local.CommandStore; +import accord.local.CommandStores; +import accord.local.DurableBefore; +import accord.local.Node; +import accord.local.Node.Id; +import accord.local.NodeCommandStoreService; +import accord.local.PreLoadContext; +import accord.local.SafeCommandStore; +import accord.local.StoreParticipants; +import accord.local.TimeService; +import accord.local.durability.DurabilityService; +import accord.primitives.Ballot; +import accord.primitives.FullKeyRoute; +import accord.primitives.FullRoute; +import accord.primitives.Keys; +import accord.primitives.PartialDeps; +import accord.primitives.PartialTxn; +import accord.primitives.Ranges; +import accord.primitives.Routable; +import accord.primitives.SaveStatus; +import accord.primitives.Seekable; +import accord.primitives.Seekables; +import accord.primitives.Timestamp; +import accord.primitives.Txn; +import accord.primitives.TxnId; +import accord.primitives.Writes; +import accord.topology.Shard; +import accord.topology.Topology; +import accord.topology.TopologyManager; +import accord.utils.SortedArrays.SortedArrayList; +import accord.utils.async.AsyncChains; +import org.apache.cassandra.ServerTestUtils; +import org.apache.cassandra.concurrent.ExecutorPlus; +import org.apache.cassandra.concurrent.ManualExecutor; +import org.apache.cassandra.concurrent.Stage; +import org.apache.cassandra.config.AccordSpec; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.config.DurationSpec; +import org.apache.cassandra.cql3.QueryOptions; +import org.apache.cassandra.cql3.QueryProcessor; +import org.apache.cassandra.cql3.statements.TransactionStatement; +import org.apache.cassandra.db.DecoratedKey; +import org.apache.cassandra.db.marshal.Int32Type; +import org.apache.cassandra.dht.Murmur3Partitioner; +import org.apache.cassandra.dht.Range; +import org.apache.cassandra.dht.Token; +import org.apache.cassandra.io.util.File; +import org.apache.cassandra.metrics.AccordCacheMetrics; +import org.apache.cassandra.schema.Schema; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.service.ClientState; +import org.apache.cassandra.service.accord.api.AccordAgent; +import org.apache.cassandra.service.accord.api.PartitionKey; +import org.apache.cassandra.service.accord.serializers.TableMetadatas; +import org.apache.cassandra.service.accord.serializers.TableMetadatasAndKeys; +import org.apache.cassandra.service.accord.txn.TxnData; +import org.apache.cassandra.service.accord.txn.TxnQuery; +import org.apache.cassandra.service.accord.txn.TxnRead; +import org.apache.cassandra.utils.Pair; +import org.apache.cassandra.utils.concurrent.Condition; +import org.apache.cassandra.utils.concurrent.UncheckedInterruptedException; + +import static accord.primitives.Routable.Domain.Key; +import static accord.primitives.SaveStatus.NotDefined; +import static accord.primitives.SaveStatus.PreAccepted; +import static accord.primitives.Status.Durability.NotDurable; +import static accord.primitives.Txn.Kind.Write; +import static accord.utils.async.AsyncChains.getUninterruptibly; +import static java.lang.String.format; +import static org.apache.cassandra.service.accord.AccordExecutor.Mode.RUN_WITH_LOCK; +import static org.apache.cassandra.service.accord.AccordExecutor.wrap; + +public class AccordTestUtils +{ + public static final TableId TABLE_ID1 = TableId.fromString("00000000-0000-0000-0000-000000000001"); + + public static class Commands + { + public static Command notDefined(TxnId txnId, PartialTxn txn) + { + return Command.NotDefined.notDefined(txnId, NotDefined, NotDurable, StoreParticipants.empty(txnId), Ballot.ZERO); + } + + public static Command preaccepted(TxnId txnId, PartialTxn txn, Timestamp executeAt) + { + return Command.PreAccepted.preaccepted(txnId, PreAccepted, NotDurable, StoreParticipants.all(route(txn)), Ballot.ZERO, executeAt, txn, null); + } + + public static Command committed(TxnId txnId, PartialTxn txn, Timestamp executeAt) + { + return Command.Committed.committed(txnId, SaveStatus.Committed, NotDurable, StoreParticipants.all(route(txn)), + Ballot.ZERO, executeAt, txn, PartialDeps.NONE, Ballot.ZERO, null); + } + + public static Command stable(TxnId txnId, PartialTxn txn, Timestamp executeAt) + { + return Command.Committed.committed(txnId, SaveStatus.Stable, NotDurable, StoreParticipants.all(route(txn)), + Ballot.ZERO, executeAt, txn, PartialDeps.NONE, Ballot.ZERO, Command.WaitingOn.empty(txnId.domain())); + } + + private static FullRoute route(PartialTxn txn) + { + Seekable key = txn.keys().get(0); + RoutingKey routingKey = key.asKey().toUnseekable(); + return new FullKeyRoute(routingKey, new RoutingKey[]{ routingKey }); + } + } + + public static AccordCacheEntry loaded(K key, V value) + { + AccordCacheEntry global = new AccordCacheEntry<>(key, null); + global.initialize(value); + return global; + } + + public static AccordSafeCommand safeCommand(Command command) + { + AccordCacheEntry global = loaded(command.txnId(), command); + return new AccordSafeCommand(global); + } + + public static Function testableLoad(K key, V val) + { + return k -> { + Assert.assertEquals(key, k); + return val; + }; + } + + public static void testLoad(ManualExecutor executor, AccordCache.Type.Instance instance, AccordSafeState safeState, V val) + { + Assert.assertEquals(AccordCacheEntry.Status.WAITING_TO_LOAD, safeState.global().status()); + safeState.global().load(wrap(executor), null, instance.parent().adapter(), AccordCacheEntry.OnLoaded.immediate()); + Assert.assertEquals(AccordCacheEntry.Status.LOADING, safeState.global().status()); + executor.runOne(); + Assert.assertEquals(AccordCacheEntry.Status.LOADED, safeState.global().status()); + safeState.preExecute(); + Assert.assertEquals(val, safeState.current()); + } + + public static TxnId txnId(long epoch, long hlc, int node) + { + return txnId(epoch, hlc, node, Write); + } + + public static TxnId txnId(long epoch, long hlc, int node, Txn.Kind kind) + { + return new TxnId(epoch, hlc, kind, Key, new Node.Id(node)); + } + + public static TxnId txnId(long epoch, long hlc, int node, Txn.Kind kind, Routable.Domain domain) + { + return new TxnId(epoch, hlc, kind, domain, new Node.Id(node)); + } + + public static Timestamp timestamp(long epoch, long hlc, int node) + { + return Timestamp.fromValues(epoch, hlc, new Node.Id(node)); + } + + public static Ballot ballot(long epoch, long hlc, int node) + { + return Ballot.fromValues(epoch, hlc, new Node.Id(node)); + } + + public static Pair processTxnResult(AccordCommandStore commandStore, TxnId txnId, PartialTxn txn, Timestamp executeAt) throws Throwable + { + AtomicReference> result = new AtomicReference<>(); + getUninterruptibly(commandStore.execute(PreLoadContext.contextFor(txn.keys().toParticipants()), + safeStore -> result.set(processTxnResultDirect(safeStore, txnId, txn, executeAt)))); + return result.get(); + } + + public static Pair processTxnResultDirect(SafeCommandStore safeStore, TxnId txnId, PartialTxn txn, Timestamp executeAt) + { + TxnRead read = (TxnRead) txn.read(); + Data readData = read.keys().stream().map(key -> { + try + { + return AsyncChains.getBlocking(read.read(key, safeStore, executeAt, null)); + } + catch (InterruptedException e) + { + throw new UncheckedInterruptedException(e); + } + catch (ExecutionException e) + { + throw new RuntimeException(e); + } + }) + .reduce(null, TxnData::merge); + return Pair.create(txnId.is(Write) ? txn.execute(txnId, executeAt, readData) : null, + txn.query().compute(txnId, executeAt, txn.keys(), readData, txn.read(), txn.update())); + + } + + public static String wrapInTxn(String query) + { + if (!query.endsWith(";")) + query += ";"; + return "BEGIN TRANSACTION\n" + + query + + "\nCOMMIT TRANSACTION"; + } + + public static Txn createTxn(String query) + { + return createTxn(query, QueryOptions.DEFAULT); + } + + public static Txn createTxn(String query, Object... binds) + { + return createTxn(query, Arrays.asList(binds)); + } + + public static Txn createTxn(String query, List binds) + { + TransactionStatement statement = parse(query); + QueryOptions options = QueryProcessor.makeInternalOptions(statement, binds.toArray(new Object[binds.size()])); + return statement.createTxn(ClientState.forInternalCalls(), options); + } + + public static Txn createTxn(String query, QueryOptions options) + { + TransactionStatement statement = parse(query); + return statement.createTxn(ClientState.forInternalCalls(), options); + } + + public static TransactionStatement parse(String query) + { + TransactionStatement.Parsed parsed = (TransactionStatement.Parsed) QueryProcessor.parseStatement(query); + Assert.assertNotNull(parsed); + TransactionStatement statement = (TransactionStatement) parsed.prepare(ClientState.forInternalCalls()); + return statement; + } + + public static Txn createTxn(int readKey, int... writeKeys) + { + StringBuilder sb = new StringBuilder("BEGIN TRANSACTION\n"); + sb.append(format("LET row1 = (SELECT * FROM ks.tbl WHERE k=%s AND c=0);\n", readKey)); + sb.append("SELECT row1.v;\n"); + if (writeKeys.length > 0) + { + sb.append("IF row1 IS NULL THEN\n"); + for (int key : writeKeys) + sb.append(format("INSERT INTO ks.tbl (k, c, v) VALUES (%s, 0, 1);\n", key)); + sb.append("END IF\n"); + } + sb.append("COMMIT TRANSACTION"); + return createTxn(sb.toString()); + } + + public static Txn createWriteTxn(int key) + { + return createTxn(key, key); + } + + public static Txn createTxn(Txn.Kind kind, Seekables seekables) + { + TableMetadatas.Collector tables = new TableMetadatas.Collector(); + for (Seekable seekable : seekables) + tables.add(TableMetadata.minimal("", "", (TableId)seekable.prefix())); + return new Txn.InMemory(kind, seekables, TxnRead.empty(seekables.domain()), TxnQuery.NONE, null, new TableMetadatasAndKeys(tables.build(), seekables)); + } + + public static Ranges fullRange(Txn txn) + { + return fullRange(txn.keys()); + } + + public static Ranges fullRange(Seekables keys) + { + PartitionKey key = (PartitionKey) keys.get(0); + return Ranges.of(TokenRange.fullRange(key.table(), DatabaseDescriptor.getPartitioner())); + } + + public static PartialTxn createPartialTxn(int key) + { + Txn txn = createTxn(key, key); + TableMetadatas.Collector tables = new TableMetadatas.Collector(); + for (Seekable seekable : txn.keys()) + tables.add(TableMetadata.minimal("", "", (TableId)seekable.prefix())); + return new PartialTxn.InMemory(txn.kind(), txn.keys(), txn.read(), txn.query(), txn.update(), new TableMetadatasAndKeys(tables.build(), txn.keys())); + } + + public static AccordCommandStore createAccordCommandStore( + Node.Id node, LongSupplier now, Topology topology, ExecutorPlus loadExecutor, ExecutorPlus saveExecutor) + { + AccordExecutor executor = new AccordExecutorSyncSubmit(0, RUN_WITH_LOCK, CommandStore.class.getSimpleName() + '[' + 0 + ']', new AccordCacheMetrics("test"), loadExecutor, saveExecutor, loadExecutor, new AccordAgent()); + return createAccordCommandStore(node, now, topology, executor); + } + + public static AccordCommandStore createAccordCommandStore( + Node.Id node, LongSupplier now, Topology topology, AccordExecutor executor) + { + NodeCommandStoreService time = new NodeCommandStoreService() + { + private ToLongFunction elapsed = TimeService.elapsedWrapperFromNonMonotonicSource(TimeUnit.MICROSECONDS, this::now); + + @Override public Timeouts timeouts() { return null; } + @Override public DurableBefore durableBefore() { return DurableBefore.EMPTY; } + @Override public DurabilityService durability() { return null; } + @Override public Id id() { return node;} + @Override public long epoch() {return 1; } + @Override public long now() {return now.getAsLong(); } + @Override public long uniqueNow(long atLeast) { return now.getAsLong(); } + @Override public long elapsed(TimeUnit timeUnit) { return elapsed.applyAsLong(timeUnit); } + @Override public TopologyManager topology() { throw new UnsupportedOperationException(); } + }; + + AccordAgent agent = new AccordAgent(); + if (new File(DatabaseDescriptor.getAccordJournalDirectory()).exists()) + ServerTestUtils.cleanupDirectory(DatabaseDescriptor.getAccordJournalDirectory()); + AccordSpec.JournalSpec spec = new AccordSpec.JournalSpec(); + spec.flushPeriod = new DurationSpec.IntSecondsBound(1); + AccordJournal journal = new AccordJournal(spec); + journal.start(null); + + CommandStore.EpochUpdateHolder holder = new CommandStore.EpochUpdateHolder(); + Ranges ranges = topology.rangesForNode(node); + holder.add(1, new CommandStores.RangesForEpoch(1, ranges), ranges); + AccordCommandStore result = new AccordCommandStore(0, time, agent, null, + cs -> new NoOpProgressLog(), + cs -> new DefaultLocalListeners(new NoOpRemoteListeners(), new NoOpNotifySink()), + holder, journal, executor); + result.unsafeUpdateRangesForEpoch(); + return result; + } + + public static AccordCommandStore createAccordCommandStore(Node.Id node, LongSupplier now, Topology topology) + { + return createAccordCommandStore(node, now, topology, Stage.READ.executor(), Stage.MUTATION.executor()); + } + + public static AccordCommandStore createAccordCommandStore( + LongSupplier now, String keyspace, String table, ExecutorPlus loadExecutor, ExecutorPlus saveExecutor) + { + TableMetadata metadata = Schema.instance.getTableMetadata(keyspace, table); + TokenRange range = TokenRange.fullRange(metadata.id, metadata.partitioner); + Node.Id node = new Id(1); + Topology topology = new Topology(1, Shard.create(range, new SortedArrayList<>(new Id[] { node }), Sets.newHashSet(node), Collections.emptySet())); + AccordCommandStore store = createAccordCommandStore(node, now, topology, loadExecutor, saveExecutor); + store.execute(PreLoadContext.empty(), safeStore -> ((AccordCommandStore)safeStore.commandStore()).executor().cacheUnsafe().setCapacity(1 << 20)); + return store; + } + + public static AccordCommandStore createAccordCommandStore(LongSupplier now, String keyspace, String table) + { + return createAccordCommandStore(now, keyspace, table, Stage.READ.executor(), Stage.MUTATION.executor()); + } + + public static void execute(AccordCommandStore commandStore, Runnable runnable) + { + try + { + commandStore.executor().submit(runnable).get(); + } + catch (InterruptedException e) + { + throw new UncheckedInterruptedException(e); + } + catch (ExecutionException e) + { + throw new RuntimeException(e.getCause()); + } + } + + public static PartitionKey key(TableMetadata table, int key) + { + DecoratedKey dk = table.partitioner.decorateKey(Int32Type.instance.decompose(key)); + return new PartitionKey(table.id, dk); + } + + public static Keys keys(TableMetadata table, int... keys) + { + return Keys.of(IntStream.of(keys).mapToObj(key -> key(table, key)).collect(Collectors.toList())); + } + + public static Node.Id id(int id) + { + return new Node.Id(id); + } + + public static SortedArrayList idList(int... ids) + { + return new SortedArrayList<>(Arrays.stream(ids).mapToObj(AccordTestUtils::id).toArray(Id[]::new)); + } + + public static Set idSet(int... ids) + { + return Arrays.stream(ids).mapToObj(AccordTestUtils::id).collect(Collectors.toSet()); + } + + public static Token token(long t) + { + return new Murmur3Partitioner.LongToken(t); + } + + public static Range range(Token left, Token right) + { + return new Range<>(left, right); + } + + public static Range range(long left, long right) + { + return range(token(left), token(right)); + } + + public static void appendCommandsBlocking(AccordCommandStore commandStore, Command before, Command after) + { + Journal.CommandUpdate diff = new Journal.CommandUpdate(before, after); + Condition condition = Condition.newOneTimeCondition(); + commandStore.appendCommands(Collections.singletonList(diff), condition::signal); + condition.awaitUninterruptibly(30, TimeUnit.SECONDS); + } +} diff --git a/test/unit/org/apache/cassandra/service/accord/AccordTopologyTest.java b/test/unit/org/apache/cassandra/service/accord/AccordTopologyTest.java new file mode 100644 index 000000000000..753848a21941 --- /dev/null +++ b/test/unit/org/apache/cassandra/service/accord/AccordTopologyTest.java @@ -0,0 +1,157 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import com.google.common.collect.ImmutableList; +import org.junit.Assert; +import org.junit.BeforeClass; +import org.junit.Test; + +import accord.local.Node; +import accord.local.Node.Id; +import accord.topology.Shard; +import accord.topology.Topology; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.dht.IPartitioner; +import org.apache.cassandra.dht.Murmur3Partitioner; +import org.apache.cassandra.dht.Range; +import org.apache.cassandra.dht.Token; +import org.apache.cassandra.schema.KeyspaceMetadata; +import org.apache.cassandra.schema.KeyspaceParams; +import org.apache.cassandra.schema.Keyspaces; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.schema.Tables; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.tcm.membership.Location; + +import static org.apache.cassandra.cql3.statements.schema.CreateTableStatement.parse; +import static org.apache.cassandra.service.accord.AccordTopologyUtils.NODE_LIST; +import static org.apache.cassandra.service.accord.AccordTopologyUtils.NODE_SET; +import static org.apache.cassandra.service.accord.AccordTopologyUtils.configureCluster; +import static org.apache.cassandra.service.accord.AccordTopologyUtils.range; +import static org.apache.cassandra.service.accord.AccordTopologyUtils.token; + +public class AccordTopologyTest +{ + private static final IPartitioner partitioner = Murmur3Partitioner.instance; + private static TableId tableId = null; + private static KeyspaceMetadata keyspace = null; + private static final Location LOCATION = new Location("DC1", "RACK1"); + + @BeforeClass + public static void beforeClass() throws Throwable + { + DatabaseDescriptor.daemonInitialization(); + DatabaseDescriptor.setPartitionerUnsafe(Murmur3Partitioner.instance); + TableMetadata table = parse("CREATE TABLE tbl (k int, c int, v int, primary key (k, c)) WITH transactional_mode='full'", "ks").build(); + tableId = table.id; + keyspace = KeyspaceMetadata.create("ks", KeyspaceParams.simple(3), Tables.of(table)); + } + + /** + * Check converter does the right thing if the ring is constructed with min and max tokens + */ + @Test + public void minMaxTokens() + { + List> ranges = ImmutableList.of(range(partitioner.getMinimumToken(), token(-100)), + range(-100, 100), + range(token(100), partitioner.getMaximumTokenForSplitting())); + Assert.assertEquals(partitioner.getMinimumToken(), ranges.get(0).left); + Assert.assertEquals(partitioner.getMaximumTokenForSplitting(), ranges.get(2).right); + ClusterMetadata metadata = configureCluster(ranges, Keyspaces.of(keyspace)); + + Topology topology = AccordTopology.createAccordTopology(metadata); + Topology expected = new Topology(1, + Shard.create(AccordTopology.fullRange(tableId, partitioner), NODE_LIST, NODE_SET)); + + Assert.assertEquals(expected, topology); + } + + @Test + public void wrapAroundRanges() + { + List> ranges = ImmutableList.of(range(-100, 0), + range(0, 100), + range(100, -100)); + + ClusterMetadata metadata = configureCluster(ranges, Keyspaces.of(keyspace)); + Topology topology = AccordTopology.createAccordTopology(metadata); + Topology expected = new Topology(1, + Shard.create(AccordTopology.fullRange(tableId, partitioner), NODE_LIST, NODE_SET)); + + Assert.assertEquals(expected, topology); + } + + @Test + public void fastPath() + { + List> ranges = ImmutableList.of(range(partitioner.getMinimumToken(), token(-100)), + range(-100, 100), + range(token(100), partitioner.getMaximumTokenForSplitting())); + ClusterMetadata metadata = configureCluster(ranges, Keyspaces.of(keyspace)); + Topology topology = AccordTopology.createAccordTopology(metadata); + Topology expected = new Topology(1, + Shard.create(AccordTopology.fullRange(tableId, partitioner), NODE_LIST, NODE_SET)); + Assert.assertEquals(expected, topology); + + topology = AccordTopology.createAccordTopology(metadata.transformer().withFastPathStatusSince(new Id(1), AccordFastPath.Status.UNAVAILABLE, 1, 1).build().metadata); + + Set fastPath = new HashSet<>(NODE_SET); + fastPath.remove(new Node.Id(1)); + + expected = new Topology(2, + Shard.create(AccordTopology.fullRange(tableId, partitioner), NODE_LIST, fastPath)); + Assert.assertEquals(expected, topology); + } + + /** + * Even if there are too many failures to reach quorum, fast path size shouldn't go below quorum size + */ + @Test + public void fastPathWithMoreThanMinimumFailedNodes() + { + List> ranges = ImmutableList.of(range(partitioner.getMinimumToken(), token(-100)), + range(-100, 100), + range(token(100), partitioner.getMaximumTokenForSplitting())); + ClusterMetadata metadata = configureCluster(ranges, Keyspaces.of(keyspace)); + Topology topology = AccordTopology.createAccordTopology(metadata); + Topology expected = new Topology(1, + Shard.create(AccordTopology.fullRange(tableId, partitioner), NODE_LIST, NODE_SET)); + Assert.assertEquals(expected, topology); + + metadata = metadata.transformer() + .withFastPathStatusSince(new Id(1), AccordFastPath.Status.UNAVAILABLE, 1, 1) + .withFastPathStatusSince(new Id(2), AccordFastPath.Status.UNAVAILABLE, 1, 1) + .build().metadata; + topology = AccordTopology.createAccordTopology(metadata); + + Set fastPath = new HashSet<>(NODE_SET); + fastPath.remove(new Node.Id(1)); + + expected = new Topology(2, + Shard.create(AccordTopology.fullRange(tableId, partitioner), NODE_LIST, fastPath)); + Assert.assertEquals(expected, topology); + } +} diff --git a/test/unit/org/apache/cassandra/service/accord/AccordTopologyUtils.java b/test/unit/org/apache/cassandra/service/accord/AccordTopologyUtils.java new file mode 100644 index 000000000000..e4a2a4791c71 --- /dev/null +++ b/test/unit/org/apache/cassandra/service/accord/AccordTopologyUtils.java @@ -0,0 +1,131 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import java.net.InetAddress; +import java.net.UnknownHostException; +import java.util.Collections; +import java.util.List; +import java.util.Set; + +import com.google.common.collect.ImmutableSet; + +import accord.local.Node; +import accord.utils.SortedArrays.SortedArrayList; +import org.apache.cassandra.dht.IPartitioner; +import org.apache.cassandra.dht.Murmur3Partitioner; +import org.apache.cassandra.dht.Range; +import org.apache.cassandra.dht.Token; +import org.apache.cassandra.locator.AbstractReplicationStrategy; +import org.apache.cassandra.locator.InetAddressAndPort; +import org.apache.cassandra.schema.DistributedSchema; +import org.apache.cassandra.schema.KeyspaceMetadata; +import org.apache.cassandra.schema.Keyspaces; +import org.apache.cassandra.schema.ReplicationParams; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.tcm.Epoch; +import org.apache.cassandra.tcm.membership.Location; +import org.apache.cassandra.tcm.membership.NodeAddresses; +import org.apache.cassandra.tcm.membership.NodeId; +import org.apache.cassandra.tcm.membership.NodeState; +import org.apache.cassandra.tcm.membership.NodeVersion; +import org.apache.cassandra.tcm.ownership.DataPlacement; +import org.apache.cassandra.tcm.ownership.DataPlacements; + +public class AccordTopologyUtils +{ + public static final Node.Id ID1 = new Node.Id(1); + public static final Node.Id ID2 = new Node.Id(2); + public static final Node.Id ID3 = new Node.Id(3); + public static final SortedArrayList NODE_LIST = new SortedArrayList<>(new Node.Id[] { ID1, ID2, ID3 }); + public static final Set NODE_SET = ImmutableSet.copyOf(NODE_LIST); + + private static final IPartitioner partitioner = Murmur3Partitioner.instance; + private static final Location LOCATION = new Location("DC1", "RACK1"); + + public static InetAddressAndPort ep(int i) + { + try + { + return InetAddressAndPort.getByAddressOverrideDefaults(InetAddress.getByAddress(new byte[]{ 127, 0, 0, (byte)i}), 7012); + } + catch (UnknownHostException e) + { + throw new RuntimeException(e); + } + } + + private static NodeId nodeId(int id) + { + return new NodeId(id); + } + + static void addNode(ClusterMetadata.Transformer transformer, int node, Token token) + { + NodeId nodeId = nodeId(node); + InetAddressAndPort ep = ep(node); + NodeAddresses addresses = new NodeAddresses(nodeId.toUUID(), ep, ep, ep); + transformer.register(nodeId, addresses, LOCATION, NodeVersion.CURRENT); + transformer.withNodeState(nodeId, NodeState.JOINED); + transformer.proposeToken(nodeId, Collections.singleton(token)); + transformer.addToRackAndDC(nodeId); + } + + public static ClusterMetadata configureCluster(List> ranges, Keyspaces keyspaces) + { + assert ranges.size() == 3; + + IPartitioner partitioner = Murmur3Partitioner.instance; + ClusterMetadata empty = new ClusterMetadata(partitioner); + ClusterMetadata.Transformer transformer = empty.transformer(); + transformer.with(new DistributedSchema(keyspaces)); + addNode(transformer, 1, ranges.get(0).right); + addNode(transformer, 2, ranges.get(1).right); + addNode(transformer, 3, ranges.get(2).right); + ClusterMetadata metadata = transformer.build().metadata; + + for (KeyspaceMetadata keyspace : keyspaces) + { + ReplicationParams replication = keyspace.params.replication; + AbstractReplicationStrategy strategy = AbstractReplicationStrategy.createReplicationStrategy(keyspace.name, replication); + DataPlacements.Builder placements = metadata.placements.unbuild(); + DataPlacement placement = strategy.calculateDataPlacement(Epoch.EMPTY, metadata.tokenMap.toRanges(), metadata); + placements.with(replication, placement); + metadata = transformer.with(placements.build()).build().metadata; + } + + return metadata; + } + + static Token token(long t) + { + return new Murmur3Partitioner.LongToken(t); + } + + static Range range(Token left, Token right) + { + return new Range<>(left, right); + } + + public static Range range(long left, long right) + { + return range(token(left), token(right)); + } + +} diff --git a/test/unit/org/apache/cassandra/service/accord/CommandChangeTest.java b/test/unit/org/apache/cassandra/service/accord/CommandChangeTest.java new file mode 100644 index 000000000000..699129002177 --- /dev/null +++ b/test/unit/org/apache/cassandra/service/accord/CommandChangeTest.java @@ -0,0 +1,158 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import java.util.EnumSet; +import java.util.Set; + +import com.google.common.collect.Sets; +import org.junit.Assert; +import org.junit.BeforeClass; +import org.junit.Test; + +import accord.impl.CommandChange; +import accord.local.Command; +import accord.local.RedundantBefore; +import accord.primitives.SaveStatus; +import accord.primitives.TxnId; +import accord.utils.Gen; +import accord.utils.LazyToString; +import accord.utils.ReflectionUtils; +import org.apache.cassandra.SchemaLoader; +import org.apache.cassandra.io.util.DataInputBuffer; +import org.apache.cassandra.io.util.DataOutputBuffer; +import org.apache.cassandra.schema.KeyspaceParams; +import org.apache.cassandra.schema.Schema; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.service.StorageService; +import org.apache.cassandra.service.accord.serializers.Version; +import org.apache.cassandra.service.consensus.TransactionalMode; +import org.apache.cassandra.utils.AccordGenerators; +import org.assertj.core.api.SoftAssertions; + +import static accord.api.Journal.*; +import static accord.impl.CommandChange.*; +import static accord.utils.Property.qt; +import static org.apache.cassandra.cql3.statements.schema.CreateTableStatement.parse; + +public class CommandChangeTest +{ + private static final EnumSet ALL = EnumSet.allOf(Field.class); + + @BeforeClass + public static void beforeClass() throws Throwable + { + SchemaLoader.prepareServer(); + SchemaLoader.createKeyspace("ks", KeyspaceParams.simple(1), + parse("CREATE TABLE tbl (k int, c int, v int, primary key (k, c)) WITH transactional_mode='full'", "ks")); + TableMetadata tbl = Schema.instance.getTableMetadata("ks", "tbl"); + Assert.assertEquals(TransactionalMode.full, tbl.params.transactionalMode); + StorageService.instance.initServer(); + } + + @Test + public void allNull() + { + int flags = getFlags(null, Command.NotDefined.uninitialised(TxnId.NONE)); + EnumSet missing = EnumSet.allOf(Field.class); + missing.remove(Field.SAVE_STATUS); + missing.remove(Field.PARTICIPANTS); + missing.remove(Field.PROMISED); + missing.remove(Field.ACCEPTED); + missing.remove(Field.DURABILITY); + assertMissing(flags, missing); + } + + @Test + public void simpleNullChangeCheck() + { + int flags = getFlags(null, Command.NotDefined.uninitialised(TxnId.NONE)); + EnumSet has = EnumSet.of(Field.SAVE_STATUS, Field.PARTICIPANTS, Field.DURABILITY, Field.PROMISED, + Field.ACCEPTED /* this is Zero... which kinda means null... */); + Set missing = Sets.difference(ALL, has); + assertHas(flags, has); + assertMissing(flags, missing); + } + + @Test + public void serde() + { + Gen gen = AccordGenerators.commandsBuilder(); + try (DataOutputBuffer out = new DataOutputBuffer()) + { + qt().forAll(gen) + .check(cmdBuilder -> { + for (Version version : Version.V1.greaterThanOrEqual()) + { + SoftAssertions checks = new SoftAssertions(); + for (SaveStatus saveStatus : SaveStatus.values()) + { + out.clear(); + Command orig = cmdBuilder.build(saveStatus); + + AccordJournal.Writer.make(null, orig).write(out, version); + AccordJournal.Builder builder = new AccordJournal.Builder(orig.txnId(), Load.ALL); + builder.deserializeNext(new DataInputBuffer(out.unsafeGetBufferAndFlip(), false), version); + // We are not persisting the result, so force it for strict equality + builder.forceResult(orig.result()); + + Command reconstructed = builder.construct(RedundantBefore.EMPTY); + + checks.assertThat(reconstructed) + .describedAs("lhs=expected\nrhs=actual\n%s", new LazyToString(() -> ReflectionUtils.recursiveEquals(orig, reconstructed).toString())) + .isEqualTo(orig); + } + checks.assertAll(); + } + }); + } + } + + private void assertHas(int flags, Set missing) + { + SoftAssertions checks = new SoftAssertions(); + for (Field field : missing) + { + checks.assertThat(CommandChange.isChanged(field, flags)) + .describedAs("field %s changed", field). + isTrue(); + checks.assertThat(CommandChange.isNull(field, flags)) + .describedAs("field %s not null", field) + .isFalse(); + } + checks.assertAll(); + } + + private void assertMissing(int flags, Set missing) + { + SoftAssertions checks = new SoftAssertions(); + for (Field field : missing) + { + if (field == Field.CLEANUP) continue; + checks.assertThat(CommandChange.isChanged(field, flags)) + .describedAs("field %s changed", field) + .isFalse(); + // Is null flag can not be set on a field that has not changed + checks.assertThat(CommandChange.isNull(field, flags)) + .describedAs("field %s not null", field) + .isFalse(); + } + checks.assertAll(); + } +} \ No newline at end of file diff --git a/test/unit/org/apache/cassandra/service/accord/EndpointMappingTest.java b/test/unit/org/apache/cassandra/service/accord/EndpointMappingTest.java new file mode 100644 index 000000000000..7054b35066cf --- /dev/null +++ b/test/unit/org/apache/cassandra/service/accord/EndpointMappingTest.java @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import org.junit.Test; + +import accord.local.Node; +import org.apache.cassandra.utils.CassandraGenerators; +import org.assertj.core.api.Assertions; +import static org.quicktheories.QuickTheory.qt; + +import org.quicktheories.generators.SourceDSL; + + +public class EndpointMappingTest +{ + @Test + public void identityTest() throws Throwable + { + qt().forAll(CassandraGenerators.INET_ADDRESS_AND_PORT_GEN, SourceDSL.integers().between(1, Integer.MAX_VALUE).map(Node.Id::new)).checkAssert((endpoint, id) -> { + EndpointMapping mapping = EndpointMapping.builder(1).add(endpoint, id).build(); + Assertions.assertThat(mapping.mappedEndpoint(id)).isEqualTo(endpoint); + }); + } +} diff --git a/test/unit/org/apache/cassandra/service/accord/EpochSyncTest.java b/test/unit/org/apache/cassandra/service/accord/EpochSyncTest.java new file mode 100644 index 000000000000..cc05e86db4ad --- /dev/null +++ b/test/unit/org/apache/cassandra/service/accord/EpochSyncTest.java @@ -0,0 +1,855 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import java.net.UnknownHostException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.EnumMap; +import java.util.EnumSet; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.NavigableSet; +import java.util.Objects; +import java.util.Set; +import java.util.SortedSet; +import java.util.TreeSet; +import java.util.concurrent.Callable; +import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; +import java.util.function.BiConsumer; +import java.util.function.Predicate; +import java.util.stream.Collectors; +import java.util.stream.LongStream; + +import com.google.common.collect.Sets; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import accord.api.ConfigurationService; +import accord.api.ConfigurationService.EpochReady; +import accord.impl.DefaultTimeouts; +import accord.impl.SizeOfIntersectionSorter; +import accord.impl.TestAgent; +import accord.local.Node; +import accord.local.TimeService; +import accord.primitives.Ranges; +import accord.topology.Topology; +import accord.topology.TopologyManager; +import accord.utils.Gen; +import accord.utils.Invariants; +import accord.utils.Property.SimpleCommand; +import accord.utils.RandomSource; +import accord.utils.async.AsyncChain; +import accord.utils.async.AsyncChains; +import accord.utils.async.AsyncResult; +import accord.utils.async.Cancellable; +import org.apache.cassandra.concurrent.ScheduledExecutorPlus; +import org.apache.cassandra.concurrent.SimulatedExecutorFactory; +import org.apache.cassandra.concurrent.Stage; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.dht.Murmur3Partitioner; +import org.apache.cassandra.dht.Murmur3Partitioner.LongToken; +import org.apache.cassandra.gms.IFailureDetectionEventListener; +import org.apache.cassandra.gms.IFailureDetector; +import org.apache.cassandra.locator.InetAddressAndPort; +import org.apache.cassandra.net.IVerbHandler; +import org.apache.cassandra.net.Message; +import org.apache.cassandra.net.SimulatedMessageDelivery; +import org.apache.cassandra.net.SimulatedMessageDelivery.Action; +import org.apache.cassandra.net.Verb; +import org.apache.cassandra.schema.DistributedSchema; +import org.apache.cassandra.schema.KeyspaceMetadata; +import org.apache.cassandra.schema.KeyspaceParams; +import org.apache.cassandra.schema.ReplicationParams; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.schema.TableParams; +import org.apache.cassandra.schema.Tables; +import org.apache.cassandra.service.accord.AccordConfigurationService.EpochSnapshot; +import org.apache.cassandra.service.consensus.TransactionalMode; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.tcm.ClusterMetadataService; +import org.apache.cassandra.tcm.Epoch; +import org.apache.cassandra.tcm.MultiStepOperation; +import org.apache.cassandra.tcm.StubClusterMetadataService; +import org.apache.cassandra.tcm.Transformation; +import org.apache.cassandra.tcm.ValidatingClusterMetadataService; +import org.apache.cassandra.tcm.membership.Location; +import org.apache.cassandra.tcm.membership.NodeAddresses; +import org.apache.cassandra.tcm.membership.NodeId; +import org.apache.cassandra.tcm.membership.NodeState; +import org.apache.cassandra.tcm.membership.NodeVersion; +import org.apache.cassandra.tcm.ownership.UniformRangePlacement; +import org.apache.cassandra.tcm.sequences.LeaveStreams; +import org.apache.cassandra.tcm.transformations.PrepareJoin; +import org.apache.cassandra.tcm.transformations.PrepareLeave; +import org.apache.cassandra.utils.ByteArrayUtil; +import org.apache.cassandra.utils.Pair; +import org.assertj.core.api.Assertions; +import org.assertj.core.description.Description; + +import static accord.utils.Property.commands; +import static accord.utils.Property.stateful; + +public class EpochSyncTest +{ + private static final Logger logger = LoggerFactory.getLogger(EpochSyncTest.class); + + static + { + DatabaseDescriptor.clientInitialization(); + DatabaseDescriptor.setPartitionerUnsafe(Murmur3Partitioner.instance); + + ClusterMetadataService.setInstance(StubClusterMetadataService.forTesting()); + } + + @Test + public void test() + { + stateful().withExamples(50).withSteps(500).check(commands(() -> Cluster::new) + .destroyState(cluster -> { + finishPendingWork(cluster); + cluster.processAll(); + cluster.validate(true); + }) + .addAllIf(Cluster::hasPendingWork, b -> { + b.addIf(c -> !c.status(s -> s == Cluster.Status.Registered).isEmpty(), + (rs, state) -> { + long epoch = state.cms.metadata().epoch.getEpoch() + 1; + Node.Id pick = rs.pick(state.status(s -> s == Cluster.Status.Registered)); + return new SimpleCommand<>(String.format("%s Start Joining; epoch=%d", pick, epoch), + c -> c.increment(pick)); + }) + .addIf(c -> !c.cms.metadata().inProgressSequences.isEmpty(), + (rs, state) -> new SimpleCommand<>(String.format("Next Epoch Step; epoch=%d", state.cms.metadata().epoch.getEpoch() + 1), + Cluster::incrementInProgressSequences)); + }) + .addAllIf(Cluster::hasNoPendingWork, b -> { + b.addIf(cluster -> cluster.joined().size() <= cluster.maxNodes, EpochSyncTest::addNode) + .addIf(cluster -> cluster.joined().size() > cluster.minNodes, EpochSyncTest::removeNode); + }) + .addIf(Cluster::hasWork, EpochSyncTest::processSome) + .add(rs -> new SimpleCommand<>("Validate", c -> c.validate(false))) + .add((rs, cluster) -> new SimpleCommand<>("Bump Epoch " + (cluster.cms.metadata().epoch.getEpoch() + 1), Cluster::bumpEpoch)) + .build()); + } + + private static void finishPendingWork(Cluster cluster) + { + List registered = cluster.status(s -> s == Cluster.Status.Registered); + if (!registered.isEmpty()) + registered.forEach(cluster::increment); + while (!cluster.cms.metadata().inProgressSequences.isEmpty()) + cluster.incrementInProgressSequences(); + } + + private static SimpleCommand addNode(RandomSource rs, Cluster cluster) + { + Node.Id id = new Node.Id(++cluster.nodeCounter); + long token = cluster.tokenGen.nextLong(rs); + while (cluster.tokens.contains(token)) + token = cluster.tokenGen.nextLong(rs); + long epoch = cluster.cms.metadata().epoch.getEpoch() + 1; + long finalToken = token; + return new SimpleCommand<>("Start Node " + id + "; token=" + token + ", epoch=" + epoch, + c -> c.registerNode(id, finalToken)); + } + + private static SimpleCommand removeNode(RandomSource rs, Cluster cluster) + { + List alive = cluster.joined(); + Node.Id pick = rs.pick(alive); + long token = cluster.instances.get(pick).token; + long epoch = cluster.cms.metadata().epoch.getEpoch() + 1; + return new SimpleCommand<>("Remove Node " + pick + "; token=" + token + "; epoch=" + epoch, c -> c.removeNode(pick)); + } + + private static SimpleCommand processSome(RandomSource rs) { + return new SimpleCommand<>("Process Some", + c -> {//noinspection StatementWithEmptyBody + for (int i = 0, attempts = rs.nextInt(1, 100); i < attempts && c.processOne(); i++) {} + }); + } + + private static class Cluster + { + private static final int rf = 2; + private static final ReplicationParams replication_params = ReplicationParams.simple(rf); + + private final RandomSource rs; + private final int minNodes, maxNodes; + private final Gen.LongGen tokenGen; + private final SortedSet tokens = new TreeSet<>(); + private final Map instances = new HashMap<>(); + private final Set removed = new HashSet<>(); + private final List failures = new ArrayList<>(); + private final SimulatedExecutorFactory globalExecutor; + private final ScheduledExecutorPlus scheduler; + private int nodeCounter = 0; + private final ValidatingClusterMetadataService cms = ValidatingClusterMetadataService.createAndRegister(NodeVersion.CURRENT_METADATA_VERSION); + + private final IFailureDetector fd = new IFailureDetector() + { + @Override + public boolean isAlive(InetAddressAndPort ep) + { + return instances.get(nodeId(ep)).status != Status.Removed; + } + + @Override + public void interpret(InetAddressAndPort ep) + { + + } + + @Override + public void report(InetAddressAndPort ep) + { + + } + + @Override + public void remove(InetAddressAndPort ep) + { + + } + + @Override + public void forceConviction(InetAddressAndPort ep) + { + + } + + @Override + public void registerFailureDetectionEventListener(IFailureDetectionEventListener listener) + { + + } + + @Override + public void unregisterFailureDetectionEventListener(IFailureDetectionEventListener listener) + { + + } + }; + + public Cluster(RandomSource rs) + { + // add the test keyspace + createTestKeyspaceAndTable(); + this.rs = rs; + this.minNodes = 3; + this.maxNodes = 10; + this.tokenGen = rs2 -> rs2.nextLong(Long.MIN_VALUE + 1, Long.MAX_VALUE); + + this.globalExecutor = new SimulatedExecutorFactory(rs, failures::add); + this.scheduler = globalExecutor.scheduled("ignored"); + Stage.MISC.unsafeSetExecutor(scheduler); + + scheduler.scheduleWithFixedDelay(() -> { + if (aliveCount() < 2) return; + if (!partitions.isEmpty() && rs.nextBoolean()) + { + // remove partition + if (partitions.size() == 1) + { + partitions.clear(); + return; + } + partitions.remove(rs.pickOrderedSet(partitions)); + } + else + { + // add partition + List alive = notRemoved(); + InetAddressAndPort a = address(rs.pick(alive)); + InetAddressAndPort b = address(rs.pick(alive)); + while (a.equals(b)) + b = address(rs.pick(alive)); + partitions.add(new Connection(a, b)); + } + }, 1, 1, TimeUnit.MINUTES); + } + + private static InetAddressAndPort address(Node.Id id) + { + try + { + return InetAddressAndPort.getByAddress(ByteArrayUtil.bytes(id.id)); + } + catch (UnknownHostException e) + { + throw new AssertionError("Unable to create address for id " + id, e); + } + } + + private boolean hasPendingWork() + { + return !status(s -> s == Cluster.Status.Registered).isEmpty() + || !cms.metadata().inProgressSequences.isEmpty(); + } + + private boolean hasNoPendingWork() + { + return !hasPendingWork(); + } + + private Transformation.Success process(Transformation transformation) + { + Transformation.Result result = transformation.execute(cms.metadata()); + if (result.isRejected()) + throw new IllegalStateException("Unable to make TCM transition: " + result.rejected()); + return result.success(); + } + + private Transformation.Success process(MultiStepOperation transformation) + { + Transformation.Result result = transformation.applyTo(cms.metadata()); + if (result.isRejected()) + throw new IllegalStateException("Unable to make TCM transition"); + return result.success(); + } + + public void incrementInProgressSequences() + { + if (cms.metadata().inProgressSequences.isEmpty()) + throw new IllegalStateException("Attempted to bump epoch when nothing was pending"); + Iterator> it = cms.metadata().inProgressSequences.iterator(); + Invariants.require(it.hasNext()); + notify(process(it.next()).metadata); + } + + private static boolean left(ClusterMetadata metadata, Node.Id id) + { + return metadata.directory.peerState(new NodeId(id.id)) == NodeState.LEFT; + } + + private static boolean joined(ClusterMetadata metadata, Node.Id id) + { + NodeAddresses address = metadata.directory.getNodeAddresses(new NodeId(id.id)); + return metadata.placements.get(replication_params).reads.byEndpoint().keySet().contains(address.broadcastAddress); + } + + public enum EpochTracker { topologyManager, accordSyncPropagator, configurationService} + + Set globalSynced(long epoch) + { + return notRemoved().stream() + .filter(n -> instances.get(n).epoch.getEpoch() <= epoch) + .map(n -> instances.get(n).synced(epoch)) + .reduce(EnumSet.allOf(EpochTracker.class), Sets::intersection); + } + + boolean allSynced(long epoch) + { + Set done = globalSynced(epoch); + return done.contains(EpochTracker.topologyManager); + } + + private static Node.Id nodeId(InetAddressAndPort address) + { + return new Node.Id(ByteArrayUtil.getInt(address.addressBytes)); + } + + private void createTestKeyspaceAndTable() + { + ClusterMetadata current = cms.metadata(); + Tables tables = Tables.of(TableMetadata.minimal("test", "tb1").unbuild() + .partitioner(Murmur3Partitioner.instance) + .params(TableParams.builder().transactionalMode(TransactionalMode.full).build()) + .build()); + KeyspaceMetadata ks = KeyspaceMetadata.create("test", KeyspaceParams.simple(rf), tables); + + cms.setMetadata(current.transformer() + .with(new DistributedSchema(current.schema.getKeyspaces().with(ks))) + .build() + .metadata); + } + + void validate(boolean isDone) + { + for (Node.Id id : notRemoved()) + { + Instance inst = instances.get(id); + if (removed.contains(id)) continue; // ignore removed nodes + AccordConfigurationService conf = inst.config; + TopologyManager tm = inst.topology; + for (long epoch = Math.max(tm.firstNonEmpty(), inst.epoch.getEpoch()); epoch <= cms.metadata().epoch.getEpoch(); epoch++) + { + // validate config + EpochSnapshot snapshot = conf.getEpochSnapshot(epoch); + if (isDone) + { + Assertions.assertThat(snapshot).describedAs("node%s does not have epoch %d", id, epoch).isNotNull(); + Assertions.assertThat(snapshot.syncStatus).isEqualTo(AccordConfigurationService.SyncStatus.COMPLETED); + + // validate topology manager + Assertions.assertThat(tm.hasEpoch(epoch)).describedAs("node%s does not have epoch %d", id, epoch).isTrue(); + Ranges ranges = tm.globalForEpoch(epoch).ranges().mergeTouching(); + Ranges actual = tm.syncComplete(epoch).mergeTouching(); + Assertions.assertThat(actual) + .describedAs("node%s does not have all expected sync ranges for epoch %d; missing %s", id, epoch, ranges.without(actual)) + .isEqualTo(ranges); + } + else + { + if (snapshot == null || snapshot.syncStatus != AccordConfigurationService.SyncStatus.COMPLETED) continue; + + if (!allSynced(epoch)) + continue; + + Assertions.assertThat(tm.hasEpoch(epoch)).describedAs("node%s does not have epoch %d", id, epoch).isTrue(); + Topology topology = tm.globalForEpoch(epoch); + Ranges ranges = topology.ranges().mergeTouching(); + Ranges actual = tm.syncComplete(epoch).mergeTouching(); + // TopologyManager defines syncComplete for an epoch as (epoch - 1).syncComplete. This means that an epoch has reached quorum, but will still miss ranges as previous epochs have not + if (!ranges.equals(actual) && tm.minEpoch() != epoch && !ranges.equals(tm.syncComplete(epoch - 1).mergeTouching())) + continue; + long epoch_ = epoch; + Assertions.assertThat(actual) + .describedAs(new Description() + { + public String value() + { + return String.format("node%s does not have all expected sync ranges for epoch %d; missing %s; peers=%s; previous epochs %s", + id, epoch_, ranges.without(actual), topology.nodes(), + LongStream.range(inst.epoch.getEpoch(), epoch_ + 1) + .mapToObj(e -> String.format("%d -> %s(synced=%s): %s", e, conf.getEpochSnapshot(e).syncStatus, globalSynced(e), tm.syncComplete(e))) + .collect(Collectors.joining("\n"))); + + } + }) + .isEqualTo(ranges); + } + } + } + } + + String displayTopology() + { + class Hold { + final Cluster.Status status; + final long token; + + Hold(Status status, long token) + { + this.status = status; + this.token = token; + } + + @Override + public String toString() + { + return status + "\t" + (status == Status.Registered ? "?" : Long.toString(token)); + } + } + List notRemoved = notRemoved(); + List> list = new ArrayList<>(notRemoved.size()); + for (Node.Id n : notRemoved) + { + Instance instance = instances.get(n); + list.add(Pair.create(n, new Hold(instance.status, instance.token))); + } + list.sort(Comparator.comparing(a -> a.right.token)); + StringBuilder sb = new StringBuilder(); + for (var p : list) + sb.append(p.left).append('\t').append(p.right).append('\n'); + return sb.toString(); + } + + @Override + public String toString() + { + return "Topology:\n" + displayTopology(); + } + + boolean hasWork() + { + return globalExecutor.hasWork(); + } + + boolean processOne() + { + boolean result = globalExecutor.processOne(); + checkFailures(); + return result; + } + + @SuppressWarnings("StatementWithEmptyBody") + void processAll() + { + while (processOne()) + { + } + } + + public void checkFailures() + { + if (Thread.interrupted()) + failures.add(new InterruptedException()); + if (failures.isEmpty()) return; + AssertionError error = new AssertionError("Unexpected exceptions found"); + failures.forEach(error::addSuppressed); + failures.clear(); + throw error; + } + + List joined() + { + return status(s -> s == Status.Joined); + } + + List status(Predicate fn) + { + List ids = new ArrayList<>(instances.size()); + for (Instance i : instances.values()) + { + if (fn.test(i.status)) + ids.add(i.id); + } + ids.sort(Comparator.naturalOrder()); + return ids; + } + + List notRemoved() + { + ArrayList ids = new ArrayList<>(Sets.difference(instances.keySet(), removed)); + ids.sort(Comparator.naturalOrder()); + return ids; + } + + int aliveCount() + { + return instances.size() - removed.size(); + } + + private final NavigableSet partitions = new TreeSet<>(); + + private boolean partitioned(InetAddressAndPort self, InetAddressAndPort to) + { + return partitions.contains(new Connection(self, to)); + } + + private SimulatedMessageDelivery createMessaging(Node.Id id) + { + InetAddressAndPort address = address(id); + return new SimulatedMessageDelivery(address, + (self, msg, to) -> { + if (removed.contains(nodeId(self)) || removed.contains(nodeId(to))) + return Action.DROP; + if (!self.equals(to) && partitioned(self, to)) + return Action.DROP_PARTITIONED; + if (rs.decide(.01)) + return rs.nextBoolean() ? Action.DELIVER_WITH_FAILURE : Action.FAILURE; + return Action.DELIVER; + }, + SimulatedMessageDelivery.randomDelay(rs.fork()), + (to, msg) -> instances.get(nodeId(to)).reciver.recieve(msg), + (action, to, msg) -> logger.trace("{} message {}", action, msg), + scheduler::schedule, + failures::add); + } + + void registerNode(Node.Id id, long token) + { + Invariants.require(!tokens.contains(token), "Attempted to add token %d for node %s but token is already taken", token, id); + Invariants.require(!instances.containsKey(id), "Attempted to add node %s; but already exists", id); + + ClusterMetadata.Transformer builder = cms.metadata().transformer(); + + Instance instance = new Instance(id, token, builder.epoch(), createMessaging(id), fd); + instances.put(id, instance); + tokens.add(token); + + builder.register(new NodeAddresses(address(id)), new Location("dc1", "r1"), NodeVersion.CURRENT); + notify(builder.build().metadata); + } + + void increment(Node.Id pick) + { + Instance inst = Objects.requireNonNull(instances.get(pick), "Unknown id " + pick); + + switch (inst.status) + { + case Init: + case Joined: + case Removed: + throw new IllegalStateException("Unexpected status: " + inst.status); + case Registered: + inst.status = Status.Joining; + PrepareJoin task = new PrepareJoin(new NodeId(pick.id), Collections.singleton(new LongToken(inst.token)), new UniformRangePlacement(), true, false); + notify(process(task).metadata); + break; + default: + throw new UnsupportedOperationException("Unknown status: " + inst.status); + } + } + + void removeNode(Node.Id pick) + { + Instance inst = Objects.requireNonNull(instances.get(pick), "Unknown id " + pick); + Invariants.require(!removed.contains(pick), "Can not remove node twice; node " + pick); + removed.add(pick); + inst.status = Status.Leaving; + PrepareLeave prepareLeave = new PrepareLeave(new NodeId(pick.id), false, new UniformRangePlacement(), LeaveStreams.Kind.REMOVENODE); + notify(process(prepareLeave).metadata); + } + + void bumpEpoch() + { + notify(cms.metadata().forceEpoch(Epoch.create(cms.metadata().epoch.getEpoch() + 1))); + } + + private void notify(ClusterMetadata current) + { + Topology t = AccordTopology.createAccordTopology(current); + Ranges ranges = t.ranges().mergeTouching(); + if (!current.placements.get(replication_params).reads.isEmpty()) + Assertions.assertThat(ranges).hasSize(1); + cms.setMetadata(current); + for (Node.Id id : status(s -> s != Status.Removed)) + { + Instance inst = instances.get(id); + inst.maybeTransition(current, t); + inst.config.maybeReportMetadata(current); + } + } + + @SuppressWarnings("SameParameterValue") + private AsyncChain schedule(long time, TimeUnit unit, Callable task) + { + return new AsyncChains.Head<>() + { + @Override + protected Cancellable start(BiConsumer callback) + { + Future future = scheduler.schedule(() -> { + T value; + try + { + value = task.call(); + } + catch (Throwable t) + { + callback.accept(null, t); + return; + } + callback.accept(value, null); + }, time, unit); + return () -> future.cancel(true); + } + }; + } + + private enum Status { Init, Registered, Joining, Joined, Leaving, Removed} + private class Instance + { + private final Node.Id id; + private final long token; + private final AccordConfigurationService config; + private final SimulatedMessageDelivery messaging; + private final SimulatedMessageDelivery.SimulatedMessageReceiver reciver; + private final TopologyManager topology; + private final Epoch epoch; + private Status status = Status.Init; + + Instance(Node.Id node, long token, Epoch epoch, SimulatedMessageDelivery messagingService, IFailureDetector failureDetector) + { + this.id = node; + this.token = token; + this.epoch = epoch; + // TODO (review): Should there be a real scheduler here? Is it possible to adapt the Scheduler interface to scheduler used in this test? + TimeService time = TimeService.ofNonMonotonic(globalExecutor::currentTimeMillis, TimeUnit.MILLISECONDS); + this.topology = new TopologyManager(SizeOfIntersectionSorter.SUPPLIER, new TestAgent.RethrowAgent(), id, time, new DefaultTimeouts(time)); + config = new AccordConfigurationService(node, messagingService, failureDetector, scheduler); + config.registerListener(new ConfigurationService.Listener() + { + @Override + public AsyncResult onTopologyUpdate(Topology topology, boolean isLoad, boolean startSync) + { + AsyncResult metadata = schedule(rs.nextInt(1, 10), TimeUnit.SECONDS, (Callable) () -> null).beginAsResult(); + AsyncResult coordination = metadata.flatMap(ignore -> schedule(rs.nextInt(1, 10), TimeUnit.SECONDS, (Callable) () -> null)).beginAsResult(); + AsyncResult data = coordination.flatMap(ignore -> schedule(rs.nextInt(1, 10), TimeUnit.SECONDS, (Callable) () -> null)).beginAsResult(); + AsyncResult reads = data.flatMap(ignore -> schedule(rs.nextInt(1, 10), TimeUnit.SECONDS, (Callable) () -> null)).beginAsResult(); + EpochReady ready = new EpochReady(topology.epoch(), metadata, coordination, data, reads); + + topology().onTopologyUpdate(topology, () -> ready, e -> {}); + ready.coordinate.invokeIfSuccess(() -> topology().onEpochSyncComplete(id, topology.epoch())); + if (topology().minEpoch() == topology.epoch() && topology().epoch() != topology.epoch()) + return ready.coordinate; + config.acknowledgeEpoch(ready, startSync); + return ready.coordinate; + } + + @Override + public void onRemoteSyncComplete(Node.Id node, long epoch) + { + topology.onEpochSyncComplete(node, epoch); + } + + @Override + public void onRemoveNode(long epoch, Node.Id removed) + { + // TODO + //topology.onRemoveNode(epoch, removed); + } + + @Override + public void onEpochClosed(Ranges ranges, long epoch) + { + topology.onEpochClosed(ranges, epoch); + } + + @Override + public void onEpochRetired(Ranges ranges, long epoch) + { + topology.onEpochRetired(ranges, epoch); + } + }); + + Map> handlers = new EnumMap<>(Verb.class); + //noinspection unchecked + handlers.put(Verb.ACCORD_SYNC_NOTIFY_REQ, msg -> AccordService.receive(messagingService, config, (Message) (Message) msg)); + this.messaging = messagingService; + this.reciver = messagingService.receiver(new SimulatedMessageDelivery.SimpleVerbHandler(handlers)); + } + + @Override + public String toString() + { + return "Instance{" + + "id=" + id + + ", token=" + token + + ", epoch=" + epoch + + ", status=" + status + + '}'; + } + + void maybeTransition(ClusterMetadata current, Topology t) + { + switch (status) + { + case Init: + Invariants.require(!t.nodes().contains(id), "Node was in Init state but present in the Topology!"); + Invariants.require(current.directory.peerId(address(id)) != null, "Node exists but not in TCM"); + start(); + status = Status.Registered; + break; + case Registered: + Invariants.require(!t.nodes().contains(id), "Node was in Init state but present in the Topology!"); + Invariants.require(current.directory.peerId(address(id)) != null, "Node exists but not in TCM"); + if (current.placements.get(replication_params).writes.byEndpoint().keySet().contains(address(id))) + status = Status.Joining; + break; + case Joining: + Invariants.require(current.directory.peerId(address(id)) != null, "Node exists but not in TCM"); + if (joined(current, id)) + status = Status.Joined; + case Removed: + case Joined: + // nothing to do + break; + case Leaving: + if (left(current, id)) + stop(); + break; + default: + throw new UnsupportedOperationException("Unknown status: " + status); + } + } + + private void start() + { + config.start(); + } + + TopologyManager topology() + { + return topology; + } + + Set synced(long epoch) + { + if (epoch < this.epoch.getEpoch()) throw new IllegalArgumentException("Asked for epoch before this instance existed"); + EnumSet done = EnumSet.noneOf(EpochTracker.class); + EpochSnapshot snapshot = config.getEpochSnapshot(epoch); + if (snapshot != null && snapshot.syncStatus == AccordConfigurationService.SyncStatus.COMPLETED) + done.add(EpochTracker.configurationService); + if (topology.hasReachedQuorum(epoch)) + done.add(EpochTracker.topologyManager); + if (!config.syncPropagator().hasPending(epoch)) + done.add(EpochTracker.accordSyncPropagator); + return done; + } + + void stop() + { + status = Status.Removed; + tokens.remove(token); + messaging.stop(); + } + } + } + + private static class Connection implements Comparable + { + final InetAddressAndPort from, to; + + private Connection(InetAddressAndPort from, InetAddressAndPort to) + { + this.from = from; + this.to = to; + } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + Connection that = (Connection) o; + return from.equals(that.from) && to.equals(that.to); + } + + @Override + public int hashCode() + { + return Objects.hash(from, to); + } + + @Override + public String toString() + { + return "Connection{" + "from=" + from + ", to=" + to + '}'; + } + + @Override + public int compareTo(Connection o) + { + int rc = from.compareTo(o.from); + if (rc == 0) + rc = to.compareTo(o.to); + return rc; + } + } +} diff --git a/test/unit/org/apache/cassandra/service/accord/SimpleAccordEndpointMapper.java b/test/unit/org/apache/cassandra/service/accord/SimpleAccordEndpointMapper.java new file mode 100644 index 000000000000..acca53d5fccf --- /dev/null +++ b/test/unit/org/apache/cassandra/service/accord/SimpleAccordEndpointMapper.java @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import java.net.InetAddress; +import java.net.UnknownHostException; +import java.nio.ByteBuffer; + +import accord.local.Node; +import org.apache.cassandra.locator.InetAddressAndPort; +import org.apache.cassandra.utils.ByteBufferUtil; + +public enum SimpleAccordEndpointMapper implements AccordEndpointMapper +{ + INSTANCE; + + @Override + public Node.Id mappedIdOrNull(InetAddressAndPort endpoint) + { + if (endpoint.addressBytes.length != 4) + throw new IllegalArgumentException("Only IPV4 is allowed: given " + endpoint.toString(true)); + return new Node.Id(ByteBuffer.wrap(endpoint.addressBytes).getInt()); + } + + @Override + public InetAddressAndPort mappedEndpointOrNull(Node.Id id) + { + byte[] array = ByteBufferUtil.bytes(id.id).array(); + try + { + return InetAddressAndPort.getByAddressOverrideDefaults(InetAddress.getByAddress(array), 1); + } + catch (UnknownHostException e) + { + throw new AssertionError("Unable to convert " + id + " to an IPV4 address", e); + } + } +} diff --git a/test/unit/org/apache/cassandra/service/accord/SimpleSimulatedAccordCommandStoreTest.java b/test/unit/org/apache/cassandra/service/accord/SimpleSimulatedAccordCommandStoreTest.java new file mode 100644 index 000000000000..a8523e469e12 --- /dev/null +++ b/test/unit/org/apache/cassandra/service/accord/SimpleSimulatedAccordCommandStoreTest.java @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import org.junit.Test; + +import accord.local.StoreParticipants; +import accord.primitives.SaveStatus; +import accord.primitives.TxnId; +import accord.utils.AccordGens; +import org.assertj.core.api.Assertions; + +import static accord.utils.Property.qt; + +public class SimpleSimulatedAccordCommandStoreTest extends SimulatedAccordCommandStoreTestBase +{ + @Test + public void emptyTxns() + { + qt().withExamples(10).check(rs -> { + AccordKeyspace.unsafeClear(); + try (var instance = new SimulatedAccordCommandStore(reverseTokenTbl.id, rs)) + { + for (int i = 0, examples = 100; i < examples; i++) + { + TxnId id = AccordGens.txnIds().next(rs); + instance.process(id, (safe) -> { + var safeCommand = safe.get(id, StoreParticipants.empty(id)); + var command = safeCommand.current(); + Assertions.assertThat(command.saveStatus()).isEqualTo(SaveStatus.Uninitialised); + return null; + }); + } + } + + }); + } +} diff --git a/test/unit/org/apache/cassandra/service/accord/SimulatedAccordCommandStore.java b/test/unit/org/apache/cassandra/service/accord/SimulatedAccordCommandStore.java new file mode 100644 index 000000000000..ef7e92c6ea5f --- /dev/null +++ b/test/unit/org/apache/cassandra/service/accord/SimulatedAccordCommandStore.java @@ -0,0 +1,497 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.concurrent.CancellationException; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeUnit; +import java.util.function.BiFunction; +import java.util.function.BooleanSupplier; +import java.util.function.Function; +import java.util.function.Predicate; +import java.util.function.ToLongFunction; +import javax.annotation.Nullable; + +import accord.api.Journal; +import accord.api.LocalListeners; +import accord.api.ProgressLog; +import accord.api.RemoteListeners; +import accord.api.RoutingKey; +import accord.api.Timeouts; +import accord.impl.DefaultLocalListeners; +import accord.impl.DefaultTimeouts; +import accord.impl.SizeOfIntersectionSorter; +import accord.impl.TestAgent; +import accord.impl.basic.InMemoryJournal; +import accord.impl.basic.SimulatedFault; +import accord.local.Command; +import accord.local.CommandStore; +import accord.local.CommandStores; +import accord.local.DurableBefore; +import accord.local.Node; +import accord.local.NodeCommandStoreService; +import accord.local.PreLoadContext; +import accord.local.SafeCommand; +import accord.local.SafeCommandStore; +import accord.local.TimeService; +import accord.local.durability.DurabilityService; +import accord.messages.BeginRecovery; +import accord.messages.PreAccept; +import accord.messages.Reply; +import accord.messages.TxnRequest; +import accord.primitives.AbstractUnseekableKeys; +import accord.primitives.Ballot; +import accord.primitives.EpochSupplier; +import accord.primitives.FullRoute; +import accord.primitives.Ranges; +import accord.primitives.Routable; +import accord.primitives.RoutableKey; +import accord.primitives.Route; +import accord.primitives.RoutingKeys; +import accord.primitives.Txn; +import accord.primitives.TxnId; +import accord.primitives.Unseekables; +import accord.topology.Topologies; +import accord.topology.Topology; +import accord.topology.TopologyManager; +import accord.utils.Gens; +import accord.utils.RandomSource; +import accord.utils.async.AsyncChains; +import accord.utils.async.AsyncResult; +import org.apache.cassandra.concurrent.ExecutorFactory; +import org.apache.cassandra.concurrent.ScheduledExecutorPlus; +import org.apache.cassandra.concurrent.SimulatedExecutorFactory; +import org.apache.cassandra.concurrent.Stage; +import org.apache.cassandra.db.ColumnFamilyStore; +import org.apache.cassandra.db.DataRange; +import org.apache.cassandra.db.Keyspace; +import org.apache.cassandra.db.compaction.CompactionManager; +import org.apache.cassandra.db.filter.ColumnFilter; +import org.apache.cassandra.db.memtable.Memtable; +import org.apache.cassandra.metrics.AccordCacheMetrics; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.service.accord.api.TokenKey; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.utils.FBUtilities; +import org.apache.cassandra.utils.Generators; +import org.apache.cassandra.utils.Pair; +import org.assertj.core.api.Assertions; + +import static org.apache.cassandra.config.DatabaseDescriptor.getPartitioner; +import static org.apache.cassandra.db.ColumnFamilyStore.FlushReason.UNIT_TESTS; +import static org.apache.cassandra.schema.SchemaConstants.ACCORD_KEYSPACE_NAME; +import static org.apache.cassandra.utils.AccordGenerators.fromQT; + +public class SimulatedAccordCommandStore implements AutoCloseable +{ + private final List failures = new ArrayList<>(); + private final SimulatedExecutorFactory globalExecutor; + private final CommandStore.EpochUpdateHolder updateHolder; + private final BooleanSupplier shouldEvict, shouldFlush, shouldCompact; + + public final NodeCommandStoreService storeService; + public final AccordCommandStore commandStore; + public final Node.Id nodeId; + public final Topology topology; + public final Topologies topologies; + public final Journal journal; + public final ScheduledExecutorPlus unorderedScheduled; + public final List evictions = new ArrayList<>(); + public Predicate ignoreExceptions = ignore -> false; + + public interface FunctionWrapper + { + BiFunction wrap(BiFunction f); + + static BiFunction identity(BiFunction f) { return f; } + static FunctionWrapper identity() { return FunctionWrapper::identity; } + } + + + public SimulatedAccordCommandStore(RandomSource rs) + { + this(null, rs, FunctionWrapper.identity()); + } + + public SimulatedAccordCommandStore(TableId tableId, RandomSource rs) + { + this(tableId, rs, FunctionWrapper.identity()); + } + + public SimulatedAccordCommandStore(RandomSource rs, FunctionWrapper loadFunctionWrapper) + { + this(null, rs, loadFunctionWrapper); + } + + public SimulatedAccordCommandStore(@Nullable TableId tableId, RandomSource rs, FunctionWrapper loadFunctionWrapper) + { + globalExecutor = new SimulatedExecutorFactory(rs.fork(), fromQT(Generators.TIMESTAMP_GEN.map(java.sql.Timestamp::getTime)).mapToLong(TimeUnit.MILLISECONDS::toNanos).next(rs), failures::add); + this.unorderedScheduled = globalExecutor.scheduled("ignored"); + ExecutorFactory.Global.unsafeSet(globalExecutor); + for (Stage stage : Arrays.asList(Stage.READ, Stage.MUTATION)) + stage.unsafeSetExecutor(unorderedScheduled); + for (Stage stage : Arrays.asList(Stage.MISC, Stage.ACCORD_MIGRATION, Stage.READ, Stage.MUTATION)) + stage.unsafeSetExecutor(globalExecutor.configureSequential("ignore").build()); + + this.nodeId = AccordTopology.tcmIdToAccord(ClusterMetadata.currentNullable().myNodeId()); + this.updateHolder = new CommandStore.EpochUpdateHolder(); + this.topology = AccordTopology.createAccordTopology(ClusterMetadata.current()); + this.topologies = new Topologies.Single(SizeOfIntersectionSorter.SUPPLIER, topology); + Ranges ranges = topology.ranges(); + if (tableId != null) + ranges = ranges.slice(Ranges.of(TokenRange.create(TokenKey.min(tableId, getPartitioner()), TokenKey.max(tableId, getPartitioner())))); + CommandStores.RangesForEpoch rangesForEpoch = new CommandStores.RangesForEpoch(topology.epoch(), ranges); + updateHolder.add(topology.epoch(), rangesForEpoch, ranges); + + this.storeService = new NodeCommandStoreService() + { + private final ToLongFunction elapsed = TimeService.elapsedWrapperFromNonMonotonicSource(TimeUnit.NANOSECONDS, this::now); + final Timeouts timeouts = new DefaultTimeouts(this); + + @Override public Timeouts timeouts() { return timeouts; } + + @Override public DurableBefore durableBefore() { return DurableBefore.EMPTY; } + + @Override + public DurabilityService durability() + { + return null; + } + + @Override + public Node.Id id() + { + return nodeId; + } + + @Override + public long epoch() + { + return ClusterMetadata.current().epoch.getEpoch(); + } + + @Override + public long now() + { + return globalExecutor.nanoTime(); + } + + @Override + public long elapsed(TimeUnit unit) + { + return elapsed.applyAsLong(unit); + } + + @Override + public long uniqueNow(long atLeast) + { + long now = now(); + if (now <= atLeast) + throw new UnsupportedOperationException(); + return now; + } + + @Override + public TopologyManager topology() + { + throw new UnsupportedOperationException(); + } + }; + + TestAgent.RethrowAgent agent = new TestAgent.RethrowAgent() + { + @Override + public boolean rejectPreAccept(TimeService time, TxnId txnId) + { + return false; + } + + @Override + public void onUncaughtException(Throwable t) + { + if (ignoreExceptions.test(t)) return; + super.onUncaughtException(t); + } + }; + + this.journal = new DefaultJournal(nodeId, rs.fork()); + this.commandStore = new AccordCommandStore(0, + storeService, + agent, + null, + ignore -> new ProgressLog.NoOpProgressLog(), + cs -> new DefaultLocalListeners(new RemoteListeners.NoOpRemoteListeners(), new DefaultLocalListeners.NotifySink() + { + @Override public void notify(SafeCommandStore safeStore, SafeCommand safeCommand, TxnId listener) {} + @Override public boolean notify(SafeCommandStore safeStore, SafeCommand safeCommand, LocalListeners.ComplexListener listener) { return false; } + }), + updateHolder, + journal, + new AccordExecutorSimple(0, CommandStore.class.getSimpleName() + '[' + 0 + ']', new AccordCacheMetrics("test"), agent)); + this.commandStore.executor().executeDirectlyWithLock(() -> { + commandStore.executor().setCapacity(8 << 20); + commandStore.executor().setWorkingSetSize(4 << 20); + }); + commandStore.unsafeUpdateRangesForEpoch(); + + shouldEvict = boolSource(rs.fork()); + { + // tests used to take 1m but after many changes in accord they now take many minutes and its due to flush... so lower the frequency of flushing + RandomSource fork = rs.fork(); + shouldFlush = () -> fork.decide(.01); + } + shouldCompact = boolSource(rs.fork()); + + commandStore.executor().cacheUnsafe().types().forEach(i -> { + updateLoadFunction(i, loadFunctionWrapper); + i.register(new AccordCache.Listener() + { + @Override + public void onEvict(AccordCacheEntry state) + { + evictions.add(i + " evicted " + state); + } + }); + }); + } + + private void updateLoadFunction(AccordCache.Type i, FunctionWrapper wrapper) + { + i.unsafeSetLoadFunction(wrapper.wrap(i.unsafeGetLoadFunction())); + } + + private static BooleanSupplier boolSource(RandomSource rs) + { + var gen = Gens.bools().mixedDistribution().next(rs); + return () -> gen.next(rs); + } + + public TxnId nextTxnId(Txn.Kind kind, Routable.Domain domain) + { + return new TxnId(storeService.epoch(), storeService.now(), 0, kind, domain, nodeId); + } + + public void maybeCacheEvict(Unseekables keysOrRanges) + { + switch (keysOrRanges.domain()) + { + case Key: + maybeCacheEvict((AbstractUnseekableKeys) keysOrRanges, Ranges.EMPTY); + break; + case Range: + maybeCacheEvict(RoutingKeys.EMPTY, (Ranges) keysOrRanges); + break; + default: + throw new UnsupportedOperationException("Unknown domain: " + keysOrRanges.domain()); + } + } + + public void maybeCacheEvict(Unseekables keys, Ranges ranges) + { + try (AccordExecutor.ExclusiveGlobalCaches caches = commandStore.executor().lockCaches()) + { + AccordCache cache = caches.global; + cache.evictionQueue().forEach(state -> { + Class keyType = state.key().getClass(); + if (TxnId.class.equals(keyType)) + { + Command command = (Command) state.getExclusive(); + if (command != null && command.known().isDefinitionKnown() + && (command.partialTxn().keys().intersects(keys) || command.partialTxn().keys().intersects(ranges)) + && shouldEvict.getAsBoolean()) + cache.tryEvict(state); + } + else if (RoutableKey.class.isAssignableFrom(keyType)) + { + RoutingKey key = (RoutingKey) state.key(); + if ((keys.contains(key) || ranges.intersects(key)) + && shouldEvict.getAsBoolean()) + cache.tryEvict(state); + } + else + { + throw new AssertionError("Unexpected key type: " + state.key().getClass()); + } + }); + } + + for (var store : Keyspace.open(ACCORD_KEYSPACE_NAME).getColumnFamilyStores()) + { + Memtable memtable = store.getCurrentMemtable(); + if (memtable.partitionCount() == 0 || !intersects(store, memtable, keys, ranges)) + continue; + if (shouldFlush.getAsBoolean()) + store.forceBlockingFlush(UNIT_TESTS); + } + for (var store : Keyspace.open(ACCORD_KEYSPACE_NAME).getColumnFamilyStores()) + { + if (store.getLiveSSTables().size() > 5 && shouldCompact.getAsBoolean()) + { + // compaction no-op since auto-compaction is disabled... so need to enable quickly + store.enableAutoCompaction(); + try + { + FBUtilities.waitOnFutures(CompactionManager.instance.submitBackground(store)); + } + finally + { + store.disableAutoCompaction(); + } + } + } + } + + private boolean intersects(ColumnFamilyStore store, Memtable memtable, Unseekables keys, Ranges ranges) + { + if (keys.isEmpty() && ranges.isEmpty()) // shouldn't happen, but just in case... + return false; + switch (store.name) + { + case "commands_for_key": + // pk = (store_id, routing_key) + // since this is simulating a single store, store_id is a constant, so check key + try (var it = memtable.partitionIterator(ColumnFilter.NONE, DataRange.allData(store.getPartitioner()), null)) + { + while (it.hasNext()) + { + var key = AccordKeyspace.CommandsForKeyAccessor.getUserTableKey(commandStore.tableId(), it.next().partitionKey()); + if (keys.contains(key) || ranges.intersects(key)) + return true; + } + } + break; + } + return false; + } + + public void checkFailures() + { + if (Thread.interrupted()) + failures.add(new InterruptedException()); + failures.removeIf(f -> f instanceof CancellationException || f instanceof SimulatedFault); + if (failures.isEmpty()) return; + AssertionError error = new AssertionError("Unexpected exceptions found"); + failures.forEach(error::addSuppressed); + failures.clear(); + throw error; + } + + public T process(TxnRequest request) throws ExecutionException, InterruptedException + { + return process(request, request::apply); + } + + public T process(PreLoadContext loadCtx, Function function) throws ExecutionException, InterruptedException + { + var result = processAsync(loadCtx, function); + processAll(); + return AsyncChains.getBlocking(result); + } + + public AsyncResult processAsync(TxnRequest request) + { + return processAsync(request, request::apply); + } + + public AsyncResult processAsync(PreLoadContext loadCtx, Function function) + { + return commandStore.submit(loadCtx, function).beginAsResult(); + } + + public Pair> enqueuePreAccept(Txn txn, FullRoute route) + { + TxnId txnId = nextTxnId(txn.kind(), txn.keys().domain()); + PreAccept preAccept = new PreAccept(nodeId, topologies, txnId, txn, null, false, route); + return Pair.create(txnId, processAsync(preAccept, safe -> { + var reply = preAccept.apply(safe); + Assertions.assertThat(reply.isOk()).isTrue(); + return (PreAccept.PreAcceptOk) reply; + })); + } + + public Pair> enqueueBeginRecovery(Txn txn, FullRoute route) + { + TxnId txnId = nextTxnId(txn.kind(), txn.keys().domain()); + Ballot ballot = Ballot.fromValues(storeService.epoch(), storeService.now(), nodeId); + BeginRecovery br = new BeginRecovery(nodeId, topologies, txnId, null, txn, route, ballot); + + return Pair.create(txnId, processAsync(br, safe -> { + var reply = br.apply(safe); + Assertions.assertThat(reply.kind() == BeginRecovery.RecoverReply.Kind.Ok).isTrue(); + return (BeginRecovery.RecoverOk) reply; + }).beginAsResult()); + } + + public void processAll() + { + while (processOne()) + { + } + } + + private boolean processOne() + { + boolean result = globalExecutor.processOne(); + checkFailures(); + return result; + } + + @Override + public void close() throws Exception + { + commandStore.shutdown(); + } + + private static class DefaultJournal extends InMemoryJournal implements RangeSearcher.Supplier + { + private final RouteInMemoryIndex index = new RouteInMemoryIndex<>(); + private DefaultJournal(Node.Id id, RandomSource rs) + { + super(id, rs); + } + + @Override + public void saveCommand(int commandStoreId, CommandUpdate update, Runnable onFlush) + { + super.saveCommand(commandStoreId, update, onFlush); + if (!update.after.txnId().domain().isRange()) + return; + Command after = update.after; + Route route = after.participants().route(); + if (route != null) + index.update(0, commandStoreId, after.txnId(), route); + } + + @Override + public void purge(CommandStores commandStores, EpochSupplier epochSupplier) + { + super.purge(commandStores, epochSupplier); + index.truncateForTesting(); + } + + @Override + public RangeSearcher rangeSearcher() + { + return index; + } + } +} diff --git a/test/unit/org/apache/cassandra/service/accord/SimulatedAccordCommandStoreTestBase.java b/test/unit/org/apache/cassandra/service/accord/SimulatedAccordCommandStoreTestBase.java new file mode 100644 index 000000000000..8a5570d73607 --- /dev/null +++ b/test/unit/org/apache/cassandra/service/accord/SimulatedAccordCommandStoreTestBase.java @@ -0,0 +1,477 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.TreeSet; +import java.util.concurrent.ExecutionException; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +import com.google.common.collect.Maps; +import org.junit.Before; +import org.junit.BeforeClass; + +import accord.api.RoutingKey; +import accord.impl.SizeOfIntersectionSorter; +import accord.local.Node; +import accord.messages.BeginRecovery; +import accord.messages.PreAccept; +import accord.primitives.Ballot; +import accord.primitives.Deps; +import accord.primitives.FullRangeRoute; +import accord.primitives.FullRoute; +import accord.primitives.Keys; +import accord.primitives.LatestDeps; +import accord.primitives.Range; +import accord.primitives.Ranges; +import accord.primitives.Routable; +import accord.primitives.Routables; +import accord.primitives.RoutingKeys; +import accord.primitives.Seekables; +import accord.primitives.Txn; +import accord.primitives.TxnId; +import accord.primitives.Unseekables; +import accord.topology.Topologies; +import accord.utils.Gen; +import accord.utils.Gens; +import accord.utils.Invariants; +import accord.utils.async.AsyncChains; +import accord.utils.async.AsyncResult; +import org.apache.cassandra.ServerTestUtils; +import org.apache.cassandra.config.CassandraRelevantProperties; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.cql3.CQLTester; +import org.apache.cassandra.db.Keyspace; +import org.apache.cassandra.dht.IPartitioner; +import org.apache.cassandra.dht.Murmur3Partitioner; +import org.apache.cassandra.schema.Schema; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.service.accord.api.TokenKey; +import org.apache.cassandra.service.accord.api.PartitionKey; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.utils.Pair; +import org.apache.cassandra.utils.RTree; +import org.apache.cassandra.utils.RangeTree; +import org.assertj.core.api.Assertions; + +import static org.apache.cassandra.schema.SchemaConstants.ACCORD_KEYSPACE_NAME; +import static org.apache.cassandra.service.accord.AccordTestUtils.createTxn; + +public abstract class SimulatedAccordCommandStoreTestBase extends CQLTester +{ + static + { + CassandraRelevantProperties.TEST_ACCORD_STORE_THREAD_CHECKS_ENABLED.setBoolean(false); + // since this test does frequent truncates, the info table gets updated and forced flushed... which is 90% of the cost of this test... + // this flag disables that flush + CassandraRelevantProperties.UNSAFE_SYSTEM.setBoolean(true); + // The plan is to migrate away from SAI, so rather than hacking around timeout issues; just disable for now + CassandraRelevantProperties.SAI_TEST_DISABLE_TIMEOUT.setBoolean(true); + } + + protected enum DepsMessage + {PreAccept, BeginRecovery, PreAcceptThenBeginRecovery} + + protected static final Gen> mixedDomainGen = Gens.enums().allMixedDistribution(Routable.Domain.class); + protected static final Gen mixedTokenGen = top -> { + switch (top.nextInt(0, 3)) + { + case 0: // all + return rs -> rs.nextLong(Long.MIN_VALUE + 1, Long.MAX_VALUE); + case 1: // small + return rs -> rs.nextLong(0, 100); + case 2: // medium + return rs -> rs.nextLong(0, Long.MAX_VALUE); + default: + throw new AssertionError(); + } + }; + + protected static TableMetadata intTbl, reverseTokenTbl; + protected static Node.Id nodeId; + + @BeforeClass + public static void setUpClass() + { + CQLTester.setUpClass(); + DatabaseDescriptor.setIncrementalBackupsEnabled(false); + } + + @Before + public void init() + { + if (intTbl != null) + return; + createKeyspace("CREATE KEYSPACE test WITH replication={ 'class' : 'SimpleStrategy', 'replication_factor' : 2 }"); + createTable("test", "CREATE TABLE test.tbl1 (pk int PRIMARY KEY, value int) WITH transactional_mode='full'"); + intTbl = Schema.instance.getTableMetadata("test", "tbl1"); + + createTable("test", "CREATE TABLE test.tbl2 (pk vector PRIMARY KEY, value int) WITH transactional_mode='full'"); + reverseTokenTbl = Schema.instance.getTableMetadata("test", "tbl2"); + + nodeId = AccordTopology.tcmIdToAccord(ClusterMetadata.current().myNodeId()); + + // tests may flush, which triggers compaction... since compaction is not simulated this adds a form of non-deterministic behavior + for (var store : Keyspace.open(ACCORD_KEYSPACE_NAME).getColumnFamilyStores()) + store.disableAutoCompaction(); + + AccordService.unsafeSetNoop(); + + ServerTestUtils.markCMS(); + } + + protected static TokenRange fullRange(TableId id, IPartitioner partitioner) + { + return TokenRange.create(TokenKey.min(id, partitioner), TokenKey.max(id, partitioner)); + } + + protected static TokenRange tokenRange(TableId id, IPartitioner partitioner, long start, long end) + { + return TokenRange.create(start == Long.MIN_VALUE ? TokenKey.min(id, partitioner) : tokenKey(id, start), tokenKey(id, end)); + } + + protected static TokenKey tokenKey(TableId id, long token) + { + return new TokenKey(id, new Murmur3Partitioner.LongToken(token)); + } + + protected static Map> keyConflicts(List list, Unseekables keys) + { + if (list.isEmpty()) return Collections.emptyMap(); + Map> kc = Maps.newHashMapWithExpectedSize(keys.size()); + for (RoutingKey key : keys) + kc.put(key, list); + return kc; + } + + protected static Map> rangeConflicts(List list, Ranges ranges) + { + if (list.isEmpty()) return Collections.emptyMap(); + Map> kc = Maps.newHashMapWithExpectedSize(ranges.size()); + for (Range range : ranges) + kc.put(range, list); + return kc; + } + + protected static void assertDepsMessage(SimulatedAccordCommandStore instance, + DepsMessage messageType, + Txn txn, FullRoute route, + DepsModel model) throws ExecutionException, InterruptedException + { + TxnId id = assertDepsMessage(instance, messageType, txn, route, + model.keyConflicts(txn.keys()), + model.rangeConflicts(txn.keys())); + model.register(id, txn); + } + + protected static TxnId assertDepsMessage(SimulatedAccordCommandStore instance, + DepsMessage messageType, + Txn txn, FullRoute route, + Map> keyConflicts) throws ExecutionException, InterruptedException + { + return assertDepsMessage(instance, messageType, txn, route, keyConflicts, Collections.emptyMap()); + } + + protected static TxnId assertDepsMessage(SimulatedAccordCommandStore instance, + DepsMessage messageType, + Txn txn, FullRoute route, + Map> keyConflicts, + Map> rangeConflicts) throws ExecutionException, InterruptedException + { + var pair = assertDepsMessageAsync(instance, messageType, txn, route, keyConflicts, rangeConflicts); + instance.processAll(); + AsyncChains.getBlocking(pair.right); + + return pair.left; + } + + protected static Pair> assertDepsMessageAsync(SimulatedAccordCommandStore instance, + DepsMessage messageType, + Txn txn, FullRoute route, + Map> keyConflicts, + Map> rangeConflicts) + { + switch (messageType) + { + case PreAccept: + return assertPreAcceptAsync(instance, txn, route, keyConflicts, rangeConflicts); + case BeginRecovery: + return assertBeginRecoveryAsync(instance, txn, route, keyConflicts, rangeConflicts); + case PreAcceptThenBeginRecovery: + return assertBeginRecoveryAfterPreAcceptAsync(instance, txn, route, keyConflicts, rangeConflicts); + default: + throw new IllegalArgumentException("Unknown message type: " + messageType); + } + } + + protected static Pair> assertPreAcceptAsync(SimulatedAccordCommandStore instance, + Txn txn, FullRoute route, + Map> keyConflicts, + Map> rangeConflicts) + { + Map> cloneKeyConflicts = keyConflicts.entrySet().stream() + .filter(e -> !e.getValue().isEmpty()) + .collect(Collectors.toMap(e -> e.getKey(), e -> new ArrayList(e.getValue()))); + Map> cloneRangeConflicts = rangeConflicts.entrySet().stream() + .filter(e -> !e.getValue().isEmpty()) + .collect(Collectors.toMap(e -> e.getKey(), e -> new ArrayList(e.getValue()))); + var pair = instance.enqueuePreAccept(txn, route); + return Pair.create(pair.left, pair.right.map(success -> { + assertDeps(success.txnId, success.deps, cloneKeyConflicts, cloneRangeConflicts); + return null; + }).beginAsResult()); + } + + protected static Pair> assertBeginRecoveryAsync(SimulatedAccordCommandStore instance, + Txn txn, FullRoute route, + Map> keyConflicts, + Map> rangeConflicts) + { + Map> cloneKeyConflicts = keyConflicts.entrySet().stream() + .filter(e -> !e.getValue().isEmpty()) + .collect(Collectors.toMap(e -> e.getKey(), e -> new ArrayList(e.getValue()))); + Map> cloneRangeConflicts = rangeConflicts.entrySet().stream() + .filter(e -> !e.getValue().isEmpty()) + .collect(Collectors.toMap(e -> e.getKey(), e -> new ArrayList(e.getValue()))); + var pair = instance.enqueueBeginRecovery(txn, route); + return Pair.create(pair.left, pair.right.map(success -> { + Deps proposeDeps = LatestDeps.mergeProposal(Collections.singletonList(success), ok -> ok.deps); + assertDeps(success.txnId, proposeDeps, cloneKeyConflicts, cloneRangeConflicts); + return null; + }).beginAsResult()); + } + + protected static Pair> assertBeginRecoveryAfterPreAcceptAsync(SimulatedAccordCommandStore instance, + Txn txn, FullRoute route, + Map> keyConflicts, + Map> rangeConflicts) + { + Map> cloneKeyConflicts = keyConflicts.entrySet().stream() + .filter(e -> !e.getValue().isEmpty()) + .collect(Collectors.toMap(e -> e.getKey(), e -> new ArrayList(e.getValue()))); + Map> cloneRangeConflicts = rangeConflicts.entrySet().stream() + .filter(e -> !e.getValue().isEmpty()) + .collect(Collectors.toMap(e -> e.getKey(), e -> new ArrayList(e.getValue()))); + + TxnId txnId = instance.nextTxnId(txn.kind(), txn.keys().domain()); + PreAccept preAccept = new PreAccept(nodeId, new Topologies.Single(SizeOfIntersectionSorter.SUPPLIER, instance.topology), txnId, txn, null, false, route); + + var preAcceptAsync = instance.processAsync(preAccept, safe -> { + var reply = preAccept.apply(safe); + Assertions.assertThat(reply.isOk()).isTrue(); + PreAccept.PreAcceptOk success = (PreAccept.PreAcceptOk) reply; + assertDeps(success.txnId, success.deps, cloneKeyConflicts, cloneRangeConflicts); + return success; + }); + var delay = preAcceptAsync.flatMap(ignore -> AsyncChains.ofCallable(instance.unorderedScheduled, () -> { + Ballot ballot = Ballot.fromValues(instance.storeService.epoch(), instance.storeService.now(), nodeId); + return new BeginRecovery(nodeId, new Topologies.Single(SizeOfIntersectionSorter.SUPPLIER, instance.topology), txnId, null, txn, route, ballot); + })); + var recoverAsync = delay.flatMap(br -> instance.processAsync(br, safe -> { + var reply = br.apply(safe); + Assertions.assertThat(reply.kind() == BeginRecovery.RecoverReply.Kind.Ok).isTrue(); + BeginRecovery.RecoverOk success = (BeginRecovery.RecoverOk) reply; + Deps proposeDeps = LatestDeps.mergeProposal(Collections.singletonList(success), ok -> ok.deps); + assertDeps(success.txnId, proposeDeps, cloneKeyConflicts, cloneRangeConflicts); + return success; + })); + + return Pair.create(txnId, recoverAsync.beginAsResult()); + } + + protected static void assertDeps(TxnId txnId, Deps deps, + Map> keyConflicts, + Map> rangeConflicts) + { + if (rangeConflicts.isEmpty()) + { + Assertions.assertThat(deps.rangeDeps.isEmpty()).describedAs("Txn %s rangeDeps was not empty; %s", txnId, deps.rangeDeps).isTrue(); + } + else + { + List actualRanges = IntStream.range(0, deps.rangeDeps.rangeCount()).mapToObj(deps.rangeDeps::range).collect(Collectors.toList()); + Assertions.assertThat(Ranges.of(actualRanges.toArray(Range[]::new))) + .describedAs("Txn %s had different ranges than expected", txnId) + .isEqualTo(Ranges.of(rangeConflicts.keySet().toArray(Range[]::new))); + AssertionError errors = null; + for (int i = 0; i < rangeConflicts.size(); i++) + { + try + { + var range = deps.rangeDeps.range(i); + Assertions.assertThat(rangeConflicts).describedAs("Txn %s had an unexpected range", txnId).containsKey(range); + var conflict = deps.rangeDeps.txnIdsWithFlagsForRangeIndex(i); + List expectedConflict = rangeConflicts.get(range); + Assertions.assertThat(conflict).describedAs("Txn %s Expected range %s to have different conflicting txns", txnId, range).isEqualTo(expectedConflict); + } + catch (AssertionError e) + { + if (errors == null) + errors = e; + else + errors.addSuppressed(e); + } + } + if (errors != null) + throw errors; + } + if (keyConflicts.isEmpty()) + { + Assertions.assertThat(deps.keyDeps.isEmpty()).describedAs("Txn %s keyDeps was not empty", txnId).isTrue(); + } + else + { + Assertions.assertThat(deps.keyDeps.keys()).describedAs("Txn %s Keys", txnId).isEqualTo(RoutingKeys.of(keyConflicts.keySet())); + for (var key : keyConflicts.keySet()) + Assertions.assertThat(deps.keyDeps.txnIdsWithFlags(key)).describedAs("Txn %s for key %s", txnId, key).isEqualTo(keyConflicts.get(key)); + } + } + + protected static Gen>> randomTxn(TableMetadata tbl, Gen domainGen, Gen.LongGen tokenGen) + { + Invariants.require(tbl == reverseTokenTbl); + Invariants.requireArgument(tbl.partitioner == Murmur3Partitioner.instance, "Only murmur partitioner is supported; given %s", tbl.partitioner.getClass()); + Gen keyGen = rs -> new PartitionKey(tbl.id, tbl.partitioner.decorateKey(Murmur3Partitioner.LongToken.keyForToken(tokenGen.nextLong(rs)))); + Gen rangeGen = rs -> { + long a = tokenGen.nextLong(rs); + long b = tokenGen.nextLong(rs); + while (a == b) + b = tokenGen.nextLong(rs); + if (a > b) + { + long tmp = a; + a = b; + b = tmp; + } + return tokenRange(tbl.id, tbl.partitioner, a, b); + }; + return rs -> { + Routable.Domain domain = domainGen.next(rs); + switch (domain) + { + case Key: + { + Keys keys = Keys.of(Gens.lists(keyGen).unique().ofSizeBetween(1, 5).next(rs)); + List inserts = new ArrayList<>(keys.size()); + List binds = new ArrayList<>(keys.size()); + for (int i = 0; i < keys.size(); i++) + { + inserts.add(String.format("INSERT INTO %s (pk) VALUES (?)", tbl)); + binds.add(((PartitionKey) keys.get(i)).partitionKey().getKey()); + } + Txn txn = createTxn(wrapInTxn(inserts), binds); + FullRoute route = keys.toRoute(keys.get(0).toUnseekable()); + return Pair.create(txn, route); + } + case Range: + { + Ranges ranges = Ranges.of(Gens.arrays(Range.class, rangeGen).unique().ofSizeBetween(1, 5).next(rs)); + Txn txn = createTxn(Txn.Kind.ExclusiveSyncPoint, ranges); + FullRangeRoute route = ranges.toRoute(ranges.get(0).end()); + return Pair.create(txn, route); + } + default: + throw new UnsupportedOperationException(domain.name()); + } + }; + } + + public static class DepsModel + { + private final Map> keyConflicts = new HashMap<>(); + private final RangeTree rangeConflicts = RTree.create(RangeTreeRangeAccessor.instance); + private final Ranges storeRanges; + + public DepsModel(Ranges storeRanges) + { + this.storeRanges = storeRanges; + } + + public Map> keyConflicts(Seekables keysOrRanges) + { + keysOrRanges = keysOrRanges.slice(storeRanges, Routables.Slice.Minimal); + switch (keysOrRanges.domain()) + { + case Key: + { + Keys keys = (Keys) keysOrRanges; + Map> expectedConflicts = new HashMap<>(); + keys.forEach(k -> expectedConflicts.put(k.toUnseekable(), keyConflicts.getOrDefault(k.toUnseekable(), Collections.emptyList()))); + return expectedConflicts; + } + case Range: + { + Ranges ranges = (Ranges) keysOrRanges; + return keyConflicts.entrySet().stream() + .filter(e -> ranges.contains(e.getKey())) + .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); + } + + default: + throw new UnsupportedOperationException(); + } + } + + public Map> rangeConflicts(Seekables keysOrRanges) + { + // there is a patch pending to add range support for keys... that isn't here yet so not handled + if (keysOrRanges.domain() != Routable.Domain.Range) + return Collections.emptyMap(); + keysOrRanges = keysOrRanges.slice(storeRanges, Routables.Slice.Minimal); + + Ranges ranges = (Ranges) keysOrRanges; + Map> conflicts = new HashMap<>(); + ranges.forEach(r -> rangeConflicts.search(r, e -> { + for (Range range : Ranges.single(e.getKey()).slice(ranges, Routables.Slice.Minimal)) + conflicts.computeIfAbsent(range, ignore -> new ArrayList<>()).add(e.getValue()); + })); + // need to dedup/sort txns + conflicts.values().forEach(l -> { + var sortedDedup = new ArrayList<>(new TreeSet<>(l)); + l.clear(); + l.addAll(sortedDedup); + }); + return conflicts; + } + + public void register(TxnId txnId, Txn txn) + { + for (var s : txn.keys()) + { + switch (s.domain()) + { + case Key: + keyConflicts.computeIfAbsent(s.asKey().toUnseekable(), i -> new ArrayList<>()).add(txnId); + break; + case Range: + rangeConflicts.add(s.asRange(), txnId); + break; + default: + throw new UnsupportedOperationException(); + } + } + } + } +} diff --git a/test/unit/org/apache/cassandra/service/accord/SimulatedAccordTaskTest.java b/test/unit/org/apache/cassandra/service/accord/SimulatedAccordTaskTest.java new file mode 100644 index 000000000000..035df8d1e231 --- /dev/null +++ b/test/unit/org/apache/cassandra/service/accord/SimulatedAccordTaskTest.java @@ -0,0 +1,258 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import java.util.concurrent.TimeUnit; +import java.util.concurrent.locks.LockSupport; +import java.util.function.BiConsumer; +import java.util.function.BiFunction; +import java.util.function.BooleanSupplier; +import java.util.function.LongSupplier; +import java.util.function.Supplier; + +import org.junit.Before; +import org.junit.Test; + +import accord.api.RoutingKey; +import accord.impl.basic.SimulatedFault; +import accord.local.PreLoadContext; +import accord.local.SafeCommandStore; +import accord.messages.PreAccept; +import accord.primitives.FullRoute; +import accord.primitives.Range; +import accord.primitives.Ranges; +import accord.primitives.RoutingKeys; +import accord.primitives.Txn; +import accord.primitives.TxnId; +import accord.primitives.Unseekables; +import accord.utils.Gen; +import accord.utils.Gens; +import accord.utils.RandomSource; +import org.apache.cassandra.dht.Murmur3Partitioner; +import org.apache.cassandra.dht.Murmur3Partitioner.LongToken; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.service.accord.SimulatedAccordCommandStore.FunctionWrapper; +import org.apache.cassandra.service.accord.api.TokenKey; +import org.apache.cassandra.utils.Pair; +import org.assertj.core.api.Assertions; + +import static accord.utils.Property.qt; + +public class SimulatedAccordTaskTest extends SimulatedAccordCommandStoreTestBase +{ + @Before + public void precondition() + { + Assertions.assertThat(intTbl.partitioner).isEqualTo(Murmur3Partitioner.instance); + Assertions.assertThat(reverseTokenTbl.partitioner).isEqualTo(Murmur3Partitioner.instance); + } + + @Test + public void happyPath() + { + qt().withExamples(100).check(rs -> test(rs, 100, reverseTokenTbl, ignore -> Action.SUCCESS, ignore -> 0L)); + } + + @Test + public void fuzz() + { + Gen actionGen = Gens.enums().allWithWeights(Action.class, 10, 1, 1); + Gen.LongGen delaysNanos = Gens.longs().between(0, TimeUnit.MILLISECONDS.toNanos(10)); + qt().withExamples(100).check(rs -> test(rs, 100, reverseTokenTbl, actionGen, delaysNanos)); + } + + enum Operation { Task, PreAccept } + + private static void test(RandomSource rs, int numSamples, TableMetadata tbl, Gen actionGen, Gen.LongGen delaysNanos) throws Exception + { + AccordKeyspace.unsafeClear(); + Gen operationGen = Gens.enums().all(Operation.class); + + int numKeys = rs.nextInt(20, 1000); + long minToken = 0; + long maxToken = numKeys; + + Gen keyGen = Gens.longs().between(minToken + 1, maxToken).map(t -> new TokenKey(tbl.id, new LongToken(t))); + Gen keysGen = Gens.lists(keyGen).unique().ofSizeBetween(1, 10).map(l -> RoutingKeys.of(l)); + Gen rangesGen = Gens.lists(rangeInsideRange(tbl.id, minToken, maxToken)).uniqueBestEffort().ofSizeBetween(1, 10).map(l -> Ranges.of(l.toArray(Range[]::new))); + Gen> unseekablesGen = Gens.oneOf(keysGen, rangesGen); + Gen>> txnGen = randomTxn(tbl, mixedDomainGen.next(rs), mixedTokenGen.next(rs)); + + try (var instance = new SimulatedAccordCommandStore(tbl.id, rs, new SimulatedLoadFunctionWrapper(actionGen.asSupplier(rs), delaysNanos.asLongSupplier(rs)))) + { + instance.ignoreExceptions = t -> t instanceof SimulatedFault; + Counter counter = new Counter(); + for (int i = 0; i < numSamples; i++) + { + Operation op = operationGen.next(rs); + switch (op) + { + case Task: + { + PreLoadContext ctx = PreLoadContext.contextFor(unseekablesGen.next(rs)); + instance.maybeCacheEvict(ctx.keys()); + operation(instance, ctx, actionGen.next(rs), rs::nextBoolean).chain().begin(counter); + } + break; + case PreAccept: + { + Pair> txnWithRoute = txnGen.next(rs); + Txn txn = txnWithRoute.left; + Action action = actionGen.next(rs); + TxnId txnId = instance.nextTxnId(txn.kind(), txn.keys().domain()); + FullRoute route = txnWithRoute.right; + PreAccept preAccept = new PreAccept(nodeId, instance.topologies, txnId, txn, null, false, route) { + @Override + public PreAcceptReply apply(SafeCommandStore safeStore) + { + PreAcceptReply result = super.apply(safeStore); + if (action == Action.FAILURE) + throw new SimulatedFault("PreAccept failed for keys " + keys()); + return result; + } + }; + instance.maybeCacheEvict(txn.keys().toParticipants()); + instance.processAsync(preAccept).begin(counter); + } + break; + default: + throw new UnsupportedOperationException(op.name()); + } + } + instance.processAll(); + Assertions.assertThat(counter.counter).isEqualTo(numSamples); + instance.commandStore.cachesUnsafe().commands().forEach(e -> { + Assertions.assertThat(e.references()).isEqualTo(0); + }); + instance.commandStore.cachesUnsafe().commandsForKeys().forEach(e -> { + Assertions.assertThat(e.references()).isEqualTo(0); + }); + } + } + + private static Gen rangeInsideRange(TableId tableId, long minToken, long maxToken) + { + if (minToken + 1 == maxToken) + { + // only one range is possible... + return Gens.constant(range(tableId, minToken, maxToken)); + } + return rs -> { + long a = rs.nextLong(minToken, maxToken + 1); + long b = rs.nextLong(minToken, maxToken + 1); + while (a == b) + b = rs.nextLong(minToken, maxToken + 1); + if (a > b) + { + long tmp = a; + a = b; + b = tmp; + } + return range(tableId, a, b); + }; + } + + private static TokenRange range(TableId tableId, long start, long end) + { + return TokenRange.create(new TokenKey(tableId, new LongToken(start)), new TokenKey(tableId, new LongToken(end))); + } + + private enum Action { SUCCESS, FAILURE, LOAD_FAILURE } + + private static AccordTask operation(SimulatedAccordCommandStore instance, PreLoadContext ctx, Action action, BooleanSupplier delay) + { + return new SimulatedOperation(instance.commandStore, ctx, action == Action.FAILURE ? SimulatedOperation.Action.FAILURE : SimulatedOperation.Action.SUCCESS); + } + + private static class Counter implements BiConsumer + { + int counter = 0; + + @Override + public void accept(Object o, Throwable failure) + { + counter++; + if (failure != null && !(failure instanceof SimulatedFault)) + throw new AssertionError("Unexpected error", failure); + } + } + + private static class SimulatedOperation extends AccordTask + { + enum Action { SUCCESS, FAILURE} + private final Action action; + + public SimulatedOperation(AccordCommandStore commandStore, PreLoadContext preLoadContext, Action action) + { + super(commandStore, preLoadContext); + this.action = action; + } + + @Override + public Void apply(SafeCommandStore safe) + { + if (action == Action.FAILURE) + throw new SimulatedFault("Operation failed for keys " + keys()); + return null; + } + } + + private static class SimulatedLoadFunctionWrapper implements FunctionWrapper + { + final Supplier actions; + final LongSupplier delayNanos; + + private SimulatedLoadFunctionWrapper(Supplier actions, LongSupplier delayNanos) + { + this.actions = actions; + this.delayNanos = delayNanos; + } + + @Override + public BiFunction wrap(BiFunction f) + { + return new SimulatedLoadFunction<>(f, actions, delayNanos); + } + } + + private static class SimulatedLoadFunction implements BiFunction + { + private final BiFunction load; + private final Supplier actions; + private final LongSupplier delaysNanos; + SimulatedLoadFunction(BiFunction load, Supplier actions, LongSupplier delaysNanos) + { + this.load = load; + this.actions = actions; + this.delaysNanos = delaysNanos; + } + + @Override + public V apply(I1 i1, I2 i2) + { + long delayNanos = delaysNanos.getAsLong(); + if (delayNanos > 0) + LockSupport.parkNanos(delayNanos); + Action action = actions.get(); + if (action == Action.SUCCESS) return load.apply(i1, i2); + throw new SimulatedFault("Failure loading " + i2); + } + } +} diff --git a/test/unit/org/apache/cassandra/service/accord/SimulatedDepsTest.java b/test/unit/org/apache/cassandra/service/accord/SimulatedDepsTest.java new file mode 100644 index 000000000000..d19408b43ed8 --- /dev/null +++ b/test/unit/org/apache/cassandra/service/accord/SimulatedDepsTest.java @@ -0,0 +1,284 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import accord.api.RoutingKey; +import accord.primitives.FullKeyRoute; +import accord.primitives.FullRangeRoute; +import accord.primitives.FullRoute; +import accord.primitives.Keys; +import accord.primitives.Range; +import accord.primitives.Ranges; +import accord.primitives.Txn; +import accord.primitives.TxnId; +import accord.utils.Gen; +import org.apache.cassandra.db.marshal.Int32Type; +import org.apache.cassandra.dht.Murmur3Partitioner; +import org.apache.cassandra.dht.Murmur3Partitioner.LongToken; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.service.accord.api.TokenKey; +import org.apache.cassandra.service.accord.api.PartitionKey; +import org.apache.cassandra.utils.Generators; +import org.junit.Test; + +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import static accord.utils.Property.qt; +import static org.apache.cassandra.service.accord.AccordTestUtils.createTxn; + +public class SimulatedDepsTest extends SimulatedAccordCommandStoreTestBase +{ + @Test + public void keyConflicts() + { + TableMetadata tbl = intTbl; + int numSamples = 100; + + qt().withExamples(10).check(rs -> { + AccordKeyspace.unsafeClear(); + int key = rs.nextInt(); + PartitionKey pk = new PartitionKey(tbl.id, tbl.partitioner.decorateKey(Int32Type.instance.decompose(key))); + Keys keys = Keys.of(pk); + FullKeyRoute route = keys.toRoute(pk.toUnseekable()); + Txn txn = createTxn(wrapInTxn("INSERT INTO " + tbl + "(pk, value) VALUES (?, ?)"), Arrays.asList(key, 42)); + try (var instance = new SimulatedAccordCommandStore(tbl.id, rs)) + { + List conflicts = new ArrayList<>(numSamples); + for (int i = 0; i < numSamples; i++) + { + instance.maybeCacheEvict(route, Ranges.EMPTY); + conflicts.add(assertDepsMessage(instance, rs.pick(DepsMessage.values()), txn, route, keyConflicts(conflicts, route))); + } + } + }); + } + + @Test + public void tokenConflicts() + { + TableMetadata tbl = reverseTokenTbl; + int numSamples = 100; + Gen rawKey = Generators.toGen(Generators.bytes(16, 16)); + + qt().withExamples(10).check(rs -> { + AccordKeyspace.unsafeClear(); + + ByteBuffer key = rawKey.next(rs); + PartitionKey pk = new PartitionKey(tbl.id, tbl.partitioner.decorateKey(key)); + Keys keys = Keys.of(pk); + FullKeyRoute route = keys.toRoute(pk.toUnseekable()); + Txn txn = createTxn(wrapInTxn("INSERT INTO " + tbl + "(pk, value) VALUES (?, ?)"), Arrays.asList(key, 42)); + + ByteBuffer tokenConflictKey = Murmur3Partitioner.LongToken.keyForToken((LongToken) Murmur3Partitioner.instance.decorateKey(key).getToken()); + PartitionKey pkTokenConflict = new PartitionKey(tbl.id, tbl.partitioner.decorateKey(tokenConflictKey)); + Keys keysTokenConflict = Keys.of(pkTokenConflict); + FullKeyRoute routeTokenConflict = keysTokenConflict.toRoute(pkTokenConflict.toUnseekable()); + Txn txnTokenConflict = createTxn(wrapInTxn("INSERT INTO " + tbl + "(pk, value) VALUES (?, ?)"), Arrays.asList(tokenConflictKey, 42)); + try (var instance = new SimulatedAccordCommandStore(tbl.id, rs)) + { + List conflicts = new ArrayList<>(numSamples); + for (int i = 0; i < numSamples; i++) + { + instance.maybeCacheEvict(route, Ranges.EMPTY); + conflicts.add(assertDepsMessage(instance, rs.pick(DepsMessage.values()), txn, route, keyConflicts(conflicts, route))); + conflicts.add(assertDepsMessage(instance, rs.pick(DepsMessage.values()), txnTokenConflict, routeTokenConflict, keyConflicts(conflicts, routeTokenConflict))); + } + } + }); + } + + @Test + public void rangePartialKeyMatch() + { + var tbl = reverseTokenTbl; + int numSamples = 250; + int numConflictKeyTxns = 10; + + qt().withExamples(10).check(rs -> { + AccordKeyspace.unsafeClear(); + try (var instance = new SimulatedAccordCommandStore(tbl.id, rs)) + { + long token = rs.nextLong(Long.MIN_VALUE + 1, Long.MAX_VALUE); + Ranges partialRange = Ranges.of(tokenRange(tbl.id, tbl.partitioner, token - 1, token)); + + long outOfRangeToken = token - 10; + if (outOfRangeToken == Long.MIN_VALUE) // if this wraps around that is fine, just can't be min + outOfRangeToken++; + RoutingKey key = new TokenKey(tbl.id, new LongToken(token)); + RoutingKey outOfRangeKey = new TokenKey(tbl.id, new LongToken(outOfRangeToken)); + Txn keyTxn = createTxn(wrapInTxn("INSERT INTO " + tbl + "(pk, value) VALUES (?, ?)", + "INSERT INTO " + tbl + "(pk, value) VALUES (?, ?)"), + Arrays.asList(LongToken.keyForToken(token), 42, + LongToken.keyForToken(outOfRangeToken), 42)); + Keys keys = (Keys) keyTxn.keys(); + FullRoute keyRoute = keys.toRoute(keys.get(0).toUnseekable()); + + Txn conflictingKeyTxn = createTxn(wrapInTxn("INSERT INTO " + tbl + "(pk, value) VALUES (?, ?)"), + Arrays.asList(LongToken.keyForToken(outOfRangeToken), 42)); + Keys conflictingKeys = (Keys) conflictingKeyTxn.keys(); + FullRoute conflictingRoute = conflictingKeys.toRoute(conflictingKeys.get(0).toUnseekable()); + + FullRangeRoute rangeRoute = partialRange.toRoute(key.toUnseekable()); + Txn rangeTxn = createTxn(Txn.Kind.ExclusiveSyncPoint, partialRange); + + List keyConflicts = new ArrayList<>(numSamples); + List outOfRangeKeyConflicts = new ArrayList<>(numSamples); + List rangeConflicts = new ArrayList<>(numSamples); + for (int i = 0; i < numSamples; i++) + { + instance.maybeCacheEvict(((Keys) keyTxn.keys()).toParticipants(), partialRange); + for (int j = 0; j < numConflictKeyTxns; j++) + outOfRangeKeyConflicts.add(assertDepsMessage(instance, rs.pick(DepsMessage.values()), conflictingKeyTxn, conflictingRoute, Map.of(outOfRangeKey, outOfRangeKeyConflicts))); + + TxnId id = assertDepsMessage(instance, rs.pick(DepsMessage.values()), keyTxn, keyRoute, Map.of(key, keyConflicts, outOfRangeKey, outOfRangeKeyConflicts)); + keyConflicts.add(id); + outOfRangeKeyConflicts.add(id); + + rangeConflicts.add(assertDepsMessage(instance, rs.pick(DepsMessage.values()), rangeTxn, rangeRoute, Map.of(key, keyConflicts), rangeConflicts(rangeConflicts, partialRange))); + } + } + }); + } + + @Test + public void simpleRangeConflicts() + { + var tbl = reverseTokenTbl; + Ranges wholeRange = Ranges.of(fullRange(tbl.id, tbl.partitioner)); + int numSamples = 100; + + qt().withExamples(10).check(rs -> { + AccordKeyspace.unsafeClear(); + try (var instance = new SimulatedAccordCommandStore(tbl.id, rs)) + { + long token = rs.nextLong(Long.MIN_VALUE + 1, Long.MAX_VALUE); + ByteBuffer key = LongToken.keyForToken(token); + PartitionKey pk = new PartitionKey(tbl.id, tbl.partitioner.decorateKey(key)); + Keys keys = Keys.of(pk); + FullKeyRoute keyRoute = keys.toRoute(pk.toUnseekable()); + Txn keyTxn = createTxn(wrapInTxn("INSERT INTO " + tbl + "(pk, value) VALUES (?, ?)"), Arrays.asList(key, 42)); + + Ranges partialRange = Ranges.of(tokenRange(tbl.id, tbl.partitioner, token - 1, token)); + boolean useWholeRange = rs.nextBoolean(); + Ranges ranges = useWholeRange ? wholeRange : partialRange; + FullRangeRoute rangeRoute = ranges.toRoute(pk.toUnseekable()); + Txn rangeTxn = createTxn(Txn.Kind.ExclusiveSyncPoint, ranges); + + DepsModel model = new DepsModel(instance.commandStore.unsafeGetRangesForEpoch().currentRanges()); + for (int i = 0; i < numSamples; i++) + { + instance.maybeCacheEvict(keyRoute, ranges); + assertDepsMessage(instance, rs.pick(DepsMessage.values()), keyTxn, keyRoute, model); + assertDepsMessage(instance, rs.pick(DepsMessage.values()), rangeTxn, rangeRoute, model); + } + } + }); + } + + @Test + public void expandingRangeConflicts() + { + var tbl = reverseTokenTbl; + int numSamples = 100; + + qt().withExamples(10).check(rs -> { + AccordKeyspace.unsafeClear(); + try (var instance = new SimulatedAccordCommandStore(tbl.id, rs)) + { + long token = rs.nextLong(Long.MIN_VALUE + numSamples + 1, Long.MAX_VALUE - numSamples); + ByteBuffer key = LongToken.keyForToken(token); + PartitionKey pk = new PartitionKey(tbl.id, tbl.partitioner.decorateKey(key)); + Keys keys = Keys.of(pk); + FullKeyRoute keyRoute = keys.toRoute(pk.toUnseekable()); + Txn keyTxn = createTxn(wrapInTxn("INSERT INTO " + tbl + "(pk, value) VALUES (?, ?)"), Arrays.asList(key, 42)); + + List keyConflicts = new ArrayList<>(numSamples); + Map> rangeConflicts = new HashMap<>(); + for (int i = 0; i < numSamples; i++) + { + Ranges partialRange = Ranges.of(tokenRange(tbl.id, tbl.partitioner, token - i - 1, token + i)); + FullRangeRoute rangeRoute = partialRange.toRoute(pk.toUnseekable()); + Txn rangeTxn = createTxn(Txn.Kind.ExclusiveSyncPoint, partialRange); + try + { + instance.maybeCacheEvict(keyRoute, partialRange); + keyConflicts.add(assertDepsMessage(instance, rs.pick(DepsMessage.values()), keyTxn, keyRoute, keyConflicts(keyConflicts, keyRoute))); + rangeConflicts.put(partialRange.get(0), Collections.singletonList(assertDepsMessage(instance, rs.pick(DepsMessage.values()), rangeTxn, rangeRoute, keyConflicts(keyConflicts, keyRoute), rangeConflicts))); + } + catch (Throwable t) + { + AssertionError error = new AssertionError("Unexpected error: i=" + i + ", token=" + token + ", range=" + partialRange.get(0)); + t.addSuppressed(error); + throw t; + } + } + } + }); + } + + @Test + public void overlappingRangeConflicts() + { + var tbl = reverseTokenTbl; + int numSamples = 100; + + qt().withExamples(10).check(rs -> { + AccordKeyspace.unsafeClear(); + try (var instance = new SimulatedAccordCommandStore(tbl.id, rs)) + { + long token = rs.nextLong(Long.MIN_VALUE + numSamples + 1, Long.MAX_VALUE - numSamples); + ByteBuffer key = LongToken.keyForToken(token); + PartitionKey pk = new PartitionKey(tbl.id, tbl.partitioner.decorateKey(key)); + Keys keys = Keys.of(pk); + FullKeyRoute keyRoute = keys.toRoute(pk.toUnseekable()); + Txn keyTxn = createTxn(wrapInTxn("INSERT INTO " + tbl + "(pk, value) VALUES (?, ?)"), Arrays.asList(key, 42)); + + Range left = tokenRange(tbl.id, tbl.partitioner, token - 10, token + 5); + Range right = tokenRange(tbl.id, tbl.partitioner, token - 5, token + 10); + + DepsModel model = new DepsModel(instance.commandStore.unsafeGetRangesForEpoch().currentRanges()); + for (int i = 0; i < numSamples; i++) + { + Ranges partialRange = Ranges.of(rs.nextBoolean() ? left : right); + try + { + instance.maybeCacheEvict(keyRoute, partialRange); + assertDepsMessage(instance, rs.pick(DepsMessage.values()), keyTxn, keyRoute, model); + + FullRangeRoute rangeRoute = partialRange.toRoute(pk.toUnseekable()); + Txn rangeTxn = createTxn(Txn.Kind.ExclusiveSyncPoint, partialRange); + assertDepsMessage(instance, rs.pick(DepsMessage.values()), rangeTxn, rangeRoute, model); + } + catch (Throwable t) + { + AssertionError error = new AssertionError("Unexpected error: i=" + i + ", token=" + token + ", range=" + partialRange.get(0)); + t.addSuppressed(error); + throw t; + } + } + } + }); + } +} diff --git a/test/unit/org/apache/cassandra/service/accord/SimulatedMultiKeyAndRangeTest.java b/test/unit/org/apache/cassandra/service/accord/SimulatedMultiKeyAndRangeTest.java new file mode 100644 index 000000000000..91fd2cb4d621 --- /dev/null +++ b/test/unit/org/apache/cassandra/service/accord/SimulatedMultiKeyAndRangeTest.java @@ -0,0 +1,136 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.TreeSet; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +import org.junit.Test; + +import accord.api.Key; +import accord.api.RoutingKey; +import accord.primitives.FullRangeRoute; +import accord.primitives.FullRoute; +import accord.primitives.Keys; +import accord.primitives.Range; +import accord.primitives.Ranges; +import accord.primitives.Routable.Domain; +import accord.primitives.Txn; +import accord.utils.Gen; +import accord.utils.Gens; +import org.apache.cassandra.service.accord.api.PartitionKey; + +import static accord.utils.Property.qt; +import static org.apache.cassandra.dht.Murmur3Partitioner.LongToken.keyForToken; +import static org.apache.cassandra.service.accord.AccordTestUtils.createTxn; + +public class SimulatedMultiKeyAndRangeTest extends SimulatedAccordCommandStoreTestBase +{ + @Test + public void test() + { + var tbl = reverseTokenTbl; + int numSamples = 300; + long minToken = 0; + long maxToken = 100; + Gen tokenDistribution = Gens.mixedDistribution(minToken, maxToken + 1); + Gen keyDistribution = Gens.mixedDistribution(1, 5); + Gen rangeDistribution = Gens.mixedDistribution(1, 5); + Gen> domainDistribution = Gens.mixedDistribution(Domain.values()); + Gen> msgDistribution = Gens.mixedDistribution(DepsMessage.values()); + + qt().withExamples(100).check(rs -> { + AccordKeyspace.unsafeClear(); + try (var instance = new SimulatedAccordCommandStore(tbl.id, rs)) + { + Gen.LongGen tokenGen = tokenDistribution.next(rs); + Gen domainGen = domainDistribution.next(rs); + Gen msgGen = msgDistribution.next(rs); + + Gen.IntGen keyCountGen = keyDistribution.next(rs); + Gen.IntGen rangeCountGen = rangeDistribution.next(rs); + + DepsModel model = new DepsModel(instance.commandStore.unsafeGetRangesForEpoch().currentRanges()); + + for (int i = 0; i < numSamples; i++) + { + switch (domainGen.next(rs)) + { + case Key: + { + int numKeys = keyCountGen.nextInt(rs); + TreeSet set = new TreeSet<>(); + while (set.size() != numKeys) + set.add(new PartitionKey(tbl.id, tbl.partitioner.decorateKey(keyForToken(tokenGen.nextLong(rs))))); + Keys keys = Keys.of(set); + List inserts = IntStream.range(0, numKeys).mapToObj(ignore -> "INSERT INTO " + tbl + "(pk, value) VALUES (?, ?)").collect(Collectors.toList()); + List binds = new ArrayList<>(numKeys * 2); + keys.forEach(k -> { + binds.add(((PartitionKey) k.asKey()).partitionKey().getKey()); + binds.add(42); + }); + Txn txn = createTxn(wrapInTxn(inserts), binds); + FullRoute route = keys.toRoute(keys.get(0).toUnseekable()); + + assertDepsMessage(instance, msgGen.next(rs), txn, route, model); + } + break; + case Range: + { + int numRanges = rangeCountGen.nextInt(rs); + Set set = new HashSet<>(); + while (set.size() != numRanges) + { + long token = tokenGen.nextLong(rs); + int offset = rs.nextInt(1, 10); + long start, end; + if (token + offset > maxToken) + { + end = token; + start = end - offset; + } + else + { + start = token; + end = start + offset; + } + set.add(tokenRange(tbl.id, tbl.partitioner, start, end)); + } + // The property ranges.size() == numRanges is not true as this logic will sort + deoverlap + // so if the ranges were overlapped, we could have more or less than numRanges + Ranges ranges = Ranges.of(set.toArray(Range[]::new)); + FullRangeRoute route = ranges.toRoute(ranges.get(0).end()); + Txn txn = createTxn(Txn.Kind.ExclusiveSyncPoint, ranges); + + assertDepsMessage(instance, msgGen.next(rs), txn, route, model); + } + break; + default: + throw new AssertionError(); + } + } + } + }); + } +} diff --git a/test/unit/org/apache/cassandra/service/accord/SimulatedRandomKeysWithRangeConflictTest.java b/test/unit/org/apache/cassandra/service/accord/SimulatedRandomKeysWithRangeConflictTest.java new file mode 100644 index 000000000000..469732f674b6 --- /dev/null +++ b/test/unit/org/apache/cassandra/service/accord/SimulatedRandomKeysWithRangeConflictTest.java @@ -0,0 +1,100 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import accord.api.RoutingKey; +import accord.primitives.FullRangeRoute; +import accord.primitives.FullRoute; +import accord.primitives.Keys; +import accord.primitives.Ranges; +import accord.primitives.RoutingKeys; +import accord.primitives.Txn; +import accord.utils.Property; +import accord.utils.RandomSource; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.utils.FailingConsumer; +import org.junit.Test; + +import java.util.Arrays; + +import static accord.utils.Property.commands; +import static accord.utils.Property.stateful; +import static org.apache.cassandra.dht.Murmur3Partitioner.LongToken.keyForToken; +import static org.apache.cassandra.service.accord.AccordTestUtils.createTxn; + +public class SimulatedRandomKeysWithRangeConflictTest extends SimulatedAccordCommandStoreTestBase +{ + private static Property.SimpleCommand insertKey(RandomSource rs, State state) + { + long token = rs.nextLong(Long.MIN_VALUE + 1, Long.MAX_VALUE); + Txn keyTxn = createTxn(wrapInTxn("INSERT INTO " + state.tbl + "(pk, value) VALUES (?, ?)"), + Arrays.asList(keyForToken(token), 42)); + Keys keys = (Keys) keyTxn.keys(); + FullRoute keyRoute = keys.toRoute(keys.get(0).toUnseekable()); + + return new Property.SimpleCommand<>("Write Txn: " + keys, FailingConsumer.orFail(s -> { + s.instance.maybeCacheEvict(keyRoute, s.wholeRange); + assertDepsMessage(s.instance, rs.pick(DepsMessage.values()), keyTxn, keyRoute, s.model); + })); + } + + private static Property.SimpleCommand insertRange(RandomSource rs, State state) + { + return new Property.SimpleCommand<>("Range Txn: " + state.wholeRange, FailingConsumer.orFail(s -> { + s.instance.maybeCacheEvict(RoutingKeys.EMPTY, s.wholeRange); + assertDepsMessage(s.instance, rs.pick(DepsMessage.values()), s.rangeTxn, s.rangeRoute, s.model); + })); + } + + + @Test + public void keysAllOverConflictingWithRange() + { + stateful().withSteps(State.steps).check(commands(() -> State::new) + .add(SimulatedRandomKeysWithRangeConflictTest::insertKey) + .add(SimulatedRandomKeysWithRangeConflictTest::insertRange) + .build()); + } + + public static class State + { + static final int steps = 300; + final SimulatedAccordCommandStore instance; + + final TableMetadata tbl = reverseTokenTbl; + final Ranges wholeRange = Ranges.of(fullRange(tbl.id, tbl.partitioner)); + final FullRangeRoute rangeRoute = wholeRange.toRoute(wholeRange.get(0).end()); + final Txn rangeTxn = createTxn(Txn.Kind.ExclusiveSyncPoint, wholeRange); + final DepsModel model; + + public State(RandomSource rs) + { + AccordKeyspace.unsafeClear(); + this.instance = new SimulatedAccordCommandStore(tbl.id, rs); + this.instance.commandStore.executor().cacheUnsafe().setShrinkingOn(false); + this.model = new DepsModel(instance.commandStore.unsafeGetRangesForEpoch().currentRanges()); + } + + @Override + public String toString() + { + return "Storage Ranges: " + instance.topology.ranges(); + } + } +} diff --git a/test/unit/org/apache/cassandra/service/accord/WatermarkCollectorTest.java b/test/unit/org/apache/cassandra/service/accord/WatermarkCollectorTest.java new file mode 100644 index 000000000000..06af98f5bcfc --- /dev/null +++ b/test/unit/org/apache/cassandra/service/accord/WatermarkCollectorTest.java @@ -0,0 +1,111 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord; + +import java.util.HashMap; +import java.util.Map; + +import com.google.common.collect.Sets; +import org.junit.Test; + +import accord.local.Node; +import accord.primitives.Range; +import accord.utils.AccordGens; +import accord.utils.Gen; +import accord.utils.Gens; +import accord.utils.Invariants; +import org.agrona.collections.Int2ObjectHashMap; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.dht.IPartitioner; +import org.apache.cassandra.io.Serializers; +import org.apache.cassandra.io.util.DataOutputBuffer; +import org.apache.cassandra.utils.AccordGenerators; + +import static accord.utils.Property.qt; + +public class WatermarkCollectorTest +{ + static + { + DatabaseDescriptor.clientInitialization(); + } + + @Test + public void snapshotSerializer() + { + @SuppressWarnings({ "resource", "IOResourceOpenedButNotSafelyClosed" }) DataOutputBuffer output = new DataOutputBuffer(); + qt().forAll(snapshotGen()).check(expected -> { + maybeUpdatePartitioner(expected); + Serializers.testSerde(output, WatermarkCollector.serializer, expected); + }); + } + + private static void maybeUpdatePartitioner(WatermarkCollector.Snapshot snapshot) + { + for (Range range : Sets.union(snapshot.closed.keySet(), snapshot.retired.keySet())) + { + TokenRange tr = (TokenRange) range; + DatabaseDescriptor.setPartitionerUnsafe(tr.start().token().getPartitioner()); + break; + } + } + + private Gen snapshotGen() + { + Gen partitionerGen = AccordGenerators.partitioner(); + Gen.LongGen epochGen = AccordGens.epochs(); + Gen> syncedGen = syncedGen(); + return rs -> { + IPartitioner partitioner = partitionerGen.next(rs); + Gen rangeGen = AccordGenerators.range(partitioner); + Gen> mapGen = mapGen(Gens.ints().between(0, 10), rangeGen, epochGen); + return new WatermarkCollector.Snapshot(mapGen.next(rs), mapGen.next(rs), syncedGen.next(rs)); + }; + } + + private static Gen> syncedGen() + { + Gen.IntGen sizeGen = Gens.ints().between(0, 10); + Gen idGen = AccordGens.nodes(); + Gen.LongGen epochGen = AccordGens.epochs(); + return rs -> { + Int2ObjectHashMap map = new Int2ObjectHashMap<>(); + Gen uniqueIdGen = idGen.filter(id -> !map.containsKey(id.id)); + for (int i = 0, size = sizeGen.nextInt(rs); i < size; i++) + map.put(uniqueIdGen.next(rs).id, epochGen.next(rs)); + return map; + }; + } + + private static Gen> mapGen(Gen.IntGen sizeGen, Gen keyGen, Gen valueGen) + { + //TODO (ux): should move this to Gens + return rs -> { + int size = sizeGen.nextInt(rs); + Invariants.require(size >= 0, "Only 0 and possitive allowed; given %d", size); + if (size == 0) + return Map.of(); + Map map = new HashMap<>(); + Gen uniqueKeyGen = keyGen.filter(k -> !map.containsKey(k)); + for (int i = 0; i < size; i++) + map.put(uniqueKeyGen.next(rs), valueGen.next(rs)); + return map; + }; + } +} \ No newline at end of file diff --git a/test/unit/org/apache/cassandra/service/accord/api/AccordKeyTest.java b/test/unit/org/apache/cassandra/service/accord/api/AccordKeyTest.java new file mode 100644 index 000000000000..d27c00b1c8ef --- /dev/null +++ b/test/unit/org/apache/cassandra/service/accord/api/AccordKeyTest.java @@ -0,0 +1,121 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.api; + +import java.io.IOException; + +import org.junit.Assert; +import org.junit.BeforeClass; +import org.junit.Test; + +import org.apache.cassandra.SchemaLoader; +import org.apache.cassandra.db.DecoratedKey; +import org.apache.cassandra.dht.IPartitioner; +import org.apache.cassandra.io.Serializers; +import org.apache.cassandra.schema.KeyspaceParams; +import org.apache.cassandra.schema.Schema; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.utils.ByteBufferUtil; + +import static org.apache.cassandra.cql3.statements.schema.CreateTableStatement.parse; + +public class AccordKeyTest +{ + private static final TableId TABLE1 = TableId.fromString("00000000-0000-0000-0000-000000000001"); + private static final TableId TABLE2 = TableId.fromString("00000000-0000-0000-0000-000000000002"); + + @BeforeClass + public static void setupClass() + { + SchemaLoader.prepareServer(); + SchemaLoader.createKeyspace("ks", KeyspaceParams.simple(1), + parse("CREATE TABLE tbl1 (k int, c int, v int, primary key (k, c)) WITH transactional_mode='full'", "ks").id(TABLE1), + parse("CREATE TABLE tbl2 (k int, c int, v int, primary key (k, c)) WITH transactional_mode='full'", "ks").id(TABLE2)); + + } + + public static IPartitioner partitioner(TableId tableId) + { + return Schema.instance.getTableMetadata(tableId).partitioner; + } + + @Test + public void partitionKeyTest() throws IOException + { + DecoratedKey dk = partitioner(TABLE1).decorateKey(ByteBufferUtil.bytes(5)); + PartitionKey pk = new PartitionKey(TABLE1, dk); + Serializers.testSerde(PartitionKey.serializer, pk); + } + + @Test + public void tokenKeyTest() throws IOException + { + DecoratedKey dk = partitioner(TABLE1).decorateKey(ByteBufferUtil.bytes(5)); + TokenKey pk = new TokenKey(TABLE1, dk.getToken()); + Serializers.testSerde(TokenKey.serializer, pk); + } + + @Test + public void comparisonTest() + { + DecoratedKey dk = partitioner(TABLE1).decorateKey(ByteBufferUtil.bytes(5)); + PartitionKey pk = new PartitionKey(TABLE1, dk); + TokenKey tk = new TokenKey(TABLE1, dk.getToken()); + TokenKey tkLow = new TokenKey(TABLE1, dk.getToken().decreaseSlightly()); + TokenKey tkHigh = new TokenKey(TABLE1, dk.getToken().increaseSlightly()); + + Assert.assertTrue(tk.compareTo(pk) > 0); + Assert.assertTrue(tkLow.compareTo(pk) < 0); + Assert.assertTrue(pk.compareTo(tkHigh) < 0); + } + + @Test + public void tableComparisonTest() + { + Assert.assertTrue(TABLE1.compareTo(TABLE2) < 0); + + DecoratedKey dk1 = partitioner(TABLE1).decorateKey(ByteBufferUtil.bytes(5)); + PartitionKey pk1 = new PartitionKey(TABLE1, dk1); + + DecoratedKey dk2 = partitioner(TABLE2).decorateKey(ByteBufferUtil.bytes(5)); + PartitionKey pk2 = new PartitionKey(TABLE2, dk2); + + Assert.assertTrue(pk1.compareTo(pk2) < 0); + } + + @Test + public void sentinelComparisonTest() + { + Assert.assertTrue(TABLE1.compareTo(TABLE2) < 0); + DecoratedKey dk1 = partitioner(TABLE1).decorateKey(ByteBufferUtil.bytes(5)); + PartitionKey pk1 = new PartitionKey(TABLE1, dk1); + + DecoratedKey dk2 = partitioner(TABLE2).decorateKey(ByteBufferUtil.bytes(5)); + PartitionKey pk2 = new PartitionKey(TABLE2, dk2); + + TokenKey loSentinel = TokenKey.min(TABLE1, partitioner(TABLE1)); + TokenKey hiSentinel = TokenKey.max(TABLE1, partitioner(TABLE1)); + Assert.assertTrue(loSentinel.compareTo(hiSentinel) < 0); + Assert.assertTrue(pk1.compareTo(loSentinel) > 0); + Assert.assertTrue(loSentinel.compareTo(pk1) < 0); + Assert.assertTrue(pk1.compareTo(hiSentinel) < 0); + Assert.assertTrue(hiSentinel.compareTo(pk1) > 0); + Assert.assertTrue(hiSentinel.compareTo(pk2) < 0); + } +} diff --git a/test/unit/org/apache/cassandra/service/accord/fastpath/FastPathParsingTest.java b/test/unit/org/apache/cassandra/service/accord/fastpath/FastPathParsingTest.java new file mode 100644 index 000000000000..96b9e87435c0 --- /dev/null +++ b/test/unit/org/apache/cassandra/service/accord/fastpath/FastPathParsingTest.java @@ -0,0 +1,109 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.fastpath; + +import java.util.Arrays; +import java.util.HashMap; +import java.util.Map; +import java.util.StringJoiner; + +import com.google.common.collect.ImmutableMap; +import org.junit.Assert; +import org.junit.Test; + +import org.apache.cassandra.exceptions.ConfigurationException; + +import static java.lang.String.format; + +public class FastPathParsingTest +{ + private static void assertThrows(Runnable runnable, Class exception) + { + try + { + runnable.run(); + } + catch (Throwable e) + { + if (!exception.isAssignableFrom(e.getClass())) + { + throw new AssertionError(format("Expected %s to be thrown, got %s: %s", exception.getName(), e.getClass().getName(), e.getMessage())); + } + return; + } + Assert.fail(format("Expected %s to be thrown", exception.getName())); + } + + private static Map options(String... opts) + { + Assert.assertTrue("Need even numbered array for key value pairs, got " + Arrays.toString(opts), opts.length % 2 == 0); + ImmutableMap.Builder builder = ImmutableMap.builder(); + for (int i=0; i options = new HashMap<>(); + options.put(ParameterizedFastPathStrategy.SIZE, Integer.toString(size)); + if (dcs.length > 0) + { + StringJoiner joiner = new StringJoiner(","); + for (String dc : dcs) + joiner.add(dc); + options.put(ParameterizedFastPathStrategy.DCS, joiner.toString()); + } + + return ParameterizedFastPathStrategy.fromMap(options); + } + + @Test + public void fromString() + { + Assert.assertSame(SimpleFastPathStrategy.instance, FastPathStrategy.tableStrategyFromString("simple")); + } + + @Test + public void fromStringFailures() + { + assertThrows(() -> FastPathStrategy.tableStrategyFromString("something"), ConfigurationException.class); + } + + @Test + public void fromMap() + { + Assert.assertEquals(pfs(3), FastPathStrategy.fromMap(options("size", "3"))); + Assert.assertEquals(SimpleFastPathStrategy.instance, FastPathStrategy.fromMap(options())); + Assert.assertEquals(pfs(1, "dc1"), FastPathStrategy.fromMap(options("size", "1", "dcs", "dc1"))); + Assert.assertEquals(pfs(3, "dc1", "dc2"), FastPathStrategy.fromMap(options("size", "3", "dcs", "dc1,dc2"))); + Assert.assertEquals(pfs(5, "dc2", "dc1"), FastPathStrategy.fromMap(options("size", "5", "dcs", "dc2,dc1"))); + } + + @Test + public void fromMapFailures() + { + assertThrows(() -> FastPathStrategy.fromMap(options("dcs", "dc1")), ConfigurationException.class); + assertThrows(() -> FastPathStrategy.fromMap(options("size", "abc")), ConfigurationException.class); + assertThrows(() -> FastPathStrategy.fromMap(options("size", "0")), ConfigurationException.class); + assertThrows(() -> FastPathStrategy.fromMap(options("size", "-1")), ConfigurationException.class); + assertThrows(() -> FastPathStrategy.fromMap(options("size", "2", "dcs", " ")), ConfigurationException.class); + assertThrows(() -> FastPathStrategy.fromMap(options("size", "5", "dcs", "dc2,dc1", "happypath", "5")), ConfigurationException.class); + } +} diff --git a/test/unit/org/apache/cassandra/service/accord/fastpath/ParameterizedFastPathStrategyTest.java b/test/unit/org/apache/cassandra/service/accord/fastpath/ParameterizedFastPathStrategyTest.java new file mode 100644 index 000000000000..cda86eff8bc1 --- /dev/null +++ b/test/unit/org/apache/cassandra/service/accord/fastpath/ParameterizedFastPathStrategyTest.java @@ -0,0 +1,153 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.fastpath; + +import java.util.HashMap; +import java.util.Map; + +import com.google.common.collect.ImmutableMap; +import org.junit.Assert; +import org.junit.Test; + +import accord.local.Node; +import accord.utils.SortedArrays.SortedArrayList; +import org.apache.cassandra.exceptions.ConfigurationException; +import org.apache.cassandra.service.accord.fastpath.ParameterizedFastPathStrategy.WeightedDc; + +import static java.util.Collections.emptySet; +import static org.apache.cassandra.service.accord.AccordTestUtils.id; +import static org.apache.cassandra.service.accord.AccordTestUtils.idList; +import static org.apache.cassandra.service.accord.AccordTestUtils.idSet; +import static org.apache.cassandra.service.accord.fastpath.FastPathParsingTest.pfs; +import static org.junit.Assert.assertEquals; + +public class ParameterizedFastPathStrategyTest +{ + private static final SortedArrayList NODES = idList(1, 2, 3, 4, 5, 6); + private static final Map DCS_2; + private static final Map DCS_3; + + static + { + ImmutableMap.Builder builder = ImmutableMap.builder(); + builder.put(id(1), "DC1"); + builder.put(id(2), "DC1"); + builder.put(id(3), "DC1"); + builder.put(id(4), "DC2"); + builder.put(id(5), "DC2"); + builder.put(id(6), "DC2"); + DCS_2 = builder.build(); + + builder = ImmutableMap.builder(); + builder.put(id(1), "DC1"); + builder.put(id(2), "DC1"); + builder.put(id(3), "DC2"); + builder.put(id(4), "DC2"); + builder.put(id(5), "DC3"); + builder.put(id(6), "DC3"); + DCS_3 = builder.build(); + } + + @Test + public void noDCPreference() + { + assertEquals(idSet(1, 2, 3, 4, 5, 6), pfs(6).calculateFastPath(NODES, emptySet(), DCS_2)); + assertEquals(idSet(1, 2, 3, 4, 5), pfs(5).calculateFastPath(NODES, emptySet(), DCS_2)); + assertEquals(idSet(1, 2, 3, 4), pfs(4).calculateFastPath(NODES, emptySet(), DCS_2)); + assertEquals(idSet(1, 2, 3, 4), pfs(3).calculateFastPath(NODES, emptySet(), DCS_2)); + } + + @Test + public void noDCPreferenceUnavailables() + { + assertEquals(idSet(1, 2, 3, 4, 5, 6), pfs(6).calculateFastPath(NODES, idSet(4), DCS_2)); + assertEquals(idSet(1, 2, 3, 4, 5), pfs(5).calculateFastPath(NODES, idSet(1, 6), DCS_2)); + assertEquals(idSet(2, 3, 4, 5), pfs(4).calculateFastPath(NODES, idSet(1, 6), DCS_2)); + } + + + @Test + public void dcPreference() + { + assertEquals(idSet(1, 2, 3, 4, 5, 6), pfs(6, "DC1", "DC2").calculateFastPath(NODES, idSet(), DCS_3)); + assertEquals(idSet(1, 2, 3, 4), pfs(4, "DC1", "DC2").calculateFastPath(NODES, idSet(), DCS_3)); + assertEquals(idSet(1, 2, 5, 6), pfs(4, "DC1", "DC3").calculateFastPath(NODES, idSet(), DCS_3)); + } + + @Test + public void dcPreferenceUnavailables() + { + assertEquals(idSet(1, 2, 3, 4, 5), pfs(5, "DC1", "DC2").calculateFastPath(NODES, idSet(2, 4, 6), DCS_3)); + assertEquals(idSet(1, 2, 3, 5, 6), pfs(5, "DC1", "DC3").calculateFastPath(NODES, idSet(2, 4, 6), DCS_3)); + assertEquals(idSet(1, 3, 4, 5, 6), pfs(5, "DC2", "DC3").calculateFastPath(NODES, idSet(2, 4, 6), DCS_3)); + } + + private static WeightedDc wdc(String dc, int weight, boolean auto) + { + return new WeightedDc(dc, weight, auto); + } + + private static void assertCFE(int size, String... dcs) + { + try + { + pfs(size, dcs); + Assert.fail("expected ConfigurationException"); + } + catch (ConfigurationException ex) + { + // expected + } + } + + private static void assertPFS(ParameterizedFastPathStrategy actual, int size, WeightedDc... dcs) + { + Map dcMap = new HashMap<>(); + for (WeightedDc dc : dcs) + { + Assert.assertFalse(dcMap.containsKey(dc.name)); + dcMap.put(dc.name, dc); + } + ParameterizedFastPathStrategy expected = new ParameterizedFastPathStrategy(size, ImmutableMap.copyOf(dcMap)); + Assert.assertEquals(expected, actual); + } + + + @Test + public void dcParsingTest() + { + assertCFE(5, "DC1", "DC2:1"); + assertCFE(5, "DC1:-1", "DC2:1"); + assertCFE(5, "DC1", "DC1"); + } + + @Test + public void listParsingTest() + { + assertPFS(pfs(4, "DC1", "DC2", "DC3"), 4, wdc("DC1", 0, true), wdc("DC2", 1, true), wdc("DC3", 2, true)); + assertPFS(pfs(4, "DC2", "DC3", "DC1"), 4, wdc("DC2", 0, true), wdc("DC3", 1, true), wdc("DC1", 2, true)); + } + + @Test + public void weightParsingTest() + { + assertPFS(pfs(4, "DC1:0", "DC2:0", "DC3:1"), 4, wdc("DC1", 0, false), wdc("DC2", 0, false), wdc("DC3", 1, false)); + assertPFS(pfs(4, "DC2:100", "DC3:200", "DC1:300"), 4, wdc("DC2", 100, false), wdc("DC3", 200, false), wdc("DC1", 300, false)); + } +} diff --git a/test/unit/org/apache/cassandra/service/accord/fastpath/SimpleFastPathStrategyTest.java b/test/unit/org/apache/cassandra/service/accord/fastpath/SimpleFastPathStrategyTest.java new file mode 100644 index 000000000000..f19cac90ee45 --- /dev/null +++ b/test/unit/org/apache/cassandra/service/accord/fastpath/SimpleFastPathStrategyTest.java @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.fastpath; + +import java.util.Collections; +import java.util.Map; + +import org.junit.Assert; +import org.junit.Test; + +import accord.local.Node; + +import static org.apache.cassandra.service.accord.AccordTestUtils.idList; +import static org.apache.cassandra.service.accord.AccordTestUtils.idSet; + +public class SimpleFastPathStrategyTest +{ + private static final Map DCMAP = Collections.emptyMap(); + + @Test + public void testCalculation() + { + FastPathStrategy strategy = SimpleFastPathStrategy.instance; + Assert.assertEquals(idSet(1, 2, 3, 4, 5), strategy.calculateFastPath(idList(1, 2, 3, 4, 5), idSet(), DCMAP)); + Assert.assertEquals(idSet(3, 4, 5), strategy.calculateFastPath(idList(1, 2, 3, 4, 5), idSet(1, 2, 3), DCMAP)); + } +} diff --git a/test/unit/org/apache/cassandra/service/accord/journal/AccordTopologyUpdateTest.java b/test/unit/org/apache/cassandra/service/accord/journal/AccordTopologyUpdateTest.java new file mode 100644 index 000000000000..c9ed1c5af3c3 --- /dev/null +++ b/test/unit/org/apache/cassandra/service/accord/journal/AccordTopologyUpdateTest.java @@ -0,0 +1,175 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.journal; + +import org.junit.Before; +import org.junit.Test; + +import accord.api.Journal; +import accord.local.CommandStores; +import accord.local.Node; +import accord.primitives.Ranges; +import accord.topology.Topology; +import accord.utils.AccordGens; +import accord.utils.Gen; +import accord.utils.Gens; +import org.agrona.collections.Int2ObjectHashMap; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.dht.IPartitioner; +import org.apache.cassandra.dht.Murmur3Partitioner; +import org.apache.cassandra.io.Serializers; +import org.apache.cassandra.io.util.DataOutputBuffer; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.utils.AccordGenerators; + +import static accord.utils.Property.qt; + +public class AccordTopologyUpdateTest +{ + private static final long[] EPOCHS = new long[0]; + private static final Ranges[] RANGES = new Ranges[0]; + private static final TableId TBL1 = TableId.fromRaw(0, 0); + + static + { + DatabaseDescriptor.clientInitialization(); + } + + @Before + public void before() + { + DatabaseDescriptor.setPartitionerUnsafe(Murmur3Partitioner.instance); + } + + @Test + public void rangesForEpoch() + { + @SuppressWarnings({ "resource", "IOResourceOpenedButNotSafelyClosed" }) DataOutputBuffer output = new DataOutputBuffer(); + qt().forAll(rangesForEpochGen()).check(expected -> { + maybeUpdatePartitioner(expected); + Serializers.testSerde(output, AccordTopologyUpdate.RangesForEpochSerializer.instance, expected); + }); + } + + @Test + public void topologyUpdate() + { + @SuppressWarnings({ "resource", "IOResourceOpenedButNotSafelyClosed" }) DataOutputBuffer output = new DataOutputBuffer(); + qt().forAll(topologyUpdateGen()).check(expected -> { + maybeUpdatePartitioner(expected); + Serializers.testSerde(output, AccordTopologyUpdate.TopologyUpdateSerializer.instance, expected); + }); + } + + @Test + public void accordTopologyUpdate() + { + @SuppressWarnings({ "resource", "IOResourceOpenedButNotSafelyClosed" }) DataOutputBuffer output = new DataOutputBuffer(); + qt().forAll(accordTopologyUpdateGen()).check(expected -> { + maybeUpdatePartitioner(expected); + Serializers.testSerde(output, AccordTopologyUpdate.Serializer.instance, expected); + }); + } + + private static Gen rangesForEpochGen() + { + return AccordGenerators.partitioner().flatMap(p -> rangesForEpochGen(AccordGenerators.rangesSplitOrArbitrary(p))); + } + + private static Gen rangesForEpochGen(Gen rangesGen) + { + Gen.IntGen sizeGen = Gens.ints().between(0, 10); + Gen.LongGen epochGen = AccordGens.epochs(); + return rs -> { + int size = sizeGen.nextInt(rs); + if (size == 0) + return new CommandStores.RangesForEpoch(EPOCHS, RANGES); + long epoch = epochGen.nextLong(rs); + long[] epochs = new long[size]; + Ranges[] ranges = new Ranges[size]; + for (int i = 0; i < size; i++) + { + epochs[i] = epoch++; + ranges[i] = rangesGen.next(rs); + } + return new CommandStores.RangesForEpoch(epochs, ranges); + }; + } + + private static Gen topologyUpdateGen() + { + Gen partitionerGen = AccordGenerators.partitioner(); + return rs -> { + IPartitioner partitioner = partitionerGen.next(rs); + Gen rangesGen = AccordGenerators.ranges(TBL1, partitioner); + Gen rangesForEpochGen = rangesForEpochGen(rangesGen); + Topology topology = AccordGenerators.topologyGen(rangesGen).next(rs); + + Int2ObjectHashMap commandStores = new Int2ObjectHashMap<>(); + for (Node.Id node : topology.nodes()) + commandStores.put(node.id, rangesForEpochGen.next(rs)); + + Node.Id self = rs.pick(topology.nodes()); + + return new Journal.TopologyUpdate(commandStores, topology.forNode(self), topology); + }; + } + + private static Gen accordTopologyUpdateGen() + { + Gen.LongGen epochGen = AccordGens.epochs(); + Gen topologyUpdateGen = topologyUpdateGen(); + Gen kindGen = Gens.enums().all(AccordTopologyUpdate.Kind.class); + return rs -> { + AccordTopologyUpdate.Kind kind = kindGen.next(rs); + switch (kind) + { + case NewTopology: return new AccordTopologyUpdate.NewTopology(topologyUpdateGen.next(rs)); + case Topologies: return new AccordTopologyUpdate.TopologyImage(epochGen.nextLong(rs)); + default: throw new AssertionError("Unknown kind: " + kind); + } + }; + } + + private static void maybeUpdatePartitioner(Journal.TopologyUpdate expected) + { + AccordGenerators.maybeUpdatePartitioner(expected.global.ranges()); + } + + private static void maybeUpdatePartitioner(AccordTopologyUpdate expected) + { + if (expected instanceof AccordTopologyUpdate.NewTopology) + { + maybeUpdatePartitioner(((AccordTopologyUpdate.NewTopology) expected).update); + } + } + + private void maybeUpdatePartitioner(CommandStores.RangesForEpoch expected) + { + if (expected.size() > 0) + { + for (int i = 0; i < expected.size(); i++) + { + Ranges ranges = expected.rangesAtIndex(i); + if (AccordGenerators.maybeUpdatePartitioner(ranges)) + return; + } + } + } +} \ No newline at end of file diff --git a/test/unit/org/apache/cassandra/service/accord/repair/RequiredResponseTrackerTest.java b/test/unit/org/apache/cassandra/service/accord/repair/RequiredResponseTrackerTest.java new file mode 100644 index 000000000000..f2900a3595c6 --- /dev/null +++ b/test/unit/org/apache/cassandra/service/accord/repair/RequiredResponseTrackerTest.java @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.repair; + +import java.util.List; +import java.util.Set; + +import com.google.common.collect.ImmutableList; +import org.junit.Assert; +import org.junit.BeforeClass; +import org.junit.Test; + +import accord.api.TopologySorter; +import accord.api.TopologySorter.StaticSorter; +import accord.coordinate.tracking.RequestStatus; +import accord.local.Node; +import accord.topology.Topologies; +import accord.topology.Topology; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.dht.IPartitioner; +import org.apache.cassandra.dht.Murmur3Partitioner; +import org.apache.cassandra.dht.Range; +import org.apache.cassandra.dht.Token; +import org.apache.cassandra.schema.KeyspaceMetadata; +import org.apache.cassandra.schema.KeyspaceParams; +import org.apache.cassandra.schema.Keyspaces; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.schema.Tables; +import org.apache.cassandra.service.accord.AccordTopology; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.tcm.membership.Location; + +import static org.apache.cassandra.cql3.statements.schema.CreateTableStatement.parse; +import static org.apache.cassandra.service.accord.AccordTopologyUtils.*; + +public class RequiredResponseTrackerTest +{ + private static final IPartitioner partitioner = Murmur3Partitioner.instance; + private static TableId tableId = null; + private static KeyspaceMetadata keyspace = null; + private static Topology topology; + private static final Location LOCATION = new Location("DC1", "RACK1"); + + private static final List> RANGES = ImmutableList.of(range(-100, 0), range(0, 100), range(100, -100)); + private static final TopologySorter TOPOLOGY_SORTER = (StaticSorter)(node1, node2, shards) -> node1.compareTo(node2); + + @BeforeClass + public static void beforeClass() throws Throwable + { + DatabaseDescriptor.daemonInitialization(); + DatabaseDescriptor.setPartitionerUnsafe(Murmur3Partitioner.instance); + TableMetadata table = parse("CREATE TABLE tbl (k int, c int, v int, primary key (k, c)) WITH transactional_mode='full'", "ks").build(); + tableId = table.id; + keyspace = KeyspaceMetadata.create("ks", KeyspaceParams.simple(3), Tables.of(table)); + + ClusterMetadata metadata = configureCluster(RANGES, Keyspaces.of(keyspace)); + topology = AccordTopology.createAccordTopology(metadata); + + } + + @Test + public void successCase() + { + Set nodes = topology.nodes(); + Assert.assertEquals(NODE_SET, nodes); + RequiredResponseTracker tracker = new RequiredResponseTracker(nodes, new Topologies.Single(TOPOLOGY_SORTER, topology)); + Assert.assertEquals(RequestStatus.NoChange, tracker.recordSuccess(ID1)); + Assert.assertEquals(RequestStatus.NoChange, tracker.recordSuccess(ID2)); + Assert.assertEquals(RequestStatus.Success, tracker.recordSuccess(ID3)); + } + + @Test + public void failureCase() + { + Set nodes = topology.nodes(); + Assert.assertEquals(NODE_SET, nodes); + RequiredResponseTracker tracker = new RequiredResponseTracker(nodes, new Topologies.Single(TOPOLOGY_SORTER, topology)); + Assert.assertEquals(RequestStatus.NoChange, tracker.recordSuccess(ID1)); + Assert.assertEquals(RequestStatus.Failed, tracker.recordFailure(ID2)); + } +} diff --git a/test/unit/org/apache/cassandra/service/accord/serializers/CheckStatusSerializersTest.java b/test/unit/org/apache/cassandra/service/accord/serializers/CheckStatusSerializersTest.java new file mode 100644 index 000000000000..5a8ac1505b20 --- /dev/null +++ b/test/unit/org/apache/cassandra/service/accord/serializers/CheckStatusSerializersTest.java @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.serializers; + +import java.util.Comparator; +import java.util.List; + +import org.junit.Test; + +import accord.api.RoutingKey; +import accord.primitives.SaveStatus; +import accord.primitives.KnownMap; +import accord.primitives.Ballot; +import accord.primitives.FullKeyRoute; +import accord.primitives.Routable; +import accord.primitives.Unseekables; +import accord.utils.AccordGens; +import accord.utils.Gen; +import accord.utils.Gens; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.dht.Murmur3Partitioner; +import org.apache.cassandra.io.Serializers; +import org.apache.cassandra.io.util.DataOutputBuffer; +import org.apache.cassandra.service.accord.api.TokenKey; +import org.apache.cassandra.utils.AccordGenerators; +import org.apache.cassandra.utils.CassandraGenerators; + +import static accord.utils.Property.qt; +import static org.apache.cassandra.utils.AccordGenerators.fromQT; + +public class CheckStatusSerializersTest +{ + static + { + DatabaseDescriptor.clientInitialization(); + DatabaseDescriptor.setPartitionerUnsafe(Murmur3Partitioner.instance); + } + + @Test + public void serde() + { + DataOutputBuffer buffer = new DataOutputBuffer(); + qt().forAll(foundKnownMap()).check(map -> Serializers.testSerde(buffer, CheckStatusSerializers.knownMap, map)); + } + + private static Gen foundKnownMap() + { + return rs -> { + SaveStatus saveStatus = Gens.pick(SaveStatus.values()).next(rs); + Ballot promised = AccordGens.ballot().next(rs); + Routable.Domain domain = Gens.pick(Routable.Domain.values()).next(rs); + Unseekables keysOrRanges; + switch (domain) + { + case Key: + // TODO (desired): don't hard code murmur + Gen keyGen = AccordGenerators.routingKeyGen(fromQT(CassandraGenerators.TABLE_ID_GEN), Gens.constant(AccordGenerators.RoutingKeyKind.TOKEN), fromQT(CassandraGenerators.murmurToken()), Murmur3Partitioner.instance); + TokenKey homeKey = keyGen.next(rs); + List forOrdering = Gens.lists(keyGen).unique().ofSizeBetween(1, 10).next(rs); + forOrdering.sort(Comparator.naturalOrder()); + // TODO (desired): don't hard code keys type + keysOrRanges = new FullKeyRoute(homeKey, forOrdering.toArray(RoutingKey[]::new)); + break; + case Range: + keysOrRanges = AccordGenerators.ranges(Murmur3Partitioner.instance).next(rs); + break; + default: + throw new AssertionError("Unknown domain"); + } + return KnownMap.create(keysOrRanges, saveStatus); + }; + } +} \ No newline at end of file diff --git a/test/unit/org/apache/cassandra/service/accord/serializers/CommandSerializersTest.java b/test/unit/org/apache/cassandra/service/accord/serializers/CommandSerializersTest.java new file mode 100644 index 000000000000..c4635cdde61b --- /dev/null +++ b/test/unit/org/apache/cassandra/service/accord/serializers/CommandSerializersTest.java @@ -0,0 +1,133 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.serializers; + +import java.io.IOException; +import java.nio.ByteBuffer; + +import org.junit.BeforeClass; +import org.junit.Test; + +import accord.local.Node; +import accord.primitives.PartialTxn; +import accord.primitives.Ranges; +import accord.primitives.Timestamp; +import accord.primitives.Txn; +import accord.primitives.TxnId; +import accord.utils.AccordGens; +import org.apache.cassandra.SchemaLoader; +import org.apache.cassandra.db.marshal.ByteBufferAccessor; +import org.apache.cassandra.io.Serializers; +import org.apache.cassandra.io.util.DataOutputBuffer; +import org.apache.cassandra.schema.KeyspaceParams; +import org.apache.cassandra.service.accord.AccordTestUtils; +import org.apache.cassandra.service.accord.TokenRange; +import org.apache.cassandra.service.accord.api.PartitionKey; +import org.apache.cassandra.utils.FastByteOperations; +import org.assertj.core.api.Assertions; + +import static accord.utils.Property.qt; +import static org.apache.cassandra.config.DatabaseDescriptor.getPartitioner; +import static org.apache.cassandra.cql3.statements.schema.CreateTableStatement.parse; + +public class CommandSerializersTest +{ + @BeforeClass + public static void setupClass() + { + SchemaLoader.prepareServer(); + SchemaLoader.createKeyspace("ks", KeyspaceParams.simple(1), + parse("CREATE TABLE tbl (k int, c int, v int, primary key (k, c)) WITH transactional_mode='full'", "ks")); + + } + + @Test + public void txnSerializer() throws IOException + { + Txn txn = AccordTestUtils.createTxn("BEGIN TRANSACTION\n" + + " LET row1 = (SELECT * FROM ks.tbl WHERE k=0 AND c=0);\n" + + " SELECT row1.v;\n" + + " IF row1 IS NULL THEN\n" + + " INSERT INTO ks.tbl (k, c, v) VALUES (0, 0, 1);\n" + + " END IF\n" + + "COMMIT TRANSACTION"); + PartitionKey key = (PartitionKey) txn.keys().get(0); + PartialTxn expected = txn.slice(Ranges.of(TokenRange.fullRange(key.table(), getPartitioner())), true); + Serializers.testSerde(CommandSerializers.partialTxn, expected, Version.LATEST); + } + + @Test + public void txnIdSerde() + { + DataOutputBuffer output = new DataOutputBuffer(); + qt().forAll(AccordGens.txnIds()).check(txnId -> { + Serializers.testSerde(output, CommandSerializers.txnId, txnId); + ByteBuffer tmp = output.buffer(); + tmp.clear(); + CommandSerializers.txnId.serialize(txnId, tmp); + tmp.flip(); + TxnId rt = CommandSerializers.txnId.deserialize(tmp); + Assertions.assertThat(rt).isEqualTo(txnId); + }); + } + + @Test + public void txnIdComparable() + { + qt().forAll(AccordGens.txnIds(), AccordGens.txnIds()).check(CommandSerializersTest::testComparable); + qt().forAll(AccordGens.txnIds()).check((a) -> { + ByteBuffer abb = ByteBuffer.allocate((int) CommandSerializers.txnId.serializedSize(a)); + CommandSerializers.txnId.serializeComparable(a, abb, ByteBufferAccessor.instance, 0); + if (a.epoch() < Timestamp.MAX_EPOCH) + testComparable(a, TxnId.fromValues(a.epoch() + 1, a.hlc(), a.flags(), a.node)); + if (a.epoch() > 0) + testComparable(a, TxnId.fromValues(a.epoch() - 1, a.hlc(), a.flags(), a.node)); + if (a.hlc() < Timestamp.MAX.hlc()) + testComparable(a, TxnId.fromValues(a.epoch(), a.hlc() + 1, a.flags(), a.node)); + if (a.hlc() > 0) + testComparable(a, TxnId.fromValues(a.epoch(), a.hlc() - 1, a.flags(), a.node)); + if (a.flags() < Timestamp.MAX.flags()) + testComparable(a, TxnId.fromValues(a.epoch(), a.hlc(), a.flags() + 1, a.node)); + if (a.flags() != 0) + testComparable(a, TxnId.fromValues(a.epoch(), a.hlc(), a.flags() - 1, a.node)); + if (a.node.id > 0) + testComparable(a, TxnId.fromValues(a.epoch(), a.hlc(), a.flags(), new Node.Id(a.node.id - 1))); + if (a.node.id < Integer.MAX_VALUE) + testComparable(a, TxnId.fromValues(a.epoch(), a.hlc(), a.flags(), new Node.Id(a.node.id + 1))); + }); + } + + private static void testComparable(TxnId a, TxnId b) + { + ByteBuffer abb = ByteBuffer.allocate((int) CommandSerializers.txnId.serializedSize(a)); + CommandSerializers.txnId.serializeComparable(a, abb, ByteBufferAccessor.instance, 0); + TxnId art = CommandSerializers.txnId.deserializeComparable(abb, ByteBufferAccessor.instance, 0); + Assertions.assertThat(art).isEqualTo(a); + testComparable(abb, a, b); + } + + private static void testComparable(ByteBuffer abb, TxnId a, TxnId b) + { + ByteBuffer bbb = ByteBuffer.allocate((int) CommandSerializers.txnId.serializedSize(b)); + CommandSerializers.txnId.serializeComparable(b, bbb, ByteBufferAccessor.instance, 0); + Assertions.assertThat(FastByteOperations.compareUnsigned(abb, bbb)).isEqualTo(a.compareTo(b)); + TxnId brt = CommandSerializers.txnId.deserializeComparable(bbb, ByteBufferAccessor.instance, 0); + Assertions.assertThat(brt).isEqualTo(b); + } +} diff --git a/test/unit/org/apache/cassandra/service/accord/serializers/CommandStoreSerializersTest.java b/test/unit/org/apache/cassandra/service/accord/serializers/CommandStoreSerializersTest.java new file mode 100644 index 000000000000..6b1211c98fa2 --- /dev/null +++ b/test/unit/org/apache/cassandra/service/accord/serializers/CommandStoreSerializersTest.java @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.serializers; + +import org.junit.Test; + +import accord.local.RedundantBefore; +import accord.utils.Gens; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.dht.Murmur3Partitioner; +import org.apache.cassandra.io.Serializers; +import org.apache.cassandra.io.util.DataOutputBuffer; +import org.apache.cassandra.utils.AccordGenerators; + +import static accord.utils.Property.qt; + +public class CommandStoreSerializersTest +{ + static + { + DatabaseDescriptor.clientInitialization(); + DatabaseDescriptor.setPartitionerUnsafe(Murmur3Partitioner.instance); + } + + @Test + public void redundantBeforeEntry() + { + DataOutputBuffer buffer = new DataOutputBuffer(); + qt().forAll(Gens.random(), AccordGenerators.partitioner()).check((rs, partitioner) -> { + DatabaseDescriptor.setPartitionerUnsafe(partitioner); + RedundantBefore.Bounds entry = AccordGenerators.redundantBeforeEntry(partitioner).next(rs); + Serializers.testSerde(buffer, CommandStoreSerializers.redundantBeforeEntry, entry); + }); + } + + @Test + public void redundantBefore() + { + DataOutputBuffer buffer = new DataOutputBuffer(); + qt().forAll(Gens.random(), AccordGenerators.partitioner()).check((rs, partitioner) -> { + DatabaseDescriptor.setPartitionerUnsafe(partitioner); + // serializer doesn't support the empty set, so filter out + RedundantBefore redundantBefore = AccordGenerators.redundantBefore(partitioner).filter(r -> r.size() != 0).next(rs); + Serializers.testSerde(buffer, CommandStoreSerializers.redundantBefore, redundantBefore); + }); + } + +} \ No newline at end of file diff --git a/test/unit/org/apache/cassandra/service/accord/serializers/CommandsForKeySerializerTest.java b/test/unit/org/apache/cassandra/service/accord/serializers/CommandsForKeySerializerTest.java new file mode 100644 index 000000000000..dcea11348d81 --- /dev/null +++ b/test/unit/org/apache/cassandra/service/accord/serializers/CommandsForKeySerializerTest.java @@ -0,0 +1,714 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.serializers; + +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.Comparator; +import java.util.EnumSet; +import java.util.List; +import java.util.Random; +import java.util.Set; +import java.util.TreeSet; +import java.util.concurrent.Callable; +import java.util.concurrent.TimeUnit; +import java.util.function.BooleanSupplier; +import java.util.function.Consumer; +import java.util.function.Function; +import java.util.function.IntSupplier; +import java.util.function.LongUnaryOperator; +import java.util.function.Supplier; +import java.util.stream.Stream; + +import org.apache.commons.lang3.ArrayUtils; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Test; + +import accord.api.Agent; +import accord.api.DataStore; +import accord.api.Journal; +import accord.api.Key; +import accord.api.ProgressLog; +import accord.api.Result; +import accord.api.RoutingKey; +import accord.api.Timeouts; +import accord.impl.AbstractSafeCommandStore; +import accord.impl.DefaultLocalListeners; +import accord.impl.DefaultRemoteListeners; +import accord.local.Command; +import accord.local.CommandStore; +import accord.local.DurableBefore; +import accord.local.ICommand; +import accord.local.Node; +import accord.local.NodeCommandStoreService; +import accord.local.PreLoadContext; +import accord.local.SafeCommand; +import accord.local.SafeCommandStore; +import accord.local.StoreParticipants; +import accord.local.TimeService; +import accord.local.cfk.CommandsForKey; +import accord.local.cfk.CommandsForKey.InternalStatus; +import accord.local.cfk.CommandsForKey.TxnInfo; +import accord.local.cfk.CommandsForKey.Unmanaged; +import accord.local.cfk.SafeCommandsForKey; +import accord.local.cfk.Serialize; +import accord.local.durability.DurabilityService; +import accord.messages.ReplyContext; +import accord.primitives.Ballot; +import accord.primitives.KeyDeps; +import accord.primitives.Known; +import accord.primitives.PartialDeps; +import accord.primitives.PartialTxn; +import accord.primitives.RangeDeps; +import accord.primitives.Ranges; +import accord.primitives.Routable; +import accord.primitives.SaveStatus; +import accord.primitives.Status; +import accord.primitives.Timestamp; +import accord.primitives.Txn; +import accord.primitives.Txn.Kind; +import accord.primitives.TxnId; +import accord.primitives.Unseekables; +import accord.primitives.Writes; +import accord.topology.TopologyManager; +import accord.utils.AccordGens; +import accord.utils.Gen; +import accord.utils.Gens; +import accord.utils.RandomSource; +import accord.utils.SortedArrays; +import accord.utils.UnhandledEnum; +import accord.utils.async.AsyncChain; +import org.agrona.collections.Int2ObjectHashMap; +import org.apache.cassandra.SchemaLoader; +import org.apache.cassandra.dht.Murmur3Partitioner; +import org.apache.cassandra.dht.Token; +import org.apache.cassandra.schema.KeyspaceParams; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.service.StorageService; +import org.apache.cassandra.service.accord.AccordTestUtils; +import org.apache.cassandra.service.accord.api.TokenKey; +import org.apache.cassandra.service.accord.txn.TxnData; +import org.apache.cassandra.service.accord.txn.TxnWrite; +import org.apache.cassandra.simulator.RandomSource.Choices; +import org.apache.cassandra.utils.AccordGenerators; +import org.apache.cassandra.utils.CassandraGenerators; + +import static accord.api.ProtocolModifiers.Toggles.setTransitiveDependenciesAreVisible; +import static accord.local.cfk.CommandsForKey.NO_BOUNDS_INFO; +import static accord.primitives.Known.KnownExecuteAt.ExecuteAtErased; +import static accord.primitives.Known.KnownExecuteAt.ExecuteAtUnknown; +import static accord.primitives.Status.Durability.Majority; +import static accord.primitives.Status.Durability.NotDurable; +import static accord.utils.Property.qt; +import static accord.utils.SortedArrays.Search.FAST; +import static org.apache.cassandra.cql3.statements.schema.CreateTableStatement.parse; +import static org.apache.cassandra.service.accord.AccordTestUtils.createPartialTxn; + +// TODO (required): test statusOverrides +public class CommandsForKeySerializerTest +{ + @BeforeClass + public static void beforeClass() throws Throwable + { + // need to create the accord test table as generating random txn is not currently supported + SchemaLoader.prepareServer(); + SchemaLoader.createKeyspace("ks", KeyspaceParams.simple(1), + parse("CREATE TABLE tbl (k int, c int, v int, primary key (k, c)) WITH transactional_mode='full'", "ks")); + setTransitiveDependenciesAreVisible(Kind.values()); + StorageService.instance.initServer(); + } + + @Before + public void before() throws Throwable + { + CommandsForKey.disableLinearizabilityViolationsReporting(); + } + + @After + public void after() throws Throwable + { + CommandsForKey.enableLinearizabilityViolationsReporting(); + } + + static class Cmd + { + final TxnId txnId; + final SaveStatus saveStatus; + final PartialTxn txn; + final Timestamp executeAt; + final Ballot ballot; + final boolean isDurable; + final List deps = new ArrayList<>(); + final List missing = new ArrayList<>(); + boolean invisible; + + Cmd(TxnId txnId, PartialTxn txn, SaveStatus saveStatus, boolean isDurable, Timestamp executeAt, Ballot ballot) + { + this.txnId = txnId; + this.saveStatus = saveStatus; + this.txn = txn; + this.executeAt = executeAt; + this.ballot = ballot; + this.isDurable = isDurable; + } + + ICommand.Builder builder() + { + ICommand.Builder builder = new ICommand.Builder(txnId); + if (saveStatus.known.isDefinitionKnown()) + builder.partialTxn(txn); + + builder.setParticipants(StoreParticipants.all(txn.keys().toRoute(txn.keys().get(0).someIntersectingRoutingKey(null)))); + builder.durability(isDurable ? Majority : NotDurable); + if (saveStatus.known.deps().hasPreAcceptedOrProposedOrDecidedDeps()) + { + try (KeyDeps.Builder keyBuilder = KeyDeps.builder();) + { + for (TxnId id : deps) + keyBuilder.add(((Key)txn.keys().get(0)).toUnseekable(), id); + builder.partialDeps(new PartialDeps(AccordTestUtils.fullRange(txn), keyBuilder.build(), RangeDeps.NONE)); + } + } + + builder.executeAt(executeAt); + builder.promised(ballot); + builder.acceptedOrCommitted(ballot); + builder.durability(isDurable ? Majority : NotDurable); + if (saveStatus.compareTo(SaveStatus.Stable) >= 0 && !saveStatus.hasBeen(Status.Truncated)) + builder.waitingOn(Command.WaitingOn.empty(txnId.domain())); + + if (saveStatus.known.outcome() == Known.Outcome.Apply) + { + if (txnId.is(Kind.Write)) + builder.writes(new Writes(txnId, executeAt, txn.keys(), new TxnWrite(TableMetadatas.none(), Collections.emptyList(), true))); + builder.result(new TxnData()); + } + return builder; + } + + Command toCommand() + { + switch (saveStatus) + { + default: throw new AssertionError("Unhandled saveStatus: " + saveStatus); + case Uninitialised: + case NotDefined: + return Command.NotDefined.notDefined(builder(), Ballot.ZERO); + case PreAccepted: + case PreAcceptedWithVote: + case PreAcceptedWithDeps: + return Command.PreAccepted.preaccepted(builder(), saveStatus); + case AcceptedInvalidate: + return Command.NotAcceptedWithoutDefinition.notAccepted(builder(), saveStatus); + case AcceptedMedium: + case AcceptedMediumWithDefinition: + case AcceptedMediumWithDefAndVote: + case AcceptedSlow: + case AcceptedSlowWithDefinition: + case AcceptedSlowWithDefAndVote: + case AcceptedInvalidateWithDefinition: + case PreCommittedWithDefinition: + case PreCommittedWithDefAndDeps: + case PreCommittedWithDefAndFixedDeps: + case PreCommittedWithDeps: + case PreCommittedWithFixedDeps: + case PreCommitted: + return Command.Accepted.accepted(builder(), saveStatus); + + case Committed: + return Command.Committed.committed(builder(), saveStatus); + + case Stable: + case ReadyToExecute: + return Command.Committed.committed(builder(), saveStatus); + + case PreApplied: + case Applying: + case Applied: + return Command.Executed.executed(builder(), saveStatus); + + case Invalidated: + return Command.Truncated.invalidated(txnId, builder().participants()); + } + } + + @Override + public String toString() + { + return "Cmd{" + + "txnId=" + txnId + + ", saveStatus=" + saveStatus + + ", txn=" + txn + + ", executeAt=" + executeAt + + ", deps=" + deps + + ", missing=" + missing + + ", invisible=" + invisible + + '}'; + } + } + + static class ObjectGraph + { + final Cmd[] cmds; + ObjectGraph(Cmd[] cmds) + { + this.cmds = cmds; + } + + List toCommands() + { + List commands = new ArrayList<>(cmds.length); + for (int i = 0 ; i < cmds.length ; ++i) + commands.add(cmds[i].toCommand()); + return commands; + } + } + + private static ObjectGraph generateObjectGraph(int txnIdCount, Supplier txnIdSupplier, Supplier saveStatusSupplier, Function txnSupplier, Function timestampSupplier, Supplier ballotSupplier, IntSupplier missingCountSupplier, RandomSource source) + { + Cmd[] cmds = new Cmd[txnIdCount]; + for (int i = 0 ; i < txnIdCount ; ++i) + { + TxnId txnId = txnIdSupplier.get(); + SaveStatus saveStatus = saveStatusSupplier.get(); + Timestamp executeAt = txnId; + if (!txnId.kind().awaitsOnlyDeps() && !saveStatus.known.is(ExecuteAtErased) && !saveStatus.known.is(ExecuteAtUnknown)) + executeAt = timestampSupplier.apply(txnId); + + boolean isDurable = false; + Ballot ballot; + switch (saveStatus.status) + { + default: throw new UnhandledEnum(saveStatus.status); + case NotDefined: + case PreAccepted: + case Invalidated: + case Truncated: + ballot = Ballot.ZERO; + break; + case PreApplied: + case Applied: + isDurable = source.nextBoolean(); + case AcceptedInvalidate: + case AcceptedMedium: + case AcceptedSlow: + case PreCommitted: + case Committed: + case Stable: + ballot = ballotSupplier.get(); + } + + cmds[i] = new Cmd(txnId, txnSupplier.apply(txnId), saveStatus, isDurable, executeAt, ballot); + } + Arrays.sort(cmds, Comparator.comparing(o -> o.txnId)); + for (int i = 0 ; i < txnIdCount ; ++i) + { + if (!cmds[i].saveStatus.known.deps().hasPreAcceptedOrProposedOrDecidedDeps()) + continue; + + Timestamp knownBefore = cmds[i].saveStatus.known.deps().hasCommittedOrDecidedDeps() ? cmds[i].executeAt : cmds[i].txnId; + int limit = SortedArrays.binarySearch(cmds, 0, cmds.length, knownBefore, (a, b) -> a.compareTo(b.txnId), FAST); + if (limit < 0) limit = -1 - limit; + + List deps = cmds[i].deps; + List missing = cmds[i].missing; + for (int j = 0 ; j < limit ; ++j) + { + if (i != j && cmds[i].txnId.kind().witnesses(cmds[j].txnId)) + deps.add(cmds[j].txnId); + } + + int missingCount = Math.min(deps.size(), missingCountSupplier.getAsInt()); + while (missingCount > 0) + { + int remove = source.nextInt(deps.size()); + int cmdIndex = SortedArrays.binarySearch(cmds, 0, cmds.length, deps.get(remove), (a, b) -> a.compareTo(b.txnId), FAST); + if (!cmds[cmdIndex].saveStatus.hasBeen(Status.Committed)) + missing.add(deps.get(remove)); + deps.set(remove, deps.get(deps.size() - 1)); + deps.remove(deps.size() - 1); + --missingCount; + } + deps.sort(TxnId::compareTo); + missing.sort(TxnId::compareTo); + } + + outer: for (int i = 0 ; i < cmds.length ; ++i) + { + if (null != InternalStatus.from(cmds[i].saveStatus)) + continue; + + for (int j = 0 ; j < i ; ++j) + { + InternalStatus status = InternalStatus.from(cmds[j].saveStatus); + if (status == null || !status.hasExecuteAtOrDeps()) continue; + if (cmds[j].txnId.kind().witnesses(cmds[i].txnId) && status.depsKnownBefore(cmds[j].txnId, cmds[j].executeAt).compareTo(cmds[i].txnId) > 0 && Collections.binarySearch(cmds[j].missing, cmds[i].txnId) < 0) + continue outer; + } + for (int j = i + 1 ; j < cmds.length ; ++j) + { + InternalStatus status = InternalStatus.from(cmds[j].saveStatus); + if (status == null || !status.hasExecuteAtOrDeps()) continue; + if (cmds[j].txnId.kind().witnesses(cmds[i].txnId) && Collections.binarySearch(cmds[j].missing, cmds[i].txnId) < 0) + continue outer; + } + cmds[i].invisible = true; + for (int j = 0 ; j < i ; ++j) + { + if (cmds[j].executeAt.compareTo(cmds[i].txnId) > 0) + { + int remove = Collections.binarySearch(cmds[j].missing, cmds[i].txnId); + if (remove >= 0) cmds[j].missing.remove(remove); + } + } + for (int j = i + 1 ; j < cmds.length ; ++j) + { + int remove = Collections.binarySearch(cmds[j].missing, cmds[i].txnId); + if (remove >= 0) cmds[j].missing.remove(remove); + } + } + return new ObjectGraph(cmds); + } + + private static Function txnIdSupplier(LongUnaryOperator epochSupplier, LongUnaryOperator hlcSupplier, Supplier kindSupplier, Supplier idSupplier) + { + return min -> new TxnId(epochSupplier.applyAsLong(min == null ? 1 : min.epoch()), hlcSupplier.applyAsLong(min == null ? 1 : min.hlc() + 1), kindSupplier.get(), Routable.Domain.Key, idSupplier.get()); + } + + private static Function timestampSupplier(LongUnaryOperator epochSupplier, LongUnaryOperator hlcSupplier, IntSupplier flagSupplier, Supplier idSupplier) + { + return min -> Timestamp.fromValues(epochSupplier.applyAsLong(min == null ? 1 : min.epoch()), hlcSupplier.applyAsLong(min == null ? 1 : min.hlc() + 1), flagSupplier.getAsInt(), idSupplier.get()); + } + + private static Supplier ballotSupplier(LongUnaryOperator epochSupplier, LongUnaryOperator hlcSupplier, IntSupplier flagSupplier, Supplier idSupplier) + { + return () -> Ballot.fromValues(epochSupplier.applyAsLong(1), hlcSupplier.applyAsLong(1), flagSupplier.getAsInt(), idSupplier.get()); + } + + private static Function timestampSupplier(Set unique, Function supplier) + { + return min -> { + T candidate = supplier.apply(min); + while (!unique.add(candidate)) + { + T next = supplier.apply(min); + if (next.equals(candidate)) min = candidate; + else candidate = next; + } + return candidate; + }; + } + + @Test + public void serde() + { + testOne(629993588068216851L); + Random random = new Random(); + for (int i = 0 ; i < 10000 ; ++i) + { + long seed = random.nextLong(); + testOne(seed); + } + } + + private static void testOne(long seed) + { + try + { + System.out.println(seed); + RandomSource source = RandomSource.wrap(new Random(seed)); + + // TODO (required): produce broader variety of distributions, including executeAt with lower HLC but higher epoch + final LongUnaryOperator epochSupplier; { + long maxEpoch = source.nextLong(1, 10); + epochSupplier = min -> min >= maxEpoch ? min : maxEpoch == 1 ? 1 : source.nextLong(min, maxEpoch); + } + final LongUnaryOperator hlcSupplier; { + long maxHlc = source.nextLong(10, 1000000); + hlcSupplier = min -> min >= maxHlc ? min : source.nextLong(min, maxHlc); + } + final Supplier idSupplier; { + int maxId = source.nextInt(1, 10); + Int2ObjectHashMap lookup = new Int2ObjectHashMap<>(); + idSupplier = () -> lookup.computeIfAbsent(maxId == 1 ? 1 : source.nextInt(1, maxId), Node.Id::new); + } + final IntSupplier flagSupplier = () -> 0; + final Supplier kindSupplier = () -> { + float v = source.nextFloat(); + if (v < 0.5) return Kind.Read; + if (v < 0.97) return Kind.Write; + return Kind.ExclusiveSyncPoint; + }; + + boolean permitMissing = source.decide(0.75f); + final IntSupplier missingCountSupplier; { + if (!permitMissing) + { + missingCountSupplier = () -> 0; + } + else + { + float zeroChance = source.nextFloat(); + int maxMissing = source.nextInt(1, 10); + missingCountSupplier = () -> { + float v = source.nextFloat(); + if (v < zeroChance) return 0; + return source.nextInt(0, maxMissing); + }; + } + } + + Choices saveStatusChoices = Choices.uniform(EnumSet.complementOf(EnumSet.of(SaveStatus.TruncatedApply, SaveStatus.TruncatedUnapplied, SaveStatus.TruncatedApplyWithOutcome)).toArray(SaveStatus[]::new)); + Supplier saveStatusSupplier = () -> { + SaveStatus result = saveStatusChoices.choose(source); + while (result.is(Status.Truncated)) // we don't currently process truncations + result = saveStatusChoices.choose(source); + return result; + }; + + Set uniqueTs = new TreeSet<>(); + final Function txnIdSupplier = timestampSupplier(uniqueTs, txnIdSupplier(epochSupplier, hlcSupplier, kindSupplier, idSupplier)); + boolean permitExecuteAt = source.decide(0.75f); + final Function executeAtSupplier; + { + if (!permitExecuteAt) + { + executeAtSupplier = id -> id; + } + else + { + Function rawTimestampSupplier = timestampSupplier(uniqueTs, timestampSupplier(epochSupplier, hlcSupplier, flagSupplier, idSupplier)); + float useTxnIdChance = source.nextFloat(); + BooleanSupplier useTxnId = () -> source.decide(useTxnIdChance); + executeAtSupplier = txnId -> useTxnId.getAsBoolean() ? txnId : rawTimestampSupplier.apply(txnId); + } + } + + Supplier ballotSupplier; + { + Supplier delegate = ballotSupplier(epochSupplier, hlcSupplier, flagSupplier, idSupplier); + ballotSupplier = () -> source.decide(0.5f) ? Ballot.ZERO : delegate.get(); + } + + PartialTxn txn = createPartialTxn(0); + RoutingKey key = ((Key) txn.keys().get(0)).toUnseekable(); + ObjectGraph graph = generateObjectGraph(source.nextInt(0, 100), () -> txnIdSupplier.apply(null), saveStatusSupplier, ignore -> txn, executeAtSupplier, ballotSupplier, missingCountSupplier, source); + List commands = graph.toCommands(); + CommandsForKey cfk = new CommandsForKey(key); + while (commands.size() > 0) + { + int next = source.nextInt(commands.size()); + Command command = commands.get(next); + cfk = cfk.update(new TestSafeCommandStore(command.txnId()), command).cfk(); + commands.set(next, commands.get(commands.size() - 1)); + commands.remove(commands.size() - 1); + } + + for (int i = 0, j = 0 ; j < graph.cmds.length ; ++j) + { + Cmd cmd = graph.cmds[j]; + if (i >= cfk.size() || !cfk.txnId(i).equals(cmd.txnId)) + { + Assert.assertTrue(cmd.invisible); + continue; + } + TxnInfo info = cfk.get(i); + InternalStatus expectStatus = InternalStatus.from(cmd.saveStatus); + if (expectStatus == InternalStatus.APPLIED_NOT_DURABLE && cmd.isDurable) + expectStatus = InternalStatus.APPLIED_DURABLE; + if (expectStatus == null) expectStatus = InternalStatus.TRANSITIVE_VISIBLE; + if (expectStatus.hasExecuteAt()) + Assert.assertEquals(cmd.executeAt, info.executeAt); + Assert.assertEquals(expectStatus, info.status()); + Assert.assertArrayEquals(cmd.missing.toArray(TxnId[]::new), info.missing()); + if (expectStatus.hasBallot) + Assert.assertEquals(cmd.ballot, info.ballot()); + ++i; + } + + cfk = cfk.updateUniqueHlc(source.nextLong(Long.MAX_VALUE)); + ByteBuffer buffer = Serialize.toBytesWithoutKey(cfk); + CommandsForKey roundTrip = Serialize.fromBytes(key, buffer); + Assert.assertEquals(cfk, roundTrip); + } + catch (Throwable t) + { + throw new AssertionError(seed + " seed failed", t); + } + } + + @Test + public void test() + { + var tableGen = AccordGenerators.fromQT(CassandraGenerators.TABLE_ID_GEN); + var txnIdGen = AccordGens.txnIds((Gen.LongGen) rs -> rs.nextLong(0, 100), rs -> rs.nextLong(100), rs -> rs.nextInt(10)); + qt().check(rs -> { + TableId table = tableGen.next(rs); + TokenKey pk = new TokenKey(table, new Murmur3Partitioner.LongToken(rs.nextLong())); + var redudentBefore = txnIdGen.next(rs); + TxnId[] ids = Gens.arrays(TxnId.class, rs0 -> { + TxnId next = txnIdGen.next(rs0); + while (next.compareTo(redudentBefore) <= 0) + next = txnIdGen.next(rs0); + return next; + }).unique().ofSizeBetween(0, 10).next(rs); + Arrays.sort(ids, Comparator.naturalOrder()); + TxnInfo[] info = new TxnInfo[ids.length]; + InternalStatus[] statuses = Stream.of(InternalStatus.values()).filter(s -> s != InternalStatus.PRUNED).toArray(InternalStatus[]::new); + for (int i = 0; i < info.length; i++) + { + InternalStatus status = rs.pick(statuses); + info[i] = TxnInfo.create(ids[i], status, true, ids[i], TxnId.NO_TXNIDS, Ballot.ZERO); + } + + Gen pendingGen = Gens.enums().allMixedDistribution(Unmanaged.Pending.class).next(rs); + + Unmanaged[] unmanaged = Gens.lists(txnIdGen) + .unique() + .ofSizeBetween(0, 10) + .map((rs0, txnIds) -> txnIds.stream().map(i -> new Unmanaged(pendingGen.next(rs0), i, i)).toArray(Unmanaged[]::new)) + .next(rs); + Arrays.sort(unmanaged, Comparator.naturalOrder()); + if (unmanaged.length > 0) + { + // when registering unmanaged, if the txn is "missing" in TxnInfo we add it + List missing = new ArrayList<>(unmanaged.length); + for (Unmanaged u : unmanaged) + { + int idx = Arrays.binarySearch(ids, u.txnId); + if (idx < 0) + missing.add(TxnInfo.create(u.txnId, InternalStatus.TRANSITIVE, true, u.txnId, Ballot.ZERO)); + } + if (!missing.isEmpty()) + { + info = ArrayUtils.addAll(info, missing.toArray(TxnInfo[]::new)); + Arrays.sort(info, Comparator.naturalOrder()); + } + } + else unmanaged = CommandsForKey.NO_PENDING_UNMANAGED; + + long maxUniqueHlc = rs.nextLong(0, Long.MAX_VALUE); + CommandsForKey expected = CommandsForKey.SerializerSupport.create(pk, info, maxUniqueHlc, unmanaged, TxnId.NONE, NO_BOUNDS_INFO); + + ByteBuffer buffer = Serialize.toBytesWithoutKey(expected); + CommandsForKey roundTrip = Serialize.fromBytes(pk, buffer); + Assert.assertEquals(expected, roundTrip); + }); + } + + @Test + public void thereAndBackAgain() + { + long tokenValue = -2311778975040348869L; + Token token = new Murmur3Partitioner.LongToken(tokenValue); + TokenKey pk = new TokenKey(TableId.fromString("1b255f4d-ef25-40a6-0000-000000000009"), token); + TxnId txnId = TxnId.fromValues(11,34052499,2,1); + CommandsForKey expected = CommandsForKey.SerializerSupport.create(pk, + new TxnInfo[] { TxnInfo.create(txnId, InternalStatus.PREACCEPTED_WITHOUT_DEPS, true, txnId, TxnId.NO_TXNIDS, Ballot.ZERO) }, + 0, CommandsForKey.NO_PENDING_UNMANAGED, TxnId.NONE, NO_BOUNDS_INFO); + + ByteBuffer buffer = Serialize.toBytesWithoutKey(expected); + CommandsForKey roundTrip = Serialize.fromBytes(pk, buffer); + Assert.assertEquals(expected, roundTrip); + } + + static class TestCommandStore extends CommandStore implements Agent + { + static final TestCommandStore INSTANCE = new TestCommandStore(); + protected TestCommandStore() + { + super(0, + null, + null, + null, + ignore -> new ProgressLog.NoOpProgressLog(), + ignore -> new DefaultLocalListeners(new DefaultRemoteListeners((a, b, c, d, e)->{}), DefaultLocalListeners.DefaultNotifySink.INSTANCE), + new EpochUpdateHolder()); + } + + @Override public boolean inStore() { return true; } + @Override public Journal.Loader loader() { throw new UnsupportedOperationException(); } + @Override public Agent agent() { return this; } + @Override public AsyncChain build(PreLoadContext context, Consumer consumer) { return null; } + @Override public AsyncChain build(PreLoadContext context, Function apply) { throw new UnsupportedOperationException(); } + @Override public void shutdown() { } + @Override protected void registerTransitive(SafeCommandStore safeStore, RangeDeps deps){ } + @Override public AsyncChain build(Callable task) { throw new UnsupportedOperationException(); } + @Override public void onRecover(Node node, Result success, Throwable fail) { throw new UnsupportedOperationException(); } + @Override public void onInconsistentTimestamp(Command command, Timestamp prev, Timestamp next) { throw new UnsupportedOperationException(); } + @Override public void onFailedBootstrap(int attempts, String phase, Ranges ranges, Runnable retry, Throwable failure) { throw new UnsupportedOperationException(); } + @Override public void onStale(Timestamp staleSince, Ranges ranges) { throw new UnsupportedOperationException(); } + @Override public void onUncaughtException(Throwable t) { throw new UnsupportedOperationException(); } + @Override public void onCaughtException(Throwable t, String context) { throw new UnsupportedOperationException(); } + @Override public boolean rejectPreAccept(TimeService time, TxnId txnId) { throw new UnsupportedOperationException(); } + @Override public long cfkHlcPruneDelta() { return 0; } + @Override public int cfkPruneInterval() { return 0; } + @Override public long maxConflictsHlcPruneDelta() { return 0; } + @Override public long maxConflictsPruneInterval() { return 0; } + @Override public Txn emptySystemTxn(Kind kind, Routable.Domain domain) { throw new UnsupportedOperationException(); } + @Override public long slowCoordinatorDelay(Node node, SafeCommandStore safeStore, TxnId txnId, TimeUnit units, int retryCount) { return 0; } + @Override public long slowReplicaDelay(Node node, SafeCommandStore safeStore, TxnId txnId, int retryCount, ProgressLog.BlockedUntil blockedUntil, TimeUnit units) { return 0; } + @Override public long slowAwaitDelay(Node node, SafeCommandStore safeStore, TxnId txnId, int retryCount, ProgressLog.BlockedUntil retrying, TimeUnit units) { return 0; } + @Override public long retrySyncPointDelay(Node node, int attempt, TimeUnit units) { return 0; } + @Override public long retryDurabilityDelay(Node node, int attempt, TimeUnit units) { return 0; } + @Override public long expireEpochWait(TimeUnit units) { return 0; } + @Override public long expiresAt(ReplyContext replyContext, TimeUnit unit) { return 0; } + @Override public long selfSlowAt(TxnId txnId, Status.Phase phase, TimeUnit unit) { return 0; } + @Override public long selfExpiresAt(TxnId txnId, Status.Phase phase, TimeUnit unit) { return 0; } + @Override public AsyncChain awaitStaleId(Node node, TxnId staleId, boolean isRequested) { return null; } + @Override public long minStaleHlc(Node node, boolean requested) { return 0; } + } + + public static class TestSafeCommandStore extends AbstractSafeCommandStore + { + public TestSafeCommandStore(PreLoadContext context) + { + super(context, TestCommandStore.INSTANCE); + } + + @Override protected CommandStoreCaches tryGetCaches() { return null; } + @Override protected SafeCommand add(SafeCommand safeCommand, CommandStoreCaches caches) { return null; } + @Override protected SafeCommandsForKey add(SafeCommandsForKey safeCfk, CommandStoreCaches caches) { return null; } + @Override protected SafeCommand getInternal(TxnId txnId) { return null; } + @Override protected SafeCommandsForKey getInternal(RoutingKey key) { return null; } + @Override public DataStore dataStore() { return null; } + @Override public Agent agent() { return null; } + @Override public ProgressLog progressLog() { return null; } + @Override public NodeCommandStoreService node() { return new NodeCommandStoreService() + { + @Override public long epoch() { return 0;} + @Override public Node.Id id() { return Node.Id.NONE; } + @Override public Timeouts timeouts() { return null; } + @Override public DurableBefore durableBefore() { return null;} + @Override public DurabilityService durability() { return null; } + @Override public long uniqueNow(long atLeast) { return 0; } + @Override public TopologyManager topology() { return null; } + @Override public long now() { return 0; } + @Override public long elapsed(TimeUnit unit) { return 0; } + }; } + @Override public boolean visit(Unseekables keysOrRanges, TxnId testTxnId, Kind.Kinds testKind, TestStartedAt testStartedAt, Timestamp testStartAtTimestamp, ComputeIsDep computeIsDep, AllCommandVisitor visit) { return false; } + @Override public void visit(Unseekables keysOrRanges, Timestamp startedBefore, Kind.Kinds testKind, ActiveCommandVisitor visit, P1 p1, P2 p2) { } + } + +} diff --git a/test/unit/org/apache/cassandra/service/accord/serializers/DepsSerializersTest.java b/test/unit/org/apache/cassandra/service/accord/serializers/DepsSerializersTest.java new file mode 100644 index 000000000000..d68558f2c67e --- /dev/null +++ b/test/unit/org/apache/cassandra/service/accord/serializers/DepsSerializersTest.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.serializers; + +import org.junit.Test; + +import accord.primitives.Deps; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.dht.IPartitioner; +import org.apache.cassandra.dht.Murmur3Partitioner; +import org.apache.cassandra.io.Serializers; +import org.apache.cassandra.io.util.DataOutputBuffer; +import org.apache.cassandra.schema.Schema; +import org.apache.cassandra.schema.SchemaProvider; +import org.apache.cassandra.utils.AccordGenerators; +import org.mockito.Mockito; + +import static accord.utils.Property.qt; + +public class DepsSerializersTest +{ + static + { + DatabaseDescriptor.clientInitialization(); + DatabaseDescriptor.setPartitionerUnsafe(Murmur3Partitioner.instance); + } + + @Test + public void serde() + { + DataOutputBuffer buffer = new DataOutputBuffer(); + qt().check(rs -> { + IPartitioner partitioner = AccordGenerators.partitioner().next(rs); + Schema.instance = Mockito.mock(SchemaProvider.class); + DatabaseDescriptor.setPartitionerUnsafe(partitioner); + Mockito.when(Schema.instance.getExistingTablePartitioner(Mockito.any())).thenReturn(partitioner); + Deps deps = AccordGenerators.depsGen(partitioner).next(rs); + Serializers.testSerde(buffer, DepsSerializers.deps, deps); + }); + } +} \ No newline at end of file diff --git a/test/unit/org/apache/cassandra/service/accord/serializers/EncodeAsVInt32Test.java b/test/unit/org/apache/cassandra/service/accord/serializers/EncodeAsVInt32Test.java new file mode 100644 index 000000000000..c2f65f84744f --- /dev/null +++ b/test/unit/org/apache/cassandra/service/accord/serializers/EncodeAsVInt32Test.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.serializers; + +import org.junit.Test; + +import accord.utils.Gen; +import accord.utils.Gens; +import org.apache.cassandra.io.Serializers; +import org.apache.cassandra.io.util.DataOutputBuffer; + +import static accord.utils.Property.qt; + +public class EncodeAsVInt32Test +{ + private static final Gen.IntGen ENUM_RANGE = Gens.ints().between(0, Integer.MAX_VALUE - 1); + + @Test + public void withNulls() + { + @SuppressWarnings({ "resource", "IOResourceOpenedButNotSafelyClosed" }) DataOutputBuffer output = new DataOutputBuffer(); + EncodeAsVInt32 serializer = EncodeAsVInt32.withNulls(Integer::intValue, Integer::valueOf); + qt().forAll(ENUM_RANGE).check(expected -> Serializers.testSerde(output, serializer, expected)); + } + + @Test + public void withoutNulls() + { + @SuppressWarnings({ "resource", "IOResourceOpenedButNotSafelyClosed" }) DataOutputBuffer output = new DataOutputBuffer(); + EncodeAsVInt32 serializer = EncodeAsVInt32.withoutNulls(Integer::intValue, Integer::valueOf); + qt().forAll(Gens.ints().all()).check(expected -> Serializers.testSerde(output, serializer, expected)); + } +} \ No newline at end of file diff --git a/test/unit/org/apache/cassandra/service/accord/serializers/IVersionedWithKeysSerializerTest.java b/test/unit/org/apache/cassandra/service/accord/serializers/IVersionedWithKeysSerializerTest.java new file mode 100644 index 000000000000..f06f3c15f138 --- /dev/null +++ b/test/unit/org/apache/cassandra/service/accord/serializers/IVersionedWithKeysSerializerTest.java @@ -0,0 +1,160 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.serializers; + +import java.io.IOException; +import java.util.stream.Collectors; + +import org.junit.Test; + +import accord.api.Key; +import accord.api.RoutingKey; +import accord.primitives.Keys; +import accord.primitives.Range; +import accord.primitives.Ranges; +import accord.primitives.Routable; +import accord.primitives.Routables; +import accord.primitives.RoutingKeys; +import accord.utils.Gen; +import accord.utils.Gens; +import accord.utils.RandomSource; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.dht.IPartitioner; +import org.apache.cassandra.dht.Murmur3Partitioner; +import org.apache.cassandra.io.Serializers; +import org.apache.cassandra.io.UnversionedSerializer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputBuffer; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.utils.AccordGenerators; + +import static accord.utils.Property.qt; + +public class IVersionedWithKeysSerializerTest +{ + static + { + DatabaseDescriptor.clientInitialization(); + DatabaseDescriptor.setPartitionerUnsafe(Murmur3Partitioner.instance); + } + + @Test + public void test() + { + @SuppressWarnings({ "resource", "IOResourceOpenedButNotSafelyClosed" }) DataOutputBuffer output = new DataOutputBuffer(); + qt().forAll(Gens.random(), routables()).check((rs, superset) -> { + var serializer = serializer(superset); + Serializers.testSerde(output, serializer, superset); + if (superset.isEmpty()) return; + // find subsets + Gen> gen = subset(superset); + for (int i = 0; i < 100; i++) + Serializers.testSerde(output, serializer, gen.next(rs)); + }); + } + + private static Gen> routables() + { + Gen partitionerGen = AccordGenerators.partitioner(); + Gen routableKindGen = Gens.enums().all(Routable.Kind.class); + return rs -> { + IPartitioner partitioner = partitionerGen.next(rs); + DatabaseDescriptor.setPartitionerUnsafe(partitioner); + switch (routableKindGen.next(rs)) + { + case SeekableKey: return seekablekeysSuperset(rs, partitioner); + case UnseekableKey: return unseekablekeysSuperset(rs, partitioner); + case Range: return rangesSuperset(rs, partitioner); + default: throw new UnsupportedOperationException(); + } + }; + } + + static Keys seekablekeysSuperset(RandomSource rs, IPartitioner partitioner) + { + return Keys.of(Gens.lists(AccordGenerators.keys(partitioner)).unique().ofSizeBetween(0, 100).next(rs)); + } + + private static RoutingKeys unseekablekeysSuperset(RandomSource rs, IPartitioner partitioner) + { + return RoutingKeys.of(Gens.arrays(RoutingKey.class, (Gen) (Gen) AccordGenerators.routingKeysGen(partitioner)).unique().ofSizeBetween(0, 100).next(rs)); + } + + static Ranges rangesSuperset(RandomSource rs, IPartitioner partitioner) + { + return AccordGenerators.rangesSplitOrArbitrary(partitioner, Gens.ints().between(0, 100)).next(rs); + } + + private static Gen> subset(Routables superset) + { + switch (superset.domainKind()) + { + case SeekableKey: return seekablekeysSubset((Keys) superset); + case UnseekableKey: return unseekablekeysSubset((RoutingKeys) superset); + case Range: return rangesSubset((Ranges) superset); + default: throw new UnsupportedOperationException(); + } + } + + private static Gen> seekablekeysSubset(Keys superset) + { + return Gens.select(superset.stream().collect(Collectors.toList())).map(l -> Keys.of(l.toArray(Key[]::new))); + } + + private static Gen> unseekablekeysSubset(RoutingKeys superset) + { + return Gens.select(superset.stream().collect(Collectors.toList())).map(l -> RoutingKeys.of(l.toArray(RoutingKey[]::new))); + } + + private static Gen> rangesSubset(Ranges superset) + { + return Gens.select(superset.stream().collect(Collectors.toList())).map(l -> Ranges.of(l.toArray(Range[]::new))); + } + + private static UnversionedSerializer> serializer(Routables superset) + { + class S extends IVersionedWithKeysSerializer.AbstractWithKeysSerializer implements UnversionedSerializer> + { + @Override + public void serialize(Routables t, DataOutputPlus out) throws IOException + { + serializeSubsetInternal(t, superset, out); + } + + @Override + public Routables deserialize(DataInputPlus in) throws IOException + { + return deserializeSubsetInternal(superset, in); + } + + @Override + public long serializedSize(Routables t) + { + return serializedSubsetSizeInternal(t, superset); + } + + @Override + public void skip(DataInputPlus in) throws IOException + { + skipSubsetInternal(superset.size(), in); + } + } + return new S(); + } +} \ No newline at end of file diff --git a/test/unit/org/apache/cassandra/service/accord/serializers/JournalKeySerializerTest.java b/test/unit/org/apache/cassandra/service/accord/serializers/JournalKeySerializerTest.java new file mode 100644 index 000000000000..b36a5c74d5a6 --- /dev/null +++ b/test/unit/org/apache/cassandra/service/accord/serializers/JournalKeySerializerTest.java @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.serializers; + +import org.junit.Assert; +import org.junit.BeforeClass; +import org.junit.Test; + +import accord.local.Node; +import accord.primitives.Routable; +import accord.primitives.Timestamp; +import accord.primitives.Txn; +import accord.primitives.TxnId; +import org.apache.cassandra.ServerTestUtils; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.db.DecoratedKey; +import org.apache.cassandra.harry.dsl.TestRunner; +import org.apache.cassandra.harry.gen.Generator; +import org.apache.cassandra.harry.gen.Generators; +import org.apache.cassandra.service.accord.AccordKeyspace; +import org.apache.cassandra.service.accord.JournalKey; + +public class JournalKeySerializerTest +{ + @BeforeClass + public static void setUp() + { + DatabaseDescriptor.daemonInitialization(); + ServerTestUtils.prepareServer(); + } + + @Test + public void testOrder() + { + Node.Id node = new Node.Id(1); + Generator kindGen = Generators.enumValues(Txn.Kind.class); + Generator domainGen = Generators.enumValues(Routable.Domain.class); + Generator keyTypeGen = Generators.enumValues(JournalKey.Type.class); + + Generator keyGen = rng -> { + TxnId txnId = new TxnId(rng.nextLong(0, Timestamp.MAX_EPOCH + 1), + rng.nextLong(0, Long.MAX_VALUE), + kindGen.generate(rng), + domainGen.generate(rng), + node); + return new JournalKey(txnId, keyTypeGen.generate(rng), rng.nextInt(100)); + }; + TestRunner.test(keyGen, keyGen, (key1, key2) -> { + DecoratedKey dk1 = AccordKeyspace.JournalColumns.decorate(key1); + DecoratedKey dk2 = AccordKeyspace.JournalColumns.decorate(key2); + Assert.assertEquals(String.format("Sort mismatch for\n%s (%s) \n%s (%s) ", key1, dk1, key2, dk2), + dk1.compareTo(dk2) >= 0 ? 1 : -1, + JournalKey.SUPPORT.compare(key1, key2) >= 0 ? 1 : -1); + }); + } +} diff --git a/test/unit/org/apache/cassandra/service/accord/serializers/KeySerializersTest.java b/test/unit/org/apache/cassandra/service/accord/serializers/KeySerializersTest.java new file mode 100644 index 000000000000..6dc45af8bb8c --- /dev/null +++ b/test/unit/org/apache/cassandra/service/accord/serializers/KeySerializersTest.java @@ -0,0 +1,227 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.serializers; + +import java.io.IOException; +import java.util.Arrays; + +import org.junit.Test; + +import accord.api.RoutingKey; +import accord.local.StoreParticipants; +import accord.primitives.AbstractRanges; +import accord.primitives.AbstractUnseekableKeys; +import accord.primitives.FullKeyRoute; +import accord.primitives.FullRangeRoute; +import accord.primitives.KeyRoute; +import accord.primitives.PartialKeyRoute; +import accord.primitives.PartialRangeRoute; +import accord.primitives.Participants; +import accord.primitives.Range; +import accord.primitives.RangeRoute; +import accord.primitives.Ranges; +import accord.primitives.Route; +import accord.primitives.RoutingKeys; +import accord.primitives.Unseekable; +import accord.utils.Gen; +import accord.utils.RandomSource; +import accord.utils.RandomTestRunner; +import accord.utils.UnhandledEnum; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.dht.IPartitioner; +import org.apache.cassandra.dht.Murmur3Partitioner; +import org.apache.cassandra.dht.Token; +import org.apache.cassandra.io.Serializers; +import org.apache.cassandra.io.util.DataOutputBuffer; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.service.accord.TokenRange; +import org.apache.cassandra.service.accord.api.TokenKey; +import org.apache.cassandra.utils.AccordGenerators; +import org.apache.cassandra.utils.CassandraGenerators; + +import static accord.utils.Property.qt; +import static org.apache.cassandra.utils.AccordGenerators.fromQT; +import static org.apache.cassandra.utils.AccordGenerators.maybeUpdatePartitioner; +import static org.apache.cassandra.utils.AccordGenerators.partitioner; + +public class KeySerializersTest +{ + static + { + DatabaseDescriptor.clientInitialization(); + // If the first example is "[]" then need a partitioner for static init + DatabaseDescriptor.setPartitionerUnsafe(Murmur3Partitioner.instance); + } + + @Test + public void ranges() + { + @SuppressWarnings({ "resource", "IOResourceOpenedButNotSafelyClosed" }) DataOutputBuffer output = new DataOutputBuffer(); + qt().forAll(rangesGen()).check(expected -> { + maybeUpdatePartitioner(expected); + Serializers.testSerde(output, KeySerializers.ranges, expected); + }); + } + + @Test + public void storeParticipants() + { + DataOutputBuffer output = new DataOutputBuffer(); + for (int i = 0 ; i < 10000 ; ++i) + { + RandomTestRunner.test().check(rs -> testTwo(rs, output)); + } + } + + private void testTwo(RandomSource rs, DataOutputBuffer output) + { + IPartitioner partitioner = partitioner().next(rs); + DatabaseDescriptor.setPartitionerUnsafe(partitioner); + testOne(rs, output, keyRoute(partitioner, rs)); + testOne(rs, output, rangeRoute(partitioner, rs)); + } + + private void testOne(RandomSource rs, DataOutputBuffer output, Participants superset) + { + Route route = null; + if (rs.nextBoolean()) superset = ((Route)superset).participantsOnly(); + else route = (Route)subset(rs, superset, false); + Participants hasTouched = subset(rs, superset, true); + Participants touches = subset(rs, hasTouched, true); + Participants owns = subset(rs, touches, true); + Participants executes = rs.nextBoolean() ? subset(rs, owns, true) : null; + Participants waitsOn = executes != null ? subset(rs, executes, true) : null; + StoreParticipants participants = StoreParticipants.create(route, owns, executes, waitsOn, touches, hasTouched); + try + { + Serializers.testSerde(output, CommandSerializers.participants, participants); + } + catch (IOException e) + { + throw new RuntimeException(e); + } + } + + private static KeyRoute keyRoute(IPartitioner partitioner, RandomSource rs) + { + TableId tableId = fromQT(CassandraGenerators.TABLE_ID_GEN).next(rs); + Gen tokenGen = fromQT(CassandraGenerators.token(partitioner)); + Gen keyGen = AccordGenerators.routingKeyGen(ignore -> tableId, tokenGen, partitioner); + RoutingKey[] ks = new RoutingKey[rs.nextInt(1, 10)]; + for (int i = 0 ; i < ks.length ; ++i) + ks[i] = keyGen.next(rs); + Arrays.sort(ks); + int count = 1; + for (int i = 1 ; i < ks.length ; ++i) + { + if (!ks[count - 1].equals(ks[i])) + ks[count++] = ks[i]; + } + if (count != ks.length) + ks = Arrays.copyOf(ks, count); + + float f = rs.nextFloat(); + if (f < 0.66f) + { + int homeKey = rs.nextInt(ks.length); + return f < 0.33f ? new FullKeyRoute(ks[homeKey], ks) : new PartialKeyRoute(ks[homeKey], ks); + } + return new PartialKeyRoute(keyGen.next(rs), ks); + } + + private static RangeRoute rangeRoute(IPartitioner partitioner, RandomSource rs) + { + TableId tableId = fromQT(CassandraGenerators.TABLE_ID_GEN).next(rs); + Gen tokenGen = fromQT(CassandraGenerators.token(partitioner)); + Gen keyGen = AccordGenerators.routingKeyGen(ignore -> tableId, tokenGen, partitioner); + TokenKey[] ks = new TokenKey[rs.nextInt(1, 10) * 2]; + for (int i = 0 ; i < ks.length ; ++i) + ks[i] = keyGen.next(rs); + Arrays.sort(ks); + int count = 1; + for (int i = 1 ; i < ks.length ; ++i) + { + if (!ks[count - 1].equals(ks[i])) + ks[count++] = ks[i]; + } + Range[] ranges = new Range[count / 2]; + for (int i = 0 ; i < ranges.length ; ++i) + ranges[i] = TokenRange.create(ks[i*2], ks[i*2+1]); + + float f = rs.nextFloat(); + if (ranges.length > 0 && f < 0.66f) + { + RoutingKey homeKey = rs.nextBoolean() ? ks[rs.nextInt(ranges.length * 2)] : ranges[rs.nextInt(ranges.length)].someIntersectingRoutingKey(null); + return f < 0.33f ? new FullRangeRoute(homeKey, ranges) : new PartialRangeRoute(homeKey, ranges); + } + return new PartialRangeRoute(keyGen.next(rs), ranges); + } + + private static Participants subset(RandomSource rs, Participants superset, boolean changeType) + { + if (rs.nextBoolean()) + return changeType && superset instanceof Route && rs.nextBoolean() ? ((Route)superset).participantsOnly() : superset; + + int count = superset.isEmpty() ? 0 : rs.nextInt(superset.size()); + Participants subset = selectSubset(rs, count, superset); + if (superset instanceof Route && (!changeType || rs.nextBoolean())) + return superset.intersecting(subset); + return subset; + } + + private static Participants selectSubset(RandomSource rs, int count, Participants superset) + { + switch (superset.domain()) + { + default: throw UnhandledEnum.unknown(superset.domain()); + case Key: + { + AbstractUnseekableKeys in = (AbstractUnseekableKeys) superset; + RoutingKey[] out = new RoutingKey[count]; + int j = 0; + for (int i = 0 ; i < out.length ; ++i) + { + j += count == (in.size() - j) ? 0 : rs.nextInt(0, in.size() - j); + out[i] = in.get(j); + } + return (Participants) RoutingKeys.of(out); + } + + case Range: + { + AbstractRanges in = (AbstractRanges) superset; + Range[] out = new Range[count]; + int j = 0; + for (int i = 0 ; i < out.length ; ++i) + { + j += count == (in.size() - j) ? 0 : rs.nextInt(0, in.size() - j); + out[i] = in.get(j); + } + return (Participants) Ranges.of(out); + } + + } + + } + + private static Gen rangesGen() + { + return partitioner().flatMap(p -> AccordGenerators.rangesSplitOrArbitrary(p)); + } +} \ No newline at end of file diff --git a/test/unit/org/apache/cassandra/service/accord/serializers/LatestDepsSerializerTest.java b/test/unit/org/apache/cassandra/service/accord/serializers/LatestDepsSerializerTest.java new file mode 100644 index 000000000000..b0e090ff2a31 --- /dev/null +++ b/test/unit/org/apache/cassandra/service/accord/serializers/LatestDepsSerializerTest.java @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.serializers; + +import org.junit.Test; + +import accord.primitives.LatestDeps; +import org.apache.cassandra.io.Serializers; +import org.apache.cassandra.io.util.DataOutputBuffer; + +public class LatestDepsSerializerTest +{ + @Test + public void emptySerializerTest() throws Throwable + { + DataOutputBuffer buf = new DataOutputBuffer(); + Serializers.testSerde(buf, LatestDepsSerializers.latestDeps, LatestDeps.EMPTY); + } +} \ No newline at end of file diff --git a/test/unit/org/apache/cassandra/service/accord/serializers/TableMetadatasAndKeysTest.java b/test/unit/org/apache/cassandra/service/accord/serializers/TableMetadatasAndKeysTest.java new file mode 100644 index 000000000000..14a41cb4bb58 --- /dev/null +++ b/test/unit/org/apache/cassandra/service/accord/serializers/TableMetadatasAndKeysTest.java @@ -0,0 +1,163 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.serializers; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.LinkedHashMap; + +import org.junit.Test; + +import accord.primitives.Keys; +import accord.primitives.Ranges; +import accord.primitives.Routable; +import accord.primitives.Seekable; +import accord.primitives.Seekables; +import accord.utils.Gen; +import accord.utils.Gens; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.dht.IPartitioner; +import org.apache.cassandra.dht.Murmur3Partitioner; +import org.apache.cassandra.io.Serializers; +import org.apache.cassandra.io.UnversionedSerializer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputBuffer; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.schema.Schema; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.service.accord.api.PartitionKey; +import org.apache.cassandra.utils.AccordGenerators; + +import static accord.utils.Property.qt; +import static org.apache.cassandra.service.accord.serializers.TableMetadatasTest.buildSchema; +import static org.apache.cassandra.service.accord.serializers.TableMetadatasTest.toMetadatas; + +public class TableMetadatasAndKeysTest +{ + static + { + DatabaseDescriptor.clientInitialization(); + DatabaseDescriptor.setPartitionerUnsafe(Murmur3Partitioner.instance); + } + + private static final Gen partitionerGen = AccordGenerators.partitioner(); + + @Test + public void test() + { + Gen domainGen = Gens.enums().all(Routable.Domain.class); + @SuppressWarnings({ "resource", "IOResourceOpenedButNotSafelyClosed" }) DataOutputBuffer output = new DataOutputBuffer(); + qt().forAll(Gens.random(), TableMetadatasTest.tables().filter(m -> !m.isEmpty())).check((rs, tables) -> { + TableMetadatas metadatas = toMetadatas(tables); + Schema.instance = buildSchema(tables); + + Seekables keysOrRanges; + switch (domainGen.next(rs)) + { + case Key: + keysOrRanges = createKeys(tables).next(rs); + break; + case Range: + keysOrRanges = createRanges(tables).next(rs); + break; + default: throw new UnsupportedOperationException(); + } + + TableMetadatasAndKeys tablesAndKeys = new TableMetadatasAndKeys(metadatas, keysOrRanges); + + Serializers.testSerde(output, TableMetadatasAndKeys.serializer, tablesAndKeys); + var serializer = serializer(tablesAndKeys); + var partitionSerializer = partitionKeySerializer(tablesAndKeys); + for (Seekable s : keysOrRanges) + { + Serializers.testSerde(output, serializer, s); + if (s instanceof PartitionKey) + Serializers.testSerde(output, partitionSerializer, (PartitionKey) s); + } + }); + } + + private static Gen createKeys(LinkedHashMap tables) + { + return rs -> { + IPartitioner partitioner = partitionerGen.next(rs); + DatabaseDescriptor.setPartitionerUnsafe(partitioner); + Gen keyGen = AccordGenerators.keys(partitioner, new ArrayList<>(tables.keySet())); + return Keys.of(Gens.lists(keyGen).unique().ofSizeBetween(1, 100).next(rs)); + }; + } + + private static Gen createRanges(LinkedHashMap tables) + { + return rs -> { + var partitioner = partitionerGen.next(rs); + DatabaseDescriptor.setPartitionerUnsafe(partitioner); + return AccordGenerators.rangesSplitOrArbitrary(partitioner, Gens.ints().between(1, 100), Gens.constant(new ArrayList<>(tables.keySet()))).next(rs); + }; + } + + private static UnversionedSerializer serializer(TableMetadatasAndKeys tableAndKeys) + { + return new UnversionedSerializer<>() + { + @Override + public void serialize(Seekable t, DataOutputPlus out) throws IOException + { + tableAndKeys.serializeSeekable(t, out); + } + + @Override + public Seekable deserialize(DataInputPlus in) throws IOException + { + return tableAndKeys.deserializeSeekable(in); + } + + @Override + public long serializedSize(Seekable t) + { + return tableAndKeys.serializedSeekableSize(t); + } + }; + } + + public static UnversionedSerializer partitionKeySerializer(TableMetadatasAndKeys tableAndKeys) + { + return new UnversionedSerializer<>() + { + @Override + public void serialize(PartitionKey t, DataOutputPlus out) throws IOException + { + tableAndKeys.serializeKey(t, out); + } + + @Override + public PartitionKey deserialize(DataInputPlus in) throws IOException + { + return tableAndKeys.deserializeKey(in); + } + + @Override + public long serializedSize(PartitionKey t) + { + return tableAndKeys.serializedKeySize(t); + } + }; + } +} \ No newline at end of file diff --git a/test/unit/org/apache/cassandra/service/accord/serializers/TableMetadatasTest.java b/test/unit/org/apache/cassandra/service/accord/serializers/TableMetadatasTest.java new file mode 100644 index 000000000000..e89f49bffa89 --- /dev/null +++ b/test/unit/org/apache/cassandra/service/accord/serializers/TableMetadatasTest.java @@ -0,0 +1,153 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.serializers; + +import java.io.IOException; +import java.util.LinkedHashMap; +import java.util.Map; + +import org.junit.Test; + +import accord.utils.Gen; +import accord.utils.Gens; +import org.apache.cassandra.exceptions.UnknownTableException; +import org.apache.cassandra.io.Serializers; +import org.apache.cassandra.io.UnversionedSerializer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputBuffer; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.schema.Schema; +import org.apache.cassandra.schema.SchemaProvider; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.utils.CassandraGenerators; +import org.apache.cassandra.utils.Generators; +import org.mockito.Mockito; +import org.mockito.invocation.InvocationOnMock; +import org.mockito.stubbing.Answer; + +import static accord.utils.Property.qt; + +public class TableMetadatasTest +{ + @Test + public void test() + { + @SuppressWarnings({ "resource", "IOResourceOpenedButNotSafelyClosed" }) DataOutputBuffer output = new DataOutputBuffer(); + qt().forAll(tables()).check(tables -> { + TableMetadatas metadatas = toMetadatas(tables); + Schema.instance = buildSchema(tables); + Serializers.testSerde(output, SelfSerializer.instance, metadatas); + + UnversionedSerializer serializer = tableSerializer(metadatas); + for (var metadata : tables.values()) + Serializers.testSerde(output, serializer, metadata); + }); + } + + static SchemaProvider buildSchema(Map tables) throws UnknownTableException + { + SchemaProvider schema = Mockito.mock(SchemaProvider.class); + Mockito.when(schema.getTableMetadata(Mockito.any())).thenAnswer(new Answer() + { + @Override + public TableMetadata answer(InvocationOnMock invocationOnMock) throws Throwable + { + TableId id = invocationOnMock.getArgument(0); + var metadata = tables.get(id); + if (metadata == null) throw new UnknownTableException("Unknown table " + id, id); + return metadata; + } + }); + return schema; + } + + static TableMetadatas toMetadatas(Map map) + { + TableMetadatas.Collector collector = new TableMetadatas.Collector(); + map.values().forEach(collector::add); + return collector.build(); + } + + static Gen> tables() + { + Gen idGen = Generators.toGen(CassandraGenerators.TABLE_ID_GEN); + return rs -> { + TableId[] ids = Gens.arrays(TableId.class, idGen).unique().ofSizeBetween(0, 100).next(rs); + LinkedHashMap map = new LinkedHashMap<>(); + for (int i = 0; i < ids.length; i++) + map.put(ids[i], forId(ids[i])); + return map; + }; + } + + private static TableMetadata forId(TableId id) + { + TableMetadata metadata = TableMetadata.minimal("ks", "tbl", id); + if (!metadata.id().equals(id)) throw new AssertionError("Unexpected table id: " + metadata.id() + "; expected " + id); + return metadata; + } + + private static UnversionedSerializer tableSerializer(TableMetadatas metadatas) + { + return new UnversionedSerializer<>() + { + @Override + public void serialize(TableMetadata t, DataOutputPlus out) throws IOException + { + metadatas.serialize(t, out); + } + + @Override + public TableMetadata deserialize(DataInputPlus in) throws IOException + { + return metadatas.deserialize(in); + } + + @Override + public long serializedSize(TableMetadata t) + { + return metadatas.serializedSize(t); + } + }; + } + + private enum SelfSerializer implements UnversionedSerializer + { + instance; + + @Override + public void serialize(TableMetadatas t, DataOutputPlus out) throws IOException + { + t.serializeSelf(out); + } + + @Override + public TableMetadatas deserialize(DataInputPlus in) throws IOException + { + return TableMetadatas.deserializeSelf(in); + } + + @Override + public long serializedSize(TableMetadatas t) + { + return t.serializedSelfSize(); + } + } +} \ No newline at end of file diff --git a/test/unit/org/apache/cassandra/service/accord/serializers/TokenKeyTest.java b/test/unit/org/apache/cassandra/service/accord/serializers/TokenKeyTest.java new file mode 100644 index 000000000000..5d3e023b2051 --- /dev/null +++ b/test/unit/org/apache/cassandra/service/accord/serializers/TokenKeyTest.java @@ -0,0 +1,381 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.serializers; + +import java.io.IOException; +import java.math.BigInteger; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import org.junit.Before; +import org.junit.Test; + +import accord.utils.Gen; +import accord.utils.Gens; +import accord.utils.Invariants; +import accord.utils.LazyToString; +import accord.utils.ReflectionUtils; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.db.marshal.ByteBufferAccessor; +import org.apache.cassandra.dht.ByteOrderedPartitioner; +import org.apache.cassandra.dht.IPartitioner; +import org.apache.cassandra.dht.Murmur3Partitioner; +import org.apache.cassandra.dht.Murmur3Partitioner.LongToken; +import org.apache.cassandra.dht.RandomPartitioner; +import org.apache.cassandra.dht.RandomPartitioner.BigIntegerToken; +import org.apache.cassandra.dht.Token; +import org.apache.cassandra.io.Serializers; +import org.apache.cassandra.io.util.DataInputBuffer; +import org.apache.cassandra.io.util.DataOutputBuffer; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.service.accord.api.TokenKey; +import org.apache.cassandra.utils.AccordGenerators; +import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.CassandraGenerators; +import org.assertj.core.api.Assertions; + +import static accord.utils.Property.qt; +import static org.apache.cassandra.service.accord.api.TokenKey.serializer; +import static org.apache.cassandra.utils.AccordGenerators.fromQT; +import static org.apache.cassandra.utils.CassandraGenerators.partitioners; +import static org.apache.cassandra.utils.CassandraGenerators.token; + +public class TokenKeyTest +{ + static + { + DatabaseDescriptor.clientInitialization(); + } + + @Before + public void before() + { + // AccordRoutingKey$TokenKey reaches into DD to get partitioner, so need to set that up... + DatabaseDescriptor.setPartitionerUnsafe(Murmur3Partitioner.instance); + } + + @Test + public void beforeIsTokenSentinel() + { + qt().forAll(simpleTokenKey()).check(tokenKey -> { + var t = tokenKey.before(); + Assertions.assertThat(t.isTokenSentinel()).isTrue(); + Assertions.assertThat(t.isTableSentinel()).isEqualTo(tokenKey.isTableSentinel()); + Assertions.assertThat(t.isMin()).isEqualTo(tokenKey.isMin()); + Assertions.assertThat(t.isMax()).isEqualTo(tokenKey.isMax()); + Assertions.assertThat(t.isBefore()).isTrue(); + Assertions.assertThat(t.isAfter()).isFalse(); + + Assertions.assertThatThrownBy(() -> t.before()); + Assertions.assertThatThrownBy(() -> t.after()); + + Assertions.assertThat(tokenKey.compareTo(t)).isGreaterThan(0); + Assertions.assertThat(t.compareTo(tokenKey)).isLessThan(0); + }); + } + + @Test + public void afterIsTokenSentinel() + { + qt().forAll(simpleTokenKey()).check(tokenKey -> { + var t = tokenKey.after(); + Assertions.assertThat(t.isTokenSentinel()).isTrue(); + Assertions.assertThat(t.isTableSentinel()).isEqualTo(tokenKey.isTableSentinel()); + Assertions.assertThat(t.isMin()).isEqualTo(tokenKey.isMin()); + Assertions.assertThat(t.isMax()).isEqualTo(tokenKey.isMax()); + Assertions.assertThat(t.isBefore()).isFalse(); + Assertions.assertThat(t.isAfter()).isTrue(); + + Assertions.assertThat(tokenKey.compareTo(t)).isLessThan(0); + Assertions.assertThat(t.compareTo(tokenKey)).isGreaterThan(0); + }); + } + + @Test + public void serdeSimple() + { + Gen tokenKeyGen = AccordGenerators.allowBeforeAndAfter(simpleTokenKey()); + @SuppressWarnings({ "resource", "IOResourceOpenedButNotSafelyClosed" }) DataOutputBuffer output = new DataOutputBuffer(); + qt().forAll(tokenKeyGen).check(expected -> { + DatabaseDescriptor.setPartitionerUnsafe(expected.token().getPartitioner()); + Serializers.testSerde(output, serializer, expected); + testSerdePrefix(output, serializer, expected); + }); + } + + @Test + public void serde() + { + @SuppressWarnings({ "resource", "IOResourceOpenedButNotSafelyClosed" }) DataOutputBuffer output = new DataOutputBuffer(); + qt().forAll(tokenKeyWithBeforeAndAfterGen()) + .check(key -> { + IPartitioner partitioner = key.token().getPartitioner(); + DatabaseDescriptor.setPartitionerUnsafe(partitioner); + + Serializers.testSerde(output, serializer, key); + Assertions.assertThat(serializer.deserializeAndConsume(serializer.serialize(key), partitioner)).isEqualTo(key); + { + TokenKey roundTrip = serializer.deserializeWithPrefixAndImpliedLength(key.prefix(), serializer.serializeWithoutPrefixOrLength(key), partitioner); + Assertions.assertThat(roundTrip).isEqualTo(key); + } + { + TokenKey roundTrip = serializer.deserializeWithPrefixAndImpliedLength(key.prefix(), serializer.serializeWithoutPrefixOrLength(key), ByteBufferAccessor.instance, 0, partitioner); + Assertions.assertThat(roundTrip).isEqualTo(key); + } + { + TokenKey roundTrip = serializer.deserializeWithPrefix(key.prefix(), serializer.serializedSizeWithoutPrefix(key), serializer.serializeWithoutPrefixOrLength(key), partitioner); + Assertions.assertThat(roundTrip).isEqualTo(key); + } + { + TokenKey roundTrip = serializer.deserializeWithPrefix(key.prefix(), serializer.serializedSizeWithoutPrefix(key), serializer.serializeWithoutPrefixOrLength(key), ByteBufferAccessor.instance, 0, partitioner); + Assertions.assertThat(roundTrip).isEqualTo(key); + } + output.clear(); + serializer.serialize(key, output); + try (DataInputBuffer in = new DataInputBuffer(output.toByteArray())) + { + serializer.skip(in, partitioner); + Invariants.require(0 == in.available()); + } + }); + } + + @Test + public void compare() + { + qt().forAll(tokenKeyGen()) + .check(key -> { + ByteBuffer keyBytes = serializer.serialize(key); + for (TokenKey test : mutateAfter(key)) + { + ByteBuffer testBytes = serializer.serialize(test); + Invariants.require(test.compareTo(key) > 0); + Invariants.require(ByteBufferUtil.compareUnsigned(testBytes, keyBytes) > 0); + } + for (TokenKey test : mutateBefore(key)) + { + ByteBuffer testBytes = serializer.serialize(test); + Invariants.require(test.compareTo(key) < 0); + Invariants.require(ByteBufferUtil.compareUnsigned(testBytes, keyBytes) < 0); + } + }); + } + + private static Gen simpleTokenKey() + { + return AccordGenerators.partitioner().flatMap(p -> AccordGenerators.routingKeysGen(p)); + } + + private static void testSerdePrefix(DataOutputBuffer output, TokenKey.Serializer serializer, TokenKey input) throws IOException + { + output.clear(); + Object expected = input.prefix(); + long expectedSize = serializer.serializedSizeOfPrefix(expected); + serializer.serializePrefix(expected, output); + Assertions.assertThat(output.getLength()).describedAs("The serialized size and bytes written do not match").isEqualTo(expectedSize); + DataInputBuffer in = new DataInputBuffer(output.unsafeGetBufferAndFlip(), false); + Object read = serializer.deserializePrefix(in); + Assertions.assertThat(read) + .describedAs("The deserialized output does not match the serialized input; difference %s", new LazyToString(() -> ReflectionUtils.recursiveEquals(read, expected).toString())) + .isEqualTo(expected); + } + + private static Gen tokenKeyGen() + { + return fromQT(partitioners()).filter(IPartitioner::accordSupported) + .flatMap(partitioner -> routingKeyGen(fromQT(CassandraGenerators.TABLE_ID_GEN), fromQT(token(partitioner)), partitioner)); + } + + private static Gen tokenKeyWithBeforeAndAfterGen() + { + return AccordGenerators.allowBeforeAndAfter(tokenKeyGen()); + } + + private List mutateAfter(TokenKey mutate) + { + List results = new ArrayList<>(); + if (!mutate.isTableSentinel()) + { + Token token = mutate.token(); + if (token instanceof ByteOrderedPartitioner.BytesToken) + { + byte[] bytes = (byte[]) token.getTokenValue(); + bytes = bytes.clone(); + for (int i = 0 ; i < bytes.length ; ++i) + { + if ((bytes[i] & 0xff) != 0xff) + { + ++bytes[i]; + add(results, mutate.withToken(new ByteOrderedPartitioner.BytesToken(bytes.clone()))); + --bytes[i]; + } + } + add(results, mutate.withToken(new ByteOrderedPartitioner.BytesToken(Arrays.copyOf(bytes, bytes.length + 1)))); + } + else if (token instanceof LongToken) + { + long value = token.getLongValue(); + if (value < Long.MAX_VALUE) + add(results, mutate.withToken(new LongToken(value + 1))); + for (long v = 2L; v >= 0 ; v <<= 1) + { + if ((value & v) == 0) + add(results, mutate.withToken(new LongToken(value | v))); + } + if (value >= 0) + { + long higher = value; + while ((higher <<= 8) > value) + add(results, mutate.withToken(new LongToken(higher))); + } + else + { + for (int i = 1 ; i < 8 ; ++i) + add(results, mutate.withToken(new LongToken(value >> (i * 8)))); + } + } + else if (token instanceof BigIntegerToken) + { + BigInteger value = (BigInteger) token.getTokenValue(); + if (value.compareTo(RandomPartitioner.MAXIMUM) < 0) + add(results, mutate.withToken(new BigIntegerToken(value.add(BigInteger.ONE)))); + for (long v = 1L; v >= 0 ; v <<= 1) + { + BigInteger i = BigInteger.valueOf(v); + if (value.and(i).equals(BigInteger.ZERO)) + add(results, mutate.withToken(new BigIntegerToken(value.or(i)))); + } + BigInteger higher = value; + while ((higher = higher.shiftLeft(8)).compareTo(RandomPartitioner.MAXIMUM) <= 0) + add(results, mutate.withToken(new BigIntegerToken(higher))); + } + else throw new UnsupportedOperationException(); + } + TableId tableId = mutate.table(); + if (tableId.msb() != Long.MAX_VALUE) + add(results, mutate.withTable(TableId.fromRaw(tableId.msb() + 1, tableId.lsb()))); + if (tableId.lsb() != Long.MAX_VALUE) + add(results, mutate.withTable(TableId.fromRaw(tableId.msb(), tableId.lsb() + 1))); + return results; + } + + private List mutateBefore(TokenKey mutate) + { + List results = new ArrayList<>(); + if (!mutate.isTableSentinel()) + { + Token token = mutate.token(); + if (token instanceof ByteOrderedPartitioner.BytesToken) + { + byte[] bytes = (byte[]) token.getTokenValue(); + bytes = bytes.clone(); + for (int i = 0 ; i < bytes.length ; ++i) + { + add(results, mutate.withToken(new ByteOrderedPartitioner.BytesToken(Arrays.copyOf(bytes, i)))); + if ((bytes[i] & 0xff) != 0) + { + --bytes[i]; + add(results, mutate.withToken(new ByteOrderedPartitioner.BytesToken(bytes.clone()))); + ++bytes[i]; + } + } + } + else if (token instanceof LongToken) + { + long value = token.getLongValue(); + if (value > Long.MIN_VALUE) + add(results, mutate.withToken(new LongToken(value - 1))); + for (long v = 2L; v >= 0 ; v <<= 1) + { + if ((value & v) != 0) + add(results, mutate.withToken(new LongToken(value & ~v))); + } + if (value >= 0) + { + for (int i = 1 ; i < 8 ; ++i) + add(results, mutate.withToken(new LongToken(value >>> (i * 8)))); + } + else + { + for (int i = 0 ; i < 7 ; ++i) + { + long next = value & (-1L << (i * 8)); + if (next != value) + add(results, mutate.withToken(new LongToken(next))); + } + } + } + else if (token instanceof BigIntegerToken) + { + BigInteger value = (BigInteger) token.getTokenValue(); + if (value.compareTo(RandomPartitioner.MINIMUM.getTokenValue()) > 0) + add(results, mutate.withToken(new BigIntegerToken(value.subtract(BigInteger.ONE)))); + for (long v = 1L; v >= 0 ; v <<= 1) + { + BigInteger i = BigInteger.valueOf(v); + if (!value.and(i).equals(BigInteger.ZERO)) + add(results, mutate.withToken(new BigIntegerToken(value.andNot(i)))); + } + for (int i = 1 ; i < 8 ; ++i) + add(results, mutate.withToken(new BigIntegerToken(value.shiftRight(i * 16)))); + } + else throw new UnsupportedOperationException(); + } + TableId tableId = mutate.table(); + if (tableId.msb() != Long.MIN_VALUE) + add(results, mutate.withTable(TableId.fromRaw(tableId.msb() - 1, tableId.lsb()))); + if (tableId.lsb() != Long.MIN_VALUE) + add(results, mutate.withTable(TableId.fromRaw(tableId.msb(), tableId.lsb() -1))); + return results; + } + + private static void add(List to, TokenKey vary) + { + to.add(vary); + if (!vary.isTokenSentinel()) + { + to.add(vary.before()); + to.add(vary.after()); + } + } + + private static Gen routingKeyGen(Gen tableIdGen, Gen tokenGen, IPartitioner partitioner) + { + Gen result = AccordGenerators.routingKeyGen(tableIdGen, Gens.enums().all(AccordGenerators.RoutingKeyKind.class), tokenGen, partitioner); + if (!(partitioner instanceof ByteOrderedPartitioner)) + return result; + return result.map((rs, k) -> { + byte[] bytes = (byte[]) k.token().getTokenValue(); + if (bytes.length >= 3) + { + while (rs.nextFloat() < 0.25f) + { + int i = rs.nextInt(bytes.length - 2); + bytes[i] = 0; + bytes[i + 1] = (byte) rs.nextInt(0, TokenKey.Serializer.ESCAPE_BYTE); + } + } + + return k; + }); + } + +} \ No newline at end of file diff --git a/test/unit/org/apache/cassandra/service/accord/serializers/TopologySerializersTest.java b/test/unit/org/apache/cassandra/service/accord/serializers/TopologySerializersTest.java new file mode 100644 index 000000000000..0740fae85616 --- /dev/null +++ b/test/unit/org/apache/cassandra/service/accord/serializers/TopologySerializersTest.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.serializers; + +import java.io.IOException; + +import org.junit.Test; + +import accord.local.Node; +import accord.utils.AccordGens; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.io.Serializers; +import org.apache.cassandra.io.util.DataOutputBuffer; +import org.apache.cassandra.utils.AccordGenerators; + +import static accord.utils.Property.qt; + + +public class TopologySerializersTest +{ + static + { + DatabaseDescriptor.clientInitialization(); + } + + @Test + public void nodeId() throws IOException + { + qt().forAll(AccordGens.nodes()).check(n -> Serializers.testSerde(TopologySerializers.nodeId, n)); + } + + @Test + public void topology() + { + @SuppressWarnings({ "resource", "IOResourceOpenedButNotSafelyClosed" }) DataOutputBuffer output = new DataOutputBuffer(); + qt().forAll(AccordGenerators.partitioner().flatMap(p -> AccordGenerators.topologyGen(p))).check(expected -> { + AccordGenerators.maybeUpdatePartitioner(expected.ranges()); + Serializers.testSerde(output, TopologySerializers.topology, expected); + + for (Node.Id node : expected.nodes()) + Serializers.testSerde(output, TopologySerializers.topology, expected.forNode(node)); + }); + } +} diff --git a/test/unit/org/apache/cassandra/service/accord/serializers/WaitingOnSerializerTest.java b/test/unit/org/apache/cassandra/service/accord/serializers/WaitingOnSerializerTest.java new file mode 100644 index 000000000000..aa97ee534bbc --- /dev/null +++ b/test/unit/org/apache/cassandra/service/accord/serializers/WaitingOnSerializerTest.java @@ -0,0 +1,117 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.serializers; + +import java.nio.ByteBuffer; + +import org.junit.BeforeClass; +import org.junit.Test; + +import accord.local.Command; +import accord.primitives.Deps; +import accord.primitives.KeyDeps; +import accord.primitives.PartialDeps; +import accord.primitives.Routable; +import accord.primitives.RoutingKeys; +import accord.primitives.TxnId; +import accord.utils.Gen; +import accord.utils.Gens; +import accord.utils.SimpleBitSet; +import accord.utils.Utils; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.dht.Murmur3Partitioner; +import org.apache.cassandra.io.util.DataInputBuffer; +import org.apache.cassandra.io.util.DataOutputBuffer; +import org.apache.cassandra.utils.AccordGenerators; +import org.apache.cassandra.utils.CassandraGenerators; +import org.assertj.core.api.Assertions; + +import static accord.utils.Property.qt; + +public class WaitingOnSerializerTest +{ + @BeforeClass + public static void setup() + { + DatabaseDescriptor.clientInitialization(); + DatabaseDescriptor.setPartitionerUnsafe(Murmur3Partitioner.instance); + } + + @Test + public void serde() + { + qt().forAll(waitingOnGen()).check(waitingOn -> { + TxnId txnId = TxnId.NONE; + if (waitingOn.appliedOrInvalidated != null) txnId = new TxnId(txnId.epoch(), txnId.hlc(), txnId.kind(), Routable.Domain.Range, txnId.node); + ByteBuffer bb; + try (DataOutputBuffer buf = new DataOutputBuffer()) + { + WaitingOnSerializer.serializeBitSetsOnly(txnId, waitingOn, buf); + bb = buf.asNewBuffer(); + } + try (DataInputBuffer buf = new DataInputBuffer(bb, true)) + { + PartialDeps deps = new PartialDeps(RoutingKeys.EMPTY, KeyDeps.none(waitingOn.keys), waitingOn.directRangeDeps); + Command.WaitingOn read = WaitingOnSerializer.deserializeProvider(txnId, buf).provide(txnId, deps, null, 0); + Assertions.assertThat(read).isEqualTo(waitingOn); + Assertions.assertThat(buf.available()).isEqualTo(0); + } + try (DataInputBuffer buf = new DataInputBuffer(bb, true)) + { + WaitingOnSerializer.skip(txnId, buf); + Assertions.assertThat(buf.available()).isEqualTo(0); + } + }); + } + + private enum WaitingOnSets { APPLY, APPLIED_OR_INVALIDATED } + + private static Gen waitingOnGen() + { + Gen depsGen = AccordGenerators.fromQT(CassandraGenerators.nonLocalPartitioners()) + .flatMap(AccordGenerators::depsGen); + Gen sets = Gens.enums().all(WaitingOnSets.class); + return rs -> { + Deps deps = depsGen.next(rs); + if (deps.isEmpty()) return Command.WaitingOn.empty(Routable.Domain.Key); + int txnIdCount = deps.rangeDeps.txnIdCount(); + int keyCount = deps.keyDeps.keys().size(); + int[] selected = Gens.arrays(Gens.ints().between(0, txnIdCount + keyCount - 1)).unique().ofSizeBetween(0, txnIdCount + keyCount).next(rs); + SimpleBitSet waitingOn = new SimpleBitSet(txnIdCount + keyCount, false); + SimpleBitSet appliedOrInvalidated = rs.nextBoolean() ? null : new SimpleBitSet(txnIdCount, false); + for (int i : selected) + { + WaitingOnSets set = appliedOrInvalidated == null || i >= txnIdCount ? WaitingOnSets.APPLY : sets.next(rs); + switch (set) + { + case APPLY: + waitingOn.set(i); + break; + case APPLIED_OR_INVALIDATED: + appliedOrInvalidated.set(i); + break; + default: + throw new IllegalStateException("Unexpected set: " + set); + } + } + + return new Command.WaitingOn(deps.keyDeps.keys(), deps.rangeDeps, Utils.ensureImmutable(waitingOn), Utils.ensureImmutable(appliedOrInvalidated)); + }; + } +} \ No newline at end of file diff --git a/test/unit/org/apache/cassandra/service/accord/txn/AbstractKeySortedTest.java b/test/unit/org/apache/cassandra/service/accord/txn/AbstractKeySortedTest.java new file mode 100644 index 000000000000..2c2b89193adf --- /dev/null +++ b/test/unit/org/apache/cassandra/service/accord/txn/AbstractKeySortedTest.java @@ -0,0 +1,160 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.txn; + +import java.util.ArrayList; +import java.util.List; +import java.util.Objects; + +import com.google.common.collect.Lists; +import org.junit.Assert; +import org.junit.BeforeClass; +import org.junit.Test; + +import accord.primitives.Routable.Domain; +import accord.primitives.Seekable; +import org.apache.cassandra.SchemaLoader; +import org.apache.cassandra.db.DecoratedKey; +import org.apache.cassandra.dht.ByteOrderedPartitioner; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.service.accord.api.PartitionKey; +import org.apache.cassandra.utils.ByteBufferUtil; + +public class AbstractKeySortedTest +{ + private static final TableId TABLE1 = TableId.fromString("00000000-0000-0000-0000-000000000001"); + + @BeforeClass + public static void beforeClass() throws Exception + { + SchemaLoader.prepareServer(); + } + + static class Item + { + final PartitionKey key; + final int value; + + public Item(PartitionKey key, int value) + { + this.key = key; + this.value = value; + } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + Item item = (Item) o; + return value == item.value && key.equals(item.key); + } + + @Override + public int hashCode() + { + return Objects.hash(key, value); + } + + @Override + public String toString() + { + return "Item{" + + "key=" + key + + ", value=" + value + + '}'; + } + } + + static class SortedItems extends AbstractKeySorted + { + public SortedItems(Item... items) + { + super(items, Domain.Key); + } + + public SortedItems(List items) + { + super(items, Domain.Key); + } + + @Override + int compareNonKeyFields(Item left, Item right) + { + return Integer.compare(left.value, right.value); + } + + @Override + Seekable getKey(Item item) + { + return item.key; + } + + @Override + Item[] newArray(int size) + { + return new Item[size]; + } + } + + private static PartitionKey key(int k) + { + DecoratedKey dk = ByteOrderedPartitioner.instance.decorateKey(ByteBufferUtil.bytes(k)); + return new PartitionKey(TABLE1, dk); + } + + private static Item item(int k, int v) + { + return new Item(key(k), v); + } + + private static List itemList(Item... items) + { + return Lists.newArrayList(items); + } + + @Test + public void checkInitialSorting() + { + List initial = itemList(item(5, 4), item(3, 3), item(3, 1), item(6, 5)); + SortedItems expected = new SortedItems(item(3, 1), item(3, 3), item(5, 4), item(6, 5)); + expected.validateOrder(); + SortedItems actual = new SortedItems(initial); + actual.validateOrder(); + Assert.assertEquals(expected, actual); + } + + @Test + public void checkIterationForKey() + { + SortedItems source = new SortedItems(item(1, 5), item(3, 1), item(3, 3), item(5, 4), item(6, 5)); + source.validateOrder(); + + source.forEachWithKey(key(0), i -> Assert.fail()); + source.forEachWithKey(key(1), i -> Assert.assertEquals(item(1, 5), i)); + source.forEachWithKey(key(2), i -> Assert.fail()); + List actual = new ArrayList<>(); + source.forEachWithKey(key(3), actual::add); + Assert.assertEquals(itemList(item(3, 1), item(3, 3)), actual); + source.forEachWithKey(key(4), i -> Assert.fail()); + source.forEachWithKey(key(5), i -> Assert.assertEquals(item(5, 4), i)); + source.forEachWithKey(key(6), i -> Assert.assertEquals(item(6, 5), i)); + source.forEachWithKey(key(7), i -> Assert.fail()); + } +} diff --git a/test/unit/org/apache/cassandra/service/accord/txn/AccordUpdateTest.java b/test/unit/org/apache/cassandra/service/accord/txn/AccordUpdateTest.java new file mode 100644 index 000000000000..15df8a781bde --- /dev/null +++ b/test/unit/org/apache/cassandra/service/accord/txn/AccordUpdateTest.java @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.txn; + +import java.io.IOException; + +import org.junit.BeforeClass; +import org.junit.Test; + +import accord.primitives.Txn; +import org.apache.cassandra.SchemaLoader; +import org.apache.cassandra.io.Serializers; +import org.apache.cassandra.schema.KeyspaceParams; +import org.apache.cassandra.service.accord.AccordTestUtils; +import org.apache.cassandra.service.accord.serializers.TableMetadatasAndKeys; +import org.apache.cassandra.service.accord.serializers.Version; + +import static org.apache.cassandra.cql3.statements.schema.CreateTableStatement.parse; + +public class AccordUpdateTest +{ + @BeforeClass + public static void setupClass() + { + SchemaLoader.prepareServer(); + SchemaLoader.createKeyspace("ks", KeyspaceParams.simple(1), + parse("CREATE TABLE tbl (k int, c int, v int, primary key (k, c)) WITH transactional_mode='full'", "ks")); + + } + + @Test + public void predicateSerializer() throws IOException + { + Txn txn = AccordTestUtils.createTxn(0, 0); + TxnUpdate update = (TxnUpdate) txn.update(); + TableMetadatasAndKeys tablesAndKeys = new TableMetadatasAndKeys(update.tables, update.keys()); + for (Version version : Version.V1.greaterThanOrEqual()) + Serializers.testSerde(AccordUpdate.serializer, update, tablesAndKeys, version); + } +} \ No newline at end of file diff --git a/test/unit/org/apache/cassandra/service/accord/txn/TxnConditionTest.java b/test/unit/org/apache/cassandra/service/accord/txn/TxnConditionTest.java new file mode 100644 index 000000000000..1cb1199f7f1e --- /dev/null +++ b/test/unit/org/apache/cassandra/service/accord/txn/TxnConditionTest.java @@ -0,0 +1,189 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.service.accord.txn; + +import java.nio.ByteBuffer; +import java.nio.charset.CharacterCodingException; +import java.util.HashMap; +import java.util.Map; +import java.util.Objects; +import javax.annotation.Nullable; + +import org.junit.Test; + +import accord.utils.Gen; +import accord.utils.Gens; +import org.apache.cassandra.cql3.conditions.ColumnCondition; +import org.apache.cassandra.cql3.conditions.ColumnConditionTest; +import org.apache.cassandra.db.Clustering; +import org.apache.cassandra.db.marshal.Int32Type; +import org.apache.cassandra.db.rows.CellPath; +import org.apache.cassandra.io.Serializers; +import org.apache.cassandra.io.util.DataOutputBuffer; +import org.apache.cassandra.schema.ColumnMetadata; +import org.apache.cassandra.schema.MockSchema; +import org.apache.cassandra.schema.Schema; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.service.accord.serializers.TableMetadatas; +import org.apache.cassandra.service.accord.serializers.Version; +import org.apache.cassandra.transport.ProtocolVersion; +import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.CassandraGenerators; +import org.apache.cassandra.utils.Generators; + +import static accord.utils.Property.qt; + +//TOOD (maintaince): rather than copy the condition supported kinds, maybe references directly from the type? +public class TxnConditionTest +{ + private static final SchemaProvider SCHEMA = new SchemaProvider(); + static + { + // ColumnMetadata serializer only stores the ks/table/name and uses Schema to load it + Schema.instance = SCHEMA; + } + + private static Gen BOOLEAN_KIND_GEN = Gens.pick(TxnCondition.Kind.AND, TxnCondition.Kind.OR); + private static Gen EXISTS_KIND_GEN = Gens.pick(TxnCondition.Kind.IS_NOT_NULL, TxnCondition.Kind.IS_NULL); + private static Gen VALUE_KIND_GEN = Gens.pick(TxnCondition.Kind.EQUAL, TxnCondition.Kind.NOT_EQUAL, + TxnCondition.Kind.GREATER_THAN, TxnCondition.Kind.GREATER_THAN_OR_EQUAL, + TxnCondition.Kind.LESS_THAN, TxnCondition.Kind.LESS_THAN_OR_EQUAL); + private static Gen PROTOCOL_VERSION_GEN = Gens.enums().all(ProtocolVersion.class); + private static Gen COLUM_METADATA_GEN = Generators.toGen(CassandraGenerators.columnMetadataGen()).map(cm -> { + SCHEMA.add(cm); + return cm; + }); + private static Gen BYTES_GEN = Generators.toGen(Generators.directAndHeapBytes(0, 10)); + private static Gen TXN_REF_GEN = rs -> { + { + ColumnMetadata cm = COLUM_METADATA_GEN.next(rs); + TableMetadata.Builder builder = TableMetadata.builder("", "", TableId.generate()) + .addColumn(cm); + if (!cm.isPartitionKey()) + builder.addPartitionKeyColumn(cm.name.toString().equals("_") ? "__" : "_", Int32Type.instance); + TableMetadata tm = builder.build(); + cm = tm.getColumn(cm.name); + return rs.nextBoolean() ? new TxnReference(rs.nextInt(0, Integer.MAX_VALUE), cm, tm) + : new TxnReference(rs.nextInt(0, Integer.MAX_VALUE), tm, cm, CellPath.create(BYTES_GEN.next(rs))); + } + }; + private static Gen> CLUSTERING_GEN = Generators.toGen(CassandraGenerators.CLUSTERING_GEN); + private static Gen BOUND_GEN = ColumnConditionTest.boundGen().map(b -> { + SCHEMA.add(b.column); + return b; + }); + + @Test + public void serde() + { + DataOutputBuffer output = new DataOutputBuffer(); + qt().forAll(txnConditionGen()).check(condition -> { + TableMetadatas.Collector collector = new TableMetadatas.Collector(); + condition.collect(collector); + TableMetadatas tables = collector.build(); + for (Version version : Version.V1.greaterThanOrEqual()) + Serializers.testSerde(output, TxnCondition.serializer, condition, tables, version); + SCHEMA.clear(); + }); + } + + private Gen txnConditionGen() + { + return rs -> { + switch (rs.nextInt(1, 5)) + { + case 0: return TxnCondition.none(); + case 1: return new TxnCondition.Exists(TXN_REF_GEN.next(rs), EXISTS_KIND_GEN.next(rs)); + case 2: return new TxnCondition.Value(TXN_REF_GEN.next(rs), VALUE_KIND_GEN.next(rs), BYTES_GEN.next(rs), PROTOCOL_VERSION_GEN.next(rs)); + case 3: return new TxnCondition.ColumnConditionsAdapter(CLUSTERING_GEN.next(rs), Gens.lists(BOUND_GEN).ofSizeBetween(0, 3).next(rs)); + case 4: return new TxnCondition.BooleanGroup(BOOLEAN_KIND_GEN.next(rs), Gens.lists(txnConditionGen()).ofSizeBetween(0, 3).next(rs)); + default: throw new AssertionError(); + } + }; + } + + private static class SchemaProvider extends MockSchema.MockSchemaProvider + { + private final class Key + { + private final String keyspace, table; + private final ByteBuffer name; + + private Key(String keyspace, String table, ByteBuffer name) + { + this.keyspace = keyspace; + this.table = table; + this.name = name; + } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + Key key = (Key) o; + return keyspace.equals(key.keyspace) && table.equals(key.table) && name.equals(key.name); + } + + @Override + public int hashCode() + { + return Objects.hash(keyspace, table, name); + } + + @Override + public String toString() + { + try + { + return keyspace + "." + table + "/" + ByteBufferUtil.string(name); + } + catch (CharacterCodingException e) + { + throw new RuntimeException(e); + } + } + } + private final Map columns = new HashMap<>(); + + public void add(ColumnMetadata cm) + { + columns.put(new Key(cm.ksName, cm.cfName, cm.name.bytes), cm); + } + + public void clear() + { + columns.clear(); + } + + @Nullable + @Override + public ColumnMetadata getColumnMetadata(String keyspace, String table, ByteBuffer name) + { + Key key = new Key(keyspace, table, name); + ColumnMetadata match = columns.get(key); + if (match == null) + { + throw new AssertionError("Unable to find ColumnMetadata for " + key + "; known columns are " + columns.keySet()); + } + return match; + } + } +} \ No newline at end of file diff --git a/test/unit/org/apache/cassandra/service/paxos/AbstractPaxosRepairTest.java b/test/unit/org/apache/cassandra/service/paxos/AbstractPaxosRepairTest.java index 9721879d56b0..b91604792d9d 100644 --- a/test/unit/org/apache/cassandra/service/paxos/AbstractPaxosRepairTest.java +++ b/test/unit/org/apache/cassandra/service/paxos/AbstractPaxosRepairTest.java @@ -51,7 +51,7 @@ private static class PaxosTestRepair extends AbstractPaxosRepair { public PaxosTestRepair() { - super(Murmur3Partitioner.instance.decorateKey(ByteBufferUtil.bytes(1)), null); + super(Murmur3Partitioner.instance.decorateKey(ByteBufferUtil.bytes(1)), null, -1); } public State restart(State state, long waitUntil) diff --git a/test/unit/org/apache/cassandra/service/paxos/ContentionStrategyTest.java b/test/unit/org/apache/cassandra/service/paxos/ContentionStrategyTest.java deleted file mode 100644 index 8b67c425b1d1..000000000000 --- a/test/unit/org/apache/cassandra/service/paxos/ContentionStrategyTest.java +++ /dev/null @@ -1,466 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.cassandra.service.paxos; - -import java.util.List; -import java.util.Random; -import java.util.concurrent.ThreadLocalRandom; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicReference; -import java.util.function.BiFunction; -import java.util.function.Consumer; -import java.util.function.DoubleSupplier; -import java.util.function.LongBinaryOperator; - -import com.google.common.collect.ImmutableList; -import org.junit.Assert; -import org.junit.Test; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import net.nicoulaj.compilecommand.annotations.Inline; -import org.apache.cassandra.config.DatabaseDescriptor; - -import static org.apache.cassandra.service.paxos.ContentionStrategy.*; -import static org.apache.cassandra.service.paxos.ContentionStrategy.WaitRandomizerFactory.*; -import static org.apache.cassandra.service.paxos.ContentionStrategyTest.WaitRandomizerType.*; - -public class ContentionStrategyTest -{ - private static final Logger logger = LoggerFactory.getLogger(ContentionStrategyTest.class); - - static - { - DatabaseDescriptor.daemonInitialization(); - } - - private static final long MAX = maxQueryTimeoutMicros()/2; - - private static final WaitParseValidator DEFAULT_WAIT_RANDOMIZER_VALIDATOR = new WaitParseValidator(defaultWaitRandomizer(), QEXP, 1.5); - private static final BoundParseValidator DEFAULT_MIN_VALIDATOR = new BoundParseValidator(defaultMinWait(), true, assertBound(0, MAX, 0, selectors.maxReadWrite(0f).getClass(), 0.50, 0, modifiers.multiply(0f).getClass(), 0.66)); - private static final BoundParseValidator DEFAULT_MAX_VALIDATOR = new BoundParseValidator(defaultMaxWait(), false, assertBound(10000, 100000, 100000, selectors.maxReadWrite(0f).getClass(), 0.95, 0, modifiers.multiplyByAttemptsExp(0f).getClass(), 1.8)); - private static final BoundParseValidator DEFAULT_MIN_DELTA_VALIDATOR = new BoundParseValidator(defaultMinDelta(), true, assertBound(5000, MAX, 5000, selectors.maxReadWrite(0f).getClass(), 0.50, 0, modifiers.multiply(0f).getClass(), 0.5)); - - private static List VALIDATE = ImmutableList.of( - new BoundParseValidator("p95(rw)", false, assertBound(0, MAX, MAX, selectors.maxReadWrite(0f).getClass(), 0.95, 0, modifiers.identity().getClass(), 1)), - new BoundParseValidator("5ms<=p50(rw)*0.66", false, assertBound(5000, MAX, MAX, selectors.maxReadWrite(0f).getClass(), 0.50, 0, modifiers.multiply(0).getClass(), 0.66)), - new BoundParseValidator("5us <= p50(r)*1.66*attempts", true, assertBound(5, MAX, 5, selectors.read(0f).getClass(), 0.50, 0, modifiers.multiplyByAttempts(0f).getClass(), 1.66)), - new BoundParseValidator("0<=p50(w)*0.66^attempts", true, assertBound(0, MAX, 0, selectors.write(0f).getClass(), 0.50, 0, modifiers.multiplyByAttemptsExp(0f).getClass(), 0.66)), - new BoundParseValidator("125us", true, assertBound(125, 125, 125, selectors.constant(0).getClass(), 0.0f, 125, modifiers.identity().getClass(), 1)), - new BoundParseValidator("5us <= p95(r)*1.8^attempts <= 100us", true, assertBound(5, 100, 5, selectors.read(0f).getClass(), 0.95, 0, modifiers.multiplyByAttemptsExp(0f).getClass(), 1.8)), - DEFAULT_MIN_VALIDATOR, DEFAULT_MAX_VALIDATOR, DEFAULT_MIN_DELTA_VALIDATOR - ); - - private static List VALIDATE_RANDOMIZER = ImmutableList.of( - new WaitParseValidator("quantizedexponential(0.5)", QEXP, 0.5), - new WaitParseValidator("exponential(2.5)", EXP, 2.5), - new WaitParseValidator("exp(10)", EXP, 10), - new WaitParseValidator("uniform", UNIFORM, 0), - DEFAULT_WAIT_RANDOMIZER_VALIDATOR - ); - - static class BoundParseValidator - { - final String spec; - final boolean isMin; - final Consumer validator; - - BoundParseValidator(String spec, boolean isMin, Consumer validator) - { - this.spec = spec; - this.isMin = isMin; - this.validator = validator; - } - - void validate(Bound bound) - { - validator.accept(bound); - } - } - - enum WaitRandomizerType - { - UNIFORM(Uniform.class, (p, f) -> f.uniform()), - EXP(Exponential.class, (p, f) -> f.exponential(p)), - QEXP(QuantizedExponential.class, (p, f) -> f.quantizedExponential(p)); - - final Class clazz; - final BiFunction getter; - - WaitRandomizerType(Class clazz, BiFunction getter) - { - this.clazz = clazz; - this.getter = getter; - } - } - - - static class WaitParseValidator - { - final String spec; - final WaitRandomizerType type; - final double power; - - WaitParseValidator(String spec, WaitRandomizerType type, double power) - { - this.spec = spec; - this.type = type; - this.power = power; - } - - void validate(WaitRandomizer randomizer) - { - Assert.assertSame(type.clazz, randomizer.getClass()); - if (AbstractExponential.class.isAssignableFrom(type.clazz)) - Assert.assertEquals(power, ((AbstractExponential) randomizer).power, 0.00001); - } - } - - private static class WaitRandomizerOutputValidator - { - static void validate(WaitRandomizerType type, long seed, int trials, int samplesPerTrial) - { - Random random = new Random(seed); - WaitRandomizer randomizer = type.getter.apply(2d, new WaitRandomizerFactory() - { - @Override public LongBinaryOperator uniformLongSupplier() { return (min, max) -> min + random.nextInt((int) (max - min)); } - @Override public DoubleSupplier uniformDoubleSupplier() { return random::nextDouble; } - }); - - for (int i = 0 ; i < trials ; ++i) - { - int min = random.nextInt(1 << 20); - int max = min + 1024 + random.nextInt(1 << 20); - double minMean = minMean(type, min, max); - double maxMean = maxMean(type, min, max); - double sampleMean = sampleMean(samplesPerTrial, min, max, randomizer); - Assert.assertTrue(minMean <= sampleMean); - Assert.assertTrue(maxMean >= sampleMean); - } - } - - private static double minMean(WaitRandomizerType type, int min, int max) - { - switch (type) - { - case UNIFORM: return min + (max - min) * (4d/10); - case EXP: case QEXP: return min + (max - min) * (6d/10); - default: throw new IllegalStateException(); - } - } - - private static double maxMean(WaitRandomizerType type, int min, int max) - { - switch (type) - { - case UNIFORM: return min + (max - min) * (6d/10); - case EXP: case QEXP: return min + (max - min) * (8d/10); - default: throw new IllegalStateException(); - } - } - - private static double sampleMean(int samples, int min, int max, WaitRandomizer randomizer) - { - double sum = 0; - int attempts = 1; - for (int i = 0 ; i < samples ; ++i) - { - long wait = randomizer.wait(min, max, attempts = (attempts & 15) + 1); - Assert.assertTrue(wait >= min); - Assert.assertTrue(wait <= max); - sum += wait; - } - double mean = sum / samples; - Assert.assertTrue(mean >= min); - Assert.assertTrue(mean <= max); - return mean; - } - } - - private static Consumer assertBound( - long min, long max, long onFailure, - Class selectorClass, - double selectorPercentile, - long selectorConst, - Class modifierClass, - double modifierVal - ) - { - return bound -> { - Assert.assertEquals(min, bound.min); - Assert.assertEquals(max, bound.max); - Assert.assertEquals(onFailure, bound.onFailure); - Assert.assertSame(selectorClass, bound.selector.getClass()); - if (selectorClass == selectors.constant(0).getClass()) - { - LatencySupplier fail = v -> { throw new UnsupportedOperationException(); }; - Assert.assertEquals(selectorConst, bound.selector.select(fail, fail)); - } - else - { - AtomicReference percentile = new AtomicReference<>(); - LatencySupplier set = v -> { percentile.set(v); return 0; }; - bound.selector.select(set, set); - Assert.assertNotNull(percentile.get()); - Assert.assertEquals(selectorPercentile, percentile.get(), 0.00001); - } - Assert.assertSame(modifierClass, bound.modifier.getClass()); - Assert.assertEquals(1000000L * modifierVal, bound.modifier.modify(1000000, 1), 0.00001); - }; - } - - private static void assertParseFailure(String spec) - { - - try - { - Bound bound = parseBound(spec, false); - Assert.fail("expected parse failure, but got " + bound); - } - catch (IllegalArgumentException e) - { - // expected - } - } - - @Test - public void strategyParseTest() - { - for (BoundParseValidator min : VALIDATE.stream().filter(v -> v.isMin).toArray(BoundParseValidator[]::new)) - { - for (BoundParseValidator max : VALIDATE.stream().filter(v -> !v.isMin).toArray(BoundParseValidator[]::new)) - { - for (BoundParseValidator minDelta : VALIDATE.stream().filter(v -> v.isMin).toArray(BoundParseValidator[]::new)) - { - for (WaitParseValidator random : VALIDATE_RANDOMIZER) - { - { - ParsedStrategy parsed = parseStrategy("min=" + min.spec + ",max=" + max.spec + ",delta=" + minDelta.spec + ",random=" + random.spec); - Assert.assertEquals(parsed.min, min.spec); - min.validate(parsed.strategy.min); - Assert.assertEquals(parsed.max, max.spec); - max.validate(parsed.strategy.max); - Assert.assertEquals(parsed.minDelta, minDelta.spec); - minDelta.validate(parsed.strategy.minDelta); - Assert.assertEquals(parsed.waitRandomizer, random.spec); - random.validate(parsed.strategy.waitRandomizer); - } - ParsedStrategy parsed = parseStrategy("random=" + random.spec); - Assert.assertEquals(parsed.min, DEFAULT_MIN_VALIDATOR.spec); - DEFAULT_MIN_VALIDATOR.validate(parsed.strategy.min); - Assert.assertEquals(parsed.max, DEFAULT_MAX_VALIDATOR.spec); - DEFAULT_MAX_VALIDATOR.validate(parsed.strategy.max); - Assert.assertEquals(parsed.minDelta, DEFAULT_MIN_DELTA_VALIDATOR.spec); - DEFAULT_MIN_DELTA_VALIDATOR.validate(parsed.strategy.minDelta); - Assert.assertEquals(parsed.waitRandomizer, random.spec); - random.validate(parsed.strategy.waitRandomizer); - } - ParsedStrategy parsed = parseStrategy("delta=" + minDelta.spec); - Assert.assertEquals(parsed.min, DEFAULT_MIN_VALIDATOR.spec); - DEFAULT_MIN_VALIDATOR.validate(parsed.strategy.min); - Assert.assertEquals(parsed.max, DEFAULT_MAX_VALIDATOR.spec); - DEFAULT_MAX_VALIDATOR.validate(parsed.strategy.max); - Assert.assertEquals(parsed.minDelta, minDelta.spec); - minDelta.validate(parsed.strategy.minDelta); - } - ParsedStrategy parsed = parseStrategy("max=" + max.spec); - Assert.assertEquals(parsed.min, DEFAULT_MIN_VALIDATOR.spec); - DEFAULT_MIN_VALIDATOR.validate(parsed.strategy.min); - Assert.assertEquals(parsed.max, max.spec); - max.validate(parsed.strategy.max); - Assert.assertEquals(parsed.minDelta, DEFAULT_MIN_DELTA_VALIDATOR.spec); - DEFAULT_MIN_DELTA_VALIDATOR.validate(parsed.strategy.minDelta); - } - ParsedStrategy parsed = parseStrategy("min=" + min.spec); - Assert.assertEquals(parsed.min, min.spec); - min.validate(parsed.strategy.min); - Assert.assertEquals(parsed.max, DEFAULT_MAX_VALIDATOR.spec); - DEFAULT_MAX_VALIDATOR.validate(parsed.strategy.max); - Assert.assertEquals(parsed.minDelta, DEFAULT_MIN_DELTA_VALIDATOR.spec); - DEFAULT_MIN_DELTA_VALIDATOR.validate(parsed.strategy.minDelta); - } - } - - @Test - public void testParseRoundTrip() - { - LatencySelectorFactory selectorFactory = new LatencySelectorFactory() - { - LatencySelectorFactory delegate = ContentionStrategy.selectors; - public LatencySelector constant(long latency) { return selector(delegate.constant(latency), String.format("%dms", latency)); } - public LatencySelector read(double percentile) { return selector(delegate.read(percentile), String.format("p%d(r)", (int) (percentile * 100))); } - public LatencySelector write(double percentile) { return selector(delegate.write(percentile), String.format("p%d(w)", (int) (percentile * 100))); } - public LatencySelector maxReadWrite(double percentile) { return selector(delegate.maxReadWrite(percentile), String.format("p%d(rw)", (int) percentile * 100)); } - - private LatencySelector selector(LatencySelector selector, String str) { - return new LatencySelector() - { - public long select(LatencySupplier read, LatencySupplier write) - { - return selector.select(read, write); - } - - public String toString() - { - return str; - } - }; - } - }; - - LatencyModifierFactory modifierFactory = new LatencyModifierFactory() - { - LatencyModifierFactory delegate = ContentionStrategy.modifiers; - public LatencyModifier identity() { return modifier(delegate.identity(), ""); } - public LatencyModifier multiply(double constant) { return modifier(delegate.multiply(constant), String.format(" * %.2f", constant)); } - public LatencyModifier multiplyByAttempts(double multiply) { return modifier(delegate.multiplyByAttempts(multiply), String.format(" * %.2f * attempts", multiply)); } - public LatencyModifier multiplyByAttemptsExp(double base) { return modifier(delegate.multiplyByAttemptsExp(base), String.format(" * %.2f ^ attempts", base)); } - - private LatencyModifier modifier(LatencyModifier modifier, String str) { - return new LatencyModifier() - { - @Inline - public long modify(long latency, int attempts) - { - return modifier.modify(latency, attempts); - } - - public String toString() - { - return str; - } - }; - } - }; - - LatencyModifier[] latencyModifiers = new LatencyModifier[]{ - modifierFactory.multiply(0.5), - modifierFactory.multiplyByAttempts(0.5), - modifierFactory.multiplyByAttemptsExp(0.5) - }; - - LatencySelector[] latencySelectors = new LatencySelector[]{ - selectorFactory.read(0.5), - selectorFactory.write(0.5), - selectorFactory.maxReadWrite(0.99) - }; - - for (boolean min : new boolean[] { true, false}) - { - String left = min ? "10ms <= " : ""; - for (boolean max : new boolean[] { true, false}) - { - String right = max ? " <= 10ms" : ""; - - for (LatencySelector selector : latencySelectors) - { - for (LatencyModifier modifier : latencyModifiers) - { - String mid = String.format("%s%s", selector, modifier); - String input = left + mid + right; - Bound bound = parseBound(input, false, selectorFactory, modifierFactory); - Assert.assertTrue(String.format("Bound: %d" , bound.min), !min || bound.min == 10000); - Assert.assertTrue(String.format("Bound: %d" , bound.max), !max || bound.max == 10000); - Assert.assertEquals(selector.toString(), bound.selector.toString()); - Assert.assertEquals(modifier.toString(), bound.modifier.toString()); - } - } - } - } - } - - @Test - public void boundParseTest() - { - VALIDATE.forEach(v -> v.validate(parseBound(v.spec, v.isMin))); - } - - @Test - public void waitRandomizerParseTest() - { - VALIDATE_RANDOMIZER.forEach(v -> v.validate(parseWaitRandomizer(v.spec))); - } - - @Test - public void waitRandomizerSampleTest() - { - waitRandomizerSampleTest(2); - } - - private void waitRandomizerSampleTest(int count) - { - while (count-- > 0) - { - long seed = ThreadLocalRandom.current().nextLong(); - logger.info("Seed {}", seed); - for (WaitRandomizerType type : WaitRandomizerType.values()) - { - WaitRandomizerOutputValidator.validate(type, seed, 100, 1000000); - } - } - } - - @Test - public void boundParseFailureTest() - { - assertParseFailure("10ms <= p95(r) <= 5ms"); - assertParseFailure("10 <= p95(r)"); - assertParseFailure("10 <= 20 <= 30"); - assertParseFailure("p95(r) < 5"); - assertParseFailure("p95(x)"); - assertParseFailure("p95()"); - assertParseFailure("p95"); - assertParseFailure("p50(rw)+0.66"); - } - - @Test - public void testBackoffTime() - { - ContentionStrategy strategy = parseStrategy("min=0ms,max=100ms,random=uniform").strategy; - double total = 0; - int count = 100000; - for (int i = 0 ; i < count ; ++i) - { - long now = System.nanoTime(); - long waitUntil = strategy.computeWaitUntilForContention(1, null, null, null, null); - long waitLength = Math.max(waitUntil - now, 0); - total += waitLength; - } - Assert.assertTrue(Math.abs(TimeUnit.MILLISECONDS.toNanos(50) - (total / count)) < TimeUnit.MILLISECONDS.toNanos(1L)); - } - - @Test - public void testBackoffTimeElapsed() - { - ContentionStrategy strategy = parseStrategy("min=0ms,max=10ms,random=uniform").strategy; - double total = 0; - int count = 1000; - for (int i = 0 ; i < count ; ++i) - { - long start = System.nanoTime(); - strategy.doWaitForContention(Long.MAX_VALUE, 1, null, null, null, null); - long end = System.nanoTime(); - total += end - start; - } - // make sure we have slept at least 4ms on average, given a mean wait time of 5ms - double avg = total / count; - double nanos = avg - TimeUnit.MILLISECONDS.toNanos(4); - Assert.assertTrue(nanos > 0); - } -} diff --git a/test/unit/org/apache/cassandra/service/paxos/PaxosVerbHandlerOutOfRangeTest.java b/test/unit/org/apache/cassandra/service/paxos/PaxosVerbHandlerOutOfRangeTest.java index 1ed2bef52ed9..a788c0ff9ddf 100644 --- a/test/unit/org/apache/cassandra/service/paxos/PaxosVerbHandlerOutOfRangeTest.java +++ b/test/unit/org/apache/cassandra/service/paxos/PaxosVerbHandlerOutOfRangeTest.java @@ -33,7 +33,7 @@ import org.apache.cassandra.db.DecoratedKey; import org.apache.cassandra.db.Keyspace; import org.apache.cassandra.distributed.test.log.ClusterMetadataTestHelper; -import org.apache.cassandra.exceptions.RequestFailureReason; +import org.apache.cassandra.exceptions.RequestFailure; import org.apache.cassandra.metrics.StorageMetrics; import org.apache.cassandra.net.Message; import org.apache.cassandra.net.MessagingService; @@ -48,9 +48,15 @@ import org.apache.cassandra.tcm.membership.NodeState; import org.apache.cassandra.utils.ByteBufferUtil; -import static org.apache.cassandra.distributed.test.log.ClusterMetadataTestHelper.*; import static org.junit.Assert.assertEquals; +import static org.apache.cassandra.distributed.test.log.ClusterMetadataTestHelper.MessageDelivery; +import static org.apache.cassandra.distributed.test.log.ClusterMetadataTestHelper.broadcastAddress; +import static org.apache.cassandra.distributed.test.log.ClusterMetadataTestHelper.bytesToken; +import static org.apache.cassandra.distributed.test.log.ClusterMetadataTestHelper.node1; +import static org.apache.cassandra.distributed.test.log.ClusterMetadataTestHelper.randomInt; +import static org.apache.cassandra.distributed.test.log.ClusterMetadataTestHelper.registerOutgoingMessageSink; + public class PaxosVerbHandlerOutOfRangeTest // PaxosV1 out of range tests - V2 implements OOTR checks at the protocol level { // For the purposes of this testing, the details of the Commit don't really matter @@ -175,7 +181,7 @@ private void getAndVerifyResponse(ListenableFuture messageSink, MessageDelivery response = messageSink.get(100, TimeUnit.MILLISECONDS); assertEquals(verb, response.message.verb()); Assert.assertEquals(broadcastAddress, response.message.from()); - assertEquals(isOutOfRange, response.message.payload instanceof RequestFailureReason); + assertEquals(isOutOfRange, response.message.payload instanceof RequestFailure); assertEquals(messageId, response.message.id()); Assert.assertEquals(node1, response.to); assertEquals(startingTotalMetricCount + (isOutOfRange ? 1 : 0), StorageMetrics.totalOpsForInvalidToken.getCount()); diff --git a/test/unit/org/apache/cassandra/service/paxos/cleanup/PaxosTableRepairsTest.java b/test/unit/org/apache/cassandra/service/paxos/cleanup/PaxosTableRepairsTest.java index 22441fec4253..fe21b820c5f9 100644 --- a/test/unit/org/apache/cassandra/service/paxos/cleanup/PaxosTableRepairsTest.java +++ b/test/unit/org/apache/cassandra/service/paxos/cleanup/PaxosTableRepairsTest.java @@ -51,7 +51,7 @@ private static class MockRepair extends AbstractPaxosRepair public MockRepair(DecoratedKey key) { - super(key, null); + super(key, null, -1); } public State restart(State state, long waitUntil) diff --git a/test/unit/org/apache/cassandra/service/paxos/uncommitted/PaxosBallotTrackerTest.java b/test/unit/org/apache/cassandra/service/paxos/uncommitted/PaxosBallotTrackerTest.java index 1f9db0851f0f..c9ec812f10a0 100644 --- a/test/unit/org/apache/cassandra/service/paxos/uncommitted/PaxosBallotTrackerTest.java +++ b/test/unit/org/apache/cassandra/service/paxos/uncommitted/PaxosBallotTrackerTest.java @@ -20,23 +20,26 @@ import java.io.IOException; -import org.apache.cassandra.cql3.statements.schema.CreateTableStatement; -import org.apache.cassandra.schema.TableMetadata; -import org.apache.cassandra.service.paxos.Ballot; -import org.apache.cassandra.service.paxos.PaxosState.MaybePromise.Outcome; -import org.junit.*; +import org.junit.Assert; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.cassandra.SchemaLoader; import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.cql3.statements.schema.CreateTableStatement; import org.apache.cassandra.db.ConsistencyLevel; import org.apache.cassandra.db.DecoratedKey; import org.apache.cassandra.db.partitions.PartitionUpdate; import org.apache.cassandra.schema.KeyspaceParams; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.service.paxos.Ballot; import org.apache.cassandra.service.paxos.Commit; import org.apache.cassandra.service.paxos.Paxos; import org.apache.cassandra.service.paxos.PaxosState; +import org.apache.cassandra.service.paxos.PaxosState.MaybePromise.Outcome; import org.apache.cassandra.utils.ByteBufferUtil; import static org.apache.cassandra.service.paxos.PaxosState.MaybePromise.Outcome.REJECT; @@ -126,7 +129,7 @@ private static void testHighBound(Stage stage, Order order) case PROPOSE: try (PaxosState state = PaxosState.get(commit)) { - state.acceptIfLatest(commit); + state.acceptIfLatest(commit, false); } break; case COMMIT: @@ -220,7 +223,7 @@ public void lowBoundAccept() throws IOException DecoratedKey key = dk(1); try (PaxosState state = PaxosState.get(key, cfm)) { - Ballot result = state.acceptIfLatest(new Commit.Proposal(ballot2, PartitionUpdate.emptyUpdate(cfm, key))); + Ballot result = state.acceptIfLatest(new Commit.Proposal(ballot2, PartitionUpdate.emptyUpdate(cfm, key)), false).supersededBy; Assert.assertNull(result); } @@ -228,7 +231,7 @@ public void lowBoundAccept() throws IOException ballotTracker.updateLowBound(ballot4); try (PaxosState state = PaxosState.get(key, cfm)) { - Ballot result = state.acceptIfLatest(new Commit.Proposal(ballot3, PartitionUpdate.emptyUpdate(cfm, key))); + Ballot result = state.acceptIfLatest(new Commit.Proposal(ballot3, PartitionUpdate.emptyUpdate(cfm, key)), false).supersededBy; Assert.assertEquals(ballot4, result); } } diff --git a/test/unit/org/apache/cassandra/service/paxos/uncommitted/PaxosUncommittedTests.java b/test/unit/org/apache/cassandra/service/paxos/uncommitted/PaxosUncommittedTests.java index 2804508e64a0..654c10b6d0d9 100644 --- a/test/unit/org/apache/cassandra/service/paxos/uncommitted/PaxosUncommittedTests.java +++ b/test/unit/org/apache/cassandra/service/paxos/uncommitted/PaxosUncommittedTests.java @@ -18,14 +18,23 @@ package org.apache.cassandra.service.paxos.uncommitted; -import java.util.*; +import java.util.Collection; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; import com.google.common.collect.Lists; import org.apache.cassandra.config.DatabaseDescriptor; -import org.apache.cassandra.db.*; +import org.apache.cassandra.db.ColumnFamilyStore; +import org.apache.cassandra.db.DecoratedKey; +import org.apache.cassandra.db.Keyspace; +import org.apache.cassandra.db.SystemKeyspace; import org.apache.cassandra.db.commitlog.CommitLog; -import org.apache.cassandra.dht.*; +import org.apache.cassandra.dht.ByteOrderedPartitioner; +import org.apache.cassandra.dht.IPartitioner; +import org.apache.cassandra.dht.Range; +import org.apache.cassandra.dht.Token; import org.apache.cassandra.schema.TableMetadata; import org.apache.cassandra.service.paxos.Ballot; import org.apache.cassandra.utils.ByteBufferUtil; @@ -42,7 +51,7 @@ class PaxosUncommittedTests CommitLog.instance.start(); } - static final IPartitioner PARTITIONER = new ByteOrderedPartitioner(); + static final IPartitioner PARTITIONER = ByteOrderedPartitioner.instance; static final Token MIN_TOKEN = PARTITIONER.getMinimumToken(); static final Range FULL_RANGE = new Range<>(MIN_TOKEN, MIN_TOKEN); static final Collection> ALL_RANGES = Collections.singleton(FULL_RANGE); diff --git a/test/unit/org/apache/cassandra/service/paxos/uncommitted/PaxosUncommittedTrackerIntegrationTest.java b/test/unit/org/apache/cassandra/service/paxos/uncommitted/PaxosUncommittedTrackerIntegrationTest.java index 8c3dd250f5b3..dc8fafa96640 100644 --- a/test/unit/org/apache/cassandra/service/paxos/uncommitted/PaxosUncommittedTrackerIntegrationTest.java +++ b/test/unit/org/apache/cassandra/service/paxos/uncommitted/PaxosUncommittedTrackerIntegrationTest.java @@ -20,7 +20,10 @@ import com.google.common.collect.Iterators; import com.google.common.collect.Lists; -import org.junit.*; +import org.junit.Assert; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Test; import org.apache.cassandra.SchemaLoader; import org.apache.cassandra.Util; @@ -36,7 +39,7 @@ import static org.apache.cassandra.service.paxos.Ballot.Flag.NONE; import static org.apache.cassandra.service.paxos.BallotGenerator.Global.nextBallot; -import static org.apache.cassandra.service.paxos.Commit.*; +import static org.apache.cassandra.service.paxos.Commit.Proposal; import static org.apache.cassandra.service.paxos.uncommitted.PaxosUncommittedTests.ALL_RANGES; import static org.apache.cassandra.service.paxos.uncommitted.PaxosUncommittedTests.PAXOS_CFS; @@ -97,7 +100,7 @@ public void commitCycle() try (PaxosState state = PaxosState.get(key, cfm)) { - state.acceptIfLatest(proposal); + state.acceptIfLatest(proposal, false); } try (CloseableIterator iterator = tracker.uncommittedKeyIterator(cfm.id, ALL_RANGES)) @@ -124,7 +127,7 @@ public void inMemoryCommit() try (PaxosState state = PaxosState.get(key, cfm)) { state.promiseIfNewer(proposal.ballot, true); - state.acceptIfLatest(proposal); + state.acceptIfLatest(proposal, false); } try (CloseableIterator iterator = tracker.uncommittedKeyIterator(cfm.id, ALL_RANGES)) { diff --git a/test/unit/org/apache/cassandra/service/reads/AbstractReadResponseTest.java b/test/unit/org/apache/cassandra/service/reads/AbstractReadResponseTest.java index 7d22439d8d5c..3c0450d2e817 100644 --- a/test/unit/org/apache/cassandra/service/reads/AbstractReadResponseTest.java +++ b/test/unit/org/apache/cassandra/service/reads/AbstractReadResponseTest.java @@ -30,10 +30,12 @@ import org.junit.BeforeClass; import org.junit.Ignore; +import org.apache.cassandra.CassandraTestBase; +import org.apache.cassandra.CassandraTestBase.DDDaemonInitialization; +import org.apache.cassandra.CassandraTestBase.UseMurmur3Partitioner; import org.apache.cassandra.SchemaLoader; import org.apache.cassandra.ServerTestUtils; import org.apache.cassandra.Util; -import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.cql3.ColumnIdentifier; import org.apache.cassandra.db.BufferClusteringBound; import org.apache.cassandra.db.BufferClusteringBoundary; @@ -66,7 +68,6 @@ import org.apache.cassandra.db.rows.Rows; import org.apache.cassandra.db.rows.Unfiltered; import org.apache.cassandra.db.rows.UnfilteredRowIterator; -import org.apache.cassandra.dht.Murmur3Partitioner; import org.apache.cassandra.locator.InetAddressAndPort; import org.apache.cassandra.net.Message; import org.apache.cassandra.net.MessagingService; @@ -83,7 +84,9 @@ * Base class for testing various components which deal with read responses */ @Ignore -public abstract class AbstractReadResponseTest +@DDDaemonInitialization +@UseMurmur3Partitioner +public abstract class AbstractReadResponseTest extends CassandraTestBase { public static final String KEYSPACE1 = "DataResolverTest"; public static final String KEYSPACE3 = "DataResolverTest3"; @@ -124,9 +127,6 @@ public abstract class AbstractReadResponseTest @BeforeClass public static void setupClass() throws Throwable { - DatabaseDescriptor.daemonInitialization(); - DatabaseDescriptor.setPartitionerUnsafe(Murmur3Partitioner.instance); - TableMetadata.Builder builder1 = TableMetadata.builder(KEYSPACE1, CF_STANDARD) .addPartitionKeyColumn("key", BytesType.instance) diff --git a/test/unit/org/apache/cassandra/service/reads/DataResolverTest.java b/test/unit/org/apache/cassandra/service/reads/DataResolverTest.java index 9c56f00a8098..062cda0b9661 100644 --- a/test/unit/org/apache/cassandra/service/reads/DataResolverTest.java +++ b/test/unit/org/apache/cassandra/service/reads/DataResolverTest.java @@ -25,17 +25,18 @@ import com.google.common.collect.Iterators; import com.google.common.collect.Sets; - import org.junit.Assert; import org.junit.Test; import org.apache.cassandra.Util; import org.apache.cassandra.db.Clustering; import org.apache.cassandra.db.ClusteringBound; +import org.apache.cassandra.db.ColumnFamilyStore; import org.apache.cassandra.db.ConsistencyLevel; import org.apache.cassandra.db.DeletionInfo; import org.apache.cassandra.db.DeletionTime; import org.apache.cassandra.db.EmptyIterators; +import org.apache.cassandra.db.Keyspace; import org.apache.cassandra.db.MutableDeletionInfo; import org.apache.cassandra.db.Mutation; import org.apache.cassandra.db.RangeTombstone; @@ -54,8 +55,6 @@ import org.apache.cassandra.db.rows.RangeTombstoneBoundaryMarker; import org.apache.cassandra.db.rows.Row; import org.apache.cassandra.db.rows.RowIterator; -import org.apache.cassandra.db.ColumnFamilyStore; -import org.apache.cassandra.db.Keyspace; import org.apache.cassandra.dht.Murmur3Partitioner; import org.apache.cassandra.dht.Token; import org.apache.cassandra.locator.EndpointsForRange; @@ -66,7 +65,7 @@ import org.apache.cassandra.locator.ReplicaUtils; import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.tcm.Epoch; -import org.apache.cassandra.net.*; +import org.apache.cassandra.net.MessagingService; import org.apache.cassandra.service.reads.repair.ReadRepair; import org.apache.cassandra.service.reads.repair.RepairedDataTracker; import org.apache.cassandra.service.reads.repair.RepairedDataVerifier; @@ -137,7 +136,7 @@ private EndpointsForRange makeReplicas(int num) public void testResolveNewerSingleRow() { EndpointsForRange replicas = makeReplicas(2); - DataResolver resolver = new DataResolver(command, plan(replicas, ALL), readRepair, Dispatcher.RequestTime.forImmediateExecution()); + DataResolver resolver = new DataResolver(ReadCoordinator.DEFAULT, command, plan(replicas, ALL), readRepair, Dispatcher.RequestTime.forImmediateExecution()); InetAddressAndPort peer1 = replicas.get(0).endpoint(); resolver.preprocess(response(command, peer1, iter(new RowUpdateBuilder(cfm, nowInSec, 0L, dk).clustering("1") .add("c1", "v1") @@ -169,7 +168,7 @@ public void testResolveNewerSingleRow() public void testResolveDisjointSingleRow() { EndpointsForRange replicas = makeReplicas(2); - DataResolver resolver = new DataResolver(command, plan(replicas, ALL), readRepair, Dispatcher.RequestTime.forImmediateExecution()); + DataResolver resolver = new DataResolver(ReadCoordinator.DEFAULT, command, plan(replicas, ALL), readRepair, Dispatcher.RequestTime.forImmediateExecution()); InetAddressAndPort peer1 = replicas.get(0).endpoint(); resolver.preprocess(response(command, peer1, iter(new RowUpdateBuilder(cfm, nowInSec, 0L, dk).clustering("1") .add("c1", "v1") @@ -206,7 +205,7 @@ public void testResolveDisjointSingleRow() public void testResolveDisjointMultipleRows() throws UnknownHostException { EndpointsForRange replicas = makeReplicas(2); - DataResolver resolver = new DataResolver(command, plan(replicas, ALL), readRepair, Dispatcher.RequestTime.forImmediateExecution()); + DataResolver resolver = new DataResolver(ReadCoordinator.DEFAULT, command, plan(replicas, ALL), readRepair, Dispatcher.RequestTime.forImmediateExecution()); InetAddressAndPort peer1 = replicas.get(0).endpoint(); resolver.preprocess(response(command, peer1, iter(new RowUpdateBuilder(cfm, nowInSec, 0L, dk).clustering("1") .add("c1", "v1") @@ -253,7 +252,7 @@ public void testResolveDisjointMultipleRows() throws UnknownHostException public void testResolveDisjointMultipleRowsWithRangeTombstones() { EndpointsForRange replicas = makeReplicas(4); - DataResolver resolver = new DataResolver(command, plan(replicas, ALL), readRepair, Dispatcher.RequestTime.forImmediateExecution()); + DataResolver resolver = new DataResolver(ReadCoordinator.DEFAULT, command, plan(replicas, ALL), readRepair, Dispatcher.RequestTime.forImmediateExecution()); RangeTombstone tombstone1 = tombstone("1", "11", 1, nowInSec); RangeTombstone tombstone2 = tombstone("3", "31", 1, nowInSec); @@ -334,7 +333,7 @@ public void testResolveDisjointMultipleRowsWithRangeTombstones() public void testResolveWithOneEmpty() { EndpointsForRange replicas = makeReplicas(2); - DataResolver resolver = new DataResolver(command, plan(replicas, ALL), readRepair, Dispatcher.RequestTime.forImmediateExecution()); + DataResolver resolver = new DataResolver(ReadCoordinator.DEFAULT, command, plan(replicas, ALL), readRepair, Dispatcher.RequestTime.forImmediateExecution()); InetAddressAndPort peer1 = replicas.get(0).endpoint(); resolver.preprocess(response(command, peer1, iter(new RowUpdateBuilder(cfm, nowInSec, 1L, dk).clustering("1") .add("c2", "v2") @@ -365,7 +364,7 @@ public void testResolveWithBothEmpty() { EndpointsForRange replicas = makeReplicas(2); TestableReadRepair readRepair = new TestableReadRepair(command); - DataResolver resolver = new DataResolver(command, plan(replicas, ALL), readRepair, Dispatcher.RequestTime.forImmediateExecution()); + DataResolver resolver = new DataResolver(ReadCoordinator.DEFAULT, command, plan(replicas, ALL), readRepair, Dispatcher.RequestTime.forImmediateExecution()); resolver.preprocess(response(command, replicas.get(0).endpoint(), EmptyIterators.unfilteredPartition(cfm))); resolver.preprocess(response(command, replicas.get(1).endpoint(), EmptyIterators.unfilteredPartition(cfm))); @@ -381,7 +380,7 @@ public void testResolveWithBothEmpty() public void testResolveDeleted() { EndpointsForRange replicas = makeReplicas(2); - DataResolver resolver = new DataResolver(command, plan(replicas, ALL), readRepair, Dispatcher.RequestTime.forImmediateExecution()); + DataResolver resolver = new DataResolver(ReadCoordinator.DEFAULT, command, plan(replicas, ALL), readRepair, Dispatcher.RequestTime.forImmediateExecution()); // one response with columns timestamped before a delete in another response InetAddressAndPort peer1 = replicas.get(0).endpoint(); resolver.preprocess(response(command, peer1, iter(new RowUpdateBuilder(cfm, nowInSec, 0L, dk).clustering("1") @@ -407,7 +406,7 @@ public void testResolveDeleted() public void testResolveMultipleDeleted() { EndpointsForRange replicas = makeReplicas(4); - DataResolver resolver = new DataResolver(command, plan(replicas, ALL), readRepair, Dispatcher.RequestTime.forImmediateExecution()); + DataResolver resolver = new DataResolver(ReadCoordinator.DEFAULT, command, plan(replicas, ALL), readRepair, Dispatcher.RequestTime.forImmediateExecution()); // deletes and columns with interleaved timestamp, with out of order return sequence InetAddressAndPort peer1 = replicas.get(0).endpoint(); resolver.preprocess(response(command, peer1, fullPartitionDelete(cfm, dk, 0, nowInSec))); @@ -492,7 +491,7 @@ public void testResolveRangeTombstonesOnBoundarySameTimestamp() throws UnknownHo private void resolveRangeTombstonesOnBoundary(long timestamp1, long timestamp2) { EndpointsForRange replicas = makeReplicas(2); - DataResolver resolver = new DataResolver(command, plan(replicas, ALL), readRepair, Dispatcher.RequestTime.forImmediateExecution()); + DataResolver resolver = new DataResolver(ReadCoordinator.DEFAULT, command, plan(replicas, ALL), readRepair, Dispatcher.RequestTime.forImmediateExecution()); InetAddressAndPort peer1 = replicas.get(0).endpoint(); InetAddressAndPort peer2 = replicas.get(1).endpoint(); @@ -566,7 +565,7 @@ public void testRepairRangeTombstoneBoundary() throws UnknownHostException */ private void testRepairRangeTombstoneBoundary(EndpointsForRange replicas, int timestamp1, int timestamp2, int timestamp3) throws UnknownHostException { - DataResolver resolver = new DataResolver(command, plan(replicas, ALL), readRepair, Dispatcher.RequestTime.forImmediateExecution()); + DataResolver resolver = new DataResolver(ReadCoordinator.DEFAULT, command, plan(replicas, ALL), readRepair, Dispatcher.RequestTime.forImmediateExecution()); InetAddressAndPort peer1 = replicas.get(0).endpoint(); InetAddressAndPort peer2 = replicas.get(1).endpoint(); @@ -619,7 +618,7 @@ public void testRepairRangeTombstoneWithPartitionDeletion() { EndpointsForRange replicas = makeReplicas(2); - DataResolver resolver = new DataResolver(command, plan(replicas, ALL), readRepair, Dispatcher.RequestTime.forImmediateExecution()); + DataResolver resolver = new DataResolver(ReadCoordinator.DEFAULT, command, plan(replicas, ALL), readRepair, Dispatcher.RequestTime.forImmediateExecution()); InetAddressAndPort peer1 = replicas.get(0).endpoint(); InetAddressAndPort peer2 = replicas.get(1).endpoint(); @@ -658,7 +657,7 @@ public void testRepairRangeTombstoneWithPartitionDeletion() public void testRepairRangeTombstoneWithPartitionDeletion2() { EndpointsForRange replicas = makeReplicas(2); - DataResolver resolver = new DataResolver(command, plan(replicas, ALL), readRepair, Dispatcher.RequestTime.forImmediateExecution()); + DataResolver resolver = new DataResolver(ReadCoordinator.DEFAULT, command, plan(replicas, ALL), readRepair, Dispatcher.RequestTime.forImmediateExecution()); InetAddressAndPort peer1 = replicas.get(0).endpoint(); InetAddressAndPort peer2 = replicas.get(1).endpoint(); @@ -742,7 +741,7 @@ public void testResolveComplexDelete() EndpointsForRange replicas = makeReplicas(2); ReadCommand cmd = Util.cmd(cfs2, dk).withNowInSeconds(nowInSec).build(); TestableReadRepair readRepair = new TestableReadRepair(cmd); - DataResolver resolver = new DataResolver(cmd, plan(replicas, ALL), readRepair, Dispatcher.RequestTime.forImmediateExecution()); + DataResolver resolver = new DataResolver(ReadCoordinator.DEFAULT, cmd, plan(replicas, ALL), readRepair, Dispatcher.RequestTime.forImmediateExecution()); long[] ts = {100, 200}; @@ -794,7 +793,7 @@ public void testResolveDeletedCollection() EndpointsForRange replicas = makeReplicas(2); ReadCommand cmd = Util.cmd(cfs2, dk).withNowInSeconds(nowInSec).build(); TestableReadRepair readRepair = new TestableReadRepair(cmd); - DataResolver resolver = new DataResolver(cmd, plan(replicas, ALL), readRepair, Dispatcher.RequestTime.forImmediateExecution()); + DataResolver resolver = new DataResolver(ReadCoordinator.DEFAULT, cmd, plan(replicas, ALL), readRepair, Dispatcher.RequestTime.forImmediateExecution()); long[] ts = {100, 200}; @@ -838,7 +837,7 @@ public void testResolveNewCollection() EndpointsForRange replicas = makeReplicas(2); ReadCommand cmd = Util.cmd(cfs2, dk).withNowInSeconds(nowInSec).build(); TestableReadRepair readRepair = new TestableReadRepair(cmd); - DataResolver resolver = new DataResolver(cmd, plan(replicas, ALL), readRepair, Dispatcher.RequestTime.forImmediateExecution()); + DataResolver resolver = new DataResolver(ReadCoordinator.DEFAULT, cmd, plan(replicas, ALL), readRepair, Dispatcher.RequestTime.forImmediateExecution()); long[] ts = {100, 200}; @@ -888,7 +887,7 @@ public void testResolveNewCollectionOverwritingDeleted() EndpointsForRange replicas = makeReplicas(2); ReadCommand cmd = Util.cmd(cfs2, dk).withNowInSeconds(nowInSec).build(); TestableReadRepair readRepair = new TestableReadRepair(cmd); - DataResolver resolver = new DataResolver(cmd, plan(replicas, ALL), readRepair, Dispatcher.RequestTime.forImmediateExecution()); + DataResolver resolver = new DataResolver(ReadCoordinator.DEFAULT, cmd, plan(replicas, ALL), readRepair, Dispatcher.RequestTime.forImmediateExecution()); long[] ts = {100, 200}; @@ -1260,7 +1259,7 @@ class TestableDataResolver extends DataResolver public TestableDataResolver(ReadCommand command, ReplicaPlan.SharedForRangeRead plan, ReadRepair readRepair, Dispatcher.RequestTime requestTime) { - super(command, plan, readRepair, requestTime, true); + super(ReadCoordinator.DEFAULT, command, plan, readRepair, requestTime, true); } protected RepairedDataVerifier getRepairedDataVerifier(ReadCommand command) @@ -1326,12 +1325,12 @@ private void assertRepairMetadata(Mutation mutation) private ReplicaPlan.SharedForRangeRead plan(EndpointsForRange replicas, ConsistencyLevel consistencyLevel) { - BiFunction, Token, ReplicaPlan.ForWrite> repairPlan = (self, t) -> ReplicaPlans.forReadRepair(self, ClusterMetadata.current(), ks, consistencyLevel, t, (i) -> true); + BiFunction, Token, ReplicaPlan.ForWrite> repairPlan = (self, t) -> ReplicaPlans.forReadRepair(self, ClusterMetadata.current(), ks, null, consistencyLevel, t, (i) -> true, ReadCoordinator.DEFAULT); return ReplicaPlan.shared(new ReplicaPlan.ForRangeRead(ks, ks.getReplicationStrategy(), consistencyLevel, ReplicaUtils.FULL_BOUNDS, - replicas, replicas, + replicas, replicas, replicas, 1, null, repairPlan, Epoch.EMPTY)); diff --git a/test/unit/org/apache/cassandra/service/reads/DigestResolverTest.java b/test/unit/org/apache/cassandra/service/reads/DigestResolverTest.java index 84a116729567..bde9763baf78 100644 --- a/test/unit/org/apache/cassandra/service/reads/DigestResolverTest.java +++ b/test/unit/org/apache/cassandra/service/reads/DigestResolverTest.java @@ -69,7 +69,7 @@ public void noRepairNeeded() { SinglePartitionReadCommand command = SinglePartitionReadCommand.fullPartitionRead(cfm, nowInSec, dk); EndpointsForToken targetReplicas = EndpointsForToken.of(dk.getToken(), full(EP1), full(EP2)); - DigestResolver resolver = new DigestResolver(command, plan(ConsistencyLevel.QUORUM, targetReplicas), new Dispatcher.RequestTime(0L, 0L)); + DigestResolver resolver = new DigestResolver(ReadCoordinator.DEFAULT, command, plan(ConsistencyLevel.QUORUM, targetReplicas), new Dispatcher.RequestTime(0L, 0L)); PartitionUpdate response = update(row(1000, 4, 4), row(1000, 5, 5)).build(); @@ -102,7 +102,7 @@ public void multiThreadedNoRepairNeededReadCallback() { final long startNanos = System.nanoTime(); final Dispatcher.RequestTime requestTime = new Dispatcher.RequestTime(startNanos, startNanos); - final DigestResolver resolver = new DigestResolver<>(command, plan, requestTime); + final DigestResolver resolver = new DigestResolver<>(ReadCoordinator.DEFAULT, command, plan, requestTime); final ReadCallback callback = new ReadCallback<>(resolver, command, plan, requestTime); final CountDownLatch startlatch = new CountDownLatch(2); @@ -137,7 +137,7 @@ public void digestMismatch() { SinglePartitionReadCommand command = SinglePartitionReadCommand.fullPartitionRead(cfm, nowInSec, dk); EndpointsForToken targetReplicas = EndpointsForToken.of(dk.getToken(), full(EP1), full(EP2)); - DigestResolver resolver = new DigestResolver(command, plan(ConsistencyLevel.QUORUM, targetReplicas), new Dispatcher.RequestTime(0L, 0L)); + DigestResolver resolver = new DigestResolver(ReadCoordinator.DEFAULT, command, plan(ConsistencyLevel.QUORUM, targetReplicas), new Dispatcher.RequestTime(0L, 0L)); PartitionUpdate response1 = update(row(1000, 4, 4), row(1000, 5, 5)).build(); PartitionUpdate response2 = update(row(2000, 4, 5)).build(); @@ -158,7 +158,7 @@ public void agreeingTransient() { SinglePartitionReadCommand command = SinglePartitionReadCommand.fullPartitionRead(cfm, nowInSec, dk); EndpointsForToken targetReplicas = EndpointsForToken.of(dk.getToken(), full(EP1), trans(EP2)); - DigestResolver resolver = new DigestResolver<>(command, plan(ConsistencyLevel.QUORUM, targetReplicas), new Dispatcher.RequestTime(0L, 0L)); + DigestResolver resolver = new DigestResolver<>(ReadCoordinator.DEFAULT, command, plan(ConsistencyLevel.QUORUM, targetReplicas), new Dispatcher.RequestTime(0L, 0L)); PartitionUpdate response1 = update(row(1000, 4, 4), row(1000, 5, 5)).build(); PartitionUpdate response2 = update(row(1000, 5, 5)).build(); @@ -179,7 +179,7 @@ public void transientResponse() { SinglePartitionReadCommand command = SinglePartitionReadCommand.fullPartitionRead(cfm, nowInSec, dk); EndpointsForToken targetReplicas = EndpointsForToken.of(dk.getToken(), full(EP1), trans(EP2)); - DigestResolver resolver = new DigestResolver<>(command, plan(ConsistencyLevel.QUORUM, targetReplicas), new Dispatcher.RequestTime(0L, 0L)); + DigestResolver resolver = new DigestResolver<>(ReadCoordinator.DEFAULT, command, plan(ConsistencyLevel.QUORUM, targetReplicas), new Dispatcher.RequestTime(0L, 0L)); PartitionUpdate response2 = update(row(1000, 5, 5)).build(); Assert.assertFalse(resolver.isDataPresent()); @@ -194,7 +194,7 @@ public void transientResponseData() { SinglePartitionReadCommand command = SinglePartitionReadCommand.fullPartitionRead(cfm, nowInSec, dk); EndpointsForToken targetReplicas = EndpointsForToken.of(dk.getToken(), full(EP1), full(EP2), trans(EP3)); - DigestResolver resolver = new DigestResolver<>(command, plan(ConsistencyLevel.QUORUM, targetReplicas), new Dispatcher.RequestTime(0L, 0L)); + DigestResolver resolver = new DigestResolver<>(ReadCoordinator.DEFAULT, command, plan(ConsistencyLevel.QUORUM, targetReplicas), new Dispatcher.RequestTime(0L, 0L)); PartitionUpdate fullResponse = update(row(1000, 1, 1)).build(); PartitionUpdate digestResponse = update(row(1000, 1, 1)).build(); @@ -215,7 +215,7 @@ public void transientResponseData() private ReplicaPlan.SharedForTokenRead plan(ConsistencyLevel consistencyLevel, EndpointsForToken replicas) { - return ReplicaPlan.shared(new ReplicaPlan.ForTokenRead(ks, ks.getReplicationStrategy(), consistencyLevel, replicas, replicas, null, (self) -> null, Epoch.EMPTY)); + return ReplicaPlan.shared(new ReplicaPlan.ForTokenRead(ks, ks.getReplicationStrategy(), consistencyLevel, replicas, replicas, replicas, null, (self) -> null, Epoch.EMPTY)); } private void waitForLatch(CountDownLatch startlatch) diff --git a/test/unit/org/apache/cassandra/service/reads/ReadExecutorTest.java b/test/unit/org/apache/cassandra/service/reads/ReadExecutorTest.java index 046da259e98e..dcb095246185 100644 --- a/test/unit/org/apache/cassandra/service/reads/ReadExecutorTest.java +++ b/test/unit/org/apache/cassandra/service/reads/ReadExecutorTest.java @@ -22,31 +22,32 @@ import org.apache.commons.lang3.exception.ExceptionUtils; -import org.apache.cassandra.ServerTestUtils; -import org.apache.cassandra.dht.Murmur3Partitioner; -import org.apache.cassandra.dht.Token; -import org.apache.cassandra.distributed.test.log.ClusterMetadataTestHelper; -import org.apache.cassandra.locator.ReplicaPlan; import org.junit.Before; import org.junit.BeforeClass; import org.junit.Test; import org.apache.cassandra.SchemaLoader; +import org.apache.cassandra.ServerTestUtils; import org.apache.cassandra.Util; import org.apache.cassandra.db.ColumnFamilyStore; import org.apache.cassandra.db.ConsistencyLevel; import org.apache.cassandra.db.Keyspace; import org.apache.cassandra.db.SinglePartitionReadCommand; +import org.apache.cassandra.dht.Murmur3Partitioner; +import org.apache.cassandra.dht.Token; +import org.apache.cassandra.distributed.test.log.ClusterMetadataTestHelper; import org.apache.cassandra.exceptions.ReadFailureException; import org.apache.cassandra.exceptions.ReadTimeoutException; +import org.apache.cassandra.exceptions.RequestFailure; import org.apache.cassandra.exceptions.RequestFailureReason; import org.apache.cassandra.locator.EndpointsForToken; import org.apache.cassandra.locator.InetAddressAndPort; -import org.apache.cassandra.tcm.Epoch; +import org.apache.cassandra.locator.ReplicaPlan; import org.apache.cassandra.net.Message; import org.apache.cassandra.net.NoPayload; import org.apache.cassandra.net.Verb; import org.apache.cassandra.schema.KeyspaceParams; +import org.apache.cassandra.tcm.Epoch; import org.apache.cassandra.transport.Dispatcher; import static java.util.concurrent.TimeUnit.DAYS; @@ -100,7 +101,7 @@ public void testUnableToSpeculate() throws Throwable { assertEquals(0, cfs.metric.speculativeInsufficientReplicas.getCount()); assertEquals(0, ks.metric.speculativeInsufficientReplicas.getCount()); - AbstractReadExecutor executor = new AbstractReadExecutor.NeverSpeculatingReadExecutor(cfs, new MockSinglePartitionReadCommand(), plan(targets, LOCAL_QUORUM), Dispatcher.RequestTime.forImmediateExecution(), true); + AbstractReadExecutor executor = new AbstractReadExecutor.NeverSpeculatingReadExecutor(ReadCoordinator.DEFAULT, cfs, new MockSinglePartitionReadCommand(), plan(targets, LOCAL_QUORUM), Dispatcher.RequestTime.forImmediateExecution(), true); executor.maybeTryAdditionalReplicas(); try { @@ -115,7 +116,7 @@ public void testUnableToSpeculate() throws Throwable assertEquals(1, ks.metric.speculativeInsufficientReplicas.getCount()); //Shouldn't increment - executor = new AbstractReadExecutor.NeverSpeculatingReadExecutor(cfs, new MockSinglePartitionReadCommand(), plan(targets, LOCAL_QUORUM), Dispatcher.RequestTime.forImmediateExecution(), false); + executor = new AbstractReadExecutor.NeverSpeculatingReadExecutor(ReadCoordinator.DEFAULT, cfs, new MockSinglePartitionReadCommand(), plan(targets, LOCAL_QUORUM), Dispatcher.RequestTime.forImmediateExecution(), false); executor.maybeTryAdditionalReplicas(); try { @@ -141,7 +142,7 @@ public void testSpeculateSucceeded() throws Throwable assertEquals(0, cfs.metric.speculativeFailedRetries.getCount()); assertEquals(0, ks.metric.speculativeRetries.getCount()); assertEquals(0, ks.metric.speculativeFailedRetries.getCount()); - AbstractReadExecutor executor = new AbstractReadExecutor.SpeculatingReadExecutor(cfs, new MockSinglePartitionReadCommand(DAYS.toMillis(365)), plan(LOCAL_QUORUM, targets, targets.subList(0, 2)), Dispatcher.RequestTime.forImmediateExecution()); + AbstractReadExecutor executor = new AbstractReadExecutor.SpeculatingReadExecutor(ReadCoordinator.DEFAULT, cfs, new MockSinglePartitionReadCommand(DAYS.toMillis(365)), plan(LOCAL_QUORUM, targets, targets.subList(0, 2)), Dispatcher.RequestTime.forImmediateExecution()); executor.maybeTryAdditionalReplicas(); new Thread() { @@ -149,8 +150,8 @@ public void testSpeculateSucceeded() throws Throwable public void run() { //Failures end the read promptly but don't require mock data to be suppleid - executor.handler.onFailure(targets.get(0).endpoint(), RequestFailureReason.READ_TOO_MANY_TOMBSTONES); - executor.handler.onFailure(targets.get(1).endpoint(), RequestFailureReason.READ_TOO_MANY_TOMBSTONES); + executor.handler.onFailure(targets.get(0).endpoint(), RequestFailure.READ_TOO_MANY_TOMBSTONES); + executor.handler.onFailure(targets.get(1).endpoint(), RequestFailure.READ_TOO_MANY_TOMBSTONES); executor.handler.condition.signalAll(); } }.start(); @@ -182,7 +183,7 @@ public void testSpeculateFailed() throws Throwable assertEquals(0, cfs.metric.speculativeFailedRetries.getCount()); assertEquals(0, ks.metric.speculativeRetries.getCount()); assertEquals(0, ks.metric.speculativeFailedRetries.getCount()); - AbstractReadExecutor executor = new AbstractReadExecutor.SpeculatingReadExecutor(cfs, new MockSinglePartitionReadCommand(), plan(LOCAL_QUORUM, targets, targets.subList(0, 2)), Dispatcher.RequestTime.forImmediateExecution()); + AbstractReadExecutor executor = new AbstractReadExecutor.SpeculatingReadExecutor(ReadCoordinator.DEFAULT, cfs, new MockSinglePartitionReadCommand(), plan(LOCAL_QUORUM, targets, targets.subList(0, 2)), Dispatcher.RequestTime.forImmediateExecution()); executor.maybeTryAdditionalReplicas(); try { @@ -208,7 +209,7 @@ public void testRaceWithNonSpeculativeFailure() { MockSinglePartitionReadCommand command = new MockSinglePartitionReadCommand(TimeUnit.DAYS.toMillis(365)); ReplicaPlan.ForTokenRead plan = plan(ConsistencyLevel.LOCAL_ONE, targets, targets.subList(0, 1)); - AbstractReadExecutor executor = new AbstractReadExecutor.SpeculatingReadExecutor(cfs, command, plan, Dispatcher.RequestTime.forImmediateExecution()); + AbstractReadExecutor executor = new AbstractReadExecutor.SpeculatingReadExecutor(ReadCoordinator.DEFAULT, cfs, command, plan, Dispatcher.RequestTime.forImmediateExecution()); // Issue an initial request against the first endpoint... executor.executeAsync(); @@ -221,7 +222,7 @@ public void testRaceWithNonSpeculativeFailure() { // Fail the first request. When this fails the number of contacts has already been increased // to 2, so the failure won't actally signal. However... - executor.handler.onFailure(targets.get(0).endpoint(), RequestFailureReason.READ_TOO_MANY_TOMBSTONES); + executor.handler.onFailure(targets.get(0).endpoint(), RequestFailure.READ_TOO_MANY_TOMBSTONES); // ...speculative retries are fired after a short wait, and it is possible for the failure to // reach the handler just before one is fired and the number of contacts incremented... @@ -254,7 +255,7 @@ public static class MockSinglePartitionReadCommand extends SinglePartitionReadCo MockSinglePartitionReadCommand(long timeout) { - super(cfs.metadata().epoch, false, 0, false, cfs.metadata(), 0, null, null, null, Util.dk("ry@n_luvs_teh_y@nk33z"), null, null, false, null); + super(cfs.metadata().epoch, false, 0, false, PotentialTxnConflicts.DISALLOW, cfs.metadata(), 0, null, null, null, Util.dk("ry@n_luvs_teh_y@nk33z"), null, null, false, null); this.timeout = timeout; } @@ -278,6 +279,6 @@ private ReplicaPlan.ForTokenRead plan(EndpointsForToken targets, ConsistencyLeve private ReplicaPlan.ForTokenRead plan(ConsistencyLevel consistencyLevel, EndpointsForToken natural, EndpointsForToken selected) { - return new ReplicaPlan.ForTokenRead(ks, ks.getReplicationStrategy(), consistencyLevel, natural, selected, (cm) -> null, (self) -> null, Epoch.EMPTY); + return new ReplicaPlan.ForTokenRead(ks, ks.getReplicationStrategy(), consistencyLevel, natural, selected, natural, (cm) -> null, (self) -> null, Epoch.EMPTY); } } diff --git a/test/unit/org/apache/cassandra/service/reads/range/RangeCommandIteratorTest.java b/test/unit/org/apache/cassandra/service/reads/range/RangeCommandIteratorTest.java index dfd1f7f88d15..7cec4c405e7f 100644 --- a/test/unit/org/apache/cassandra/service/reads/range/RangeCommandIteratorTest.java +++ b/test/unit/org/apache/cassandra/service/reads/range/RangeCommandIteratorTest.java @@ -40,6 +40,8 @@ import org.apache.cassandra.locator.ReplicaPlan; import org.apache.cassandra.locator.ReplicaPlans; import org.apache.cassandra.schema.KeyspaceParams; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.service.reads.ReadCoordinator; import org.apache.cassandra.transport.Dispatcher; import org.apache.cassandra.utils.CloseableIterator; @@ -49,6 +51,7 @@ public class RangeCommandIteratorTest { private static final String KEYSPACE1 = "RangeCommandIteratorTest"; private static final String CF_STANDARD1 = "Standard1"; + private static final TableId TABLE_ID = TableId.generate(); @BeforeClass public static void defineSchema() throws ConfigurationException @@ -70,11 +73,11 @@ public void testRangeCountWithRangeMerge() for (int i = 0; i + 1 < tokens.size(); i++) { Range range = Range.makeRowRange(tokens.get(i), tokens.get(i + 1)); - ranges.add(ReplicaPlans.forRangeRead(keyspace, null, ConsistencyLevel.ONE, range, 1)); + ranges.add(ReplicaPlans.forRangeRead(keyspace, TABLE_ID, null, ConsistencyLevel.ONE, range, 1)); vnodeCount++; } - ReplicaPlanMerger merge = new ReplicaPlanMerger(ranges.iterator(), keyspace, ConsistencyLevel.ONE); + ReplicaPlanMerger merge = new ReplicaPlanMerger(ranges.iterator(), keyspace, TABLE_ID, ConsistencyLevel.ONE); ReplicaPlan.ForRangeRead mergedRange = Iterators.getOnlyElement(merge); // all ranges are merged as test has only one node. assertEquals(vnodeCount, mergedRange.vnodeCount()); @@ -105,27 +108,27 @@ public void testRangeQueried() // without range merger, there will be 2 batches requested: 1st batch with 1 range and 2nd batch with remaining ranges CloseableIterator replicaPlans = replicaPlanIterator(keyRange, keyspace, false); - RangeCommandIterator data = new RangeCommandIterator(replicaPlans, command, 1, 1000, vnodeCount, Dispatcher.RequestTime.forImmediateExecution()); + RangeCommandIterator data = new RangeCommandIterator(replicaPlans, command, ReadCoordinator.DEFAULT, 1, 1000, vnodeCount, Dispatcher.RequestTime.forImmediateExecution()); verifyRangeCommandIterator(data, rows, 2, vnodeCount); // without range merger and initial cf=5, there will be 1 batches requested: 5 vnode ranges for 1st batch replicaPlans = replicaPlanIterator(keyRange, keyspace, false); - data = new RangeCommandIterator(replicaPlans, command, vnodeCount, 1000, vnodeCount, Dispatcher.RequestTime.forImmediateExecution()); + data = new RangeCommandIterator(replicaPlans, command, ReadCoordinator.DEFAULT, vnodeCount, 1000, vnodeCount, Dispatcher.RequestTime.forImmediateExecution()); verifyRangeCommandIterator(data, rows, 1, vnodeCount); // without range merger and max cf=1, there will be 5 batches requested: 1 vnode range per batch replicaPlans = replicaPlanIterator(keyRange, keyspace, false); - data = new RangeCommandIterator(replicaPlans, command, 1, 1, vnodeCount, Dispatcher.RequestTime.forImmediateExecution()); + data = new RangeCommandIterator(replicaPlans, command, ReadCoordinator.DEFAULT, 1, 1, vnodeCount, Dispatcher.RequestTime.forImmediateExecution()); verifyRangeCommandIterator(data, rows, vnodeCount, vnodeCount); // with range merger, there will be only 1 batch requested, as all ranges share the same replica - localhost replicaPlans = replicaPlanIterator(keyRange, keyspace, true); - data = new RangeCommandIterator(replicaPlans, command, 1, 1000, vnodeCount, Dispatcher.RequestTime.forImmediateExecution()); + data = new RangeCommandIterator(replicaPlans, command, ReadCoordinator.DEFAULT, 1, 1000, vnodeCount, Dispatcher.RequestTime.forImmediateExecution()); verifyRangeCommandIterator(data, rows, 1, vnodeCount); // with range merger and max cf=1, there will be only 1 batch requested, as all ranges share the same replica - localhost replicaPlans = replicaPlanIterator(keyRange, keyspace, true); - data = new RangeCommandIterator(replicaPlans, command, 1, 1, vnodeCount, Dispatcher.RequestTime.forImmediateExecution()); + data = new RangeCommandIterator(replicaPlans, command, ReadCoordinator.DEFAULT, 1, 1, vnodeCount, Dispatcher.RequestTime.forImmediateExecution()); verifyRangeCommandIterator(data, rows, 1, vnodeCount); } @@ -164,9 +167,9 @@ private static CloseableIterator replicaPlanIterator(A Keyspace keyspace, boolean withRangeMerger) { - CloseableIterator replicaPlans = new ReplicaPlanIterator(keyRange, null, keyspace, ConsistencyLevel.ONE); + CloseableIterator replicaPlans = new ReplicaPlanIterator(keyRange, null, keyspace, null, ConsistencyLevel.ONE); if (withRangeMerger) - replicaPlans = new ReplicaPlanMerger(replicaPlans, keyspace, ConsistencyLevel.ONE); + replicaPlans = new ReplicaPlanMerger(replicaPlans, keyspace, null, ConsistencyLevel.ONE); return replicaPlans; } diff --git a/test/unit/org/apache/cassandra/service/reads/range/RangeCommandsTest.java b/test/unit/org/apache/cassandra/service/reads/range/RangeCommandsTest.java index ce04d5deab54..42ea24173eee 100644 --- a/test/unit/org/apache/cassandra/service/reads/range/RangeCommandsTest.java +++ b/test/unit/org/apache/cassandra/service/reads/range/RangeCommandsTest.java @@ -38,6 +38,7 @@ import org.apache.cassandra.exceptions.ConfigurationException; import org.apache.cassandra.index.StubIndex; import org.apache.cassandra.schema.IndexMetadata; +import org.apache.cassandra.service.reads.ReadCoordinator; import org.apache.cassandra.transport.Dispatcher; import static org.apache.cassandra.config.CassandraRelevantProperties.MAX_CONCURRENT_RANGE_REQUESTS; @@ -78,8 +79,8 @@ public void tesConcurrencyFactor() // verify that a low concurrency factor is not capped by the max concurrency factor PartitionRangeReadCommand command = command(cfs, 50, 50); - try (RangeCommandIterator partitions = RangeCommands.rangeCommandIterator(command, ONE, Dispatcher.RequestTime.forImmediateExecution()); - ReplicaPlanIterator ranges = new ReplicaPlanIterator(command.dataRange().keyRange(), command.indexQueryPlan(), keyspace, ONE)) + try (RangeCommandIterator partitions = RangeCommands.rangeCommandIterator(command, ONE, ReadCoordinator.DEFAULT, Dispatcher.RequestTime.forImmediateExecution()); + ReplicaPlanIterator ranges = new ReplicaPlanIterator(command.dataRange().keyRange(), command.indexQueryPlan(), keyspace, command.metadata().id, ONE)) { assertEquals(2, partitions.concurrencyFactor()); assertEquals(MAX_CONCURRENCY_FACTOR, partitions.maxConcurrencyFactor()); @@ -88,8 +89,8 @@ public void tesConcurrencyFactor() // verify that a high concurrency factor is capped by the max concurrency factor command = command(cfs, 1000, 50); - try (RangeCommandIterator partitions = RangeCommands.rangeCommandIterator(command, ONE, Dispatcher.RequestTime.forImmediateExecution()); - ReplicaPlanIterator ranges = new ReplicaPlanIterator(command.dataRange().keyRange(), command.indexQueryPlan(), keyspace, ONE)) + try (RangeCommandIterator partitions = RangeCommands.rangeCommandIterator(command, ONE, ReadCoordinator.DEFAULT, Dispatcher.RequestTime.forImmediateExecution()); + ReplicaPlanIterator ranges = new ReplicaPlanIterator(command.dataRange().keyRange(), command.indexQueryPlan(), keyspace, command.metadata().id, ONE)) { assertEquals(MAX_CONCURRENCY_FACTOR, partitions.concurrencyFactor()); assertEquals(MAX_CONCURRENCY_FACTOR, partitions.maxConcurrencyFactor()); @@ -98,8 +99,8 @@ public void tesConcurrencyFactor() // with 0 estimated results per range the concurrency factor should be 1 command = command(cfs, 1000, 0); - try (RangeCommandIterator partitions = RangeCommands.rangeCommandIterator(command, ONE, Dispatcher.RequestTime.forImmediateExecution()); - ReplicaPlanIterator ranges = new ReplicaPlanIterator(command.dataRange().keyRange(), command.indexQueryPlan(), keyspace, ONE)) + try (RangeCommandIterator partitions = RangeCommands.rangeCommandIterator(command, ONE, ReadCoordinator.DEFAULT, Dispatcher.RequestTime.forImmediateExecution()); + ReplicaPlanIterator ranges = new ReplicaPlanIterator(command.dataRange().keyRange(), command.indexQueryPlan(), keyspace, command.metadata().id, ONE)) { assertEquals(1, partitions.concurrencyFactor()); assertEquals(MAX_CONCURRENCY_FACTOR, partitions.maxConcurrencyFactor()); diff --git a/test/unit/org/apache/cassandra/service/reads/range/ReplicaPlanIteratorTest.java b/test/unit/org/apache/cassandra/service/reads/range/ReplicaPlanIteratorTest.java index 84f3a5e2e750..829211f8e86f 100644 --- a/test/unit/org/apache/cassandra/service/reads/range/ReplicaPlanIteratorTest.java +++ b/test/unit/org/apache/cassandra/service/reads/range/ReplicaPlanIteratorTest.java @@ -21,6 +21,7 @@ import java.util.ArrayList; import java.util.List; +import org.apache.cassandra.schema.TableId; import org.junit.BeforeClass; import org.junit.Test; @@ -44,6 +45,7 @@ public class ReplicaPlanIteratorTest { private static final String KEYSPACE = "ReplicaPlanIteratorTest"; + private static final TableId TABLE_ID = TableId.generate(); private static Keyspace keyspace; @BeforeClass @@ -163,7 +165,7 @@ private final void testRanges(AbstractBounds queryRange, Abst @SafeVarargs private final void testRanges(Keyspace keyspace, AbstractBounds queryRange, AbstractBounds... expected) { - try (ReplicaPlanIterator iterator = new ReplicaPlanIterator(queryRange, null, keyspace, ConsistencyLevel.ANY)) + try (ReplicaPlanIterator iterator = new ReplicaPlanIterator(queryRange, null, keyspace, TABLE_ID, ConsistencyLevel.ANY)) { List> restrictedRanges = new ArrayList<>(expected.length); while (iterator.hasNext()) diff --git a/test/unit/org/apache/cassandra/service/reads/range/ReplicaPlanMergerTest.java b/test/unit/org/apache/cassandra/service/reads/range/ReplicaPlanMergerTest.java index aaa88c938dd3..46b71d532e76 100644 --- a/test/unit/org/apache/cassandra/service/reads/range/ReplicaPlanMergerTest.java +++ b/test/unit/org/apache/cassandra/service/reads/range/ReplicaPlanMergerTest.java @@ -416,8 +416,8 @@ private final void testRanges(ConsistencyLevel consistencyLevel, AbstractBounds queryRange, AbstractBounds... expected) { - try (ReplicaPlanIterator originals = new ReplicaPlanIterator(queryRange, null, keyspace, ANY); // ANY avoids endpoint erros - ReplicaPlanMerger merger = new ReplicaPlanMerger(originals, keyspace, consistencyLevel)) + try (ReplicaPlanIterator originals = new ReplicaPlanIterator(queryRange, null, keyspace, null, ANY); // ANY avoids endpoint erros + ReplicaPlanMerger merger = new ReplicaPlanMerger(originals, keyspace, null, consistencyLevel)) { // collect the merged ranges List> mergedRanges = new ArrayList<>(expected.length); diff --git a/test/unit/org/apache/cassandra/service/reads/repair/AbstractReadRepairTest.java b/test/unit/org/apache/cassandra/service/reads/repair/AbstractReadRepairTest.java index f0026d35b9c0..4a6d31447c4b 100644 --- a/test/unit/org/apache/cassandra/service/reads/repair/AbstractReadRepairTest.java +++ b/test/unit/org/apache/cassandra/service/reads/repair/AbstractReadRepairTest.java @@ -66,14 +66,15 @@ import org.apache.cassandra.locator.Replica; import org.apache.cassandra.locator.ReplicaPlan; import org.apache.cassandra.locator.ReplicaUtils; +import org.apache.cassandra.service.reads.ReadCoordinator; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.tcm.Epoch; import org.apache.cassandra.net.Message; import org.apache.cassandra.schema.KeyspaceMetadata; import org.apache.cassandra.schema.KeyspaceParams; import org.apache.cassandra.schema.SchemaTestUtil; import org.apache.cassandra.schema.TableMetadata; import org.apache.cassandra.schema.Tables; -import org.apache.cassandra.tcm.ClusterMetadata; -import org.apache.cassandra.tcm.Epoch; import org.apache.cassandra.transport.Dispatcher; import org.apache.cassandra.utils.ByteBufferUtil; @@ -357,9 +358,10 @@ static ReplicaPlan.ForRangeRead replicaPlan(Keyspace keyspace, ConsistencyLevel ReplicaUtils.FULL_BOUNDS, replicas, targets, + replicas, 1, null, - (self, token) -> forReadRepair(self, ClusterMetadata.current(), keyspace, consistencyLevel, token, (r) -> true), + (self, token) -> forReadRepair(self, ClusterMetadata.current(), keyspace, null, consistencyLevel, token, (r) -> true, ReadCoordinator.DEFAULT), Epoch.EMPTY); } diff --git a/test/unit/org/apache/cassandra/service/reads/repair/BlockingReadRepairTest.java b/test/unit/org/apache/cassandra/service/reads/repair/BlockingReadRepairTest.java index 6806172402f1..a0320c92c789 100644 --- a/test/unit/org/apache/cassandra/service/reads/repair/BlockingReadRepairTest.java +++ b/test/unit/org/apache/cassandra/service/reads/repair/BlockingReadRepairTest.java @@ -42,6 +42,7 @@ import org.apache.cassandra.net.Message; import org.apache.cassandra.service.reads.ReadCallback; import org.apache.cassandra.transport.Dispatcher; +import org.apache.cassandra.service.reads.ReadCoordinator; import static java.util.concurrent.TimeUnit.NANOSECONDS; import static org.apache.cassandra.utils.Clock.Global.nanoTime; @@ -53,7 +54,7 @@ private static class InstrumentedReadRepairHandler { public InstrumentedReadRepairHandler(Map repairs, ReplicaPlan.ForWrite writePlan) { - super(Util.dk("not a real usable value"), repairs, writePlan); + super(ReadCoordinator.DEFAULT, Util.dk("not a real usable value"), repairs, writePlan); } Map mutationsSent = new HashMap<>(); @@ -86,7 +87,7 @@ private static class InstrumentedBlockingReadRepair, P ex { public InstrumentedBlockingReadRepair(ReadCommand command, ReplicaPlan.Shared replicaPlan, Dispatcher.RequestTime requestTime) { - super(command, replicaPlan, requestTime); + super(ReadCoordinator.DEFAULT, command, replicaPlan, requestTime); } Set readCommandRecipients = new HashSet<>(); diff --git a/test/unit/org/apache/cassandra/service/reads/repair/DiagEventsBlockingReadRepairTest.java b/test/unit/org/apache/cassandra/service/reads/repair/DiagEventsBlockingReadRepairTest.java index 9258922ff88d..1a330402d834 100644 --- a/test/unit/org/apache/cassandra/service/reads/repair/DiagEventsBlockingReadRepairTest.java +++ b/test/unit/org/apache/cassandra/service/reads/repair/DiagEventsBlockingReadRepairTest.java @@ -47,6 +47,7 @@ import org.apache.cassandra.locator.Replica; import org.apache.cassandra.net.Message; import org.apache.cassandra.service.reads.ReadCallback; +import org.apache.cassandra.service.reads.ReadCoordinator; import org.apache.cassandra.service.reads.repair.ReadRepairEvent.ReadRepairEventType; import org.apache.cassandra.transport.Dispatcher; @@ -135,7 +136,7 @@ private static class DiagnosticBlockingRepairHandler extends BlockingReadRepair DiagnosticBlockingRepairHandler(ReadCommand command, ReplicaPlan.Shared replicaPlan, Dispatcher.RequestTime requestTime) { - super(command, replicaPlan, requestTime); + super(ReadCoordinator.DEFAULT, command, replicaPlan, requestTime); DiagnosticEventService.instance().subscribe(ReadRepairEvent.class, this::onRepairEvent); } @@ -183,7 +184,7 @@ private static Predicate isLocal() DiagnosticPartitionReadRepairHandler(DecoratedKey key, Map repairs, ReplicaPlan.ForWrite forReadRepair) { - super(key, repairs, forReadRepair); + super(ReadCoordinator.DEFAULT, key, repairs, forReadRepair); DiagnosticEventService.instance().subscribe(PartitionRepairEvent.class, this::onRepairEvent); } diff --git a/test/unit/org/apache/cassandra/service/reads/repair/ReadOnlyReadRepairTest.java b/test/unit/org/apache/cassandra/service/reads/repair/ReadOnlyReadRepairTest.java index 749e444425f7..1d4b9b745ae6 100644 --- a/test/unit/org/apache/cassandra/service/reads/repair/ReadOnlyReadRepairTest.java +++ b/test/unit/org/apache/cassandra/service/reads/repair/ReadOnlyReadRepairTest.java @@ -22,7 +22,6 @@ import java.util.HashSet; import java.util.Set; -import org.apache.cassandra.locator.ReplicaPlan; import org.junit.Assert; import org.junit.BeforeClass; import org.junit.Test; @@ -32,8 +31,11 @@ import org.apache.cassandra.locator.Endpoints; import org.apache.cassandra.locator.InetAddressAndPort; import org.apache.cassandra.locator.Replica; +import org.apache.cassandra.locator.ReplicaPlan; import org.apache.cassandra.service.reads.ReadCallback; import org.apache.cassandra.transport.Dispatcher; +import org.apache.cassandra.service.reads.ReadCoordinator; +import org.apache.cassandra.service.reads.repair.ReadRepair.ReadRepairSource; public class ReadOnlyReadRepairTest extends AbstractReadRepairTest { @@ -42,7 +44,7 @@ private static class InstrumentedReadOnlyReadRepair, P ex { public InstrumentedReadOnlyReadRepair(ReadCommand command, ReplicaPlan.Shared replicaPlan, Dispatcher.RequestTime requestTime) { - super(command, replicaPlan, requestTime); + super(ReadCoordinator.DEFAULT, command, replicaPlan, requestTime); } Set readCommandRecipients = new HashSet<>(); @@ -95,6 +97,6 @@ public void repairPartitionFailure() ReplicaPlan.SharedForRangeRead readPlan = ReplicaPlan.shared(replicaPlan(replicas, replicas)); ReplicaPlan.ForWrite writePlan = repairPlan(replicas, replicas); InstrumentedReadRepair repair = createInstrumentedReadRepair(readPlan); - repair.repairPartition(null, Collections.emptyMap(), writePlan); + repair.repairPartition(null, Collections.emptyMap(), writePlan, ReadRepairSource.OTHER); } } diff --git a/test/unit/org/apache/cassandra/service/reads/repair/ReadRepairTest.java b/test/unit/org/apache/cassandra/service/reads/repair/ReadRepairTest.java index 5138de03000b..d0f0682fbf1b 100644 --- a/test/unit/org/apache/cassandra/service/reads/repair/ReadRepairTest.java +++ b/test/unit/org/apache/cassandra/service/reads/repair/ReadRepairTest.java @@ -57,6 +57,7 @@ import org.apache.cassandra.schema.SchemaTestUtil; import org.apache.cassandra.schema.TableMetadata; import org.apache.cassandra.schema.Tables; +import org.apache.cassandra.service.reads.ReadCoordinator; import org.apache.cassandra.tcm.membership.Location; import org.apache.cassandra.utils.ByteBufferUtil; @@ -79,7 +80,7 @@ private static class InstrumentedReadRepairHandler, P ext { public InstrumentedReadRepairHandler(Map repairs, ReplicaPlan.ForWrite writePlan) { - super(Util.dk("not a valid key"), repairs, writePlan); + super(ReadCoordinator.DEFAULT, Util.dk("not a valid key"), repairs, writePlan); } Map mutationsSent = new HashMap<>(); diff --git a/test/unit/org/apache/cassandra/service/reads/repair/RepairedDataVerifierTest.java b/test/unit/org/apache/cassandra/service/reads/repair/RepairedDataVerifierTest.java index 682dc740ff08..f4c230e790b5 100644 --- a/test/unit/org/apache/cassandra/service/reads/repair/RepairedDataVerifierTest.java +++ b/test/unit/org/apache/cassandra/service/reads/repair/RepairedDataVerifierTest.java @@ -281,6 +281,7 @@ private static class StubReadCommand extends SinglePartitionReadCommand isDigest, 0, false, + PotentialTxnConflicts.DISALLOW, metadata, FBUtilities.nowInSeconds(), ColumnFilter.all(metadata), diff --git a/test/unit/org/apache/cassandra/service/reads/repair/TestableReadRepair.java b/test/unit/org/apache/cassandra/service/reads/repair/TestableReadRepair.java index eecd106e06ac..3bca1cd0621c 100644 --- a/test/unit/org/apache/cassandra/service/reads/repair/TestableReadRepair.java +++ b/test/unit/org/apache/cassandra/service/reads/repair/TestableReadRepair.java @@ -35,7 +35,9 @@ import org.apache.cassandra.locator.InetAddressAndPort; import org.apache.cassandra.locator.Replica; import org.apache.cassandra.locator.ReplicaPlan; +import org.apache.cassandra.locator.ReplicaPlan.ForWrite; import org.apache.cassandra.service.reads.DigestResolver; +import org.apache.cassandra.service.reads.ReadCoordinator; public class TestableReadRepair, P extends ReplicaPlan.ForRead> implements ReadRepair @@ -89,7 +91,6 @@ public void startRepair(DigestResolver digestResolver, Consumer mutations, ReplicaPlan.ForWrite writePlan) + public void repairPartition(DecoratedKey partitionKey, Map mutations, ReplicaPlan.ForWrite writePlan, ReadRepairSource rrSource) { for (Map.Entry entry: mutations.entrySet()) sent.put(entry.getKey().endpoint(), entry.getValue()); } + @Override + public void repairPartitionDirectly(ReadCoordinator coordinator, DecoratedKey partitionKey, Map mutations, ForWrite writePlan) + { + throw new UnsupportedOperationException(); + } + public Mutation getForEndpoint(InetAddressAndPort endpoint) { return sent.get(endpoint); diff --git a/test/unit/org/apache/cassandra/streaming/SessionInfoTest.java b/test/unit/org/apache/cassandra/streaming/SessionInfoTest.java index 778b6b2d7115..b44d6b42c930 100644 --- a/test/unit/org/apache/cassandra/streaming/SessionInfoTest.java +++ b/test/unit/org/apache/cassandra/streaming/SessionInfoTest.java @@ -27,6 +27,8 @@ import org.apache.cassandra.schema.TableId; import org.apache.cassandra.utils.FBUtilities; +import static java.util.Collections.emptyList; + public class SessionInfoTest { /** @@ -41,11 +43,11 @@ public void testTotals() Collection summaries = new ArrayList<>(); for (int i = 0; i < 10; i++) { - StreamSummary summary = new StreamSummary(tableId, i, (i + 1) * 10); + StreamSummary summary = new StreamSummary(tableId, emptyList(), i, (i + 1) * 10); summaries.add(summary); } - StreamSummary sending = new StreamSummary(tableId, 10, 100); + StreamSummary sending = new StreamSummary(tableId, emptyList(), 10, 100); SessionInfo info = new SessionInfo(local, 0, local, summaries, Collections.singleton(sending), StreamSession.State.PREPARING, null); assert info.getTotalFilesToReceive() == 45; diff --git a/test/unit/org/apache/cassandra/streaming/StreamReaderTest.java b/test/unit/org/apache/cassandra/streaming/StreamReaderTest.java index 79856ee1539b..0a854c074430 100644 --- a/test/unit/org/apache/cassandra/streaming/StreamReaderTest.java +++ b/test/unit/org/apache/cassandra/streaming/StreamReaderTest.java @@ -70,6 +70,7 @@ import org.apache.cassandra.utils.FBUtilities; import org.apache.cassandra.utils.TimeUUID; +import static java.util.Collections.emptyList; import static org.apache.cassandra.distributed.test.log.ClusterMetadataTestHelper.*; import static org.apache.cassandra.tcm.ownership.OwnershipUtils.beginJoin; import static org.apache.cassandra.tcm.ownership.OwnershipUtils.beginMove; @@ -366,7 +367,7 @@ public static StreamSession setupStreamingSessionForTest() StreamResultFuture future = StreamResultFuture.createInitiator(nextTimeUUID(), StreamOperation.REPAIR, Collections.emptyList(), streamCoordinator); InetAddressAndPort peer = FBUtilities.getBroadcastAddressAndPort(); - streamCoordinator.addSessionInfo(new SessionInfo(peer, 0, peer, Collections.emptyList(), Collections.emptyList(), StreamSession.State.INITIALIZED, "")); + streamCoordinator.addSessionInfo(new SessionInfo(peer, 0, peer, emptyList(), emptyList(), StreamSession.State.INITIALIZED, "")); StreamSession session = streamCoordinator.getOrCreateOutboundSession(peer); session.init(future); @@ -380,7 +381,7 @@ private static void tryReceiveExpectingSuccess(int[] tokens) throws Throwable CassandraStreamHeader streamHeader = streamMessageHeader(tokens); long startMetricCount = StorageMetrics.totalOpsForInvalidToken.getCount(); IStreamReader reader = streamReader(header, streamHeader, session); - StreamSummary streamSummary = new StreamSummary(streamHeader.tableId, 1, 0); + StreamSummary streamSummary = new StreamSummary(streamHeader.tableId, emptyList(), 1, 0); session.prepareReceiving(streamSummary); reader.read(incomingStream(tokens)); assertEquals(StorageMetrics.totalOpsForInvalidToken.getCount(), startMetricCount); @@ -392,7 +393,7 @@ private static void tryReceiveExpectingFailure(int[] tokens) throws Throwable StreamMessageHeader header = streamHeader(); CassandraStreamHeader streamHeader = streamMessageHeader(tokens); long startMetricCount = StorageMetrics.totalOpsForInvalidToken.getCount(); - StreamSummary streamSummary = new StreamSummary(streamHeader.tableId, 1, 0); + StreamSummary streamSummary = new StreamSummary(streamHeader.tableId, emptyList(), 1, 0); session.prepareReceiving(streamSummary); try { diff --git a/test/unit/org/apache/cassandra/streaming/async/StreamingInboundHandlerTest.java b/test/unit/org/apache/cassandra/streaming/async/StreamingInboundHandlerTest.java index 03d6aa7fd321..904272f7b8a1 100644 --- a/test/unit/org/apache/cassandra/streaming/async/StreamingInboundHandlerTest.java +++ b/test/unit/org/apache/cassandra/streaming/async/StreamingInboundHandlerTest.java @@ -50,13 +50,11 @@ import org.apache.cassandra.streaming.messages.StreamMessageHeader; import org.apache.cassandra.utils.TimeUUID; -import static org.apache.cassandra.utils.TimeUUID.Generator.nextTimeUUID; - import static org.apache.cassandra.net.TestChannel.REMOTE_ADDR; +import static org.apache.cassandra.utils.TimeUUID.Generator.nextTimeUUID; public class StreamingInboundHandlerTest { - private NettyStreamingChannel streamingChannel; private EmbeddedChannel channel; private ByteBuf buf; diff --git a/test/unit/org/apache/cassandra/tcm/ClusterMetadataMetadataKeyTest.java b/test/unit/org/apache/cassandra/tcm/ClusterMetadataMetadataKeyTest.java new file mode 100644 index 000000000000..291be79bf438 --- /dev/null +++ b/test/unit/org/apache/cassandra/tcm/ClusterMetadataMetadataKeyTest.java @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.tcm; + +import java.lang.reflect.Field; +import java.lang.reflect.Modifier; +import java.util.Locale; +import java.util.Map; +import java.util.Set; + +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; +import org.junit.Test; + +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.dht.Murmur3Partitioner; +import org.apache.cassandra.utils.FBUtilities; +import org.assertj.core.api.Assertions; + +/** + * This test is to make sure that the fields of {@link ClusterMetadata} have a matching {@link MetadataKey} and the + * utility functions linking key to field are maintained. + * + * If this test is failing it likely means a new field was added to {@link ClusterMetadata} and {@link MetadataKeys} was + * not updated to know about it. + */ +public class ClusterMetadataMetadataKeyTest +{ + static + { + DatabaseDescriptor.clientInitialization(); + } + + private static final Map NAME_TO_KEY; + + static + { + ImmutableMap.Builder builder = ImmutableMap.builder(); + for (Field field : MetadataKeys.class.getDeclaredFields()) + { + if (field.getType() == MetadataKey.class + && Modifier.isStatic(field.getModifiers()) + && Modifier.isPublic(field.getModifiers())) + builder.put(field.getName(), field); + } + NAME_TO_KEY = builder.build(); + } + + @Test + public void metadataKeyExists() throws IllegalAccessException + { + ClusterMetadata empty = new ClusterMetadata(Murmur3Partitioner.instance); + // Theese are fields that should not have MetadataKeys and should be ignored. + Set exclude = ImmutableSet.of("metadataIdentifier", + "epoch", + "partitioner", + "extensions", + "locator"); + // Mapping of ClusterMetadata field names to MetadataKey name; mapping is only needed if the names don't match. + Map mapping = ImmutableMap.of("directory", "node_directory", + "placements", "data_placements"); + for (Field field : ClusterMetadata.class.getDeclaredFields()) + { + if (Modifier.isStatic(field.getModifiers()) + || !Modifier.isPublic(field.getModifiers()) + || !Modifier.isFinal(field.getModifiers())) + continue; + String name = field.getName(); + if (exclude.contains(name)) continue; + if (mapping.containsKey(name)) + name = mapping.get(name); + String snakeName = FBUtilities.camelToSnake(name).toUpperCase(Locale.ROOT); + Assertions.assertThat(NAME_TO_KEY.keySet()) + .describedAs("Unable to locate MetadataKey for %s", snakeName) + .contains(snakeName); + MetadataKey expectedKey = (MetadataKey) NAME_TO_KEY.get(snakeName).get(null); + if (!MetadataKeys.CORE_METADATA.containsKey(expectedKey)) + throw new IllegalStateException("MetadataKeys.CORE_METADATA is missing key " + expectedKey + " for field " + name); + + Assertions.assertThat(field.get(empty)) + .describedAs("Extraction function does not seem to match the field %s and key %s", name, snakeName) + .isSameAs(MetadataKeys.CORE_METADATA.get(expectedKey).apply(empty)); + } + } +} diff --git a/test/unit/org/apache/cassandra/tcm/ClusterMetadataSerializerTest.java b/test/unit/org/apache/cassandra/tcm/ClusterMetadataSerializerTest.java new file mode 100644 index 000000000000..e4ca474d0712 --- /dev/null +++ b/test/unit/org/apache/cassandra/tcm/ClusterMetadataSerializerTest.java @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.tcm; + +import org.junit.Test; + +import accord.utils.Gen; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.io.util.DataInputBuffer; +import org.apache.cassandra.io.util.DataOutputBuffer; +import org.apache.cassandra.service.accord.AccordFastPath; +import org.apache.cassandra.service.accord.AccordStaleReplicas; +import org.apache.cassandra.service.consensus.migration.ConsensusMigrationState; +import org.apache.cassandra.tcm.membership.NodeVersion; +import org.apache.cassandra.tcm.serialization.AsymmetricMetadataSerializers; +import org.apache.cassandra.tcm.serialization.Version; +import org.apache.cassandra.utils.CassandraGenerators.ClusterMetadataBuilder; +import org.apache.cassandra.utils.Generators; +import org.assertj.core.api.Assertions; + +import static accord.utils.Property.qt; + +public class ClusterMetadataSerializerTest +{ + static + { + DatabaseDescriptor.toolInitialization(); + } + + @Test + public void serdeLatest() + { + DataOutputBuffer output = new DataOutputBuffer(); + qt().forAll(Generators.toGen(new ClusterMetadataBuilder().build())).check(cm -> { + AsymmetricMetadataSerializers.testSerde(output, ClusterMetadata.serializer, cm, NodeVersion.CURRENT_METADATA_VERSION); + }); + } + + @Test + public void serdeWithoutAccord() + { + DataOutputBuffer output = new DataOutputBuffer(); + Gen gen = Generators.toGen(new ClusterMetadataBuilder().build()).filter(cm -> { + if (!cm.consensusMigrationState.equals(ConsensusMigrationState.EMPTY)) + return true; + if (!cm.accordStaleReplicas.equals(AccordStaleReplicas.EMPTY)) + return true; + if (!cm.accordFastPath.equals(AccordFastPath.EMPTY)) + return true; + return false; + }); + qt().forAll(gen).check(cm -> { + output.clear(); + Version version = Version.V2; // this is the version before accord + long expectedSize = ClusterMetadata.serializer.serializedSize(cm, version); + ClusterMetadata.serializer.serialize(cm, output, version); + Assertions.assertThat(output.getLength()).describedAs("The serialized size and bytes written do not match").isEqualTo(expectedSize); + DataInputBuffer in = new DataInputBuffer(output.unsafeGetBufferAndFlip(), false); + ClusterMetadata read = ClusterMetadata.serializer.deserialize(in, version); + Assertions.assertThat(read).isNotEqualTo(cm); + + Assertions.assertThat(read.consensusMigrationState).isEqualTo(ConsensusMigrationState.EMPTY); + Assertions.assertThat(read.accordStaleReplicas).isEqualTo(AccordStaleReplicas.EMPTY); + Assertions.assertThat(read.accordFastPath).isEqualTo(AccordFastPath.EMPTY); + }); + } +} diff --git a/test/unit/org/apache/cassandra/tcm/ClusterMetadataTransformationTest.java b/test/unit/org/apache/cassandra/tcm/ClusterMetadataTransformationTest.java index bd91832d00bb..d3bb189c90fb 100644 --- a/test/unit/org/apache/cassandra/tcm/ClusterMetadataTransformationTest.java +++ b/test/unit/org/apache/cassandra/tcm/ClusterMetadataTransformationTest.java @@ -24,8 +24,10 @@ import java.util.concurrent.ThreadLocalRandom; import com.google.common.collect.Iterables; +import org.junit.BeforeClass; import org.junit.Test; +import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.dht.Murmur3Partitioner; import org.apache.cassandra.io.util.DataInputBuffer; import org.apache.cassandra.io.util.DataOutputBuffer; @@ -50,12 +52,7 @@ import org.apache.cassandra.tcm.sequences.LockedRanges; import org.mockito.Mockito; -import static org.apache.cassandra.tcm.MetadataKeys.DATA_PLACEMENTS; -import static org.apache.cassandra.tcm.MetadataKeys.IN_PROGRESS_SEQUENCES; -import static org.apache.cassandra.tcm.MetadataKeys.LOCKED_RANGES; -import static org.apache.cassandra.tcm.MetadataKeys.NODE_DIRECTORY; -import static org.apache.cassandra.tcm.MetadataKeys.SCHEMA; -import static org.apache.cassandra.tcm.MetadataKeys.TOKEN_MAP; +import static org.apache.cassandra.tcm.MetadataKeys.*; import static org.apache.cassandra.tcm.ownership.OwnershipUtils.randomPlacements; import static org.apache.cassandra.tcm.ownership.OwnershipUtils.token; import static org.apache.cassandra.tcm.sequences.SequencesUtils.affectedRanges; @@ -66,6 +63,12 @@ public class ClusterMetadataTransformationTest { + @BeforeClass + public static void init() + { + DatabaseDescriptor.toolInitialization(); + } + long seed = System.nanoTime(); Random random = new Random(seed); @@ -272,7 +275,7 @@ private static void assertModifications(Transformed transformed, MetadataKey... // anything modified by in this transformation, and therefore included in the modified keys, // should have the same epoch as the CM itself. Anything not modified now must have a strictly // earlier epoch - for (MetadataKey key : Iterables.concat(MetadataKeys.CORE_METADATA, transformed.metadata.extensions.keySet())) + for (MetadataKey key : Iterables.concat(MetadataKeys.CORE_METADATA.keySet(), transformed.metadata.extensions.keySet())) { MetadataValue value = valueFor(key, transformed.metadata); if (transformed.modifiedKeys.contains(key)) @@ -284,7 +287,7 @@ private static void assertModifications(Transformed transformed, MetadataKey... private static MetadataValue valueFor(MetadataKey key, ClusterMetadata metadata) { - if (!MetadataKeys.CORE_METADATA.contains(key)) + if (!MetadataKeys.CORE_METADATA.containsKey(key)) { assert key instanceof ExtensionKey; return metadata.extensions.get((ExtensionKey)key); @@ -302,6 +305,12 @@ else if (key == LOCKED_RANGES) return metadata.lockedRanges; else if (key == IN_PROGRESS_SEQUENCES) return metadata.inProgressSequences; + else if (key == ACCORD_FAST_PATH) + return metadata.accordFastPath; + else if (key == CONSENSUS_MIGRATION_STATE) + return metadata.consensusMigrationState; + else if (key == ACCORD_STALE_REPLICAS) + return metadata.accordStaleReplicas; throw new IllegalArgumentException("Unknown metadata key " + key); } diff --git a/test/unit/org/apache/cassandra/tcm/DiscoverySimulationTest.java b/test/unit/org/apache/cassandra/tcm/DiscoverySimulationTest.java index 564eb98f8457..60213554420f 100644 --- a/test/unit/org/apache/cassandra/tcm/DiscoverySimulationTest.java +++ b/test/unit/org/apache/cassandra/tcm/DiscoverySimulationTest.java @@ -36,7 +36,7 @@ import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.dht.Murmur3Partitioner; -import org.apache.cassandra.exceptions.RequestFailureReason; +import org.apache.cassandra.exceptions.RequestFailure; import org.apache.cassandra.locator.InetAddressAndPort; import org.apache.cassandra.net.ConnectionType; import org.apache.cassandra.net.IVerbHandler; @@ -194,7 +194,7 @@ public void sendWithCallback(Message message, InetAddressAndPort else { logger.info("{} simulating failure sending request to {}", addr, to); - cb.onFailure(to, RequestFailureReason.TIMEOUT); + cb.onFailure(to, RequestFailure.TIMEOUT); } } catch (IOException e) diff --git a/test/unit/org/apache/cassandra/tcm/ValidatingClusterMetadataService.java b/test/unit/org/apache/cassandra/tcm/ValidatingClusterMetadataService.java new file mode 100644 index 000000000000..8c2dc5a2ecb3 --- /dev/null +++ b/test/unit/org/apache/cassandra/tcm/ValidatingClusterMetadataService.java @@ -0,0 +1,176 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.tcm; + +import java.io.IOException; +import java.util.List; +import java.util.TreeMap; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableSet; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.dht.IPartitioner; +import org.apache.cassandra.dht.Murmur3Partitioner; +import org.apache.cassandra.io.util.DataOutputBuffer; +import org.apache.cassandra.tcm.log.Entry; +import org.apache.cassandra.tcm.log.LogState; +import org.apache.cassandra.tcm.sequences.LockedRanges; +import org.apache.cassandra.tcm.serialization.AsymmetricMetadataSerializer; +import org.apache.cassandra.tcm.serialization.AsymmetricMetadataSerializers; +import org.apache.cassandra.tcm.serialization.Version; +import org.assertj.core.api.Assertions; + +public class ValidatingClusterMetadataService extends StubClusterMetadataService +{ + private final List supportedVersions; + private final TreeMap epochs = new TreeMap<>(); + + private ValidatingClusterMetadataService(List supportedVersions) + { + super(new ClusterMetadata(safeGetPartitioner())); + this.supportedVersions = supportedVersions; + } + + public static ValidatingClusterMetadataService createAndRegister(Version minVersion) + { + return createAndRegister(minVersion.greaterThanOrEqual()); + } + + public static ValidatingClusterMetadataService createAndRegister(List supportedVersions) + { + ValidatingClusterMetadataService cms = new ValidatingClusterMetadataService(supportedVersions); + + ClusterMetadataService.unsetInstance(); + ClusterMetadataService.setInstance(cms); + return cms; + } + + private static IPartitioner safeGetPartitioner() + { + IPartitioner partitioner = DatabaseDescriptor.getPartitioner(); + return partitioner == null ? Murmur3Partitioner.instance : partitioner; + } + + private void testSerde(AsymmetricMetadataSerializer serializer, In input) + { + for (Version version : supportedVersions) + { + try (DataOutputBuffer buffer = DataOutputBuffer.scratchBuffer.get()) + { + AsymmetricMetadataSerializers.testSerde(buffer, serializer, input, version); + } + catch (IOException e) + { + throw new AssertionError(String.format("Serde error for version=%s; input=%s", version, input), e); + } + } + } + + @Override + protected Transformation.Result execute(Transformation transform) + { + Transformation.Result result = super.execute(transform); + if (result.isSuccess()) + { + Transformation.Success success = result.success(); + Assertions.assertThat(success.affectedMetadata) + .describedAs("Affected Metadata keys do not match") + .isEqualTo(MetadataKeys.diffKeys(metadata(), success.metadata)); + } + return result; + } + + @Override + public T1 commit(Transformation transform, CommitSuccessHandler onSuccess, CommitFailureHandler onFailure) + { + testSerde(transform.kind().serializer(), transform); + return super.commit(transform, onSuccess, onFailure); + } + + @Override + public void setMetadata(ClusterMetadata metadata) + { + if (!metadata.epoch.equals(metadata().epoch.nextEpoch())) + throw new AssertionError("Epochs were not sequential: expected " + metadata().epoch.nextEpoch() + " but given " + metadata.epoch); + testSerde(ClusterMetadata.serializer, metadata); + epochs.put(metadata.epoch, metadata); + super.setMetadata(metadata); + } + + @Override + public Processor processor() + { + Processor delegate = super.processor(); + return new Processor() + { + @Override + public Commit.Result commit(Entry.Id entryId, Transformation transform, Epoch lastKnown, Retry retryPolicy) + { + return delegate.commit(entryId, transform, lastKnown, retryPolicy); + } + + @Override + public ClusterMetadata fetchLogAndWait(Epoch waitFor, Retry retryPolicy) + { + return delegate.fetchLogAndWait(waitFor, retryPolicy); + } + + @Override + public LogState getLocalState(Epoch lowEpoch, Epoch highEpoch, boolean includeSnapshot) + { + if (!epochs.containsKey(lowEpoch)) + throw new AssertionError("Unknown epoch: " + lowEpoch); + ClusterMetadata base = epochs.get(lowEpoch); + ImmutableList.Builder entries = ImmutableList.builder(); + int id = 0; + for (ClusterMetadata cm : epochs.subMap(lowEpoch, false, highEpoch, true).values()) + entries.add(new Entry(new Entry.Id(id++), cm.epoch, new MockTransformer(cm))); + return new LogState(includeSnapshot ? base : null, entries.build()); + } + + @Override + public LogState getLogState(Epoch lowEpoch, Epoch highEpoch, boolean includeSnapshot, Retry retryPolicy) + { + return getLocalState(lowEpoch, highEpoch, includeSnapshot); + } + }; + } + + private static class MockTransformer implements Transformation + { + private final ClusterMetadata result; + + private MockTransformer(ClusterMetadata result) + { + this.result = result; + } + + @Override + public Kind kind() + { + return null; + } + + @Override + public Result execute(ClusterMetadata metadata) + { + return new Success(result, LockedRanges.AffectedRanges.EMPTY, ImmutableSet.of()); + } + } +} diff --git a/test/unit/org/apache/cassandra/tcm/listeners/MetadataSnapshotListenerTest.java b/test/unit/org/apache/cassandra/tcm/listeners/MetadataSnapshotListenerTest.java index 2f962f1ab274..4c57a76c8b91 100644 --- a/test/unit/org/apache/cassandra/tcm/listeners/MetadataSnapshotListenerTest.java +++ b/test/unit/org/apache/cassandra/tcm/listeners/MetadataSnapshotListenerTest.java @@ -20,6 +20,7 @@ import java.util.Random; +import org.apache.cassandra.config.DatabaseDescriptor; import org.junit.Before; import org.junit.BeforeClass; import org.junit.Test; @@ -50,7 +51,7 @@ public class MetadataSnapshotListenerTest { private static final Logger logger = LoggerFactory.getLogger(MetadataSnapshotListenerTest.class); - private IPartitioner partitioner = Murmur3Partitioner.instance; + private final IPartitioner partitioner = Murmur3Partitioner.instance; private Random r; @BeforeClass @@ -59,6 +60,7 @@ public static void disableSortedReplicaGroups() // Set this so that we don't attempt to sort the random placements as this depends on a populated // TokenMap. This is a temporary element of ClusterMetadata, at least in the current form CassandraRelevantProperties.TCM_SORT_REPLICA_GROUPS.setBoolean(false); + DatabaseDescriptor.daemonInitialization(); } @Before diff --git a/test/unit/org/apache/cassandra/tcm/log/DistributedLogStateTest.java b/test/unit/org/apache/cassandra/tcm/log/DistributedLogStateTest.java index 37cee7237347..656ee5551097 100644 --- a/test/unit/org/apache/cassandra/tcm/log/DistributedLogStateTest.java +++ b/test/unit/org/apache/cassandra/tcm/log/DistributedLogStateTest.java @@ -102,9 +102,9 @@ public void snapshotMetadata() } @Override - public LogState getLogState(Epoch since) + public LogReader reader() { - return reader.getLogState(since); + return reader; } @Override diff --git a/test/unit/org/apache/cassandra/tcm/log/LocalLogTest.java b/test/unit/org/apache/cassandra/tcm/log/LocalLogTest.java index fbdafb131d63..20bb5c79b06c 100644 --- a/test/unit/org/apache/cassandra/tcm/log/LocalLogTest.java +++ b/test/unit/org/apache/cassandra/tcm/log/LocalLogTest.java @@ -145,6 +145,11 @@ public EntryHolder getEntries(Epoch since) throws IOException return new EntryHolder(since); } + public EntryHolder getEntries(Epoch since, Epoch until) throws IOException + { + return new EntryHolder(since); + } + @Override public MetadataSnapshots snapshots() { diff --git a/test/unit/org/apache/cassandra/tcm/log/LocalStorageLogStateTest.java b/test/unit/org/apache/cassandra/tcm/log/LocalStorageLogStateTest.java index 5bc6ec0fa831..196c69ed7ec8 100644 --- a/test/unit/org/apache/cassandra/tcm/log/LocalStorageLogStateTest.java +++ b/test/unit/org/apache/cassandra/tcm/log/LocalStorageLogStateTest.java @@ -90,9 +90,9 @@ public void snapshotMetadata() throws IOException } @Override - public LogState getLogState(Epoch since) + public LogReader reader() { - return storage.getLogState(since); + return storage; } @Override diff --git a/test/unit/org/apache/cassandra/tcm/log/LogListenerNotificationTest.java b/test/unit/org/apache/cassandra/tcm/log/LogListenerNotificationTest.java index 59aafa37e0d2..55df4e868e4a 100644 --- a/test/unit/org/apache/cassandra/tcm/log/LogListenerNotificationTest.java +++ b/test/unit/org/apache/cassandra/tcm/log/LogListenerNotificationTest.java @@ -111,7 +111,7 @@ static ClusterMetadata cm() static Set affectedMetadata(Random random) { - List src = new ArrayList<>(CORE_METADATA); + List src = new ArrayList<>(CORE_METADATA.keySet()); int required = random.nextInt(src.size()); Set keys = new HashSet<>(); while (keys.size() < required) diff --git a/test/unit/org/apache/cassandra/tcm/log/LogStateTestBase.java b/test/unit/org/apache/cassandra/tcm/log/LogStateTestBase.java index c7df0141fbbb..5342930da03f 100644 --- a/test/unit/org/apache/cassandra/tcm/log/LogStateTestBase.java +++ b/test/unit/org/apache/cassandra/tcm/log/LogStateTestBase.java @@ -20,18 +20,25 @@ import java.io.IOException; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; +import java.util.Objects; +import java.util.stream.Stream; import org.junit.Before; import org.junit.Test; +import accord.utils.Gen; +import accord.utils.Gens; import org.apache.cassandra.dht.Murmur3Partitioner; import org.apache.cassandra.distributed.test.log.ClusterMetadataTestHelper; import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.tcm.Epoch; import org.apache.cassandra.tcm.MetadataSnapshots; import org.apache.cassandra.tcm.sequences.SequencesUtils; +import org.assertj.core.api.Assertions; +import static accord.utils.Property.qt; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNull; import static org.junit.Assert.assertTrue; @@ -44,13 +51,42 @@ public abstract class LogStateTestBase static int EXTRA_ENTRIES = 2; static Epoch CURRENT_EPOCH = Epoch.create((NUM_SNAPSHOTS * SNAPSHOT_FREQUENCY) + EXTRA_ENTRIES); static Epoch LATEST_SNAPSHOT_EPOCH = Epoch.create(NUM_SNAPSHOTS * SNAPSHOT_FREQUENCY); + private static final Gen.LongGen EPOCH_GEN = rs -> rs.nextLong(0, CURRENT_EPOCH.getEpoch()) + 1; + private static final Gen BETWEEN_GEN = rs -> { + long a = EPOCH_GEN.nextLong(rs); + long b = EPOCH_GEN.nextLong(rs); + while (b == a) + b = EPOCH_GEN.nextLong(rs); + if (b < a) + { + long tmp = a; + a = b; + b = tmp; + } + return new Between(Epoch.create(a), Epoch.create(b)); + }; + private static final Gen SNAPSHOTS_GEN = Gens.oneOf() + .add(i -> MetadataSnapshots.NO_OP) + .add(i -> throwing()) + .add(rs -> rs.nextBoolean() ? withCorruptSnapshots(LATEST_SNAPSHOT_EPOCH) : withAvailableSnapshots(LATEST_SNAPSHOT_EPOCH)) + .add(rs -> { + Epoch[] queriedEpochs = new Epoch[NUM_SNAPSHOTS]; + for (int i = 0; i < NUM_SNAPSHOTS; i++) + queriedEpochs[i] = SequencesUtils.epoch((NUM_SNAPSHOTS - i) * SNAPSHOT_FREQUENCY); + return rs.nextBoolean() ? withCorruptSnapshots(queriedEpochs) : withAvailableSnapshots(queriedEpochs); + }) + .build(); interface LogStateSUT { void cleanup() throws IOException; void insertRegularEntry() throws IOException; void snapshotMetadata() throws IOException; - LogState getLogState(Epoch since); + LogReader reader(); + default LogState getLogState(Epoch since) + { + return reader().getLogState(since); + } // just for manually checking the test data void dumpTables() throws IOException; @@ -113,6 +149,11 @@ public List listSnapshotsSince(Epoch epoch) return list; } + @Override + public String toString() + { + return (corrupt ? "Corrupted" : "") + "Snapshots{" + Arrays.toString(Stream.of(expected).mapToLong(e -> e.getEpoch()).toArray()) + '}'; + } }; static MetadataSnapshots withCorruptSnapshots(Epoch ... expected) @@ -135,6 +176,12 @@ public ClusterMetadata getSnapshot(Epoch epoch) fail("Did not expect to request a snapshot"); return null; } + + @Override + public String toString() + { + return "Throwing"; + } }; } @@ -244,6 +291,47 @@ public void sinceArbitraryEpochWithMultipleCorruptSnapshots() assertEntries(state.entries, since.nextEpoch(), CURRENT_EPOCH); } + @Test + public void getLogStateBetween() + { + qt().forAll(SNAPSHOTS_GEN, BETWEEN_GEN).check((snapshots, between) -> { + LogStateSUT sut = getSystemUnderTest(snapshots); + LogState state = sut.reader().getLogState(between.start, between.end, true); + Assertions.assertThat(state.entries).describedAs("with and without snapshot should have the same entries").isEqualTo(sut.reader().getLogState(between.start, between.end, false).entries); + Assertions.assertThat(state.baseState.epoch).isEqualTo(between.start); + + List entries = state.entries; + Assertions.assertThat(entries.size()).isEqualTo(between.end.getEpoch() - between.start.getEpoch()); + + long expected = between.start.nextEpoch().getEpoch(); + for (Entry e : entries) + { + long actual = e.epoch.getEpoch(); + Assertions.assertThat(actual).describedAs("Unexpected epoch").isEqualTo(expected); + expected++; + } + }); + } + + @Test + public void getEntriesBetween() + { + qt().forAll(SNAPSHOTS_GEN, BETWEEN_GEN).check((snapshots, between) -> { + LogStateSUT sut = getSystemUnderTest(snapshots); + LogReader.EntryHolder entries = sut.reader().getEntries(between.start, between.end); + Assertions.assertThat(entries.since).isEqualTo(between.start); + Assertions.assertThat(entries.entries.size()).isEqualTo(between.end.getEpoch() - between.start.getEpoch()); + + long expected = between.start.nextEpoch().getEpoch(); + for (Entry e : entries.entries) + { + long actual = e.epoch.getEpoch(); + Assertions.assertThat(actual).describedAs("Unexpected epoch").isEqualTo(expected); + expected++; + } + }); + } + private void assertEntries(List entries, Epoch min, Epoch max) { int idx = 0; @@ -255,4 +343,39 @@ private void assertEntries(List entries, Epoch min, Epoch max) } assertEquals(idx, entries.size()); } + + private static class Between + { + private final Epoch start, end; + + private Between(Epoch start, Epoch end) + { + this.start = start; + this.end = end; + } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + Between between = (Between) o; + return start.equals(between.start) && end.equals(between.end); + } + + @Override + public int hashCode() + { + return Objects.hash(start, end); + } + + @Override + public String toString() + { + return "Between{" + + "start=" + start.getEpoch() + + ", end=" + end.getEpoch() + + '}'; + } + } } diff --git a/test/unit/org/apache/cassandra/tcm/membership/DirectoryTest.java b/test/unit/org/apache/cassandra/tcm/membership/DirectoryTest.java new file mode 100644 index 000000000000..ea66961ee5cf --- /dev/null +++ b/test/unit/org/apache/cassandra/tcm/membership/DirectoryTest.java @@ -0,0 +1,126 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.tcm.membership; + +import org.junit.Test; + +import static org.apache.cassandra.tcm.membership.MembershipUtils.endpoint; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + +public class DirectoryTest +{ + + @Test + public void updateLocationTest() + { + Location DC1_R1 = new Location("datacenter1", "rack1"); + Directory dir = new Directory(); + assertTrue(dir.isEmpty()); + assertTrue(dir.knownDatacenters().isEmpty()); + + NodeId missing = new NodeId(1000); + assertInvalidLocationUpdate(dir, missing, DC1_R1, "Node " + missing + " has no registered location to update"); + + // add a new node and retrieve its Location + NodeAddresses addresses = new NodeAddresses(endpoint(1)); + dir = dir.with(addresses, DC1_R1); + NodeId node = dir.peerId(addresses.broadcastAddress); + assertEquals(DC1_R1, dir.location(node)); + assertTrue(dir.knownDatacenters().contains("datacenter1")); + + // endpoints by DC & rack are not updated immediately, this is an explicit step when a node joins + assertTrue(dir.allDatacenterEndpoints().isEmpty()); + assertTrue(dir.allDatacenterRacks().isEmpty()); + + // when a node joins, its DC and rack become active + dir = dir.withRackAndDC(node); + assertTrue(dir.allDatacenterEndpoints().asMap().get("datacenter1").contains(addresses.broadcastAddress)); + assertTrue(dir.allDatacenterRacks().get("datacenter1").get("rack1").contains(addresses.broadcastAddress)); + + // update rack + Location DC1_R2 = new Location("datacenter1", "rack2"); + dir = dir.withUpdatedRackAndDc(node, DC1_R2); + assertEquals(DC1_R2, dir.location(node)); + assertTrue(dir.allDatacenterEndpoints().asMap().get("datacenter1").contains(addresses.broadcastAddress)); + assertTrue(dir.allDatacenterRacks().get("datacenter1").get("rack2").contains(addresses.broadcastAddress)); + // previous rack is no longer present as it was made empty + assertFalse(dir.allDatacenterRacks().get("datacenter1").containsKey("rack1")); + + // update DC + Location DC2_R2 = new Location("datacenter2", "rack2"); + dir = dir.withUpdatedRackAndDc(node, DC2_R2); + assertEquals(DC2_R2, dir.location(node)); + assertTrue(dir.allDatacenterEndpoints().asMap().get("datacenter2").contains(addresses.broadcastAddress)); + assertTrue(dir.allDatacenterRacks().get("datacenter2").get("rack2").contains(addresses.broadcastAddress)); + // datacenter1 is no longer present as it was made empty + assertFalse(dir.allDatacenterRacks().containsKey("datacenter1")); + assertFalse(dir.knownDatacenters().contains("datacenter1")); + assertTrue(dir.knownDatacenters().contains("datacenter2")); + + // Add a second node in the same dc & rack + NodeAddresses otherAddresses = new NodeAddresses(endpoint(2)); + dir = dir.with(otherAddresses, DC2_R2); + NodeId otherNode = dir.peerId(otherAddresses.broadcastAddress); + dir = dir.withRackAndDC(otherNode); + assertTrue(dir.allDatacenterEndpoints().asMap().get("datacenter2").contains(addresses.broadcastAddress)); + assertTrue(dir.allDatacenterEndpoints().asMap().get("datacenter2").contains(otherAddresses.broadcastAddress)); + assertTrue(dir.allDatacenterRacks().get("datacenter2").get("rack2").contains(addresses.broadcastAddress)); + assertTrue(dir.allDatacenterRacks().get("datacenter2").get("rack2").contains(otherAddresses.broadcastAddress)); + + // now updating the rack of the first node should not remove rack2 altogether as it not empty + Location DC2_R3 = new Location("datacenter2", "rack3"); + dir = dir.withUpdatedRackAndDc(node, DC2_R3); + assertEquals(DC2_R3, dir.location(node)); + // updated node is removed from rack2 and added to rack3 + assertTrue(dir.allDatacenterEndpoints().asMap().get("datacenter2").contains(addresses.broadcastAddress)); + assertTrue(dir.allDatacenterRacks().get("datacenter2").get("rack3").contains(addresses.broadcastAddress)); + assertFalse(dir.allDatacenterRacks().get("datacenter2").get("rack2").contains(addresses.broadcastAddress)); + // other node is still present in rack2 + assertTrue(dir.allDatacenterEndpoints().asMap().get("datacenter2").contains(otherAddresses.broadcastAddress)); + assertTrue(dir.allDatacenterRacks().get("datacenter2").get("rack2").contains(otherAddresses.broadcastAddress)); + assertFalse(dir.allDatacenterRacks().get("datacenter2").get("rack3").contains(otherAddresses.broadcastAddress)); + + // simulate what happens when the nodes leave the cluster + dir = dir.withoutRackAndDC(otherNode); + assertFalse(dir.allDatacenterEndpoints().asMap().get("datacenter2").contains(otherAddresses.broadcastAddress)); + assertFalse(dir.allDatacenterRacks().get("datacenter2").containsKey("rack2")); + assertTrue(dir.allDatacenterEndpoints().asMap().get("datacenter2").contains(addresses.broadcastAddress)); + assertTrue(dir.allDatacenterRacks().get("datacenter2").get("rack3").contains(addresses.broadcastAddress)); + + dir = dir.withoutRackAndDC(node); + assertTrue(dir.allDatacenterEndpoints().isEmpty()); + assertTrue(dir.allDatacenterRacks().isEmpty()); + } + + private void assertInvalidLocationUpdate(Directory dir, NodeId nodeId, Location loc, String message) + { + try + { + dir.withUpdatedRackAndDc(nodeId, loc); + fail("Expected an exception"); + } + catch (IllegalArgumentException e) + { + assertTrue(e.getMessage().equals(message)); + } + } +} diff --git a/test/unit/org/apache/cassandra/tcm/sequences/DropAccordTableTest.java b/test/unit/org/apache/cassandra/tcm/sequences/DropAccordTableTest.java new file mode 100644 index 000000000000..8b183688092d --- /dev/null +++ b/test/unit/org/apache/cassandra/tcm/sequences/DropAccordTableTest.java @@ -0,0 +1,223 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.tcm.sequences; + +import java.util.TreeSet; +import java.util.stream.Stream; + +import org.junit.Test; + +import accord.utils.Gen; +import accord.utils.Property; +import accord.utils.Property.Command; +import accord.utils.RandomSource; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.dht.Murmur3Partitioner; +import org.apache.cassandra.locator.MetaStrategy; +import org.apache.cassandra.schema.DistributedSchema; +import org.apache.cassandra.schema.KeyspaceMetadata; +import org.apache.cassandra.schema.KeyspaceParams; +import org.apache.cassandra.schema.Keyspaces; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.schema.Types; +import org.apache.cassandra.service.consensus.TransactionalMode; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.tcm.ClusterMetadataService; +import org.apache.cassandra.tcm.MultiStepOperation; +import org.apache.cassandra.tcm.StubClusterMetadataService; +import org.apache.cassandra.tcm.ValidatingClusterMetadataService; +import org.apache.cassandra.tcm.serialization.Version; +import org.apache.cassandra.tcm.transformations.PrepareDropAccordTable; +import org.apache.cassandra.tcm.sequences.DropAccordTable.TableReference; +import org.apache.cassandra.utils.AbstractTypeGenerators; +import org.apache.cassandra.utils.CassandraGenerators; +import org.apache.cassandra.utils.CassandraGenerators.TableMetadataBuilder; +import org.apache.cassandra.utils.Generators; +import org.assertj.core.api.Assertions; +import org.quicktheories.generators.SourceDSL; + +import static accord.utils.Property.commands; +import static accord.utils.Property.qt; +import static accord.utils.Property.stateful; +import static org.apache.cassandra.utils.CassandraGenerators.TABLE_ID_GEN; + +public class DropAccordTableTest +{ + static + { + DatabaseDescriptor.clientInitialization(); + } + + private static final TransactionalMode[] ACCORD_ENABLED_MODES = Stream.of(TransactionalMode.values()) + .filter(t -> t.accordIsEnabled) + .toArray(TransactionalMode[]::new); + + private static final Gen TABLE_GEN = Generators.toGen(defaultTableMetadataBuilder().build()); + + private static TableMetadataBuilder defaultTableMetadataBuilder() + { + return new TableMetadataBuilder() + .withUseCounter(false) + .withPartitioner(Murmur3Partitioner.instance) + .withTransactionalMode(SourceDSL.arbitrary().pick(ACCORD_ENABLED_MODES)); + } + + @Test + public void e2e() + { + qt().check(rs -> { + ValidatingClusterMetadataService cms = createCMS(); + TableMetadata metadata = TABLE_GEN.next(rs); + addTable(cms, metadata); // hack this table into the schema... + + TableReference table = TableReference.from(metadata); + + cms.commit(new PrepareDropAccordTable(table)); + + // This is only here because "applyTo" is not touched without it... + for (KeyspaceMetadata ks : cms.metadata().schema.getKeyspaces()) + cms.metadata().writePlacementAllSettled(ks); + + Assertions.assertThat(cms.metadata().inProgressSequences.isEmpty()).isFalse(); + InProgressSequences.finishInProgressSequences(table); + Assertions.assertThat(cms.metadata().inProgressSequences.isEmpty()).isTrue(); + + // table is dropped + Assertions.assertThat(cms.metadata().schema.getTableMetadata(metadata.id)).isNull(); + }); + } + + @Test + public void multi() + { + stateful().withExamples(50).withSteps(500).check(commands(() -> State::new) + .destroyState(DropAccordTableTest::validate) + .add(DropAccordTableTest::addTable) + .addIf(s -> !s.aliveTables.isEmpty(), DropAccordTableTest::dropTable) + .addIf(s -> !s.cms.metadata().inProgressSequences.isEmpty(), DropAccordTableTest::inProgressSequences) + .build()); + } + + private static void validate(State state) + { + while (!state.cms.metadata().inProgressSequences.isEmpty()) + { + for (MultiStepOperation opt : state.cms.metadata().inProgressSequences) + InProgressSequences.resume(opt); + } + // all tables are dropped, unless they were never dropped + Keyspaces keyspaces = state.cms.metadata().schema.getKeyspaces(); + for (KeyspaceMetadata k : keyspaces) + { + if (k.tables.size() == 0) continue; + if (k.replicationStrategy instanceof MetaStrategy) continue; + for (TableMetadata t : k.tables) + { + Assertions.assertThat(t.params.pendingDrop).isFalse(); + Assertions.assertThat(state.aliveTables).contains(t.id); + } + } + } + + private static Command addTable(RandomSource rs, State state) + { + TableMetadata metadata = Generators.toGen(defaultTableMetadataBuilder() + .withKeyspaceName(CassandraGenerators.KEYSPACE_NAME_GEN.assuming(name -> !state.cms.metadata().schema.getKeyspaces().containsKeyspace(name))) + .withTableId(TABLE_ID_GEN.assuming(id -> state.cms.metadata().schema.getTableMetadata(id) == null)) + // other tests better cover serialization so can speed up tests by only doing primitive types + .withDefaultTypeGen(CassandraGenerators.TableMetadataBuilder.defaultTypeGen().withTypeKinds(AbstractTypeGenerators.TypeKind.PRIMITIVE)) + .build()) + .next(rs); + return new Property.SimpleCommand<>("Add Table " + metadata, s2 -> { + addTable(s2.cms, metadata); + s2.aliveTables.add(metadata.id); + }); + } + + private static Command dropTable(RandomSource rs, State state) + { + TableId id = rs.pickOrderedSet(state.aliveTables); + TableMetadata metadata = state.cms.metadata().schema.getTableMetadata(id); + return new Property.SimpleCommand<>("Drop Table " + metadata, s2 -> { + TableReference table = TableReference.from(metadata); + + s2.cms.commit(new PrepareDropAccordTable(table)); + s2.aliveTables.remove(id); + }); + } + + private static Command inProgressSequences(RandomSource rs, State state) + { + ClusterMetadata current = state.cms.metadata(); + TreeSet pending = new TreeSet<>(); + for (MultiStepOperation opt : current.inProgressSequences) + { + if (!(opt instanceof DropAccordTable)) throw new AssertionError("Only DropAccordTable should exist in this test; found " + opt); + pending.add(((DropAccordTable) opt).table); + } + TableReference ref = rs.pickOrderedSet(pending); + MultiStepOperation seq = current.inProgressSequences.get(ref); + Assertions.assertThat(seq).isNotNull(); + return new Property.SimpleCommand<>("Progress for " + ref + ": " + seq.nextStep(), s2 -> InProgressSequences.resume(seq)); + } + + public static class State + { + private final StubClusterMetadataService cms; + private final TreeSet aliveTables = new TreeSet<>(); + + public State(RandomSource rs) + { + // With validation enabled the test runtime is dominated by serialization checks, so enable them rarely + // just so tests do run with them, but the whole test runtime isn't serde testing. + if (rs.decide(0.01)) + { + cms = ValidatingClusterMetadataService.createAndRegister(Version.MIN_ACCORD_VERSION); + } + else + { + cms = StubClusterMetadataService.forTesting(new ClusterMetadata(Murmur3Partitioner.instance)); + ClusterMetadataService.unsetInstance(); + ClusterMetadataService.setInstance(cms); + } + } + } + + private static ValidatingClusterMetadataService createCMS() + { + return ValidatingClusterMetadataService.createAndRegister(Version.MIN_ACCORD_VERSION); + } + + private static void addTable(StubClusterMetadataService cms, TableMetadata table) + { + class Ref { Types types;} + // first mock out a keyspace + ClusterMetadata prev = cms.metadata(); + KeyspaceMetadata schema = KeyspaceMetadata.create(table.keyspace, KeyspaceParams.simple(3)); + Ref ref = new Ref(); + ref.types = schema.types; + CassandraGenerators.visitUDTs(table, udt -> ref.types = ref.types.with(udt.unfreeze())); + schema = schema.withSwapped(ref.types); + schema = schema.withSwapped(schema.tables.with(table)); + Keyspaces keyspaces = prev.schema.getKeyspaces().withAddedOrUpdated(schema); + ClusterMetadata metadata = prev.transformer().with(new DistributedSchema(keyspaces)).build().metadata; + cms.setMetadata(metadata); + } +} \ No newline at end of file diff --git a/test/unit/org/apache/cassandra/tcm/sequences/InProgressSequenceCancellationTest.java b/test/unit/org/apache/cassandra/tcm/sequences/InProgressSequenceCancellationTest.java index 1c1a44296b71..100979104f7e 100644 --- a/test/unit/org/apache/cassandra/tcm/sequences/InProgressSequenceCancellationTest.java +++ b/test/unit/org/apache/cassandra/tcm/sequences/InProgressSequenceCancellationTest.java @@ -34,7 +34,6 @@ import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.dht.Murmur3Partitioner; import org.apache.cassandra.dht.Token; -import org.apache.cassandra.locator.EndpointsForRange; import org.apache.cassandra.tcm.Epoch; import org.apache.cassandra.tcm.Transformation; import org.apache.cassandra.tcm.ClusterMetadata; @@ -44,10 +43,8 @@ import org.apache.cassandra.tcm.membership.NodeId; import org.apache.cassandra.tcm.membership.NodeState; import org.apache.cassandra.tcm.membership.NodeVersion; -import org.apache.cassandra.tcm.ownership.DataPlacement; import org.apache.cassandra.tcm.ownership.DataPlacements; import org.apache.cassandra.tcm.ownership.PlacementDeltas; -import org.apache.cassandra.tcm.ownership.ReplicaGroups; import org.apache.cassandra.tcm.transformations.PrepareJoin; import org.apache.cassandra.tcm.transformations.PrepareLeave; import org.apache.cassandra.tcm.transformations.PrepareReplace; @@ -304,9 +301,9 @@ private void testRevertingReplace(long seed) private void assertRelevantMetadata(ClusterMetadata first, ClusterMetadata second) { - assertPlacementsEquivalent(first.placements, second.placements); - assertTrue(first.directory.isEquivalent(second.directory)); - assertTrue(first.tokenMap.isEquivalent(second.tokenMap)); + assertTrue(first.placements.equivalentTo(second.placements)); + assertTrue(first.directory.equivalentTo(second.directory)); + assertTrue(first.tokenMap.equivalentTo(second.tokenMap)); assertEquals(first.lockedRanges.locked.keySet(), second.lockedRanges.locked.keySet()); } @@ -314,31 +311,4 @@ private static ClusterMetadata metadata(Directory directory) { return new ClusterMetadata(Murmur3Partitioner.instance, directory); } - - private void assertPlacementsEquivalent(DataPlacements first, DataPlacements second) - { - assertEquals(first.keys(), second.keys()); - - first.asMap().forEach((params, placement) -> { - DataPlacement otherPlacement = second.get(params); - ReplicaGroups r1 = placement.reads; - ReplicaGroups r2 = otherPlacement.reads; - assertEquals(r1.ranges, r2.ranges); - r1.forEach((range, e1) -> { - EndpointsForRange e2 = r2.forRange(range).get(); - assertEquals(e1.size(),e2.size()); - assertTrue(e1.get().stream().allMatch(e2::contains)); - }); - - ReplicaGroups w1 = placement.reads; - ReplicaGroups w2 = otherPlacement.reads; - assertEquals(w1.ranges, w2.ranges); - w1.forEach((range, e1) -> { - EndpointsForRange e2 = w2.forRange(range).get(); - assertEquals(e1.size(),e2.size()); - assertTrue(e1.get().stream().allMatch(e2::contains)); - }); - - }); - } } diff --git a/test/unit/org/apache/cassandra/tcm/sequences/ProgressBarrierTest.java b/test/unit/org/apache/cassandra/tcm/sequences/ProgressBarrierTest.java index fe73677e2575..ba431db848a7 100644 --- a/test/unit/org/apache/cassandra/tcm/sequences/ProgressBarrierTest.java +++ b/test/unit/org/apache/cassandra/tcm/sequences/ProgressBarrierTest.java @@ -37,7 +37,7 @@ import org.apache.cassandra.db.ConsistencyLevel; import org.apache.cassandra.distributed.api.IIsolatedExecutor; import org.apache.cassandra.distributed.test.log.CMSTestBase; -import org.apache.cassandra.exceptions.RequestFailureReason; +import org.apache.cassandra.exceptions.RequestFailure; import org.apache.cassandra.harry.gen.EntropySource; import org.apache.cassandra.harry.gen.Surjections; import org.apache.cassandra.harry.gen.rng.PCGFastPure; @@ -147,13 +147,14 @@ public void sendWithCallback(Message message, InetAddressAndPort } else { - cb.onFailure(message.from(), RequestFailureReason.TIMEOUT); + cb.onFailure(message.from(), RequestFailure.TIMEOUT); } } public void send(Message message, InetAddressAndPort to) {} public void sendWithCallback(Message message, InetAddressAndPort to, RequestCallback cb, ConnectionType specifyConnection) {} public Future> sendWithResult(Message message, InetAddressAndPort to) { return null; } + @Override public void respond(V response, Message message) {} }; ProgressBarrier progressBarrier = ((MultiStepOperation)metadata.inProgressSequences.get(node.nodeId())) @@ -355,4 +356,4 @@ public static Surjections.Surjection combine(Surjecti gen4.inflate(PCGFastPure.next(l, 2))); }; } -} \ No newline at end of file +} diff --git a/test/unit/org/apache/cassandra/tcm/sequences/SequencesUtils.java b/test/unit/org/apache/cassandra/tcm/sequences/SequencesUtils.java index eb310cc78b08..44b3b59ce3dd 100644 --- a/test/unit/org/apache/cassandra/tcm/sequences/SequencesUtils.java +++ b/test/unit/org/apache/cassandra/tcm/sequences/SequencesUtils.java @@ -18,19 +18,27 @@ package org.apache.cassandra.tcm.sequences; +import java.io.Serializable; import java.util.List; import java.util.Random; import java.util.Set; import java.util.function.Predicate; import org.apache.cassandra.dht.IPartitioner; +import org.apache.cassandra.dht.Murmur3Partitioner; import org.apache.cassandra.dht.Range; import org.apache.cassandra.dht.Token; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.schema.ReplicationParams; +import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.tcm.Epoch; import org.apache.cassandra.tcm.Transformation; import org.apache.cassandra.tcm.membership.NodeId; import org.apache.cassandra.tcm.ownership.DataPlacements; import org.apache.cassandra.tcm.ownership.PlacementDeltas; +import org.apache.cassandra.tcm.serialization.AsymmetricMetadataSerializer; +import org.apache.cassandra.tcm.serialization.Version; import org.apache.cassandra.tcm.transformations.PrepareJoin; import org.apache.cassandra.tcm.transformations.PrepareLeave; import org.apache.cassandra.tcm.transformations.PrepareMove; @@ -174,4 +182,68 @@ public static Epoch epoch(int epoch) { return Epoch.create(epoch); } + + // Custom transforms to lock/unlock an arbitrary set of ranges to + // avoid having to actually initiate some range movement + public static class LockRanges implements Transformation, Serializable + { + public static final AsymmetricMetadataSerializer serializer = new AsymmetricMetadataSerializer() + { + @Override + public void serialize(Transformation t, DataOutputPlus out, Version version){} + @Override + public LockRanges deserialize(DataInputPlus in, Version version) {return new LockRanges();} + @Override + public long serializedSize(Transformation t, Version version) {return 0;} + }; + + public static final String NAME = "TestLockRanges"; + + // at the moment, the detail of the specific LockedRanges doesn't matter, transformations + // which are rejected in the presence of locking are rejected whatever is actually locked + private static final LockedRanges.AffectedRanges toLock = + LockedRanges.AffectedRanges.singleton(ReplicationParams.simple(3), + new Range<>(Murmur3Partitioner.instance.getMinimumToken(), + Murmur3Partitioner.instance.getRandomToken())); + + @Override + public Kind kind() + { + return Kind.CUSTOM; + } + + @Override + public Result execute(ClusterMetadata metadata) + { + LockedRanges newLocked = metadata.lockedRanges.lock(LockedRanges.keyFor(metadata.epoch), toLock); + return Transformation.success(metadata.transformer().with(newLocked), toLock); + } + } + + public static class ClearLockedRanges implements Transformation, Serializable + { + public static final AsymmetricMetadataSerializer serializer = new AsymmetricMetadataSerializer() + { + @Override + public void serialize(Transformation t, DataOutputPlus out, Version version) {} + @Override + public ClearLockedRanges deserialize(DataInputPlus in, Version version) {return new ClearLockedRanges();} + @Override + public long serializedSize(Transformation t, Version version) {return 0;} + }; + public static final String NAME = "TestClearLockedRanges"; + + @Override + public Kind kind() + { + return Kind.CUSTOM; + } + + @Override + public Result execute(ClusterMetadata metadata) + { + LockedRanges newLocked = LockedRanges.EMPTY; + return Transformation.success(metadata.transformer().with(newLocked), LockedRanges.AffectedRanges.EMPTY); + } + } } diff --git a/test/unit/org/apache/cassandra/tcm/serialization/AsymmetricMetadataSerializers.java b/test/unit/org/apache/cassandra/tcm/serialization/AsymmetricMetadataSerializers.java new file mode 100644 index 000000000000..bd3cd4547e0e --- /dev/null +++ b/test/unit/org/apache/cassandra/tcm/serialization/AsymmetricMetadataSerializers.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.tcm.serialization; + +import java.io.IOException; + +import org.assertj.core.api.Assertions; + +import accord.utils.LazyToString; +import accord.utils.ReflectionUtils; +import org.apache.cassandra.io.util.DataInputBuffer; +import org.apache.cassandra.io.util.DataOutputBuffer; + +public class AsymmetricMetadataSerializers +{ + public static void testSerde(DataOutputBuffer output, AsymmetricMetadataSerializer serializer, In input, Version version) throws IOException + { + output.clear(); + long expectedSize = serializer.serializedSize(input, version); + serializer.serialize(input, output, version); + Assertions.assertThat(output.getLength()).describedAs("The serialized size and bytes written do not match").isEqualTo(expectedSize); + DataInputBuffer in = new DataInputBuffer(output.unsafeGetBufferAndFlip(), false); + Out read = serializer.deserialize(in, version); + Assertions.assertThat(read).describedAs("The deserialized output does not match the serialized input; difference %s", new LazyToString(() -> ReflectionUtils.recursiveEquals(read, input).toString())).isEqualTo(input); + } +} diff --git a/test/unit/org/apache/cassandra/tcm/transformations/AccordMarkRejoiningTest.java b/test/unit/org/apache/cassandra/tcm/transformations/AccordMarkRejoiningTest.java new file mode 100644 index 000000000000..32999a8aae6b --- /dev/null +++ b/test/unit/org/apache/cassandra/tcm/transformations/AccordMarkRejoiningTest.java @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.tcm.transformations; + +import java.io.IOException; +import java.util.Collections; + +import com.google.common.collect.ImmutableSet; +import org.junit.Test; + +import org.apache.cassandra.io.util.DataOutputBuffer; +import org.apache.cassandra.tcm.membership.NodeId; +import org.apache.cassandra.tcm.serialization.AsymmetricMetadataSerializers; +import org.apache.cassandra.tcm.serialization.Version; + +public class AccordMarkRejoiningTest +{ + @Test + public void shouldSerializeEmpty() throws IOException + { + DataOutputBuffer buffer = new DataOutputBuffer(); + AsymmetricMetadataSerializers.testSerde(buffer, AccordMarkRejoining.serializer, new AccordMarkRejoining(Collections.emptySet()), Version.MIN_ACCORD_VERSION); + } + + @Test + public void shouldSerializeSingleton() throws IOException + { + DataOutputBuffer buffer = new DataOutputBuffer(); + AccordMarkRejoining markStale = new AccordMarkRejoining(Collections.singleton(NodeId.fromString("1"))); + AsymmetricMetadataSerializers.testSerde(buffer, AccordMarkRejoining.serializer, markStale, Version.MIN_ACCORD_VERSION); + } + + @Test + public void shouldSerializeMulti() throws IOException + { + DataOutputBuffer buffer = new DataOutputBuffer(); + AccordMarkRejoining markStale = new AccordMarkRejoining(ImmutableSet.of(NodeId.fromString("1"), NodeId.fromString("2"))); + AsymmetricMetadataSerializers.testSerde(buffer, AccordMarkRejoining.serializer, markStale, Version.MIN_ACCORD_VERSION); + } +} diff --git a/test/unit/org/apache/cassandra/tcm/transformations/AccordMarkStaleTest.java b/test/unit/org/apache/cassandra/tcm/transformations/AccordMarkStaleTest.java new file mode 100644 index 000000000000..baa4936a0732 --- /dev/null +++ b/test/unit/org/apache/cassandra/tcm/transformations/AccordMarkStaleTest.java @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.tcm.transformations; + +import java.io.IOException; +import java.util.Collections; + +import com.google.common.collect.ImmutableSet; +import org.junit.Test; + +import org.apache.cassandra.io.util.DataOutputBuffer; +import org.apache.cassandra.tcm.membership.NodeId; +import org.apache.cassandra.tcm.serialization.AsymmetricMetadataSerializers; +import org.apache.cassandra.tcm.serialization.Version; + +public class AccordMarkStaleTest +{ + @Test + public void shouldSerializeEmpty() throws IOException + { + DataOutputBuffer buffer = new DataOutputBuffer(); + AsymmetricMetadataSerializers.testSerde(buffer, AccordMarkStale.serializer, new AccordMarkStale(Collections.emptySet()), Version.MIN_ACCORD_VERSION); + } + + @Test + public void shouldSerializeSingleton() throws IOException + { + DataOutputBuffer buffer = new DataOutputBuffer(); + AccordMarkStale markStale = new AccordMarkStale(Collections.singleton(NodeId.fromString("1"))); + AsymmetricMetadataSerializers.testSerde(buffer, AccordMarkStale.serializer, markStale, Version.MIN_ACCORD_VERSION); + } + + @Test + public void shouldSerializeMulti() throws IOException + { + DataOutputBuffer buffer = new DataOutputBuffer(); + AccordMarkStale markStale = new AccordMarkStale(ImmutableSet.of(NodeId.fromString("1"), NodeId.fromString("2"))); + AsymmetricMetadataSerializers.testSerde(buffer, AccordMarkStale.serializer, markStale, Version.MIN_ACCORD_VERSION); + } +} diff --git a/test/unit/org/apache/cassandra/test/asserts/ExtendedAssertions.java b/test/unit/org/apache/cassandra/test/asserts/ExtendedAssertions.java index fc6040e1b99f..f5bdd7ff725d 100644 --- a/test/unit/org/apache/cassandra/test/asserts/ExtendedAssertions.java +++ b/test/unit/org/apache/cassandra/test/asserts/ExtendedAssertions.java @@ -77,7 +77,7 @@ public HistogramAssert hasMax(long expected) isNotNull(); Snapshot snapshot = actual.getSnapshot(); if (snapshot.getMax() != expected) - throw failure("Expected max %d but given %d", expected, actual.getCount()); + throw failure("Expected max %d but given %d", expected, snapshot.getMax()); return this; } } diff --git a/test/unit/org/apache/cassandra/tools/JMXStandardsTest.java b/test/unit/org/apache/cassandra/tools/JMXStandardsTest.java index 87b9ff93fded..df0a2c1be2ea 100644 --- a/test/unit/org/apache/cassandra/tools/JMXStandardsTest.java +++ b/test/unit/org/apache/cassandra/tools/JMXStandardsTest.java @@ -50,6 +50,9 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.cassandra.exceptions.InvalidRequestException; +import org.apache.cassandra.locator.InetAddressAndPort; +import org.apache.cassandra.repair.autorepair.AutoRepairConfig; import org.apache.cassandra.utils.BreaksJMX; import org.assertj.core.api.Assertions; import org.reflections.Reflections; @@ -98,6 +101,10 @@ public class JMXStandardsTest .add(IllegalStateException.class) .add(ClassNotFoundException.class) .add(OpenDataException.class) + .add(InvalidRequestException.class) + .add(AutoRepairConfig.RepairType.class) + .add(InetAddressAndPort.class) + .add(AutoRepairConfig.class) .build(); /** * This list is a set of types under java.* and javax.*, but are too vague that could cause issues; this does not diff --git a/test/unit/org/apache/cassandra/tools/NodeToolCommandTest.java b/test/unit/org/apache/cassandra/tools/NodeToolCommandTest.java index 8a3a6881ec13..6dab7136d1d3 100644 --- a/test/unit/org/apache/cassandra/tools/NodeToolCommandTest.java +++ b/test/unit/org/apache/cassandra/tools/NodeToolCommandTest.java @@ -78,7 +78,9 @@ private Map testRepairCommand(int expectedExitCode, String ...ar public void repairCommandTest() throws IOException { Map options = testRepairCommand(0, "--paxos-only", "ks"); - Assert.assertEquals(options.get(RepairOption.PAXOS_ONLY_KEY), Boolean.toString(true)); + Assert.assertEquals(options.get(RepairOption.REPAIR_DATA_KEY), Boolean.toString(false)); + Assert.assertEquals(options.get(RepairOption.REPAIR_PAXOS_KEY), Boolean.toString(true)); + Assert.assertEquals(options.get(RepairOption.REPAIR_ACCORD_KEY), Boolean.toString(false)); Assert.assertEquals(options.get(RepairOption.INCREMENTAL_KEY), Boolean.toString(false)); } diff --git a/test/unit/org/apache/cassandra/tools/nodetool/AutoRepairStatusTest.java b/test/unit/org/apache/cassandra/tools/nodetool/AutoRepairStatusTest.java new file mode 100644 index 000000000000..82293581d807 --- /dev/null +++ b/test/unit/org/apache/cassandra/tools/nodetool/AutoRepairStatusTest.java @@ -0,0 +1,108 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.tools.nodetool; + +import java.io.ByteArrayOutputStream; +import java.io.PrintStream; +import java.util.Arrays; +import java.util.Collection; + +import com.google.common.collect.ImmutableSet; +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.repair.autorepair.AutoRepairConfig; +import org.apache.cassandra.tools.NodeProbe; +import org.apache.cassandra.tools.Output; +import org.mockito.Mock; +import org.mockito.MockitoAnnotations; + +import static org.apache.cassandra.Util.setAutoRepairEnabled; +import static org.junit.Assert.assertEquals; +import static org.mockito.Mockito.when; + +/** + * Unit tests for {@link org.apache.cassandra.tools.nodetool.AutoRepairStatus} + */ +@RunWith(Parameterized.class) +public class AutoRepairStatusTest +{ + @Mock + private static NodeProbe probe; + + private ByteArrayOutputStream cmdOutput; + + private static AutoRepairStatus cmd; + + @Parameterized.Parameter() + public AutoRepairConfig.RepairType repairType; + + @Parameterized.Parameters(name = "repairType={0}") + public static Collection repairTypes() + { + return Arrays.asList(AutoRepairConfig.RepairType.values()); + } + + @Before + public void setUp() throws Exception + { + MockitoAnnotations.initMocks(this); + cmdOutput = new ByteArrayOutputStream(); + PrintStream out = new PrintStream(cmdOutput); + when(probe.output()).thenReturn(new Output(out, out)); + cmd = new AutoRepairStatus(); + DatabaseDescriptor.daemonInitialization(); + DatabaseDescriptor.loadConfig(); + setAutoRepairEnabled(true); + DatabaseDescriptor.getAutoRepairConfig().setAutoRepairEnabled(AutoRepairConfig.RepairType.FULL, true); + DatabaseDescriptor.getAutoRepairConfig().setAutoRepairEnabled(AutoRepairConfig.RepairType.INCREMENTAL, true); + } + + @Test(expected = IllegalArgumentException.class) + public void testExecuteWithoutRepairType() + { + cmd.repairType = null; + cmd.execute(probe); + } + + @Test + public void testExecuteWithNoNodes() + { + cmd.repairType = repairType.name(); + + cmd.execute(probe); + assertEquals("Active Repairs\n" + + "NONE \n", cmdOutput.toString()); + } + + @Test + public void testExecute() + { + when(probe.getAutoRepairOnGoingRepairHostIds(repairType.name())).thenReturn(ImmutableSet.of("host1", "host2", "host3", "host4")); + cmd.repairType = repairType.name(); + + cmd.execute(probe); + + assertEquals("Active Repairs \n" + + "host1,host2,host3,host4\n", cmdOutput.toString()); + } +} diff --git a/test/unit/org/apache/cassandra/tools/nodetool/ClientStatsTest.java b/test/unit/org/apache/cassandra/tools/nodetool/ClientStatsTest.java index 2189a0e963f8..ff1b16d6db9d 100644 --- a/test/unit/org/apache/cassandra/tools/nodetool/ClientStatsTest.java +++ b/test/unit/org/apache/cassandra/tools/nodetool/ClientStatsTest.java @@ -225,22 +225,22 @@ public void testClientStatsAll() /** * Example expected output: * Address SSL Cipher Protocol Version User Keyspace Requests Driver-Name Driver-Version - * /127.0.0.1:52549 true TLS_AES_256_GCM_SHA384 TLSv1.3 5 cassandra 17 DataStax Java Driver 3.11.5 - * /127.0.0.1:52550 true TLS_AES_256_GCM_SHA384 TLSv1.3 5 cassandra 3 DataStax Java Driver 3.11.5 - * /127.0.0.1:52551 true TLS_AES_256_GCM_SHA384 TLSv1.3 5 cassandra 16 DataStax Java Driver 3.11.5 - * /127.0.0.1:52552 true TLS_AES_256_GCM_SHA384 TLSv1.3 5 cassandra system 3 DataStax Java Driver 3.11.5 - * /127.0.0.1:52546 false undefined undefined 5 cassandra 17 DataStax Java Driver 3.11.5 - * /127.0.0.1:52548 false undefined undefined 5 cassandra 4 DataStax Java Driver 3.11.5 + * /127.0.0.1:52549 true TLS_AES_256_GCM_SHA384 TLSv1.3 5 cassandra 17 Apache Cassandra Java Driver 3.12.1 + * /127.0.0.1:52550 true TLS_AES_256_GCM_SHA384 TLSv1.3 5 cassandra 3 Apache Cassandra Java Driver 3.12.1 + * /127.0.0.1:52551 true TLS_AES_256_GCM_SHA384 TLSv1.3 5 cassandra 16 Apache Cassandra Java Driver 3.12.1 + * /127.0.0.1:52552 true TLS_AES_256_GCM_SHA384 TLSv1.3 5 cassandra system 3 Apache Cassandra Java Driver 3.12.1 + * /127.0.0.1:52546 false undefined undefined 5 cassandra 17 Apache Cassandra Java Driver 3.12.1 + * /127.0.0.1:52548 false undefined undefined 5 cassandra 4 Apache Cassandra Java Driver 3.12.1 */ assertThat(stdout).containsPattern("Address +SSL +Cipher +Protocol +Version +User +Keyspace +Requests +Driver-Name +Driver-Version"); // Unencrypted password-based client. - assertThat(stdout).containsPattern("/127.0.0.1:[0-9]+ false +undefined +undefined +[0-9]+ +cassandra +[0-9]+ +DataStax Java Driver 3.11.5"); + assertThat(stdout).containsPattern("/127.0.0.1:[0-9]+ false +undefined +undefined +[0-9]+ +cassandra +[0-9]+ +Apache Cassandra Java Driver 3.12.1"); // TLS-encrypted password-based client. - assertThat(stdout).containsPattern("/127.0.0.1:[0-9]+ true +TLS\\S+ +TLS\\S+ +[0-9]+ +cassandra +[0-9]+ +DataStax Java Driver 3.11.5"); + assertThat(stdout).containsPattern("/127.0.0.1:[0-9]+ true +TLS\\S+ +TLS\\S+ +[0-9]+ +cassandra +[0-9]+ +Apache Cassandra Java Driver 3.12.1"); // MTLS-based client. - assertThat(stdout).containsPattern("/127.0.0.1:[0-9]+ true +TLS\\S+ +TLS\\S+ +[0-9]+ +cassandra +[0-9]+ +DataStax Java Driver 3.11.5"); + assertThat(stdout).containsPattern("/127.0.0.1:[0-9]+ true +TLS\\S+ +TLS\\S+ +[0-9]+ +cassandra +[0-9]+ +Apache Cassandra Java Driver 3.12.1"); // MTLS-based client with 'system' keyspace set on connection. - assertThat(stdout).containsPattern("/127.0.0.1:[0-9]+ true +TLS\\S+ +TLS\\S+ +[0-9]+ +cassandra +system +[0-9]+ +DataStax Java Driver 3.11.5"); + assertThat(stdout).containsPattern("/127.0.0.1:[0-9]+ true +TLS\\S+ +TLS\\S+ +[0-9]+ +cassandra +system +[0-9]+ +Apache Cassandra Java Driver 3.12.1"); assertClientCount(stdout); } @@ -256,17 +256,17 @@ public void testClientStatsClientOptions() /* * Example expected output: * Address SSL Cipher Protocol Version User Keyspace Requests Driver-Name Driver-Version Client-Options - * /127.0.0.1:51047 true TLS_AES_256_GCM_SHA384 TLSv1.3 5 cassandra 17 DataStax Java Driver 3.11.5 DRIVER_VERSION=3.11.5, DRIVER_NAME=DataStax Java Driver, CQL_VERSION=3.0.0 - * /127.0.0.1:51048 true TLS_AES_256_GCM_SHA384 TLSv1.3 5 cassandra 3 DataStax Java Driver 3.11.5 DRIVER_VERSION=3.11.5, DRIVER_NAME=DataStax Java Driver, CQL_VERSION=3.0.0 - * /127.0.0.1:51046 false undefined undefined 5 cassandra 4 DataStax Java Driver 3.11.5 DRIVER_VERSION=3.11.5, DRIVER_NAME=DataStax Java Driver, CQL_VERSION=3.0.0 - * /127.0.0.1:51044 false undefined undefined 5 cassandra 17 DataStax Java Driver 3.11.5 DRIVER_VERSION=3.11.5, DRIVER_NAME=DataStax Java Driver, CQL_VERSION=3.0.0 - * /127.0.0.1:51049 true TLS_AES_256_GCM_SHA384 TLSv1.3 5 cassandra 16 DataStax Java Driver 3.11.5 DRIVER_VERSION=3.11.5, DRIVER_NAME=DataStax Java Driver, CQL_VERSION=3.0.0 - * /127.0.0.1:51050 true TLS_AES_256_GCM_SHA384 TLSv1.3 5 cassandra system 3 DataStax Java Driver 3.11.5 DRIVER_VERSION=3.11.5, DRIVER_NAME=DataStax Java Driver, CQL_VERSION=3.0.0 + * /127.0.0.1:51047 true TLS_AES_256_GCM_SHA384 TLSv1.3 5 cassandra 17 Apache Cassandra Java Driver 3.12.1 DRIVER_VERSION=3.12.1, DRIVER_NAME=Apache Cassandra Java Driver, CQL_VERSION=3.0.0 + * /127.0.0.1:51048 true TLS_AES_256_GCM_SHA384 TLSv1.3 5 cassandra 3 Apache Cassandra Java Driver 3.12.1 DRIVER_VERSION=3.12.1, DRIVER_NAME=Apache Cassandra Java Driver, CQL_VERSION=3.0.0 + * /127.0.0.1:51046 false undefined undefined 5 cassandra 4 Apache Cassandra Java Driver 3.12.1 DRIVER_VERSION=3.12.1, DRIVER_NAME=Apache Cassandra Java Driver, CQL_VERSION=3.0.0 + * /127.0.0.1:51044 false undefined undefined 5 cassandra 17 Apache Cassandra Java Driver 3.12.1 DRIVER_VERSION=3.12.1, DRIVER_NAME=Apache Cassandra Java Driver, CQL_VERSION=3.0.0 + * /127.0.0.1:51049 true TLS_AES_256_GCM_SHA384 TLSv1.3 5 cassandra 16 Apache Cassandra Java Driver 3.12.1 DRIVER_VERSION=3.12.1, DRIVER_NAME=Apache Cassandra Java Driver, CQL_VERSION=3.0.0 + * /127.0.0.1:51050 true TLS_AES_256_GCM_SHA384 TLSv1.3 5 cassandra system 3 Apache Cassandra Java Driver 3.12.1 DRIVER_VERSION=3.12.1, DRIVER_NAME=Apache Cassandra Java Driver, CQL_VERSION=3.0.0 */ assertThat(stdout).containsPattern("Address +SSL +Cipher +Protocol +Version +User +Keyspace +Requests +Driver-Name +Driver-Version +Client-Options"); - assertThat(stdout).containsPattern("/127.0.0.1:[0-9]+ false+ undefined +undefined +[0-9]+ +cassandra +[0-9]+ +DataStax Java Driver 3.11.5"); - assertThat(stdout).containsPattern("DRIVER_NAME=DataStax Java Driver"); - assertThat(stdout).containsPattern("DRIVER_VERSION=3.11.5"); + assertThat(stdout).containsPattern("/127.0.0.1:[0-9]+ false+ undefined +undefined +[0-9]+ +cassandra +[0-9]+ +Apache Cassandra Java Driver 3.12.1"); + assertThat(stdout).containsPattern("DRIVER_NAME=Apache Cassandra Java Driver"); + assertThat(stdout).containsPattern("DRIVER_VERSION=3.12.1"); assertThat(stdout).containsPattern("CQL_VERSION=3.0.0"); assertClientCount(stdout); @@ -282,23 +282,23 @@ public void testClientStatsClientVerbose() /* * Example expected output: * Address SSL Cipher Protocol Version User Keyspace Requests Driver-Name Driver-Version Client-Options Auth-Mode Auth-Metadata - * /127.0.0.1:57141 false undefined undefined 5 cassandra 17 DataStax Java Driver 3.11.5 DRIVER_VERSION=3.11.5, DRIVER_NAME=DataStax Java Driver, CQL_VERSION=3.0.0 Password - * /127.0.0.1:57165 true TLS_AES_256_GCM_SHA384 TLSv1.3 5 cassandra system 3 DataStax Java Driver 3.11.5 DRIVER_VERSION=3.11.5, DRIVER_NAME=DataStax Java Driver, CQL_VERSION=3.0.0 MutualTls identity=spiffe://test.cassandra.apache.org/unitTest/mtls - * /127.0.0.1:57164 true TLS_AES_256_GCM_SHA384 TLSv1.3 5 cassandra 3 DataStax Java Driver 3.11.5 DRIVER_VERSION=3.11.5, DRIVER_NAME=DataStax Java Driver, CQL_VERSION=3.0.0 Password - * /127.0.0.1:57144 true TLS_AES_256_GCM_SHA384 TLSv1.3 5 cassandra 17 DataStax Java Driver 3.11.5 DRIVER_VERSION=3.11.5, DRIVER_NAME=DataStax Java Driver, CQL_VERSION=3.0.0 Password - * /127.0.0.1:57146 true TLS_AES_256_GCM_SHA384 TLSv1.3 5 cassandra 16 DataStax Java Driver 3.11.5 DRIVER_VERSION=3.11.5, DRIVER_NAME=DataStax Java Driver, CQL_VERSION=3.0.0 MutualTls identity=spiffe://test.cassandra.apache.org/unitTest/mtls - * /127.0.0.1:57163 false undefined undefined 5 cassandra 4 DataStax Java Driver 3.11.5 DRIVER_VERSION=3.11.5, DRIVER_NAME=DataStax Java Driver, CQL_VERSION=3.0.0 Password + * /127.0.0.1:57141 false undefined undefined 5 cassandra 17 Apache Cassandra Java Driver 3.12.1 DRIVER_VERSION=3.11.5, DRIVER_NAME=Apache Cassandra Java Driver, CQL_VERSION=3.0.0 Password + * /127.0.0.1:57165 true TLS_AES_256_GCM_SHA384 TLSv1.3 5 cassandra system 3 Apache Cassandra Java Driver 3.12.1 DRIVER_VERSION=3.11.5, DRIVER_NAME=Apache Cassandra Java Driver, CQL_VERSION=3.0.0 MutualTls identity=spiffe://test.cassandra.apache.org/unitTest/mtls + * /127.0.0.1:57164 true TLS_AES_256_GCM_SHA384 TLSv1.3 5 cassandra 3 Apache Cassandra Java Driver 3.12.1 DRIVER_VERSION=3.11.5, DRIVER_NAME=Apache Cassandra Java Driver, CQL_VERSION=3.0.0 Password + * /127.0.0.1:57144 true TLS_AES_256_GCM_SHA384 TLSv1.3 5 cassandra 17 Apache Cassandra Java Driver 3.12.1 DRIVER_VERSION=3.11.5, DRIVER_NAME=Apache Cassandra Java Driver, CQL_VERSION=3.0.0 Password + * /127.0.0.1:57146 true TLS_AES_256_GCM_SHA384 TLSv1.3 5 cassandra 16 Apache Cassandra Java Driver 3.12.1 DRIVER_VERSION=3.11.5, DRIVER_NAME=Apache Cassandra Java Driver, CQL_VERSION=3.0.0 MutualTls identity=spiffe://test.cassandra.apache.org/unitTest/mtls + * /127.0.0.1:57163 false undefined undefined 5 cassandra 4 Apache Cassandra Java Driver 3.12.1 DRIVER_VERSION=3.11.5, DRIVER_NAME=Apache Cassandra Java Driver, CQL_VERSION=3.0.0 Password */ // Header assertThat(stdout).containsPattern("Address +SSL +Cipher +Protocol +Version +User +Keyspace +Requests +Driver-Name +Driver-Version +Client-Options +Auth-Mode +Auth-Metadata"); // Unencrypted password-based client. Expect 'DRIVER_VERSION' to appear before Password. - assertThat(stdout).containsPattern("/127.0.0.1:[0-9]+ false +undefined +undefined +[0-9]+ +cassandra +[0-9]+ +DataStax Java Driver 3.11.5 +.*DRIVER_VERSION.* +Password"); + assertThat(stdout).containsPattern("/127.0.0.1:[0-9]+ false +undefined +undefined +[0-9]+ +cassandra +[0-9]+ +Apache Cassandra Java Driver 3.12.1 +.*DRIVER_VERSION.* +Password"); // TLS-encrypted password-based client. - assertThat(stdout).containsPattern("/127.0.0.1:[0-9]+ true +TLS\\S+ +TLS\\S+ +[0-9]+ +cassandra +[0-9]+ +DataStax Java Driver 3.11.5 +.*DRIVER_VERSION.* +Password"); + assertThat(stdout).containsPattern("/127.0.0.1:[0-9]+ true +TLS\\S+ +TLS\\S+ +[0-9]+ +cassandra +[0-9]+ +Apache Cassandra Java Driver 3.12.1 +.*DRIVER_VERSION.* +Password"); // MTLS-based client. - assertThat(stdout).containsPattern("/127.0.0.1:[0-9]+ true +TLS\\S+ +TLS\\S+ +[0-9]+ +cassandra +[0-9]+ +DataStax Java Driver 3.11.5 +.*DRIVER_VERSION.* +MutualTls +identity=" + TlsTestUtils.CLIENT_SPIFFE_IDENTITY); + assertThat(stdout).containsPattern("/127.0.0.1:[0-9]+ true +TLS\\S+ +TLS\\S+ +[0-9]+ +cassandra +[0-9]+ +Apache Cassandra Java Driver 3.12.1 +.*DRIVER_VERSION.* +MutualTls +identity=" + TlsTestUtils.CLIENT_SPIFFE_IDENTITY); // MTLS-based client with 'system' keyspace set on connection. - assertThat(stdout).containsPattern("/127.0.0.1:[0-9]+ true +TLS\\S+ +TLS\\S+ +[0-9]+ +cassandra +system +[0-9]+ +DataStax Java Driver 3.11.5 +.*DRIVER_VERSION.* +MutualTls +identity=" + TlsTestUtils.CLIENT_SPIFFE_IDENTITY); + assertThat(stdout).containsPattern("/127.0.0.1:[0-9]+ true +TLS\\S+ +TLS\\S+ +[0-9]+ +cassandra +system +[0-9]+ +Apache Cassandra Java Driver 3.12.1 +.*DRIVER_VERSION.* +MutualTls +identity=" + TlsTestUtils.CLIENT_SPIFFE_IDENTITY); assertClientCount(stdout); } diff --git a/test/unit/org/apache/cassandra/tools/nodetool/NetStatsTest.java b/test/unit/org/apache/cassandra/tools/nodetool/NetStatsTest.java index 7bddc9b23ae7..9664c9071461 100644 --- a/test/unit/org/apache/cassandra/tools/nodetool/NetStatsTest.java +++ b/test/unit/org/apache/cassandra/tools/nodetool/NetStatsTest.java @@ -40,6 +40,7 @@ import org.apache.cassandra.tools.ToolRunner; import org.apache.cassandra.utils.FBUtilities; +import static java.util.Collections.emptyList; import static org.apache.cassandra.net.Verb.ECHO_REQ; import static org.assertj.core.api.Assertions.assertThat; @@ -111,7 +112,7 @@ public void testNetStats() @Test public void testHumanReadable() throws IOException { - List streamSummaries = Collections.singletonList(new StreamSummary(TableId.generate(), 1, 1024)); + List streamSummaries = Collections.singletonList(new StreamSummary(TableId.generate(), emptyList(), 1, 1024)); SessionInfo info = new SessionInfo(InetAddressAndPort.getLocalHost(), 1, InetAddressAndPort.getLocalHost(), diff --git a/test/unit/org/apache/cassandra/tools/nodetool/SSTableRepairedSetTest.java b/test/unit/org/apache/cassandra/tools/nodetool/SSTableRepairedSetTest.java new file mode 100644 index 000000000000..5d23d22253ad --- /dev/null +++ b/test/unit/org/apache/cassandra/tools/nodetool/SSTableRepairedSetTest.java @@ -0,0 +1,132 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.tools.nodetool; + +import java.io.OutputStream; +import java.io.PrintStream; +import java.util.ArrayList; +import java.util.Arrays; + +import org.junit.Before; +import org.junit.Test; + +import org.apache.cassandra.tools.NodeProbe; +import org.apache.cassandra.tools.Output; +import org.mockito.Mock; +import org.mockito.MockitoAnnotations; + +import static org.mockito.ArgumentMatchers.anyBoolean; +import static org.mockito.ArgumentMatchers.anyList; +import static org.mockito.ArgumentMatchers.anyString; +import static org.mockito.Mockito.never; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +/** + * Unit tests for {@link org.apache.cassandra.tools.nodetool.SSTableRepairedSetTest} + */ +public class SSTableRepairedSetTest +{ + @Mock + private NodeProbe probe; + + private SSTableRepairedSet cmd; + + @Before + public void setUp() + { + MockitoAnnotations.initMocks(this); + PrintStream noopStream = new PrintStream(new OutputStream() + { + @Override + public void write(int b) + { + } + }); + when(probe.output()).thenReturn(new Output(noopStream, noopStream)); + cmd = new SSTableRepairedSet(); + } + + @Test + public void testNoKeyspace() + { + when(probe.getNonLocalStrategyKeyspaces()).thenReturn(new ArrayList<>(Arrays.asList("ks1", "ks2"))); + when(probe.getKeyspaces()).thenReturn(new ArrayList<>(Arrays.asList("ks1", "ks2"))); + when(probe.getAutoRepairTablesForKeyspace("ks1")).thenReturn(new ArrayList<>(Arrays.asList("table1", "table2"))); + when(probe.getAutoRepairTablesForKeyspace("ks2")).thenReturn(new ArrayList<>(Arrays.asList("table3", "table4"))); + cmd.isRepaired = true; + cmd.reallySet = true; + + cmd.execute(probe); + + verify(probe, times(1)).mutateSSTableRepairedState(true, false, "ks1", Arrays.asList("table1", "table2")); + verify(probe, times(1)).mutateSSTableRepairedState(true, false, "ks2", Arrays.asList("table3", "table4")); + } + + @Test + public void testBothRepairedAndUnrepaired() + { + cmd.args = Arrays.asList("keyspace"); + cmd.isRepaired = true; + cmd.isUnrepaired = true; + cmd.execute(probe); + verify(probe, never()).mutateSSTableRepairedState(anyBoolean(), anyBoolean(), anyString(), anyList()); + } + + @Test + public void testNeitherRepairedNorUnrepaired() + { + cmd.args = Arrays.asList("keyspace"); + cmd.execute(probe); + verify(probe, never()).mutateSSTableRepairedState(anyBoolean(), anyBoolean(), anyString(), anyList()); + } + + @Test + public void testRepairedPreview() + { + cmd.args = Arrays.asList("keyspace"); + when(probe.getKeyspaces()).thenReturn(new ArrayList<>(Arrays.asList("keyspace"))); + cmd.isRepaired = true; + cmd.execute(probe); + verify(probe).mutateSSTableRepairedState(true, true, "keyspace", new ArrayList<>()); + } + + @Test + public void testUnrepairedReallySet() + { + cmd.args = Arrays.asList("keyspace"); + when(probe.getKeyspaces()).thenReturn(new ArrayList<>(Arrays.asList("keyspace"))); + cmd.isUnrepaired = true; + cmd.reallySet = true; + cmd.execute(probe); + verify(probe).mutateSSTableRepairedState(false, false, "keyspace", new ArrayList<>()); + } + + @Test + public void testExecuteWithTableNames() + { + cmd.args = Arrays.asList("keyspace", "table1", "table2"); + when(probe.getKeyspaces()).thenReturn(new ArrayList<>(Arrays.asList("keyspace"))); + cmd.isRepaired = true; + cmd.reallySet = true; + cmd.execute(probe); + verify(probe).mutateSSTableRepairedState(true, false, "keyspace", Arrays.asList("table1", "table2")); + } +} diff --git a/test/unit/org/apache/cassandra/tools/nodetool/SetAutoRepairConfigTest.java b/test/unit/org/apache/cassandra/tools/nodetool/SetAutoRepairConfigTest.java new file mode 100644 index 000000000000..4ea9516e8ef8 --- /dev/null +++ b/test/unit/org/apache/cassandra/tools/nodetool/SetAutoRepairConfigTest.java @@ -0,0 +1,318 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.tools.nodetool; + +import java.io.PrintStream; +import java.util.Arrays; +import java.util.Collection; +import java.util.function.Consumer; +import java.util.function.Function; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableSet; +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.junit.runners.Suite; + +import org.apache.cassandra.locator.InetAddressAndPort; +import org.apache.cassandra.repair.autorepair.AutoRepairConfig; +import org.apache.cassandra.tools.NodeProbe; +import org.mockito.Mock; +import org.mockito.MockitoAnnotations; + +import static org.junit.Assert.fail; +import static org.mockito.Mockito.when; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; + +/** + * Unit tests for {@link org.apache.cassandra.tools.nodetool.SetAutoRepairConfig} + */ +@RunWith(Suite.class) +@Suite.SuiteClasses({ SetAutoRepairConfigTest.NoParamTests.class, SetAutoRepairConfigTest.RepairTypeParamTests.class, + SetAutoRepairConfigTest.RepairTypeAndArgsParamsTests.class }) +public class SetAutoRepairConfigTest +{ + protected static SetAutoRepairConfig cmd; + + public static void before(NodeProbe probeMock, PrintStream outMock) + { + when(probeMock.isAutoRepairDisabled()).thenReturn(false); + cmd = new SetAutoRepairConfig(); + cmd.out = outMock; + } + + public static class NoParamTests + { + @Mock + private static NodeProbe probe; + + @Mock + private static PrintStream out; + + @Before + public void setUp() + { + MockitoAnnotations.initMocks(this); + before(probe, out); + } + + @Test + public void testHistoryDeleteHostsClearBufferInSec() + { + cmd.args = ImmutableList.of("history_clear_delete_hosts_buffer_interval", "1s"); + + cmd.execute(probe); + + verify(probe, times(1)).setAutoRepairHistoryClearDeleteHostsBufferDuration("1s"); + + // test scenario when auto repair is disabled + when(probe.isAutoRepairDisabled()).thenReturn(true); + + cmd.execute(probe); + + // test new calls are not made when auto repair is disabled + verify(probe, times(1)).setAutoRepairHistoryClearDeleteHostsBufferDuration("1s"); + } + + @Test + public void testStartScheduler() + { + cmd.args = ImmutableList.of("start_scheduler", "false"); + + cmd.execute(probe); + + verify(probe, times(0)).startAutoRepairScheduler(); + + cmd.args = ImmutableList.of("start_scheduler", "true"); + + cmd.execute(probe); + + verify(probe, times(1)).startAutoRepairScheduler(); + } + + @Test + public void testMinRepairDuration() + { + cmd.args = ImmutableList.of("min_repair_task_duration", "4s"); + + cmd.execute(probe); + + verify(probe, times(1)).setAutoRepairMinRepairTaskDuration("4s"); + } + } + + @RunWith(Parameterized.class) + public static class RepairTypeParamTests + { + @Mock + private static NodeProbe probe; + + @Mock + private static PrintStream out; + + @Parameterized.Parameter + public AutoRepairConfig.RepairType repairType; + + @Parameterized.Parameters(name = "repairType={0}") + public static Object[] data() + { + return AutoRepairConfig.RepairType.values(); + } + + private static InetAddressAndPort localEndpoint; + private static InetAddressAndPort otherEndpoint; + + @Before + public void setUp() throws Exception + { + MockitoAnnotations.initMocks(this); + before(probe, out); + localEndpoint = InetAddressAndPort.getByName("127.0.0.1:7000"); + otherEndpoint = localEndpoint.withPort(localEndpoint.getPort() + 1); + } + + @Test(expected = IllegalArgumentException.class) + public void testNoArgs() + { + cmd.repairTypeStr = repairType.name(); + cmd.execute(probe); + } + + @Test + public void testRepairSchedulingDisabled() + { + when(probe.isAutoRepairDisabled()).thenReturn(true); + cmd.repairTypeStr = repairType.name(); + cmd.args = ImmutableList.of("threads", "1"); + + cmd.execute(probe); + + verify(out, times(1)).println("Auto-repair is not enabled"); + verify(probe, times(0)).setAutoRepairThreads(repairType.name(), 1); + } + + @Test + public void testRepairTypeDisabled() + { + cmd.repairTypeStr = repairType.name(); + cmd.args = ImmutableList.of("number_of_repair_threads", "1"); + + cmd.execute(probe); + + verify(probe, times(1)).setAutoRepairThreads(repairType.name(), 1); + } + + @Test + public void testV2FlagMissing() + { + cmd.repairTypeStr = repairType.name(); + cmd.args = ImmutableList.of("threads", "1"); + + try + { + cmd.execute(probe); + + fail("expected IllegalArgumentException"); + } + catch (IllegalArgumentException e) + { + // expected + } + + verify(probe, times(0)).setAutoRepairThreads(repairType.name(), 0); + } + + @Test(expected = IllegalArgumentException.class) + public void testInvalidParamType() + { + cmd.repairTypeStr = repairType.name(); + cmd.args = ImmutableList.of("unknown_type", "1"); + + cmd.execute(probe); + } + + @Test + public void testPriorityHosts() + { + String commaSeparatedHostSet = String.join(",", localEndpoint.toString().substring(1), otherEndpoint.toString().substring(1)); + cmd.repairTypeStr = repairType.name(); + cmd.args = ImmutableList.of("priority_hosts", commaSeparatedHostSet); + + cmd.execute(probe); + + verify(probe, times(1)).setAutoRepairPriorityForHosts(repairType.name(), commaSeparatedHostSet); + } + + @Test + public void testForceRepairHosts() + { + String commaSeparatedHostSet = String.join(",", localEndpoint.toString().substring(1), otherEndpoint.toString().substring(1)); + cmd.repairTypeStr = repairType.name(); + cmd.args = ImmutableList.of("forcerepair_hosts", commaSeparatedHostSet); + + cmd.execute(probe); + + verify(probe, times(1)).setAutoRepairForceRepairForHosts(repairType.name(), commaSeparatedHostSet); + } + } + + @RunWith(Parameterized.class) + public static class RepairTypeAndArgsParamsTests + { + @Parameterized.Parameter + public AutoRepairConfig.RepairType repairType; + + @Parameterized.Parameter(1) + public String paramType; + + @Parameterized.Parameter(2) + public String paramVal; + + @Parameterized.Parameter(3) + public Consumer verifyFunc; + + @Parameterized.Parameters(name = "repairType={0},paramType={1}") + public static Collection testCases() + { + return Stream.of( + forEachRepairType("enabled", "true", (type) -> verify(probe, times(1)).setAutoRepairEnabled(type.name(), true)), + forEachRepairType("number_of_repair_threads", "1", (type) -> verify(probe, times(1)).setAutoRepairThreads(type.name(), 1)), + forEachRepairType("min_repair_interval", "3h", (type) -> verify(probe, times(1)).setAutoRepairMinInterval(type.name(), "3h")), + forEachRepairType("sstable_upper_threshold", "4", (type) -> verify(probe, times(1)).setAutoRepairSSTableCountHigherThreshold(type.name(), 4)), + forEachRepairType("table_max_repair_time", "5s", (type) -> verify(probe, times(1)).setAutoRepairTableMaxRepairTime(type.name(), "5s")), + forEachRepairType("repair_primary_token_range_only", "true", (type) -> verify(probe, times(1)).setAutoRepairPrimaryTokenRangeOnly(type.name(), true)), + forEachRepairType("parallel_repair_count", "6", (type) -> verify(probe, times(1)).setAutoRepairParallelRepairCount(type.name(), 6)), + forEachRepairType("parallel_repair_percentage", "7", (type) -> verify(probe, times(1)).setAutoRepairParallelRepairPercentage(type.name(), 7)), + forEachRepairType("materialized_view_repair_enabled", "true", (type) -> verify(probe, times(1)).setAutoRepairMaterializedViewRepairEnabled(type.name(), true)), + forEachRepairType("ignore_dcs", "dc1,dc2", (type) -> verify(probe, times(1)).setAutoRepairIgnoreDCs(type.name(), ImmutableSet.of("dc1", "dc2"))), + forEachRepairType("token_range_splitter.max_bytes_per_schedule", "500GiB", (type) -> verify(probe, times(1)).setAutoRepairTokenRangeSplitterParameter(type.name(), "max_bytes_per_schedule", "500GiB")), + forEachRepairType("repair_max_retries", "3", (type) -> verify(probe, times(1)).setAutoRepairMaxRetriesCount(type.name(), 3)), + forEachRepairType("repair_retry_backoff", "60s", (type) -> verify(probe, times(1)).setAutoRepairRetryBackoff(type.name(), "60s")) + ).flatMap(Function.identity()).collect(Collectors.toList()); + } + + private static Stream forEachRepairType(String paramType, String paramVal, Consumer verifyFunc) + { + Object[][] testCases = new Object[AutoRepairConfig.RepairType.values().length][4]; + for (AutoRepairConfig.RepairType repairType : AutoRepairConfig.RepairType.values()) + { + testCases[repairType.ordinal()] = new Object[]{ repairType, paramType, paramVal, verifyFunc }; + } + + return Arrays.stream(testCases); + } + + @Mock + private static NodeProbe probe; + + @Mock + private static PrintStream out; + + @Before + public void setUp() + { + MockitoAnnotations.initMocks(this); + before(probe, out); + } + + @Test + public void test() + { + cmd.repairTypeStr = repairType.name(); + cmd.args = ImmutableList.of(paramType, paramVal); + + cmd.execute(probe); + + verifyFunc.accept(repairType); + + // test scenario when auto repair is disabled + when(probe.isAutoRepairDisabled()).thenReturn(true); + + cmd.execute(probe); + + // test new calls are not made when auto repair is disabled + verifyFunc.accept(repairType); + } + } +} diff --git a/test/unit/org/apache/cassandra/tools/nodetool/TableHistogramsTest.java b/test/unit/org/apache/cassandra/tools/nodetool/TableHistogramsTest.java index 4a233b22faf8..b3c51d196bf6 100644 --- a/test/unit/org/apache/cassandra/tools/nodetool/TableHistogramsTest.java +++ b/test/unit/org/apache/cassandra/tools/nodetool/TableHistogramsTest.java @@ -17,19 +17,19 @@ */ package org.apache.cassandra.tools.nodetool; -import org.apache.cassandra.auth.AuthKeyspace; -import org.apache.cassandra.db.SystemKeyspace; -import org.apache.cassandra.schema.SchemaConstants; -import org.apache.cassandra.schema.SchemaKeyspace; -import org.apache.cassandra.schema.SystemDistributedKeyspace; -import org.apache.cassandra.tracing.TraceKeyspace; - import org.apache.commons.lang3.StringUtils; import org.junit.BeforeClass; import org.junit.Test; +import org.apache.cassandra.auth.AuthKeyspace; import org.apache.cassandra.cql3.CQLTester; +import org.apache.cassandra.db.SystemKeyspace; +import org.apache.cassandra.schema.SchemaConstants; +import org.apache.cassandra.schema.SchemaKeyspace; +import org.apache.cassandra.schema.SystemDistributedKeyspace; +import org.apache.cassandra.service.accord.AccordKeyspace; import org.apache.cassandra.tools.ToolRunner; +import org.apache.cassandra.tracing.TraceKeyspace; import static org.apache.cassandra.tools.ToolRunner.invokeNodetool; import static org.assertj.core.api.Assertions.assertThat; @@ -44,6 +44,7 @@ public class TableHistogramsTest extends CQLTester TraceKeyspace.TABLE_NAMES.size() + AuthKeyspace.TABLE_NAMES.size() + SystemDistributedKeyspace.TABLE_NAMES.size() + + AccordKeyspace.tables().size() + 1; // DistributedMetadataLogKeyspace contains a single table @BeforeClass diff --git a/test/unit/org/apache/cassandra/transport/EarlyAuthenticationTest.java b/test/unit/org/apache/cassandra/transport/EarlyAuthenticationTest.java index ed98140b9b5e..a581086c7645 100644 --- a/test/unit/org/apache/cassandra/transport/EarlyAuthenticationTest.java +++ b/test/unit/org/apache/cassandra/transport/EarlyAuthenticationTest.java @@ -84,23 +84,23 @@ public void initNetwork() throws IOException, TimeoutException }); } - private EncryptionOptions clientEncryptionOptions(boolean presentClientCertificate) + private EncryptionOptions.ClientEncryptionOptions clientEncryptionOptions(boolean presentClientCertificate) { - EncryptionOptions encryptionOptions = new EncryptionOptions() - .withEnabled(true) - .withRequireClientAuth(EncryptionOptions.ClientAuth.OPTIONAL) - .withTrustStore(TlsTestUtils.CLIENT_TRUSTSTORE_PATH) - .withTrustStorePassword(TlsTestUtils.CLIENT_TRUSTSTORE_PASSWORD) - .withSslContextFactory(new ParameterizedClass(SimpleClientSslContextFactory.class.getName())); + EncryptionOptions.ClientEncryptionOptions.Builder builder = new EncryptionOptions.ClientEncryptionOptions.Builder(); + builder.withEnabled(true) + .withRequireClientAuth(EncryptionOptions.ClientEncryptionOptions.ClientAuth.OPTIONAL) + .withTrustStore(TlsTestUtils.CLIENT_TRUSTSTORE_PATH) + .withTrustStorePassword(TlsTestUtils.CLIENT_TRUSTSTORE_PASSWORD) + .withSslContextFactory(new ParameterizedClass(SimpleClientSslContextFactory.class.getName())); if (presentClientCertificate) { - encryptionOptions = encryptionOptions.withKeyStore(TlsTestUtils.CLIENT_SPIFFE_KEYSTORE_PATH) - .withStoreType("JKS") - .withKeyStorePassword(TlsTestUtils.CLIENT_SPIFFE_KEYSTORE_PASSWORD); + builder.withKeyStore(TlsTestUtils.CLIENT_SPIFFE_KEYSTORE_PATH) + .withStoreType("JKS") + .withKeyStorePassword(TlsTestUtils.CLIENT_SPIFFE_KEYSTORE_PASSWORD); } - return new EncryptionOptions(encryptionOptions); + return new EncryptionOptions.ClientEncryptionOptions(builder.build()); } @Test @@ -180,6 +180,5 @@ public Consumer expectAuthenticationError(final String expecte } }; } - } diff --git a/test/unit/org/apache/cassandra/transport/MessagePayloadTest.java b/test/unit/org/apache/cassandra/transport/MessagePayloadTest.java index a50174bd8702..42c6cbd021ba 100644 --- a/test/unit/org/apache/cassandra/transport/MessagePayloadTest.java +++ b/test/unit/org/apache/cassandra/transport/MessagePayloadTest.java @@ -127,7 +127,7 @@ public void testMessagePayloadBeta() throws Throwable nativePort, ProtocolVersion.V5, true, - new EncryptionOptions()); + new EncryptionOptions.ClientEncryptionOptions()); try { client.connect(false); diff --git a/test/unit/org/apache/cassandra/transport/SerDeserTest.java b/test/unit/org/apache/cassandra/transport/SerDeserTest.java index b14e854cab65..605119f65bfe 100644 --- a/test/unit/org/apache/cassandra/transport/SerDeserTest.java +++ b/test/unit/org/apache/cassandra/transport/SerDeserTest.java @@ -50,6 +50,7 @@ import org.apache.cassandra.cql3.terms.UserTypes; import org.apache.cassandra.db.ConsistencyLevel; import org.apache.cassandra.db.marshal.AbstractType; +import org.apache.cassandra.db.marshal.ByteBufferAccessor; import org.apache.cassandra.db.marshal.Int32Type; import org.apache.cassandra.db.marshal.ListType; import org.apache.cassandra.db.marshal.LongType; @@ -94,7 +95,7 @@ public void collectionSerDeserTest() for (Integer i : l) lb.add(Int32Type.instance.decompose(i)); - assertEquals(l, lt.compose(lt.pack(lb))); + assertEquals(l, lt.compose(lt.pack(lb, ByteBufferAccessor.instance))); // Sets SetType st = SetType.getInstance(UTF8Type.instance, true); @@ -104,7 +105,7 @@ public void collectionSerDeserTest() for (String t : s) sb.add(UTF8Type.instance.decompose(t)); - assertEquals(s, st.compose(st.pack(sb))); + assertEquals(s, st.compose(st.pack(sb, ByteBufferAccessor.instance))); // Maps MapType mt = MapType.getInstance(UTF8Type.instance, LongType.instance, true); @@ -120,7 +121,7 @@ public void collectionSerDeserTest() mb.add(LongType.instance.decompose(entry.getValue())); } - assertEquals(m, mt.compose(mt.pack(mb))); + assertEquals(m, mt.compose(mt.pack(mb, ByteBufferAccessor.instance))); } @Test(expected = MarshalException.class) @@ -130,7 +131,7 @@ public void setsMayNotContainNullsTest() List sb = new ArrayList<>(1); sb.add(null); - st.compose(st.pack(sb)); + st.compose(st.pack(sb, ByteBufferAccessor.instance)); } @Test(expected = MarshalException.class) @@ -141,7 +142,7 @@ public void mapKeysMayNotContainNullsTest() mb.add(null); mb.add(LongType.instance.decompose(999L)); - mt.compose(mt.pack(mb)); + mt.compose(mt.pack(mb, ByteBufferAccessor.instance)); } @Test(expected = MarshalException.class) @@ -152,7 +153,7 @@ public void mapValueMayNotContainNullsTest() mb.add(UTF8Type.instance.decompose("danger")); mb.add(null); - mt.compose(mt.pack(mb)); + mt.compose(mt.pack(mb, ByteBufferAccessor.instance)); } @Test diff --git a/test/unit/org/apache/cassandra/transport/SimpleClientSslContextFactory.java b/test/unit/org/apache/cassandra/transport/SimpleClientSslContextFactory.java index 1a6871716a53..468c9cbc17b3 100644 --- a/test/unit/org/apache/cassandra/transport/SimpleClientSslContextFactory.java +++ b/test/unit/org/apache/cassandra/transport/SimpleClientSslContextFactory.java @@ -30,12 +30,12 @@ import org.apache.cassandra.config.EncryptionOptions; import org.apache.cassandra.security.FileBasedSslContextFactory; -import static org.apache.cassandra.config.EncryptionOptions.ClientAuth.NOT_REQUIRED; +import static org.apache.cassandra.config.EncryptionOptions.ClientEncryptionOptions.ClientAuth.NOT_REQUIRED; /** * A custom implementation of {@link FileBasedSslContextFactory} to be used by tests utilizing {@link SimpleClient}. *

    - * Provides a subtly different implementation of {@link #createNettySslContext(EncryptionOptions.ClientAuth, SocketType, CipherSuiteFilter)} + * Provides a subtly different implementation of {@link #createNettySslContext(EncryptionOptions.ClientEncryptionOptions.ClientAuth, SocketType, CipherSuiteFilter)} * that only configures an {@link SslContext} for clients and most importantly only configures a key manager if an * outbound keystore is configured, where the existing implementation always does this. This is useful for tests * that try to create a client that uses encryption but does not provide a certificate. @@ -49,7 +49,7 @@ public SimpleClientSslContextFactory(Map parameters) } @Override - public SSLContext createJSSESslContext(EncryptionOptions.ClientAuth clientAuth) throws SSLException + public SSLContext createJSSESslContext(EncryptionOptions.ClientEncryptionOptions.ClientAuth clientAuth) throws SSLException { TrustManager[] trustManagers = null; if (clientAuth != NOT_REQUIRED) @@ -76,7 +76,7 @@ public SSLContext createJSSESslContext(EncryptionOptions.ClientAuth clientAuth) } @Override - public SslContext createNettySslContext(EncryptionOptions.ClientAuth clientAuth, SocketType socketType, + public SslContext createNettySslContext(EncryptionOptions.ClientEncryptionOptions.ClientAuth clientAuth, SocketType socketType, CipherSuiteFilter cipherFilter) throws SSLException { SslContextBuilder builder = SslContextBuilder.forClient(); diff --git a/test/unit/org/apache/cassandra/transport/TlsTestUtils.java b/test/unit/org/apache/cassandra/transport/TlsTestUtils.java index 30a6054127ca..76faec85b1eb 100644 --- a/test/unit/org/apache/cassandra/transport/TlsTestUtils.java +++ b/test/unit/org/apache/cassandra/transport/TlsTestUtils.java @@ -86,17 +86,18 @@ public class TlsTestUtils public static String CLIENT_TRUSTSTORE_PATH = "test/conf/cassandra_ssl_test.truststore"; public static String CLIENT_TRUSTSTORE_PASSWORD = "cassandra"; - public static EncryptionOptions getClientEncryptionOptions() + public static EncryptionOptions.ClientEncryptionOptions getClientEncryptionOptions() { - return new EncryptionOptions(new EncryptionOptions() + return new EncryptionOptions.ClientEncryptionOptions(new EncryptionOptions.ClientEncryptionOptions.Builder() .withEnabled(true) - .withRequireClientAuth(EncryptionOptions.ClientAuth.OPTIONAL) + .withRequireClientAuth(EncryptionOptions.ClientEncryptionOptions.ClientAuth.OPTIONAL) .withOptional(true) .withKeyStore(SERVER_KEYSTORE_PATH) .withKeyStorePassword(SERVER_KEYSTORE_PASSWORD) .withTrustStore(SERVER_TRUSTSTORE_PATH) .withTrustStorePassword(SERVER_TRUSTSTORE_PASSWORD) - .withRequireEndpointVerification(false)); + .withRequireEndpointVerification(false) + .build()); } public static void configureWithMutualTlsWithPasswordFallbackAuthenticator(Config config) @@ -129,7 +130,7 @@ public static SSLOptions getSSLOptions(boolean provideClientCert) throws SSLExce { return RemoteEndpointAwareJdkSSLOptions.builder() .withSSLContext(getClientSslContextFactory(provideClientCert) - .createJSSESslContext(EncryptionOptions.ClientAuth.OPTIONAL)) + .createJSSESslContext(EncryptionOptions.ClientEncryptionOptions.ClientAuth.OPTIONAL)) .build(); } @@ -139,7 +140,7 @@ public static SSLOptions getSSLOptions(Path keystorePath, Path truststorePath) t { return RemoteEndpointAwareJdkSSLOptions.builder() .withSSLContext(getClientSslContextFactory(keystorePath, truststorePath) - .createJSSESslContext(EncryptionOptions.ClientAuth.OPTIONAL)) + .createJSSESslContext(EncryptionOptions.ClientEncryptionOptions.ClientAuth.OPTIONAL)) .build(); } catch (SSLException e) diff --git a/test/unit/org/apache/cassandra/utils/ASTGenerators.java b/test/unit/org/apache/cassandra/utils/ASTGenerators.java index cdf533cb420e..3e1a228db1ac 100644 --- a/test/unit/org/apache/cassandra/utils/ASTGenerators.java +++ b/test/unit/org/apache/cassandra/utils/ASTGenerators.java @@ -61,6 +61,7 @@ import org.apache.cassandra.db.marshal.AbstractType; import org.apache.cassandra.db.marshal.Int32Type; import org.apache.cassandra.db.marshal.IntegerType; +import org.apache.cassandra.db.marshal.ListType; import org.apache.cassandra.db.marshal.LongType; import org.apache.cassandra.db.marshal.MapType; import org.apache.cassandra.db.marshal.SetType; @@ -373,6 +374,10 @@ public enum DeleteKind { Partition, Row, Column } private BiFunction, List> ifConditionFilter = (rnd, symbols) -> symbols; private Gen deleteKindGen = SourceDSL.arbitrary().enumValues(DeleteKind.class); private Map columnExpressions = new LinkedHashMap<>(); + private boolean allowPartitionOnlyUpdate = true; + private boolean allowPartitionOnlyInsert = true; + private boolean allowUpdateMultipleClusteringKeys = true; + private EnumSet ignoreIssues = IGNORE_ISSUES; public MutationGenBuilder(TableMetadata metadata) { @@ -391,6 +396,30 @@ public MutationGenBuilder(TableMetadata metadata) columnExpressions.put(symbol, new ExpressionBuilder(symbol.type())); } + public MutationGenBuilder withIgnoreIssues(EnumSet ignoreIssues) + { + this.ignoreIssues = Objects.requireNonNull(ignoreIssues); + return this; + } + + public MutationGenBuilder withAllowPartitionOnlyUpdate(boolean value) + { + this.allowPartitionOnlyUpdate = value; + return this; + } + + public MutationGenBuilder withAllowPartitionOnlyInsert(boolean value) + { + this.allowPartitionOnlyInsert = value; + return this; + } + + public MutationGenBuilder withAllowUpdateMultipleClusteringKeys(boolean allowUpdateMultipleClusteringKeys) + { + this.allowUpdateMultipleClusteringKeys = allowUpdateMultipleClusteringKeys; + return this; + } + public MutationGenBuilder withColumnExpressions(Consumer fn) { for (Symbol symbol : allColumns) @@ -534,16 +563,45 @@ private static void values(RandomnessSource rnd, } else { - //TODO (coverage): support IN rather than just EQ for (Symbol s : columns) builder.value(s, columnExpressions.get(s).build().generate(rnd)); } } + private static void where(RandomnessSource rnd, + Map columnExpressions, + Conditional.ConditionalBuilder builder, + LinkedHashSet columns, + @Nullable Gen> gen) + { + if (gen != null) + { + Map map = gen.generate(rnd); + for (Map.Entry e : assertDeterministic(map).entrySet()) + builder.value(e.getKey(), valueGen(e.getValue(), e.getKey().type()).generate(rnd)); + return; + } + + for (Symbol s : columns) + { + if (SourceDSL.booleans().all().generate(rnd)) + { + builder.value(s, columnExpressions.get(s).build().generate(rnd)); + continue; + } + var valueGen = columnExpressions.get(s).build(); + builder.in(s, SourceDSL.lists().of(valueGen).ofSizeBetween(1, 3).generate(rnd)); + } + } + public Gen build() { Gen bool = SourceDSL.booleans().all(); Map, List> typeToReference = references.stream().collect(Collectors.groupingBy(Reference::type)); + if (allowUpdateMultipleClusteringKeys + && ignoreIssues.contains(KnownIssue.STATIC_LIST_APPEND_WITH_CLUSTERING_IN) + && staticColumns.stream().anyMatch(s -> s.type().isMultiCell() && s.type().getClass() == ListType.class)) + allowUpdateMultipleClusteringKeys = false; return rnd -> { Mutation.Kind kind = kindGen.generate(rnd); // when there are not non-primary-columns then can't support UPDATE @@ -572,6 +630,12 @@ public Gen build() if (timestamp.isPresent()) builder.timestamp(valueGen(timestamp.getAsLong(), LongType.instance).generate(rnd)); values(rnd, columnExpressions, builder, partitionColumns, partitionValueGen); + if (!staticColumns.isEmpty() && allowPartitionOnlyInsert && bool.generate(rnd)) + { + var columnsToGenerate = new LinkedHashSet<>(subset(rnd, staticColumns)); + generateRemaining(rnd, bool, Mutation.Kind.INSERT, isTransaction, typeToReference, builder, columnsToGenerate); + return builder.build(); + } values(rnd, columnExpressions, builder, clusteringColumns, clusteringValueGen); LinkedHashSet columnsToGenerate; if (regularAndStaticColumns.isEmpty()) @@ -601,6 +665,35 @@ else if (regularAndStaticColumns.size() == 1 || bool.generate(rnd)) var timestamp = timestampGen.generate(rnd); if (timestamp.isPresent()) builder.timestamp(valueGen(timestamp.getAsLong(), LongType.instance).generate(rnd)); + if (allowUpdateMultipleClusteringKeys) + where(rnd, columnExpressions, builder, partitionColumns, partitionValueGen); + else + values(rnd, columnExpressions, builder, partitionColumns, partitionValueGen); + + if (!staticColumns.isEmpty() && allowPartitionOnlyUpdate && bool.generate(rnd)) + { + var columnsToGenerate = new LinkedHashSet<>(subset(rnd, staticColumns)); + Conditional.EqBuilder setBuilder = builder::set; + generateRemaining(rnd, bool, Mutation.Kind.UPDATE, isTransaction, typeToReference, setBuilder, columnsToGenerate); + + if (isCas) + { + if (useCasIf.generate(rnd)) + { + ifGen(new ArrayList<>(staticColumns)).generate(rnd).ifPresent(c -> builder.ifCondition(c)); + } + else + { + builder.ifExists(); + } + } + return builder.build(); + } + if (allowUpdateMultipleClusteringKeys) + where(rnd, columnExpressions, builder, clusteringColumns, clusteringValueGen); + else + values(rnd, columnExpressions, builder, clusteringColumns, clusteringValueGen); + if (isCas) { if (useCasIf.generate(rnd)) @@ -612,8 +705,6 @@ else if (regularAndStaticColumns.size() == 1 || bool.generate(rnd)) builder.ifExists(); } } - values(rnd, columnExpressions, builder, partitionColumns, partitionValueGen); - values(rnd, columnExpressions, builder, clusteringColumns, clusteringValueGen); LinkedHashSet columnsToGenerate; if (regularAndStaticColumns.size() == 1 || bool.generate(rnd)) @@ -913,6 +1004,7 @@ public Gen build() .withoutCas() .withoutTimestamp() .withoutTtl() + .withAllowUpdateMultipleClusteringKeys(false) .withReferences(new ArrayList<>(builder.allowedReferences())); if (!allowReferences) mutationBuilder.withReferences(Collections.emptyList()); diff --git a/test/unit/org/apache/cassandra/utils/AbstractTypeGenerators.java b/test/unit/org/apache/cassandra/utils/AbstractTypeGenerators.java index ea9a128233dd..d44d32d0439a 100644 --- a/test/unit/org/apache/cassandra/utils/AbstractTypeGenerators.java +++ b/test/unit/org/apache/cassandra/utils/AbstractTypeGenerators.java @@ -574,10 +574,11 @@ public static Gen> safeTypeGen() .withoutTypeKinds(COUNTER) .withoutPrimitive(DecimalType.instance) // its ordering is special... - .withoutPrimitive(DurationType.instance); - // composite requires all elements fit into Short.MAX_VALUE bytes - // so try to limit the possible expansion of types - return baseline.withCompositeElementGen(new TypeGenBuilder(baseline).withDefaultSizeGen(1).withMaxDepth(1).build()) + .withoutPrimitive(DurationType.instance) + // To make sure all elements fit within Short.MAX_VALUE bytes, + // need to limit the possible expansion of types + .withDefaultSizeGen(1).withMaxDepth(1); + return baseline.withCompositeElementGen(new TypeGenBuilder(baseline).build()) .build(); } diff --git a/test/unit/org/apache/cassandra/utils/AbstractTypeGeneratorsTest.java b/test/unit/org/apache/cassandra/utils/AbstractTypeGeneratorsTest.java new file mode 100644 index 000000000000..2cd5470a6b54 --- /dev/null +++ b/test/unit/org/apache/cassandra/utils/AbstractTypeGeneratorsTest.java @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.utils; + +import java.util.ArrayList; + +import org.junit.Test; + +import accord.utils.LazyToString; +import org.apache.cassandra.db.marshal.AbstractType; +import org.assertj.core.api.Assertions; +import org.quicktheories.core.Gen; +import org.quicktheories.generators.SourceDSL; + +import static org.quicktheories.QuickTheory.qt; + +public class AbstractTypeGeneratorsTest +{ + @Test + public void withoutPrimitive() + { + Gen> primitiveGen = SourceDSL.arbitrary().pick(new ArrayList<>(AbstractTypeGenerators.primitiveTypes())); + qt().forAll(r -> r).checkAssert(rs -> { + AbstractType primitiveType = primitiveGen.generate(rs); + Gen> gen = AbstractTypeGenerators.builder().withoutPrimitive(primitiveType).build(); + for (int i = 0; i < 1000; i++) + { + AbstractType type = gen.generate(rs); + Assertions.assertThat(AbstractTypeGenerators.contains(type, primitiveType)) + .describedAs("Expected type %s not to be found in %s", primitiveType.asCQL3Type(), new LazyToString(() -> AbstractTypeGenerators.typeTree(type))) + .isFalse(); + if (type.subTypes().isEmpty()) + break; // not worth checking this type again... + } + }); + } +} \ No newline at end of file diff --git a/test/unit/org/apache/cassandra/utils/AccordGenerators.java b/test/unit/org/apache/cassandra/utils/AccordGenerators.java new file mode 100644 index 000000000000..455ed25854e4 --- /dev/null +++ b/test/unit/org/apache/cassandra/utils/AccordGenerators.java @@ -0,0 +1,767 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.utils; + +import java.lang.reflect.Array; +import java.math.BigInteger; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.EnumSet; +import java.util.HashSet; +import java.util.List; +import java.util.NavigableMap; +import java.util.Set; +import java.util.TreeSet; +import java.util.function.BiFunction; + +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSortedMap; +import com.google.common.collect.Sets; + +import accord.local.Command; +import accord.local.Command.Truncated; +import accord.local.ICommand; +import accord.local.DurableBefore; +import accord.local.Node; +import accord.local.RedundantBefore; +import accord.local.RedundantBefore.Bounds; +import accord.local.StoreParticipants; +import accord.primitives.Ballot; +import accord.primitives.Deps; +import accord.primitives.FullRoute; +import accord.primitives.KeyDeps; +import accord.primitives.PartialDeps; +import accord.primitives.PartialTxn; +import accord.primitives.Range; +import accord.primitives.RangeDeps; +import accord.primitives.Ranges; +import accord.primitives.Routable; +import accord.primitives.SaveStatus; +import accord.primitives.Seekables; +import accord.primitives.Status; +import accord.primitives.Timestamp; +import accord.primitives.Txn; +import accord.primitives.TxnId; +import accord.primitives.Writes; +import accord.topology.Shard; +import accord.topology.Topology; +import accord.utils.AccordGens; +import accord.utils.Gen; +import accord.utils.Gens; +import accord.utils.RandomSource; +import accord.utils.ReducingRangeMap; +import accord.utils.SortedArrays.SortedArrayList; +import accord.utils.TinyEnumSet; +import accord.utils.TriFunction; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.db.DecoratedKey; +import org.apache.cassandra.dht.AccordSplitter; +import org.apache.cassandra.dht.IPartitioner; +import org.apache.cassandra.dht.Token; +import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.service.accord.AccordTestUtils; +import org.apache.cassandra.service.accord.TokenRange; +import org.apache.cassandra.service.accord.api.TokenKey; +import org.apache.cassandra.service.accord.api.PartitionKey; +import org.apache.cassandra.service.accord.serializers.TableMetadatas; +import org.apache.cassandra.service.accord.txn.TxnData; +import org.apache.cassandra.service.accord.txn.TxnWrite; +import org.quicktheories.impl.JavaRandom; + +import static accord.local.CommandStores.RangesForEpoch; +import static accord.local.RedundantStatus.Property.GC_BEFORE; +import static accord.local.RedundantStatus.Property.PRE_BOOTSTRAP; +import static accord.local.RedundantStatus.SomeStatus.LOCALLY_APPLIED_ONLY; +import static accord.local.RedundantStatus.SomeStatus.LOCALLY_WITNESSED_ONLY; +import static accord.local.RedundantStatus.SomeStatus.SHARD_APPLIED_ONLY; +import static accord.local.RedundantStatus.oneSlow; +import static accord.primitives.Status.Durability.NotDurable; +import static accord.primitives.Timestamp.Flag.SHARD_BOUND; +import static accord.primitives.Txn.Kind.Write; +import static org.apache.cassandra.service.accord.AccordTestUtils.TABLE_ID1; +import static org.apache.cassandra.service.accord.AccordTestUtils.createPartialTxn; + +public class AccordGenerators +{ + private static final Gen PARTITIONER_GEN = fromQT(CassandraGenerators.nonLocalPartitioners()); + private static final Gen TABLE_ID_GEN = fromQT(CassandraGenerators.TABLE_ID_GEN); + + private AccordGenerators() + { + } + + public static boolean maybeUpdatePartitioner(Ranges ranges) + { + if (ranges.isEmpty()) return false; + for (Range range : ranges) + { + TokenRange tr = (TokenRange) range; + DatabaseDescriptor.setPartitionerUnsafe(tr.start().token().getPartitioner()); + return true; + } + return false; + } + + public static Gen partitioner() + { + return PARTITIONER_GEN.filter(IPartitioner::accordSupported); + } + + private enum SupportedCommandTypes + {notDefined, preaccepted, committed, stable} + + public static Gen commands() + { + Gen ids = AccordGens.txnIds(); + //TODO switch to Status once all types are supported + Gen supportedTypes = Gens.enums().all(SupportedCommandTypes.class); + //TODO goes against fuzz testing, and also limits to a very specific table existing... + // There is a branch that can generate random transactions, so maybe look into that? + PartialTxn txn = createPartialTxn(0); + + return rs -> { + TxnId id = ids.next(rs); + TxnId executeAt = id; + if (rs.nextBoolean()) + executeAt = ids.next(rs); + if (executeAt.compareTo(id) < 0) + { + TxnId tmp = id; + id = executeAt; + executeAt = tmp; + } + SupportedCommandTypes targetType = supportedTypes.next(rs); + switch (targetType) + { + case notDefined: + return AccordTestUtils.Commands.notDefined(id, txn); + case preaccepted: + return AccordTestUtils.Commands.preaccepted(id, txn, executeAt); + case committed: + return AccordTestUtils.Commands.committed(id, txn, executeAt); + case stable: + return AccordTestUtils.Commands.stable(id, txn, executeAt); + default: + throw new UnsupportedOperationException("Unexpected type: " + targetType); + } + }; + } + + public enum RecoveryStatus { None, Started, Complete } + + public static Gen commandsBuilder() + { + return commandsBuilder(AccordGens.txnIds(), Gens.bools().all(), Gens.enums().all(RecoveryStatus.class), (rs, txnId, txn) -> AccordGens.depsFor(txnId, txn).next(rs)); + } + + public static Gen commandsBuilder(Gen txnIdGen, Gen fastPath, Gen recover, TriFunction depsGen) + { + return rs -> { + TxnId txnId = txnIdGen.next(rs); + Txn txn = AccordTestUtils.createTxn(0, 0); + Deps deps = depsGen.apply(rs, txnId, txn); + Timestamp executeAt = fastPath.next(rs) ? txnId + : AccordGens.timestamps(AccordGens.epochs(txnId.epoch()), + AccordGens.hlcs(txnId.hlc()), + AccordGens.flags(), + RandomSource::nextInt).next(rs); + Ranges slice = AccordTestUtils.fullRange(txn); + PartialTxn partialTxn = txn.slice(slice, true); //TODO (correctness): find the case where includeQuery=false and replicate + PartialDeps partialDeps = deps.intersecting(slice); + Ballot promised; + Ballot accepted; + switch (recover.next(rs)) + { + case None: + { + promised = Ballot.ZERO; + accepted = Ballot.ZERO; + } + break; + case Started: + { + promised = AccordGens.ballot(AccordGens.epochs(executeAt.epoch()), + AccordGens.hlcs(executeAt.hlc()), + AccordGens.flags(), + RandomSource::nextInt).next(rs); + accepted = Ballot.ZERO; + } + break; + case Complete: + { + promised = accepted = AccordGens.ballot(AccordGens.epochs(executeAt.epoch()), + AccordGens.hlcs(executeAt.hlc()), + AccordGens.flags(), + RandomSource::nextInt).next(rs); + } + break; + default: + throw new UnsupportedOperationException(); + } + + Command.WaitingOn waitingOn = Command.WaitingOn.none(txnId.domain(), deps); + return new CommandBuilder(txnId, txn, executeAt, partialTxn, partialDeps, promised, accepted, waitingOn); + }; + } + + public static class CommandBuilder + { + public final TxnId txnId; + public final FullRoute route; + public final Seekables keysOrRanges; + private final Timestamp executeAt; + private final PartialTxn partialTxn; + private final PartialDeps partialDeps; + private final Ballot promised, accepted; + private final Command.WaitingOn waitingOn; + + public CommandBuilder(TxnId txnId, Txn txn, Timestamp executeAt, PartialTxn partialTxn, PartialDeps partialDeps, Ballot promised, Ballot accepted, Command.WaitingOn waitingOn) + { + this.txnId = txnId; + this.executeAt = executeAt; + this.partialTxn = partialTxn; + this.partialDeps = partialDeps; + this.promised = promised; + this.accepted = accepted; + this.waitingOn = waitingOn; + this.route = txn.keys().toRoute(txn.keys().get(0).someIntersectingRoutingKey(null)); + this.keysOrRanges = txn.keys(); + } + + private ICommand attributes(SaveStatus saveStatus) + { + ICommand.Builder builder = new ICommand.Builder(txnId); + if (saveStatus.known.isDefinitionKnown()) + builder.partialTxn(partialTxn); + if (saveStatus.known.deps().hasPreAcceptedOrProposedOrDecidedDeps()) + builder.partialDeps(partialDeps); + + builder.setParticipants(StoreParticipants.all(route)); + builder.durability(NotDurable); + if (saveStatus.compareTo(SaveStatus.PreAccepted) >= 0) + builder.executeAt(executeAt); + builder.promised(promised); + if (saveStatus.status.compareTo(Status.PreAccepted) > 0) + builder.acceptedOrCommitted(accepted); + else + builder.acceptedOrCommitted(Ballot.ZERO); + if (saveStatus.compareTo(SaveStatus.Stable) >= 0 && !saveStatus.hasBeen(Status.Truncated)) + builder.waitingOn(waitingOn); + if (saveStatus.hasBeen(Status.PreApplied) && !saveStatus.hasBeen(Status.Truncated)) + { + if (txnId.is(Write)) + builder.writes(new Writes(txnId, executeAt, keysOrRanges, new TxnWrite(TableMetadatas.none(), Collections.emptyList(), true))); + builder.result(new TxnData()); + } + return builder; + } + + public Command build(SaveStatus saveStatus) + { + ICommand command = attributes(saveStatus); + switch (saveStatus) + { + default: throw new AssertionError("Unhandled saveStatus: " + saveStatus); + case Uninitialised: + case NotDefined: + return Command.NotDefined.notDefined(command, Ballot.ZERO); + case PreAccepted: + case PreAcceptedWithVote: + case PreAcceptedWithDeps: + return Command.PreAccepted.preaccepted(command, saveStatus); + case AcceptedInvalidate: + return Command.NotAcceptedWithoutDefinition.acceptedInvalidate(command); + + case AcceptedMedium: + case AcceptedMediumWithDefinition: + case AcceptedMediumWithDefAndVote: + case AcceptedInvalidateWithDefinition: + case AcceptedSlow: + case AcceptedSlowWithDefinition: + case AcceptedSlowWithDefAndVote: + case PreCommittedWithDefinition: + case PreCommittedWithDeps: + case PreCommittedWithFixedDeps: + case PreCommittedWithDefAndDeps: + case PreCommittedWithDefAndFixedDeps: + case PreCommitted: + return Command.Accepted.accepted(command, saveStatus); + + case Committed: + return Command.Committed.committed(command, saveStatus); + + case Stable: + case ReadyToExecute: + return Command.Committed.committed(command, saveStatus); + + case PreApplied: + case Applying: + case Applied: + return Command.Executed.executed(command, saveStatus); + + case TruncatedApply: + case TruncatedUnapplied: + if (txnId.kind().awaitsOnlyDeps()) return Truncated.truncated(command, saveStatus, executeAt, null, null, null, txnId); + else return Truncated.truncated(command, saveStatus, executeAt, null, null, null, null); + + case TruncatedApplyWithOutcome: + if (txnId.kind().awaitsOnlyDeps()) return Truncated.truncated(command, saveStatus, executeAt, command.partialDeps(), txnId.is(Write) ? new Writes(txnId, executeAt, keysOrRanges, new TxnWrite(TableMetadatas.none(), Collections.emptyList(), true)) : null, new TxnData(), txnId); + else return Truncated.truncated(command, saveStatus, executeAt, command.partialDeps(), txnId.is(Write) ? new Writes(txnId, executeAt, keysOrRanges, new TxnWrite(TableMetadatas.none(), Collections.emptyList(), true)) : null, new TxnData(), null); + + case Erased: + case Vestigial: + case Invalidated: + return Truncated.invalidated(txnId, command.participants()); + } + } + } + + public static Gen keys() + { + return keys(TABLE_ID_GEN, + fromQT(CassandraGenerators.decoratedKeys())); + } + + public static Gen keys(IPartitioner partitioner) + { + return keys(TABLE_ID_GEN, + fromQT(CassandraGenerators.decoratedKeys(ignore -> partitioner))); + } + + public static Gen keys(IPartitioner partitioner, List tables) + { + //TODO (correctness): fix Gens.pick to not fail with lists of size 1 + return keys(tables.size() == 1 ? Gens.constant(tables.get(0)) : Gens.pick(tables), + fromQT(CassandraGenerators.decoratedKeys(ignore -> partitioner))); + } + + public static Gen keys(Gen tableIdGen, Gen key) + { + return rs -> new PartitionKey(tableIdGen.next(rs), key.next(rs)); + } + + public static Gen routingKeysGen(IPartitioner partitioner) + { + return routingKeyGen(TABLE_ID_GEN, + fromQT(CassandraGenerators.token(partitioner)), + partitioner); + } + + public static Gen routingKeyGen(Gen tableIdGen, Gen tokenGen, IPartitioner partitioner) + { + return routingKeyGen(tableIdGen, Gens.enums().all(RoutingKeyKind.class), tokenGen, partitioner); + } + + public enum RoutingKeyKind + { + TOKEN, SENTINEL + } + + public static Gen routingKeyGen(Gen tableIdGen, Gen kindGen, Gen tokenGen, IPartitioner partitioner) + { + return rs -> { + TableId tableId = tableIdGen.next(rs); + RoutingKeyKind kind = kindGen.next(rs); + switch (kind) + { + case TOKEN: + return new TokenKey(tableId, tokenGen.next(rs)); + case SENTINEL: + return rs.nextBoolean() ? TokenKey.min(tableId, partitioner) : TokenKey.max(tableId, partitioner); + default: + throw new AssertionError("Unknown kind: " + kind); + } + }; + } + + public static Gen allowBeforeAndAfter(Gen gen) + { + return gen.map((rs, key) -> { + if (key.isTokenSentinel()) return key; + switch (rs.nextInt(0, 3)) + { + case 0: return key; + case 1: return key.before(); + case 2: return key.after(); + default: throw new AssertionError(); + } + }); + } + + public static Gen range() + { + return partitioner().flatMap(partitioner -> range(TABLE_ID_GEN, fromQT(CassandraGenerators.token(partitioner)), partitioner)); + } + + public static Gen range(IPartitioner partitioner) + { + return range(TABLE_ID_GEN, fromQT(CassandraGenerators.token(partitioner)), partitioner); + } + + public static Gen range(IPartitioner partitioner, Gen tables) + { + return range(tables, fromQT(CassandraGenerators.token(partitioner)), partitioner); + } + + public static Gen range(Gen tables, Gen tokenGen, IPartitioner partitioner) + { + return rs -> { + Gen gen = allowBeforeAndAfter(routingKeyGen(Gens.constant(tables.next(rs)), tokenGen, partitioner)); + TokenKey a = gen.next(rs); + TokenKey b = gen.next(rs); + while (same(a, b)) + b = gen.next(rs); + return a.compareTo(b) < 0 ? TokenRange.create(a, b) : TokenRange.create(b, a); + }; + } + + private static boolean same(TokenKey a, TokenKey b) + { + if (a.equals(b)) return true; + // define +Inf == before(+Inf) as these are not actionable ranges + return a.isTableSentinel() && b.isTableSentinel() + && a.isMin() == b.isMin() + && a.isMax() == b.isMax(); + } + + public static Gen ranges() + { + // javac couldn't pick the right constructor with HashSet::new, so had to create new lambda... + return ranges(Gens.lists(TABLE_ID_GEN).unique().ofSizeBetween(1, 10), partitioner()); + } + + public static Gen ranges(Gen> tableIdGen, Gen partitionerGen) + { + Gen.IntGen splitsGen = Gens.ints().between(10, 99); + return ranges(tableIdGen, partitionerGen, splitsGen); + } + + public static Gen ranges(Gen> tableIdGen, Gen partitionerGen, Gen.IntGen splitsGen) + { + return rs -> { + List tables = tableIdGen.next(rs); + IPartitioner partitioner = partitionerGen.next(rs); + List ranges = new ArrayList<>(); + int numSplits = splitsGen.nextInt(rs); + if (numSplits == 0) return Ranges.EMPTY; + TokenRange range = TokenRange.create(TokenKey.min(TABLE_ID1, partitioner), TokenKey.max(TABLE_ID1, partitioner)); + AccordSplitter splitter = partitioner.accordSplitter().apply(Ranges.of(range)); + BigInteger size = splitter.sizeOf(range); + BigInteger update = splitter.divide(size, numSplits); + BigInteger offset = BigInteger.ZERO; + while (offset.compareTo(size) < 0) + { + BigInteger end = offset.add(update); + TokenRange r = splitter.subRange(range, offset, end); + for (TableId id : tables) + { + ranges.add(r.withTable(id)); + } + offset = end; + } + return Ranges.of(ranges.toArray(new Range[0])); + }; + } + + public static Gen ranges(IPartitioner partitioner) + { + return ranges(Gens.lists(TABLE_ID_GEN).unique().ofSizeBetween(1, 10), ignore -> partitioner); + } + + public static Gen ranges(IPartitioner partitioner, Gen.IntGen splitsGen) + { + return ranges(Gens.lists(TABLE_ID_GEN).unique().ofSizeBetween(1, 10), ignore -> partitioner, splitsGen); + } + + public static Gen ranges(TableId tableId, IPartitioner partitioner) + { + List tables = Collections.singletonList(tableId); + return ranges(i -> tables, i -> partitioner); + } + + public static Gen rangesArbitrary(IPartitioner partitioner) + { + Gen.IntGen sizeGen = Gens.ints().between(0, 10); + return rangesArbitrary(partitioner, sizeGen); + } + + public static Gen rangesArbitrary(IPartitioner partitioner, Gen.IntGen sizeGen) + { + return rangesArbitrary(partitioner, TABLE_ID_GEN, sizeGen); + } + + public static Gen rangesArbitrary(IPartitioner partitioner, Gen tables, Gen.IntGen sizeGen) + { + Gen rangeGen = range(partitioner, tables); + return rs -> { + int targetSize = sizeGen.nextInt(rs); + List ranges = new ArrayList<>(targetSize); + for (int i = 0; i < targetSize; i++) + ranges.add(rangeGen.next(rs)); + return Ranges.of(ranges.toArray(Range[]::new)); + }; + } + + public static Gen rangesSplitOrArbitrary(IPartitioner partitioner) + { + Gen split = ranges(partitioner); + Gen arbitrary = rangesArbitrary(partitioner); + return rs -> rs.nextBoolean() ? split.next(rs) : arbitrary.next(rs); + } + + public static Gen rangesSplitOrArbitrary(IPartitioner partitioner, Gen.IntGen sizeGen) + { + return rangesSplitOrArbitrary(partitioner, sizeGen, Gens.lists(TABLE_ID_GEN).unique().ofSizeBetween(1, 10)); + } + + public static Gen rangesSplitOrArbitrary(IPartitioner partitioner, Gen.IntGen sizeGen, Gen> tableIdGen) + { + Gen split = ranges(tableIdGen, i -> partitioner, sizeGen); + Gen arbitrary = rangesArbitrary(partitioner, tableIdGen.map((rs, l) -> rs.pick(l)), sizeGen); + return rs -> rs.nextBoolean() ? split.next(rs) : arbitrary.next(rs); + } + + public static Gen keyDepsGen(IPartitioner partitioner) + { + return AccordGens.keyDeps(AccordGenerators.routingKeysGen(partitioner)); + } + + public static Gen directKeyDepsGen(IPartitioner partitioner) + { + return AccordGens.directKeyDeps(AccordGenerators.routingKeysGen(partitioner)); + } + + public static Gen rangeDepsGen(IPartitioner partitioner) + { + return AccordGens.rangeDeps(AccordGenerators.range(partitioner)); + } + + public static Gen depsGen(IPartitioner partitioner) + { + return AccordGens.deps(keyDepsGen(partitioner), rangeDepsGen(partitioner)); + } + + public static Gen redundantBeforeEntry(IPartitioner partitioner) + { + return redundantBeforeEntry(Gens.bools().all(), range(partitioner), AccordGens.txnIds(Gens.pick(Txn.Kind.ExclusiveSyncPoint), ignore -> Routable.Domain.Range)); + } + + public static Gen redundantBeforeEntry(Gen emptyGen, Gen rangeGen, Gen txnIdGen) + { + return rs -> { + Range range = rangeGen.next(rs); + + List bounds = new ArrayList<>(); + if (rs.nextBoolean()) + bounds.add(Bounds.create(range, txnIdGen.next(rs), LOCALLY_WITNESSED_ONLY, null )); + if (rs.nextBoolean()) + bounds.add(Bounds.create(range, txnIdGen.next(rs), LOCALLY_APPLIED_ONLY, null )); + if (rs.nextBoolean()) + bounds.add(Bounds.create(range, txnIdGen.next(rs), SHARD_APPLIED_ONLY, null )); + if (rs.nextBoolean()) + bounds.add(Bounds.create(range, txnIdGen.next(rs).addFlag(SHARD_BOUND), oneSlow(GC_BEFORE), null )); + if (rs.nextBoolean()) + bounds.add(Bounds.create(range, txnIdGen.next(rs), oneSlow(PRE_BOOTSTRAP), null )); + if (rs.nextBoolean()) + bounds.add(new Bounds(range, Long.MIN_VALUE, Long.MAX_VALUE, new TxnId[0], new short[0], txnIdGen.next(rs))); + + Collections.shuffle(bounds); + long endEpoch = emptyGen.next(rs) ? Long.MAX_VALUE : rs.nextLong(0, Long.MAX_VALUE); + long minEpoch = Long.MAX_VALUE; + Bounds result = null; + for (Bounds b : bounds) + { + if (b.bounds.length > 0) + minEpoch = Math.min(minEpoch, b.bounds[0].epoch()); + if (result == null) result = b; + else result = Bounds.reduce(result, b); + } + + long startEpoch = rs.nextLong(Math.min(minEpoch, endEpoch)); + Bounds epochBounds = new Bounds(range, startEpoch, endEpoch, new TxnId[0], new short[0], null); + if (result == null) + return epochBounds; + return Bounds.reduce(result, epochBounds); + }; + } + + public static Gen redundantBefore(IPartitioner partitioner) + { + Gen rangeGen = rangesArbitrary(partitioner); + Gen txnIdGen = AccordGens.txnIds(Gens.pick(Txn.Kind.ExclusiveSyncPoint), ignore -> Routable.Domain.Range); + BiFunction entryGen = (rs, range) -> redundantBeforeEntry(Gens.bools().all(), i -> range, txnIdGen).next(rs); + return AccordGens.redundantBefore(rangeGen, entryGen); + } + + public static Gen durableBeforeGen(IPartitioner partitioner) + { + Gen rangeGen = rangesArbitrary(partitioner); + Gen txnIdGen = AccordGens.txnIds(Gens.pick(Txn.Kind.ExclusiveSyncPoint), ignore -> Routable.Domain.Range); + + return (rs) -> { + Ranges ranges = rangeGen.next(rs); + TxnId majority = txnIdGen.next(rs); + TxnId universal = majority; + return DurableBefore.create(ranges, majority, universal); + }; + } + + public static Gen> rejectBeforeGen(IPartitioner partitioner) + { + Gen rangeGen = rangesArbitrary(partitioner); + Gen timestampGen = AccordGens.timestamps(); + + return (rs) -> { + ReducingRangeMap initial = new ReducingRangeMap<>(); + int size = rs.nextInt(10); + for (int i = 0; i < size; i++) + initial = ReducingRangeMap.add(initial, rangeGen.next(rs), timestampGen.next(rs)); + + return initial; + }; + } + + public static Gen> safeToReadGen(IPartitioner partitioner) + { + Gen rangeGen = ranges(partitioner); + Gen timestampGen = AccordGens.timestamps(); + + return (rs) -> { + ImmutableMap.Builder initial = new ImmutableSortedMap.Builder<>(Comparator.comparing(o -> o)); + int size = rs.nextInt(10); + for (int i = 0; i < size; i++) + initial.put(timestampGen.next(rs), rangeGen.next(rs)); + + return (NavigableMap) initial.build(); + }; + } + + public static Gen rangesForEpoch(IPartitioner partitioner) + { + Gen rangesGen = ranges(partitioner); + + return rs -> { + int size = rs.nextInt(1, 5); + long[] epochs = new long[size]; + for (int i = 0; i < size; i++) + epochs[i] = rs.nextLong(1, 10_000); + Ranges[] ranges = new Ranges[size]; + for (int i = 0; i < size; i++) + ranges[i] = rangesGen.next(rs); + return new RangesForEpoch(epochs, ranges); + }; + } + + public static Gen> shardFlagsGen() + { + return rs -> { + if (rs.nextBoolean()) return Shard.NO_FLAGS; + EnumSet flags = EnumSet.noneOf(Shard.Flag.class); + for (Shard.Flag v : Shard.Flag.values()) + { + if (rs.nextBoolean()) + flags.add(v); + } + return new TinyEnumSet<>(flags.toArray(Shard.Flag[]::new)); + }; + } + + public static > Gen> sortedArrayList(Class klass, Gen.IntGen sizeGen, Gen valueGen) + { + return rs -> { + int size = sizeGen.nextInt(rs); + if (size == 0) return SortedArrayList.ofSorted(); + return SortedArrayList.copyUnsorted(Gens.lists(valueGen).unique().ofSize(size).next(rs), s -> (T[]) Array.newInstance(klass, s)); + }; + } + + private static Gen> select(List list, int size) + { + // This is better in Gens, but didn't want to alter Accord in this patch... + if (size < 0 || size > list.size()) + throw new IllegalArgumentException("Unexpected size: " + size + ", list size is " + list.size()); + if (size == 0) return i -> List.of(); + if (size == list.size()) return i -> list; + return rs -> { + List toSelect = new ArrayList<>(list); + List selected = new ArrayList<>(size); + for (int i = 0; i < size; i++) + { + int idx = rs.nextInt(0, toSelect.size()); + selected.add(toSelect.remove(idx)); + } + return selected; + }; + } + + private static Gen shardGen(Range range) + { + Gen> nodesGen = sortedArrayList(Node.Id.class, Gens.ints().between(1, 10), AccordGens.nodes()); + Gen> shardFlagsGen = shardFlagsGen(); + return rs -> { + SortedArrayList nodes = nodesGen.next(rs); + int maxFailures = Shard.maxToleratedFailures(nodes.size()); + int slowQuorumSize = Shard.slowQuorumSize(nodes.size()); + Set fastPathElectorate = new TreeSet<>(select(nodes, nodes.size() == slowQuorumSize ? slowQuorumSize : rs.nextInt(slowQuorumSize, nodes.size())).next(rs)); + List nonFastPath = new ArrayList<>(Sets.difference(new HashSet<>(nodes), fastPathElectorate)); + nonFastPath.sort(Comparator.naturalOrder()); + Set joining = new TreeSet<>(select(nonFastPath, nonFastPath.size() == 0 ? 0 : rs.nextInt(0, nonFastPath.size())).next(rs)); + return Shard.create(range, nodes, fastPathElectorate, joining, shardFlagsGen.next(rs)); + }; + } + + public static Gen topologyGen(IPartitioner partitioner) + { + return topologyGen(AccordGens.epochs(), partitioner); + } + + public static Gen topologyGen(Gen.LongGen epochGen, IPartitioner partitioner) + { + return topologyGen(epochGen, ranges(partitioner)); + } + + public static Gen topologyGen(Gen rangesGen) + { + return topologyGen(AccordGens.epochs(), rangesGen); + } + + public static Gen topologyGen(Gen.LongGen epochGen, Gen rangesGen) + { + return rs -> { + long epoch = epochGen.nextLong(rs); + Ranges ranges = rangesGen.next(rs); + if (ranges.isEmpty()) return new Topology(epoch, new Shard[0]); + + List shards = new ArrayList<>(ranges.size()); + for (Range range : ranges) + shards.add(shardGen(range).next(rs)); + //TODO (coverage): staleNodes + return new Topology(epoch, shards.toArray(Shard[]::new)); + }; + } + + public static Gen fromQT(org.quicktheories.core.Gen qt) + { + return rs -> { + JavaRandom r = new JavaRandom(rs.asJdkRandom()); + return qt.generate(r); + }; + } +} diff --git a/test/unit/org/apache/cassandra/utils/AssertionUtils.java b/test/unit/org/apache/cassandra/utils/AssertionUtils.java index d5b1981fc142..2ec07f756553 100644 --- a/test/unit/org/apache/cassandra/utils/AssertionUtils.java +++ b/test/unit/org/apache/cassandra/utils/AssertionUtils.java @@ -18,9 +18,16 @@ package org.apache.cassandra.utils; +import java.util.stream.Stream; + import com.google.common.base.Throwables; +import org.assertj.core.api.AbstractThrowableAssert; +import org.assertj.core.api.Assertions; import org.assertj.core.api.Condition; +import org.assertj.core.api.ThrowableAssert; +import org.assertj.core.error.BasicErrorMessageFactory; +import org.assertj.core.internal.Failures; public class AssertionUtils { @@ -28,6 +35,16 @@ private AssertionUtils() { } + public static Condition anyOf(Stream> stream) { + Iterable> it = () -> stream.iterator(); + return Assertions.anyOf(it); + } + + public static Condition anyOfThrowable(Class... klasses) + { + return anyOf(Stream.of(klasses).map(AssertionUtils::isThrowable)); + } + /** * When working with jvm-dtest the thrown error is in a different {@link ClassLoader} causing type checks * to fail; this method relies on naming instead. @@ -100,6 +117,11 @@ public String toString() }; } + public static Condition isThrowableInstanceof(Class klass) + { + return (Condition) (Condition) isInstanceof(klass); + } + public static Condition rootCause(Condition other) { return new Condition() { @@ -119,6 +141,59 @@ public String toString() public static Condition rootCauseIs(Class klass) { - return rootCause((Condition) (Condition) is(klass)); + return rootCause(isThrowable(klass)); + } + + public static Condition hasCause(Class klass) + { + return hasCause(isThrowable(klass)); + } + + public static Condition hasCauseAnyOf(Class... matchers) + { + return hasCause(anyOfThrowable(matchers)); + } + + public static Condition hasCause(Condition matcher) + { + return new Condition() { + @Override + public boolean matches(Throwable value) + { + for (Throwable cause = value; cause != null; cause = cause.getCause()) + { + if (matcher.matches(cause)) + return true; + } + return false; + } + }; + } + + public static ThrowableAssertPlus assertThatThrownBy(ThrowableAssert.ThrowingCallable fn) + { + return new ThrowableAssertPlus(Assertions.catchThrowable(fn)).hasBeenThrown(); + } + + public static class ThrowableAssertPlus extends AbstractThrowableAssert + { + public ThrowableAssertPlus(Throwable actual) + { + super(actual, ThrowableAssertPlus.class); + } + + @Override + protected ThrowableAssertPlus hasBeenThrown() + { + return super.hasBeenThrown(); + } + + public ThrowableAssertPlus hasRootCause() + { + Throwable cause = Throwables.getRootCause(actual); + if (cause == actual) + throw Failures.instance().failure(this.info, new BasicErrorMessageFactory("%nExpected a root cause but cause was null", new Object[0])); + return new ThrowableAssertPlus(cause); + } } } diff --git a/test/unit/org/apache/cassandra/utils/BloomFilterTest.java b/test/unit/org/apache/cassandra/utils/BloomFilterTest.java index 7f08d6ccf29a..51e6c9fb4c71 100644 --- a/test/unit/org/apache/cassandra/utils/BloomFilterTest.java +++ b/test/unit/org/apache/cassandra/utils/BloomFilterTest.java @@ -233,7 +233,7 @@ public void testHugeBFSerialization() throws IOException @Test public void testMurmur3FilterHash() { - IPartitioner partitioner = new Murmur3Partitioner(); + IPartitioner partitioner = Murmur3Partitioner.instance; Iterator gen = new KeyGenerator.RandomStringGenerator(new Random().nextInt(), FilterTestHelper.ELEMENTS); long[] expected = new long[2]; long[] actual = new long[2]; diff --git a/test/unit/org/apache/cassandra/utils/CassandraGenerators.java b/test/unit/org/apache/cassandra/utils/CassandraGenerators.java index 8867d6141e1e..08b06c073071 100644 --- a/test/unit/org/apache/cassandra/utils/CassandraGenerators.java +++ b/test/unit/org/apache/cassandra/utils/CassandraGenerators.java @@ -38,6 +38,7 @@ import java.util.Set; import java.util.TreeMap; import java.util.UUID; +import java.util.concurrent.TimeUnit; import java.util.function.Consumer; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -49,14 +50,27 @@ import com.google.common.collect.ImmutableMap; import com.google.common.collect.ImmutableSet; import com.google.common.collect.Sets; + +import org.apache.cassandra.db.compaction.LeveledManifest; +import org.apache.cassandra.schema.*; +import org.apache.cassandra.service.consensus.migration.ConsensusMigrationState; +import org.apache.cassandra.tcm.extensions.ExtensionKey; +import org.apache.cassandra.tcm.extensions.ExtensionValue; +import org.apache.cassandra.tcm.membership.Directory; +import org.apache.cassandra.tcm.ownership.DataPlacements; +import org.apache.cassandra.tcm.ownership.TokenMap; +import org.apache.cassandra.tcm.sequences.InProgressSequences; +import org.apache.cassandra.tcm.sequences.LockedRanges; import org.apache.commons.lang3.builder.MultilineRecursiveToStringStyle; import org.apache.commons.lang3.builder.ReflectionToStringBuilder; +import accord.local.Node; import org.apache.cassandra.config.DataStorageSpec; import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.cql3.ColumnIdentifier; import org.apache.cassandra.cql3.Duration; import org.apache.cassandra.cql3.FieldIdentifier; +import org.apache.cassandra.db.Clustering; import org.apache.cassandra.db.DecoratedKey; import org.apache.cassandra.db.ReadCommand; import org.apache.cassandra.db.SchemaCQLHelper; @@ -73,7 +87,6 @@ import org.apache.cassandra.db.marshal.CompositeType; import org.apache.cassandra.db.marshal.CounterColumnType; import org.apache.cassandra.db.marshal.EmptyType; -import org.apache.cassandra.db.marshal.Int32Type; import org.apache.cassandra.db.rows.Cell; import org.apache.cassandra.db.marshal.UserType; import org.apache.cassandra.dht.ByteOrderedPartitioner; @@ -104,21 +117,13 @@ import org.apache.cassandra.net.NoPayload; import org.apache.cassandra.net.PingRequest; import org.apache.cassandra.net.Verb; -import org.apache.cassandra.schema.CachingParams; -import org.apache.cassandra.schema.ColumnMetadata; -import org.apache.cassandra.schema.CompactionParams; -import org.apache.cassandra.schema.CompressionParams; -import org.apache.cassandra.schema.KeyspaceMetadata; -import org.apache.cassandra.schema.KeyspaceParams; -import org.apache.cassandra.schema.MemtableParams; -import org.apache.cassandra.schema.ReplicationParams; -import org.apache.cassandra.schema.TableId; -import org.apache.cassandra.schema.TableMetadata; -import org.apache.cassandra.schema.TableParams; -import org.apache.cassandra.schema.Tables; -import org.apache.cassandra.schema.Types; -import org.apache.cassandra.schema.UserFunctions; -import org.apache.cassandra.schema.Views; +import org.apache.cassandra.service.accord.fastpath.FastPathStrategy; +import org.apache.cassandra.service.accord.AccordFastPath; +import org.apache.cassandra.service.accord.AccordStaleReplicas; +import org.apache.cassandra.service.accord.fastpath.InheritKeyspaceFastPathStrategy; +import org.apache.cassandra.service.accord.fastpath.ParameterizedFastPathStrategy; +import org.apache.cassandra.service.accord.fastpath.SimpleFastPathStrategy; +import org.apache.cassandra.service.consensus.TransactionalMode; import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.tcm.Epoch; import org.apache.cassandra.utils.AbstractTypeGenerators.TypeGenBuilder; @@ -136,6 +141,7 @@ import static org.apache.cassandra.utils.Generators.SMALL_TIME_SPAN_NANOS; import static org.apache.cassandra.utils.Generators.TIMESTAMP_NANOS; import static org.apache.cassandra.utils.Generators.TINY_TIME_SPAN_NANOS; +import static org.apache.cassandra.utils.Generators.directAndHeapBytes; public final class CassandraGenerators { @@ -156,7 +162,7 @@ public final class CassandraGenerators return InetAddressAndPort.getByAddressOverrideDefaults(address, NETWORK_PORT_GEN.generate(rnd)); }; - public static final Gen TABLE_ID_GEN = Generators.UUID_RANDOM_GEN.map(TableId::fromUUID); + public static final Gen TABLE_ID_GEN = Generate.booleans().flatMap(uuid -> uuid ? Generators.UUID_RANDOM_GEN.map(TableId::fromUUID) : Generate.longRange(Long.MIN_VALUE, Long.MAX_VALUE).map(TableId::fromLong)); private static final Gen TABLE_KIND_GEN = SourceDSL.arbitrary().pick(TableMetadata.Kind.REGULAR, TableMetadata.Kind.INDEX, TableMetadata.Kind.VIRTUAL); public static final Gen TABLE_METADATA_GEN = gen(rnd -> createTableMetadata(IDENTIFIER_GEN.generate(rnd), rnd)).describedAs(CassandraGenerators::toStringRecursive); @@ -204,6 +210,17 @@ private static Gen> responseGen(Verb verb) cast(READ_REPAIR_RSP_GEN)) .describedAs(CassandraGenerators::toStringRecursive); + private static final Constraint CLUSTERING_OPTIONS = Constraint.between(0, 2); + public static final Gen> CLUSTERING_GEN = rnd -> { + switch ((int) rnd.next(CLUSTERING_OPTIONS)) + { + case 0: return Clustering.EMPTY; + case 1: return Clustering.STATIC_CLUSTERING; + case 2: return Clustering.make(Generators.array(ByteBuffer.class, directAndHeapBytes(0, 10), SourceDSL.integers().between(1, 3)).generate(rnd)); + default: throw new AssertionError(); + } + }; + private CassandraGenerators() { @@ -481,7 +498,7 @@ public Gen build() AbstractReplicationStrategy replication = replicationGen.generate(rs).withKeyspace(nameGen).build().generate(rs); ReplicationParams replicationParams = ReplicationParams.fromStrategy(replication); boolean durableWrites = durableWritesGen.generate(rs); - KeyspaceParams params = new KeyspaceParams(durableWrites, replicationParams); + KeyspaceParams params = new KeyspaceParams(durableWrites, replicationParams, FastPathStrategy.simple()); Tables tables = Tables.none(); Views views = Views.none(); Types types = Types.none(); @@ -548,11 +565,39 @@ public static class CompactionParamsBuilder Map options = new HashMap<>(); if (nextBoolean(rnd)) options.putAll(sizeTieredOptions.generate(rnd)); + int maxSSTableSizeInMB = LeveledCompactionStrategy.DEFAULT_MAX_SSTABLE_SIZE_MIB; if (nextBoolean(rnd)) + { // size in mb - options.put(LeveledCompactionStrategy.SSTABLE_SIZE_OPTION, SourceDSL.integers().between(1, 2_000).generate(rnd).toString()); + maxSSTableSizeInMB = SourceDSL.integers().between(1, 2_000).generate(rnd); + options.put(LeveledCompactionStrategy.SSTABLE_SIZE_OPTION, Integer.toString(maxSSTableSizeInMB)); + } if (nextBoolean(rnd)) - options.put(LeveledCompactionStrategy.LEVEL_FANOUT_SIZE_OPTION, SourceDSL.integers().between(1, 100).generate(rnd).toString()); + { + // there is a relationship between sstable size and fanout, so respect it + // see CASSANDRA-20570: Leveled Compaction doesn't validate maxBytesForLevel when the table is altered/created + long maxSSTableSizeInBytes = maxSSTableSizeInMB * 1024L * 1024L; + Gen gen = SourceDSL.integers().between(1, 100); + Integer value = gen.generate(rnd); + while (true) + { + try + { + // see org.apache.cassandra.db.compaction.LeveledGenerations.MAX_LEVEL_COUNT for why 8 is hard coded here + LeveledManifest.maxBytesForLevel(8, value, maxSSTableSizeInBytes); + break; // value is good, keep it + } + catch (RuntimeException e) + { + // this value is too large... lets shrink it + if (value.intValue() == 1) + throw new AssertionError("There is no possible fanout size that works with maxSSTableSizeInMB=" + maxSSTableSizeInMB); + gen = SourceDSL.integers().between(1, value - 1); + value = gen.generate(rnd); + } + } + options.put(LeveledCompactionStrategy.LEVEL_FANOUT_SIZE_OPTION, value.toString()); + } if (nextBoolean(rnd)) options.put(LeveledCompactionStrategy.SINGLE_SSTABLE_UPLEVEL_OPTION, nextBoolean(rnd).toString()); return options; @@ -746,6 +791,10 @@ public static class TableParamsBuilder private Gen compactionParamsGen = null; @Nullable private Gen compressionParamsGen = null; + @Nullable + private Gen transactionalMode = null; + @Nullable + private Gen fastPathStrategy = null; public TableParamsBuilder withKnownMemtables() { @@ -774,6 +823,81 @@ public TableParamsBuilder withCompression() return this; } + public TableParamsBuilder withTransactionalMode(Gen transactionalMode) + { + this.transactionalMode = transactionalMode; + return this; + } + + public TableParamsBuilder withTransactionalMode() + { + return withTransactionalMode(SourceDSL.arbitrary().enumValues(TransactionalMode.class)); + } + + public TableParamsBuilder withTransactionalMode(TransactionalMode transactionalMode) + { + return withTransactionalMode(SourceDSL.arbitrary().constant(transactionalMode)); + } + + public TableParamsBuilder withFastPathStrategy() + { + fastPathStrategy = rnd -> { + FastPathStrategy.Kind kind = SourceDSL.arbitrary().enumValues(FastPathStrategy.Kind.class).generate(rnd); + switch (kind) + { + case SIMPLE: + return SimpleFastPathStrategy.instance; + case INHERIT_KEYSPACE: + return InheritKeyspaceFastPathStrategy.instance; + case PARAMETERIZED: + { + Map map = new HashMap<>(); + int size = SourceDSL.integers().between(1, Integer.MAX_VALUE).generate(rnd); + map.put(ParameterizedFastPathStrategy.SIZE, Integer.toString(size)); + Set names = new HashSet<>(); + Gen nameGen = SourceDSL.strings().allPossible().ofLengthBetween(1, 10) + // If : is in the name then the parser will fail; we have validation to disalow this + .map(s -> s.replace(":", "_")) + // Names are used for DCs and those are seperated by , + .map(s -> s.replace(",", "_")) + .assuming(s -> !s.trim().isEmpty()); + int numNames = SourceDSL.integers().between(1, 10).generate(rnd); + for (int i = 0; i < numNames; i++) + { + while (!names.add(nameGen.generate(rnd))) + { + } + } + List sortedNames = new ArrayList<>(names); + sortedNames.sort(Comparator.naturalOrder()); + List dcs = new ArrayList<>(names.size()); + boolean auto = SourceDSL.booleans().all().generate(rnd); + if (auto) + { + dcs.addAll(sortedNames); + } + else + { + for (String name : sortedNames) + { + int weight = SourceDSL.integers().between(0, 10).generate(rnd); + dcs.add(name + ":" + weight); + } + } + // str: dcFormat(,dcFormat)* + // dcFormat: name | weight + // weight: int: >= 0 + // note: can't mix auto and user defined weight; need one or the other. Names must be unique + map.put(ParameterizedFastPathStrategy.DCS, String.join(",", dcs)); + return ParameterizedFastPathStrategy.fromMap(map); + } + default: + throw new UnsupportedOperationException(kind.name()); + } + }; + return this; + } + public Gen build() { return rnd -> { @@ -786,6 +910,10 @@ public Gen build() params.compaction(compactionParamsGen.generate(rnd)); if (compressionParamsGen != null) params.compression(compressionParamsGen.generate(rnd)); + if (transactionalMode != null) + params.transactionalMode(transactionalMode.generate(rnd)); + if (fastPathStrategy != null) + params.fastPath(fastPathStrategy.generate(rnd)); return params.build(); }; } @@ -862,6 +990,18 @@ public TableMetadataBuilder withUseCounter(Gen useCounter) return this; } + public TableMetadataBuilder withTransactionalMode(Gen transactionalMode) + { + paramsBuilder.withTransactionalMode(transactionalMode); + return this; + } + + public TableMetadataBuilder withTransactionalMode(TransactionalMode transactionalMode) + { + paramsBuilder.withTransactionalMode(transactionalMode); + return this; + } + public TableMetadataBuilder withKnownMemtables() { paramsBuilder.withKnownMemtables(); @@ -1079,10 +1219,16 @@ public TableMetadata build(RandomnessSource rnd) } } + public static Gen columnMetadataGen() + { + return columnMetadataGen(SourceDSL.arbitrary().enumValues(ColumnMetadata.Kind.class), AbstractTypeGenerators.typeGen()); + } + public static Gen columnMetadataGen(Gen kindGen, Gen> typeGen) { Gen ksNameGen = CassandraGenerators.KEYSPACE_NAME_GEN; Gen tableNameGen = IDENTIFIER_GEN; + return rs -> { String ks = ksNameGen.generate(rs); String table = tableNameGen.generate(rs); @@ -1109,7 +1255,7 @@ private static ColumnMetadata createColumnDefinition(String ks, String table, // empty type is also not supported, so filter out case PARTITION_KEY: case CLUSTERING: - typeGen = Generators.filter(typeGen, t -> t != EmptyType.instance).map(AbstractType::freeze); + typeGen = Generators.filter(typeGen, t -> t != EmptyType.instance && t != CounterColumnType.instance).map(AbstractType::freeze); break; } if (kind == ColumnMetadata.Kind.CLUSTERING) @@ -1122,7 +1268,7 @@ private static ColumnMetadata createColumnDefinition(String ks, String table, ColumnIdentifier name = new ColumnIdentifier(str, true); int position = !kind.isPrimaryKeyKind() ? -1 : kindOffset; AbstractType type = typeGen.generate(rnd); - return new ColumnMetadata(ks, table, name, type, position, kind, null); + return new ColumnMetadata(ks, table, name, type, ColumnMetadata.NO_UNIQUE_ID, position, kind, null); } public static Gen partitionKeyDataGen(TableMetadata metadata) @@ -1391,24 +1537,6 @@ public static Gen nonLocalPartitioners() .flatMap(SupportedPartitioners::partitioner); } - /** - * For {@link LocalPartitioner} it can have a very complex type which can lead to generating data larger than - * allowed in a primary key. If a test needs to filter out those cases, can just - * {@code .map(CassandraGenerators::simplify)} to resolve. - */ - public static IPartitioner simplify(IPartitioner partitioner) - { - // serializers require tokens to fit within 1 << 16, but that makes the test flakey when LocalPartitioner with a nested type is found... - if (!(partitioner instanceof LocalPartitioner)) return partitioner; - if (!shouldSimplify(partitioner.getTokenValidator())) return partitioner; - return new LocalPartitioner(Int32Type.instance); - } - - private static boolean shouldSimplify(AbstractType type) - { - return AbstractTypeGenerators.contains(type, t -> t.isCollection()); - } - public static Gen token() { return partitioners().flatMap(CassandraGenerators::token); @@ -1732,4 +1860,62 @@ public static Gen epochs() return Epoch.create(SourceDSL.longs().between(2, Long.MAX_VALUE).generate(rnd)); }; } + + public static Gen accordNodeId() + { + return SourceDSL.integers().between(0, Integer.MAX_VALUE).map(Node.Id::new); + } + + public static Gen accordStaleReplicas() + { + Gen> staleIdsGen = Generators.set(accordNodeId(), SourceDSL.integers().between(0, 10)); + Gen epochGen = epochs(); + return rnd -> new AccordStaleReplicas(staleIdsGen.generate(rnd), epochGen.generate(rnd)); + } + + public static Gen accordFastPath() + { + Gen> nodesGen = Generators.uniqueList(accordNodeId(), SourceDSL.integers().between(0, 10)); + Gen statusGen = SourceDSL.arbitrary().enumValues(AccordFastPath.Status.class); + Gen updateTimeMillis = TIMESTAMP_NANOS.map(TimeUnit.NANOSECONDS::toMillis); + Gen updateDelayMillis = SourceDSL.longs().between(0, TimeUnit.HOURS.toMillis(2)); + return rnd -> { + AccordFastPath accum = AccordFastPath.EMPTY; + for (Node.Id node : nodesGen.generate(rnd)) + { + AccordFastPath.Status status = statusGen.generate(rnd); + // can't add a NORMAL node that doesn't exist, it must be ab-NORMAL first... + if (status == AccordFastPath.Status.NORMAL) + accum = accum.withNodeStatusSince(node, AccordFastPath.Status.UNAVAILABLE, 0, 0); + accum = accum.withNodeStatusSince(node, status, updateTimeMillis.generate(rnd), updateDelayMillis.generate(rnd)); + } + return accum; + }; + } + + public static class ClusterMetadataBuilder + { + private Gen epochGen = epochs(); + private Gen partitionerGen = nonLocalPartitioners(); + private Gen accordStaleReplicasGen = accordStaleReplicas(); + private Gen accordFastPathGen = accordFastPath(); + public Gen build() + { + return rnd -> { + Epoch epoch = epochGen.generate(rnd); + IPartitioner partitioner = partitionerGen.generate(rnd); + Directory directory = Directory.EMPTY; + DistributedSchema schema = DistributedSchema.first(directory.knownDatacenters()); + TokenMap tokenMap = new TokenMap(partitioner); + DataPlacements placements = DataPlacements.EMPTY; + AccordFastPath accordFastPath = accordFastPathGen.generate(rnd); + LockedRanges lockedRanges = LockedRanges.EMPTY; + InProgressSequences inProgressSequences = InProgressSequences.EMPTY; + ConsensusMigrationState consensusMigrationState = ConsensusMigrationState.EMPTY; + Map, ExtensionValue> extensions = ImmutableMap.of(); + AccordStaleReplicas accordStaleReplicas = accordStaleReplicasGen.generate(rnd); + return new ClusterMetadata(epoch, partitioner, schema, directory, tokenMap, placements, accordFastPath, lockedRanges, inProgressSequences, consensusMigrationState, extensions, accordStaleReplicas); + }; + } + } } diff --git a/test/unit/org/apache/cassandra/utils/CassandraGeneratorsTest.java b/test/unit/org/apache/cassandra/utils/CassandraGeneratorsTest.java new file mode 100644 index 000000000000..77043d6338ff --- /dev/null +++ b/test/unit/org/apache/cassandra/utils/CassandraGeneratorsTest.java @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.utils; + +import java.util.Arrays; +import java.util.List; + +import org.assertj.core.api.Assertions; +import org.junit.Test; + +import accord.utils.Gens; +import accord.utils.LazyToString; +import org.apache.cassandra.db.marshal.AbstractType; +import org.apache.cassandra.db.marshal.CounterColumnType; +import org.apache.cassandra.db.marshal.DecimalType; +import org.apache.cassandra.db.marshal.DurationType; +import org.apache.cassandra.db.marshal.EmptyType; +import org.apache.cassandra.schema.ColumnMetadata; +import org.apache.cassandra.utils.CassandraGenerators.TableMetadataBuilder; + +import static accord.utils.Property.qt; +import static org.apache.cassandra.utils.Generators.toGen; + +public class CassandraGeneratorsTest +{ + private static final List> NOT_ALLOWED_IN_PRIMARY_KEY = Arrays.asList(EmptyType.instance, + DurationType.instance, + DecimalType.instance, + CounterColumnType.instance); + + @Test + public void partitionerToToken() + { + qt().forAll(Gens.random(), toGen(CassandraGenerators.partitioners())) + .check((rs, p) -> Assertions.assertThat(toGen(CassandraGenerators.token(p)).next(rs)).isNotNull()); + } + + @Test + public void partitionerKeys() + { + qt().forAll(Gens.random(), toGen(CassandraGenerators.partitioners())) + .check((rs, p) -> Assertions.assertThat(toGen(CassandraGenerators.decoratedKeys(i -> p)).next(rs)).isNotNull()); + } + + @Test + public void primaryKeysNoUnsafeTypes() + { + qt().forAll(toGen(new TableMetadataBuilder().build())).check(table -> { + for (ColumnMetadata pk : table.primaryKeyColumns()) + { + for (AbstractType t : NOT_ALLOWED_IN_PRIMARY_KEY) + { + Assertions.assertThat(AbstractTypeGenerators.contains(pk.type, t)) + .describedAs("Expected type %s not to be found in %s", t.asCQL3Type(), new LazyToString(() -> AbstractTypeGenerators.typeTree(pk.type))) + .isFalse(); + } + } + }); + } +} diff --git a/test/unit/org/apache/cassandra/utils/CollectionSerializersTest.java b/test/unit/org/apache/cassandra/utils/CollectionSerializersTest.java new file mode 100644 index 000000000000..c0c42c5c9ebd --- /dev/null +++ b/test/unit/org/apache/cassandra/utils/CollectionSerializersTest.java @@ -0,0 +1,123 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.utils; + +import java.io.IOException; + +import org.junit.Test; + +import accord.utils.Gens; +import org.apache.cassandra.db.TypeSizes; +import org.apache.cassandra.io.IVersionedSerializer; +import org.apache.cassandra.io.UnversionedSerializer; +import org.apache.cassandra.io.VersionedSerializer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputBuffer; +import org.apache.cassandra.io.util.DataOutputPlus; + +import static accord.utils.Property.qt; +import static org.apache.cassandra.io.Serializers.testSerde; +import static org.apache.cassandra.utils.CollectionSerializers.newListSerializer; + +public class CollectionSerializersTest +{ + @Test + public void serde() + { + // This test is testing the collection serializer and not the serializer it uses for the element. There are + // special things that must be accounted for in the test + // 1) number of elements can hit the different byte counts (vint can be 1-4 bytes) + // 2) element serializer needs to be fast. So this test avoids random values for the element + DataOutputBuffer output = new DataOutputBuffer(); + Integer cached = 42; + // 0 takes 1 bytes + // 128 takes 2 bytes + // 16384 takes 3 bytes + // 2097152 takes 4 bytes + qt().forAll(Gens.lists(i -> cached).ofSizeBetween(0, 2_097_152)).check(list -> { + testSerde(output, newListSerializer((UnversionedSerializer) IntSerializer.instance), list); + testSerde(output, newListSerializer((IVersionedSerializer) IntSerializer.instance), list, 0); + testSerde(output, newListSerializer((VersionedSerializer) IntSerializer.instance), list, Version.V1); + }); + } + + public enum Version + { + V1 + } + + public enum IntSerializer implements UnversionedSerializer, IVersionedSerializer, VersionedSerializer + { + instance; + + @Override + public void serialize(Integer t, DataOutputPlus out) throws IOException + { + out.writeInt(t); + } + + @Override + public void serialize(Integer t, DataOutputPlus out, Version version) throws IOException + { + serialize(t, out); + } + + @Override + public void serialize(Integer t, DataOutputPlus out, int version) throws IOException + { + serialize(t, out); + } + + @Override + public Integer deserialize(DataInputPlus in) throws IOException + { + return in.readInt(); + } + + @Override + public Integer deserialize(DataInputPlus in, Version version) throws IOException + { + return deserialize(in); + } + + @Override + public Integer deserialize(DataInputPlus in, int version) throws IOException + { + return deserialize(in); + } + + @Override + public long serializedSize(Integer t) + { + return TypeSizes.INT_SIZE; + } + + @Override + public long serializedSize(Integer t, Version version) + { + return serializedSize(t); + } + + @Override + public long serializedSize(Integer t, int version) + { + return serializedSize(t); + } + } +} \ No newline at end of file diff --git a/test/unit/org/apache/cassandra/utils/ConfigGenBuilderTest.java b/test/unit/org/apache/cassandra/utils/ConfigGenBuilderTest.java index 6bae12c31987..7bf1d12fd8ca 100644 --- a/test/unit/org/apache/cassandra/utils/ConfigGenBuilderTest.java +++ b/test/unit/org/apache/cassandra/utils/ConfigGenBuilderTest.java @@ -32,12 +32,14 @@ import org.apache.cassandra.locator.SimpleSeedProvider; import static accord.utils.Property.qt; +import static org.apache.cassandra.config.CassandraRelevantProperties.STORAGE_DIR; public class ConfigGenBuilderTest { static { File.unsafeSetFilesystem(Jimfs.newFileSystem("testing")); + STORAGE_DIR.setString("/data"); } private static final Gen> GEN = new ConfigGenBuilder().build(); diff --git a/test/unit/org/apache/cassandra/utils/Generators.java b/test/unit/org/apache/cassandra/utils/Generators.java index 6bb7f56a8d5c..b8013ffee7f5 100644 --- a/test/unit/org/apache/cassandra/utils/Generators.java +++ b/test/unit/org/apache/cassandra/utils/Generators.java @@ -17,6 +17,7 @@ */ package org.apache.cassandra.utils; +import java.lang.reflect.Array; import java.math.BigDecimal; import java.math.BigInteger; import java.net.InetAddress; @@ -375,6 +376,11 @@ public static Gen directBytes(int min, int max) return bytes(min, max, SourceDSL.arbitrary().pick(BBCases.DIRECT, BBCases.READ_ONLY_DIRECT)); } + public static Gen directAndHeapBytes(int min, int max) + { + return bytes(min, max, SourceDSL.arbitrary().pick(BBCases.DIRECT, BBCases.HEAP)); + } + public static Gen bytesAnyType(int min, int max) { return bytes(min, max, SourceDSL.arbitrary().enumValues(BBCases.class)); @@ -494,6 +500,17 @@ public static Gen> set(Gen gen, Gen sizeGen) }; } + public static Gen> list(Gen gen, Gen sizeGen) + { + return rnd -> { + int size = sizeGen.generate(rnd); + List list = new ArrayList<>(size); + for (int i = 0; i < size; i++) + list.add(gen.generate(rnd)); + return list; + }; + } + public static Gen> uniqueList(Gen gen, Gen sizeGen) { return rnd -> { @@ -510,6 +527,17 @@ public static Gen> uniqueList(Gen gen, Gen sizeGen) }; } + public static Gen array(Class type, Gen gen, Gen sizeGen) + { + return rnd -> { + int size = sizeGen.generate(rnd); + T[] array = (T[]) Array.newInstance(type, size); + for (int i = 0; i < size; i++) + array[i] = gen.generate(rnd); + return array; + }; + } + public static Gen cached(Gen gen) { Object cacheMissed = new Object(); diff --git a/test/unit/org/apache/cassandra/utils/GeneratorsTest.java b/test/unit/org/apache/cassandra/utils/GeneratorsTest.java index b9358cce8e51..c065a1663f2c 100644 --- a/test/unit/org/apache/cassandra/utils/GeneratorsTest.java +++ b/test/unit/org/apache/cassandra/utils/GeneratorsTest.java @@ -18,11 +18,21 @@ package org.apache.cassandra.utils; +import java.util.List; +import java.util.Objects; +import java.util.Random; + import com.google.common.net.InternetDomainName; import org.junit.Test; +import accord.utils.Property; +import org.apache.cassandra.db.marshal.AsciiType; import org.assertj.core.api.Assertions; +import org.quicktheories.core.Gen; +import org.quicktheories.generators.SourceDSL; +import org.quicktheories.impl.JavaRandom; +import static org.apache.cassandra.utils.AbstractTypeGenerators.stringComparator; import static org.quicktheories.QuickTheory.qt; public class GeneratorsTest @@ -45,4 +55,42 @@ public void dnsDomainName() { qt().forAll(Generators.DNS_DOMAIN_NAME).checkAssert(InternetDomainName::from); } + + @Test + public void asciiDeterministic() + { + AbstractTypeGenerators.TypeSupport support = AbstractTypeGenerators.TypeSupport.of(AsciiType.instance, SourceDSL.strings().ascii().ofLengthBetween(1, 10), stringComparator(AsciiType.instance)); + int samples = 100; + int attempts = 100; + Property.qt().check(rs -> checkDeterministicGeneration(attempts, samples, rs.nextLong(), support.valueGen)); + Property.qt().check(rs -> checkDeterministicGeneration(attempts, samples, rs.nextLong(), support.bytesGen())); + } + + @Test + public void asciiThereAndBackAgain() + { + qt().forAll(SourceDSL.strings().ascii().ofLengthBetween(1, 100)).checkAssert(ascii -> { + String accum = ascii; + for (int i = 0; i < 100; i++) + accum = AsciiType.instance.compose(AsciiType.instance.decompose(accum)); + Assertions.assertThat(accum).isEqualTo(ascii); + }); + } + + private static void checkDeterministicGeneration(int attempts, int samples, long seed, Gen gen) + { + List goldSet = null; + for (int i = 0; i < attempts; i++) + { + JavaRandom qt = new JavaRandom(new Random(seed)); + List sample = SourceDSL.lists().of(gen).ofSize(samples).generate(qt); + if (goldSet == null) + { + goldSet = sample; + continue; + } + if (!Objects.equals(sample, goldSet)) + throw new AssertionError("seed=" + seed); + } + } } diff --git a/test/unit/org/apache/cassandra/utils/ImmutableUniqueList.java b/test/unit/org/apache/cassandra/utils/ImmutableUniqueList.java index 00fabea136b5..7db8b56c1848 100644 --- a/test/unit/org/apache/cassandra/utils/ImmutableUniqueList.java +++ b/test/unit/org/apache/cassandra/utils/ImmutableUniqueList.java @@ -26,8 +26,6 @@ import java.util.List; import java.util.RandomAccess; -import com.google.common.collect.Iterators; - import org.agrona.collections.Object2IntHashMap; public class ImmutableUniqueList extends AbstractList implements RandomAccess @@ -43,6 +41,12 @@ private ImmutableUniqueList(Builder builder) indexLookup = new Object2IntHashMap<>(builder.indexLookup); } + public static ImmutableUniqueList copyOf(Collection collection) + { + if (collection instanceof ImmutableUniqueList) return (ImmutableUniqueList) collection; + return ImmutableUniqueList.builder().addAll(collection).build(); + } + public static Builder builder() { return new Builder<>(); @@ -58,6 +62,14 @@ public static ImmutableUniqueList empty() return (ImmutableUniqueList) EMPTY; } + public static ImmutableUniqueList of(T... values) + { + Builder builder = builder(values.length); + for (T v : values) + builder.add(v); + return builder.build(); + } + public AsSet asSet() { if (asSet != null) return asSet; @@ -95,7 +107,7 @@ public int size() return values.length; } - public static final class Builder extends AbstractSet + public static final class Builder { private final List values; private final Object2IntHashMap indexLookup = new Object2IntHashMap<>(-1); @@ -111,29 +123,21 @@ public Builder(int expectedSize) this.values = new ArrayList<>(expectedSize); } - public Builder mayAddAll(Collection values) + public Builder add(T t) { - addAll(values); - return this; - } - - @Override - public boolean add(T t) - { - if (indexLookup.containsKey(t)) return false; + if (indexLookup.containsKey(t)) return this; int idx = this.idx++; indexLookup.put(t, idx); values.add(t); - return true; + return this; } - @Override - public boolean remove(Object o) + public Builder addAll(Collection c) { - throw new UnsupportedOperationException(); + c.forEach(this::add); + return this; } - @Override public void clear() { values.clear(); @@ -141,30 +145,6 @@ public void clear() idx = 0; } - @Override - public boolean isEmpty() - { - return values.isEmpty(); - } - - @Override - public boolean contains(Object o) - { - return indexLookup.containsKey(o); - } - - @Override - public Iterator iterator() - { - return Iterators.unmodifiableIterator(values.iterator()); - } - - @Override - public int size() - { - return values.size(); - } - public ImmutableUniqueList build() { return new ImmutableUniqueList<>(this); diff --git a/test/unit/org/apache/cassandra/utils/IntervalTreeTest.java b/test/unit/org/apache/cassandra/utils/IntervalTreeTest.java index 14e70c0b001e..2a8cd2f76f01 100644 --- a/test/unit/org/apache/cassandra/utils/IntervalTreeTest.java +++ b/test/unit/org/apache/cassandra/utils/IntervalTreeTest.java @@ -28,15 +28,17 @@ import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableSet; +import org.junit.Before; import org.junit.BeforeClass; import org.junit.Test; +import org.apache.cassandra.config.DatabaseDescriptor; import org.quicktheories.WithQuickTheories; import org.quicktheories.core.Gen; import org.quicktheories.generators.SourceDSL; +import static com.google.common.base.Predicates.not; import static java.util.concurrent.TimeUnit.SECONDS; -import static java.util.function.Predicate.not; import static org.apache.cassandra.config.CassandraRelevantProperties.TEST_INTERVAL_TREE_EXPENSIVE_CHECKS; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; @@ -46,15 +48,26 @@ public class IntervalTreeTest implements WithQuickTheories { + static final int TESTING_SECONDS = 15; + + private final AtomicInteger id = new AtomicInteger(); + @BeforeClass - public static void enableExpensiveRangeChecks() + public static void beforeClass() { assertFalse(TEST_INTERVAL_TREE_EXPENSIVE_CHECKS.getBoolean()); // Expect off by default + DatabaseDescriptor.daemonInitialization(); TEST_INTERVAL_TREE_EXPENSIVE_CHECKS.setBoolean(true); assertTrue(TEST_INTERVAL_TREE_EXPENSIVE_CHECKS.getBoolean()); assertTrue(IntervalTree.EXPENSIVE_CHECKS); } + @Before + public void setUp() + { + id.set(0); + } + @Test public void testSearch() { @@ -314,17 +327,20 @@ public void testPointSearchEquivalence() resultPoint, resultInterval); } + private String intervalData(int lo, int hi) + { + return "(" + lo + "," + hi + "," + id.getAndIncrement() + ")"; + } + private Gen> intervalGen() { - AtomicInteger id = new AtomicInteger(); return SourceDSL.integers().between(-5, 5) .flatMap(start -> SourceDSL.integers().between(-5, 5) .map(end -> { int lo = Math.min(start, end); int hi = Math.max(start, end); - String data = "(" + lo + "," + hi + "," + id.getAndIncrement() + ")"; - return Interval.create(lo, hi, data); + return Interval.create(lo, hi, intervalData(lo, hi)); })); } @@ -369,7 +385,7 @@ private List search(Collection> intervals, Interval< @Test public void qtIntervalTreeTest() { - qt().forAll(intervalsListGen(), queryGen()) + qt().withExamples(-1).withTestingTime(TESTING_SECONDS, SECONDS).forAll(intervalsListGen(), queryGen()) .check((intervals, query) -> { IntervalTree> tree = IntervalTree.build(intervals); @@ -403,12 +419,12 @@ public void qtIntervalTreeTest() } @Test - public void qtUpdateFunctionTest() + public void qtUpdateTest() { - qt().withExamples(-1).withTestingTime(30, SECONDS).forAll(intervalsListGen(), - intervalsListGen(), - SourceDSL.lists().of(queryGen()).ofSizeBetween(1, 4), - SourceDSL.integers().all()) + qt().withExamples(-1).withTestingTime(TESTING_SECONDS, SECONDS).forAll(intervalsListGen(), + intervalsListGen(), + SourceDSL.lists().of(queryGen()).ofSizeBetween(1, 4), + SourceDSL.integers().all()) .check((original, toAdd, queries, seed) -> { IntervalTree> originalTree = IntervalTree.build(original); @@ -423,27 +439,19 @@ public void qtUpdateFunctionTest() toAdd.removeAll(original.stream().filter(not(removals::contains)).collect(Collectors.toList())); - IntervalTree> updatedTree = originalTree.update(removals.toArray(new Interval[0]), toAdd.toArray(new Interval[0])); + Set> expectedFinal = new HashSet<>(original); + expectedFinal.removeAll(removals); + expectedFinal.addAll(toAdd); - Set> naiveFinal = new HashSet<>(original); - naiveFinal.removeAll(removals); - naiveFinal.addAll(toAdd); + IntervalTree> updatedTree = originalTree.update(removals.toArray(new Interval[0]), toAdd.toArray(new Interval[0])); Set> iteratedTree = ImmutableSet.copyOf(updatedTree); - if (!naiveFinal.equals(iteratedTree)) - originalTree.update(removals.toArray(new Interval[0]), toAdd.toArray(new Interval[0])); - assertEquals(naiveFinal, iteratedTree); + assertEquals(expectedFinal, iteratedTree); for (Interval query : queries) { Set actualResults = ImmutableSet.copyOf(updatedTree.search(query)); - Set expectedResults = ImmutableSet.copyOf(search(naiveFinal, query)); - - if (!expectedResults.equals(actualResults)) - { - originalTree.update(removals.toArray(new Interval[0]), toAdd.toArray(new Interval[0])); - updatedTree.search(query); - } + Set expectedResults = ImmutableSet.copyOf(search(expectedFinal, query)); assertEquals(expectedResults, actualResults); @@ -457,4 +465,143 @@ public void qtUpdateFunctionTest() return true; }); } -} \ No newline at end of file + + @Test + public void qtReplaceFunctionTest() + { + qt().withExamples(-1).withTestingTime(TESTING_SECONDS, SECONDS) + .forAll(intervalsListGen(), // Our random list of intervals + SourceDSL.lists().of(queryGen()).ofSizeBetween(1, 4), + SourceDSL.integers().all()) + .check((original, queries, seed) -> { + + IntervalTree> originalTree = IntervalTree.build(original); + java.util.Random rng = new java.util.Random(seed); + List> expectedFinal = new ArrayList<>(original); + + int numReplacements = rng.nextInt(original.size() + 1); + List> replacements = new ArrayList<>(original); + for (int i = 0; i < original.size() - numReplacements; i++) + replacements.remove(rng.nextInt(replacements.size())); + + List, Interval>> toReplace = new ArrayList<>(); + for (int i = 0; i < replacements.size(); i++) + { + Interval oldInterval = replacements.get(i); + + Interval newInterval = Interval.create( + oldInterval.min, + oldInterval.max, + intervalData(oldInterval.min, oldInterval.max) + ); + toReplace.add(Pair.create(oldInterval, newInterval)); + } + + for (Pair, Interval> entry : toReplace) + { + expectedFinal.remove(entry.left); + expectedFinal.add(entry.right); + } + + IntervalTree> replacedTree = originalTree.replace(toReplace); + + Set> iteratedReplaced = ImmutableSet.copyOf(replacedTree); + assertEquals("Iterated intervals should match expected set after replace", + ImmutableSet.copyOf(expectedFinal), iteratedReplaced); + + for (Interval query : queries) + { + List replacedResults = replacedTree.search(query); + List expectedResults = search(expectedFinal, query); + + Set replacedSet = new HashSet<>(replacedResults); + Set expectedSet = new HashSet<>(expectedResults); + assertEquals("Search results mismatch after replace for query " + query, + expectedSet, replacedSet); + + // Also check point-search if min==max + if (query.min.equals(query.max)) + { + List replacedPoint = replacedTree.search(query.min); + assertEquals("Point-search mismatch after replace for point " + query.min, + replacedSet, new HashSet<>(replacedPoint)); + } + } + + return true; + }); + } + + @Test + public void testAddIntervals() + { + List> intervals = new ArrayList<>(); + + intervals.add(Interval.create(-300, -200)); + intervals.add(Interval.create(-3, -2)); + intervals.add(Interval.create(1, 2)); + intervals.add(Interval.create(3, 6)); + intervals.add(Interval.create(2, 4)); + intervals.add(Interval.create(5, 7)); + intervals.add(Interval.create(4, 6)); + intervals.add(Interval.create(15, 20)); + intervals.add(Interval.create(49, 60)); + + + IntervalTree> it = IntervalTree.build(intervals); + + List> intervalsToAdd = new ArrayList<>(); + intervalsToAdd.add(Interval.create(1, 3)); + intervalsToAdd.add(Interval.create(8, 9)); + intervalsToAdd.add(Interval.create(40, 50)); + intervals.addAll(intervalsToAdd); + + it = it.add(intervalsToAdd.toArray(IntervalTree.EMPTY_ARRAY)); + + assertEquals(3, it.search(Interval.create(4, 4)).size()); + assertEquals(4, it.search(Interval.create(4, 5)).size()); + assertEquals(7, it.search(Interval.create(-1, 10)).size()); + assertEquals(0, it.search(Interval.create(-1, -1)).size()); + assertEquals(5, it.search(Interval.create(1, 4)).size()); + assertEquals(2, it.search(Interval.create(0, 1)).size()); + assertEquals(0, it.search(Interval.create(10, 12)).size()); + } + + @Test + public void qtAddTest() + { + qt().withExamples(-1).withTestingTime(TESTING_SECONDS, SECONDS) + .forAll(intervalsListGen(), queryGen()) + .check((intervals, query) -> { + Set> intervalsSet = ImmutableSet.copyOf(intervals); + IntervalTree> tree = IntervalTree.build(ImmutableList.of()); + List> allIntervals = new ArrayList<>(); + for (Interval interval : intervals) + { + allIntervals.add(interval); + tree = tree.add(new Interval[] {interval}); + } + + List expected = search(intervals, query); + List actual = tree.search(query); + + Set setExpected = new HashSet<>(expected); + Set setActual = new HashSet<>(actual); + + assertEquals(setExpected, setActual); + + if (query.min.equals(query.max)) + { + List actualPoint = tree.search(query.min); + assertEquals(setExpected, new HashSet<>(actualPoint)); + } + + List> sortedByMin = new ArrayList<>(intervals); + sortedByMin.sort(Interval.minOrdering()); + + Set> fromTree = ImmutableSet.copyOf(tree); + assertEquals(intervalsSet, fromTree); + return true; + }); + } +} diff --git a/test/unit/org/apache/cassandra/utils/LoggingCommand.java b/test/unit/org/apache/cassandra/utils/LoggingCommand.java new file mode 100644 index 000000000000..190192819b36 --- /dev/null +++ b/test/unit/org/apache/cassandra/utils/LoggingCommand.java @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.utils; + +import java.time.Duration; +import java.util.ArrayList; +import java.util.List; +import java.util.function.BiFunction; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import accord.utils.Gen; +import accord.utils.Property; +import accord.utils.Property.Command; + +import static accord.utils.Property.multistep; + +public class LoggingCommand extends Property.ForwardingCommand +{ + private static final Logger logger = LoggerFactory.getLogger(LoggingCommand.class); + + public LoggingCommand(Command delegate) + { + super(delegate); + } + + public static BiFunction>, Gen>> factory() + { + return (state, commandGen) -> rs -> { + Command c = commandGen.next(rs); + if (!(c instanceof Property.MultistepCommand)) + return new LoggingCommand<>(c); + Property.MultistepCommand multistep = (Property.MultistepCommand) c; + List> subcommands = new ArrayList<>(); + for (var sub : multistep) + subcommands.add(new LoggingCommand<>(sub)); + return multistep(subcommands); + }; + } + + @Override + public Result apply(State s) throws Throwable + { + String name = detailed(s); + long startNanos = Clock.Global.nanoTime(); + try + { + logger.info("Starting command: {}", name); + Result o = super.apply(s); + logger.info("Command {} was success after {}", name, Duration.ofNanos(Clock.Global.nanoTime() - startNanos)); + return o; + } + catch (Throwable t) + { + logger.warn("Command {} failed after {}: {}", name, Duration.ofNanos(Clock.Global.nanoTime() - startNanos), t.toString()); // don't want stack trace, just type/msg + throw t; + } + } +} diff --git a/test/unit/org/apache/cassandra/utils/MockFailureDetector.java b/test/unit/org/apache/cassandra/utils/MockFailureDetector.java new file mode 100644 index 000000000000..bee0fbb5b40c --- /dev/null +++ b/test/unit/org/apache/cassandra/utils/MockFailureDetector.java @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.utils; + +import org.apache.cassandra.gms.IFailureDetectionEventListener; +import org.apache.cassandra.gms.IFailureDetector; +import org.apache.cassandra.locator.InetAddressAndPort; + +public class MockFailureDetector implements IFailureDetector +{ + public boolean isAlive = true; + + public boolean isAlive(InetAddressAndPort ep) + { + return isAlive; + } + + public void interpret(InetAddressAndPort ep) + { + throw new UnsupportedOperationException(); + } + + public void report(InetAddressAndPort ep) + { + throw new UnsupportedOperationException(); + } + + public void registerFailureDetectionEventListener(IFailureDetectionEventListener listener) + { + throw new UnsupportedOperationException(); + } + + public void unregisterFailureDetectionEventListener(IFailureDetectionEventListener listener) + { + throw new UnsupportedOperationException(); + } + + public void remove(InetAddressAndPort ep) + { + throw new UnsupportedOperationException(); + } + + public void forceConviction(InetAddressAndPort ep) + { + throw new UnsupportedOperationException(); + } +} diff --git a/test/unit/org/apache/cassandra/utils/RangeTreeTest.java b/test/unit/org/apache/cassandra/utils/RangeTreeTest.java new file mode 100644 index 000000000000..695894603dba --- /dev/null +++ b/test/unit/org/apache/cassandra/utils/RangeTreeTest.java @@ -0,0 +1,568 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.utils; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.Comparator; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.TimeUnit; +import java.util.function.LongUnaryOperator; +import java.util.function.Supplier; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import accord.impl.IntKey; +import accord.impl.IntKey.Routing; +import accord.primitives.Range; +import accord.utils.Gen; +import accord.utils.Gens; +import accord.utils.RandomSource; +import accord.utils.SearchableRangeList; +import org.agrona.collections.IntArrayList; +import org.agrona.collections.LongArrayList; +import org.assertj.core.api.Assertions; + +import static accord.utils.Property.qt; + +@RunWith(Parameterized.class) +public class RangeTreeTest +{ + private static final Logger logger = LoggerFactory.getLogger(RangeTreeTest.class); + private static final Comparator COMPARATOR = Comparator.naturalOrder(); + private static final RangeTree.Accessor END_INCLUSIVE = new RangeTree.Accessor<>() + { + @Override + public Routing start(Range range) + { + return (Routing) range.start(); + } + + @Override + public Routing end(Range range) + { + return (Routing) range.end(); + } + + @Override + public boolean contains(Range range, Routing routing) + { + return range.contains(routing); + } + + @Override + public boolean contains(Routing start, Routing end, Routing routing) + { + if (routing.compareTo(start) <= 0) + return false; + if (routing.compareTo(end) > 0) + return false; + return true; + } + + @Override + public boolean intersects(Range range, Routing start, Routing end) + { + return range.compareIntersecting(IntKey.range(start, end)) == 0; + } + + @Override + public boolean intersects(Range left, Range right) + { + return left.compareIntersecting(right) == 0; + } + }; + private static final RangeTree.Accessor ALL_INCLUSIVE = new RangeTree.Accessor<>() + { + @Override + public Routing start(Range range) + { + return (Routing) range.start(); + } + + @Override + public Routing end(Range range) + { + return (Routing) range.end(); + } + + @Override + public boolean contains(Range range, Routing routing) + { + return range.contains(routing) || range.start().equals(routing); + } + + @Override + public boolean contains(Routing start, Routing end, Routing routing) + { + if (routing.compareTo(start) < 0) + return false; + if (routing.compareTo(end) > 0) + return false; + return true; + } + + @Override + public boolean intersects(Range range, Routing start, Routing end) + { + return range.compareIntersecting(IntKey.range(start, end)) == 0 || range.end().equals(start) || range.start().equals(end); + } + + @Override + public boolean intersects(Range left, Range right) + { + return left.compareIntersecting(right) == 0 || left.end().equals(right.start()) || left.start().equals(right.end()); + } + }; + + private static final Gen.IntGen SMALL_INT_GEN = rs -> rs.nextInt(0, 10); + private static final int MIN_TOKEN = 0, MAX_TOKEN = 1 << 16; + private static final int TOKEN_RANGE_SIZE = MAX_TOKEN - MIN_TOKEN + 1; + private static final Gen TOKEN_DISTRIBUTION = Gens.mixedDistribution(MIN_TOKEN, MAX_TOKEN + 1); + private static final Gen RANGE_SIZE_DISTRIBUTION = Gens.mixedDistribution(10, (int) (TOKEN_RANGE_SIZE * .01)); + + // Used to test different worse case patterns and see how the tree performs. + private enum Pattern + { + RANDOM, // tends to have high selectivity: matches 50-100% of the tree in testing + NO_OVERLP, // tests to have low selectivity; matches 1-2 elements in testing + SMALL_RANGES // lower selectivity than RANDOM but still matches ~30% of the tree in testing + } + + // Having different models makes sure that the tree is flexiable enough and can be used with the semantics the user + // needs (with regard to inclusivity). It also adds more confidence that the search logic is correct as different + // algorithems help validate this. + private enum ModelType {List, IntervalTree, SearchableRangeList} + private final Pattern pattern; + private final ModelType modelType; + + public RangeTreeTest(Pattern pattern, ModelType modelType) + { + this.pattern = pattern; + this.modelType = modelType; + } + + @Parameterized.Parameters(name = "{0}, {1}") + public static Collection data() { + return Stream.of(Pattern.values()) + .flatMap(p -> + Stream.of(ModelType.values()) + .map(m -> new Object[]{ p, m })) + .collect(Collectors.toList()); + } + + @Test + public void test() + { + int samples = 3_000; + int examples = 10; + LongArrayList byToken = new LongArrayList(samples * examples, -1); + LongArrayList modelByToken = new LongArrayList(samples * examples, -1); + LongArrayList byTokenLength = new LongArrayList(samples * examples, -1); + LongArrayList byRange = new LongArrayList(samples * examples, -1); + LongArrayList modelByRange = new LongArrayList(samples * examples, -1); + LongArrayList byRangeLength = new LongArrayList(samples * examples, -1); + qt().withExamples(examples).check(rs -> { + var map = create(modelType); + var model = createModel(modelType); + + Gen rangeGen = rangeGen(rs, pattern, samples); + for (int i = 0; i < samples; i++) + { + var range = rangeGen.next(rs); + var value = SMALL_INT_GEN.nextInt(rs); + map.put(range, value); + model.put(range, value); + } + model.done(); + Assertions.assertThat(map.actual()).hasSize(samples); + if (rangeGen instanceof NoOverlap) + ((NoOverlap) rangeGen).reset(); + Gen.IntGen tokenGe = TOKEN_DISTRIBUTION.next(rs); + for (int i = 0; i < samples; i++) + { + { + // key lookup + var lookup = IntKey.routing(tokenGe.nextInt(rs)); + var actual = timed(byToken, () -> map.intersectsToken(lookup)); + var expected = timed(modelByToken, () -> model.intersectsToken(lookup)); + byTokenLength.addLong(expected.size()); + Assertions.assertThat(sort(actual)) + .describedAs("Write=%d; token=%s", i, lookup) + .isEqualTo(sort(expected)); + } + { + // range lookup + var lookup = rangeGen.next(rs); + var actual = timed(byRange, () -> map.intersects(lookup)); + var expected = timed(modelByRange, () -> model.intersects(lookup)); + byRangeLength.addLong(expected.size()); + Assertions.assertThat(sort(actual)) + .describedAs("Write=%d; range=%s", i, lookup) + .isEqualTo(sort(expected)); + } + } + }); + StringBuilder sb = new StringBuilder(); + sb.append("======="); + sb.append("\nPattern: " + pattern); + sb.append("\nModel: " + modelType); + sb.append("\nBy Token:"); + sb.append("\n\tSizes: " + stats(byTokenLength, false)); + sb.append("\n\t" + modelType + ": " + stats(modelByToken, true)); + sb.append("\n\tTree: " + stats(byToken, true)); + sb.append("\nBy Range:"); + sb.append("\n\tSizes: " + stats(byRangeLength, false)); + sb.append("\n\t" + modelType + ": " + stats(modelByRange, true)); + sb.append("\n\tTree: " + stats(byRange, true)); + logger.info(sb.toString()); + } + + private static class NoOverlap implements Gen + { + private final int delta; + private int idx = 0; + + public NoOverlap(int samples) + { + this.delta = TOKEN_RANGE_SIZE / samples; + } + + @Override + public Range next(RandomSource random) + { + int a = delta * idx++; + int b = a + delta; + return IntKey.range(a, b); + } + + private void reset() + { + idx = 0; + } + } + + private static Gen rangeGen(RandomSource randomSource, Pattern pattern, int samples) + { + Gen.IntGen tokenGen = TOKEN_DISTRIBUTION.next(randomSource); + switch (pattern) + { + case RANDOM: + return rs -> { + int a = tokenGen.nextInt(rs); + int b = tokenGen.nextInt(rs); + while (a == b) + b = tokenGen.nextInt(rs); + if (a > b) + { + int tmp = a; + a = b; + b = tmp; + } + return IntKey.range(a, b); + }; + case SMALL_RANGES: + Gen.IntGen rangeSizeGen = RANGE_SIZE_DISTRIBUTION.next(randomSource); + return rs -> { + int a = tokenGen.nextInt(rs); + int rangeSize = rangeSizeGen.nextInt(rs); + int b = a + rangeSize; + if (b > MAX_TOKEN) + { + b = a; + a = b - rangeSize; + } + return IntKey.range(a, b); + }; + case NO_OVERLP: + return new NoOverlap(samples); + default: + throw new AssertionError(); + } + } + + private static String stats(LongArrayList list, boolean isTime) + { + LongUnaryOperator fn = isTime ? TimeUnit.NANOSECONDS::toMicros : l -> l; + String postfix = isTime ? "micro" : ""; + long[] array = list.toLongArray(); + Arrays.sort(array); + StringBuilder sb = new StringBuilder(); + sb.append("Min: ").append(fn.applyAsLong(array[0])).append(postfix); + sb.append(", Median: ").append(fn.applyAsLong(array[array.length / 2])).append(postfix); + sb.append(", Max: ").append(fn.applyAsLong(array[array.length - 1])).append(postfix); + return sb.toString(); + } + + private static T timed(LongArrayList target, Supplier fn) + { + long nowNs = System.nanoTime(); + try + { + return fn.get(); + } + finally + { + target.add(System.nanoTime() - nowNs); + } + } + + private static List> sort(List> array) + { + array.sort((a, b) -> { + int rc = a.getKey().compare(b.getKey()); + if (rc == 0) + rc = a.getValue().compareTo(b.getValue()); + return rc; + }); + return array; + } + + private interface Model + { + Object actual(); + + void put(Range range, int value); + + List> intersectsToken(Routing key); + + List> intersects(Range range); + + void done(); + } + + private static RangeTreeModel create(ModelType modelType) + { + switch (modelType) + { + case List: + case SearchableRangeList: + return new RangeTreeModel(new RTree<>(COMPARATOR, END_INCLUSIVE)); + case IntervalTree: return new RangeTreeModel(new RTree<>(COMPARATOR, ALL_INCLUSIVE)); + default: + throw new AssertionError("Unknown type: " + modelType); + } + } + + private static Model createModel(ModelType modelType) + { + switch (modelType) + { + case List: return new ListModel(); + case SearchableRangeList: return new SearchableRangeListModel(); + case IntervalTree: return new IntervalTreeModel(); + default: + throw new AssertionError("Unknown type: " + modelType); + } + } + + private static class RangeTreeModel implements Model + { + private final RangeTree tree; + + private RangeTreeModel(RangeTree tree) + { + this.tree = tree; + } + + @Override + public RangeTree actual() + { + return tree; + } + + @Override + public void put(Range range, int value) + { + tree.add(range, value); + } + + @Override + public List> intersectsToken(Routing key) + { + return tree.searchToken(key); + } + + @Override + public List> intersects(Range range) + { + return tree.search(range); + } + + @Override + public void done() + { + + } + } + + private static class ListModel implements Model + { + List> actual = new ArrayList<>(); + + @Override + public List> actual() + { + return actual; + } + + @Override + public void put(Range range, int value) + { + actual.add(Map.entry(range, value)); + } + + @Override + public List> intersectsToken(Routing key) + { + return actual.stream() + .filter(p -> p.getKey().contains(key)) + .collect(Collectors.toList()); + } + + @Override + public List> intersects(Range range) + { + return actual.stream() + .filter(p -> p.getKey().compareIntersecting(range) == 0) + .collect(Collectors.toList()); + } + + @Override + public void done() + { + + } + } + + private static class IntervalTreeModel implements Model + { + IntervalTree.Builder> builder = IntervalTree.builder(); + IntervalTree> actual = null; + + @Override + public IntervalTree> actual() + { + return actual; + } + + @Override + public void put(Range range, int value) + { + builder.add(new Interval<>((Routing) range.start(), (Routing) range.end(), value)); + } + + @Override + public List> intersectsToken(Routing key) + { + return map(actual.matches(key)); + } + + @Override + public List> intersects(Range range) + { + return map(actual.matches(new Interval<>((Routing) range.start(), (Routing) range.end(), null))); + } + + private static List> map(List> matches) + { + return matches.stream().map(i -> Map.entry(IntKey.range(i.min, i.max), i.data)).collect(Collectors.toList()); + } + + @Override + public void done() + { + assert builder != null; + actual = builder.build(); + builder = null; + } + } + + private static class SearchableRangeListModel implements Model + { + private final Map map = new HashMap<>(); + private Range[] ranges; + private SearchableRangeList list = null; + + @Override + public Object actual() + { + return list; + } + + @Override + public void put(Range range, int value) + { + map.computeIfAbsent(range, ignore -> new IntArrayList()).addInt(value); + } + + @Override + public List> intersectsToken(Routing key) + { + List> matches = new ArrayList<>(); + // find ranges, then add the values + list.forEachKey(key, (a, b, c, d, idx) -> { + Range match = ranges[idx]; + map.get(match).forEachInt(v -> matches.add(Map.entry(match, v))); + }, (a, b, c, d, start, end) -> { + for (int i = start; i < end; i++) + { + Range match = ranges[i]; + map.get(match).forEachInt(v -> matches.add(Map.entry(match, v))); + } + }, 0, 0, 0, 0, 0); + return matches; + } + + @Override + public List> intersects(Range range) + { + List> matches = new ArrayList<>(); + // find ranges, then add the values + list.forEachRange(range, (a, b, c, d, idx) -> { + Range match = ranges[idx]; + map.get(match).forEachInt(v -> matches.add(Map.entry(match, v))); + }, (a, b, c, d, start, end) -> { + for (int i = start; i < end; i++) + { + Range match = ranges[i]; + map.get(match).forEachInt(v -> matches.add(Map.entry(match, v))); + } + }, 0, 0, 0, 0, 0); + return matches; + } + + @Override + public void done() + { + List ranges = new ArrayList<>(map.keySet()); + ranges.sort(Range::compare); + list = SearchableRangeList.build(this.ranges = ranges.toArray(Range[]::new)); + } + } +} \ No newline at end of file diff --git a/test/unit/org/apache/cassandra/utils/SerializationsTest.java b/test/unit/org/apache/cassandra/utils/SerializationsTest.java index e23bb3883247..0fc38209e598 100644 --- a/test/unit/org/apache/cassandra/utils/SerializationsTest.java +++ b/test/unit/org/apache/cassandra/utils/SerializationsTest.java @@ -118,7 +118,7 @@ public void testBloomFilterTable() throws Exception private void testBloomFilterTable(String file, boolean oldBfFormat) throws Exception { - Murmur3Partitioner partitioner = new Murmur3Partitioner(); + Murmur3Partitioner partitioner = Murmur3Partitioner.instance; try (FileInputStreamPlus in = new File(file).newInputStream(); IFilter filter = BloomFilterSerializer.forVersion(oldBfFormat).deserialize(in)) diff --git a/test/unit/org/apache/cassandra/utils/SimulatedMiniCluster.java b/test/unit/org/apache/cassandra/utils/SimulatedMiniCluster.java new file mode 100644 index 000000000000..aabbfbfb6d16 --- /dev/null +++ b/test/unit/org/apache/cassandra/utils/SimulatedMiniCluster.java @@ -0,0 +1,620 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.utils; + +import java.net.UnknownHostException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Random; +import java.util.TreeSet; +import java.util.UUID; +import java.util.concurrent.TimeUnit; +import java.util.function.Function; +import java.util.function.Supplier; +import javax.annotation.Nullable; + +import com.google.common.collect.Iterables; + +import accord.utils.Gen; +import accord.utils.Gens; +import accord.utils.Invariants; +import accord.utils.RandomSource; +import org.apache.cassandra.concurrent.ExecutorFactory; +import org.apache.cassandra.concurrent.ScheduledExecutorPlus; +import org.apache.cassandra.concurrent.SequentialExecutorPlus; +import org.apache.cassandra.concurrent.SimulatedExecutorFactory; +import org.apache.cassandra.concurrent.Stage; +import org.apache.cassandra.db.ColumnFamilyStore; +import org.apache.cassandra.db.compaction.ICompactionManager; +import org.apache.cassandra.db.repair.CassandraTableRepairManager; +import org.apache.cassandra.dht.IPartitioner; +import org.apache.cassandra.dht.Range; +import org.apache.cassandra.dht.Token; +import org.apache.cassandra.gms.ApplicationState; +import org.apache.cassandra.gms.EndpointState; +import org.apache.cassandra.gms.HeartBeatState; +import org.apache.cassandra.gms.IEndpointStateChangeSubscriber; +import org.apache.cassandra.gms.IFailureDetector; +import org.apache.cassandra.gms.IGossiper; +import org.apache.cassandra.gms.VersionedValue; +import org.apache.cassandra.locator.InetAddressAndPort; +import org.apache.cassandra.locator.Locator; +import org.apache.cassandra.net.IVerbHandler; +import org.apache.cassandra.net.MessageDelivery; +import org.apache.cassandra.net.SimulatedMessageDelivery; +import org.apache.cassandra.net.SimulatedMessageDelivery.ActionSupplier; +import org.apache.cassandra.repair.IValidationManager; +import org.apache.cassandra.repair.SharedContext; +import org.apache.cassandra.repair.StreamExecutor; +import org.apache.cassandra.repair.TableRepairManager; +import org.apache.cassandra.repair.ValidationManager; +import org.apache.cassandra.service.ActiveRepairService; +import org.apache.cassandra.service.paxos.cleanup.PaxosRepairState; +import org.apache.cassandra.streaming.StreamEventHandler; +import org.apache.cassandra.streaming.StreamState; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.tcm.ClusterMetadataService; +import org.apache.cassandra.tcm.MultiStepOperation; +import org.apache.cassandra.tcm.StubClusterMetadataService; +import org.apache.cassandra.tcm.Transformation; +import org.apache.cassandra.tcm.membership.Directory; +import org.apache.cassandra.tcm.membership.Location; +import org.apache.cassandra.tcm.membership.NodeAddresses; +import org.apache.cassandra.tcm.membership.NodeId; +import org.apache.cassandra.tcm.membership.NodeVersion; +import org.apache.cassandra.tcm.ownership.UniformRangePlacement; +import org.apache.cassandra.tcm.transformations.PrepareJoin; +import org.mockito.Mockito; + +import static org.apache.cassandra.utils.AccordGenerators.fromQT; + +public class SimulatedMiniCluster +{ + private final RandomSource rs; + private final Function> verbHandlerFactory; + private final SimulatedExecutorFactory executorFactory; + private final SequentialExecutorPlus orderedExecutor; + private final ScheduledExecutorPlus unorderedScheduled; + private final IFailureDetector failureDetector = Mockito.mock(IFailureDetector.class); + private final Locator locator = Mockito.mock(Locator.class); + private final MBeanWrapper mbean = Mockito.mock(MBeanWrapper.class); + private final SimulatedGossip gossiper = new SimulatedGossip(); + private final List failures = new ArrayList<>(); + private final IPartitioner partitioner; + private final Map> dcsToRacks; + private final List dcs; + private final int tokensPerInstance; + private final Gen tokenGen; + private ClusterMetadata current; + private final Map nodes = new LinkedHashMap<>(); + private final TreeSet knownTokens = new TreeSet<>(); // includes bootstraping nodes tokens (aka tokens not in the ring) + + private SimulatedMiniCluster(Builder builder) + { + this.rs = builder.rs; + this.verbHandlerFactory = builder.verbHandlerFactory; + this.executorFactory = new SimulatedExecutorFactory(rs, failures::add); + this.orderedExecutor = executorFactory.configureSequential("ignore").build(); + this.unorderedScheduled = executorFactory.scheduled("ignored"); + this.partitioner = fromQT(CassandraGenerators.nonLocalPartitioners()).next(rs); + this.dcsToRacks = createDcRackDetails(rs); + this.dcs = new ArrayList<>(dcsToRacks.keySet()); + dcs.sort(Comparator.naturalOrder()); + this.tokensPerInstance = rs.nextBoolean() ? 1 : 4; + this.tokenGen = fromQT(CassandraGenerators.token(partitioner)).filter(t -> !knownTokens.contains(t)); + // setup Directory with known dcs + this.current = new ClusterMetadata(partitioner); + ClusterMetadataService.unsetInstance(); + ClusterMetadataService.setInstance(StubClusterMetadataService.forTesting(current)); + } + + public Node node(int id) + { + return node(new NodeId(id)); + } + + public Node node(NodeId id) + { + Node node = nodes.get(id); + if (node == null) + throw new AssertionError("Unable to find node for id " + id); + return node; + } + + public Node node(InetAddressAndPort address) + { + //TODO (performance): don't walk, keep index? + for (Node node : nodes.values()) + { + if (node.broadcastAddressAndPort.equals(address)) + return node; + } + throw new AssertionError("Unable to find node for address " + address); + } + + private Collection nextUnknownTokens() + { + if (tokensPerInstance == 1) return Collections.singleton(tokenGen.next(rs)); + return Gens.lists(tokenGen).unique().ofSize(tokensPerInstance).next(rs); + } + + public Node createNode() + { + if (nodes.isEmpty()) + return createFirstNode(); + + NodeId id = new NodeId(nodes.size() + 1); + UUID hostId = id.toUUID(); + Collection tokens = nextUnknownTokens(); + String dc = rs.pick(dcs); + String rack = rs.pick(dcsToRacks.get(dc)); + Node node = new Node(id, hostId, address(id), tokens, dc, rack); + register(node); + return node; + } + + public Node createNodeAndJoin() + { + if (nodes.isEmpty()) + return createFirstNode(); + + NodeId id = new NodeId(nodes.size() + 1); + UUID hostId = id.toUUID(); + Collection tokens = nextUnknownTokens(); + String dc = rs.pick(dcs); + String rack = rs.pick(dcsToRacks.get(dc)); + Node node = new Node(id, hostId, address(id), tokens, dc, rack); + registerAndJoin(node); + return node; + } + + private Node createFirstNode() + { + NodeId id = new NodeId(nodes.size() + 1); + UUID hostId = id.toUUID(); + Collection tokens = nextUnknownTokens(); + String dc = dcs.get(0); + String rack = dcsToRacks.get(dc).get(0); + Node node = new Node(id, hostId, address(id), tokens, dc, rack); + registerAndJoin(node); + return node; + } + + private void registerAndJoin(Node node) + { + register(node); + prepareJoin(node.id); + while (!current.inProgressSequences.isEmpty()) + bumpInProgress(); + } + + private void register(Node node) + { + nodes.put(node.id, node); + knownTokens.addAll(node.tokens); + registerWithSnitch(node); + registerWithGossip(node); + registerWithCMS(node); + } + + private void registerWithCMS(Node node) + { + if (node.id.id() == 1) + { + // rebuild metadata from scratch + Directory directory = Directory.EMPTY.with(new NodeAddresses(node.hostId, node.broadcastAddressAndPort, node.broadcastAddressAndPort, node.broadcastAddressAndPort), new Location(node.dc, node.rack)); + notifyMetadataChange(new ClusterMetadata(partitioner, directory)); + } + else + { + notifyMetadataChange(current.transformer().register(new NodeAddresses(node.hostId, node.broadcastAddressAndPort, node.broadcastAddressAndPort, node.broadcastAddressAndPort), + new Location(node.dc, node.rack), + NodeVersion.CURRENT) + .build().metadata); + } + } + + private void prepareJoin(NodeId id) + { + Node node = nodes.get(id); + if (node == null) + throw new IllegalArgumentException("Unknown " + id); + PrepareJoin task = new PrepareJoin(id, new HashSet<>(node.tokens), new UniformRangePlacement(), true, false); + notifyMetadataChange(process(task).metadata); + } + + private void bumpInProgress() + { + if (current.inProgressSequences.isEmpty()) + throw new IllegalStateException("Attempted to bump epoch when nothing was pending"); + Iterator> it = current.inProgressSequences.iterator(); + Invariants.require(it.hasNext()); + notifyMetadataChange(process(it.next()).metadata); + } + + protected void notifyMetadataChange(ClusterMetadata current) + { + this.current = current; + ((StubClusterMetadataService) ClusterMetadataService.instance()).setMetadata(current); + } + + private Transformation.Success process(Transformation transformation) + { + Transformation.Result result = transformation.execute(current); + if (result.isRejected()) + throw new IllegalStateException("Unable to make TCM transition"); + return result.success(); + } + + private Transformation.Success process(MultiStepOperation transformation) + { + Transformation.Result result = transformation.applyTo(current); + if (result.isRejected()) + throw new IllegalStateException("Unable to make TCM transition"); + return result.success(); + } + + private static InetAddressAndPort address(NodeId id) + { + try + { + return InetAddressAndPort.getByAddress(ByteArrayUtil.bytes(id.id())); + } + catch (UnknownHostException e) + { + throw new AssertionError("Unable to create address for id " + id, e); + } + } + + private static Map> createDcRackDetails(RandomSource rs) + { + int numDCs = rs.nextInt(1, 4); + Map> map = new LinkedHashMap<>(); + for (int i = 0; i < numDCs; i++) + { + String name = "DC" + (i + 1); + int numRacks = rs.nextInt(1, 10); + List racks = Gens.lists(Gens.strings().ascii().ofLength(5).map(s -> "R" + s)).unique().ofSize(numRacks).next(rs); + racks.sort(Comparator.naturalOrder()); + map.put(name, racks); + } + return map; + } + + public boolean hasWork() + { + return executorFactory.hasWork(); + } + + public boolean processAny() + { + return executorFactory.processAny(); + } + + public boolean processOne() + { + return executorFactory.processOne(); + } + + public void processAll() + { + executorFactory.processAll();; + } + + public void simulateStages(Stage... stages) + { + for (Stage stage : stages) + { + switch (stage) + { + case GOSSIP: + case ANTI_ENTROPY: + case MIGRATION: + case MISC: + case TRACING: + case FETCH_METADATA: + stage.unsafeSetExecutor(orderedExecutor); + break; + default: + stage.unsafeSetExecutor(unorderedScheduled); + } + } + } + + private void registerWithSnitch(Node node) + { + Mockito.when(locator.location(Mockito.eq(node.broadcastAddressAndPort))).thenReturn(new Location(node.dc, node.rack)); + } + + private void registerWithGossip(Node node) + { + VersionedValue.VersionedValueFactory valueFactory = node.valueFactory; + EndpointState state = new EndpointState(new HeartBeatState(42, 42)); + state.addApplicationState(ApplicationState.STATUS, valueFactory.normal(node.tokens)); + state.addApplicationState(ApplicationState.STATUS_WITH_PORT, valueFactory.normal(node.tokens)); + state.addApplicationState(ApplicationState.HOST_ID, valueFactory.hostId(node.hostId)); + state.addApplicationState(ApplicationState.TOKENS, valueFactory.tokens(node.tokens)); + state.addApplicationState(ApplicationState.DC, valueFactory.datacenter(node.dc)); + state.addApplicationState(ApplicationState.RACK, valueFactory.rack(node.rack)); + state.addApplicationState(ApplicationState.RELEASE_VERSION, valueFactory.releaseVersion()); + + gossiper.endpoints.put(node.broadcastAddressAndPort, state); + } + + public static class Builder + { + private final RandomSource rs; + private final Function> verbHandlerFactory; + + public Builder(RandomSource rs, Function> verbHandlerFactory) + { + this.rs = rs; + this.verbHandlerFactory = verbHandlerFactory; + } + + public SimulatedMiniCluster build() + { + return new SimulatedMiniCluster(this); + } + } + + private enum NodeStatus { Init, Registered, Joining, Joined, Leaving, Removed} + + public class Node implements SharedContext + { + private final ICompactionManager compactionManager = Mockito.mock(ICompactionManager.class); + private final NodeId id; + private final UUID hostId; + private final InetAddressAndPort broadcastAddressAndPort; + private final Collection tokens; + private final String dc, rack; + private final VersionedValue.VersionedValueFactory valueFactory; + private final SimulatedMessageDelivery messaging; + private final SimulatedMessageDelivery.SimulatedMessageReceiver receiver; + private final ActiveRepairService activeRepairService; + private final PaxosRepairState paxosRepairState; + private final IValidationManager validationManager; + private final StreamExecutor streamExecutor; + private NodeStatus status = NodeStatus.Init; + private ActionSupplier messagingActions = (self, msg, to) -> SimulatedMessageDelivery.Action.DELIVER; + + public Node(NodeId id, UUID hostId, InetAddressAndPort broadcastAddressAndPort, Collection tokens, String dc, String rack) + { + this.id = id; + this.hostId = hostId; + this.broadcastAddressAndPort = broadcastAddressAndPort; + this.tokens = tokens; + this.dc = dc; + this.rack = rack; + + IPartitioner partitioner = Iterables.getFirst(tokens, null).getPartitioner(); + this.valueFactory = new VersionedValue.VersionedValueFactory(partitioner); + this.messaging = new SimulatedMessageDelivery(broadcastAddressAndPort, + messagingActions::get, + SimulatedMessageDelivery.randomDelay(rs), + (to, msg) -> unorderedScheduled.submit(() -> node(to).receiver.recieve(msg)), + (action, to, msg) -> {}, + unorderedScheduled::schedule, + failures::add); + this.activeRepairService = new ActiveRepairService(this); + this.paxosRepairState = new PaxosRepairState(this); + this.validationManager = (cfs, validator) -> unorderedScheduled.submit(() -> { + try + { + ValidationManager.doValidation(cfs, validator); + } + catch (Throwable e) + { + validator.fail(e); + } + }); + this.streamExecutor = plan -> { + long delayNanos = rs.nextLong(TimeUnit.SECONDS.toNanos(5), TimeUnit.MINUTES.toNanos(10)); + unorderedScheduled.schedule(() -> { + StreamState success = new StreamState(plan.planId(), plan.streamOperation(), Collections.emptySet()); + for (StreamEventHandler handler : plan.handlers()) + handler.onSuccess(success); + }, delayNanos, TimeUnit.NANOSECONDS); + return null; + }; + + // setup last as "this" is leaking, so make sure all final fields are defined first + this.receiver = messaging.receiver(verbHandlerFactory.apply(this)); + } + + public NodeId id() + { + return id; + } + + public UUID hostId() + { + return hostId; + } + + public void messagingActions(ActionSupplier messagingActions) + { + this.messagingActions = Objects.requireNonNull(messagingActions); + } + + @Override + public InetAddressAndPort broadcastAddressAndPort() + { + return broadcastAddressAndPort; + } + + @Override + public Supplier random() + { + return () -> rs.fork().asJdkRandom(); + } + + @Override + public Clock clock() + { + return executorFactory; + } + + @Override + public ExecutorFactory executorFactory() + { + return executorFactory; + } + + @Override + public MBeanWrapper mbean() + { + return mbean; + } + + @Override + public ScheduledExecutorPlus optionalTasks() + { + return unorderedScheduled; + } + + @Override + public ScheduledExecutorPlus nonPeriodicTasks() + { + return unorderedScheduled; + } + + @Override + public ScheduledExecutorPlus scheduledTasks() + { + return unorderedScheduled; + } + + + @Override + public IFailureDetector failureDetector() + { + return failureDetector; + } + + @Override + public Locator locator() + { + return locator; + } + + @Override + public IGossiper gossiper() + { + return gossiper; + } + + @Override + public MessageDelivery messaging() + { + return messaging; + } + + @Override + public ActiveRepairService repair() + { + return activeRepairService; + } + + @Override + public PaxosRepairState paxosRepairState() + { + return paxosRepairState; + } + + @Override + public ICompactionManager compactionManager() + { + return compactionManager; + } + + @Override + public IValidationManager validationManager() + { + return validationManager; + } + + @Override + public TableRepairManager repairManager(ColumnFamilyStore store) + { + return new CassandraTableRepairManager(store, this) + { + @Override + public void snapshot(String name, Collection> ranges, boolean force) + { + // no-op + } + }; + } + + @Override + public StreamExecutor streamExecutor() + { + return streamExecutor; + } + } + + private class SimulatedGossip implements IGossiper + { + private final Map endpoints = new HashMap<>(); + + @Override + public void register(IEndpointStateChangeSubscriber subscriber) + { + + } + + @Override + public void unregister(IEndpointStateChangeSubscriber subscriber) + { + + } + + @Nullable + @Override + public EndpointState getEndpointStateForEndpoint(InetAddressAndPort ep) + { + return endpoints.get(ep); + } + + @Override + public void notifyFailureDetector(Map remoteEpStateMap) + { + + } + + @Override + public void applyStateLocally(Map epStateMap) + { + // If we were testing paxos this would be wrong... + // CASSANDRA-18917 added support for simulating Gossip, but gossip issues were found so couldn't merge that patch... + // For the paxos repair, since we don't care about paxos messages, this is ok to no-op for now, but if paxos cleanup + // ever was to be tested this logic would need to be implemented + } + } +} diff --git a/test/unit/org/apache/cassandra/utils/StatefulRangeTreeTest.java b/test/unit/org/apache/cassandra/utils/StatefulRangeTreeTest.java new file mode 100644 index 000000000000..139f172dfb23 --- /dev/null +++ b/test/unit/org/apache/cassandra/utils/StatefulRangeTreeTest.java @@ -0,0 +1,471 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.utils; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.List; +import java.util.Map; +import java.util.TreeSet; +import java.util.stream.Collectors; + +import org.junit.Test; + +import accord.api.RoutingKey; +import accord.impl.IntKey; +import accord.primitives.Range; +import accord.utils.Gen; +import accord.utils.Gens; +import accord.utils.Property.Command; +import accord.utils.Property.UnitCommand; +import accord.utils.RandomSource; +import org.apache.cassandra.service.accord.RangeTreeRangeAccessor; +import org.assertj.core.api.Assertions; + +import static accord.utils.Property.commands; +import static accord.utils.Property.stateful; + +public class StatefulRangeTreeTest +{ + private static final Gen.IntGen SMALL_INT_GEN = rs -> rs.nextInt(0, 10); + private static final Gen.IntGen NUM_CHILDREN_GEN = rs -> rs.nextInt(2, 12); + private static final Gen SIZE_TARGET_DISTRIBUTION = Gens.mixedDistribution(1 << 3, 1 << 9); + private static final int MIN_TOKEN = 0, MAX_TOKEN = 1 << 16; + private static final int TOKEN_RANGE_SIZE = MAX_TOKEN - MIN_TOKEN + 1; + private static final Gen TOKEN_DISTRIBUTION = Gens.mixedDistribution(MIN_TOKEN, MAX_TOKEN + 1); + private static final Gen RANGE_SIZE_DISTRIBUTION = Gens.mixedDistribution(10, (int) (TOKEN_RANGE_SIZE * .01)); + static final Comparator> COMPARATOR = (a, b) -> { + int rc = a.getKey().compare(b.getKey()); + if (rc == 0) + rc = a.getValue().compareTo(b.getValue()); + return rc; + }; + + /** + * Stateful test for RTree. + *

    + * This test is very similar to {@link RangeTreeTest#test} but is fully mutable, so can not + * use the immutable search trees (else rebuidling becomes a large cost). Both tests should exist as they use different + * models, which helps build confidence that the RTree does the correct thing; that test also covers start and end + * inclusive, which this test does not. + */ + @Test + public void test() + { + stateful().check(commands(() -> State::new, state -> new Sut(state.sizeTarget, state.numChildren)) + .add((rs, state) -> new Create(state.newRange(rs), SMALL_INT_GEN.nextInt(rs))) + .add((rs, state) -> new Read(state.newRange(rs))) + .add((rs, state) -> new KeyRead(IntKey.routing(state.tokenGen.nextInt(rs)))) + .add((rs, state) -> new RangeRead(state.rangeGen.next(rs))) + .add(Iterate.instance) + .add(Clear.instance) + .addAllIf(state -> !state.uniqRanges.isEmpty(), + b -> b.add((rs, state) -> new Read(rs.pickOrderedSet(state.uniqRanges))) + .add((rs, state) -> { + Range range = rs.pickOrderedSet(state.uniqRanges); + int token = rs.nextInt(((IntKey.Routing) range.start()).key, ((IntKey.Routing) range.end()).key) + 1; + return new KeyRead(IntKey.routing(token)); + }) + .add((rs, state) -> new RangeRead(rs.pickOrderedSet(state.uniqRanges))) + .add((rs, state) -> new Update(rs.pickOrderedSet(state.uniqRanges), SMALL_INT_GEN.nextInt(rs))) + .add((rs, state) -> new Delete(rs.pickOrderedSet(state.uniqRanges))) + ) + .build()); + } + + private static Gen rangeGen(RandomSource rand) + { + Gen.IntGen tokenGen = TOKEN_DISTRIBUTION.next(rand); + switch (rand.nextInt(0, 3)) + { + case 0: // pure random + return rs -> { + int a = tokenGen.nextInt(rs); + int b = tokenGen.nextInt(rs); + while (a == b) + b = tokenGen.nextInt(rs); + if (a > b) + { + int tmp = a; + a = b; + b = tmp; + } + return IntKey.range(a, b); + }; + case 1: // small range + Gen.IntGen rangeSizeGen = RANGE_SIZE_DISTRIBUTION.next(rand); + return rs -> { + int a = tokenGen.nextInt(rs); + int rangeSize = rangeSizeGen.nextInt(rs); + int b = a + rangeSize; + if (b > MAX_TOKEN) + { + b = a; + a = b - rangeSize; + } + return IntKey.range(a, b); + }; + case 2: // single element + return rs -> { + int a = tokenGen.nextInt(rs); + int b = a + 1; + return IntKey.range(a, b); + }; + default: + throw new AssertionError(); + } + } + + static class Create implements UnitCommand + { + private final Range range; + private final int value; + + Create(Range range, int value) + { + this.range = range; + this.value = value; + } + + @Override + public void applyUnit(State state) + { + state.add(range, value); + } + + @Override + public void runUnit(Sut sut) + { + sut.tree.add(range, value); + } + + @Override + public void checkPostconditions(State state, Void expected, + Sut sut, Void actual) + { + Assertions.assertThat(sut.tree.size()).isEqualTo(state.list.size()); + } + + @Override + public String detailed(State state) + { + return "Create(" + range + ", " + value + ")"; + } + } + + static abstract class AbstractRead implements Command> + { + private final Comparator comparator; + + protected AbstractRead(Comparator comparator) + { + this.comparator = comparator; + } + + @Override + public void checkPostconditions(State state, List expected, + Sut sut, List actual) + { + expected.sort(comparator); + actual.sort(comparator); + Assertions.assertThat(actual).isEqualTo(expected); + } + } + + static class Read extends AbstractRead + { + private final Range range; + + Read(Range range) + { + super(Comparator.naturalOrder()); + this.range = range; + } + + @Override + public List apply(State state) + { + return state.get(range); + } + + @Override + public List run(Sut sut) + { + return sut.tree.get(range); + } + + @Override + public String detailed(State state) + { + return "Read(" + range + ")"; + } + } + + static class RangeRead extends AbstractRead> + { + private final Range range; + + RangeRead(Range range) + { + super(COMPARATOR); + this.range = range; + } + + @Override + public List> apply(State state) + { + return state.list.stream().filter(e -> e.getKey().compareIntersecting(range) == 0).collect(Collectors.toList()); + } + + @Override + public List> run(Sut sut) + { + return sut.tree.search(range); + } + + @Override + public String detailed(State state) + { + return "Range Read(" + range + ")"; + } + } + + static class KeyRead extends AbstractRead> + { + final RoutingKey key; + + KeyRead(RoutingKey key) + { + super(COMPARATOR); + this.key = key; + } + + @Override + public List> apply(State state) + { + return state.list.stream().filter(e -> e.getKey().contains(key)).collect(Collectors.toList()); + } + + @Override + public List> run(Sut sut) + { + return sut.tree.searchToken(key); + } + + @Override + public String detailed(State state) + { + return "Token Read(" + key + ")"; + } + } + + static class Update implements UnitCommand + { + private final Range range; + private final int value; + + Update(Range range, int value) + { + this.range = range; + this.value = value; + } + + @Override + public void applyUnit(State state) + { + state.update(range, value); + } + + @Override + public void runUnit(Sut sut) + { + sut.tree.get(range, e -> e.setValue(value)); + } + + @Override + public String detailed(State state) + { + return "Update(" + range + ", " + value + ")"; + } + } + + static class Delete implements UnitCommand + { + private final Range range; + + Delete(Range range) + { + this.range = range; + } + + @Override + public void applyUnit(State state) + { + state.remove(range); + } + + @Override + public void runUnit(Sut sut) + { + sut.tree.remove(range); + } + + @Override + public void checkPostconditions(State state, Void expected, + Sut sut, Void actual) + { + Assertions.assertThat(sut.tree.size()).isEqualTo(state.list.size()); + } + + @Override + public String detailed(State state) + { + return "Delete(" + range + ")"; + } + } + + static class Clear implements UnitCommand + { + static final Clear instance = new Clear(); + + @Override + public void applyUnit(State state) + { + state.uniqRanges.clear(); + state.list.clear(); + } + + @Override + public void runUnit(Sut sut) + { + sut.tree.clear(); + } + + @Override + public String detailed(State state) + { + return "Clear(size=" + state.list.size() + ")"; + } + } + + static class Iterate extends AbstractRead> + { + static final Iterate instance = new Iterate(); + + public Iterate() + { + super(COMPARATOR); + } + + @Override + public List> apply(State state) + { + return state.list; + } + + @Override + public List> run(Sut sut) + { + return sut.tree.stream().collect(Collectors.toList()); + } + + @Override + public String detailed(State state) + { + return "Iterate(size=" + state.list.size() + ")"; + } + } + + private static class State + { + private final List> list = new ArrayList<>(); + private final TreeSet uniqRanges = new TreeSet<>(Range::compare); + private final int sizeTarget, numChildren; + private final Gen.IntGen tokenGen; + private final Gen rangeGen; + + private State(RandomSource rs) + { + this.numChildren = NUM_CHILDREN_GEN.nextInt(rs); + this.sizeTarget = SIZE_TARGET_DISTRIBUTION.next(rs).filter(s -> s > numChildren).nextInt(rs); + this.tokenGen = TOKEN_DISTRIBUTION.next(rs); + this.rangeGen = rangeGen(rs); + } + + public Range newRange(RandomSource rs) + { + Range range; + while ((uniqRanges.contains(range = rangeGen.next(rs)))) + { + } + return range; + } + + public void add(Range range, int value) + { + list.add(new MutableEntry<>(range, value)); + uniqRanges.add(range); + } + + public List get(Range range) + { + if (!uniqRanges.contains(range)) + return Collections.emptyList(); + return list.stream().filter(e -> e.getKey().equals(range)).map(e -> e.getValue()).collect(Collectors.toList()); + } + + public void update(Range range, int value) + { + if (!uniqRanges.contains(range)) + return; + list.forEach(e -> { + if (e.getKey().equals(range)) + e.setValue(value); + }); + } + + public void remove(Range range) + { + if (!uniqRanges.contains(range)) + return; + uniqRanges.remove(range); + list.removeIf(e -> e.getKey().equals(range)); + } + + @Override + public String toString() + { + return "State{" + + "sizeTarget=" + sizeTarget + + ", numChildren=" + numChildren + + '}'; + } + } + + public static class Sut + { + private final RangeTree tree; + + private Sut(int sizeTarget, int numChildren) + { + tree = new RTree(Comparator.naturalOrder(), RangeTreeRangeAccessor.instance, sizeTarget, numChildren); + } + } +} diff --git a/test/unit/org/apache/cassandra/utils/bytecomparable/ByteSourceComparisonTest.java b/test/unit/org/apache/cassandra/utils/bytecomparable/ByteSourceComparisonTest.java index 496f6355c1d3..e1f11ac15a0b 100644 --- a/test/unit/org/apache/cassandra/utils/bytecomparable/ByteSourceComparisonTest.java +++ b/test/unit/org/apache/cassandra/utils/bytecomparable/ByteSourceComparisonTest.java @@ -494,7 +494,7 @@ public void testTupleTypeNonFull() tt.pack(decomposeAndRandomPad(UTF8Type.instance, ""), decomposeAndRandomPad(Int32Type.instance, 0)), // Note: a decomposed null (e.g. decomposeAndRandomPad(Int32Type.instance, null)) should not reach a tuple tt.pack(decomposeAndRandomPad(UTF8Type.instance, ""), null), - tt.pack(null, decomposeAndRandomPad(Int32Type.instance, 0)), + tt.pack((ByteBuffer) null, decomposeAndRandomPad(Int32Type.instance, 0)), tt.pack(decomposeAndRandomPad(UTF8Type.instance, "")), tt.pack((ByteBuffer) null), tt.pack() diff --git a/test/unit/org/apache/cassandra/utils/bytecomparable/ByteSourceConversionTest.java b/test/unit/org/apache/cassandra/utils/bytecomparable/ByteSourceConversionTest.java index 7ab30adfa792..740234d10035 100644 --- a/test/unit/org/apache/cassandra/utils/bytecomparable/ByteSourceConversionTest.java +++ b/test/unit/org/apache/cassandra/utils/bytecomparable/ByteSourceConversionTest.java @@ -341,7 +341,7 @@ public void testEmptyClustering() void assertClusteringPairConvertsSame(AbstractType t1, AbstractType t2, Object o1, Object o2) { - for (ValueAccessor accessor : ValueAccessors.ACCESSORS) + for (ValueAccessor accessor : ValueAccessors.FACTORY_SUPPORTED_ACCESSORS) assertClusteringPairConvertsSame(accessor, t1, t2, o1, o2, AbstractType::decompose); } @@ -442,7 +442,7 @@ public void testTupleTypeNonFull() tt.pack(decomposeAndRandomPad(UTF8Type.instance, ""), decomposeAndRandomPad(Int32Type.instance, 0)), // Note: a decomposed null (e.g. decomposeAndRandomPad(Int32Type.instance, null)) should not reach a tuple tt.pack(decomposeAndRandomPad(UTF8Type.instance, ""), null), - tt.pack(null, decomposeAndRandomPad(Int32Type.instance, 0)), + tt.pack((ByteBuffer) null, decomposeAndRandomPad(Int32Type.instance, 0)), tt.pack(decomposeAndRandomPad(UTF8Type.instance, "")), tt.pack((ByteBuffer) null), tt.pack() diff --git a/test/unit/org/apache/cassandra/utils/concurrent/AbstractTestPromise.java b/test/unit/org/apache/cassandra/utils/concurrent/AbstractTestPromise.java index 982d42df5d7c..35a43ca2ac93 100644 --- a/test/unit/org/apache/cassandra/utils/concurrent/AbstractTestPromise.java +++ b/test/unit/org/apache/cassandra/utils/concurrent/AbstractTestPromise.java @@ -50,7 +50,7 @@ void verify() } catch (Throwable t) { - throw new AssertionError("" + i, t); + throw new AssertionError(i + ": " + waitingOn.get(i), t); } } } diff --git a/test/unit/org/apache/cassandra/utils/concurrent/LockWithAsyncSignalTest.java b/test/unit/org/apache/cassandra/utils/concurrent/LockWithAsyncSignalTest.java new file mode 100644 index 000000000000..d9117454f8b8 --- /dev/null +++ b/test/unit/org/apache/cassandra/utils/concurrent/LockWithAsyncSignalTest.java @@ -0,0 +1,167 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.utils.concurrent; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.NavigableMap; +import java.util.Random; +import java.util.concurrent.ConcurrentSkipListMap; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicLong; +import java.util.concurrent.locks.LockSupport; + +import org.junit.Test; + +import org.apache.cassandra.concurrent.ExecutorPlus; +import org.apache.cassandra.utils.EstimatedHistogram; +import org.apache.cassandra.utils.FBUtilities; + +import static java.util.concurrent.TimeUnit.NANOSECONDS; +import static org.apache.cassandra.concurrent.ExecutorFactory.Global.executorFactory; + +public class LockWithAsyncSignalTest +{ + @Test + public void test() + { + ExecutorPlus submitters = executorFactory().pooled("test-submitters", 16); + ExecutorPlus consumers = executorFactory().pooled("test-consumers", 16); + for (int i = 0 ; i < 10 ; ++i) + testOne(submitters, consumers, 4, 8, i); + } + + private static void testOne(ExecutorPlus submitterExecutor, ExecutorPlus consumerExecutor, int submitterCount, int consumerCount, int seconds) + { + class Waiting extends AtomicBoolean implements Comparable + { + final long ticket; + + Waiting(long ticket) + { + this.ticket = ticket; + } + + @Override + public int compareTo(Waiting that) + { + return Long.compare(this.ticket, that.ticket); + } + } + final LockWithAsyncSignal lock = new LockWithAsyncSignal(); + final List> submitters = new ArrayList<>(); + final List> consumers = new ArrayList<>(); + final AtomicBoolean submittersRunning = new AtomicBoolean(true); + final AtomicBoolean consumersRunning = new AtomicBoolean(true); + final ConcurrentSkipListMap waiting = new ConcurrentSkipListMap<>(); + final AtomicLong nextTicket = new AtomicLong(); + final EstimatedHistogram latency = new EstimatedHistogram(); + for (int i = 0; i < submitterCount + consumerCount ; ++i) + { + boolean submitter = i/2 >= consumerCount || ((i & 1) == 0 && i/2 < submitterCount); + if (submitter) + { + submitters.add(submitterExecutor.submit(() -> { + final Random rnd = new Random(); + while (submittersRunning.get()) + { + LockSupport.parkNanos(TimeUnit.MICROSECONDS.toNanos(rnd.nextInt(100))); + long start = System.nanoTime(); + Waiting awaiting = new Waiting(nextTicket.incrementAndGet()); + waiting.put(awaiting, true); + lock.signal(); + while (!awaiting.get()); + long end = System.nanoTime(); + latency.add(NANOSECONDS.toMicros(end - start)); + } + return null; + })); + } + else + { + consumers.add(consumerExecutor.submit(() -> { + final Random rnd = new Random(); + while (true) + { + if (rnd.nextBoolean()) lock.lock(); + else if (!lock.tryLock()) continue; + + Waiting waitUntil; + try + { + AtomicBoolean awaiting; + while (null != (awaiting = pollFirst(waiting))) + awaiting.set(true); + + lock.await(); + if (null != (awaiting = pollFirst(waiting))) + awaiting.set(true); + + if (!consumersRunning.get()) + { + lock.signal(); + return null; + } + + waitUntil = peekLast(waiting); + if (!waiting.isEmpty()) + lock.signal(); + } + finally + { + lock.unlock(); + } + + if (waitUntil != null) + { + while (!waitUntil.get()); + } + } + })); + } + } + long deadline = System.nanoTime() + TimeUnit.SECONDS.toNanos(seconds); + while (true) + { + long wait = deadline - System.nanoTime(); + if (wait < 0) + break; + LockSupport.parkNanos(wait); + } + submittersRunning.set(false); + lock.signal(); + FBUtilities.waitOnFutures(submitters, 2L, TimeUnit.SECONDS); + consumersRunning.set(false); + FBUtilities.waitOnFutures(consumers, 2L, TimeUnit.SECONDS); + } + + private static T pollFirst(NavigableMap map) + { + Map.Entry e = map.pollFirstEntry(); + return e == null ? null : e.getKey(); + } + + private static T peekLast(NavigableMap map) + { + Map.Entry e = map.lastEntry(); + return e == null ? null : e.getKey(); + } +} diff --git a/test/unit/org/apache/cassandra/utils/memory/BigEndianMemoryUtilTest.java b/test/unit/org/apache/cassandra/utils/memory/BigEndianMemoryUtilTest.java new file mode 100644 index 000000000000..fbc2595ae62f --- /dev/null +++ b/test/unit/org/apache/cassandra/utils/memory/BigEndianMemoryUtilTest.java @@ -0,0 +1,148 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.utils.memory; + +import java.nio.ByteBuffer; +import java.nio.ByteOrder; + +import org.junit.Assert; +import org.junit.Test; + +public class BigEndianMemoryUtilTest +{ + private static final int TEST_BUFFER_LENGTH = 8; + private final ByteBuffer directBuffer = ByteBuffer.allocateDirect(TEST_BUFFER_LENGTH); + { + directBuffer.order(ByteOrder.BIG_ENDIAN); + } + private final long address = BigEndianMemoryUtil.getAddress(directBuffer); + + @Test + public void testGetSetLong() + { + long originalValue = 0xAB_CD_EF_12_34_56_78_90L; + directBuffer.putLong(originalValue); + Assert.assertEquals(originalValue, BigEndianMemoryUtil.getLong(address)); + + directBuffer.rewind(); + directBuffer.putLong(0); + BigEndianMemoryUtil.setLong(address, originalValue); + + Assert.assertEquals(originalValue, directBuffer.getLong(0)); + Assert.assertEquals(originalValue, BigEndianMemoryUtil.getLong(address)); + + } + + @Test + public void testGetSetInt() + { + int originalValue = 0xAB_CD_EF_12; + directBuffer.putInt(originalValue); + Assert.assertEquals(originalValue, BigEndianMemoryUtil.getInt(address)); + + directBuffer.rewind(); + directBuffer.putInt(0); + BigEndianMemoryUtil.setInt(address, originalValue); + + Assert.assertEquals(originalValue, directBuffer.getInt(0)); + Assert.assertEquals(originalValue, BigEndianMemoryUtil.getInt(address)); + + } + + @Test + public void testGetSetUnsighedShort() + { + short originalValue = (short) 0xAB_CD; + directBuffer.putShort(originalValue); + Assert.assertEquals(originalValue & 0xffff, BigEndianMemoryUtil.getUnsignedShort(address)); + + directBuffer.rewind(); + directBuffer.putShort((short) 0); + BigEndianMemoryUtil.setShort(address, originalValue); + + Assert.assertEquals(originalValue, directBuffer.getShort(0)); + Assert.assertEquals(originalValue & 0xffff, BigEndianMemoryUtil.getUnsignedShort(address)); + } + + @Test + public void testGetSetLongByBytes() + { + long originalValue = 0xAB_CD_EF_12_34_56_78_90L; + directBuffer.putLong(originalValue); + Assert.assertEquals(originalValue, BigEndianMemoryUtil.getLongByByte(address)); + + directBuffer.rewind(); + directBuffer.putLong(0); + BigEndianMemoryUtil.putLongByByte(address, originalValue); + + Assert.assertEquals(originalValue, directBuffer.getLong(0)); + Assert.assertEquals(originalValue, BigEndianMemoryUtil.getLongByByte(address)); + + } + + @Test + public void testGetSetIntByBytes() + { + int originalValue = 0xAB_CD_EF_12; + directBuffer.putInt(originalValue); + Assert.assertEquals(originalValue, BigEndianMemoryUtil.getIntByByte(address)); + + directBuffer.rewind(); + directBuffer.putInt(0); + BigEndianMemoryUtil.putIntByByte(address, originalValue); + + Assert.assertEquals(originalValue, directBuffer.getInt(0)); + Assert.assertEquals(originalValue, BigEndianMemoryUtil.getIntByByte(address)); + + } + + @Test + public void testGetSetShortByBytes() + { + short originalValue = (short) 0xAB_CD; + directBuffer.putShort(originalValue); + Assert.assertEquals(originalValue, BigEndianMemoryUtil.getShortByByte(address)); + + directBuffer.rewind(); + directBuffer.putShort((short) 0); + BigEndianMemoryUtil.putShortByByte(address, originalValue); + + Assert.assertEquals(originalValue, directBuffer.getShort(0)); + Assert.assertEquals(originalValue, BigEndianMemoryUtil.getShortByByte(address)); + } + + + @Test + public void testGetHollowDirectByteBuffer() + { + ByteBuffer byteBuffer = BigEndianMemoryUtil.getHollowDirectByteBuffer(); + Assert.assertEquals(directBuffer.getClass(), byteBuffer.getClass()); + Assert.assertEquals(ByteOrder.BIG_ENDIAN, byteBuffer.order()); + } + + @Test + public void testGetByteBuffer() + { + ByteBuffer byteBuffer = BigEndianMemoryUtil.getByteBuffer(address, TEST_BUFFER_LENGTH); + Assert.assertEquals(directBuffer.getClass(), byteBuffer.getClass()); + Assert.assertEquals(ByteOrder.BIG_ENDIAN, byteBuffer.order()); + Assert.assertEquals(TEST_BUFFER_LENGTH, byteBuffer.capacity()); + Assert.assertEquals(0, byteBuffer.position()); + } +} diff --git a/test/unit/org/apache/cassandra/utils/memory/LittleEndianMemoryUtilTest.java b/test/unit/org/apache/cassandra/utils/memory/LittleEndianMemoryUtilTest.java new file mode 100644 index 000000000000..592d01906bcc --- /dev/null +++ b/test/unit/org/apache/cassandra/utils/memory/LittleEndianMemoryUtilTest.java @@ -0,0 +1,148 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.utils.memory; + +import java.nio.ByteBuffer; +import java.nio.ByteOrder; + +import org.junit.Assert; +import org.junit.Test; + +public class LittleEndianMemoryUtilTest +{ + private static final int TEST_BUFFER_LENGTH = 8; + private final ByteBuffer directBuffer = ByteBuffer.allocateDirect(TEST_BUFFER_LENGTH); + { + directBuffer.order(ByteOrder.LITTLE_ENDIAN); + } + private final long address = LittleEndianMemoryUtil.getAddress(directBuffer); + + @Test + public void testGetSetLong() + { + long originalValue = 0xAB_CD_EF_12_34_56_78_90L; + directBuffer.putLong(originalValue); + Assert.assertEquals(originalValue, LittleEndianMemoryUtil.getLong(address)); + + directBuffer.rewind(); + directBuffer.putLong(0); + LittleEndianMemoryUtil.setLong(address, originalValue); + + Assert.assertEquals(originalValue, directBuffer.getLong(0)); + Assert.assertEquals(originalValue, LittleEndianMemoryUtil.getLong(address)); + + } + + @Test + public void testGetSetInt() + { + int originalValue = 0xAB_CD_EF_12; + directBuffer.putInt(originalValue); + Assert.assertEquals(originalValue, LittleEndianMemoryUtil.getInt(address)); + + directBuffer.rewind(); + directBuffer.putInt(0); + LittleEndianMemoryUtil.setInt(address, originalValue); + + Assert.assertEquals(originalValue, directBuffer.getInt(0)); + Assert.assertEquals(originalValue, LittleEndianMemoryUtil.getInt(address)); + + } + + @Test + public void testGetSetUnsighedShort() + { + short originalValue = (short) 0xAB_CD; + directBuffer.putShort(originalValue); + Assert.assertEquals(originalValue & 0xffff, LittleEndianMemoryUtil.getUnsignedShort(address)); + + directBuffer.rewind(); + directBuffer.putShort((short) 0); + LittleEndianMemoryUtil.setShort(address, originalValue); + + Assert.assertEquals(originalValue, directBuffer.getShort(0)); + Assert.assertEquals(originalValue & 0xffff, LittleEndianMemoryUtil.getUnsignedShort(address)); + } + + @Test + public void testGetSetLongByBytes() + { + long originalValue = 0xAB_CD_EF_12_34_56_78_90L; + directBuffer.putLong(originalValue); + Assert.assertEquals(originalValue, LittleEndianMemoryUtil.getLongByByte(address)); + + directBuffer.rewind(); + directBuffer.putLong(0); + LittleEndianMemoryUtil.putLongByByte(address, originalValue); + + Assert.assertEquals(originalValue, directBuffer.getLong(0)); + Assert.assertEquals(originalValue, LittleEndianMemoryUtil.getLongByByte(address)); + + } + + @Test + public void testGetSetIntByBytes() + { + int originalValue = 0xAB_CD_EF_12; + directBuffer.putInt(originalValue); + Assert.assertEquals(originalValue, LittleEndianMemoryUtil.getIntByByte(address)); + + directBuffer.rewind(); + directBuffer.putInt(0); + LittleEndianMemoryUtil.putIntByByte(address, originalValue); + + Assert.assertEquals(originalValue, directBuffer.getInt(0)); + Assert.assertEquals(originalValue, LittleEndianMemoryUtil.getIntByByte(address)); + + } + + @Test + public void testGetSetShortByBytes() + { + short originalValue = (short) 0xAB_CD; + directBuffer.putShort(originalValue); + Assert.assertEquals(originalValue, LittleEndianMemoryUtil.getShortByByte(address)); + + directBuffer.rewind(); + directBuffer.putShort((short) 0); + LittleEndianMemoryUtil.putShortByByte(address, originalValue); + + Assert.assertEquals(originalValue, directBuffer.getShort(0)); + Assert.assertEquals(originalValue, LittleEndianMemoryUtil.getShortByByte(address)); + } + + + @Test + public void testGetHollowDirectByteBuffer() + { + ByteBuffer byteBuffer = LittleEndianMemoryUtil.getHollowDirectByteBuffer(); + Assert.assertEquals(directBuffer.getClass(), byteBuffer.getClass()); + Assert.assertEquals(ByteOrder.LITTLE_ENDIAN, byteBuffer.order()); + } + + @Test + public void testGetByteBuffer() + { + ByteBuffer byteBuffer = LittleEndianMemoryUtil.getByteBuffer(address, TEST_BUFFER_LENGTH); + Assert.assertEquals(directBuffer.getClass(), byteBuffer.getClass()); + Assert.assertEquals(ByteOrder.LITTLE_ENDIAN, byteBuffer.order()); + Assert.assertEquals(TEST_BUFFER_LENGTH, byteBuffer.capacity()); + Assert.assertEquals(0, byteBuffer.position()); + } +} diff --git a/test/unit/org/apache/cassandra/utils/memory/NativeEndianMemoryUtilTest.java b/test/unit/org/apache/cassandra/utils/memory/NativeEndianMemoryUtilTest.java new file mode 100644 index 000000000000..ba0527b18a0e --- /dev/null +++ b/test/unit/org/apache/cassandra/utils/memory/NativeEndianMemoryUtilTest.java @@ -0,0 +1,148 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.utils.memory; + +import java.nio.ByteBuffer; +import java.nio.ByteOrder; + +import org.junit.Assert; +import org.junit.Test; + +public class NativeEndianMemoryUtilTest +{ + private static final int TEST_BUFFER_LENGTH = 8; + private final ByteBuffer directBuffer = ByteBuffer.allocateDirect(TEST_BUFFER_LENGTH); + { + directBuffer.order(ByteOrder.nativeOrder()); + } + private final long address = NativeEndianMemoryUtil.getAddress(directBuffer); + + @Test + public void testGetSetLong() + { + long originalValue = 0xAB_CD_EF_12_34_56_78_90L; + directBuffer.putLong(originalValue); + Assert.assertEquals(originalValue, NativeEndianMemoryUtil.getLong(address)); + + directBuffer.rewind(); + directBuffer.putLong(0); + NativeEndianMemoryUtil.setLong(address, originalValue); + + Assert.assertEquals(originalValue, directBuffer.getLong(0)); + Assert.assertEquals(originalValue, NativeEndianMemoryUtil.getLong(address)); + + } + + @Test + public void testGetSetInt() + { + int originalValue = 0xAB_CD_EF_12; + directBuffer.putInt(originalValue); + Assert.assertEquals(originalValue, NativeEndianMemoryUtil.getInt(address)); + + directBuffer.rewind(); + directBuffer.putInt(0); + NativeEndianMemoryUtil.setInt(address, originalValue); + + Assert.assertEquals(originalValue, directBuffer.getInt(0)); + Assert.assertEquals(originalValue, NativeEndianMemoryUtil.getInt(address)); + + } + + @Test + public void testGetSetUnsighedShort() + { + short originalValue = (short) 0xAB_CD; + directBuffer.putShort(originalValue); + Assert.assertEquals(originalValue & 0xffff, NativeEndianMemoryUtil.getUnsignedShort(address)); + + directBuffer.rewind(); + directBuffer.putShort((short) 0); + NativeEndianMemoryUtil.setShort(address, originalValue); + + Assert.assertEquals(originalValue, directBuffer.getShort(0)); + Assert.assertEquals(originalValue & 0xffff, NativeEndianMemoryUtil.getUnsignedShort(address)); + } + + @Test + public void testGetSetLongByBytes() + { + long originalValue = 0xAB_CD_EF_12_34_56_78_90L; + directBuffer.putLong(originalValue); + Assert.assertEquals(originalValue, NativeEndianMemoryUtil.getLongByByte(address)); + + directBuffer.rewind(); + directBuffer.putLong(0); + NativeEndianMemoryUtil.putLongByByte(address, originalValue); + + Assert.assertEquals(originalValue, directBuffer.getLong(0)); + Assert.assertEquals(originalValue, NativeEndianMemoryUtil.getLongByByte(address)); + + } + + @Test + public void testGetSetIntByBytes() + { + int originalValue = 0xAB_CD_EF_12; + directBuffer.putInt(originalValue); + Assert.assertEquals(originalValue, NativeEndianMemoryUtil.getIntByByte(address)); + + directBuffer.rewind(); + directBuffer.putInt(0); + NativeEndianMemoryUtil.putIntByByte(address, originalValue); + + Assert.assertEquals(originalValue, directBuffer.getInt(0)); + Assert.assertEquals(originalValue, NativeEndianMemoryUtil.getIntByByte(address)); + + } + + @Test + public void testGetSetShortByBytes() + { + short originalValue = (short) 0xAB_CD; + directBuffer.putShort(originalValue); + Assert.assertEquals(originalValue, NativeEndianMemoryUtil.getShortByByte(address)); + + directBuffer.rewind(); + directBuffer.putShort((short) 0); + NativeEndianMemoryUtil.putShortByByte(address, originalValue); + + Assert.assertEquals(originalValue, directBuffer.getShort(0)); + Assert.assertEquals(originalValue, NativeEndianMemoryUtil.getShortByByte(address)); + } + + + @Test + public void testGetHollowDirectByteBuffer() + { + ByteBuffer byteBuffer = NativeEndianMemoryUtil.getHollowDirectByteBuffer(); + Assert.assertEquals(directBuffer.getClass(), byteBuffer.getClass()); + Assert.assertEquals(ByteOrder.nativeOrder(), byteBuffer.order()); + } + + @Test + public void testGetByteBuffer() + { + ByteBuffer byteBuffer = NativeEndianMemoryUtil.getByteBuffer(address, TEST_BUFFER_LENGTH); + Assert.assertEquals(directBuffer.getClass(), byteBuffer.getClass()); + Assert.assertEquals(ByteOrder.nativeOrder(), byteBuffer.order()); + Assert.assertEquals(TEST_BUFFER_LENGTH, byteBuffer.capacity()); + Assert.assertEquals(0, byteBuffer.position()); + } +} diff --git a/tools/bin/cassandra.in.sh b/tools/bin/cassandra.in.sh index 056bedc71504..79bbfcd34a64 100644 --- a/tools/bin/cassandra.in.sh +++ b/tools/bin/cassandra.in.sh @@ -39,7 +39,7 @@ if [ -d $CASSANDRA_HOME/build ] ; then if [ "$jars_cnt" = "1" ]; then cassandra_bin="`ls -1 $CASSANDRA_HOME/build/apache-cassandra*.jar | grep -v javadoc | grep -v sources`" - cassandra_bin="$cassandra_bin:$CASSANDRA_HOME/build/classes/stress:$CASSANDRA_HOME/build/classes/fqltool" + cassandra_bin="$cassandra_bin:$CASSANDRA_HOME/build/classes/stress:$CASSANDRA_HOME/build/classes/fqltool:$CASSANDRA_HOME/build/classes/sstableloader" CLASSPATH="$CLASSPATH:$cassandra_bin" fi fi diff --git a/tools/bin/sstableloader b/tools/bin/sstableloader new file mode 100755 index 000000000000..9045adfda392 --- /dev/null +++ b/tools/bin/sstableloader @@ -0,0 +1,49 @@ +#!/bin/sh + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +if [ "x$CASSANDRA_INCLUDE" = "x" ]; then + # Locations (in order) to use when searching for an include file. + for include in "`dirname "$0"`/cassandra.in.sh" \ + "$HOME/.cassandra.in.sh" \ + /usr/share/cassandra/cassandra.in.sh \ + /usr/local/share/cassandra/cassandra.in.sh \ + /opt/cassandra/cassandra.in.sh; do + if [ -r "$include" ]; then + . "$include" + break + fi + done +elif [ -r "$CASSANDRA_INCLUDE" ]; then + . "$CASSANDRA_INCLUDE" +fi + +if [ -z "$CLASSPATH" ]; then + echo "You must set the CLASSPATH var" >&2 + exit 1 +fi + +if [ "x$MAX_HEAP_SIZE" = "x" ]; then + MAX_HEAP_SIZE="256M" +fi + +"$JAVA" $JAVA_AGENT -ea -cp "$CLASSPATH" $JVM_OPTS -Xmx$MAX_HEAP_SIZE \ + -Dcassandra.storagedir="$cassandra_storagedir" \ + -Dlogback.configurationFile=logback-tools.xml \ + org.apache.cassandra.tools.BulkLoader "$@" + +# vi:ai sw=4 ts=4 tw=0 et diff --git a/tools/sstableloader/build.xml b/tools/sstableloader/build.xml new file mode 100644 index 000000000000..401ba3aea1a8 --- /dev/null +++ b/tools/sstableloader/build.xml @@ -0,0 +1,125 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/java/org/apache/cassandra/tools/BulkLoadConnectionFactory.java b/tools/sstableloader/src/org/apache/cassandra/tools/BulkLoadConnectionFactory.java similarity index 100% rename from src/java/org/apache/cassandra/tools/BulkLoadConnectionFactory.java rename to tools/sstableloader/src/org/apache/cassandra/tools/BulkLoadConnectionFactory.java diff --git a/src/java/org/apache/cassandra/tools/BulkLoadException.java b/tools/sstableloader/src/org/apache/cassandra/tools/BulkLoadException.java similarity index 100% rename from src/java/org/apache/cassandra/tools/BulkLoadException.java rename to tools/sstableloader/src/org/apache/cassandra/tools/BulkLoadException.java diff --git a/src/java/org/apache/cassandra/tools/BulkLoader.java b/tools/sstableloader/src/org/apache/cassandra/tools/BulkLoader.java similarity index 87% rename from src/java/org/apache/cassandra/tools/BulkLoader.java rename to tools/sstableloader/src/org/apache/cassandra/tools/BulkLoader.java index ecd6a5d31b70..a3a4dda35046 100644 --- a/src/java/org/apache/cassandra/tools/BulkLoader.java +++ b/tools/sstableloader/src/org/apache/cassandra/tools/BulkLoader.java @@ -25,8 +25,6 @@ import com.google.common.collect.HashMultimap; import com.google.common.collect.Multimap; -import org.apache.commons.cli.Option; -import org.apache.commons.cli.Options; import com.datastax.driver.core.AuthProvider; import com.datastax.driver.core.RemoteEndpointAwareJdkSSLOptions; @@ -48,7 +46,7 @@ import org.apache.cassandra.utils.NativeSSTableLoaderClient; import org.apache.cassandra.utils.OutputHandler; -import static org.apache.cassandra.config.EncryptionOptions.ClientAuth.REQUIRED; +import static org.apache.cassandra.config.EncryptionOptions.ClientEncryptionOptions.ClientAuth.REQUIRED; import static org.apache.cassandra.utils.Clock.Global.nanoTime; public class BulkLoader @@ -313,52 +311,4 @@ public StreamingChannel.Factory getConnectionFactory() return new BulkLoadConnectionFactory(serverEncOptions, storagePort); } } - - public static class CmdLineOptions extends Options - { - /** - * Add option with argument and argument name - * @param opt shortcut for option name - * @param longOpt complete option name - * @param argName argument name - * @param description description of the option - * @return updated Options object - */ - public Options addOption(String opt, String longOpt, String argName, String description) - { - Option option = new Option(opt, longOpt, true, description); - option.setArgName(argName); - - return addOption(option); - } - - /** - * Add option with argument and argument name that accepts being defined multiple times as a list - * @param opt shortcut for option name - * @param longOpt complete option name - * @param argName argument name - * @param description description of the option - * @return updated Options object - */ - public Options addOptionList(String opt, String longOpt, String argName, String description) - { - Option option = new Option(opt, longOpt, true, description); - option.setArgName(argName); - option.setArgs(Option.UNLIMITED_VALUES); - - return addOption(option); - } - - /** - * Add option without argument - * @param opt shortcut for option name - * @param longOpt complete option name - * @param description description of the option - * @return updated Options object - */ - public Options addOption(String opt, String longOpt, String description) - { - return addOption(new Option(opt, longOpt, false, description)); - } - } } diff --git a/src/java/org/apache/cassandra/tools/LoaderOptions.java b/tools/sstableloader/src/org/apache/cassandra/tools/LoaderOptions.java similarity index 93% rename from src/java/org/apache/cassandra/tools/LoaderOptions.java rename to tools/sstableloader/src/org/apache/cassandra/tools/LoaderOptions.java index c3d2072ff437..74940e0df6de 100644 --- a/src/java/org/apache/cassandra/tools/LoaderOptions.java +++ b/tools/sstableloader/src/org/apache/cassandra/tools/LoaderOptions.java @@ -49,10 +49,9 @@ import org.apache.cassandra.exceptions.ConfigurationException; import org.apache.cassandra.io.util.File; import org.apache.cassandra.locator.InetAddressAndPort; -import org.apache.cassandra.tools.BulkLoader.CmdLineOptions; import static org.apache.cassandra.config.DataRateSpec.DataRateUnit.MEBIBYTES_PER_SECOND; -import static org.apache.cassandra.config.EncryptionOptions.ClientAuth.REQUIRED; +import static org.apache.cassandra.config.EncryptionOptions.ClientEncryptionOptions.ClientAuth.REQUIRED; public class LoaderOptions { @@ -77,9 +76,8 @@ public class LoaderOptions /** * Throttle defined in megabits per second. CASSANDRA-10637 introduced a builder and is the preferred way to * provide options instead of using these constant fields. - * @deprecated Use {@code throttle-mib} instead + * @deprecated Use {@code throttle-mib} instead. See CASSANDRA-17677 */ - /** @deprecated See CASSANDRA-17677 */ @Deprecated(since = "5.0") public static final String THROTTLE_MBITS = "throttle"; public static final String THROTTLE_MEBIBYTES = "throttle-mib"; @@ -121,7 +119,7 @@ public class LoaderOptions public final int entireSSTableInterDcThrottleMebibytes; public final int storagePort; public final int sslStoragePort; - public final EncryptionOptions clientEncOptions; + public final EncryptionOptions.ClientEncryptionOptions clientEncOptions; public final int connectionsPerHost; public final EncryptionOptions.ServerEncryptionOptions serverEncOptions; public final Set hosts; @@ -172,9 +170,11 @@ static class Builder int storagePort; int sslStoragePort; - EncryptionOptions clientEncOptions = new EncryptionOptions(); + EncryptionOptions.ClientEncryptionOptions clientEncOptions = new EncryptionOptions.ClientEncryptionOptions(); + EncryptionOptions.ClientEncryptionOptions.Builder clientEncOptionsBuilder = new EncryptionOptions.ClientEncryptionOptions.Builder(clientEncOptions); int connectionsPerHost = 1; EncryptionOptions.ServerEncryptionOptions serverEncOptions = new EncryptionOptions.ServerEncryptionOptions(); + EncryptionOptions.ServerEncryptionOptions.Builder serverEncOptionsBuilder = new EncryptionOptions.ServerEncryptionOptions.Builder(serverEncOptions); Set hostsArg = new HashSet<>(); Set ignoresArg = new HashSet<>(); Set hosts = new HashSet<>(); @@ -333,9 +333,10 @@ public Builder sslStoragePort(int sslStoragePort) return this; } - public Builder encOptions(EncryptionOptions encOptions) + public Builder encOptions(EncryptionOptions.ClientEncryptionOptions encOptions) { this.clientEncOptions = encOptions; + this.clientEncOptionsBuilder = new EncryptionOptions.ClientEncryptionOptions.Builder(encOptions); return this; } @@ -348,6 +349,7 @@ public Builder connectionsPerHost(int connectionsPerHost) public Builder serverEncOptions(EncryptionOptions.ServerEncryptionOptions serverEncOptions) { this.serverEncOptions = serverEncOptions; + this.serverEncOptionsBuilder = new EncryptionOptions.ServerEncryptionOptions.Builder(serverEncOptions); return this; } @@ -551,9 +553,10 @@ public Builder parseArgs(String cmdArgs[]) "which is able to handle encrypted communication too."); // Copy the encryption options and apply the config so that argument parsing can accesss isEnabled. - clientEncOptions = config.client_encryption_options.applyConfig(); - serverEncOptions = config.server_encryption_options; - serverEncOptions.applyConfig(); + clientEncOptionsBuilder = new EncryptionOptions.ClientEncryptionOptions.Builder(config.client_encryption_options); + clientEncOptions = clientEncOptionsBuilder.build(); + serverEncOptionsBuilder = new EncryptionOptions.ServerEncryptionOptions.Builder(config.server_encryption_options); + serverEncOptions = serverEncOptionsBuilder.build(); if (cmd.hasOption(NATIVE_PORT_OPTION)) nativePort = Integer.parseInt(cmd.getOptionValue(NATIVE_PORT_OPTION)); @@ -625,51 +628,53 @@ public Builder parseArgs(String cmdArgs[]) if (cmd.hasOption(SSL_TRUSTSTORE) || cmd.hasOption(SSL_TRUSTSTORE_PW) || cmd.hasOption(SSL_KEYSTORE) || cmd.hasOption(SSL_KEYSTORE_PW)) { - clientEncOptions = clientEncOptions.withEnabled(true); + clientEncOptionsBuilder.withEnabled(true); } if (cmd.hasOption(SSL_TRUSTSTORE)) { - clientEncOptions = clientEncOptions.withTrustStore(cmd.getOptionValue(SSL_TRUSTSTORE)); + clientEncOptionsBuilder.withTrustStore(cmd.getOptionValue(SSL_TRUSTSTORE)); } if (cmd.hasOption(SSL_TRUSTSTORE_PW)) { - clientEncOptions = clientEncOptions.withTrustStorePassword(cmd.getOptionValue(SSL_TRUSTSTORE_PW)); + clientEncOptionsBuilder.withTrustStorePassword(cmd.getOptionValue(SSL_TRUSTSTORE_PW)); } if (cmd.hasOption(SSL_KEYSTORE)) { // if a keystore was provided, lets assume we'll need to use - clientEncOptions = clientEncOptions.withKeyStore(cmd.getOptionValue(SSL_KEYSTORE)) + clientEncOptionsBuilder.withKeyStore(cmd.getOptionValue(SSL_KEYSTORE)) .withRequireClientAuth(REQUIRED); } if (cmd.hasOption(SSL_KEYSTORE_PW)) { - clientEncOptions = clientEncOptions.withKeyStorePassword(cmd.getOptionValue(SSL_KEYSTORE_PW)); + clientEncOptionsBuilder.withKeyStorePassword(cmd.getOptionValue(SSL_KEYSTORE_PW)); } if (cmd.hasOption(SSL_PROTOCOL)) { - clientEncOptions = clientEncOptions.withProtocol(cmd.getOptionValue(SSL_PROTOCOL)); + clientEncOptionsBuilder.withProtocol(cmd.getOptionValue(SSL_PROTOCOL)); } if (cmd.hasOption(SSL_ALGORITHM)) { - clientEncOptions = clientEncOptions.withAlgorithm(cmd.getOptionValue(SSL_ALGORITHM)); + clientEncOptionsBuilder.withAlgorithm(cmd.getOptionValue(SSL_ALGORITHM)); } if (cmd.hasOption(SSL_STORE_TYPE)) { - clientEncOptions = clientEncOptions.withStoreType(cmd.getOptionValue(SSL_STORE_TYPE)); + clientEncOptionsBuilder.withStoreType(cmd.getOptionValue(SSL_STORE_TYPE)); } if (cmd.hasOption(SSL_CIPHER_SUITES)) { - clientEncOptions = clientEncOptions.withCipherSuites(cmd.getOptionValue(SSL_CIPHER_SUITES).split(",")); + clientEncOptionsBuilder.withCipherSuites(cmd.getOptionValue(SSL_CIPHER_SUITES).split(",")); } + clientEncOptions = clientEncOptionsBuilder.build(); + if (cmd.hasOption(TARGET_KEYSPACE)) { targetKeyspace = cmd.getOptionValue(TARGET_KEYSPACE); diff --git a/src/java/org/apache/cassandra/utils/NativeSSTableLoaderClient.java b/tools/sstableloader/src/org/apache/cassandra/utils/NativeSSTableLoaderClient.java similarity index 98% rename from src/java/org/apache/cassandra/utils/NativeSSTableLoaderClient.java rename to tools/sstableloader/src/org/apache/cassandra/utils/NativeSSTableLoaderClient.java index 6ce840efbb5e..03292d9d8c92 100644 --- a/src/java/org/apache/cassandra/utils/NativeSSTableLoaderClient.java +++ b/tools/sstableloader/src/org/apache/cassandra/utils/NativeSSTableLoaderClient.java @@ -214,7 +214,7 @@ private static ColumnMetadata createDefinitionFromRow(Row row, String keyspace, int position = row.getInt("position"); org.apache.cassandra.schema.ColumnMetadata.Kind kind = ColumnMetadata.Kind.valueOf(toUpperCaseLocalized(row.getString("kind"))); - return new ColumnMetadata(keyspace, table, name, type, position, kind, null); + return new ColumnMetadata(keyspace, table, name, type, ColumnMetadata.NO_UNIQUE_ID, position, kind, null); } private static DroppedColumn createDroppedColumnFromRow(Row row, String keyspace, String table) @@ -222,7 +222,7 @@ private static DroppedColumn createDroppedColumnFromRow(Row row, String keyspace String name = row.getString("column_name"); AbstractType type = CQLTypeParser.parse(keyspace, row.getString("type"), Types.none()); ColumnMetadata.Kind kind = ColumnMetadata.Kind.valueOf(toUpperCaseLocalized(row.getString("kind"))); - ColumnMetadata column = new ColumnMetadata(keyspace, table, ColumnIdentifier.getInterned(name, true), type, ColumnMetadata.NO_POSITION, kind, null); + ColumnMetadata column = new ColumnMetadata(keyspace, table, ColumnIdentifier.getInterned(name, true), type, ColumnMetadata.NO_UNIQUE_ID, ColumnMetadata.NO_POSITION, kind, null); long droppedTime = row.getTimestamp("dropped_time").getTime(); return new DroppedColumn(column, droppedTime); } diff --git a/test/unit/org/apache/cassandra/tools/LoaderOptionsTest.java b/tools/sstableloader/test/unit/org/apache/cassandra/tools/LoaderOptionsTest.java similarity index 75% rename from test/unit/org/apache/cassandra/tools/LoaderOptionsTest.java rename to tools/sstableloader/test/unit/org/apache/cassandra/tools/LoaderOptionsTest.java index 4fc23256770a..04e1380164eb 100644 --- a/test/unit/org/apache/cassandra/tools/LoaderOptionsTest.java +++ b/tools/sstableloader/test/unit/org/apache/cassandra/tools/LoaderOptionsTest.java @@ -25,11 +25,11 @@ import java.security.Permission; import com.google.common.net.HostAndPort; +import org.apache.commons.io.FileUtils; import org.junit.Test; import org.apache.cassandra.io.util.File; -import static org.apache.cassandra.tools.OfflineToolUtils.sstableDirName; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; @@ -85,6 +85,35 @@ public void testEncryptionSettings() throws Exception assertEquals("test.jks", options.clientEncOptions.keystore); } + /** + * Tests for client_encryption_options override from the command line. + */ + @Test + public void testEncryptionSettingsOverride() throws Exception + { + // Default Cassandra config + File config = new File(Paths.get(".", "test", "conf", "cassandra-mtls.yaml").normalize()); + String[] args = { "-d", "127.9.9.1", "-f", config.absolutePath(), + "-ts", "test.jks", "-tspw", "truststorePass1", + "-ks", "test.jks", "-kspw", "testdata1", + "--ssl-ciphers", "TLS_RSA_WITH_AES_256_CBC_SHA", + "--ssl-alg", "SunX509", "--store-type", "JKS", "--ssl-protocol", "TLS", + sstableDirName("legacy_sstables", "legacy_ma_simple") }; + LoaderOptions options = LoaderOptions.builder().parseArgs(args).build(); + // Below two lines validating server encryption options is to verify that we are loading config from the yaml + assertEquals("test/conf/cassandra_ssl_test.keystore", options.serverEncOptions.keystore); + assertEquals("cassandra", options.serverEncOptions.keystore_password); + // Below asserts validate the overrides for the client encryption options from the command line + // Since the values are provided by (and local to) this test, they are hardcoded + assertEquals("JKS", options.clientEncOptions.store_type); + assertEquals("test.jks", options.clientEncOptions.truststore); + assertEquals("truststorePass1", options.clientEncOptions.truststore_password); + assertEquals("test.jks", options.clientEncOptions.keystore); + assertEquals("testdata1", options.clientEncOptions.keystore_password); + assertEquals("TLS_RSA_WITH_AES_256_CBC_SHA", options.clientEncOptions.cipherSuitesArray()[0]); + assertEquals("SunX509", options.clientEncOptions.algorithm); + } + @Test public void testThrottleDefaultSettings() { @@ -215,5 +244,40 @@ public void checkPermission(Permission perm, Object context) System.setSecurityManager(null); } } + + // Copied from OfflineToolUtils + + public static String sstableDirName(String ks, String cf) throws IOException + { + return sstableDir(ks, cf).absolutePath(); + } + + public static File sstableDir(String ks, String cf) throws IOException + { + File dataDir = copySSTables(); + File ksDir = new File(dataDir, ks); + File[] cfDirs = ksDir.tryList((dir, name) -> cf.equals(name) || name.startsWith(cf + '-')); + return cfDirs[0]; + } + + public static File copySSTables() throws IOException + { + File dataDir = new File("build/test/cassandra/data"); + File srcDir = new File("test/data/legacy-sstables/ma"); + FileUtils.copyDirectory(new File(srcDir, "legacy_tables").toJavaIOFile(), new File(dataDir, "legacy_sstables").toJavaIOFile()); + return dataDir; + } + + // Copied from SystemExitException in unit tests + + private static class SystemExitException extends Error + { + public final int status; + + public SystemExitException(int status) + { + this.status = status; + } + } } diff --git a/tools/stress/src/org/apache/cassandra/io/sstable/StressCQLSSTableWriter.java b/tools/stress/src/org/apache/cassandra/io/sstable/StressCQLSSTableWriter.java index 3745f717be1b..c7b9cbef4bff 100644 --- a/tools/stress/src/org/apache/cassandra/io/sstable/StressCQLSSTableWriter.java +++ b/tools/stress/src/org/apache/cassandra/io/sstable/StressCQLSSTableWriter.java @@ -262,7 +262,6 @@ public StressCQLSSTableWriter rawAddRow(List values) // Note that we asks indexes to not validate values (the last 'false' arg below) because that triggers a 'Keyspace.open' // and that forces a lot of initialization that we don't want. UpdateParameters params = new UpdateParameters(insert.metadata(), - insert.updatedColumns(), ClientState.forInternalCalls(), options, insert.getTimestamp(TimeUnit.MILLISECONDS.toMicros(now), options), diff --git a/tools/stress/src/org/apache/cassandra/stress/operations/predefined/PredefinedOperation.java b/tools/stress/src/org/apache/cassandra/stress/operations/predefined/PredefinedOperation.java index bf969ad46bcc..630ee3aca5f3 100644 --- a/tools/stress/src/org/apache/cassandra/stress/operations/predefined/PredefinedOperation.java +++ b/tools/stress/src/org/apache/cassandra/stress/operations/predefined/PredefinedOperation.java @@ -28,7 +28,6 @@ import org.apache.cassandra.stress.operations.PartitionOperation; import org.apache.cassandra.stress.report.Timer; import org.apache.cassandra.stress.settings.Command; -import org.apache.cassandra.stress.settings.CqlVersion; import org.apache.cassandra.stress.settings.StressSettings; public abstract class PredefinedOperation extends PartitionOperation diff --git a/tools/stress/src/org/apache/cassandra/stress/settings/SettingsTransport.java b/tools/stress/src/org/apache/cassandra/stress/settings/SettingsTransport.java index cf629998225b..ccdf0a53b207 100644 --- a/tools/stress/src/org/apache/cassandra/stress/settings/SettingsTransport.java +++ b/tools/stress/src/org/apache/cassandra/stress/settings/SettingsTransport.java @@ -44,31 +44,32 @@ public SettingsTransport(TOptions options, SettingsCredentials credentials) this.credentials = credentials; } - public EncryptionOptions getEncryptionOptions() + public EncryptionOptions.ClientEncryptionOptions getEncryptionOptions() { - EncryptionOptions encOptions = new EncryptionOptions().applyConfig(); + EncryptionOptions.ClientEncryptionOptions encOptions = new EncryptionOptions.ClientEncryptionOptions().applyConfig(); if (options.trustStore.present()) { - encOptions = encOptions - .withEnabled(true) - .withTrustStore(options.trustStore.value()) - .withTrustStorePassword(options.trustStorePw.setByUser() ? options.trustStorePw.value() : credentials.transportTruststorePassword) - .withAlgorithm(options.alg.value()) - .withProtocol(options.protocol.value()) - .withCipherSuites(options.ciphers.value().split(",")); + EncryptionOptions.Builder encOptionsBuilder = new EncryptionOptions.ClientEncryptionOptions.Builder(encOptions) + .withEnabled(true) + .withTrustStore(options.trustStore.value()) + .withTrustStorePassword(options.trustStorePw.setByUser() ? options.trustStorePw.value() : credentials.transportTruststorePassword) + .withAlgorithm(options.alg.value()) + .withProtocol(options.protocol.value()) + .withCipherSuites(options.ciphers.value().split(",")); + if (options.keyStore.present()) { - encOptions = encOptions - .withKeyStore(options.keyStore.value()) - .withKeyStorePassword(options.keyStorePw.setByUser() ? options.keyStorePw.value() : credentials.transportKeystorePassword); + encOptionsBuilder.withKeyStore(options.keyStore.value()) + .withKeyStorePassword(options.keyStorePw.setByUser() ? options.keyStorePw.value() : credentials.transportKeystorePassword); } else { // mandatory for SSLFactory.createSSLContext(), see CASSANDRA-9325 - encOptions = encOptions - .withKeyStore(encOptions.truststore) - .withKeyStorePassword(encOptions.truststore_password != null ? encOptions.truststore_password : credentials.transportTruststorePassword); + encOptionsBuilder.withKeyStore(encOptions.truststore) + .withKeyStorePassword(encOptions.truststore_password != null ? encOptions.truststore_password : credentials.transportTruststorePassword); } + + encOptions = encOptionsBuilder.build(); } return encOptions; } diff --git a/tools/stress/src/org/apache/cassandra/stress/settings/StressSettings.java b/tools/stress/src/org/apache/cassandra/stress/settings/StressSettings.java index 6aea048b4cc0..f81c4a9787c9 100644 --- a/tools/stress/src/org/apache/cassandra/stress/settings/StressSettings.java +++ b/tools/stress/src/org/apache/cassandra/stress/settings/StressSettings.java @@ -139,7 +139,7 @@ public JavaDriverClient getJavaDriverClient(String keyspace) if (client != null) return client; - EncryptionOptions encOptions = transport.getEncryptionOptions(); + EncryptionOptions.ClientEncryptionOptions encOptions = transport.getEncryptionOptions(); JavaDriverClient c = new JavaDriverClient(this, node.nodes, port.nativePort, encOptions); c.connect(mode.compression()); if (keyspace != null) diff --git a/tools/stress/src/org/apache/cassandra/stress/util/JavaDriverClient.java b/tools/stress/src/org/apache/cassandra/stress/util/JavaDriverClient.java index 3d72828daf73..f2a92fe5aa7d 100644 --- a/tools/stress/src/org/apache/cassandra/stress/util/JavaDriverClient.java +++ b/tools/stress/src/org/apache/cassandra/stress/util/JavaDriverClient.java @@ -41,7 +41,7 @@ import org.apache.cassandra.security.SSLFactory; import org.apache.cassandra.stress.settings.StressSettings; -import static org.apache.cassandra.config.EncryptionOptions.ClientAuth.REQUIRED; +import static org.apache.cassandra.config.EncryptionOptions.ClientEncryptionOptions.ClientAuth.REQUIRED; public class JavaDriverClient { @@ -60,7 +60,7 @@ public class JavaDriverClient public final int connectionsPerHost; private final ProtocolVersion protocolVersion; - private final EncryptionOptions encryptionOptions; + private final EncryptionOptions.ClientEncryptionOptions encryptionOptions; private Cluster cluster; private Session session; private final LoadBalancingPolicy loadBalancingPolicy; @@ -69,15 +69,15 @@ public class JavaDriverClient public JavaDriverClient(StressSettings settings, String host, int port) { - this(settings, Collections.singletonList(host), port, new EncryptionOptions()); + this(settings, Collections.singletonList(host), port, new EncryptionOptions.ClientEncryptionOptions()); } public JavaDriverClient(StressSettings settings, List hosts, int port) { - this(settings, hosts, port, new EncryptionOptions()); + this(settings, hosts, port, new EncryptionOptions.ClientEncryptionOptions()); } - public JavaDriverClient(StressSettings settings, List hosts, int port, EncryptionOptions encryptionOptions) + public JavaDriverClient(StressSettings settings, List hosts, int port, EncryptionOptions.ClientEncryptionOptions encryptionOptions) { this.protocolVersion = settings.mode.protocolVersion; this.hosts = hosts; @@ -85,7 +85,7 @@ public JavaDriverClient(StressSettings settings, List hosts, int port, E this.username = settings.mode.username; this.password = settings.mode.password; this.authProvider = settings.mode.authProvider; - this.encryptionOptions = new EncryptionOptions(encryptionOptions).applyConfig(); + this.encryptionOptions = new EncryptionOptions.ClientEncryptionOptions(encryptionOptions).applyConfig(); this.loadBalancingPolicy = loadBalancingPolicy(settings); this.connectionsPerHost = settings.mode.connectionsPerHost == null ? 8 : settings.mode.connectionsPerHost;